cuda-cccl 0.1.3.2.0.dev271__cp313-cp313-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1947)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +46 -0
  3. cuda/cccl/cooperative/__init__.py +3 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  5. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  6. cuda/cccl/cooperative/experimental/_common.py +273 -0
  7. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  8. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  9. cuda/cccl/cooperative/experimental/_types.py +937 -0
  10. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  11. cuda/cccl/cooperative/experimental/block/__init__.py +39 -0
  12. cuda/cccl/cooperative/experimental/block/_block_exchange.py +251 -0
  13. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  14. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  15. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  16. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  17. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  18. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +92 -0
  20. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  21. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  22. cuda/cccl/headers/__init__.py +7 -0
  23. cuda/cccl/headers/include/__init__.py +1 -0
  24. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +262 -0
  25. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  26. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  27. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
  28. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +226 -0
  29. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +730 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  32. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  33. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  34. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +632 -0
  35. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  36. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  37. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  38. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  39. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  40. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1114 -0
  41. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +341 -0
  42. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  43. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  44. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1342 -0
  45. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  46. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  47. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  48. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  49. cuda/cccl/headers/include/cub/block/block_load.cuh +1260 -0
  50. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  51. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  52. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  53. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  54. cuda/cccl/headers/include/cub/block/block_reduce.cuh +665 -0
  55. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  56. cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
  57. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  58. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  59. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  65. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  66. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  67. cuda/cccl/headers/include/cub/config.cuh +53 -0
  68. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  69. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  70. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  71. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  72. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  73. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  74. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +84 -0
  75. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  76. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  77. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  82. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  83. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  84. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  85. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  86. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  87. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  88. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  89. cuda/cccl/headers/include/cub/detail/type_traits.cuh +179 -0
  90. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  91. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  92. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  93. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  94. cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
  95. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  96. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  97. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  98. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  99. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  100. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
  101. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1898 -0
  102. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  103. cuda/cccl/headers/include/cub/device/device_scan.cuh +1899 -0
  104. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  105. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  106. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  107. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  108. cuda/cccl/headers/include/cub/device/device_transform.cuh +545 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1042 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1749 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +656 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +612 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +916 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +441 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +455 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +558 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +591 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +475 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +121 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +987 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +609 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +448 -0
  159. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  160. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +226 -0
  161. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  162. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  163. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +260 -0
  165. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  166. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  167. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  168. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +684 -0
  169. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +547 -0
  170. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  171. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  172. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +464 -0
  173. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  174. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  175. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  176. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  177. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  178. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  179. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  180. cuda/cccl/headers/include/cub/util_macro.cuh +99 -0
  181. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  182. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  183. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  184. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  185. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  186. cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
  187. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  188. cuda/cccl/headers/include/cub/version.cuh +89 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +950 -0
  194. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +713 -0
  195. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  196. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  197. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  198. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  199. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1885 -0
  200. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  201. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/copy.h +143 -0
  204. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  211. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  212. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  213. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +466 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  218. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  219. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  220. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  221. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  222. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  223. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  225. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +249 -0
  226. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  227. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  228. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  229. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  230. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  232. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__device/all_devices.h +240 -0
  235. cuda/cccl/headers/include/cuda/__device/arch_traits.h +613 -0
  236. cuda/cccl/headers/include/cuda/__device/attributes.h +721 -0
  237. cuda/cccl/headers/include/cuda/__device/device_ref.h +176 -0
  238. cuda/cccl/headers/include/cuda/__device/physical_device.h +168 -0
  239. cuda/cccl/headers/include/cuda/__driver/driver_api.h +503 -0
  240. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  241. cuda/cccl/headers/include/cuda/__event/event_ref.h +158 -0
  242. cuda/cccl/headers/include/cuda/__event/timed_event.h +118 -0
  243. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  244. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  245. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  246. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  247. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
  248. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  249. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  250. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +109 -0
  251. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  253. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  254. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  255. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +49 -0
  256. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
  257. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
  258. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  259. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +424 -0
  260. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +292 -0
  261. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
  262. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +335 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +501 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +496 -0
  265. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +452 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +94 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +539 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  285. cuda/cccl/headers/include/cuda/__memory/address_space.h +211 -0
  286. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  289. cuda/cccl/headers/include/cuda/__memory/check_address.h +106 -0
  290. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  291. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  292. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  293. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  294. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  295. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +69 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +654 -0
  299. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  300. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  301. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  302. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  303. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  304. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  412. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  413. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  414. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  415. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +97 -0
  416. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  417. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  418. cuda/cccl/headers/include/cuda/__stream/stream.h +142 -0
  419. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +296 -0
  420. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  421. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  422. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  423. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  424. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  425. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +521 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +78 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  443. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  444. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  445. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  446. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  447. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  448. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  449. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  450. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  451. cuda/cccl/headers/include/cuda/access_property +26 -0
  452. cuda/cccl/headers/include/cuda/algorithm +27 -0
  453. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  454. cuda/cccl/headers/include/cuda/atomic +27 -0
  455. cuda/cccl/headers/include/cuda/barrier +267 -0
  456. cuda/cccl/headers/include/cuda/bit +29 -0
  457. cuda/cccl/headers/include/cuda/cmath +36 -0
  458. cuda/cccl/headers/include/cuda/devices +20 -0
  459. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  460. cuda/cccl/headers/include/cuda/functional +32 -0
  461. cuda/cccl/headers/include/cuda/iterator +38 -0
  462. cuda/cccl/headers/include/cuda/latch +27 -0
  463. cuda/cccl/headers/include/cuda/mdspan +28 -0
  464. cuda/cccl/headers/include/cuda/memory +34 -0
  465. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  466. cuda/cccl/headers/include/cuda/numeric +29 -0
  467. cuda/cccl/headers/include/cuda/pipeline +578 -0
  468. cuda/cccl/headers/include/cuda/ptx +128 -0
  469. cuda/cccl/headers/include/cuda/semaphore +31 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  566. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  567. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  568. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  569. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  570. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  588. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  589. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  590. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  591. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  592. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  593. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  594. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  595. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  601. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  602. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
  603. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  604. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
  605. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +146 -0
  626. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  627. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  628. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  629. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  630. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  631. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  632. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  633. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  634. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
  635. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  636. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
  637. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  638. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  639. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  640. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  641. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  642. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  643. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  644. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  645. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  646. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  647. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
  648. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +260 -0
  649. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  650. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  656. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  657. cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
  658. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  659. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  660. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  661. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  662. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  663. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  664. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  665. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +322 -0
  666. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +321 -0
  667. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  668. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  669. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  670. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  671. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  672. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  673. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  674. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  675. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  676. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  677. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
  678. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  679. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  680. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  681. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  682. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  683. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  684. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  685. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  686. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  687. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  689. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  690. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  691. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  692. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  694. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  695. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  696. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  697. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  698. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  699. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  700. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +58 -0
  701. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  702. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  703. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  704. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +146 -0
  705. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  706. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  707. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  708. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  709. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1963 -0
  710. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  711. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  712. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +172 -0
  713. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  714. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  715. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  716. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  717. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +374 -0
  718. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  719. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +113 -0
  720. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  721. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  722. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +72 -0
  723. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  724. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  725. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  726. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  727. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  728. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  729. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  730. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  731. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  732. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  733. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  734. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  735. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  736. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  737. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  738. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  739. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  740. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  741. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  742. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  743. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  744. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  745. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  746. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  747. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  748. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  749. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  750. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  751. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  752. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  753. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  754. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  755. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  756. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  757. cuda/cccl/headers/include/cuda/std/__functional/function.h +1279 -0
  758. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  759. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  760. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  761. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  762. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  763. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
  764. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  765. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  766. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  767. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  768. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  769. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
  775. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  776. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  777. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  778. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  779. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  780. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  781. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  782. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  783. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  784. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +90 -0
  785. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  786. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  787. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  788. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  789. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  790. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  791. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  792. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  793. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  794. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  795. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  796. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +122 -0
  797. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  798. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  799. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  800. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
  801. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  802. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  803. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  804. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  805. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  807. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  808. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  809. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +150 -0
  810. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  811. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  812. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  813. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  814. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  815. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  816. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  817. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
  818. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  819. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +433 -0
  820. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
  834. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  835. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  836. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  837. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  838. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  839. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  840. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  841. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  842. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  843. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +138 -0
  844. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  846. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
  847. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  848. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  849. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  850. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +499 -0
  851. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  852. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  853. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  854. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  855. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  856. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  857. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  858. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  859. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  860. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  861. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +552 -0
  862. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  863. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  864. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  865. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  866. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  867. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  868. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
  869. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  870. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  871. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +682 -0
  872. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +767 -0
  873. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  874. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  875. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  876. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  877. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  878. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  879. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  880. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  881. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  882. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  883. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  884. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  885. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  886. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  887. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  888. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  889. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  890. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  891. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  892. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  893. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  894. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  895. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  896. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  897. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  898. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +432 -0
  899. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  900. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  901. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  902. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  903. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  904. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  905. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  906. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  907. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  908. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  909. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +314 -0
  910. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  911. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  912. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  913. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  914. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  915. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  916. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  917. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  918. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  919. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  920. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  921. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  922. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  923. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  924. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  925. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  926. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  927. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  928. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  929. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  930. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  935. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  936. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  937. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  938. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  939. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  940. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  941. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  942. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  943. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  944. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  945. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  946. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  947. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  948. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  949. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  950. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  951. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
  952. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +218 -0
  953. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  954. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  955. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  956. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  957. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  958. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  959. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +291 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  973. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  974. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  975. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  976. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  977. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  978. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  979. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  980. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  981. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  983. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  984. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1100. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1101. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1102. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1103. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1104. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  1105. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1106. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1107. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1108. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1109. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  1110. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1111. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1112. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1113. cuda/cccl/headers/include/cuda/std/__utility/pair.h +797 -0
  1114. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1115. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1116. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1117. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1118. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1119. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1120. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1121. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1122. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1123. cuda/cccl/headers/include/cuda/std/array +518 -0
  1124. cuda/cccl/headers/include/cuda/std/atomic +818 -0
  1125. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1126. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1127. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1128. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1129. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1130. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1131. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1132. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1133. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1134. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1135. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1136. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1137. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1138. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1139. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1140. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1141. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1142. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
  1143. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1722 -0
  1144. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3630 -0
  1145. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +520 -0
  1146. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1147. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1148. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1149. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2142 -0
  1150. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1151. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1152. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1153. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1154. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1155. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1156. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1157. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1158. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1159. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1160. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1161. cuda/cccl/headers/include/cuda/std/numbers +342 -0
  1162. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1163. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1164. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1165. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1166. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1167. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1168. cuda/cccl/headers/include/cuda/std/span +628 -0
  1169. cuda/cccl/headers/include/cuda/std/string_view +799 -0
  1170. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1171. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1172. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1173. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1174. cuda/cccl/headers/include/cuda/std/version +245 -0
  1175. cuda/cccl/headers/include/cuda/stream +31 -0
  1176. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1177. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1178. cuda/cccl/headers/include/cuda/utility +27 -0
  1179. cuda/cccl/headers/include/cuda/version +16 -0
  1180. cuda/cccl/headers/include/cuda/warp +28 -0
  1181. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1182. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1183. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1184. cuda/cccl/headers/include/nv/target +235 -0
  1185. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1186. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1187. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1188. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1189. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1190. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1191. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1192. cuda/cccl/headers/include/thrust/count.h +245 -0
  1193. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1194. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1195. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1196. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1197. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1198. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1199. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1200. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1201. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1202. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1203. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1204. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1205. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1206. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1207. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1208. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1209. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1210. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +79 -0
  1211. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1212. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1213. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1214. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1215. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1216. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1217. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1218. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1219. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1220. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1221. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1222. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1223. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1224. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1225. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1226. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1227. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1228. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1229. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1230. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1237. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1238. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1239. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1240. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1241. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1242. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1243. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1244. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1245. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1246. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1247. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1248. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1249. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1250. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1251. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1252. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1253. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1254. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1255. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1256. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1257. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1258. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1259. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1260. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1261. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1262. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1263. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1264. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1265. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1266. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1267. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1268. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1269. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1270. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1271. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1272. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1273. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1274. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1275. cuda/cccl/headers/include/thrust/detail/internal_functional.h +293 -0
  1276. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1277. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1278. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1279. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1280. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1281. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1282. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1283. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1284. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1285. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1286. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1287. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1288. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1289. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1290. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1291. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1292. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1293. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1294. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1295. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1296. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1297. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1298. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1299. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1300. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1301. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1302. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1303. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1304. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1305. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1306. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1307. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1308. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1309. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1310. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1311. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1312. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1313. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1314. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1315. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1316. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1317. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1318. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1320. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1321. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1322. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1324. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1325. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1330. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1331. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1332. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1333. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1334. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1335. cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
  1336. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
  1337. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1338. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1339. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1340. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1341. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1342. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1343. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1344. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1345. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1346. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1347. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1348. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1349. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1350. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1351. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1352. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1353. cuda/cccl/headers/include/thrust/find.h +382 -0
  1354. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1355. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1356. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1357. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1358. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1359. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1360. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1361. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1362. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1363. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1364. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1365. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1366. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1367. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1377. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1378. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1379. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1380. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1381. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +311 -0
  1382. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1383. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1384. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1385. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1386. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1387. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1388. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1389. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1390. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1391. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1392. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1393. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1394. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1395. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1396. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1397. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1398. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1399. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1400. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1401. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1402. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1403. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1404. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1405. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1406. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1407. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1408. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1409. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1410. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1411. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1412. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1413. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1414. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1415. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1416. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1417. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1418. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1419. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1420. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1421. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1430. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1431. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1432. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1433. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1434. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1435. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1436. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1437. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1438. cuda/cccl/headers/include/thrust/random.h +120 -0
  1439. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1440. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1441. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1442. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1443. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1444. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1445. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1446. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1447. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1448. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1449. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1450. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1451. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1452. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1453. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1454. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1455. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +158 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1501. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1502. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1503. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1504. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1505. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +609 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +92 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +782 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1738 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +415 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +92 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +73 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1755. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +157 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1822. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +157 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +54 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/.gitignore +4 -0
  1912. cuda/cccl/parallel/experimental/__init__.py +75 -0
  1913. cuda/cccl/parallel/experimental/_bindings.py +56 -0
  1914. cuda/cccl/parallel/experimental/_bindings.pyi +405 -0
  1915. cuda/cccl/parallel/experimental/_bindings_impl.pyx +1957 -0
  1916. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1917. cuda/cccl/parallel/experimental/_cccl_interop.py +396 -0
  1918. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1919. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1920. cuda/cccl/parallel/experimental/_utils/temp_storage_buffer.py +86 -0
  1921. cuda/cccl/parallel/experimental/algorithms/__init__.py +50 -0
  1922. cuda/cccl/parallel/experimental/algorithms/_histogram.py +243 -0
  1923. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +225 -0
  1924. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +312 -0
  1925. cuda/cccl/parallel/experimental/algorithms/_reduce.py +184 -0
  1926. cuda/cccl/parallel/experimental/algorithms/_scan.py +261 -0
  1927. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +257 -0
  1928. cuda/cccl/parallel/experimental/algorithms/_transform.py +308 -0
  1929. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +252 -0
  1930. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1931. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
  1932. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  1933. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
  1934. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  1935. cuda/cccl/parallel/experimental/iterators/__init__.py +21 -0
  1936. cuda/cccl/parallel/experimental/iterators/_factories.py +214 -0
  1937. cuda/cccl/parallel/experimental/iterators/_iterators.py +627 -0
  1938. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +207 -0
  1939. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1940. cuda/cccl/parallel/experimental/op.py +3 -0
  1941. cuda/cccl/parallel/experimental/struct.py +272 -0
  1942. cuda/cccl/parallel/experimental/typing.py +35 -0
  1943. cuda/cccl/py.typed +0 -0
  1944. cuda_cccl-0.1.3.2.0.dev271.dist-info/METADATA +40 -0
  1945. cuda_cccl-0.1.3.2.0.dev271.dist-info/RECORD +1947 -0
  1946. cuda_cccl-0.1.3.2.0.dev271.dist-info/WHEEL +5 -0
  1947. cuda_cccl-0.1.3.2.0.dev271.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2193 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ /**
30
+ * @file
31
+ * The cub::BlockRadixSort class provides [<em>collective</em>](../index.html#sec0) methods for radix
32
+ * sorting of items partitioned across a CUDA thread block.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include <cub/config.cuh>
38
+
39
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
40
+ # pragma GCC system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
42
+ # pragma clang system_header
43
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
44
+ # pragma system_header
45
+ #endif // no system header
46
+
47
+ #include <cub/block/block_exchange.cuh>
48
+ #include <cub/block/block_radix_rank.cuh>
49
+ #include <cub/block/radix_rank_sort_operations.cuh>
50
+ #include <cub/util_ptx.cuh>
51
+ #include <cub/util_type.cuh>
52
+
53
+ #include <cuda/std/__algorithm_>
54
+ #include <cuda/std/type_traits>
55
+
56
+ CUB_NAMESPACE_BEGIN
57
+
58
+ //! @rst
59
+ //! BlockRadixSort class provides :ref:`collective <collective-primitives>` methods for sorting
60
+ //! items partitioned across a CUDA thread block using a radix sorting method.
61
+ //!
62
+ //! .. image:: ../../img/sorting_logo.png
63
+ //! :align: center
64
+ //!
65
+ //! Overview
66
+ //! --------------------------------------------------
67
+ //!
68
+ //! The `radix sorting method <http://en.wikipedia.org/wiki/Radix_sort>`_ arranges
69
+ //! items into ascending order. It relies upon a positional representation for
70
+ //! keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
71
+ //! characters, etc.) specified from least-significant to most-significant. For a
72
+ //! given input sequence of keys and a set of rules specifying a total ordering
73
+ //! of the symbolic alphabet, the radix sorting method produces a lexicographic
74
+ //! ordering of those keys.
75
+ //!
76
+ //! @rowmajor
77
+ //!
78
+ //! Supported Types
79
+ //! --------------------------------------------------
80
+ //!
81
+ //! BlockRadixSort can sort all of the built-in C++ numeric primitive types
82
+ //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half``
83
+ //! half-precision floating-point type. User-defined types are supported as long
84
+ //! as a decomposer object is provided.
85
+ //!
86
+ //! Floating-Point Special Cases
87
+ //! --------------------------------------------------
88
+ //!
89
+ //! - Positive and negative zeros are considered equivalent, and will be treated
90
+ //! as such in the output.
91
+ //! - No special handling is implemented for NaN values; these are sorted
92
+ //! according to their bit representations after any transformations.
93
+ //!
94
+ //! Bitwise Key Transformations
95
+ //! --------------------------------------------------
96
+ //!
97
+ //! Although the direct radix sorting method can only be applied to unsigned
98
+ //! integral types, BlockRadixSort is able to sort signed and floating-point
99
+ //! types via simple bit-wise transformations that ensure lexicographic key
100
+ //! ordering.
101
+ //!
102
+ //! These transformations must be considered when restricting the
103
+ //! ``[begin_bit, end_bit)`` range, as the bitwise transformations will occur
104
+ //! before the bit-range truncation.
105
+ //!
106
+ //! Any transformations applied to the keys prior to sorting are reversed
107
+ //! while writing to the final output buffer.
108
+ //!
109
+ //! Type Specific Bitwise Transformations
110
+ //! --------------------------------------------------
111
+ //!
112
+ //! To convert the input values into a radix-sortable bitwise representation,
113
+ //! the following transformations take place prior to sorting:
114
+ //!
115
+ //! * For unsigned integral values, the keys are used directly.
116
+ //! * For signed integral values, the sign bit is inverted.
117
+ //! * For positive floating point values, the sign bit is inverted.
118
+ //! * For negative floating point values, the full key is inverted.
119
+ //!
120
+ //! No Descending Sort Transformations
121
+ //! --------------------------------------------------
122
+ //!
123
+ //! Unlike ``DeviceRadixSort``, ``BlockRadixSort`` does not invert the input key bits
124
+ //! when performing a descending sort. Instead, it has special logic to reverse
125
+ //! the order of the keys while sorting.
126
+ //!
127
+ //! Stability
128
+ //! --------------------------------------------------
129
+ //!
130
+ //! BlockRadixSort is stable. For floating-point types -0.0 and +0.0
131
+ //! are considered equal and appear in the result in the same order as they
132
+ //! appear in the input.
133
+ //!
134
+ //!
135
+ //! Performance Considerations
136
+ //! --------------------------------------------------
137
+ //!
138
+ //! * @granularity
139
+ //!
140
+ //! A Simple Example
141
+ //! --------------------------------------------------
142
+ //!
143
+ //! @blockcollective{BlockRadixSort}
144
+ //!
145
+ //! The code snippet below illustrates a sort of 512 integer keys that
146
+ //! are partitioned in a [<em>blocked arrangement</em>](../index.html#sec5sec3) across 128 threads
147
+ //! where each thread owns 4 consecutive items.
148
+ //!
149
+ //! .. tab-set-code::
150
+ //!
151
+ //! .. code-block:: c++
152
+ //!
153
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
154
+ //!
155
+ //! __global__ void kernel(...)
156
+ //! {
157
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
158
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
159
+ //!
160
+ //! // Allocate shared memory for BlockRadixSort
161
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
162
+ //!
163
+ //! // Obtain a segment of consecutive items that are blocked across threads
164
+ //! int thread_keys[4];
165
+ //! ...
166
+ //!
167
+ //! // Collectively sort the keys
168
+ //! BlockRadixSort(temp_storage).Sort(thread_keys);
169
+ //!
170
+ //! ...
171
+ //!
172
+ //! .. code-block:: python
173
+ //!
174
+ //! import cuda.cccl.cooperative.experimental as cudax
175
+ //! from pynvjitlink import patch
176
+ //! patch.patch_numba_linker(lto=True)
177
+ //!
178
+ //! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
179
+ //! block_radix_sort = cudax.block.radix_sort_keys(numba.int32, 128, 4)
180
+ //! temp_storage_bytes = block_radix_sort.temp_storage_bytes
181
+ //!
182
+ //! @cuda.jit(link=block_radix_sort.files)
183
+ //! def kernel():
184
+ //! # Allocate shared memory for radix sort
185
+ //! temp_storage = cuda.shared.array(shape=temp_storage_bytes, dtype='uint8')
186
+ //!
187
+ //! # Obtain a segment of consecutive items that are blocked across threads
188
+ //! thread_keys = cuda.local.array(shape=items_per_thread, dtype=numba.int32)
189
+ //! # ...
190
+ //!
191
+ //! # Collectively sort the keys
192
+ //! block_radix_sort(temp_storage, thread_keys)
193
+ //! # ...
194
+ //!
195
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
196
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
197
+ //! The corresponding output ``thread_keys`` in those threads will be
198
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
199
+ //!
200
+ //! Re-using dynamically allocated shared memory
201
+ //! --------------------------------------------------
202
+ //!
203
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamic shared memory with
204
+ //! BlockReduce and how to re-purpose the same memory region.
205
+ //!
206
+ //! This example can be easily adapted to the storage required by BlockRadixSort.
207
+ //! @endrst
208
+ //!
209
+ //! @tparam KeyT
210
+ //! KeyT type
211
+ //!
212
+ //! @tparam BLOCK_DIM_X
213
+ //! The thread block length in threads along the X dimension
214
+ //!
215
+ //! @tparam ITEMS_PER_THREAD
216
+ //! The number of items per thread
217
+ //!
218
+ //! @tparam ValueT
219
+ //! **[optional]** ValueT type (default: cub::NullType, which indicates a keys-only sort)
220
+ //!
221
+ //! @tparam RADIX_BITS
222
+ //! **[optional]** The number of radix bits per digit place (default: 4 bits)
223
+ //!
224
+ //! @tparam MEMOIZE_OUTER_SCAN
225
+ //! **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory
226
+ //! reads at the expense of higher register pressure (default: true for architectures SM35 and
227
+ //! newer, false otherwise).
228
+ //!
229
+ //! @tparam INNER_SCAN_ALGORITHM
230
+ //! **[optional]** The cub::BlockScanAlgorithm algorithm to use
231
+ //! (default: cub::BLOCK_SCAN_WARP_SCANS)
232
+ //!
233
+ //! @tparam SMEM_CONFIG
234
+ //! **[optional]** Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`)
235
+ //!
236
+ //! @tparam BLOCK_DIM_Y
237
+ //! **[optional]** The thread block length in threads along the Y dimension (default: 1)
238
+ //!
239
+ //! @tparam BLOCK_DIM_Z
240
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
241
+ //!
242
+ template <typename KeyT,
243
+ int BLOCK_DIM_X,
244
+ int ITEMS_PER_THREAD,
245
+ typename ValueT = NullType,
246
+ int RADIX_BITS = 4,
247
+ bool MEMOIZE_OUTER_SCAN = true,
248
+ BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
249
+ cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
250
+ int BLOCK_DIM_Y = 1,
251
+ int BLOCK_DIM_Z = 1>
252
+ class BlockRadixSort
253
+ {
254
+ private:
255
+ /******************************************************************************
256
+ * Constants and type definitions
257
+ ******************************************************************************/
258
+
259
+ enum // Compile-time constants derived from the class template parameters
260
+ {
261
+ // Total number of threads in the block: the product of the X, Y and Z block dimensions
262
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
263
+
264
+ // Non-zero when ValueT is NullType, i.e. there are no values to be trucked along with the keys
265
+ KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>,
266
+ };
267
+
268
+ // KeyT traits and unsigned bits type
269
+ using traits = detail::radix::traits_t<KeyT>;
270
+ using bit_ordered_type = typename traits::bit_ordered_type;
271
+ using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy;
272
+
273
+ /// BlockRadixRank specialization constructed by the ascending RankKeys overload
274
+ using AscendingBlockRadixRank =
275
+ BlockRadixRank<BLOCK_DIM_X,
276
+ RADIX_BITS,
277
+ false, // only argument that differs from DescendingBlockRadixRank; presumably the descending flag -- confirm against BlockRadixRank
278
+ MEMOIZE_OUTER_SCAN,
279
+ INNER_SCAN_ALGORITHM,
280
+ SMEM_CONFIG,
281
+ BLOCK_DIM_Y,
282
+ BLOCK_DIM_Z>;
283
+
284
+ /// BlockRadixRank specialization constructed by the descending RankKeys overload
285
+ using DescendingBlockRadixRank =
286
+ BlockRadixRank<BLOCK_DIM_X,
287
+ RADIX_BITS,
288
+ true, // only argument that differs from AscendingBlockRadixRank; presumably the descending flag -- confirm against BlockRadixRank
289
+ MEMOIZE_OUTER_SCAN,
290
+ INNER_SCAN_ALGORITHM,
291
+ SMEM_CONFIG,
292
+ BLOCK_DIM_Y,
293
+ BLOCK_DIM_Z>;
294
+
295
+ /// Digit extractor type handed to traits::digit_extractor when building the per-pass extractor
296
+ using fundamental_digit_extractor_t = BFEDigitExtractor<KeyT>;
297
+
298
+ /// BlockExchange specialization used to scatter keys by rank (ScatterToBlocked / ScatterToStriped)
299
+ using BlockExchangeKeys = BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>; // NOTE(review): meaning of the `false` argument is defined by BlockExchange -- confirm there
300
+
301
+ /// BlockExchange specialization used to scatter values by rank; same shape as BlockExchangeKeys but over ValueT
302
+ using BlockExchangeValues = BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
303
+
304
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
305
+ /// Shared memory storage layout type. This is a union, so all four members overlay the same bytes.
306
+ union _TempStorage
307
+ {
308
+ typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; // used by the ascending RankKeys overload (member name is spelled "asending" throughout the class)
309
+ typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; // used by the descending RankKeys overload
310
+ typename BlockExchangeKeys::TempStorage exchange_keys; // used when scattering keys by rank
311
+ typename BlockExchangeValues::TempStorage exchange_values; // used when scattering values by rank
312
+ };
313
+ #endif // _CCCL_DOXYGEN_INVOKED
314
+
315
+ /******************************************************************************
316
+ * Thread fields
317
+ ******************************************************************************/
318
+
319
+ /// Shared storage reference
320
+ _TempStorage& temp_storage;
321
+
322
+ /// Linear thread-id
323
+ unsigned int linear_tid;
324
+
325
+ /******************************************************************************
326
+ * Utility methods
327
+ ******************************************************************************/
328
+
329
+ /// Internal storage allocator: returns a reference to a function-local __shared__ _TempStorage (bound to temp_storage by the default constructor)
330
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
331
+ {
332
+ __shared__ _TempStorage private_storage; // statically declared shared-memory instance
333
+ return private_storage;
334
+ }
335
+
336
+ /// Rank keys (ascending sort): overload selected by a ::cuda::std::false_type tag in the last argument
336
+ template <class DigitExtractorT>
337
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
338
+ RankKeys(bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD], // this thread's keys, already in bit-ordered form
339
+ int (&ranks)[ITEMS_PER_THREAD], // receives one rank per key
340
+ DigitExtractorT digit_extractor, // forwarded unchanged to BlockRadixRank::RankKeys
341
+ ::cuda::std::false_type /*is_descending*/)
342
+ {
343
+ AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor); // temporary ranker bound to the ascending member of the storage union
344
+ }
346
+
347
+ /// Rank keys (descending sort): overload selected by a ::cuda::std::true_type tag in the last argument
347
+ template <class DigitExtractorT>
348
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
349
+ RankKeys(bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD], // this thread's keys, already in bit-ordered form
350
+ int (&ranks)[ITEMS_PER_THREAD], // receives one rank per key
351
+ DigitExtractorT digit_extractor, // forwarded unchanged to BlockRadixRank::RankKeys
352
+ ::cuda::std::true_type /*is_descending*/)
353
+ {
354
+ DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor); // temporary ranker bound to the descending member of the storage union
355
+ }
357
+
358
+ /// ExchangeValues (key-value sort, to-blocked arrangement): selected by the tag pair (false_type, true_type)
358
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
359
+ ValueT (&values)[ITEMS_PER_THREAD], // this thread's values, scattered in place
360
+ int (&ranks)[ITEMS_PER_THREAD], // ranks previously computed for the matching keys
361
+ ::cuda::std::false_type /*is_keys_only*/,
362
+ ::cuda::std::true_type /*is_blocked*/)
363
+ {
364
+ __syncthreads(); // exchange_values shares the _TempStorage union with exchange_keys, which the callers use immediately before this call
365
+
366
+ // Exchange values through shared memory in blocked arrangement
367
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
368
+ }
370
+
371
+ /// ExchangeValues (key-value sort, to-striped arrangement): selected by the tag pair (false_type, false_type)
371
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
372
+ ValueT (&values)[ITEMS_PER_THREAD], // this thread's values, scattered in place
373
+ int (&ranks)[ITEMS_PER_THREAD], // ranks previously computed for the matching keys
374
+ ::cuda::std::false_type /*is_keys_only*/,
375
+ ::cuda::std::false_type /*is_blocked*/)
376
+ {
377
+ __syncthreads(); // exchange_values shares the _TempStorage union with exchange_keys, which the caller uses immediately before this call
378
+
379
+ // Exchange values through shared memory in striped arrangement
380
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
381
+ }
383
+
384
+ /// ExchangeValues (keys-only sort): deliberate no-op, selected by a true_type is_keys_only tag for either arrangement
384
+ template <bool IS_BLOCKED>
385
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
386
+ ValueT (& /*values*/)[ITEMS_PER_THREAD],
387
+ int (& /*ranks*/)[ITEMS_PER_THREAD],
388
+ ::cuda::std::true_type /*is_keys_only*/,
389
+ ::cuda::std::bool_constant<IS_BLOCKED> /*is_blocked*/)
390
+ {} // empty body: no barrier and no shared-memory traffic on the keys-only path
392
+
393
+ /**
393
+ * @brief Radix-sorts keys (and optionally values) in place, blocked arrangement in and blocked arrangement out
394
+ *
395
+ * @param keys
396
+ * Keys to sort; they are reinterpreted in place as bit_ordered_type while sorting
397
+ *
398
+ * @param values
399
+ * Values to sort; only exchanged when is_keys_only is false_type
400
+ *
401
+ * @param begin_bit
402
+ * The beginning (least-significant) bit index needed for key comparison
403
+ *
404
+ * @param end_bit
405
+ * The past-the-end (most-significant) bit index needed for key comparison
406
+ *
407
+ * @param is_descending
408
+ * Tag selecting the descending (true_type) or ascending (false_type) RankKeys overload
409
+ *
410
+ * @param is_keys_only
411
+ * Tag whether is keys-only sort. The trailing decomposer argument is passed to the bit-ordered conversion and to the digit extractor.
412
+ */
413
+ template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
414
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlocked(
415
+ KeyT (&keys)[ITEMS_PER_THREAD],
416
+ ValueT (&values)[ITEMS_PER_THREAD],
417
+ int begin_bit,
418
+ int end_bit,
419
+ ::cuda::std::bool_constant<DESCENDING> is_descending,
420
+ ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
421
+ DecomposerT decomposer = {})
422
+ {
423
+ bit_ordered_type(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast<bit_ordered_type(&)[ITEMS_PER_THREAD]>(keys); // view of the same array: writes through unsigned_keys modify keys
424
+
425
+ _CCCL_PRAGMA_UNROLL_FULL()
426
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
427
+ {
428
+ unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]); // convert each key to its bit-ordered form before ranking
429
+ }
430
+
431
+ // Radix sorting passes (the exit test sits after the exchange, so at least one pass always runs)
432
+ while (true)
433
+ {
434
+ int pass_bits = ::cuda::std::min(RADIX_BITS, end_bit - begin_bit); // the final pass may cover fewer than RADIX_BITS bits
435
+ auto digit_extractor =
436
+ traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
437
+
438
+ // Rank the blocked keys
439
+ int ranks[ITEMS_PER_THREAD];
440
+ RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
441
+ begin_bit += RADIX_BITS; // advance to the next digit place; also drives the exit test below
442
+
443
+ __syncthreads(); // ranking storage and exchange_keys are members of the same union
444
+
445
+ // Exchange keys through shared memory in blocked arrangement
446
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
447
+
448
+ // Exchange values through shared memory in blocked arrangement (no-op overload when is_keys_only is true_type)
449
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::true_type());
450
+
451
+ // Quit if done
452
+ if (begin_bit >= end_bit)
453
+ {
454
+ break;
455
+ }
456
+
457
+ __syncthreads(); // the next pass ranks again through the same union storage
458
+ }
459
+
460
+ // Untwiddle bits if necessary
461
+ _CCCL_PRAGMA_UNROLL_FULL()
462
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
463
+ {
464
+ unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]); // inverse of the to_bit_ordered conversion applied above
465
+ }
466
+ }
468
+
469
+ public:
470
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
471
+
472
+ /**
472
+ * @brief Radix-sorts keys (and optionally values) in place, blocked arrangement in and striped arrangement out
473
+ *
474
+ * @param keys
475
+ * Keys to sort; they are reinterpreted in place as bit_ordered_type while sorting
476
+ *
477
+ * @param values
478
+ * Values to sort; only exchanged when is_keys_only is false_type
479
+ *
480
+ * @param begin_bit
481
+ * The beginning (least-significant) bit index needed for key comparison
482
+ *
483
+ * @param end_bit
484
+ * The past-the-end (most-significant) bit index needed for key comparison
485
+ *
486
+ * @param is_descending
487
+ * Tag selecting the descending (true_type) or ascending (false_type) RankKeys overload
488
+ *
489
+ * @param is_keys_only
490
+ * Tag whether is keys-only sort. The trailing decomposer argument is passed to the bit-ordered conversion and to the digit extractor.
491
+ */
492
+ template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
493
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
494
+ KeyT (&keys)[ITEMS_PER_THREAD],
495
+ ValueT (&values)[ITEMS_PER_THREAD],
496
+ int begin_bit,
497
+ int end_bit,
498
+ ::cuda::std::bool_constant<DESCENDING> is_descending,
499
+ ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
500
+ DecomposerT decomposer = {})
501
+ {
502
+ bit_ordered_type(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast<bit_ordered_type(&)[ITEMS_PER_THREAD]>(keys); // view of the same array: writes through unsigned_keys modify keys
503
+
504
+ _CCCL_PRAGMA_UNROLL_FULL()
505
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
506
+ {
507
+ unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]); // convert each key to its bit-ordered form before ranking
508
+ }
509
+
510
+ // Radix sorting passes (the loop only exits from the last-pass branch, so at least one pass always runs)
511
+ while (true)
512
+ {
513
+ int pass_bits = ::cuda::std::min(RADIX_BITS, end_bit - begin_bit); // the final pass may cover fewer than RADIX_BITS bits
514
+ auto digit_extractor =
515
+ traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
516
+
517
+ // Rank the blocked keys
518
+ int ranks[ITEMS_PER_THREAD];
519
+ RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
520
+ begin_bit += RADIX_BITS; // advance to the next digit place; also drives the last-pass test below
521
+
522
+ __syncthreads(); // ranking storage and exchange_keys are members of the same union
523
+
524
+ // Check if this is the last pass
525
+ if (begin_bit >= end_bit)
526
+ {
527
+ // Last pass exchanges keys through shared memory in striped arrangement
528
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
529
+
530
+ // Last pass exchanges values through shared memory in striped arrangement (no-op overload when is_keys_only is true_type)
531
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::false_type());
532
+
533
+ // Quit
534
+ break;
535
+ }
536
+
537
+ // Exchange keys through shared memory in blocked arrangement
538
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
539
+
540
+ // Exchange values through shared memory in blocked arrangement (no-op overload when is_keys_only is true_type)
541
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::true_type());
542
+
543
+ __syncthreads(); // the next pass ranks again through the same union storage
544
+ }
545
+
546
+ // Untwiddle bits if necessary
547
+ _CCCL_PRAGMA_UNROLL_FULL()
548
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
549
+ {
550
+ unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]); // inverse of the to_bit_ordered conversion applied above
551
+ }
552
+ }
554
+
555
+ #endif // _CCCL_DOXYGEN_INVOKED
556
+
557
+ /// @smemstorage{BlockRadixSort}
558
+ struct TempStorage : Uninitialized<_TempStorage> // public storage type accepted by the constructor taking TempStorage&; wraps the private _TempStorage union
559
+ {};
560
+
561
+ //! @name Collective constructors
562
+ //! @{
563
+
564
+ //! @brief Collective constructor using a private static allocation of shared memory (see PrivateStorage) as temporary storage.
565
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort()
566
+ : temp_storage(PrivateStorage()) // bind to the function-local __shared__ instance
567
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) // linear thread-id; presumably row-major over the block dimensions -- confirm against RowMajorTid
568
+ {}
569
+
570
+ /**
570
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
571
+ *
572
+ * @param[in] temp_storage
573
+ * Reference to memory allocation having layout type TempStorage; the object keeps a reference obtained from it rather than a copy
574
+ */
575
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort(TempStorage& temp_storage)
576
+ : temp_storage(temp_storage.Alias()) // the member is a _TempStorage&; Alias() presumably exposes the wrapped union -- confirm against Uninitialized
577
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) // linear thread-id; presumably row-major over the block dimensions -- confirm against RowMajorTid
578
+ {}
580
+
581
+ //! @} end member group
582
+ //! @name Sorting (blocked arrangements)
583
+ //! @{
584
+
585
+ //! @rst
586
+ //! Performs an ascending block-wide radix sort over a
587
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
588
+ //!
589
+ //! - @granularity
590
+ //! - @smemreuse
591
+ //!
592
+ //! Snippet
593
+ //! +++++++
594
+ //!
595
+ //! The code snippet below illustrates a sort of 512 integer keys that
596
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
597
+ //! where each thread owns 4 consecutive keys.
598
+ //!
599
+ //! .. code-block:: c++
600
+ //!
601
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
602
+ //!
603
+ //! __global__ void ExampleKernel(...)
604
+ //! {
605
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
606
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
607
+ //!
608
+ //! // Allocate shared memory for BlockRadixSort
609
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
610
+ //!
611
+ //! // Obtain a segment of consecutive items that are blocked across threads
612
+ //! int thread_keys[4];
613
+ //! ...
614
+ //!
615
+ //! // Collectively sort the keys
616
+ //! BlockRadixSort(temp_storage).Sort(thread_keys);
617
+ //!
618
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
619
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
620
+ //! The corresponding output ``thread_keys`` in those threads will be
621
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
622
+ //! @endrst
623
+ //!
624
+ //! @param[in,out] keys
625
+ //! Keys to sort
626
+ //!
627
+ //! @param[in] begin_bit
628
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
629
+ //!
630
+ //! @param[in] end_bit
631
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
632
+ _CCCL_DEVICE _CCCL_FORCEINLINE void // ascending keys-only sort over [begin_bit, end_bit); forwards to SortBlocked
633
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
634
+ {
635
+ NullType values[ITEMS_PER_THREAD]; // placeholder values; SortBlocked takes ValueT(&)[ITEMS_PER_THREAD], so this overload only binds when ValueT is NullType
636
+
637
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type selects the ascending ranking overload
638
+ }
639
+
640
+ //! @rst
641
+ //! Performs an ascending block-wide radix sort over a
642
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
643
+ //!
644
+ //! * @granularity
645
+ //! * @smemreuse
646
+ //!
647
+ //! Snippet
648
+ //! ==========================================================================
649
+ //!
650
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
651
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
652
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
653
+ //! tuple of references to relevant members of the key.
654
+ //!
655
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
656
+ //! :language: c++
657
+ //! :dedent:
658
+ //! :start-after: example-begin custom-type
659
+ //! :end-before: example-end custom-type
660
+ //!
661
+ //! The code snippet below illustrates a sort of 2 keys that
662
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
663
+ //! where each thread owns 1 key.
664
+ //!
665
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
666
+ //! :language: c++
667
+ //! :dedent:
668
+ //! :start-after: example-begin keys-bits
669
+ //! :end-before: example-end keys-bits
670
+ //!
671
+ //! @endrst
672
+ //!
673
+ //! @tparam DecomposerT
674
+ //! **[inferred]** Type of a callable object responsible for decomposing a
675
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
676
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
677
+ //! The leftmost element of the tuple is considered the most significant.
678
+ //! The call operator must not modify members of the key.
679
+ //!
680
+ //! @param[in,out] keys
681
+ //! Keys to sort
682
+ //!
683
+ //! @param decomposer
684
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
685
+ //! references to its constituent arithmetic types. The leftmost element of
686
+ //! the tuple is considered the most significant. The call operator must not
687
+ //! modify members of the key.
688
+ //!
689
+ //! @param[in] begin_bit
690
+ //! The least-significant bit index (inclusive) needed for
691
+ //! key comparison
692
+ //!
693
+ //! @param[in] end_bit
694
+ //! The most-significant bit index (exclusive) needed for key
695
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
696
+ template <class DecomposerT>
696
+ _CCCL_DEVICE _CCCL_FORCEINLINE // ascending keys-only sort with a user decomposer over [begin_bit, end_bit)
697
+ ::cuda::std::enable_if_t< // return type is void; overload is disabled when DecomposerT converts to int
698
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
699
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
700
+ {
701
+ NullType values[ITEMS_PER_THREAD]; // placeholder values; SortBlocked takes ValueT(&)[ITEMS_PER_THREAD], so this overload only binds when ValueT is NullType
702
+
703
+ SortBlocked(
704
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // false_type selects the ascending ranking overload
705
+ }
707
+
708
+ //! @rst
709
+ //! Performs an ascending block-wide radix sort over a
710
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
711
+ //!
712
+ //! * @granularity
713
+ //! * @smemreuse
714
+ //!
715
+ //! Snippet
716
+ //! ==========================================================================
717
+ //!
718
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
719
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
720
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
721
+ //! tuple of references to relevant members of the key.
722
+ //!
723
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
724
+ //! :language: c++
725
+ //! :dedent:
726
+ //! :start-after: example-begin custom-type
727
+ //! :end-before: example-end custom-type
728
+ //!
729
+ //! The code snippet below illustrates a sort of 6 keys that
730
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
731
+ //! where each thread owns 3 consecutive keys.
732
+ //!
733
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
734
+ //! :language: c++
735
+ //! :dedent:
736
+ //! :start-after: example-begin keys
737
+ //! :end-before: example-end keys
738
+ //!
739
+ //! @endrst
740
+ //!
741
+ //! @tparam DecomposerT
742
+ //! **[inferred]** Type of a callable object responsible for decomposing a
743
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
744
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
745
+ //! The leftmost element of the tuple is considered the most significant.
746
+ //! The call operator must not modify members of the key.
747
+ //!
748
+ //! @param[in,out] keys
749
+ //! Keys to sort
750
+ //!
751
+ //! @param decomposer
752
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
753
+ //! references to its constituent arithmetic types. The leftmost element of
754
+ //! the tuple is considered the most significant. The call operator must not
755
+ //! modify members of the key.
756
+ template <class DecomposerT>
756
+ _CCCL_DEVICE _CCCL_FORCEINLINE // ascending keys-only sort with a user decomposer over the default bit range
757
+ ::cuda::std::enable_if_t< // return type is void; overload is disabled when DecomposerT converts to int
758
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
759
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
760
+ {
761
+ Sort(keys, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer)); // delegate to the explicit bit-range overload: begin at bit 0, end bit supplied by the key traits
762
+ }
764
+
765
+ //! @rst
766
+ //! Performs an ascending block-wide radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
767
+ //! of keys and values.
768
+ //!
769
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
770
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
771
+ //! with a temporary value array that enumerates the key indices. The reordered indices
772
+ //! can then be used as a gather-vector for exchanging other associated tile data through
773
+ //! shared memory.
774
+ //! - @granularity
775
+ //! - @smemreuse
776
+ //!
777
+ //! Snippet
778
+ //! +++++++
779
+ //!
780
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
781
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
782
+ //! where each thread owns 4 consecutive pairs.
783
+ //!
784
+ //! .. code-block:: c++
785
+ //!
786
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
787
+ //!
788
+ //! __global__ void ExampleKernel(...)
789
+ //! {
790
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
791
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
792
+ //!
793
+ //! // Allocate shared memory for BlockRadixSort
794
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
795
+ //!
796
+ //! // Obtain a segment of consecutive items that are blocked across threads
797
+ //! int thread_keys[4];
798
+ //! int thread_values[4];
799
+ //! ...
800
+ //!
801
+ //! // Collectively sort the keys and values among block threads
802
+ //! BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
803
+ //!
804
+ //!
804
+ //!
806
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
807
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The
808
+ //! corresponding output ``thread_keys`` in those threads will be
809
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
810
+ //!
811
+ //! @endrst
812
+ //!
813
+ //! @param[in,out] keys
814
+ //! Keys to sort
815
+ //!
816
+ //! @param[in,out] values
817
+ //! Values to sort
818
+ //!
819
+ //! @param[in] begin_bit
820
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
821
+ //!
822
+ //! @param[in] end_bit
823
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
824
+ _CCCL_DEVICE _CCCL_FORCEINLINE void // ascending key-value sort over [begin_bit, end_bit); forwards to SortBlocked
824
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD],
825
+ ValueT (&values)[ITEMS_PER_THREAD],
826
+ int begin_bit = 0,
827
+ int end_bit = sizeof(KeyT) * 8)
828
+ {
829
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type selects ascending; the KEYS_ONLY tag decides whether values are actually exchanged
830
+ }
832
+
833
+ //! @rst
834
+ //! Performs an ascending block-wide radix sort over a
835
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
836
+ //!
837
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
838
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
839
+ //! with a temporary value array that enumerates the key indices. The reordered indices
840
+ //! can then be used as a gather-vector for exchanging other associated tile data through
841
+ //! shared memory.
842
+ //! * @granularity
843
+ //! * @smemreuse
844
+ //!
845
+ //! Snippet
846
+ //! ==========================================================================
847
+ //!
848
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
849
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
850
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
851
+ //! tuple of references to relevant members of the key.
852
+ //!
853
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
854
+ //! :language: c++
855
+ //! :dedent:
856
+ //! :start-after: example-begin custom-type
857
+ //! :end-before: example-end custom-type
858
+ //!
859
+ //! The code snippet below illustrates a sort of 2 keys and values that
860
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
861
+ //! where each thread owns 1 pair.
862
+ //!
863
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
864
+ //! :language: c++
865
+ //! :dedent:
866
+ //! :start-after: example-begin pairs-bits
867
+ //! :end-before: example-end pairs-bits
868
+ //!
869
+ //! @endrst
870
+ //!
871
+ //! @tparam DecomposerT
872
+ //! **[inferred]** Type of a callable object responsible for decomposing a
873
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
874
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
875
+ //! The leftmost element of the tuple is considered the most significant.
876
+ //! The call operator must not modify members of the key.
877
+ //!
878
+ //! @param[in,out] keys
879
+ //! Keys to sort
880
+ //!
881
+ //! @param[in,out] values
882
+ //! Values to sort
883
+ //!
884
+ //! @param decomposer
885
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
886
+ //! references to its constituent arithmetic types. The leftmost element of
887
+ //! the tuple is considered the most significant. The call operator must not
888
+ //! modify members of the key.
889
+ //!
890
+ //! @param[in] begin_bit
891
+ //! The least-significant bit index (inclusive) needed for
892
+ //! key comparison
893
+ //!
894
+ //! @param[in] end_bit
895
+ //! The most-significant bit index (exclusive) needed for key
896
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
897
+ template <class DecomposerT>
897
+ _CCCL_DEVICE _CCCL_FORCEINLINE // ascending key-value sort with a user decomposer over [begin_bit, end_bit)
898
+ ::cuda::std::enable_if_t< // return type is void; overload is disabled when DecomposerT converts to int
899
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
900
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD],
901
+ ValueT (&values)[ITEMS_PER_THREAD],
902
+ DecomposerT decomposer,
903
+ int begin_bit,
904
+ int end_bit)
905
+ {
906
+ SortBlocked(
907
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // false_type selects ascending; the KEYS_ONLY tag decides whether values are actually exchanged
908
+ }
910
+
911
+ //! @rst
912
+ //! Performs an ascending block-wide radix sort over a
913
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
914
+ //!
915
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
916
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
917
+ //! with a temporary value array that enumerates the key indices. The reordered indices
918
+ //! can then be used as a gather-vector for exchanging other associated tile data through
919
+ //! shared memory.
920
+ //! * @granularity
921
+ //! * @smemreuse
922
+ //!
923
+ //! Snippet
924
+ //! ==========================================================================
925
+ //!
926
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
927
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
928
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
929
+ //! tuple of references to relevant members of the key.
930
+ //!
931
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
932
+ //! :language: c++
933
+ //! :dedent:
934
+ //! :start-after: example-begin custom-type
935
+ //! :end-before: example-end custom-type
936
+ //!
937
+ //! The code snippet below illustrates a sort of 6 keys and values that
938
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
939
+ //! where each thread owns 3 consecutive pairs.
940
+ //!
941
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
942
+ //! :language: c++
943
+ //! :dedent:
944
+ //! :start-after: example-begin pairs
945
+ //! :end-before: example-end pairs
946
+ //!
947
+ //! @endrst
948
+ //!
949
+ //! @tparam DecomposerT
950
+ //! **[inferred]** Type of a callable object responsible for decomposing a
951
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
952
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
953
+ //! The leftmost element of the tuple is considered the most significant.
954
+ //! The call operator must not modify members of the key.
955
+ //!
956
+ //! @param[in,out] keys
957
+ //! Keys to sort
958
+ //!
959
+ //! @param[in,out] values
960
+ //! Values to sort
961
+ //!
962
+ //! @param decomposer
963
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
964
+ //! references to its constituent arithmetic types. The leftmost element of
965
+ //! the tuple is considered the most significant. The call operator must not
966
+ //! modify members of the key.
967
+ template <class DecomposerT>
967
+ _CCCL_DEVICE _CCCL_FORCEINLINE // ascending key-value sort with a user decomposer over the default bit range
968
+ ::cuda::std::enable_if_t< // return type is void; overload is disabled when DecomposerT converts to int
969
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
970
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
971
+ {
972
+ Sort(keys, values, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer)); // delegate to the explicit bit-range overload: begin at bit 0, end bit supplied by the key traits
973
+ }
975
+
976
+ //! @rst
977
+ //! Performs a descending block-wide radix sort over a :ref:`blocked arrangement <flexible-data-arrangement>`
978
+ //! of keys.
979
+ //!
980
+ //! - @granularity
981
+ //! - @smemreuse
982
+ //!
983
+ //! Snippet
984
+ //! +++++++
985
+ //!
986
+ //! The code snippet below illustrates a sort of 512 integer keys that
987
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
988
+ //! where each thread owns 4 consecutive keys.
989
+ //!
990
+ //! .. code-block:: c++
991
+ //!
992
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
993
+ //!
994
+ //! __global__ void ExampleKernel(...)
995
+ //! {
996
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
997
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
998
+ //!
999
+ //! // Allocate shared memory for BlockRadixSort
1000
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1001
+ //!
1002
+ //! // Obtain a segment of consecutive items that are blocked across threads
1003
+ //! int thread_keys[4];
1004
+ //! ...
1005
+ //!
1006
+ //! // Collectively sort the keys
1007
+ //! BlockRadixSort(temp_storage).SortDescending(thread_keys);
1008
+ //!
1009
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1010
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1011
+ //! The corresponding output ``thread_keys`` in those threads will be
1012
+ //! ``{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }``.
1013
+ //!
1014
+ //! @endrst
1015
+ //!
1016
+ //! @param[in,out] keys
1017
+ //! Keys to sort
1018
+ //!
1019
+ //! @param[in] begin_bit
1020
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1021
+ //!
1022
+ //! @param[in] end_bit
1023
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1024
+ _CCCL_DEVICE _CCCL_FORCEINLINE void // descending keys-only sort over [begin_bit, end_bit); forwards to SortBlocked
1024
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
1025
+ {
1026
+ NullType values[ITEMS_PER_THREAD]; // placeholder values; SortBlocked takes ValueT(&)[ITEMS_PER_THREAD], so this overload only binds when ValueT is NullType
1027
+
1028
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type selects the descending ranking overload
1029
+ }
1031
+
1032
+ //! @rst
1033
+ //! Performs a descending block-wide radix sort over a
1034
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
1035
+ //!
1036
+ //! * @granularity
1037
+ //! * @smemreuse
1038
+ //!
1039
+ //! Snippet
1040
+ //! ==========================================================================
1041
+ //!
1042
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1043
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1044
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1045
+ //! tuple of references to relevant members of the key.
1046
+ //!
1047
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1048
+ //! :language: c++
1049
+ //! :dedent:
1050
+ //! :start-after: example-begin custom-type
1051
+ //! :end-before: example-end custom-type
1052
+ //!
1053
+ //! The code snippet below illustrates a sort of 2 keys that
1054
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1055
+ //! where each thread owns 1 key.
1056
+ //!
1057
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1058
+ //! :language: c++
1059
+ //! :dedent:
1060
+ //! :start-after: example-begin keys-descending-bits
1061
+ //! :end-before: example-end keys-descending-bits
1062
+ //!
1063
+ //! @endrst
1064
+ //!
1065
+ //! @tparam DecomposerT
1066
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1067
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1068
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1069
+ //! The leftmost element of the tuple is considered the most significant.
1070
+ //! The call operator must not modify members of the key.
1071
+ //!
1072
+ //! @param[in,out] keys
1073
+ //! Keys to sort
1074
+ //!
1075
+ //! @param decomposer
1076
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1077
+ //! references to its constituent arithmetic types. The leftmost element of
1078
+ //! the tuple is considered the most significant. The call operator must not
1079
+ //! modify members of the key.
1080
+ //!
1081
+ //! @param[in] begin_bit
1082
+ //! The least-significant bit index (inclusive) needed for
1083
+ //! key comparison
1084
+ //!
1085
+ //! @param[in] end_bit
1086
+ //! The most-significant bit index (exclusive) needed for key
1087
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1088
+ template <class DecomposerT>
1089
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1090
+ ::cuda::std::enable_if_t< //
1091
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Returns void; the overload is removed when DecomposerT converts to int, so an integer bit index in this position cannot select it.
1092
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
1093
+ { // Keys-only descending sort of decomposed keys over the caller-supplied bit range.
1094
+ NullType values[ITEMS_PER_THREAD]; // Placeholder: this overload has no values, but SortBlocked takes a values array.
1095
+
1096
+ SortBlocked( // Forwards to the decomposer-aware SortBlocked overload (declared outside this excerpt).
1097
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // true_type sits where the ascending SortBlockedToStriped overloads pass false_type.
1098
+ }
1099
+
1100
+ //! @rst
1101
+ //! Performs a descending block-wide radix sort over a
1102
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
1103
+ //!
1104
+ //! * @granularity
1105
+ //! * @smemreuse
1106
+ //!
1107
+ //! Snippet
1108
+ //! ==========================================================================
1109
+ //!
1110
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1111
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1112
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1113
+ //! tuple of references to relevant members of the key.
1114
+ //!
1115
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1116
+ //! :language: c++
1117
+ //! :dedent:
1118
+ //! :start-after: example-begin custom-type
1119
+ //! :end-before: example-end custom-type
1120
+ //!
1121
+ //! The code snippet below illustrates a sort of 6 keys that
1122
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1123
+ //! where each thread owns 3 consecutive keys.
1124
+ //!
1125
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1126
+ //! :language: c++
1127
+ //! :dedent:
1128
+ //! :start-after: example-begin keys-descending
1129
+ //! :end-before: example-end keys-descending
1130
+ //!
1131
+ //! @endrst
1132
+ //!
1133
+ //! @tparam DecomposerT
1134
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1135
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1136
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1137
+ //! The leftmost element of the tuple is considered the most significant.
1138
+ //! The call operator must not modify members of the key.
1139
+ //!
1140
+ //! @param[in,out] keys
1141
+ //! Keys to sort
1142
+ //!
1143
+ //! @param decomposer
1144
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1145
+ //! references to its constituent arithmetic types. The leftmost element of
1146
+ //! the tuple is considered the most significant. The call operator must not
1147
+ //! modify members of the key.
1148
+ template <class DecomposerT>
1149
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1150
+ ::cuda::std::enable_if_t< //
1151
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Returns void; the overload is removed when DecomposerT converts to int, so an integer bit index in this position cannot select it.
1152
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
1153
+ { // Keys-only descending sort of decomposed keys over the default bit range.
1154
+ NullType values[ITEMS_PER_THREAD]; // Placeholder: this overload has no values, but SortBlocked takes a values array.
1155
+
1156
+ SortBlocked( // Forwards to the decomposer-aware SortBlocked overload (declared outside this excerpt).
1157
+ keys,
1158
+ values,
1159
+ 0, // begin_bit
1160
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit: default obtained from the key traits for this decomposer
1161
+ ::cuda::std::true_type(), // Tag in the slot where the ascending SortBlockedToStriped overloads pass false_type.
1162
+ detail::bool_constant_v<KEYS_ONLY>, // KEYS_ONLY is declared outside this excerpt.
1163
+ decomposer);
1164
+ }
1165
+
1166
+ //! @rst
1167
+ //! Performs a descending block-wide radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1168
+ //! of keys and values.
1169
+ //!
1170
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1171
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1172
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1173
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1174
+ //! shared memory.
1175
+ //! - @granularity
1176
+ //! - @smemreuse
1177
+ //!
1178
+ //! Snippet
1179
+ //! +++++++
1180
+ //!
1181
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1182
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1183
+ //! where each thread owns 4 consecutive pairs.
1184
+ //!
1185
+ //! .. code-block:: c++
1186
+ //!
1187
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1188
+ //!
1189
+ //! __global__ void ExampleKernel(...)
1190
+ //! {
1191
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1192
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1193
+ //!
1194
+ //! // Allocate shared memory for BlockRadixSort
1195
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1196
+ //!
1197
+ //! // Obtain a segment of consecutive items that are blocked across threads
1198
+ //! int thread_keys[4];
1199
+ //! int thread_values[4];
1200
+ //! ...
1201
+ //!
1202
+ //! // Collectively sort the keys and values among block threads
1203
+ //! BlockRadixSort(temp_storage).SortDescending(thread_keys, thread_values);
1204
+ //!
1205
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1206
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The
1207
+ //! corresponding output ``thread_keys`` in those threads will be
1208
+ //! ``{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }``.
1209
+ //!
1210
+ //! @endrst
1211
+ //!
1212
+ //! @param[in,out] keys
1213
+ //! Keys to sort
1214
+ //!
1215
+ //! @param[in,out] values
1216
+ //! Values to sort
1217
+ //!
1218
+ //! @param[in] begin_bit
1219
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1220
+ //!
1221
+ //! @param[in] end_bit
1222
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1223
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescending(
1224
+ KeyT (&keys)[ITEMS_PER_THREAD],
1225
+ ValueT (&values)[ITEMS_PER_THREAD],
1226
+ int begin_bit = 0,
1227
+ int end_bit = sizeof(KeyT) * 8) // Default bit range [0, sizeof(KeyT) * 8) covers every bit of the key.
1228
+ { // Key-value descending sort: the caller's values array is forwarded as-is.
1229
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type sits where the ascending SortBlockedToStriped overloads pass false_type; KEYS_ONLY is declared outside this excerpt.
1230
+ }
1231
+
1232
+ //! @rst
1233
+ //! Performs a descending block-wide radix sort over a
1234
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
1235
+ //!
1236
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1237
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1238
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1239
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1240
+ //! shared memory.
1241
+ //! * @granularity
1242
+ //! * @smemreuse
1243
+ //!
1244
+ //! Snippet
1245
+ //! ==========================================================================
1246
+ //!
1247
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1248
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1249
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1250
+ //! tuple of references to relevant members of the key.
1251
+ //!
1252
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1253
+ //! :language: c++
1254
+ //! :dedent:
1255
+ //! :start-after: example-begin custom-type
1256
+ //! :end-before: example-end custom-type
1257
+ //!
1258
+ //! The code snippet below illustrates a sort of 2 pairs that
1259
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1260
+ //! where each thread owns 1 pair.
1261
+ //!
1262
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1263
+ //! :language: c++
1264
+ //! :dedent:
1265
+ //! :start-after: example-begin pairs-descending-bits
1266
+ //! :end-before: example-end pairs-descending-bits
1267
+ //!
1268
+ //! @endrst
1269
+ //!
1270
+ //! @tparam DecomposerT
1271
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1272
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1273
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1274
+ //! The leftmost element of the tuple is considered the most significant.
1275
+ //! The call operator must not modify members of the key.
1276
+ //!
1277
+ //! @param[in,out] keys
1278
+ //! Keys to sort
1279
+ //!
1280
+ //! @param[in,out] values
1281
+ //! Values to sort
1282
+ //!
1283
+ //! @param decomposer
1284
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1285
+ //! references to its constituent arithmetic types. The leftmost element of
1286
+ //! the tuple is considered the most significant. The call operator must not
1287
+ //! modify members of the key.
1288
+ //!
1289
+ //! @param[in] begin_bit
1290
+ //! The least-significant bit index (inclusive) needed for
1291
+ //! key comparison
1292
+ //!
1293
+ //! @param[in] end_bit
1294
+ //! The most-significant bit index (exclusive) needed for key
1295
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1296
+ template <class DecomposerT>
1297
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1298
+ ::cuda::std::enable_if_t< //
1299
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Returns void; the overload is removed when DecomposerT converts to int, so an integer bit index in this position cannot select it.
1300
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD],
1301
+ ValueT (&values)[ITEMS_PER_THREAD],
1302
+ DecomposerT decomposer,
1303
+ int begin_bit,
1304
+ int end_bit)
1305
+ { // Key-value descending sort of decomposed keys over the caller-supplied bit range.
1306
+ SortBlocked( // Forwards to the decomposer-aware SortBlocked overload (declared outside this excerpt).
1307
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // true_type sits where the ascending SortBlockedToStriped overloads pass false_type.
1308
+ }
1309
+
1310
+ //! @rst
1311
+ //! Performs a descending block-wide radix sort over a
1312
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
1313
+ //!
1314
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1315
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1316
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1317
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1318
+ //! shared memory.
1319
+ //! * @granularity
1320
+ //! * @smemreuse
1321
+ //!
1322
+ //! Snippet
1323
+ //! ==========================================================================
1324
+ //!
1325
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1326
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1327
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1328
+ //! tuple of references to relevant members of the key.
1329
+ //!
1330
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1331
+ //! :language: c++
1332
+ //! :dedent:
1333
+ //! :start-after: example-begin custom-type
1334
+ //! :end-before: example-end custom-type
1335
+ //!
1336
+ //! The code snippet below illustrates a sort of 6 keys and values that
1337
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1338
+ //! where each thread owns 3 consecutive pairs.
1339
+ //!
1340
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1341
+ //! :language: c++
1342
+ //! :dedent:
1343
+ //! :start-after: example-begin pairs-descending
1344
+ //! :end-before: example-end pairs-descending
1345
+ //!
1346
+ //! @endrst
1347
+ //!
1348
+ //! @tparam DecomposerT
1349
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1350
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1351
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1352
+ //! The leftmost element of the tuple is considered the most significant.
1353
+ //! The call operator must not modify members of the key.
1354
+ //!
1355
+ //! @param[in,out] keys
1356
+ //! Keys to sort
1357
+ //!
1358
+ //! @param[in,out] values
1359
+ //! Values to sort
1360
+ //!
1361
+ //! @param decomposer
1362
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1363
+ //! references to its constituent arithmetic types. The leftmost element of
1364
+ //! the tuple is considered the most significant. The call operator must not
1365
+ //! modify members of the key.
1366
+ template <class DecomposerT>
1367
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1368
+ ::cuda::std::enable_if_t< //
1369
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Returns void; the overload is removed when DecomposerT converts to int, so an integer bit index in this position cannot select it.
1370
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
1371
+ { // Key-value descending sort of decomposed keys over the default bit range.
1372
+ SortBlocked( // Forwards to the decomposer-aware SortBlocked overload (declared outside this excerpt).
1373
+ keys,
1374
+ values,
1375
+ 0, // begin_bit
1376
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit: default obtained from the key traits for this decomposer
1377
+ ::cuda::std::true_type(), // Tag in the slot where the ascending SortBlockedToStriped overloads pass false_type.
1378
+ detail::bool_constant_v<KEYS_ONLY>, // KEYS_ONLY is declared outside this excerpt.
1379
+ decomposer);
1380
+ }
1381
+
1382
+ //! @} end member group
1383
+ //! @name Sorting (blocked arrangement -> striped arrangement)
1384
+ //! @{
1385
+
1386
+ //! @rst
1387
+ //! Performs an ascending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>` of keys,
1388
+ //! leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1389
+ //!
1390
+ //! - @granularity
1391
+ //! - @smemreuse
1392
+ //!
1393
+ //! Snippet
1394
+ //! +++++++
1395
+ //!
1396
+ //! The code snippet below illustrates a sort of 512 integer keys that
1397
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1398
+ //! threads where each thread owns 4 consecutive keys. The final partitioning is striped.
1399
+ //!
1400
+ //! .. code-block:: c++
1401
+ //!
1402
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1403
+ //!
1404
+ //! __global__ void ExampleKernel(...)
1405
+ //! {
1406
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1407
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1408
+ //!
1409
+ //! // Allocate shared memory for BlockRadixSort
1410
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1411
+ //!
1412
+ //! // Obtain a segment of consecutive items that are blocked across threads
1413
+ //! int thread_keys[4];
1414
+ //! ...
1415
+ //!
1416
+ //! // Collectively sort the keys
1417
+ //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
1418
+ //!
1419
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1420
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1421
+ //! The corresponding output ``thread_keys`` in those threads will be
1422
+ //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``.
1423
+ //!
1424
+ //! @endrst
1425
+ //!
1426
+ //! @param[in,out] keys
1427
+ //! Keys to sort
1428
+ //!
1429
+ //! @param[in] begin_bit
1430
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1431
+ //!
1432
+ //! @param[in] end_bit
1433
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1434
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1435
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
1436
+ { // Keys-only ascending sort; the default bit range [0, sizeof(KeyT) * 8) covers every bit of the key.
1437
+ NullType values[ITEMS_PER_THREAD]; // Placeholder: this overload has no values, but the callee takes a values array.
1438
+
1439
+ SortBlockedToStriped( // Resolves to the same-named overload that takes the two extra tag arguments (declared outside this excerpt).
1440
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type sits where the SortDescending overloads pass true_type; KEYS_ONLY is declared outside this excerpt.
1441
+ }
1442
+
1443
+ //! @rst
1444
+ //! Performs an ascending block-wide radix sort over a
1445
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1446
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1447
+ //!
1448
+ //! * @granularity
1449
+ //! * @smemreuse
1450
+ //!
1451
+ //! Snippet
1452
+ //! ==========================================================================
1453
+ //!
1454
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1455
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1456
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1457
+ //! tuple of references to relevant members of the key.
1458
+ //!
1459
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1460
+ //! :language: c++
1461
+ //! :dedent:
1462
+ //! :start-after: example-begin custom-type
1463
+ //! :end-before: example-end custom-type
1464
+ //!
1465
+ //! The code snippet below illustrates a sort of 4 keys that
1466
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1467
+ //! where each thread owns 2 consecutive keys. The final partitioning is striped.
1468
+ //!
1469
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1470
+ //! :language: c++
1471
+ //! :dedent:
1472
+ //! :start-after: example-begin keys-striped-bits
1473
+ //! :end-before: example-end keys-striped-bits
1474
+ //!
1475
+ //! @endrst
1476
+ //!
1477
+ //! @tparam DecomposerT
1478
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1479
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1480
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1481
+ //! The leftmost element of the tuple is considered the most significant.
1482
+ //! The call operator must not modify members of the key.
1483
+ //!
1484
+ //! @param[in,out] keys
1485
+ //! Keys to sort
1486
+ //!
1487
+ //! @param decomposer
1488
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1489
+ //! references to its constituent arithmetic types. The leftmost element of
1490
+ //! the tuple is considered the most significant. The call operator must not
1491
+ //! modify members of the key.
1492
+ //!
1493
+ //! @param[in] begin_bit
1494
+ //! The least-significant bit index (inclusive) needed for
1495
+ //! key comparison
1496
+ //!
1497
+ //! @param[in] end_bit
1498
+ //! The most-significant bit index (exclusive) needed for key
1499
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1500
+ template <class DecomposerT>
1501
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1502
+ ::cuda::std::enable_if_t< //
1503
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Returns void; the overload is removed when DecomposerT converts to int, so an integer bit index in this position cannot select it.
1504
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
1505
+ { // Keys-only ascending sort of decomposed keys over the caller-supplied bit range.
1506
+ NullType values[ITEMS_PER_THREAD]; // Placeholder: this overload has no values, but the callee takes a values array.
1507
+
1508
+ SortBlockedToStriped( // Resolves to the same-named overload that takes the extra tag arguments (declared outside this excerpt).
1509
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // false_type sits where the SortDescending overloads pass true_type.
1510
+ }
1511
+
1512
+ //! @rst
1513
+ //! Performs an ascending block-wide radix sort over a
1514
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1515
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1516
+ //!
1517
+ //! * @granularity
1518
+ //! * @smemreuse
1519
+ //!
1520
+ //! Snippet
1521
+ //! ==========================================================================
1522
+ //!
1523
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1524
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1525
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1526
+ //! tuple of references to relevant members of the key.
1527
+ //!
1528
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1529
+ //! :language: c++
1530
+ //! :dedent:
1531
+ //! :start-after: example-begin custom-type
1532
+ //! :end-before: example-end custom-type
1533
+ //!
1534
+ //! The code snippet below illustrates a sort of 6 keys that
1535
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1536
+ //! where each thread owns 3 consecutive keys. The final partitioning is striped.
1537
+ //!
1538
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1539
+ //! :language: c++
1540
+ //! :dedent:
1541
+ //! :start-after: example-begin keys-striped
1542
+ //! :end-before: example-end keys-striped
1543
+ //!
1544
+ //! @endrst
1545
+ //!
1546
+ //! @tparam DecomposerT
1547
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1548
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1549
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1550
+ //! The leftmost element of the tuple is considered the most significant.
1551
+ //! The call operator must not modify members of the key.
1552
+ //!
1553
+ //! @param[in,out] keys
1554
+ //! Keys to sort
1555
+ //!
1556
+ //! @param decomposer
1557
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1558
+ //! references to its constituent arithmetic types. The leftmost element of
1559
+ //! the tuple is considered the most significant. The call operator must not
1560
+ //! modify members of the key.
1561
+ template <class DecomposerT>
1562
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1563
+ ::cuda::std::enable_if_t< //
1564
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Returns void; the overload is removed when DecomposerT converts to int, so an integer bit index in this position cannot select it.
1565
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
1566
+ { // Keys-only ascending sort of decomposed keys over the default bit range.
1567
+ NullType values[ITEMS_PER_THREAD]; // Placeholder: this overload has no values, but the callee takes a values array.
1568
+
1569
+ SortBlockedToStriped( // Resolves to the same-named overload that takes the extra tag arguments (declared outside this excerpt).
1570
+ keys,
1571
+ values,
1572
+ 0, // begin_bit
1573
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit: default obtained from the key traits for this decomposer
1574
+ ::cuda::std::false_type(), // Tag in the slot where the SortDescending overloads pass true_type.
1575
+ detail::bool_constant_v<KEYS_ONLY>, // KEYS_ONLY is declared outside this excerpt.
1576
+ decomposer);
1577
+ }
1578
+
1579
+ //! @rst
1580
+ //! Performs an ascending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>` of keys and
1581
+ //! values, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1582
+ //!
1583
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1584
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1585
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1586
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1587
+ //! shared memory.
1588
+ //! - @granularity
1589
+ //! - @smemreuse
1590
+ //!
1591
+ //! Snippet
1592
+ //! +++++++
1593
+ //!
1594
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1595
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1596
+ //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
1597
+ //!
1598
+ //! .. code-block:: c++
1599
+ //!
1600
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1601
+ //!
1602
+ //! __global__ void ExampleKernel(...)
1603
+ //! {
1604
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1605
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1606
+ //!
1607
+ //! // Allocate shared memory for BlockRadixSort
1608
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1609
+ //!
1610
+ //! // Obtain a segment of consecutive items that are blocked across threads
1611
+ //! int thread_keys[4];
1612
+ //! int thread_values[4];
1613
+ //! ...
1614
+ //!
1615
+ //! // Collectively sort the keys and values among block threads
1616
+ //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
1617
+ //!
1618
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1619
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1620
+ //! The corresponding output ``thread_keys`` in those threads will be
1621
+ //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``.
1622
+ //!
1623
+ //! @endrst
1624
+ //!
1625
+ //! @param[in,out] keys
1626
+ //! Keys to sort
1627
+ //!
1628
+ //! @param[in,out] values
1629
+ //! Values to sort
1630
+ //!
1631
+ //! @param[in] begin_bit
1632
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1633
+ //!
1634
+ //! @param[in] end_bit
1635
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1636
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
1637
+ KeyT (&keys)[ITEMS_PER_THREAD],
1638
+ ValueT (&values)[ITEMS_PER_THREAD],
1639
+ int begin_bit = 0,
1640
+ int end_bit = sizeof(KeyT) * 8) // Default bit range [0, sizeof(KeyT) * 8) covers every bit of the key.
1641
+ { // Key-value ascending sort: the caller's values array is forwarded as-is.
1642
+ SortBlockedToStriped( // Resolves to the same-named overload that takes the two extra tag arguments (declared outside this excerpt).
1643
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type sits where the SortDescending overloads pass true_type; KEYS_ONLY is declared outside this excerpt.
1644
+ }
1645
+
1646
+ //! @rst
1647
+ //! Performs an ascending block-wide radix sort over a
1648
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1649
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1650
+ //!
1651
+ //! * @granularity
1652
+ //! * @smemreuse
1653
+ //!
1654
+ //! Snippet
1655
+ //! ==========================================================================
1656
+ //!
1657
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1658
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1659
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1660
+ //! tuple of references to relevant members of the key.
1661
+ //!
1662
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1663
+ //! :language: c++
1664
+ //! :dedent:
1665
+ //! :start-after: example-begin custom-type
1666
+ //! :end-before: example-end custom-type
1667
+ //!
1668
+ //! The code snippet below illustrates a sort of 4 pairs that
1669
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1670
+ //! where each thread owns 2 consecutive pairs. The final partitioning is striped.
1671
+ //!
1672
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1673
+ //! :language: c++
1674
+ //! :dedent:
1675
+ //! :start-after: example-begin pairs-striped-bits
1676
+ //! :end-before: example-end pairs-striped-bits
1677
+ //!
1678
+ //! @endrst
1679
+ //!
1680
+ //! @tparam DecomposerT
1681
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1682
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1683
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1684
+ //! The leftmost element of the tuple is considered the most significant.
1685
+ //! The call operator must not modify members of the key.
1686
+ //!
1687
+ //! @param[in,out] keys
1688
+ //! Keys to sort
1689
+ //!
1690
+ //! @param[in,out] values
1691
+ //! Values to sort
1692
+ //!
1693
+ //! @param decomposer
1694
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1695
+ //! references to its constituent arithmetic types. The leftmost element of
1696
+ //! the tuple is considered the most significant. The call operator must not
1697
+ //! modify members of the key.
1698
+ //!
1699
+ //! @param[in] begin_bit
1700
+ //! The least-significant bit index (inclusive) needed for
1701
+ //! key comparison
1702
+ //!
1703
+ //! @param[in] end_bit
1704
+ //! The most-significant bit index (exclusive) needed for key
1705
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1706
+ template <class DecomposerT>
1707
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1708
+ ::cuda::std::enable_if_t< //
1709
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Returns void; the overload is removed when DecomposerT converts to int, so an integer bit index in this position cannot select it.
1710
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
1711
+ ValueT (&values)[ITEMS_PER_THREAD],
1712
+ DecomposerT decomposer,
1713
+ int begin_bit,
1714
+ int end_bit)
1715
+ { // Key-value ascending sort of decomposed keys over the caller-supplied bit range.
1716
+ SortBlockedToStriped( // Resolves to the same-named overload that takes the extra tag arguments (declared outside this excerpt).
1717
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // false_type sits where the SortDescending overloads pass true_type.
1718
+ }
1719
+
1720
+ //! @rst
1721
+ //! Performs an ascending block-wide radix sort over a
1722
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1723
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1724
+ //!
1725
+ //! * @granularity
1726
+ //! * @smemreuse
1727
+ //!
1728
+ //! Snippet
1729
+ //! ==========================================================================
1730
+ //!
1731
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1732
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1733
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1734
+ //! tuple of references to relevant members of the key.
1735
+ //!
1736
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1737
+ //! :language: c++
1738
+ //! :dedent:
1739
+ //! :start-after: example-begin custom-type
1740
+ //! :end-before: example-end custom-type
1741
+ //!
1742
+ //! The code snippet below illustrates a sort of 6 pairs that
1743
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1744
+ //! where each thread owns 3 consecutive pairs. The final partitioning is striped.
1745
+ //!
1746
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1747
+ //! :language: c++
1748
+ //! :dedent:
1749
+ //! :start-after: example-begin pairs-striped
1750
+ //! :end-before: example-end pairs-striped
1751
+ //!
1752
+ //! @endrst
1753
+ //!
1754
+ //! @tparam DecomposerT
1755
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1756
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1757
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1758
+ //! The leftmost element of the tuple is considered the most significant.
1759
+ //! The call operator must not modify members of the key.
1760
+ //!
1761
+ //! @param[in,out] keys
1762
+ //! Keys to sort
1763
+ //!
1764
+ //! @param[in,out] values
1765
+ //! Values to sort
1766
+ //!
1767
+ //! @param decomposer
1768
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1769
+ //! references to its constituent arithmetic types. The leftmost element of
1770
+ //! the tuple is considered the most significant. The call operator must not
1771
+ //! modify members of the key.
1772
+ template <class DecomposerT>
1773
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1774
+ ::cuda::std::enable_if_t< //
1775
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // SFINAE: this overload drops out when DecomposerT is convertible to int
1776
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
1777
+ { // Key-value wrapper: forwards to the seven-argument SortBlockedToStriped overload (declared outside this excerpt).
1778
+ SortBlockedToStriped(
1779
+ keys,
1780
+ values,
1781
+ 0, // begin bit; same slot the bit-range wrappers fill with their begin_bit argument
1782
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit obtained from the decomposer instead of a caller-supplied value
1783
+ ::cuda::std::false_type(), // the SortDescending* wrappers pass true_type in this slot
1784
+ detail::bool_constant_v<KEYS_ONLY>, // tag built from KEYS_ONLY, which is declared outside this excerpt
1785
+ decomposer);
1786
+ }
1787
+
1788
+ //! @rst
1789
+ //! Performs a descending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1790
+ //! of keys, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1791
+ //!
1792
+ //! - @granularity
1793
+ //! - @smemreuse
1794
+ //!
1795
+ //! Snippet
1796
+ //! +++++++
1797
+ //!
1798
+ //! The code snippet below illustrates a sort of 512 integer keys that
1799
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1800
+ //! threads where each thread owns 4 consecutive keys. The final partitioning is striped.
1801
+ //!
1802
+ //! .. code-block:: c++
1803
+ //!
1804
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1805
+ //!
1806
+ //! __global__ void ExampleKernel(...)
1807
+ //! {
1808
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1809
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1810
+ //!
1811
+ //! // Allocate shared memory for BlockRadixSort
1812
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1813
+ //!
1814
+ //! // Obtain a segment of consecutive items that are blocked across threads
1815
+ //! int thread_keys[4];
1816
+ //! ...
1817
+ //!
1818
+ //! // Collectively sort the keys
1819
+ //! BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys);
1820
+ //!
1821
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1822
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1823
+ //! The corresponding output ``thread_keys`` in those threads will be
1824
+ //! ``{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }``.
1825
+ //!
1826
+ //! @endrst
1827
+ //!
1828
+ //! @param[in,out] keys
1829
+ //! Keys to sort
1830
+ //!
1831
+ //! @param[in] begin_bit
1832
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1833
+ //!
1834
+ //! @param[in] end_bit
1835
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1836
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1837
+ SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
1838
+ { // Keys-only descending wrapper; the default arguments cover bits [0, sizeof(KeyT) * 8).
1839
+ NullType values[ITEMS_PER_THREAD]; // placeholder array so the keys+values implementation can be reused; presumably requires ValueT to be NullType - TODO confirm
1840
+
1841
+ SortBlockedToStriped(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type here vs. false_type in the ascending wrapper; no decomposer argument is passed
1842
+ }
1843
+
1844
+ //! @rst
1845
+ //! Performs a descending block-wide radix sort over a
1846
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1847
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1848
+ //!
1849
+ //! * @granularity
1850
+ //! * @smemreuse
1851
+ //!
1852
+ //! Snippet
1853
+ //! ==========================================================================
1854
+ //!
1855
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1856
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1857
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1858
+ //! tuple of references to relevant members of the key.
1859
+ //!
1860
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1861
+ //! :language: c++
1862
+ //! :dedent:
1863
+ //! :start-after: example-begin custom-type
1864
+ //! :end-before: example-end custom-type
1865
+ //!
1866
+ //! The code snippet below illustrates a sort of 4 keys that
1867
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1868
+ //! where each thread owns 2 consecutive keys. The final partitioning is striped.
1869
+ //!
1870
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1871
+ //! :language: c++
1872
+ //! :dedent:
1873
+ //! :start-after: example-begin keys-striped-descending-bits
1874
+ //! :end-before: example-end keys-striped-descending-bits
1875
+ //!
1876
+ //! @endrst
1877
+ //!
1878
+ //! @tparam DecomposerT
1879
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1880
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1881
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1882
+ //! The leftmost element of the tuple is considered the most significant.
1883
+ //! The call operator must not modify members of the key.
1884
+ //!
1885
+ //! @param[in,out] keys
1886
+ //! Keys to sort
1887
+ //!
1888
+ //! @param decomposer
1889
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1890
+ //! references to its constituent arithmetic types. The leftmost element of
1891
+ //! the tuple is considered the most significant. The call operator must not
1892
+ //! modify members of the key.
1893
+ //!
1894
+ //! @param[in] begin_bit
1895
+ //! The least-significant bit index (inclusive) needed for
1896
+ //! key comparison
1897
+ //!
1898
+ //! @param[in] end_bit
1899
+ //! The most-significant bit index (exclusive) needed for key
1900
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1901
+ template <class DecomposerT>
1902
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1903
+ ::cuda::std::enable_if_t< //
1904
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // SFINAE: drops out when the second argument's type converts to int, leaving the (keys, begin_bit, end_bit) overload
1905
+ SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
1906
+ { // Keys-only descending wrapper with a decomposer and a caller-supplied bit range.
1907
+ NullType values[ITEMS_PER_THREAD]; // placeholder array so the keys+values implementation can be reused; presumably requires ValueT to be NullType - TODO confirm
1908
+
1909
+ SortBlockedToStriped(
1910
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // true_type here vs. false_type in the ascending wrapper
1911
+ }
1912
+
1913
+ //! @rst
1914
+ //! Performs a descending block-wide radix sort over a
1915
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1916
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1917
+ //!
1918
+ //! * @granularity
1919
+ //! * @smemreuse
1920
+ //!
1921
+ //! Snippet
1922
+ //! ==========================================================================
1923
+ //!
1924
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1925
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1926
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1927
+ //! tuple of references to relevant members of the key.
1928
+ //!
1929
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1930
+ //! :language: c++
1931
+ //! :dedent:
1932
+ //! :start-after: example-begin custom-type
1933
+ //! :end-before: example-end custom-type
1934
+ //!
1935
+ //! The code snippet below illustrates a sort of 6 keys that
1936
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1937
+ //! where each thread owns 3 consecutive keys. The final partitioning is striped.
1938
+ //!
1939
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1940
+ //! :language: c++
1941
+ //! :dedent:
1942
+ //! :start-after: example-begin keys-striped-descending
1943
+ //! :end-before: example-end keys-striped-descending
1944
+ //!
1945
+ //! @endrst
1946
+ //!
1947
+ //! @tparam DecomposerT
1948
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1949
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1950
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1951
+ //! The leftmost element of the tuple is considered the most significant.
1952
+ //! The call operator must not modify members of the key.
1953
+ //!
1954
+ //! @param[in,out] keys
1955
+ //! Keys to sort
1956
+ //!
1957
+ //! @param decomposer
1958
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1959
+ //! references to its constituent arithmetic types. The leftmost element of
1960
+ //! the tuple is considered the most significant. The call operator must not
1961
+ //! modify members of the key.
1962
+ template <class DecomposerT>
1963
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1964
+ ::cuda::std::enable_if_t< //
1965
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // SFINAE: drops out when the second argument's type converts to int, leaving the (keys, begin_bit, end_bit) overload
1966
+ SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
1967
+ { // Keys-only descending wrapper with a decomposer; the bit range is filled in below rather than taken from the caller.
1968
+ NullType values[ITEMS_PER_THREAD]; // placeholder array so the keys+values implementation can be reused; presumably requires ValueT to be NullType - TODO confirm
1969
+
1970
+ SortBlockedToStriped(
1971
+ keys,
1972
+ values,
1973
+ 0, // begin bit; same slot the bit-range wrappers fill with their begin_bit argument
1974
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit obtained from the decomposer
1975
+ ::cuda::std::true_type(), // the ascending wrapper passes false_type in this slot
1976
+ detail::bool_constant_v<KEYS_ONLY>, // tag built from KEYS_ONLY, which is declared outside this excerpt
1977
+ decomposer);
1978
+ }
1979
+
1980
+ //! @rst
1981
+ //! Performs a descending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1982
+ //! of keys and values, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1983
+ //!
1984
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1985
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1986
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1987
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1988
+ //! shared memory.
1989
+ //! - @granularity
1990
+ //! - @smemreuse
1991
+ //!
1992
+ //! Snippet
1993
+ //! +++++++
1994
+ //!
1995
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1996
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1997
+ //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
1998
+ //!
1999
+ //! .. code-block:: c++
2000
+ //!
2001
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
2002
+ //!
2003
+ //! __global__ void ExampleKernel(...)
2004
+ //! {
2005
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
2006
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
2007
+ //!
2008
+ //! // Allocate shared memory for BlockRadixSort
2009
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
2010
+ //!
2011
+ //! // Obtain a segment of consecutive items that are blocked across threads
2012
+ //! int thread_keys[4];
2013
+ //! int thread_values[4];
2014
+ //! ...
2015
+ //!
2016
+ //! // Collectively sort the keys and values among block threads
2017
+ //! BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys, thread_values);
2018
+ //!
2019
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
2020
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
2021
+ //! The corresponding output ``thread_keys`` in those threads will be
2022
+ //! ``{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }``.
2023
+ //!
2024
+ //! @endrst
2025
+ //!
2026
+ //! @param[in,out] keys
2027
+ //! Keys to sort
2028
+ //!
2029
+ //! @param[in,out] values
2030
+ //! Values to sort
2031
+ //!
2032
+ //! @param[in] begin_bit
2033
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
2034
+ //!
2035
+ //! @param[in] end_bit
2036
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
2037
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescendingBlockedToStriped(
2038
+ KeyT (&keys)[ITEMS_PER_THREAD],
2039
+ ValueT (&values)[ITEMS_PER_THREAD],
2040
+ int begin_bit = 0,
2041
+ int end_bit = sizeof(KeyT) * 8) // defaults cover bits [0, sizeof(KeyT) * 8)
2042
+ { // Key-value descending wrapper: forwards the caller's arrays and bit range unchanged.
2043
+ SortBlockedToStriped(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type here vs. false_type in the ascending wrapper; no decomposer argument is passed
2044
+ }
2045
+
2046
+ //! @rst
2047
+ //! Performs a descending block-wide radix sort over a
2048
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
2049
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
2050
+ //!
2051
+ //! * @granularity
2052
+ //! * @smemreuse
2053
+ //!
2054
+ //! Snippet
2055
+ //! ==========================================================================
2056
+ //!
2057
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2058
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2059
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2060
+ //! tuple of references to relevant members of the key.
2061
+ //!
2062
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2063
+ //! :language: c++
2064
+ //! :dedent:
2065
+ //! :start-after: example-begin custom-type
2066
+ //! :end-before: example-end custom-type
2067
+ //!
2068
+ //! The code snippet below illustrates a sort of 4 keys and values that
2069
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
2070
+ //! where each thread owns 2 consecutive pairs. The final partitioning is striped.
2071
+ //!
2072
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2073
+ //! :language: c++
2074
+ //! :dedent:
2075
+ //! :start-after: example-begin pairs-striped-descending-bits
2076
+ //! :end-before: example-end pairs-striped-descending-bits
2077
+ //!
2078
+ //! @endrst
2079
+ //!
2080
+ //! @tparam DecomposerT
2081
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2082
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2083
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2084
+ //! The leftmost element of the tuple is considered the most significant.
2085
+ //! The call operator must not modify members of the key.
2086
+ //!
2087
+ //! @param[in,out] keys
2088
+ //! Keys to sort
2089
+ //!
2090
+ //! @param[in,out] values
2091
+ //! Values to sort
2092
+ //!
2093
+ //! @param decomposer
2094
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2095
+ //! references to its constituent arithmetic types. The leftmost element of
2096
+ //! the tuple is considered the most significant. The call operator must not
2097
+ //! modify members of the key.
2098
+ //!
2099
+ //! @param[in] begin_bit
2100
+ //! The least-significant bit index (inclusive) needed for
2101
+ //! key comparison
2102
+ //!
2103
+ //! @param[in] end_bit
2104
+ //! The most-significant bit index (exclusive) needed for key
2105
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
2106
+ template <class DecomposerT>
2107
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
2108
+ ::cuda::std::enable_if_t< //
2109
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // SFINAE: drops out when the third argument's type converts to int, leaving the (keys, values, begin_bit, end_bit) overload
2110
+ SortDescendingBlockedToStriped(
2111
+ KeyT (&keys)[ITEMS_PER_THREAD],
2112
+ ValueT (&values)[ITEMS_PER_THREAD],
2113
+ DecomposerT decomposer,
2114
+ int begin_bit,
2115
+ int end_bit)
2116
+ { // Key-value descending wrapper with a decomposer and a caller-supplied bit range.
2117
+ SortBlockedToStriped(
2118
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // note the decomposer moves to the last position in the forwarded call
2119
+ }
2120
+
2121
+ //! @rst
2122
+ //! Performs a descending block-wide radix sort over a
2123
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
2124
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
2125
+ //!
2126
+ //! * @granularity
2127
+ //! * @smemreuse
2128
+ //!
2129
+ //! Snippet
2130
+ //! ==========================================================================
2131
+ //!
2132
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2133
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2134
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2135
+ //! tuple of references to relevant members of the key.
2136
+ //!
2137
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2138
+ //! :language: c++
2139
+ //! :dedent:
2140
+ //! :start-after: example-begin custom-type
2141
+ //! :end-before: example-end custom-type
2142
+ //!
2143
+ //! The code snippet below illustrates a sort of 6 keys and values that
2144
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
2145
+ //! where each thread owns 3 consecutive pairs. The final partitioning is striped.
2146
+ //!
2147
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2148
+ //! :language: c++
2149
+ //! :dedent:
2150
+ //! :start-after: example-begin pairs-striped-descending
2151
+ //! :end-before: example-end pairs-striped-descending
2152
+ //!
2153
+ //! @endrst
2154
+ //!
2155
+ //! @tparam DecomposerT
2156
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2157
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2158
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2159
+ //! The leftmost element of the tuple is considered the most significant.
2160
+ //! The call operator must not modify members of the key.
2161
+ //!
2162
+ //! @param[in,out] keys
2163
+ //! Keys to sort
2164
+ //!
2165
+ //! @param[in,out] values
2166
+ //! Values to sort
2167
+ //!
2168
+ //! @param decomposer
2169
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2170
+ //! references to its constituent arithmetic types. The leftmost element of
2171
+ //! the tuple is considered the most significant. The call operator must not
2172
+ //! modify members of the key.
2173
+ template <class DecomposerT>
2174
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
2175
+ ::cuda::std::enable_if_t< //
2176
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // SFINAE: drops out when the third argument's type converts to int, leaving the (keys, values, begin_bit, end_bit) overload
2177
+ SortDescendingBlockedToStriped(
2178
+ KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
2179
+ { // Key-value descending wrapper with a decomposer; the bit range is filled in below rather than taken from the caller.
2180
+ SortBlockedToStriped(
2181
+ keys,
2182
+ values,
2183
+ 0, // begin bit; same slot the bit-range wrappers fill with their begin_bit argument
2184
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit obtained from the decomposer
2185
+ ::cuda::std::true_type(), // the ascending wrapper passes false_type in this slot
2186
+ detail::bool_constant_v<KEYS_ONLY>, // tag built from KEYS_ONLY, which is declared outside this excerpt
2187
+ decomposer);
2188
+ }
2189
+
2190
+ //@} end member group
2191
+ };
2192
+
2193
+ CUB_NAMESPACE_END