cuda-cccl 0.1.3.2.0.dev271__cp313-cp313-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1947)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +46 -0
  3. cuda/cccl/cooperative/__init__.py +3 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  5. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  6. cuda/cccl/cooperative/experimental/_common.py +273 -0
  7. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  8. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  9. cuda/cccl/cooperative/experimental/_types.py +937 -0
  10. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  11. cuda/cccl/cooperative/experimental/block/__init__.py +39 -0
  12. cuda/cccl/cooperative/experimental/block/_block_exchange.py +251 -0
  13. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  14. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  15. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  16. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  17. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  18. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +92 -0
  20. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  21. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  22. cuda/cccl/headers/__init__.py +7 -0
  23. cuda/cccl/headers/include/__init__.py +1 -0
  24. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +262 -0
  25. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  26. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  27. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
  28. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +226 -0
  29. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +730 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  32. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  33. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  34. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +632 -0
  35. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  36. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  37. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  38. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  39. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  40. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1114 -0
  41. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +341 -0
  42. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  43. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  44. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1342 -0
  45. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  46. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  47. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  48. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  49. cuda/cccl/headers/include/cub/block/block_load.cuh +1260 -0
  50. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  51. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  52. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  53. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  54. cuda/cccl/headers/include/cub/block/block_reduce.cuh +665 -0
  55. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  56. cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
  57. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  58. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  59. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  65. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  66. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  67. cuda/cccl/headers/include/cub/config.cuh +53 -0
  68. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  69. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  70. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  71. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  72. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  73. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  74. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +84 -0
  75. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  76. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  77. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  82. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  83. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  84. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  85. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  86. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  87. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  88. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  89. cuda/cccl/headers/include/cub/detail/type_traits.cuh +179 -0
  90. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  91. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  92. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  93. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  94. cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
  95. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  96. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  97. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  98. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  99. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  100. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
  101. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1898 -0
  102. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  103. cuda/cccl/headers/include/cub/device/device_scan.cuh +1899 -0
  104. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  105. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  106. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  107. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  108. cuda/cccl/headers/include/cub/device/device_transform.cuh +545 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1042 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1749 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +656 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +612 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +916 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +441 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +455 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +558 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +591 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +475 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +121 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +987 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +609 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +448 -0
  159. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  160. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +226 -0
  161. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  162. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  163. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +260 -0
  165. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  166. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  167. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  168. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +684 -0
  169. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +547 -0
  170. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  171. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  172. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +464 -0
  173. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  174. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  175. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  176. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  177. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  178. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  179. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  180. cuda/cccl/headers/include/cub/util_macro.cuh +99 -0
  181. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  182. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  183. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  184. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  185. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  186. cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
  187. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  188. cuda/cccl/headers/include/cub/version.cuh +89 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +950 -0
  194. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +713 -0
  195. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  196. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  197. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  198. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  199. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1885 -0
  200. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  201. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/copy.h +143 -0
  204. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  211. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  212. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  213. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +466 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  218. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  219. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  220. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  221. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  222. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  223. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  225. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +249 -0
  226. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  227. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  228. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  229. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  230. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  232. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__device/all_devices.h +240 -0
  235. cuda/cccl/headers/include/cuda/__device/arch_traits.h +613 -0
  236. cuda/cccl/headers/include/cuda/__device/attributes.h +721 -0
  237. cuda/cccl/headers/include/cuda/__device/device_ref.h +176 -0
  238. cuda/cccl/headers/include/cuda/__device/physical_device.h +168 -0
  239. cuda/cccl/headers/include/cuda/__driver/driver_api.h +503 -0
  240. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  241. cuda/cccl/headers/include/cuda/__event/event_ref.h +158 -0
  242. cuda/cccl/headers/include/cuda/__event/timed_event.h +118 -0
  243. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  244. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  245. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  246. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  247. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
  248. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  249. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  250. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +109 -0
  251. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  253. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  254. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  255. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +49 -0
  256. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
  257. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
  258. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  259. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +424 -0
  260. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +292 -0
  261. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
  262. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +335 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +501 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +496 -0
  265. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +452 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +94 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +539 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  285. cuda/cccl/headers/include/cuda/__memory/address_space.h +211 -0
  286. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  289. cuda/cccl/headers/include/cuda/__memory/check_address.h +106 -0
  290. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  291. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  292. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  293. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  294. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  295. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +69 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +654 -0
  299. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  300. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  301. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  302. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  303. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  304. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  412. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  413. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  414. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  415. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +97 -0
  416. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  417. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  418. cuda/cccl/headers/include/cuda/__stream/stream.h +142 -0
  419. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +296 -0
  420. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  421. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  422. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  423. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  424. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  425. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +521 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +78 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  443. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  444. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  445. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  446. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  447. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  448. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  449. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  450. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  451. cuda/cccl/headers/include/cuda/access_property +26 -0
  452. cuda/cccl/headers/include/cuda/algorithm +27 -0
  453. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  454. cuda/cccl/headers/include/cuda/atomic +27 -0
  455. cuda/cccl/headers/include/cuda/barrier +267 -0
  456. cuda/cccl/headers/include/cuda/bit +29 -0
  457. cuda/cccl/headers/include/cuda/cmath +36 -0
  458. cuda/cccl/headers/include/cuda/devices +20 -0
  459. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  460. cuda/cccl/headers/include/cuda/functional +32 -0
  461. cuda/cccl/headers/include/cuda/iterator +38 -0
  462. cuda/cccl/headers/include/cuda/latch +27 -0
  463. cuda/cccl/headers/include/cuda/mdspan +28 -0
  464. cuda/cccl/headers/include/cuda/memory +34 -0
  465. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  466. cuda/cccl/headers/include/cuda/numeric +29 -0
  467. cuda/cccl/headers/include/cuda/pipeline +578 -0
  468. cuda/cccl/headers/include/cuda/ptx +128 -0
  469. cuda/cccl/headers/include/cuda/semaphore +31 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  566. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  567. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  568. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  569. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  570. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  588. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  589. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  590. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  591. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  592. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  593. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  594. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  595. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  601. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  602. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
  603. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  604. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
  605. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +146 -0
  626. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  627. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  628. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  629. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  630. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  631. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  632. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  633. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  634. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
  635. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  636. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
  637. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  638. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  639. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  640. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  641. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  642. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  643. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  644. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  645. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  646. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  647. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
  648. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +260 -0
  649. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  650. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  656. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  657. cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
  658. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  659. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  660. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  661. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  662. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  663. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  664. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  665. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +322 -0
  666. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +321 -0
  667. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  668. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  669. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  670. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  671. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  672. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  673. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  674. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  675. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  676. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  677. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
  678. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  679. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  680. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  681. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  682. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  683. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  684. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  685. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  686. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  687. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  689. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  690. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  691. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  692. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  694. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  695. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  696. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  697. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  698. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  699. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  700. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +58 -0
  701. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  702. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  703. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  704. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +146 -0
  705. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  706. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  707. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  708. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  709. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1963 -0
  710. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  711. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  712. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +172 -0
  713. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  714. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  715. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  716. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  717. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +374 -0
  718. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  719. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +113 -0
  720. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  721. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  722. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +72 -0
  723. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  724. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  725. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  726. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  727. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  728. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  729. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  730. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  731. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  732. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  733. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  734. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  735. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  736. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  737. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  738. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  739. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  740. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  741. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  742. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  743. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  744. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  745. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  746. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  747. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  748. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  749. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  750. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  751. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  752. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  753. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  754. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  755. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  756. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  757. cuda/cccl/headers/include/cuda/std/__functional/function.h +1279 -0
  758. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  759. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  760. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  761. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  762. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  763. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
  764. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  765. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  766. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  767. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  768. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  769. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
  775. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  776. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  777. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  778. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  779. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  780. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  781. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  782. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  783. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  784. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +90 -0
  785. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  786. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  787. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  788. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  789. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  790. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  791. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  792. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  793. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  794. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  795. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  796. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +122 -0
  797. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  798. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  799. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  800. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
  801. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  802. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  803. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  804. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  805. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  807. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  808. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  809. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +150 -0
  810. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  811. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  812. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  813. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  814. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  815. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  816. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  817. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
  818. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  819. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +433 -0
  820. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
  834. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  835. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  836. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  837. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  838. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  839. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  840. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  841. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  842. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  843. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +138 -0
  844. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  846. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
  847. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  848. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  849. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  850. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +499 -0
  851. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  852. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  853. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  854. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  855. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  856. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  857. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  858. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  859. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  860. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  861. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +552 -0
  862. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  863. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  864. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  865. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  866. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  867. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  868. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
  869. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  870. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  871. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +682 -0
  872. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +767 -0
  873. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  874. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  875. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  876. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  877. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  878. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  879. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  880. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  881. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  882. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  883. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  884. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  885. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  886. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  887. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  888. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  889. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  890. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  891. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  892. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  893. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  894. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  895. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  896. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  897. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  898. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +432 -0
  899. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  900. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  901. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  902. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  903. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  904. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  905. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  906. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  907. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  908. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  909. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +314 -0
  910. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  911. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  912. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  913. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  914. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  915. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  916. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  917. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  918. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  919. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  920. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  921. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  922. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  923. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  924. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  925. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  926. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  927. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  928. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  929. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  930. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  935. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  936. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  937. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  938. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  939. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  940. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  941. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  942. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  943. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  944. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  945. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  946. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  947. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  948. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  949. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  950. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  951. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
  952. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +218 -0
  953. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  954. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  955. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  956. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  957. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  958. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  959. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +291 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  973. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  974. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  975. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  976. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  977. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  978. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  979. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  980. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  981. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  983. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  984. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1100. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1101. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1102. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1103. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1104. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  1105. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1106. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1107. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1108. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1109. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  1110. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1111. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1112. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1113. cuda/cccl/headers/include/cuda/std/__utility/pair.h +797 -0
  1114. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1115. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1116. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1117. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1118. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1119. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1120. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1121. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1122. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1123. cuda/cccl/headers/include/cuda/std/array +518 -0
  1124. cuda/cccl/headers/include/cuda/std/atomic +818 -0
  1125. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1126. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1127. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1128. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1129. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1130. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1131. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1132. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1133. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1134. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1135. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1136. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1137. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1138. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1139. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1140. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1141. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1142. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
  1143. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1722 -0
  1144. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3630 -0
  1145. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +520 -0
  1146. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1147. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1148. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1149. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2142 -0
  1150. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1151. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1152. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1153. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1154. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1155. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1156. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1157. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1158. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1159. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1160. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1161. cuda/cccl/headers/include/cuda/std/numbers +342 -0
  1162. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1163. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1164. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1165. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1166. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1167. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1168. cuda/cccl/headers/include/cuda/std/span +628 -0
  1169. cuda/cccl/headers/include/cuda/std/string_view +799 -0
  1170. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1171. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1172. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1173. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1174. cuda/cccl/headers/include/cuda/std/version +245 -0
  1175. cuda/cccl/headers/include/cuda/stream +31 -0
  1176. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1177. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1178. cuda/cccl/headers/include/cuda/utility +27 -0
  1179. cuda/cccl/headers/include/cuda/version +16 -0
  1180. cuda/cccl/headers/include/cuda/warp +28 -0
  1181. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1182. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1183. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1184. cuda/cccl/headers/include/nv/target +235 -0
  1185. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1186. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1187. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1188. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1189. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1190. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1191. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1192. cuda/cccl/headers/include/thrust/count.h +245 -0
  1193. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1194. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1195. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1196. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1197. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1198. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1199. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1200. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1201. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1202. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1203. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1204. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1205. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1206. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1207. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1208. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1209. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1210. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +79 -0
  1211. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1212. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1213. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1214. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1215. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1216. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1217. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1218. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1219. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1220. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1221. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1222. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1223. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1224. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1225. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1226. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1227. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1228. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1229. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1230. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1237. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1238. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1239. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1240. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1241. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1242. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1243. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1244. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1245. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1246. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1247. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1248. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1249. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1250. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1251. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1252. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1253. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1254. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1255. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1256. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1257. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1258. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1259. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1260. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1261. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1262. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1263. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1264. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1265. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1266. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1267. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1268. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1269. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1270. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1271. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1272. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1273. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1274. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1275. cuda/cccl/headers/include/thrust/detail/internal_functional.h +293 -0
  1276. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1277. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1278. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1279. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1280. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1281. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1282. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1283. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1284. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1285. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1286. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1287. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1288. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1289. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1290. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1291. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1292. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1293. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1294. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1295. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1296. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1297. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1298. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1299. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1300. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1301. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1302. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1303. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1304. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1305. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1306. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1307. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1308. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1309. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1310. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1311. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1312. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1313. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1314. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1315. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1316. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1317. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1318. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1320. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1321. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1322. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1324. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1325. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1330. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1331. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1332. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1333. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1334. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1335. cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
  1336. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
  1337. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1338. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1339. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1340. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1341. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1342. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1343. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1344. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1345. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1346. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1347. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1348. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1349. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1350. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1351. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1352. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1353. cuda/cccl/headers/include/thrust/find.h +382 -0
  1354. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1355. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1356. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1357. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1358. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1359. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1360. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1361. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1362. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1363. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1364. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1365. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1366. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1367. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1377. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1378. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1379. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1380. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1381. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +311 -0
  1382. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1383. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1384. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1385. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1386. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1387. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1388. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1389. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1390. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1391. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1392. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1393. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1394. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1395. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1396. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1397. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1398. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1399. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1400. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1401. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1402. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1403. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1404. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1405. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1406. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1407. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1408. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1409. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1410. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1411. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1412. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1413. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1414. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1415. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1416. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1417. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1418. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1419. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1420. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1421. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1430. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1431. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1432. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1433. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1434. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1435. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1436. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1437. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1438. cuda/cccl/headers/include/thrust/random.h +120 -0
  1439. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1440. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1441. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1442. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1443. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1444. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1445. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1446. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1447. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1448. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1449. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1450. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1451. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1452. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1453. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1454. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1455. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +158 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1501. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1502. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1503. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1504. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1505. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +609 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +92 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +782 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1738 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +415 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +92 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +73 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1755. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +157 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1822. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +157 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +54 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/.gitignore +4 -0
  1912. cuda/cccl/parallel/experimental/__init__.py +75 -0
  1913. cuda/cccl/parallel/experimental/_bindings.py +56 -0
  1914. cuda/cccl/parallel/experimental/_bindings.pyi +405 -0
  1915. cuda/cccl/parallel/experimental/_bindings_impl.pyx +1957 -0
  1916. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1917. cuda/cccl/parallel/experimental/_cccl_interop.py +396 -0
  1918. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1919. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1920. cuda/cccl/parallel/experimental/_utils/temp_storage_buffer.py +86 -0
  1921. cuda/cccl/parallel/experimental/algorithms/__init__.py +50 -0
  1922. cuda/cccl/parallel/experimental/algorithms/_histogram.py +243 -0
  1923. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +225 -0
  1924. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +312 -0
  1925. cuda/cccl/parallel/experimental/algorithms/_reduce.py +184 -0
  1926. cuda/cccl/parallel/experimental/algorithms/_scan.py +261 -0
  1927. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +257 -0
  1928. cuda/cccl/parallel/experimental/algorithms/_transform.py +308 -0
  1929. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +252 -0
  1930. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1931. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
  1932. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  1933. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
  1934. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  1935. cuda/cccl/parallel/experimental/iterators/__init__.py +21 -0
  1936. cuda/cccl/parallel/experimental/iterators/_factories.py +214 -0
  1937. cuda/cccl/parallel/experimental/iterators/_iterators.py +627 -0
  1938. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +207 -0
  1939. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1940. cuda/cccl/parallel/experimental/op.py +3 -0
  1941. cuda/cccl/parallel/experimental/struct.py +272 -0
  1942. cuda/cccl/parallel/experimental/typing.py +35 -0
  1943. cuda/cccl/py.typed +0 -0
  1944. cuda_cccl-0.1.3.2.0.dev271.dist-info/METADATA +40 -0
  1945. cuda_cccl-0.1.3.2.0.dev271.dist-info/RECORD +1947 -0
  1946. cuda_cccl-0.1.3.2.0.dev271.dist-info/WHEEL +5 -0
  1947. cuda_cccl-0.1.3.2.0.dev271.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,1749 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ /**
30
+ * @file
31
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across
32
+ * a sequence of data items residing within device-accessible memory.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include <cub/config.cuh>
38
+
39
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
40
+ # pragma GCC system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
42
+ # pragma clang system_header
43
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
44
+ # pragma system_header
45
+ #endif // no system header
46
+
47
+ #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
48
+ #include <cub/device/dispatch/kernels/radix_sort.cuh>
49
+ #include <cub/device/dispatch/tuning/tuning_radix_sort.cuh>
50
+ #include <cub/util_debug.cuh>
51
+ #include <cub/util_device.cuh>
52
+ #include <cub/util_math.cuh>
53
+ #include <cub/util_type.cuh>
54
+
55
+ #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
56
+
57
+ #include <cuda/std/__algorithm_>
58
+ #include <cuda/std/type_traits>
59
+
60
+ // suppress warnings triggered by #pragma unroll:
61
+ // "warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation
62
+ // might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]"
63
+ _CCCL_DIAG_PUSH
64
+ _CCCL_DIAG_SUPPRESS_CLANG("-Wpass-failed")
65
+
66
+ CUB_NAMESPACE_BEGIN
67
+
68
+ namespace detail::radix_sort
69
+ {
70
+ template <typename MaxPolicyT, SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT>
71
+ struct DeviceRadixSortKernelSource
72
+ {
73
+ CUB_DEFINE_KERNEL_GETTER(RadixSortSingleTileKernel,
74
+ DeviceRadixSortSingleTileKernel<MaxPolicyT, Order, KeyT, ValueT, OffsetT, DecomposerT>);
75
+
76
+ CUB_DEFINE_KERNEL_GETTER(RadixSortUpsweepKernel,
77
+ DeviceRadixSortUpsweepKernel<MaxPolicyT, false, Order, KeyT, OffsetT, DecomposerT>);
78
+
79
+ CUB_DEFINE_KERNEL_GETTER(RadixSortAltUpsweepKernel,
80
+ DeviceRadixSortUpsweepKernel<MaxPolicyT, true, Order, KeyT, OffsetT, DecomposerT>);
81
+
82
+ CUB_DEFINE_KERNEL_GETTER(DeviceRadixSortScanBinsKernel, RadixSortScanBinsKernel<MaxPolicyT, OffsetT>);
83
+
84
+ CUB_DEFINE_KERNEL_GETTER(RadixSortDownsweepKernel,
85
+ DeviceRadixSortDownsweepKernel<MaxPolicyT, false, Order, KeyT, ValueT, OffsetT, DecomposerT>);
86
+
87
+ CUB_DEFINE_KERNEL_GETTER(RadixSortAltDownsweepKernel,
88
+ DeviceRadixSortDownsweepKernel<MaxPolicyT, true, Order, KeyT, ValueT, OffsetT, DecomposerT>);
89
+
90
+ CUB_DEFINE_KERNEL_GETTER(RadixSortHistogramKernel,
91
+ DeviceRadixSortHistogramKernel<MaxPolicyT, Order, KeyT, OffsetT, DecomposerT>);
92
+
93
+ CUB_DEFINE_KERNEL_GETTER(RadixSortExclusiveSumKernel, DeviceRadixSortExclusiveSumKernel<MaxPolicyT, OffsetT>);
94
+
95
+ CUB_DEFINE_KERNEL_GETTER(
96
+ RadixSortOnesweepKernel,
97
+ DeviceRadixSortOnesweepKernel<MaxPolicyT, Order, KeyT, ValueT, OffsetT, int, int, DecomposerT>);
98
+
99
+ CUB_RUNTIME_FUNCTION static constexpr size_t KeySize()
100
+ {
101
+ return sizeof(KeyT);
102
+ }
103
+
104
+ CUB_RUNTIME_FUNCTION static constexpr size_t ValueSize()
105
+ {
106
+ return sizeof(ValueT);
107
+ }
108
+ };
109
+
110
+ template <typename MaxPolicyT,
111
+ SortOrder Order,
112
+ typename KeyT,
113
+ typename ValueT,
114
+ typename BeginOffsetIteratorT,
115
+ typename EndOffsetIteratorT,
116
+ typename SegmentSizeT,
117
+ typename DecomposerT>
118
+ struct DeviceSegmentedRadixSortKernelSource
119
+ {
120
+ CUB_DEFINE_KERNEL_GETTER(
121
+ SegmentedRadixSortKernel,
122
+ DeviceSegmentedRadixSortKernel<
123
+ MaxPolicyT,
124
+ false,
125
+ Order,
126
+ KeyT,
127
+ ValueT,
128
+ BeginOffsetIteratorT,
129
+ EndOffsetIteratorT,
130
+ SegmentSizeT,
131
+ DecomposerT>);
132
+
133
+ CUB_DEFINE_KERNEL_GETTER(
134
+ AltSegmentedRadixSortKernel,
135
+ DeviceSegmentedRadixSortKernel<
136
+ MaxPolicyT,
137
+ true,
138
+ Order,
139
+ KeyT,
140
+ ValueT,
141
+ BeginOffsetIteratorT,
142
+ EndOffsetIteratorT,
143
+ SegmentSizeT,
144
+ DecomposerT>);
145
+
146
+ CUB_RUNTIME_FUNCTION static constexpr size_t KeySize()
147
+ {
148
+ return sizeof(KeyT);
149
+ }
150
+
151
+ CUB_RUNTIME_FUNCTION static constexpr size_t ValueSize()
152
+ {
153
+ return sizeof(ValueT);
154
+ }
155
+ };
156
+
157
+ } // namespace detail::radix_sort
158
+
159
+ /******************************************************************************
160
+ * Single-problem dispatch
161
+ ******************************************************************************/
162
+
163
+ /**
164
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort
165
+ *
166
+ * @tparam Order
167
+ * Whether to sort in ascending or descending order
168
+ *
169
+ * @tparam KeyT
170
+ * Key type
171
+ *
172
+ * @tparam ValueT
173
+ * Value type
174
+ *
175
+ * @tparam OffsetT
176
+ * Signed integer type for global offsets
177
+ *
178
+ * @tparam DecomposerT
179
+ * Implementation detail, do not specify directly, requirements on the
180
+ * content of this type are subject to breaking change.
181
+ */
182
+ template <SortOrder Order,
183
+ typename KeyT,
184
+ typename ValueT,
185
+ typename OffsetT,
186
+ typename DecomposerT = detail::identity_decomposer_t,
187
+ typename PolicyHub = detail::radix::policy_hub<KeyT, ValueT, OffsetT>,
188
+ typename KernelSource = detail::radix_sort::
189
+ DeviceRadixSortKernelSource<typename PolicyHub::MaxPolicy, Order, KeyT, ValueT, OffsetT, DecomposerT>,
190
+ typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
191
+ struct DispatchRadixSort
192
+ {
193
+ //------------------------------------------------------------------------------
194
+ // Constants
195
+ //------------------------------------------------------------------------------
196
+
197
+ // True for a keys-only sort (ValueT is NullType), false for a key-value sort
198
+ static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>;
199
+
200
+ //------------------------------------------------------------------------------
201
+ // Problem state
202
+ //------------------------------------------------------------------------------
203
+
204
+ /// Device-accessible allocation of temporary storage.
205
+ // When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is
206
+ // done.
207
+ void* d_temp_storage;
208
+
209
+ /// Reference to size in bytes of `d_temp_storage` allocation
210
+ size_t& temp_storage_bytes;
211
+
212
+ /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is
213
+ /// updated to point to the sorted output keys
214
+ DoubleBuffer<KeyT>& d_keys;
215
+
216
+ /// Double-buffer whose current buffer contains the unsorted input values and, upon return, is
217
+ /// updated to point to the sorted output values
218
+ DoubleBuffer<ValueT>& d_values;
219
+
220
+ /// Number of items to sort
221
+ OffsetT num_items;
222
+
223
+ /// The beginning (least-significant) bit index needed for key comparison
224
+ int begin_bit;
225
+
226
+ /// The past-the-end (most-significant) bit index needed for key comparison
227
+ int end_bit;
228
+
229
+ /// CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
230
+ cudaStream_t stream;
231
+
232
+ /// PTX version
233
+ int ptx_version;
234
+
235
+ /// Whether it is okay to overwrite source buffers
236
+ bool is_overwrite_okay;
237
+
238
+ DecomposerT decomposer;
239
+
240
+ KernelSource kernel_source;
241
+
242
+ KernelLauncherFactory launcher_factory;
243
+
244
+ //------------------------------------------------------------------------------
245
+ // Constructor
246
+ //------------------------------------------------------------------------------
247
+
248
+ /// Constructor
249
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchRadixSort(
250
+ void* d_temp_storage,
251
+ size_t& temp_storage_bytes,
252
+ DoubleBuffer<KeyT>& d_keys,
253
+ DoubleBuffer<ValueT>& d_values,
254
+ OffsetT num_items,
255
+ int begin_bit,
256
+ int end_bit,
257
+ bool is_overwrite_okay,
258
+ cudaStream_t stream,
259
+ int ptx_version,
260
+ DecomposerT decomposer = {},
261
+ KernelSource kernel_source = {},
262
+ KernelLauncherFactory launcher_factory = {})
263
+ : d_temp_storage(d_temp_storage)
264
+ , temp_storage_bytes(temp_storage_bytes)
265
+ , d_keys(d_keys)
266
+ , d_values(d_values)
267
+ , num_items(num_items)
268
+ , begin_bit(begin_bit)
269
+ , end_bit(end_bit)
270
+ , stream(stream)
271
+ , ptx_version(ptx_version)
272
+ , is_overwrite_okay(is_overwrite_okay)
273
+ , decomposer(decomposer)
274
+ , kernel_source(kernel_source)
275
+ , launcher_factory(launcher_factory)
276
+ {}
277
+
278
+ //------------------------------------------------------------------------------
279
+ // Small-problem (single tile) invocation
280
+ //------------------------------------------------------------------------------
281
+
282
+ /**
283
+ * @brief Invoke a single block to sort in-core
284
+ *
285
+ * @tparam ActivePolicyT
286
+ * Umbrella policy active for the target device
287
+ *
288
+ * @tparam SingleTileKernelT
289
+ * Function type of cub::DeviceRadixSortSingleTileKernel
290
+ *
291
+ * @param[in] single_tile_kernel
292
+ * Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
293
+ */
294
+ template <typename ActivePolicyT, typename SingleTileKernelT>
295
+ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
296
+ InvokeSingleTile(SingleTileKernelT single_tile_kernel, ActivePolicyT policy = {})
297
+ {
298
+ cudaError error = cudaSuccess;
299
+ do
300
+ {
301
+ // Return if the caller is simply requesting the size of the storage allocation
302
+ if (d_temp_storage == nullptr)
303
+ {
304
+ temp_storage_bytes = 1;
305
+ break;
306
+ }
307
+
308
+ // Log single_tile_kernel configuration
309
+ #ifdef CUB_DEBUG_LOG
310
+ _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit "
311
+ "%d, bit_grain %d\n",
312
+ 1,
313
+ policy.SingleTile().BlockThreads(),
314
+ (long long) stream,
315
+ policy.SingleTile().ItemsPerThread(),
316
+ 1,
317
+ begin_bit,
318
+ policy.RadixBits(policy.SingleTile()));
319
+ #endif
320
+
321
+ // Invoke upsweep_kernel with same grid size as downsweep_kernel
322
+ launcher_factory(1, policy.SingleTile().BlockThreads(), 0, stream)
323
+ .doit(single_tile_kernel,
324
+ d_keys.Current(),
325
+ d_keys.Alternate(),
326
+ d_values.Current(),
327
+ d_values.Alternate(),
328
+ num_items,
329
+ begin_bit,
330
+ end_bit,
331
+ decomposer);
332
+
333
+ // Check for failure to launch
334
+ error = CubDebug(cudaPeekAtLastError());
335
+ if (cudaSuccess != error)
336
+ {
337
+ break;
338
+ }
339
+
340
+ // Sync the stream if specified to flush runtime errors
341
+ error = CubDebug(detail::DebugSyncStream(stream));
342
+ if (cudaSuccess != error)
343
+ {
344
+ break;
345
+ }
346
+
347
+ // Update selector
348
+ d_keys.selector ^= 1;
349
+ d_values.selector ^= 1;
350
+ } while (0);
351
+
352
+ return error;
353
+ }
354
+
355
+ //------------------------------------------------------------------------------
356
+ // Normal problem size invocation
357
+ //------------------------------------------------------------------------------
358
+
359
+ /**
360
+ * Invoke a three-kernel sorting pass at the current bit.
361
+ */
362
+ template <typename PassConfigT>
363
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokePass(
364
+ const KeyT* d_keys_in,
365
+ KeyT* d_keys_out,
366
+ const ValueT* d_values_in,
367
+ ValueT* d_values_out,
368
+ OffsetT* d_spine,
369
+ int /*spine_length*/,
370
+ int& current_bit,
371
+ PassConfigT& pass_config)
372
+ {
373
+ cudaError error = cudaSuccess;
374
+ do
375
+ {
376
+ int pass_bits = ::cuda::std::min(pass_config.radix_bits, end_bit - current_bit);
377
+
378
+ // Log upsweep_kernel configuration
379
+ #ifdef CUB_DEBUG_LOG
380
+ _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, "
381
+ "bit_grain %d\n",
382
+ pass_config.even_share.grid_size,
383
+ pass_config.upsweep_config.block_threads,
384
+ (long long) stream,
385
+ pass_config.upsweep_config.items_per_thread,
386
+ pass_config.upsweep_config.sm_occupancy,
387
+ current_bit,
388
+ pass_bits);
389
+ #endif
390
+
391
+ // Spine length written by the upsweep kernel in the current pass.
392
+ int pass_spine_length = pass_config.even_share.grid_size * pass_config.radix_digits;
393
+
394
+ // Invoke upsweep_kernel with same grid size as downsweep_kernel
395
+ launcher_factory(pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream)
396
+ .doit(pass_config.upsweep_kernel,
397
+ d_keys_in,
398
+ d_spine,
399
+ num_items,
400
+ current_bit,
401
+ pass_bits,
402
+ pass_config.even_share,
403
+ decomposer);
404
+
405
+ // Check for failure to launch
406
+ error = CubDebug(cudaPeekAtLastError());
407
+ if (cudaSuccess != error)
408
+ {
409
+ break;
410
+ }
411
+
412
+ // Sync the stream if specified to flush runtime errors
413
+ error = CubDebug(detail::DebugSyncStream(stream));
414
+ if (cudaSuccess != error)
415
+ {
416
+ break;
417
+ }
418
+
419
+ // Log scan_kernel configuration
420
+ #ifdef CUB_DEBUG_LOG
421
+ _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
422
+ 1,
423
+ pass_config.scan_config.block_threads,
424
+ (long long) stream,
425
+ pass_config.scan_config.items_per_thread);
426
+ #endif
427
+
428
+ // Invoke scan_kernel
429
+ launcher_factory(1, pass_config.scan_config.block_threads, 0, stream)
430
+ .doit(pass_config.scan_kernel, d_spine, pass_spine_length);
431
+
432
+ // Check for failure to launch
433
+ error = CubDebug(cudaPeekAtLastError());
434
+ if (cudaSuccess != error)
435
+ {
436
+ break;
437
+ }
438
+
439
+ // Sync the stream if specified to flush runtime errors
440
+ error = CubDebug(detail::DebugSyncStream(stream));
441
+ if (cudaSuccess != error)
442
+ {
443
+ break;
444
+ }
445
+
446
+ // Log downsweep_kernel configuration
447
+ #ifdef CUB_DEBUG_LOG
448
+ _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
449
+ pass_config.even_share.grid_size,
450
+ pass_config.downsweep_config.block_threads,
451
+ (long long) stream,
452
+ pass_config.downsweep_config.items_per_thread,
453
+ pass_config.downsweep_config.sm_occupancy);
454
+ #endif
455
+
456
+ // Invoke downsweep_kernel
457
+ launcher_factory(pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream)
458
+ .doit(pass_config.downsweep_kernel,
459
+ d_keys_in,
460
+ d_keys_out,
461
+ d_values_in,
462
+ d_values_out,
463
+ d_spine,
464
+ num_items,
465
+ current_bit,
466
+ pass_bits,
467
+ pass_config.even_share,
468
+ decomposer);
469
+
470
+ // Check for failure to launch
471
+ error = CubDebug(cudaPeekAtLastError());
472
+ if (cudaSuccess != error)
473
+ {
474
+ break;
475
+ }
476
+
477
+ // Sync the stream if specified to flush runtime errors
478
+ error = CubDebug(detail::DebugSyncStream(stream));
479
+ if (cudaSuccess != error)
480
+ {
481
+ break;
482
+ }
483
+
484
+ // Update current bit
485
+ current_bit += pass_bits;
486
+ } while (0);
487
+
488
+ return error;
489
+ }
490
+
491
+ /// Pass configuration structure
492
+ template <typename UpsweepKernelT, typename ScanKernelT, typename DownsweepKernelT>
493
+ struct PassConfig
494
+ {
495
+ UpsweepKernelT upsweep_kernel;
496
+ detail::KernelConfig upsweep_config;
497
+ ScanKernelT scan_kernel;
498
+ detail::KernelConfig scan_config;
499
+ DownsweepKernelT downsweep_kernel;
500
+ detail::KernelConfig downsweep_config;
501
+ int radix_bits;
502
+ int radix_digits;
503
+ int max_downsweep_grid_size;
504
+ GridEvenShare<OffsetT> even_share;
505
+
506
+ /// Initialize pass configuration
507
+ template <typename ActivePolicyT, typename UpsweepPolicyT, typename ScanPolicyT, typename DownsweepPolicyT>
508
+ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InitPassConfig(
509
+ UpsweepKernelT upsweep_kernel,
510
+ ScanKernelT scan_kernel,
511
+ DownsweepKernelT downsweep_kernel,
512
+ int /*ptx_version*/,
513
+ int sm_count,
514
+ OffsetT num_items,
515
+ ActivePolicyT policy = {},
516
+ UpsweepPolicyT upsweep_policy = {},
517
+ ScanPolicyT scan_policy = {},
518
+ DownsweepPolicyT downsweep_policy = {},
519
+ KernelLauncherFactory launcher_factory = {})
520
+ {
521
+ cudaError error = cudaSuccess;
522
+ do
523
+ {
524
+ this->upsweep_kernel = upsweep_kernel;
525
+ this->scan_kernel = scan_kernel;
526
+ this->downsweep_kernel = downsweep_kernel;
527
+ radix_bits = policy.RadixBits(downsweep_policy);
528
+ radix_digits = 1 << radix_bits;
529
+
530
+ error = CubDebug(upsweep_config.Init(upsweep_kernel, upsweep_policy, launcher_factory));
531
+ if (cudaSuccess != error)
532
+ {
533
+ break;
534
+ }
535
+
536
+ error = CubDebug(scan_config.Init(scan_kernel, scan_policy, launcher_factory));
537
+ if (cudaSuccess != error)
538
+ {
539
+ break;
540
+ }
541
+
542
+ error = CubDebug(downsweep_config.Init(downsweep_kernel, downsweep_policy, launcher_factory));
543
+ if (cudaSuccess != error)
544
+ {
545
+ break;
546
+ }
547
+
548
+ max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * detail::subscription_factor;
549
+
550
+ even_share.DispatchInit(
551
+ num_items, max_downsweep_grid_size, ::cuda::std::max(downsweep_config.tile_size, upsweep_config.tile_size));
552
+
553
+ } while (0);
554
+ return error;
555
+ }
556
+ };
557
+
558
+ template <typename ActivePolicyT>
559
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokeOnesweep(ActivePolicyT policy = {})
560
+ {
561
+ // PortionOffsetT is used for offsets within a portion, and must be signed.
562
+ using PortionOffsetT = int;
563
+ using AtomicOffsetT = PortionOffsetT;
564
+
565
+ // compute temporary storage size
566
+ const int RADIX_BITS = policy.RadixBits(policy.Onesweep());
567
+ const int RADIX_DIGITS = 1 << RADIX_BITS;
568
+ const int ONESWEEP_ITEMS_PER_THREAD = policy.Onesweep().ItemsPerThread();
569
+ const int ONESWEEP_BLOCK_THREADS = policy.Onesweep().BlockThreads();
570
+ const int ONESWEEP_TILE_ITEMS = ONESWEEP_ITEMS_PER_THREAD * ONESWEEP_BLOCK_THREADS;
571
+ // portions handle inputs with >=2**30 elements, due to the way lookback works
572
+ // for testing purposes, one portion is <= 2**28 elements
573
+ const PortionOffsetT PORTION_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS;
574
+ int num_passes = ::cuda::ceil_div(end_bit - begin_bit, RADIX_BITS);
575
+ OffsetT num_portions = static_cast<OffsetT>(::cuda::ceil_div(num_items, PORTION_SIZE));
576
+ PortionOffsetT max_num_blocks = ::cuda::ceil_div(
577
+ static_cast<int>(::cuda::std::min(num_items, static_cast<OffsetT>(PORTION_SIZE))), ONESWEEP_TILE_ITEMS);
578
+
579
+ size_t value_size = KEYS_ONLY ? 0 : kernel_source.ValueSize();
580
+ size_t allocation_sizes[] = {
581
+ // bins
582
+ num_portions * num_passes * RADIX_DIGITS * sizeof(OffsetT),
583
+ // lookback
584
+ max_num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT),
585
+ // extra key buffer
586
+ is_overwrite_okay || num_passes <= 1 ? 0 : num_items * kernel_source.KeySize(),
587
+ // extra value buffer
588
+ is_overwrite_okay || num_passes <= 1 ? 0 : num_items * value_size,
589
+ // counters
590
+ num_portions * num_passes * sizeof(AtomicOffsetT),
591
+ };
592
+ constexpr int NUM_ALLOCATIONS = sizeof(allocation_sizes) / sizeof(allocation_sizes[0]);
593
+ void* allocations[NUM_ALLOCATIONS] = {};
594
+ detail::AliasTemporaries<NUM_ALLOCATIONS>(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
595
+
596
+ // just return if no temporary storage is provided
597
+ cudaError_t error = cudaSuccess;
598
+ if (d_temp_storage == nullptr)
599
+ {
600
+ return error;
601
+ }
602
+
603
+ OffsetT* d_bins = (OffsetT*) allocations[0];
604
+ AtomicOffsetT* d_lookback = (AtomicOffsetT*) allocations[1];
605
+ KeyT* d_keys_tmp2 = (KeyT*) allocations[2];
606
+ ValueT* d_values_tmp2 = (ValueT*) allocations[3];
607
+ AtomicOffsetT* d_ctrs = (AtomicOffsetT*) allocations[4];
608
+
609
+ do
610
+ {
611
+ // initialization
612
+ error = CubDebug(cudaMemsetAsync(d_ctrs, 0, num_portions * num_passes * sizeof(AtomicOffsetT), stream));
613
+ if (cudaSuccess != error)
614
+ {
615
+ break;
616
+ }
617
+
618
+ // compute num_passes histograms with RADIX_DIGITS bins each
619
+ error = CubDebug(cudaMemsetAsync(d_bins, 0, num_passes * RADIX_DIGITS * sizeof(OffsetT), stream));
620
+ if (cudaSuccess != error)
621
+ {
622
+ break;
623
+ }
624
+ int device = -1;
625
+ int num_sms = 0;
626
+
627
+ error = CubDebug(cudaGetDevice(&device));
628
+ if (cudaSuccess != error)
629
+ {
630
+ break;
631
+ }
632
+
633
+ error = CubDebug(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
634
+ if (cudaSuccess != error)
635
+ {
636
+ break;
637
+ }
638
+
639
+ const int HISTO_BLOCK_THREADS = policy.Histogram().BlockThreads();
640
+ int histo_blocks_per_sm = 1;
641
+ auto histogram_kernel = kernel_source.RadixSortHistogramKernel();
642
+
643
+ error = CubDebug(launcher_factory.MaxSmOccupancy(histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0));
644
+ if (cudaSuccess != error)
645
+ {
646
+ break;
647
+ }
648
+
649
+ // log histogram_kernel configuration
650
+ #ifdef CUB_DEBUG_LOG
651
+ _CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, "
652
+ "%d SM occupancy, bit_grain %d\n",
653
+ histo_blocks_per_sm * num_sms,
654
+ HISTO_BLOCK_THREADS,
655
+ reinterpret_cast<long long>(stream),
656
+ policy.Histogram().ItemsPerThread(),
657
+ histo_blocks_per_sm,
658
+ policy.RadixBits(policy.Histogram()));
659
+ #endif
660
+
661
+ error = launcher_factory(histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream)
662
+ .doit(histogram_kernel, d_bins, d_keys.Current(), num_items, begin_bit, end_bit, decomposer);
663
+ error = CubDebug(error);
664
+ if (cudaSuccess != error)
665
+ {
666
+ break;
667
+ }
668
+
669
+ error = CubDebug(detail::DebugSyncStream(stream));
670
+ if (cudaSuccess != error)
671
+ {
672
+ break;
673
+ }
674
+
675
+ // exclusive sums to determine starts
676
+ const int SCAN_BLOCK_THREADS = policy.BlockThreads(policy.ExclusiveSum());
677
+
678
+ // log exclusive_sum_kernel configuration
679
+ #ifdef CUB_DEBUG_LOG
680
+ _CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n",
681
+ num_passes,
682
+ SCAN_BLOCK_THREADS,
683
+ reinterpret_cast<long long>(stream),
684
+ policy.RadixBits(policy.ExclusiveSum()));
685
+ #endif
686
+
687
+ error = launcher_factory(num_passes, SCAN_BLOCK_THREADS, 0, stream)
688
+ .doit(kernel_source.RadixSortExclusiveSumKernel(), d_bins);
689
+ error = CubDebug(error);
690
+ if (cudaSuccess != error)
691
+ {
692
+ break;
693
+ }
694
+
695
+ error = CubDebug(detail::DebugSyncStream(stream));
696
+ if (cudaSuccess != error)
697
+ {
698
+ break;
699
+ }
700
+
701
+ // use the other buffer if no overwrite is allowed
702
+ KeyT* d_keys_tmp = d_keys.Alternate();
703
+ ValueT* d_values_tmp = d_values.Alternate();
704
+ if (!is_overwrite_okay && num_passes % 2 == 0)
705
+ {
706
+ d_keys.d_buffers[1] = d_keys_tmp2;
707
+ d_values.d_buffers[1] = d_values_tmp2;
708
+ }
709
+
710
+ for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
711
+ {
712
+ int num_bits = ::cuda::std::min(end_bit - current_bit, RADIX_BITS);
713
+ for (OffsetT portion = 0; portion < num_portions; ++portion)
714
+ {
715
+ PortionOffsetT portion_num_items = static_cast<PortionOffsetT>(
716
+ ::cuda::std::min(num_items - portion * PORTION_SIZE, static_cast<OffsetT>(PORTION_SIZE)));
717
+
718
+ PortionOffsetT num_blocks = ::cuda::ceil_div(portion_num_items, ONESWEEP_TILE_ITEMS);
719
+
720
+ error = CubDebug(cudaMemsetAsync(d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), stream));
721
+ if (cudaSuccess != error)
722
+ {
723
+ break;
724
+ }
725
+
726
+ // log onesweep_kernel configuration
727
+ #ifdef CUB_DEBUG_LOG
728
+ _CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, "
729
+ "current bit %d, bit_grain %d, portion %d/%d\n",
730
+ num_blocks,
731
+ ONESWEEP_BLOCK_THREADS,
732
+ reinterpret_cast<long long>(stream),
733
+ policy.Onesweep().ItemsPerThread(),
734
+ current_bit,
735
+ num_bits,
736
+ static_cast<int>(portion),
737
+ static_cast<int>(num_portions));
738
+ #endif
739
+
740
+ auto onesweep_kernel = kernel_source.RadixSortOnesweepKernel();
741
+
742
+ error =
743
+ launcher_factory(num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream)
744
+ .doit(onesweep_kernel,
745
+ d_lookback,
746
+ d_ctrs + portion * num_passes + pass,
747
+ portion < num_portions - 1 ? d_bins + ((portion + 1) * num_passes + pass) * RADIX_DIGITS : nullptr,
748
+ d_bins + (portion * num_passes + pass) * RADIX_DIGITS,
749
+ d_keys.Alternate(),
750
+ d_keys.Current() + portion * PORTION_SIZE,
751
+ d_values.Alternate(),
752
+ d_values.Current() + portion * PORTION_SIZE,
753
+ portion_num_items,
754
+ current_bit,
755
+ num_bits,
756
+ decomposer);
757
+ error = CubDebug(error);
758
+ if (cudaSuccess != error)
759
+ {
760
+ break;
761
+ }
762
+
763
+ error = CubDebug(detail::DebugSyncStream(stream));
764
+ if (cudaSuccess != error)
765
+ {
766
+ break;
767
+ }
768
+ }
769
+
770
+ if (error != cudaSuccess)
771
+ {
772
+ break;
773
+ }
774
+
775
+ // use the temporary buffers if no overwrite is allowed
776
+ if (!is_overwrite_okay && pass == 0)
777
+ {
778
+ d_keys = num_passes % 2 == 0 ? DoubleBuffer<KeyT>(d_keys_tmp, d_keys_tmp2)
779
+ : DoubleBuffer<KeyT>(d_keys_tmp2, d_keys_tmp);
780
+ d_values = num_passes % 2 == 0 ? DoubleBuffer<ValueT>(d_values_tmp, d_values_tmp2)
781
+ : DoubleBuffer<ValueT>(d_values_tmp2, d_values_tmp);
782
+ }
783
+ d_keys.selector ^= 1;
784
+ d_values.selector ^= 1;
785
+ }
786
+ } while (0);
787
+
788
+ return error;
789
+ }
790
+
791
+ /**
792
+ * @brief Invocation (run multiple digit passes)
793
+ *
794
+ * @tparam ActivePolicyT
795
+ * Umbrella policy active for the target device
796
+ *
797
+ * @tparam UpsweepKernelT
798
+ * Function type of cub::DeviceRadixSortUpsweepKernel
799
+ *
800
+ * @tparam ScanKernelT
801
+ * Function type of cub::SpineScanKernel
802
+ *
803
+ * @tparam DownsweepKernelT
804
+ * Function type of cub::DeviceRadixSortDownsweepKernel
805
+ *
806
+ * @param[in] upsweep_kernel
807
+ * Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
808
+ *
809
+ * @param[in] alt_upsweep_kernel
810
+ * Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
811
+ *
812
+ * @param[in] scan_kernel
813
+ * Kernel function pointer to parameterization of cub::SpineScanKernel
814
+ *
815
+ * @param[in] downsweep_kernel
816
+ * Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
817
+ *
818
+ * @param[in] alt_downsweep_kernel
819
+ * Alternate kernel function pointer to parameterization of
820
+ * cub::DeviceRadixSortDownsweepKernel
821
+ */
822
+ template <typename ActivePolicyT, typename UpsweepKernelT, typename ScanKernelT, typename DownsweepKernelT>
823
+ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InvokePasses(
824
+ UpsweepKernelT upsweep_kernel,
825
+ UpsweepKernelT alt_upsweep_kernel,
826
+ ScanKernelT scan_kernel,
827
+ DownsweepKernelT downsweep_kernel,
828
+ DownsweepKernelT alt_downsweep_kernel,
829
+ ActivePolicyT policy = {})
830
+ {
831
+ cudaError error = cudaSuccess;
832
+ do
833
+ {
834
+ // Get device ordinal
835
+ int device_ordinal;
836
+ error = CubDebug(cudaGetDevice(&device_ordinal));
837
+ if (cudaSuccess != error)
838
+ {
839
+ break;
840
+ }
841
+
842
+ // Get SM count
843
+ int sm_count;
844
+ error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
845
+ if (cudaSuccess != error)
846
+ {
847
+ break;
848
+ }
849
+
850
+ // Init regular and alternate-digit kernel configurations
851
+ PassConfig<UpsweepKernelT, ScanKernelT, DownsweepKernelT> pass_config, alt_pass_config;
852
+ error = pass_config.InitPassConfig(
853
+ upsweep_kernel,
854
+ scan_kernel,
855
+ downsweep_kernel,
856
+ ptx_version,
857
+ sm_count,
858
+ num_items,
859
+ policy,
860
+ policy.Upsweep(),
861
+ policy.Scan(),
862
+ policy.Downsweep(),
863
+ launcher_factory);
864
+ if (error)
865
+ {
866
+ break;
867
+ }
868
+
869
+ error = alt_pass_config.InitPassConfig(
870
+ alt_upsweep_kernel,
871
+ scan_kernel,
872
+ alt_downsweep_kernel,
873
+ ptx_version,
874
+ sm_count,
875
+ num_items,
876
+ policy,
877
+ policy.AltUpsweep(),
878
+ policy.Scan(),
879
+ policy.AltDownsweep(),
880
+ launcher_factory);
881
+ if (error)
882
+ {
883
+ break;
884
+ }
885
+
886
+ // Get maximum spine length
887
+ int max_grid_size =
888
+ ::cuda::std::max(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
889
+ int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
890
+
891
+ // Temporary storage allocation requirements
892
+ void* allocations[3] = {};
893
+ size_t allocation_sizes[3] = {
894
+ // bytes needed for privatized block digit histograms
895
+ spine_length * sizeof(OffsetT),
896
+
897
+ // bytes needed for 3rd keys buffer
898
+ (is_overwrite_okay) ? 0 : num_items * kernel_source.KeySize(),
899
+
900
+ // bytes needed for 3rd values buffer
901
+ (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * kernel_source.ValueSize(),
902
+ };
903
+
904
+ // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
905
+ error = CubDebug(detail::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
906
+ if (cudaSuccess != error)
907
+ {
908
+ break;
909
+ }
910
+
911
+ // Return if the caller is simply requesting the size of the storage allocation
912
+ if (d_temp_storage == nullptr)
913
+ {
914
+ return cudaSuccess;
915
+ }
916
+
917
+ // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our
918
+ // preferred digit size
919
+ int num_bits = end_bit - begin_bit;
920
+ int num_passes = ::cuda::ceil_div(num_bits, pass_config.radix_bits);
921
+ bool is_num_passes_odd = num_passes & 1;
922
+ int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits;
923
+ int alt_end_bit = ::cuda::std::min(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
924
+
925
+ // Alias the temporary storage allocations
926
+ OffsetT* d_spine = static_cast<OffsetT*>(allocations[0]);
927
+
928
+ DoubleBuffer<KeyT> d_keys_remaining_passes(
929
+ (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
930
+ (is_overwrite_okay) ? d_keys.Current()
931
+ : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1])
932
+ : d_keys.Alternate());
933
+
934
+ DoubleBuffer<ValueT> d_values_remaining_passes(
935
+ (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
936
+ (is_overwrite_okay) ? d_values.Current()
937
+ : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2])
938
+ : d_values.Alternate());
939
+
940
+ // Run first pass, consuming from the input's current buffers
941
+ int current_bit = begin_bit;
942
+ error = CubDebug(InvokePass(
943
+ d_keys.Current(),
944
+ d_keys_remaining_passes.Current(),
945
+ d_values.Current(),
946
+ d_values_remaining_passes.Current(),
947
+ d_spine,
948
+ spine_length,
949
+ current_bit,
950
+ (current_bit < alt_end_bit) ? alt_pass_config : pass_config));
951
+ if (cudaSuccess != error)
952
+ {
953
+ break;
954
+ }
955
+
956
+ // Run remaining passes
957
+ while (current_bit < end_bit)
958
+ {
959
+ error = CubDebug(InvokePass(
960
+ d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
961
+ d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
962
+ d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
963
+ d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
964
+ d_spine,
965
+ spine_length,
966
+ current_bit,
967
+ (current_bit < alt_end_bit) ? alt_pass_config : pass_config));
968
+
969
+ if (cudaSuccess != error)
970
+ {
971
+ break;
972
+ }
973
+
974
+ // Invert selectors
975
+ d_keys_remaining_passes.selector ^= 1;
976
+ d_values_remaining_passes.selector ^= 1;
977
+ }
978
+
979
+ // Update selector
980
+ if (!is_overwrite_okay)
981
+ {
982
+ num_passes = 1; // Sorted data always ends up in the other vector
983
+ }
984
+
985
+ d_keys.selector = (d_keys.selector + num_passes) & 1;
986
+ d_values.selector = (d_values.selector + num_passes) & 1;
987
+ } while (0);
988
+
989
+ return error;
990
+ }
991
+
992
+ //------------------------------------------------------------------------------
993
+ // Chained policy invocation
994
+ //------------------------------------------------------------------------------
995
+
996
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokeCopy()
997
+ {
998
+ // is_overwrite_okay == false here
999
+ // Return the number of temporary bytes if requested
1000
+ if (d_temp_storage == nullptr)
1001
+ {
1002
+ temp_storage_bytes = 1;
1003
+ return cudaSuccess;
1004
+ }
1005
+
1006
+ // Copy keys
1007
+ #ifdef CUB_DEBUG_LOG
1008
+ _CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long) num_items, (long long) stream);
1009
+ #endif
1010
+ cudaError_t error = cudaSuccess;
1011
+
1012
+ error = CubDebug(cudaMemcpyAsync(
1013
+ d_keys.Alternate(), d_keys.Current(), num_items * kernel_source.KeySize(), cudaMemcpyDefault, stream));
1014
+ if (cudaSuccess != error)
1015
+ {
1016
+ return error;
1017
+ }
1018
+
1019
+ error = CubDebug(detail::DebugSyncStream(stream));
1020
+ if (cudaSuccess != error)
1021
+ {
1022
+ return error;
1023
+ }
1024
+ d_keys.selector ^= 1;
1025
+
1026
+ // Copy values if necessary
1027
+ if (!KEYS_ONLY)
1028
+ {
1029
+ #ifdef CUB_DEBUG_LOG
1030
+ _CubLog("Invoking async copy of %lld values on stream %lld\n", (long long) num_items, (long long) stream);
1031
+ #endif
1032
+ error = CubDebug(cudaMemcpyAsync(
1033
+ d_values.Alternate(), d_values.Current(), num_items * kernel_source.ValueSize(), cudaMemcpyDefault, stream));
1034
+ if (cudaSuccess != error)
1035
+ {
1036
+ return error;
1037
+ }
1038
+
1039
+ error = CubDebug(detail::DebugSyncStream(stream));
1040
+ if (cudaSuccess != error)
1041
+ {
1042
+ return error;
1043
+ }
1044
+ }
1045
+ d_values.selector ^= 1;
1046
+
1047
+ return error;
1048
+ }
1049
+
1050
+ /// Invocation
1051
+ template <typename ActivePolicyT>
1052
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT policy = {})
1053
+ {
1054
+ auto wrapped_policy = detail::radix::MakeRadixSortPolicyWrapper(policy);
1055
+
1056
+ // Return if empty problem, or if no bits to sort and double-buffering is used
1057
+ if (num_items == 0 || (begin_bit == end_bit && is_overwrite_okay))
1058
+ {
1059
+ if (d_temp_storage == nullptr)
1060
+ {
1061
+ temp_storage_bytes = 1;
1062
+ }
1063
+ return cudaSuccess;
1064
+ }
1065
+
1066
+ // Check if simple copy suffices (is_overwrite_okay == false at this point)
1067
+ if (begin_bit == end_bit)
1068
+ {
1069
+ bool has_uva = false;
1070
+ cudaError_t error = detail::HasUVA(has_uva);
1071
+ if (error != cudaSuccess)
1072
+ {
1073
+ return error;
1074
+ }
1075
+ if (has_uva)
1076
+ {
1077
+ return InvokeCopy();
1078
+ }
1079
+ }
1080
+
1081
+ // Force kernel code-generation in all compiler passes
1082
+ if (num_items <= static_cast<OffsetT>(
1083
+ wrapped_policy.SingleTile().BlockThreads() * wrapped_policy.SingleTile().ItemsPerThread()))
1084
+ {
1085
+ // Small, single tile size
1086
+ return InvokeSingleTile(kernel_source.RadixSortSingleTileKernel(), wrapped_policy);
1087
+ }
1088
+
1089
+ if CUB_DETAIL_CONSTEXPR_ISH (wrapped_policy.IsOnesweep())
1090
+ {
1091
+ return InvokeOnesweep(wrapped_policy);
1092
+ }
1093
+ else
1094
+ {
1095
+ return InvokePasses(
1096
+ kernel_source.RadixSortUpsweepKernel(),
1097
+ kernel_source.RadixSortAltUpsweepKernel(),
1098
+ kernel_source.DeviceRadixSortScanBinsKernel(),
1099
+ kernel_source.RadixSortDownsweepKernel(),
1100
+ kernel_source.RadixSortAltDownsweepKernel(),
1101
+ wrapped_policy);
1102
+ }
1103
+ }
1104
+
1105
+ //------------------------------------------------------------------------------
1106
+ // Dispatch entrypoints
1107
+ //------------------------------------------------------------------------------
1108
+
1109
+ /**
1110
+ * @brief Internal dispatch routine
1111
+ *
1112
+ * @param[in] d_temp_storage
1113
+ * Device-accessible allocation of temporary storage. When nullptr, the required
1114
+ * allocation size is written to `temp_storage_bytes` and no work is done.
1115
+ *
1116
+ * @param[in,out] temp_storage_bytes
1117
+ * Reference to size in bytes of `d_temp_storage` allocation
1118
+ *
1119
+ * @param[in,out] d_keys
1120
+ * Double-buffer whose current buffer contains the unsorted input keys and,
1121
+ * upon return, is updated to point to the sorted output keys
1122
+ *
1123
+ * @param[in,out] d_values
1124
+ * Double-buffer whose current buffer contains the unsorted input values and,
1125
+ * upon return, is updated to point to the sorted output values
1126
+ *
1127
+ * @param[in] num_items
1128
+ * Number of items to sort
1129
+ *
1130
+ * @param[in] begin_bit
1131
+ * The beginning (least-significant) bit index needed for key comparison
1132
+ *
1133
+ * @param[in] end_bit
1134
+ * The past-the-end (most-significant) bit index needed for key comparison
1135
+ *
1136
+ * @param[in] is_overwrite_okay
1137
+ * Whether is okay to overwrite source buffers
1138
+ *
1139
+ * @param[in] stream
1140
+ * CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
1141
+ */
1142
+ template <typename MaxPolicyT = typename PolicyHub::MaxPolicy>
1143
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch(
1144
+ void* d_temp_storage,
1145
+ size_t& temp_storage_bytes,
1146
+ DoubleBuffer<KeyT>& d_keys,
1147
+ DoubleBuffer<ValueT>& d_values,
1148
+ OffsetT num_items,
1149
+ int begin_bit,
1150
+ int end_bit,
1151
+ bool is_overwrite_okay,
1152
+ cudaStream_t stream,
1153
+ DecomposerT decomposer = {},
1154
+ KernelSource kernel_source = {},
1155
+ KernelLauncherFactory launcher_factory = {},
1156
+ MaxPolicyT max_policy = {})
1157
+ {
1158
+ cudaError_t error;
1159
+ do
1160
+ {
1161
+ // Get PTX version
1162
+ int ptx_version = 0;
1163
+
1164
+ error = CubDebug(launcher_factory.PtxVersion(ptx_version));
1165
+ if (cudaSuccess != error)
1166
+ {
1167
+ break;
1168
+ }
1169
+
1170
+ // Create dispatch functor
1171
+ DispatchRadixSort dispatch(
1172
+ d_temp_storage,
1173
+ temp_storage_bytes,
1174
+ d_keys,
1175
+ d_values,
1176
+ num_items,
1177
+ begin_bit,
1178
+ end_bit,
1179
+ is_overwrite_okay,
1180
+ stream,
1181
+ ptx_version,
1182
+ decomposer,
1183
+ kernel_source,
1184
+ launcher_factory);
1185
+
1186
+ // Dispatch to chained policy
1187
+ error = CubDebug(max_policy.Invoke(ptx_version, dispatch));
1188
+ if (cudaSuccess != error)
1189
+ {
1190
+ break;
1191
+ }
1192
+ } while (0);
1193
+
1194
+ return error;
1195
+ }
1196
+ };
1197
+
1198
+ /******************************************************************************
1199
+ * Segmented dispatch
1200
+ ******************************************************************************/
1201
+
1202
+ /**
1203
+ * @brief Utility class for dispatching the appropriately-tuned kernels for segmented device-wide
1204
+ * radix sort
1205
+ *
1206
+ * @tparam SortOrder
1207
+ * Whether to sort in ascending or descending order
1208
+ *
1209
+ * @tparam KeyT
1210
+ * Key type
1211
+ *
1212
+ * @tparam ValueT
1213
+ * Value type
1214
+ *
1215
+ * @tparam BeginOffsetIteratorT
1216
+ * Random-access input iterator type for reading segment beginning offsets @iterator
1217
+ *
1218
+ * @tparam EndOffsetIteratorT
1219
+ * Random-access input iterator type for reading segment ending offsets @iterator
1220
+ *
1221
+ * @tparam SegmentSizeT
1222
+ * Integer type to index items within a segment
1223
+ */
1224
+ template <SortOrder Order,
1225
+ typename KeyT,
1226
+ typename ValueT,
1227
+ typename BeginOffsetIteratorT,
1228
+ typename EndOffsetIteratorT,
1229
+ typename SegmentSizeT,
1230
+ typename PolicyHub = detail::radix::policy_hub<KeyT, ValueT, SegmentSizeT>,
1231
+ typename DecomposerT = detail::identity_decomposer_t,
1232
+ typename KernelSource = detail::radix_sort::DeviceSegmentedRadixSortKernelSource<
1233
+ typename PolicyHub::MaxPolicy,
1234
+ Order,
1235
+ KeyT,
1236
+ ValueT,
1237
+ BeginOffsetIteratorT,
1238
+ EndOffsetIteratorT,
1239
+ SegmentSizeT,
1240
+ DecomposerT>,
1241
+ typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
1242
+ struct DispatchSegmentedRadixSort
1243
+ {
1244
+ //------------------------------------------------------------------------------
1245
+ // Constants
1246
+ //------------------------------------------------------------------------------
1247
+
1248
+ // True when ValueT is NullType, i.e. this is a keys-only (rather than key-value) sort
1249
+ static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>;
1250
+
1251
+ //------------------------------------------------------------------------------
1252
+ // Parameter members
1253
+ //------------------------------------------------------------------------------
1254
+
1255
+ /// Device-accessible allocation of temporary storage. When nullptr, the required allocation size
1256
+ /// is written to `temp_storage_bytes` and no work is done.
1257
+ void* d_temp_storage;
1258
+
1259
+ /// Reference to size in bytes of `d_temp_storage` allocation
1260
+ size_t& temp_storage_bytes;
1261
+
1262
+ /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is
1263
+ /// updated to point to the sorted output keys
1264
+ DoubleBuffer<KeyT>& d_keys;
1265
+
1266
+ /// Double-buffer whose current buffer contains the unsorted input values and, upon return, is
1267
+ /// updated to point to the sorted output values
1268
+ DoubleBuffer<ValueT>& d_values;
1269
+
1270
+ /// Number of items to sort
1271
+ ::cuda::std::int64_t num_items;
1272
+
1273
+ /// The number of segments that comprise the sorting data
1274
+ ::cuda::std::int64_t num_segments;
1275
+
1276
+ /// Random-access input iterator to the sequence of beginning offsets of length `num_segments`,
1277
+ /// such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup>
1278
+ /// data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
1279
+ BeginOffsetIteratorT d_begin_offsets;
1280
+
1281
+ /// Random-access input iterator to the sequence of ending offsets of length `num_segments`,
1282
+ /// such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>
1283
+ /// data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>. If <tt>d_end_offsets[i]-1</tt>
1284
+ /// <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
1285
+ EndOffsetIteratorT d_end_offsets;
1286
+
1287
+ /// The beginning (least-significant) bit index needed for key comparison
1288
+ int begin_bit;
1289
+
1290
+ /// The past-the-end (most-significant) bit index needed for key comparison
1291
+ int end_bit;
1292
+
1293
+ /// CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
1294
+ cudaStream_t stream;
1295
+
1296
+ /// PTX version
1297
+ int ptx_version;
1298
+
1299
+ /// Whether it is okay to overwrite source buffers
1300
+ bool is_overwrite_okay;
1301
+
1302
+ DecomposerT decomposer; // Forwarded as the last argument of every segmented kernel launch in InvokePass
1303
+
1304
+ KernelSource kernel_source; // Provides the regular and alternate segmented kernels and KeySize()
1305
+
1306
+ KernelLauncherFactory launcher_factory; // Builds the launcher for each kernel launch; also passed to InitPassConfig
1307
+
1308
+ //------------------------------------------------------------------------------
1309
+ // Constructors
1310
+ //------------------------------------------------------------------------------
1311
+
1312
+ /// Constructor: stores each argument in the same-named member. `temp_storage_bytes`, `d_keys` and `d_values` are held by reference, so the referenced objects must outlive this dispatch object.
1313
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedRadixSort(
1314
+ void* d_temp_storage,
1315
+ size_t& temp_storage_bytes,
1316
+ DoubleBuffer<KeyT>& d_keys,
1317
+ DoubleBuffer<ValueT>& d_values,
1318
+ ::cuda::std::int64_t num_items,
1319
+ ::cuda::std::int64_t num_segments,
1320
+ BeginOffsetIteratorT d_begin_offsets,
1321
+ EndOffsetIteratorT d_end_offsets,
1322
+ int begin_bit,
1323
+ int end_bit,
1324
+ bool is_overwrite_okay,
1325
+ cudaStream_t stream,
1326
+ int ptx_version,
1327
+ DecomposerT decomposer = {},
1328
+ KernelSource kernel_source = {},
1329
+ KernelLauncherFactory launcher_factory = {})
1330
+ : d_temp_storage(d_temp_storage)
1331
+ , temp_storage_bytes(temp_storage_bytes)
1332
+ , d_keys(d_keys)
1333
+ , d_values(d_values)
1334
+ , num_items(num_items)
1335
+ , num_segments(num_segments)
1336
+ , d_begin_offsets(d_begin_offsets)
1337
+ , d_end_offsets(d_end_offsets)
1338
+ , begin_bit(begin_bit)
1339
+ , end_bit(end_bit)
1340
+ , stream(stream) // from here on the initializers follow member declaration order, not parameter order
1341
+ , ptx_version(ptx_version)
1342
+ , is_overwrite_okay(is_overwrite_okay)
1343
+ , decomposer(decomposer)
1344
+ , kernel_source(kernel_source)
1345
+ , launcher_factory(launcher_factory)
1346
+ {}
1347
+
1348
+ //------------------------------------------------------------------------------
1349
+ // Multi-segment invocation
1350
+ //------------------------------------------------------------------------------
1351
+
1352
+ /// Run one sorting pass starting at `current_bit`: launches `pass_config.segmented_kernel` over all segments (split into several launches when `num_segments` exceeds the int32 range) and, on success, advances `current_bit` by the number of bits processed. Returns the first CUDA error encountered.
1353
+ template <typename PassConfigT>
1354
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokePass(
1355
+ const KeyT* d_keys_in,
1356
+ KeyT* d_keys_out,
1357
+ const ValueT* d_values_in,
1358
+ ValueT* d_values_out,
1359
+ int& current_bit,
1360
+ PassConfigT& pass_config)
1361
+ {
1362
+ cudaError error = cudaSuccess;
1363
+
1364
+ // The number of bits to process in this pass (clamped so the pass never goes beyond end_bit)
1365
+ int pass_bits = ::cuda::std::min(pass_config.radix_bits, (end_bit - current_bit));
1366
+
1367
+ // The offset type (used to specialize the kernel template), large enough to index any segment within a single
1368
+ // invocation
1369
+ using per_invocation_segment_offset_t = ::cuda::std::int32_t;
1370
+
1371
+ // The upper bound of segments that a single kernel invocation will process
1372
+ constexpr auto max_num_segments_per_invocation =
1373
+ static_cast<::cuda::std::int64_t>(::cuda::std::numeric_limits<per_invocation_segment_offset_t>::max());
1374
+
1375
+ // Number of radix sort invocations until all segments have been processed
1376
+ const auto num_invocations = ::cuda::ceil_div(num_segments, max_num_segments_per_invocation);
1377
+
1378
+ // Splitting the work into several invocations requires advancing the offset iterators between launches. If the
1379
+ // helper reports that d_begin_offsets or d_end_offsets lacks operator+=, more than one invocation is unsupported
1380
+ if (num_invocations > 1
1381
+ && !detail::all_iterators_support_add_assign_operator(::cuda::std::int64_t{}, d_begin_offsets, d_end_offsets))
1382
+ {
1383
+ return cudaErrorInvalidValue;
1384
+ }
1385
+
1386
+ BeginOffsetIteratorT begin_offsets_current_it = d_begin_offsets;
1387
+ EndOffsetIteratorT end_offsets_current_it = d_end_offsets;
1388
+
1389
+ // Iterate over chunks of segments
1390
+ for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
1391
+ {
1392
+ const auto current_segment_offset = invocation_index * max_num_segments_per_invocation;
1393
+ const auto num_current_segments =
1394
+ ::cuda::std::min(max_num_segments_per_invocation, num_segments - current_segment_offset);
1395
+
1396
+ // Log kernel configuration
1397
+ #ifdef CUB_DEBUG_LOG
1398
+ _CubLog(
1399
+ "Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), "
1400
+ "%lld items per thread, %lld SM occupancy, "
1401
+ "current segment offset %lld, current bit %d, bit_grain %d\n",
1402
+ (long long) num_current_segments,
1403
+ (long long) pass_config.segmented_config.block_threads,
1404
+ (long long) stream,
1405
+ (long long) pass_config.segmented_config.items_per_thread,
1406
+ (long long) pass_config.segmented_config.sm_occupancy,
1407
+ (long long) current_segment_offset,
1408
+ current_bit,
1409
+ pass_bits);
1410
+ #endif
1411
+
1412
+ launcher_factory(
1413
+ static_cast<unsigned int>(num_current_segments), pass_config.segmented_config.block_threads, 0, stream) // grid size = number of segments in this chunk
1414
+ .doit(pass_config.segmented_kernel,
1415
+ d_keys_in,
1416
+ d_keys_out,
1417
+ d_values_in,
1418
+ d_values_out,
1419
+ begin_offsets_current_it,
1420
+ end_offsets_current_it,
1421
+ current_bit,
1422
+ pass_bits,
1423
+ decomposer);
1424
+
1425
+ // Check for failure to launch
1426
+ error = CubDebug(cudaPeekAtLastError());
1427
+ if (cudaSuccess != error)
1428
+ {
1429
+ return error;
1430
+ }
1431
+
1432
+ if (invocation_index + 1 < num_invocations) // only advance the offset iterators when another chunk follows
1433
+ {
1434
+ detail::advance_iterators_inplace_if_supported(begin_offsets_current_it, num_current_segments);
1435
+ detail::advance_iterators_inplace_if_supported(end_offsets_current_it, num_current_segments);
1436
+ }
1437
+
1438
+ // Sync the stream if specified to flush runtime errors
1439
+ error = CubDebug(detail::DebugSyncStream(stream));
1440
+ if (cudaSuccess != error)
1441
+ {
1442
+ return error;
1443
+ }
1444
+ }
1445
+
1446
+ // Update current bit once all segments have been processed for the current pass
1447
+ current_bit += pass_bits;
1448
+
1449
+ return error;
1450
+ }
1451
+
1452
+ /// Per-pass launch parameters: the kernel to launch, its kernel configuration, and the digit width used by the pass
1453
+ template <typename SegmentedKernelT>
1454
+ struct PassConfig
1455
+ {
1456
+ SegmentedKernelT segmented_kernel;
1457
+ detail::KernelConfig segmented_config;
1458
+ int radix_bits; // upper bound on the bits consumed by one pass (see the clamp in InvokePass)
1459
+ int radix_digits; // always set to 1 << radix_bits by InitPassConfig
1460
+
1461
+ /// Record the kernel and digit width, then initialize `segmented_config` from the kernel, policy and launcher factory. Returns the status of that initialization.
1462
+ template <typename SegmentedPolicyT>
1463
+ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InitPassConfig(
1464
+ SegmentedKernelT segmented_kernel,
1465
+ int radix_bits,
1466
+ SegmentedPolicyT policy = {},
1467
+ KernelLauncherFactory launcher_factory = {})
1468
+ {
1469
+ this->segmented_kernel = segmented_kernel;
1470
+ this->radix_bits = radix_bits;
1471
+ this->radix_digits = 1 << radix_bits;
1472
+
1473
+ return CubDebug(segmented_config.Init(segmented_kernel, policy, launcher_factory));
1474
+ }
1475
+ };
1476
+
1477
+ /**
1478
+ * @brief Invocation (run multiple digit passes): initializes the regular and alternate pass configurations, sizes or aliases the temporary storage, then calls InvokePass until `current_bit` reaches `end_bit` and finally updates the selectors of `d_keys` / `d_values`
1479
+ *
1480
+ * @tparam ActivePolicyT
1481
+ * Umbrella policy active for the target device
1482
+ *
1483
+ * @tparam SegmentedKernelT
1484
+ * Function type of cub::DeviceSegmentedRadixSortKernel
1485
+ *
1486
+ * @param[in] segmented_kernel
1487
+ * Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
1488
+ *
1489
+ * @param[in] alt_segmented_kernel
1490
+ * Alternate kernel function pointer to parameterization of
1491
+ * cub::DeviceSegmentedRadixSortKernel
1492
+ */
1493
+ template <typename ActivePolicyT, typename SegmentedKernelT>
1494
+ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
1495
+ InvokePasses(SegmentedKernelT segmented_kernel, SegmentedKernelT alt_segmented_kernel, ActivePolicyT policy = {})
1496
+ {
1497
+ cudaError error = cudaSuccess;
1498
+ do
1499
+ {
1500
+ // Init regular and alternate kernel configurations
1501
+ PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
1502
+ if ((error = pass_config.InitPassConfig(
1503
+ segmented_kernel, policy.RadixBits(policy.Segmented()), policy.Segmented(), launcher_factory)))
1504
+ {
1505
+ break;
1506
+ }
1507
+ if ((error = alt_pass_config.InitPassConfig(
1508
+ alt_segmented_kernel, policy.RadixBits(policy.AltSegmented()), policy.AltSegmented(), launcher_factory)))
1509
+ {
1510
+ break;
1511
+ }
1512
+
1513
+ // Temporary storage allocation requirements
1514
+ void* allocations[2] = {};
1515
+ size_t allocation_sizes[2] = {
1516
+ // bytes needed for 3rd keys buffer
1517
+ (is_overwrite_okay) ? 0 : num_items * kernel_source.KeySize(),
1518
+
1519
+ // bytes needed for 3rd values buffer
1520
+ (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),
1521
+ };
1522
+
1523
+ // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
1524
+ error = CubDebug(detail::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
1525
+ if (cudaSuccess != error)
1526
+ {
1527
+ break;
1528
+ }
1529
+
1530
+ // Return if the caller is simply requesting the size of the storage allocation
1531
+ if (d_temp_storage == nullptr)
1532
+ {
1533
+ if (temp_storage_bytes == 0)
1534
+ {
1535
+ temp_storage_bytes = 1; // never report a zero-byte requirement
1536
+ }
1537
+ return cudaSuccess;
1538
+ }
1539
+
1540
+ // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our
1541
+ // preferred digit size
1542
+ int radix_bits = policy.RadixBits(policy.Segmented());
1543
+ int alt_radix_bits = policy.RadixBits(policy.AltSegmented());
1544
+ int num_bits = end_bit - begin_bit;
1545
+ int num_passes = ::cuda::std::max(::cuda::ceil_div(num_bits, radix_bits), 1); // num_bits may be zero
1546
+ bool is_num_passes_odd = num_passes & 1;
1547
+ int max_alt_passes = (num_passes * radix_bits) - num_bits; // surplus bits if every pass used radix_bits
1548
+ int alt_end_bit = ::cuda::std::min(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); // passes starting below this bit use alt_pass_config
1549
+
1550
+ DoubleBuffer<KeyT> d_keys_remaining_passes( // buffer pair for the passes after the first; the first pass writes into its Current()
1551
+ (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
1552
+ (is_overwrite_okay) ? d_keys.Current()
1553
+ : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0])
1554
+ : d_keys.Alternate());
1555
+
1556
+ DoubleBuffer<ValueT> d_values_remaining_passes( // same buffer selection as for the keys, using allocations[1]
1557
+ (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
1558
+ (is_overwrite_okay) ? d_values.Current()
1559
+ : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1])
1560
+ : d_values.Alternate());
1561
+
1562
+ // Run first pass, consuming from the input's current buffers
1563
+ int current_bit = begin_bit;
1564
+
1565
+ error = CubDebug(InvokePass(
1566
+ d_keys.Current(),
1567
+ d_keys_remaining_passes.Current(),
1568
+ d_values.Current(),
1569
+ d_values_remaining_passes.Current(),
1570
+ current_bit,
1571
+ (current_bit < alt_end_bit) ? alt_pass_config : pass_config));
1572
+ if (cudaSuccess != error)
1573
+ {
1574
+ break;
1575
+ }
1576
+
1577
+ // Run remaining passes
1578
+ while (current_bit < end_bit)
1579
+ {
1580
+ error = CubDebug(InvokePass(
1581
+ d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
1582
+ d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
1583
+ d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], // indexed with the keys selector; both selectors are flipped together below
1584
+ d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
1585
+ current_bit,
1586
+ (current_bit < alt_end_bit) ? alt_pass_config : pass_config));
1587
+ if (cudaSuccess != error)
1588
+ {
1589
+ break; // NOTE(review): exits only this while loop; the selector update below still runs before `error` is returned
1590
+ }
1591
+
1592
+ // Invert selectors (current_bit was already advanced inside InvokePass)
1593
+ d_keys_remaining_passes.selector ^= 1;
1594
+ d_values_remaining_passes.selector ^= 1;
1595
+ }
1596
+
1597
+ // Update selector
1598
+ if (!is_overwrite_okay)
1599
+ {
1600
+ num_passes = 1; // Sorted data always ends up in the other vector
1601
+ }
1602
+
1603
+ d_keys.selector = (d_keys.selector + num_passes) & 1;
1604
+ d_values.selector = (d_values.selector + num_passes) & 1;
1605
+ } while (0);
1606
+
1607
+ return error;
1608
+ }
1609
+
1610
+ //------------------------------------------------------------------------------
1611
+ // Chained policy invocation
1612
+ //------------------------------------------------------------------------------
1613
+
1614
+ /// Invocation for the active policy: returns early for trivial problems, otherwise wraps `policy` and calls InvokePasses with the regular and alternate segmented kernels obtained from `kernel_source`
1615
+ template <typename ActivePolicyT>
1616
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT policy = {})
1617
+ {
1618
+ // Return if empty problem, or if no bits to sort and double-buffering is used
1619
+ if (num_items == 0 || num_segments == 0 || (begin_bit == end_bit && is_overwrite_okay))
1620
+ {
1621
+ if (d_temp_storage == nullptr)
1622
+ {
1623
+ temp_storage_bytes = 1; // a size query still reports a non-zero requirement
1624
+ }
1625
+ return cudaSuccess;
1626
+ }
1627
+
1628
+ // Force kernel code-generation in all compiler passes
1629
+ return InvokePasses(kernel_source.SegmentedRadixSortKernel(),
1630
+ kernel_source.AltSegmentedRadixSortKernel(),
1631
+ detail::radix::MakeRadixSortPolicyWrapper(policy));
1632
+ }
1633
+
1634
+ //------------------------------------------------------------------------------
1635
+ // Dispatch entrypoints
1636
+ //------------------------------------------------------------------------------
1637
+
1638
+ /**
1639
+ * @brief Internal dispatch routine
1640
+ *
1641
+ * @param[in] d_temp_storage
1642
+ * Device-accessible allocation of temporary storage. When nullptr, the required allocation size
1643
+ * is written to `temp_storage_bytes` and no work is done.
1644
+ *
1645
+ * @param[in,out] temp_storage_bytes
1646
+ * Reference to size in bytes of `d_temp_storage` allocation
1647
+ *
1648
+ * @param[in,out] d_keys
1649
+ * Double-buffer whose current buffer contains the unsorted input keys and, upon return, is
1650
+ * updated to point to the sorted output keys
1651
+ *
1652
+ * @param[in,out] d_values
1653
+ * Double-buffer whose current buffer contains the unsorted input values and, upon return, is
1654
+ * updated to point to the sorted output values
1655
+ *
1656
+ * @param[in] num_items
1657
+ * Number of items to sort
1658
+ *
1659
+ * @param[in] num_segments
1660
+ * The number of segments that comprise the sorting data
1661
+ *
1662
+ * @param[in] d_begin_offsets
1663
+ * Random-access input iterator to the sequence of beginning offsets of length
1664
+ * `num_segments`, such that <tt>d_begin_offsets[i]</tt> is the first element of the
1665
+ * <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
1666
+ *
1667
+ * @param[in] d_end_offsets
1668
+ * Random-access input iterator to the sequence of ending offsets of length `num_segments`,
1669
+ * such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>
1670
+ * data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.
1671
+ * If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>,
1672
+ * the <em>i</em><sup>th</sup> segment is considered empty.
1673
+ *
1674
+ * @param[in] begin_bit
1675
+ * The beginning (least-significant) bit index needed for key comparison
1676
+ *
1677
+ * @param[in] end_bit
1678
+ * The past-the-end (most-significant) bit index needed for key comparison
1679
+ *
1680
+ * @param[in] is_overwrite_okay
1681
+ * Whether it is okay to overwrite source buffers
1682
+ *
1683
+ * @param[in] stream
1684
+ * CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
1685
+ */
1686
+ template <typename MaxPolicyT = typename PolicyHub::MaxPolicy>
1687
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch(
1688
+ void* d_temp_storage,
1689
+ size_t& temp_storage_bytes,
1690
+ DoubleBuffer<KeyT>& d_keys,
1691
+ DoubleBuffer<ValueT>& d_values,
1692
+ ::cuda::std::int64_t num_items,
1693
+ ::cuda::std::int64_t num_segments,
1694
+ BeginOffsetIteratorT d_begin_offsets,
1695
+ EndOffsetIteratorT d_end_offsets,
1696
+ int begin_bit,
1697
+ int end_bit,
1698
+ bool is_overwrite_okay,
1699
+ cudaStream_t stream,
1700
+ KernelSource kernel_source = {},
1701
+ KernelLauncherFactory launcher_factory = {},
1702
+ MaxPolicyT max_policy = {})
1703
+ {
1704
+ cudaError_t error; // assigned by the PtxVersion query below before any read
1705
+ do
1706
+ {
1707
+ // Get PTX version (queried through the launcher factory)
1708
+ int ptx_version = 0;
1709
+
1710
+ error = CubDebug(launcher_factory.PtxVersion(ptx_version));
1711
+ if (cudaSuccess != error)
1712
+ {
1713
+ break;
1714
+ }
1715
+
1716
+ // Create dispatch functor
1717
+ DispatchSegmentedRadixSort dispatch(
1718
+ d_temp_storage,
1719
+ temp_storage_bytes,
1720
+ d_keys,
1721
+ d_values,
1722
+ num_items,
1723
+ num_segments,
1724
+ d_begin_offsets,
1725
+ d_end_offsets,
1726
+ begin_bit,
1727
+ end_bit,
1728
+ is_overwrite_okay,
1729
+ stream,
1730
+ ptx_version,
1731
+ {}, // decomposer: default-constructed, Dispatch has no parameter for it
1732
+ kernel_source,
1733
+ launcher_factory);
1734
+
1735
+ // Dispatch to chained policy; presumably this selects the policy for ptx_version and calls dispatch.Invoke - TODO confirm against the policy chain
1736
+ error = CubDebug(max_policy.Invoke(ptx_version, dispatch));
1737
+ if (cudaSuccess != error)
1738
+ {
1739
+ break;
1740
+ }
1741
+ } while (0);
1742
+
1743
+ return error;
1744
+ }
1745
+ };
1746
+
1747
+ CUB_NAMESPACE_END
1748
+
1749
+ _CCCL_DIAG_POP