cuda-cccl 0.1.3.2.0.dev271__cp311-cp311-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1947) hide show
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +46 -0
  3. cuda/cccl/cooperative/__init__.py +3 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  5. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  6. cuda/cccl/cooperative/experimental/_common.py +273 -0
  7. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  8. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  9. cuda/cccl/cooperative/experimental/_types.py +937 -0
  10. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  11. cuda/cccl/cooperative/experimental/block/__init__.py +39 -0
  12. cuda/cccl/cooperative/experimental/block/_block_exchange.py +251 -0
  13. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  14. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  15. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  16. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  17. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  18. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +92 -0
  20. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  21. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  22. cuda/cccl/headers/__init__.py +7 -0
  23. cuda/cccl/headers/include/__init__.py +1 -0
  24. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +262 -0
  25. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  26. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  27. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
  28. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +226 -0
  29. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +730 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  32. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  33. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  34. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +632 -0
  35. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  36. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  37. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  38. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  39. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  40. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1114 -0
  41. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +341 -0
  42. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  43. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  44. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1342 -0
  45. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  46. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  47. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  48. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  49. cuda/cccl/headers/include/cub/block/block_load.cuh +1260 -0
  50. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  51. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  52. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  53. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  54. cuda/cccl/headers/include/cub/block/block_reduce.cuh +665 -0
  55. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  56. cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
  57. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  58. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  59. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  65. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  66. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  67. cuda/cccl/headers/include/cub/config.cuh +53 -0
  68. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  69. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  70. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  71. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  72. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  73. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  74. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +84 -0
  75. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  76. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  77. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  82. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  83. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  84. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  85. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  86. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  87. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  88. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  89. cuda/cccl/headers/include/cub/detail/type_traits.cuh +179 -0
  90. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  91. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  92. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  93. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  94. cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
  95. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  96. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  97. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  98. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  99. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  100. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
  101. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1898 -0
  102. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  103. cuda/cccl/headers/include/cub/device/device_scan.cuh +1899 -0
  104. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  105. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  106. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  107. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  108. cuda/cccl/headers/include/cub/device/device_transform.cuh +545 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1042 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1749 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +656 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +612 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +916 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +441 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +455 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +558 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +591 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +475 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +121 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +987 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +609 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +448 -0
  159. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  160. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +226 -0
  161. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  162. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  163. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +260 -0
  165. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  166. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  167. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  168. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +684 -0
  169. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +547 -0
  170. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  171. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  172. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +464 -0
  173. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  174. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  175. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  176. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  177. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  178. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  179. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  180. cuda/cccl/headers/include/cub/util_macro.cuh +99 -0
  181. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  182. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  183. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  184. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  185. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  186. cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
  187. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  188. cuda/cccl/headers/include/cub/version.cuh +89 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +950 -0
  194. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +713 -0
  195. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  196. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  197. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  198. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  199. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1885 -0
  200. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  201. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/copy.h +143 -0
  204. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  211. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  212. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  213. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +466 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  218. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  219. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  220. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  221. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  222. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  223. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  225. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +249 -0
  226. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  227. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  228. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  229. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  230. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  232. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__device/all_devices.h +240 -0
  235. cuda/cccl/headers/include/cuda/__device/arch_traits.h +613 -0
  236. cuda/cccl/headers/include/cuda/__device/attributes.h +721 -0
  237. cuda/cccl/headers/include/cuda/__device/device_ref.h +176 -0
  238. cuda/cccl/headers/include/cuda/__device/physical_device.h +168 -0
  239. cuda/cccl/headers/include/cuda/__driver/driver_api.h +503 -0
  240. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  241. cuda/cccl/headers/include/cuda/__event/event_ref.h +158 -0
  242. cuda/cccl/headers/include/cuda/__event/timed_event.h +118 -0
  243. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  244. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  245. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  246. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  247. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
  248. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  249. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  250. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +109 -0
  251. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  253. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  254. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  255. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +49 -0
  256. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
  257. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
  258. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  259. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +424 -0
  260. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +292 -0
  261. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
  262. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +335 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +501 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +496 -0
  265. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +452 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +94 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +539 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  285. cuda/cccl/headers/include/cuda/__memory/address_space.h +211 -0
  286. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  289. cuda/cccl/headers/include/cuda/__memory/check_address.h +106 -0
  290. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  291. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  292. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  293. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  294. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  295. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +69 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +654 -0
  299. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  300. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  301. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  302. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  303. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  304. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  412. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  413. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  414. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  415. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +97 -0
  416. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  417. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  418. cuda/cccl/headers/include/cuda/__stream/stream.h +142 -0
  419. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +296 -0
  420. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  421. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  422. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  423. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  424. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  425. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +521 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +78 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  443. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  444. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  445. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  446. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  447. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  448. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  449. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  450. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  451. cuda/cccl/headers/include/cuda/access_property +26 -0
  452. cuda/cccl/headers/include/cuda/algorithm +27 -0
  453. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  454. cuda/cccl/headers/include/cuda/atomic +27 -0
  455. cuda/cccl/headers/include/cuda/barrier +267 -0
  456. cuda/cccl/headers/include/cuda/bit +29 -0
  457. cuda/cccl/headers/include/cuda/cmath +36 -0
  458. cuda/cccl/headers/include/cuda/devices +20 -0
  459. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  460. cuda/cccl/headers/include/cuda/functional +32 -0
  461. cuda/cccl/headers/include/cuda/iterator +38 -0
  462. cuda/cccl/headers/include/cuda/latch +27 -0
  463. cuda/cccl/headers/include/cuda/mdspan +28 -0
  464. cuda/cccl/headers/include/cuda/memory +34 -0
  465. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  466. cuda/cccl/headers/include/cuda/numeric +29 -0
  467. cuda/cccl/headers/include/cuda/pipeline +578 -0
  468. cuda/cccl/headers/include/cuda/ptx +128 -0
  469. cuda/cccl/headers/include/cuda/semaphore +31 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  566. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  567. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  568. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  569. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  570. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  588. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  589. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  590. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  591. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  592. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  593. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  594. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  595. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  601. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  602. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
  603. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  604. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
  605. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +146 -0
  626. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  627. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  628. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  629. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  630. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  631. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  632. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  633. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  634. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
  635. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  636. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
  637. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  638. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  639. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  640. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  641. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  642. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  643. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  644. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  645. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  646. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  647. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
  648. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +260 -0
  649. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  650. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  656. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  657. cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
  658. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  659. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  660. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  661. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  662. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  663. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  664. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  665. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +322 -0
  666. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +321 -0
  667. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  668. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  669. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  670. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  671. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  672. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  673. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  674. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  675. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  676. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  677. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
  678. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  679. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  680. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  681. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  682. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  683. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  684. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  685. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  686. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  687. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  689. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  690. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  691. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  692. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  694. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  695. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  696. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  697. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  698. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  699. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  700. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +58 -0
  701. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  702. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  703. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  704. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +146 -0
  705. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  706. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  707. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  708. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  709. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1963 -0
  710. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  711. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  712. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +172 -0
  713. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  714. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  715. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  716. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  717. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +374 -0
  718. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  719. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +113 -0
  720. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  721. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  722. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +72 -0
  723. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  724. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  725. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  726. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  727. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  728. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  729. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  730. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  731. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  732. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  733. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  734. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  735. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  736. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  737. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  738. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  739. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  740. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  741. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  742. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  743. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  744. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  745. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  746. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  747. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  748. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  749. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  750. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  751. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  752. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  753. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  754. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  755. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  756. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  757. cuda/cccl/headers/include/cuda/std/__functional/function.h +1279 -0
  758. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  759. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  760. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  761. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  762. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  763. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
  764. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  765. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  766. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  767. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  768. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  769. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
  775. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  776. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  777. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  778. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  779. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  780. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  781. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  782. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  783. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  784. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +90 -0
  785. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  786. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  787. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  788. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  789. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  790. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  791. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  792. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  793. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  794. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  795. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  796. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +122 -0
  797. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  798. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  799. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  800. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
  801. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  802. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  803. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  804. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  805. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  807. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  808. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  809. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +150 -0
  810. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  811. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  812. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  813. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  814. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  815. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  816. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  817. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
  818. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  819. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +433 -0
  820. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
  834. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  835. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  836. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  837. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  838. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  839. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  840. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  841. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  842. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  843. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +138 -0
  844. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  846. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
  847. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  848. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  849. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  850. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +499 -0
  851. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  852. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  853. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  854. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  855. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  856. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  857. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  858. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  859. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  860. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  861. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +552 -0
  862. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  863. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  864. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  865. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  866. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  867. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  868. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
  869. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  870. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  871. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +682 -0
  872. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +767 -0
  873. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  874. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  875. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  876. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  877. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  878. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  879. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  880. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  881. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  882. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  883. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  884. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  885. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  886. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  887. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  888. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  889. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  890. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  891. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  892. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  893. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  894. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  895. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  896. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  897. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  898. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +432 -0
  899. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  900. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  901. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  902. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  903. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  904. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  905. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  906. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  907. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  908. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  909. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +314 -0
  910. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  911. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  912. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  913. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  914. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  915. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  916. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  917. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  918. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  919. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  920. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  921. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  922. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  923. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  924. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  925. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  926. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  927. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  928. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  929. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  930. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  935. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  936. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  937. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  938. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  939. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  940. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  941. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  942. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  943. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  944. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  945. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  946. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  947. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  948. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  949. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  950. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  951. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
  952. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +218 -0
  953. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  954. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  955. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  956. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  957. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  958. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  959. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +291 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  973. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  974. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  975. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  976. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  977. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  978. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  979. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  980. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  981. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  983. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  984. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1100. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1101. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1102. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1103. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1104. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  1105. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1106. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1107. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1108. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1109. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  1110. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1111. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1112. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1113. cuda/cccl/headers/include/cuda/std/__utility/pair.h +797 -0
  1114. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1115. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1116. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1117. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1118. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1119. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1120. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1121. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1122. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1123. cuda/cccl/headers/include/cuda/std/array +518 -0
  1124. cuda/cccl/headers/include/cuda/std/atomic +818 -0
  1125. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1126. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1127. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1128. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1129. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1130. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1131. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1132. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1133. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1134. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1135. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1136. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1137. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1138. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1139. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1140. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1141. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1142. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
  1143. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1722 -0
  1144. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3630 -0
  1145. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +520 -0
  1146. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1147. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1148. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1149. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2142 -0
  1150. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1151. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1152. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1153. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1154. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1155. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1156. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1157. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1158. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1159. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1160. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1161. cuda/cccl/headers/include/cuda/std/numbers +342 -0
  1162. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1163. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1164. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1165. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1166. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1167. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1168. cuda/cccl/headers/include/cuda/std/span +628 -0
  1169. cuda/cccl/headers/include/cuda/std/string_view +799 -0
  1170. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1171. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1172. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1173. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1174. cuda/cccl/headers/include/cuda/std/version +245 -0
  1175. cuda/cccl/headers/include/cuda/stream +31 -0
  1176. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1177. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1178. cuda/cccl/headers/include/cuda/utility +27 -0
  1179. cuda/cccl/headers/include/cuda/version +16 -0
  1180. cuda/cccl/headers/include/cuda/warp +28 -0
  1181. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1182. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1183. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1184. cuda/cccl/headers/include/nv/target +235 -0
  1185. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1186. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1187. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1188. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1189. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1190. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1191. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1192. cuda/cccl/headers/include/thrust/count.h +245 -0
  1193. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1194. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1195. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1196. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1197. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1198. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1199. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1200. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1201. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1202. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1203. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1204. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1205. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1206. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1207. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1208. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1209. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1210. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +79 -0
  1211. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1212. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1213. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1214. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1215. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1216. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1217. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1218. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1219. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1220. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1221. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1222. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1223. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1224. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1225. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1226. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1227. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1228. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1229. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1230. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1237. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1238. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1239. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1240. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1241. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1242. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1243. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1244. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1245. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1246. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1247. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1248. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1249. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1250. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1251. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1252. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1253. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1254. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1255. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1256. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1257. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1258. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1259. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1260. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1261. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1262. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1263. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1264. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1265. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1266. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1267. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1268. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1269. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1270. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1271. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1272. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1273. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1274. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1275. cuda/cccl/headers/include/thrust/detail/internal_functional.h +293 -0
  1276. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1277. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1278. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1279. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1280. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1281. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1282. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1283. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1284. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1285. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1286. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1287. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1288. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1289. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1290. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1291. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1292. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1293. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1294. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1295. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1296. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1297. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1298. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1299. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1300. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1301. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1302. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1303. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1304. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1305. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1306. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1307. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1308. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1309. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1310. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1311. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1312. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1313. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1314. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1315. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1316. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1317. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1318. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1320. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1321. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1322. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1324. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1325. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1330. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1331. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1332. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1333. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1334. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1335. cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
  1336. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
  1337. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1338. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1339. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1340. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1341. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1342. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1343. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1344. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1345. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1346. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1347. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1348. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1349. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1350. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1351. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1352. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1353. cuda/cccl/headers/include/thrust/find.h +382 -0
  1354. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1355. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1356. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1357. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1358. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1359. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1360. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1361. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1362. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1363. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1364. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1365. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1366. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1367. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1377. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1378. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1379. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1380. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1381. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +311 -0
  1382. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1383. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1384. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1385. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1386. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1387. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1388. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1389. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1390. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1391. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1392. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1393. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1394. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1395. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1396. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1397. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1398. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1399. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1400. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1401. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1402. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1403. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1404. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1405. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1406. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1407. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1408. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1409. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1410. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1411. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1412. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1413. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1414. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1415. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1416. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1417. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1418. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1419. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1420. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1421. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1430. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1431. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1432. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1433. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1434. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1435. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1436. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1437. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1438. cuda/cccl/headers/include/thrust/random.h +120 -0
  1439. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1440. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1441. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1442. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1443. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1444. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1445. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1446. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1447. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1448. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1449. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1450. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1451. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1452. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1453. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1454. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1455. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +158 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1501. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1502. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1503. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1504. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1505. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +609 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +92 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +782 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1738 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +415 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +92 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +73 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1755. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +157 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1822. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +157 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +54 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/.gitignore +4 -0
  1912. cuda/cccl/parallel/experimental/__init__.py +75 -0
  1913. cuda/cccl/parallel/experimental/_bindings.py +56 -0
  1914. cuda/cccl/parallel/experimental/_bindings.pyi +405 -0
  1915. cuda/cccl/parallel/experimental/_bindings_impl.pyx +1957 -0
  1916. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1917. cuda/cccl/parallel/experimental/_cccl_interop.py +396 -0
  1918. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1919. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1920. cuda/cccl/parallel/experimental/_utils/temp_storage_buffer.py +86 -0
  1921. cuda/cccl/parallel/experimental/algorithms/__init__.py +50 -0
  1922. cuda/cccl/parallel/experimental/algorithms/_histogram.py +243 -0
  1923. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +225 -0
  1924. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +312 -0
  1925. cuda/cccl/parallel/experimental/algorithms/_reduce.py +184 -0
  1926. cuda/cccl/parallel/experimental/algorithms/_scan.py +261 -0
  1927. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +257 -0
  1928. cuda/cccl/parallel/experimental/algorithms/_transform.py +308 -0
  1929. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +252 -0
  1930. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1931. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  1932. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  1933. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  1934. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  1935. cuda/cccl/parallel/experimental/iterators/__init__.py +21 -0
  1936. cuda/cccl/parallel/experimental/iterators/_factories.py +214 -0
  1937. cuda/cccl/parallel/experimental/iterators/_iterators.py +627 -0
  1938. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +207 -0
  1939. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1940. cuda/cccl/parallel/experimental/op.py +3 -0
  1941. cuda/cccl/parallel/experimental/struct.py +272 -0
  1942. cuda/cccl/parallel/experimental/typing.py +35 -0
  1943. cuda/cccl/py.typed +0 -0
  1944. cuda_cccl-0.1.3.2.0.dev271.dist-info/METADATA +40 -0
  1945. cuda_cccl-0.1.3.2.0.dev271.dist-info/RECORD +1947 -0
  1946. cuda_cccl-0.1.3.2.0.dev271.dist-info/WHEEL +5 -0
  1947. cuda_cccl-0.1.3.2.0.dev271.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,1885 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! @rst
31
+ //! The ``cub::WarpScan`` class provides :ref:`collective <collective-primitives>` methods for
32
+ //! computing a parallel prefix scan of items partitioned across a CUDA thread warp.
33
+ //! @endrst
34
+
35
+ #pragma once
36
+
37
+ #include <cub/config.cuh>
38
+
39
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
40
+ # pragma GCC system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
42
+ # pragma clang system_header
43
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
44
+ # pragma system_header
45
+ #endif // no system header
46
+
47
+ #include <cub/thread/thread_operators.cuh>
48
+ #include <cub/util_type.cuh>
49
+ #include <cub/warp/specializations/warp_scan_shfl.cuh>
50
+ #include <cub/warp/specializations/warp_scan_smem.cuh>
51
+
52
+ #include <cuda/ptx>
53
+ #include <cuda/std/type_traits>
54
+
55
+ CUB_NAMESPACE_BEGIN
56
+
57
+ //! @rst
58
+ //! The WarpScan class provides :ref:`collective <collective-primitives>` methods for computing a
59
+ //! parallel prefix scan of items partitioned across a CUDA thread warp.
60
+ //!
61
+ //! .. image:: ../../img/warp_scan_logo.png
62
+ //! :align: center
63
+ //!
64
+ //! Overview
65
+ //! ++++++++++++++++++++++++++
66
+ //!
67
+ //! * Given a list of input elements and a binary reduction operator, a
68
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`__ produces an output list where each
69
+ //! element is computed to be the reduction of the elements occurring earlier in the input list.
70
+ //! *Prefix sum* connotes a prefix scan with the addition operator. The term *inclusive*
71
+ //! indicates that the *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input.
72
+ //! The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into
73
+ //! the *i*\ :sup:`th` output reduction.
74
+ //! * Supports non-commutative scan operators
75
+ //! * Supports "logical" warps smaller than the physical warp size
76
+ //! (e.g., a logical warp of 8 threads)
77
+ //! * The number of entrant threads must be an multiple of ``LOGICAL_WARP_THREADS``
78
+ //!
79
+ //! Performance Considerations
80
+ //! ++++++++++++++++++++++++++
81
+ //!
82
+ //! * Uses special instructions when applicable (e.g., warp ``SHFL``)
83
+ //! * Uses synchronization-free communication between warp lanes when applicable
84
+ //! * Incurs zero bank conflicts for most types
85
+ //! * Computation is slightly more efficient (i.e., having lower instruction overhead) for:
86
+ //!
87
+ //! * Summation (**vs.** generic scan)
88
+ //! * The architecture's warp size is a whole multiple of ``LOGICAL_WARP_THREADS``
89
+ //!
90
+ //! Simple Examples
91
+ //! ++++++++++++++++++++++++++
92
+ //!
93
+ //! @warpcollective{WarpScan}
94
+ //!
95
+ //! The code snippet below illustrates four concurrent warp prefix sums within a block of
96
+ //! 128 threads (one per each of the 32-thread warps).
97
+ //!
98
+ //! .. code-block:: c++
99
+ //!
100
+ //! #include <cub/cub.cuh>
101
+ //!
102
+ //! __global__ void ExampleKernel(...)
103
+ //! {
104
+ //! // Specialize WarpScan for type int
105
+ //! using WarpScan = cub::WarpScan<int>;
106
+ //!
107
+ //! // Allocate WarpScan shared memory for 4 warps
108
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
109
+ //!
110
+ //! // Obtain one input item per thread
111
+ //! int thread_data = ...
112
+ //!
113
+ //! // Compute warp-wide prefix sums
114
+ //! int warp_id = threadIdx.x / 32;
115
+ //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
116
+ //!
117
+ //! Suppose the set of input ``thread_data`` across the block of threads is
118
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps of
119
+ //! threads will be ``0, 1, 2, 3, ..., 31}``.
120
+ //!
121
+ //! The code snippet below illustrates a single warp prefix sum within a block of
122
+ //! 128 threads.
123
+ //!
124
+ //! .. code-block:: c++
125
+ //!
126
+ //! #include <cub/cub.cuh>
127
+ //!
128
+ //! __global__ void ExampleKernel(...)
129
+ //! {
130
+ //! // Specialize WarpScan for type int
131
+ //! using WarpScan = cub::WarpScan<int>;
132
+ //!
133
+ //! // Allocate WarpScan shared memory for one warp
134
+ //! __shared__ typename WarpScan::TempStorage temp_storage;
135
+ //! ...
136
+ //!
137
+ //! // Only the first warp performs a prefix sum
138
+ //! if (threadIdx.x < 32)
139
+ //! {
140
+ //! // Obtain one input item per thread
141
+ //! int thread_data = ...
142
+ //!
143
+ //! // Compute warp-wide prefix sums
144
+ //! WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
145
+ //!
146
+ //! Suppose the set of input ``thread_data`` across the warp of threads is
147
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` will be
148
+ //! ``{0, 1, 2, 3, ..., 31}``.
149
+ //! @endrst
150
+ //!
151
+ //! @tparam T
152
+ //! The scan input/output element type
153
+ //!
154
+ //! @tparam LOGICAL_WARP_THREADS
155
+ //! **[optional]** The number of threads per "logical" warp (may be less than the number of
156
+ //! hardware warp threads). Default is the warp size associated with the CUDA Compute Capability
157
+ //! targeted by the compiler (e.g., 32 threads for SM20).
158
+ //!
159
+ template <typename T, int LOGICAL_WARP_THREADS = detail::warp_threads>
160
+ class WarpScan
161
+ {
162
+ private:
163
+ /******************************************************************************
164
+ * Constants and type definitions
165
+ ******************************************************************************/
166
+
167
+ enum
168
+ {
169
+ /// Whether the logical warp size and the PTX warp size coincide
170
+ IS_ARCH_WARP = (LOGICAL_WARP_THREADS == detail::warp_threads),
171
+
172
+ /// Whether the logical warp size is a power-of-two
173
+ IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
174
+
175
+ /// Whether the data type is an integer (which has fully-associative addition)
176
+ IS_INTEGER = cuda::std::is_integral_v<T>
177
+ };
178
+
179
+ /// Internal specialization.
180
+ /// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two
181
+ using InternalWarpScan = ::cuda::std::
182
+ _If<IS_POW_OF_TWO, detail::WarpScanShfl<T, LOGICAL_WARP_THREADS>, detail::WarpScanSmem<T, LOGICAL_WARP_THREADS>>;
183
+
184
+ /// Shared memory storage layout type for WarpScan
185
+ using _TempStorage = typename InternalWarpScan::TempStorage;
186
+
187
+ /******************************************************************************
188
+ * Thread fields
189
+ ******************************************************************************/
190
+
191
+ /// Shared storage reference
192
+ _TempStorage& temp_storage;
193
+ unsigned int lane_id;
194
+
195
+ /******************************************************************************
196
+ * Public types
197
+ ******************************************************************************/
198
+
199
+ public:
200
+ /// @smemstorage{WarpScan}
201
+ struct TempStorage : Uninitialized<_TempStorage>
202
+ {};
203
+
204
+ //! @name Collective constructors
205
+ //! @{
206
+
207
+ //! @brief Collective constructor using the specified memory allocation as temporary storage.
208
+ //! Logical warp and lane identifiers are constructed from `threadIdx.x`.
209
+ //!
210
+ //! @param[in] temp_storage
211
+ //! Reference to memory allocation having layout type TempStorage
212
+ _CCCL_DEVICE _CCCL_FORCEINLINE WarpScan(TempStorage& temp_storage)
213
+ : temp_storage(temp_storage.Alias())
214
+ , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)
215
+ {}
216
+
217
+ //! @} end member group
218
+ //! @name Inclusive prefix sums
219
+ //! @{
220
+
221
+ //! @rst
222
+ //! Computes an inclusive prefix sum across the calling warp.
223
+ //!
224
+ //! * @smemwarpreuse
225
+ //!
226
+ //! Snippet
227
+ //! +++++++
228
+ //!
229
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a
230
+ //! block of 128 threads (one per each of the 32-thread warps).
231
+ //!
232
+ //! .. code-block:: c++
233
+ //!
234
+ //! #include <cub/cub.cuh>
235
+ //!
236
+ //! __global__ void ExampleKernel(...)
237
+ //! {
238
+ //! // Specialize WarpScan for type int
239
+ //! using WarpScan = cub::WarpScan<int>;
240
+ //!
241
+ //! // Allocate WarpScan shared memory for 4 warps
242
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
243
+ //!
244
+ //! // Obtain one input item per thread
245
+ //! int thread_data = ...
246
+ //!
247
+ //! // Compute inclusive warp-wide prefix sums
248
+ //! int warp_id = threadIdx.x / 32;
249
+ //! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
250
+ //!
251
+ //! Suppose the set of input ``thread_data`` across the block of threads is
252
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
253
+ //! of threads will be ``1, 2, 3, ..., 32}``.
254
+ //! @endrst
255
+ //!
256
+ //! @param[in] input
257
+ //! Calling thread's input item.
258
+ //!
259
+ //! @param[out] inclusive_output
260
+ //! Calling thread's output item. May be aliased with `input`.
261
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& inclusive_output)
262
+ {
263
+ InclusiveScan(input, inclusive_output, ::cuda::std::plus<>{});
264
+ }
265
+
266
+ //! @rst
267
+ //! Computes an inclusive prefix sum across the calling warp.
268
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all inputs.
269
+ //!
270
+ //! * @smemwarpreuse
271
+ //!
272
+ //! Snippet
273
+ //! +++++++
274
+ //!
275
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a
276
+ //! block of 128 threads (one per each of the 32-thread warps).
277
+ //!
278
+ //! .. code-block:: c++
279
+ //!
280
+ //! #include <cub/cub.cuh>
281
+ //!
282
+ //! __global__ void ExampleKernel(...)
283
+ //! {
284
+ //! // Specialize WarpScan for type int
285
+ //! using WarpScan = cub::WarpScan<int>;
286
+ //!
287
+ //! // Allocate WarpScan shared memory for 4 warps
288
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
289
+ //!
290
+ //! // Obtain one input item per thread
291
+ //! int thread_data = ...
292
+ //!
293
+ //! // Compute inclusive warp-wide prefix sums
294
+ //! int warp_aggregate;
295
+ //! int warp_id = threadIdx.x / 32;
296
+ //! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data,
297
+ //! thread_data,
298
+ //! warp_aggregate);
299
+ //!
300
+ //! Suppose the set of input ``thread_data`` across the block of threads is
301
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
302
+ //! of threads will be ``1, 2, 3, ..., 32}``. Furthermore, ``warp_aggregate`` for all threads
303
+ //! in all warps will be ``32``.
304
+ //! @endrst
305
+ //!
306
+ //! @param[in] input
307
+ //! Calling thread's input item
308
+ //!
309
+ //! @param[out] inclusive_output
310
+ //! Calling thread's output item. May be aliased with `input`
311
+ //!
312
+ //! @param[out] warp_aggregate
313
+ //! Warp-wide aggregate reduction of input items
314
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& inclusive_output, T& warp_aggregate)
315
+ {
316
+ InclusiveScan(input, inclusive_output, ::cuda::std::plus<>{}, warp_aggregate);
317
+ }
318
+
319
+ //! @} end member group
320
+ //! @name Exclusive prefix sums
321
+ //! @{
322
+
323
+ //! @rst
324
+ //! Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the
325
+ //! initial value, and is assigned to ``exclusive_output`` in *lane*\ :sub:`0`.
326
+ //!
327
+ //! * @identityzero
328
+ //! * @smemwarpreuse
329
+ //!
330
+ //! Snippet
331
+ //! +++++++
332
+ //!
333
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a
334
+ //! block of 128 threads (one per each of the 32-thread warps).
335
+ //!
336
+ //! .. code-block:: c++
337
+ //!
338
+ //! #include <cub/cub.cuh>
339
+ //!
340
+ //! __global__ void ExampleKernel(...)
341
+ //! {
342
+ //! // Specialize WarpScan for type int
343
+ //! using WarpScan = cub::WarpScan<int>;
344
+ //!
345
+ //! // Allocate WarpScan shared memory for 4 warps
346
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
347
+ //!
348
+ //! // Obtain one input item per thread
349
+ //! int thread_data = ...
350
+ //!
351
+ //! // Compute exclusive warp-wide prefix sums
352
+ //! int warp_id = threadIdx.x / 32;
353
+ //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
354
+ //!
355
+ //! Suppose the set of input ``thread_data`` across the block of threads is
356
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
357
+ //! of threads will be ``0, 1, 2, ..., 31}``.
358
+ //! @endrst
359
+ //!
360
+ //! @param[in] input
361
+ //! Calling thread's input item.
362
+ //!
363
+ //! @param[out] exclusive_output
364
+ //! Calling thread's output item. May be aliased with `input`.
365
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& exclusive_output)
366
+ {
367
+ T initial_value{};
368
+ ExclusiveScan(input, exclusive_output, initial_value, ::cuda::std::plus<>{});
369
+ }
370
+
371
+ //! @rst
372
+ //! Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the
373
+ //! initial value, and is assigned to ``exclusive_output`` in *lane*\ :sub:`0`.
374
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all inputs.
375
+ //!
376
+ //! * @identityzero
377
+ //! * @smemwarpreuse
378
+ //!
379
+ //! Snippet
380
+ //! +++++++
381
+ //!
382
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a
383
+ //! block of 128 threads (one per each of the 32-thread warps).
384
+ //!
385
+ //! .. code-block:: c++
386
+ //!
387
+ //! #include <cub/cub.cuh>
388
+ //!
389
+ //! __global__ void ExampleKernel(...)
390
+ //! {
391
+ //! // Specialize WarpScan for type int
392
+ //! using WarpScan = cub::WarpScan<int>;
393
+ //!
394
+ //! // Allocate WarpScan shared memory for 4 warps
395
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
396
+ //!
397
+ //! // Obtain one input item per thread
398
+ //! int thread_data = ...
399
+ //!
400
+ //! // Compute exclusive warp-wide prefix sums
401
+ //! int warp_aggregate;
402
+ //! int warp_id = threadIdx.x / 32;
403
+ //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data,
404
+ //! thread_data,
405
+ //! warp_aggregate);
406
+ //!
407
+ //! Suppose the set of input ``thread_data`` across the block of threads is
408
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
409
+ //! of threads will be ``0, 1, 2, ..., 31}``. Furthermore, ``warp_aggregate`` for all threads
410
+ //! in all warps will be ``32``.
411
+ //! @endrst
412
+ //!
413
+ //!
414
+ //! @param[in] input
415
+ //! Calling thread's input item
416
+ //!
417
+ //! @param[out] exclusive_output
418
+ //! Calling thread's output item. May be aliased with `input`
419
+ //!
420
+ //! @param[out] warp_aggregate
421
+ //! Warp-wide aggregate reduction of input items
422
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& exclusive_output, T& warp_aggregate)
423
+ {
424
+ T initial_value{};
425
+ ExclusiveScan(input, exclusive_output, initial_value, ::cuda::std::plus<>{}, warp_aggregate);
426
+ }
427
+
428
+ //! @} end member group
429
+ //! @name Inclusive prefix scans
430
+ //! @{
431
+
432
+ //! @rst
433
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
434
+ //! calling warp.
435
+ //!
436
+ //! * @smemwarpreuse
437
+ //!
438
+ //! Snippet
439
+ //! +++++++
440
+ //!
441
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
442
+ //! within a block of 128 threads (one per each of the 32-thread warps).
443
+ //!
444
+ //! .. code-block:: c++
445
+ //!
446
+ //! #include <cub/cub.cuh>
447
+ //!
448
+ //! __global__ void ExampleKernel(...)
449
+ //! {
450
+ //! // Specialize WarpScan for type int
451
+ //! using WarpScan = cub::WarpScan<int>;
452
+ //!
453
+ //! // Allocate WarpScan shared memory for 4 warps
454
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
455
+ //!
456
+ //! // Obtain one input item per thread
457
+ //! int thread_data = ...
458
+ //!
459
+ //! // Compute inclusive warp-wide prefix max scans
460
+ //! int warp_id = threadIdx.x / 32;
461
+ //! WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
462
+ //!
463
+ //! Suppose the set of input ``thread_data`` across the block of threads is
464
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
465
+ //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
466
+ //! ``32, 32, 34, 34, ..., 62, 62``, etc.
467
+ //! @endrst
468
+ //!
469
+ //! @tparam ScanOp
470
+ //! **[inferred]** Binary scan operator type having member
471
+ //! `T operator()(const T &a, const T &b)`
472
+ //!
473
+ //! @param[in] input
474
+ //! Calling thread's input item
475
+ //!
476
+ //! @param[out] inclusive_output
477
+ //! Calling thread's output item. May be aliased with `input`
478
+ //!
479
+ //! @param[in] scan_op
480
+ //! Binary scan operator
481
+ template <typename ScanOp>
482
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOp scan_op)
483
+ {
484
+ InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op);
485
+ }
486
+
487
+ //! @rst
488
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
489
+ //! calling warp.
490
+ //!
491
+ //! * @smemwarpreuse
492
+ //!
493
+ //! Snippet
494
+ //! +++++++
495
+ //!
496
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix sum scans
497
+ //! within a block of 128 threads (one per each of the 32-thread warps).
498
+ //!
499
+ //! .. literalinclude:: ../../../cub/test/catch2_test_warp_scan_api.cu
500
+ //! :language: c++
501
+ //! :dedent:
502
+ //! :start-after: example-begin inclusive-warp-scan-init-value
503
+ //! :end-before: example-end inclusive-warp-scan-init-value
504
+ //!
505
+ //! Suppose the set of input ``thread_data`` in the first warp is
506
+ //! ``{0, 1, 2, 3, ..., 31}``, in the second warp is ``{1, 2, 3, 4, ..., 32}`` etc.
507
+ //! The corresponding output ``thread_data`` for a max operation in the first
508
+ //! warp would be ``{3, 3, 3, 3, ..., 31}``, the output for the second warp would be
509
+ //! ``{3, 3, 3, 4, ..., 32}``, etc.
510
+ //! @endrst
511
+ //!
512
+ //! @tparam ScanOp
513
+ //! **[inferred]** Binary scan operator type having member
514
+ //! `T operator()(const T &a, const T &b)`
515
+ //!
516
+ //! @param[in] input
517
+ //! Calling thread's input item
518
+ //!
519
+ //! @param[out] inclusive_output
520
+ //! Calling thread's output item. May be aliased with `input`
521
+ //!
522
+ //! @param[in] initial_value
523
+ //! Initial value to seed the inclusive scan (uniform across warp)
524
+ //!
525
+ //! @param[in] scan_op
526
+ //! Binary scan operator
527
+ template <typename ScanOp>
528
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, T initial_value, ScanOp scan_op)
529
+ {
530
+ InternalWarpScan internal(temp_storage);
531
+
532
+ T exclusive_output;
533
+ internal.InclusiveScan(input, inclusive_output, scan_op);
534
+
535
+ internal.Update(
536
+ input, inclusive_output, exclusive_output, scan_op, initial_value, detail::bool_constant_v<IS_INTEGER>);
537
+ }
538
+
539
+ //! @rst
540
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
541
+ //! calling warp. Also provides every thread with the warp-wide ``warp_aggregate`` of
542
+ //! all inputs.
543
+ //!
544
+ //! * @smemwarpreuse
545
+ //!
546
+ //! Snippet
547
+ //! +++++++
548
+ //!
549
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
550
+ //! within a block of 128 threads (one per each of the 32-thread warps).
551
+ //!
552
+ //! .. code-block:: c++
553
+ //!
554
+ //! #include <cub/cub.cuh>
555
+ //!
556
+ //! __global__ void ExampleKernel(...)
557
+ //! {
558
+ //! // Specialize WarpScan for type int
559
+ //! using WarpScan = cub::WarpScan<int>;
560
+ //!
561
+ //! // Allocate WarpScan shared memory for 4 warps
562
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
563
+ //!
564
+ //! // Obtain one input item per thread
565
+ //! int thread_data = ...
566
+ //!
567
+ //! // Compute inclusive warp-wide prefix max scans
568
+ //! int warp_aggregate;
569
+ //! int warp_id = threadIdx.x / 32;
570
+ //! WarpScan(temp_storage[warp_id]).InclusiveScan(
571
+ //! thread_data, thread_data, cuda::maximum<>{}, warp_aggregate);
572
+ //!
573
+ //! Suppose the set of input ``thread_data`` across the block of threads is
574
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
575
+ //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
576
+ //! ``32, 32, 34, 34, ..., 62, 62``, etc. Furthermore, ``warp_aggregate`` would be assigned
577
+ //! ``30`` for threads in the first warp, ``62`` for threads in the second warp, etc.
578
+ //! @endrst
579
+ //!
580
+ //! @tparam ScanOp
581
+ //! **[inferred]** Binary scan operator type having member
582
+ //! `T operator()(const T &a, const T &b)`
583
+ //!
584
+ //! @param[in] input
585
+ //! Calling thread's input item
586
+ //!
587
+ //! @param[out] inclusive_output
588
+ //! Calling thread's output item. May be aliased with ``input``
589
+ //!
590
+ //! @param[in] scan_op
591
+ //! Binary scan operator
592
+ //!
593
+ //! @param[out] warp_aggregate
594
+ //! Warp-wide aggregate reduction of input items.
595
+ template <typename ScanOp>
596
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOp scan_op, T& warp_aggregate)
597
+ {
598
+ InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate);
599
+ }
600
+
601
+ //! @rst
602
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
603
+ //! calling warp. Also provides every thread with the warp-wide ``warp_aggregate`` of
604
+ //! all inputs.
605
+ //!
606
+ //! * @smemwarpreuse
607
+ //!
608
+ //! Snippet
609
+ //! +++++++
610
+ //!
611
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
612
+ //! within a block of 128 threads (one scan per warp).
613
+ //!
614
+ //! .. literalinclude:: ../../../cub/test/catch2_test_warp_scan_api.cu
615
+ //! :language: c++
616
+ //! :dedent:
617
+ //! :start-after: example-begin inclusive-warp-scan-init-value-aggregate
618
+ //! :end-before: example-end inclusive-warp-scan-init-value-aggregate
619
+ //!
620
+ //! Suppose the set of input ``thread_data`` across the block of threads is
621
+ //! ``{1, 1, 1, 1, ..., 1}``. For initial value equal to 3, the corresponding output
622
+ //! ``thread_data`` for a sum operation in the first warp would be
623
+ //! ``{4, 5, 6, 7, ..., 35}``, the output for the second warp would be
624
+ //! ``{4, 5, 6, 7, ..., 35}``, etc. Furthermore, ``warp_aggregate`` would be assigned
625
+ //! ``32`` for threads in each warp.
626
+ //! @endrst
627
+ //!
628
+ //! @tparam ScanOp
629
+ //! **[inferred]** Binary scan operator type having member
630
+ //! `T operator()(const T &a, const T &b)`
631
+ //!
632
+ //! @param[in] input
633
+ //! Calling thread's input item
634
+ //!
635
+ //! @param[out] inclusive_output
636
+ //! Calling thread's output item. May be aliased with ``input``
637
+ //!
638
+ //! @param[in] initial_value
639
+ //! Initial value to seed the inclusive scan (uniform across warp). It is not taken
640
+ //! into account for warp_aggregate.
641
+ //!
642
+ //! @param[in] scan_op
643
+ //! Binary scan operator
644
+ //!
645
+ //! @param[out] warp_aggregate
646
+ //! Warp-wide aggregate reduction of input items.
647
+ template <typename ScanOp>
648
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
649
+ InclusiveScan(T input, T& inclusive_output, T initial_value, ScanOp scan_op, T& warp_aggregate)
650
+ {
651
+ InternalWarpScan internal(temp_storage);
652
+
653
+ // Perform the inclusive scan operation
654
+ internal.InclusiveScan(input, inclusive_output, scan_op);
655
+
656
+ // Update the inclusive_output and warp_aggregate using the Update function
657
+ T exclusive_output;
658
+ internal.Update(
659
+ input,
660
+ inclusive_output,
661
+ exclusive_output,
662
+ warp_aggregate,
663
+ scan_op,
664
+ initial_value,
665
+ detail::bool_constant_v<IS_INTEGER>);
666
+ }
667
+
668
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document partial inclusive scans
669
+ //! @rst
670
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
671
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
672
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
673
+
674
+ //!
675
+ //! * @smemwarpreuse
676
+ //!
677
+ //! Snippet
678
+ //! +++++++
679
+ //!
680
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
681
+ //! within a block of 128 threads (one per each of the 32-thread warps).
682
+ //!
683
+ //! .. code-block:: c++
684
+ //!
685
+ //! #include <cub/cub.cuh>
686
+ //!
687
+ //! __global__ void ExampleKernel(...)
688
+ //! {
689
+ //! // Specialize WarpScan for type int
690
+ //! using WarpScan = cub::WarpScan<int>;
691
+ //!
692
+ //! // Allocate WarpScan shared memory for 4 warps
693
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
694
+ //!
695
+ //! // Obtain one input item per thread
696
+ //! int thread_data = ...
697
+ //! int warp_id = threadIdx.x / 32;
698
+ //! int block_valid_items = 35;
699
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
700
+ //!
701
+ //! // Compute inclusive warp-wide prefix max scans
702
+ //! WarpScan(temp_storage[warp_id]).InclusiveScanPartial(
703
+ //! thread_data, thread_data, cuda::maximum<>{}, warp_valid_items);
704
+ //!
705
+ //! Suppose the set of input ``thread_data`` across the block of threads is
706
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
707
+ //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
708
+ //! ``32, 32, 34, -35, ..., 62, -63`` and the output in the third and fourth warps would remain unmodified.
709
+ //! @endrst
710
+ //!
711
+ //! @tparam ScanOp
712
+ //! **[inferred]** Binary scan operator type having member
713
+ //! `T operator()(const T &a, const T &b)`
714
+ //!
715
+ //! @param[in] input
716
+ //! Calling thread's input item
717
+ //!
718
+ //! @param[out] inclusive_output
719
+ //! Calling thread's output item. May be aliased with `input`
720
+ //!
721
+ //! @param[in] scan_op
722
+ //! Binary scan operator
723
+ //!
724
+ //! @param[in] valid_items
725
+ //! Number of valid items in warp
726
+ template <typename ScanOp>
727
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScanPartial(T input, T& inclusive_output, ScanOp scan_op, int valid_items)
728
+ {
729
+ InternalWarpScan(temp_storage).InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
730
+ }
731
+
732
+ //! @rst
733
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
734
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
735
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
736
+
737
+ //!
738
+ //! * @smemwarpreuse
739
+ //!
740
+ //! Snippet
741
+ //! +++++++
742
+ //!
743
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix sum scans
744
+ //! within a block of 128 threads (one per each of the 32-thread warps).
745
+ //!
746
+ //! .. literalinclude:: ../../../cub/test/catch2_test_warp_scan_partial_api.cu
747
+ //! :language: c++
748
+ //! :dedent:
749
+ //! :start-after: example-begin inclusive-warp-scan-init-value-partial
750
+ //! :end-before: example-end inclusive-warp-scan-init-value-partial
751
+ //!
752
+ //! Suppose the set of input ``thread_data`` in the first warp is
753
+ //! ``{0, -1, 2, -3, ..., 28, -29, 30, -31}``, in the second warp is ``{1, -2, 3, -4, ..., 29, -30, 31, -32}`` etc.
754
+ //! The corresponding output ``thread_data`` for a max operation in the first
755
+ //! warp would be ``{3, 3, 3, 3, ..., 28, 28, 30, 30}``, the output for the second warp would be
756
+ //! ``{3, 3, 3, 3, ..., 29, 29, 31, -32}``, etc.
757
+ //! @endrst
758
+ //!
759
+ //! @tparam ScanOp
760
+ //! **[inferred]** Binary scan operator type having member
761
+ //! `T operator()(const T &a, const T &b)`
762
+ //!
763
+ //! @param[in] input
764
+ //! Calling thread's input item
765
+ //!
766
+ //! @param[out] inclusive_output
767
+ //! Calling thread's output item. May be aliased with `input`
768
+ //!
769
+ //! @param[in] initial_value
770
+ //! Initial value to seed the inclusive scan (uniform across warp)
771
+ //!
772
+ //! @param[in] scan_op
773
+ //! Binary scan operator
774
+ //!
775
+ //! @param[in] valid_items
776
+ //! Number of valid items in warp
777
+ template <typename ScanOp>
778
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
779
+ InclusiveScanPartial(T input, T& inclusive_output, T initial_value, ScanOp scan_op, int valid_items)
780
+ {
781
+ InternalWarpScan internal(temp_storage);
782
+
783
+ T exclusive_output;
784
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
785
+
786
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, scan_op, valid_items, initial_value);
787
+ }
788
+
789
+ //! @rst
790
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
791
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
792
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
793
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all valid inputs. If there are no valid
794
+ //! inputs, the aggregate is undefined.
795
+
796
+ //!
797
+ //! * @smemwarpreuse
798
+ //!
799
+ //! Snippet
800
+ //! +++++++
801
+ //!
802
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
803
+ //! within a block of 128 threads (one per each of the 32-thread warps).
804
+ //!
805
+ //! .. code-block:: c++
806
+ //!
807
+ //! #include <cub/cub.cuh>
808
+ //!
809
+ //! __global__ void ExampleKernel(...)
810
+ //! {
811
+ //! // Specialize WarpScan for type int
812
+ //! using WarpScan = cub::WarpScan<int>;
813
+ //!
814
+ //! // Allocate WarpScan shared memory for 4 warps
815
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
816
+ //!
817
+ //! // Obtain one input item per thread
818
+ //! int thread_data = ...
819
+ //! int warp_id = threadIdx.x / 32;
820
+ //! int block_valid_items = 35;
821
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
822
+ //!
823
+ //! // Compute inclusive warp-wide prefix max scans
824
+ //! int warp_aggregate;
825
+ //! WarpScan(temp_storage[warp_id]).InclusiveScan(
826
+ //! thread_data, thread_data, cuda::maximum<>{}, warp_valid_items, warp_aggregate);
827
+ //!
828
+ //! Suppose the set of input ``thread_data`` across the block of threads is
829
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
830
+ //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
831
+ //! ``32, 32, 34, -35, ..., 62, -63`` and the output in the third and fourth warps would remain
832
+ //! unmodified. Furthermore, ``warp_aggregate`` would be assigned ``30`` for threads in
833
+ //! the first warp, ``34`` for threads in the second warp, and undefined for the third and
834
+ //! fourth warps.
835
+ //! @endrst
836
+ //!
837
+ //! @tparam ScanOp
838
+ //! **[inferred]** Binary scan operator type having member
839
+ //! `T operator()(const T &a, const T &b)`
840
+ //!
841
+ //! @param[in] input
842
+ //! Calling thread's input item
843
+ //!
844
+ //! @param[out] inclusive_output
845
+ //! Calling thread's output item. May be aliased with ``input``
846
+ //!
847
+ //! @param[in] scan_op
848
+ //! Binary scan operator
849
+ //!
850
+ //! @param[in] valid_items
851
+ //! Number of valid items in warp
852
+ //!
853
+ //! @param[out] warp_aggregate
854
+ //! Warp-wide aggregate reduction of input items.
855
+ template <typename ScanOp>
856
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
857
+ InclusiveScanPartial(T input, T& inclusive_output, ScanOp scan_op, int valid_items, T& warp_aggregate)
858
+ {
859
+ InternalWarpScan(temp_storage).InclusiveScanPartial(input, inclusive_output, scan_op, valid_items, warp_aggregate);
860
+ }
861
+
862
+ //! @rst
863
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
864
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
865
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
866
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all valid inputs. If there are no valid
867
+ //! inputs, the aggregate is undefined.
868
+
869
+ //!
870
+ //! * @smemwarpreuse
871
+ //!
872
+ //! Snippet
873
+ //! +++++++
874
+ //!
875
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
876
+ //! within a block of 128 threads (one scan per warp).
877
+ //!
878
+ //! .. literalinclude:: ../../../cub/test/catch2_test_warp_scan_api.cu
879
+ //! :language: c++
880
+ //! :dedent:
881
+ //! :start-after: example-begin inclusive-warp-scan-init-value-aggregate-partial
882
+ //! :end-before: example-end inclusive-warp-scan-init-value-aggregate-partial
883
+ //!
884
+ //! Suppose the set of input ``thread_data`` in the first warp is
885
+ //! ``{0, 0, 0, 1, ..., 1}``, in the second warp is ``{0, 0, 1, ..., 1}`` etc.
886
+ //! For initial value equal to 3, the corresponding output
887
+ //! ``thread_data`` for a sum operation in the first warp would be
888
+ //! ``{3, 3, 3, 4, ..., 29, 30, 31, 32}``, the output for the second warp would be
889
+ //! ``{3, 3, 4, 5, ..., 30, 31, 32, 1}``, etc. Furthermore, ``warp_aggregate`` would be assigned
890
+ //! ``29`` for threads in the first warp, ``30`` for the threads in the second warp, etc.
891
+ //! @endrst
892
+ //!
893
+ //! @tparam ScanOp
894
+ //! **[inferred]** Binary scan operator type having member
895
+ //! `T operator()(const T &a, const T &b)`
896
+ //!
897
+ //! @param[in] input
898
+ //! Calling thread's input item
899
+ //!
900
+ //! @param[out] inclusive_output
901
+ //! Calling thread's output item. May be aliased with ``input``
902
+ //!
903
+ //! @param[in] initial_value
904
+ //! Initial value to seed the inclusive scan (uniform across warp). It is not taken
905
+ //! into account for warp_aggregate.
906
+ //!
907
+ //! @param[in] scan_op
908
+ //! Binary scan operator
909
+ //!
910
+ //! @param[in] valid_items
911
+ //! Number of valid items in warp
912
+ //!
913
+ //! @param[out] warp_aggregate
914
+ //! Warp-wide aggregate reduction of input items.
915
+ template <typename ScanOp>
916
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScanPartial(
917
+ T input, T& inclusive_output, T initial_value, ScanOp scan_op, int valid_items, T& warp_aggregate)
918
+ {
919
+ InternalWarpScan internal(temp_storage);
920
+
921
+ // Perform the inclusive scan operation
922
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
923
+
924
+ // Update the inclusive_output and warp_aggregate using the Update function
925
+ T exclusive_output;
926
+ internal.UpdatePartial(
927
+ input, inclusive_output, exclusive_output, warp_aggregate, scan_op, valid_items, initial_value);
928
+ }
929
+
930
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document partial inclusive scans
931
+
932
+ //! @} end member group
933
+ //! @name Exclusive prefix scans
934
+ //! @{
935
+
936
+ //! @rst
937
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
938
+ //! calling warp. Because no initial value is supplied, the ``output`` computed for
939
+ //! *lane*\ :sub:`0` is undefined.
940
+ //!
941
+ //! * @smemwarpreuse
942
+ //!
943
+ //! Snippet
944
+ //! +++++++
945
+ //!
946
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
947
+ //! within a block of 128 threads (one per each of the 32-thread warps).
948
+ //!
949
+ //! .. code-block:: c++
950
+ //!
951
+ //! #include <cub/cub.cuh>
952
+ //!
953
+ //! __global__ void ExampleKernel(...)
954
+ //! {
955
+ //! // Specialize WarpScan for type int
956
+ //! using WarpScan = cub::WarpScan<int>;
957
+ //!
958
+ //! // Allocate WarpScan shared memory for 4 warps
959
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
960
+ //!
961
+ //! // Obtain one input item per thread
962
+ //! int thread_data = ...
963
+ //!
964
+ //! // Compute exclusive warp-wide prefix max scans
965
+ //! int warp_id = threadIdx.x / 32;
966
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cuda::maximum<>{});
967
+ //!
968
+ //! Suppose the set of input ``thread_data`` across the block of threads is
969
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
970
+ //! warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
971
+ //! ``?, 32, 32, 34, ..., 60, 62``, etc.
972
+ //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined.)
973
+ //! @endrst
974
+ //!
975
+ //! @tparam ScanOp
976
+ //! **[inferred]** Binary scan operator type having member
977
+ //! `T operator()(const T &a, const T &b)`
978
+ //!
979
+ //! @param[in] input
980
+ //! Calling thread's input item
981
+ //!
982
+ //! @param[out] exclusive_output
983
+ //! Calling thread's output item. May be aliased with `input`
984
+ //!
985
+ //! @param[in] scan_op
986
+ //! Binary scan operator
987
+ template <typename ScanOp>
988
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, ScanOp scan_op)
989
+ {
990
+ InternalWarpScan internal(temp_storage);
991
+
992
+ T inclusive_output;
993
+ internal.InclusiveScan(input, inclusive_output, scan_op);
994
+
995
+ internal.Update(input, inclusive_output, exclusive_output, scan_op, detail::bool_constant_v<IS_INTEGER>);
996
+ }
997
+
998
+ //! @rst
999
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1000
+ //! calling warp.
1001
+ //!
1002
+ //! * @smemwarpreuse
1003
+ //!
1004
+ //! Snippet
1005
+ //! +++++++
1006
+ //!
1007
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1008
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1009
+ //!
1010
+ //! .. code-block:: c++
1011
+ //!
1012
+ //! #include <cub/cub.cuh>
1013
+ //!
1014
+ //! __global__ void ExampleKernel(...)
1015
+ //! {
1016
+ //! // Specialize WarpScan for type int
1017
+ //! using WarpScan = cub::WarpScan<int>;
1018
+ //!
1019
+ //! // Allocate WarpScan shared memory for 4 warps
1020
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1021
+ //!
1022
+ //! // Obtain one input item per thread
1023
+ //! int thread_data = ...
1024
+ //!
1025
+ //! // Compute exclusive warp-wide prefix max scans
1026
+ //! int warp_id = threadIdx.x / 32;
1027
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data,
1028
+ //! thread_data,
1029
+ //! INT_MIN,
1030
+ //! cuda::maximum<>{});
1031
+ //!
1032
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1033
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1034
+ //! warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1035
+ //! ``30, 32, 32, 34, ..., 60, 62``, etc.
1036
+ //! @endrst
1037
+ //!
1038
+ //! @tparam ScanOp
1039
+ //! **[inferred]** Binary scan operator type having member
1040
+ //! `T operator()(const T &a, const T &b)`
1041
+ //!
1042
+ //! @param[in] input
1043
+ //! Calling thread's input item
1044
+ //!
1045
+ //! @param[out] exclusive_output
1046
+ //! Calling thread's output item. May be aliased with `input`
1047
+ //!
1048
+ //! @param[in] initial_value
1049
+ //! Initial value to seed the exclusive scan
1050
+ //!
1051
+ //! @param[in] scan_op
1052
+ //! Binary scan operator
1053
+ template <typename ScanOp>
1054
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, T initial_value, ScanOp scan_op)
1055
+ {
1056
+ InternalWarpScan internal(temp_storage);
1057
+
1058
+ T inclusive_output;
1059
+ internal.InclusiveScan(input, inclusive_output, scan_op);
1060
+
1061
+ internal.Update(
1062
+ input, inclusive_output, exclusive_output, scan_op, initial_value, detail::bool_constant_v<IS_INTEGER>);
1063
+ }
1064
+
1065
+ //! @rst
1066
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1067
+ //! calling warp. Because no initial value is supplied, the ``output`` computed for
1068
+ //! *lane*\ :sub:`0` is undefined. Also provides every thread with the warp-wide
1069
+ //! ``warp_aggregate`` of all inputs.
1070
+ //!
1071
+ //! * @smemwarpreuse
1072
+ //!
1073
+ //! Snippet
1074
+ //! +++++++
1075
+ //!
1076
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1077
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1078
+ //!
1079
+ //! .. code-block:: c++
1080
+ //!
1081
+ //! #include <cub/cub.cuh>
1082
+ //!
1083
+ //! __global__ void ExampleKernel(...)
1084
+ //! {
1085
+ //! // Specialize WarpScan for type int
1086
+ //! using WarpScan = cub::WarpScan<int>;
1087
+ //!
1088
+ //! // Allocate WarpScan shared memory for 4 warps
1089
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1090
+ //!
1091
+ //! // Obtain one input item per thread
1092
+ //! int thread_data = ...
1093
+ //!
1094
+ //! // Compute exclusive warp-wide prefix max scans
1095
+ //! int warp_aggregate;
1096
+ //! int warp_id = threadIdx.x / 32;
1097
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data,
1098
+ //! thread_data,
1099
+ //! cuda::maximum<>{},
1100
+ //! warp_aggregate);
1101
+ //!
1102
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1103
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1104
+ //! warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1105
+ //! ``?, 32, 32, 34, ..., 60, 62``, etc. (The output ``thread_data`` in warp *lane*\ :sub:`0`
1106
+ //! is undefined). Furthermore, ``warp_aggregate`` would be assigned ``30`` for threads in the
1107
+ //! first warp, \p 62 for threads in the second warp, etc.
1108
+ //! @endrst
1109
+ //!
1110
+ //! @tparam ScanOp
1111
+ //! **[inferred]** Binary scan operator type having member
1112
+ //! `T operator()(const T &a, const T &b)`
1113
+ //!
1114
+ //! @param[in] input
1115
+ //! Calling thread's input item
1116
+ //!
1117
+ //! @param[out] exclusive_output
1118
+ //! Calling thread's output item. May be aliased with `input`
1119
+ //!
1120
+ //! @param[in] scan_op
1121
+ //! Binary scan operator
1122
+ //!
1123
+ //! @param[out] warp_aggregate
1124
+ //! Warp-wide aggregate reduction of input items
1125
+ template <typename ScanOp>
1126
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, ScanOp scan_op, T& warp_aggregate)
1127
+ {
1128
+ InternalWarpScan internal(temp_storage);
1129
+
1130
+ T inclusive_output;
1131
+ internal.InclusiveScan(input, inclusive_output, scan_op);
1132
+
1133
+ internal.Update(
1134
+ input, inclusive_output, exclusive_output, warp_aggregate, scan_op, detail::bool_constant_v<IS_INTEGER>);
1135
+ }
1136
+
1137
+ //! @rst
1138
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1139
+ //! calling warp. Also provides every thread with the warp-wide ``warp_aggregate`` of
1140
+ //! all inputs.
1141
+ //!
1142
+ //! * @smemwarpreuse
1143
+ //!
1144
+ //! Snippet
1145
+ //! +++++++
1146
+ //!
1147
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1148
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1149
+ //!
1150
+ //! .. code-block:: c++
1151
+ //!
1152
+ //! #include <cub/cub.cuh>
1153
+ //!
1154
+ //! __global__ void ExampleKernel(...)
1155
+ //! {
1156
+ //! // Specialize WarpScan for type int
1157
+ //! using WarpScan = cub::WarpScan<int>;
1158
+ //!
1159
+ //! // Allocate WarpScan shared memory for 4 warps
1160
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1161
+ //!
1162
+ //! // Obtain one input item per thread
1163
+ //! int thread_data = ...
1164
+ //!
1165
+ //! // Compute exclusive warp-wide prefix max scans
1166
+ //! int warp_aggregate;
1167
+ //! int warp_id = threadIdx.x / 32;
1168
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data,
1169
+ //! thread_data,
1170
+ //! INT_MIN,
1171
+ //! cuda::maximum<>{},
1172
+ //! warp_aggregate);
1173
+ //!
1174
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1175
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1176
+ //! warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1177
+ //! ``INT_MIN, 32, 32, 34, ..., 60, 62``, etc. Furthermore, ``warp_aggregate`` would be assigned
1178
+ //! ``30`` for threads in the first warp, ``62`` for threads in the second warp, etc.
1179
+ //! @endrst
1180
+ //!
1181
+ //! @tparam ScanOp
1182
+ //! **[inferred]** Binary scan operator type having member
1183
+ //! `T operator()(const T &a, const T &b)`
1184
+ //!
1185
+ //! @param[in] input
1186
+ //! Calling thread's input item
1187
+ //!
1188
+ //! @param[out] exclusive_output
1189
+ //! Calling thread's output item. May be aliased with `input`
1190
+ //!
1191
+ //! @param[in] initial_value
1192
+ //! Initial value to seed the exclusive scan
1193
+ //!
1194
+ //! @param[in] scan_op
1195
+ //! Binary scan operator
1196
+ //!
1197
+ //! @param[out] warp_aggregate
1198
+ //! Warp-wide aggregate reduction of input items
1199
+ //!
1200
+ template <typename ScanOp>
1201
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1202
+ ExclusiveScan(T input, T& exclusive_output, T initial_value, ScanOp scan_op, T& warp_aggregate)
1203
+ {
1204
+ InternalWarpScan internal(temp_storage);
1205
+
1206
+ T inclusive_output;
1207
+ internal.InclusiveScan(input, inclusive_output, scan_op);
1208
+
1209
+ internal.Update(
1210
+ input,
1211
+ inclusive_output,
1212
+ exclusive_output,
1213
+ warp_aggregate,
1214
+ scan_op,
1215
+ initial_value,
1216
+ detail::bool_constant_v<IS_INTEGER>);
1217
+ }
1218
+
1219
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document partial exclusive scans
1220
+ //! @rst
1221
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1222
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1223
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1224
+ //! Because no initial value is supplied, the ``output`` computed for
1225
+ //! *lane*\ :sub:`0` is undefined.
1226
+ //!
1227
+ //! * @smemwarpreuse
1228
+ //!
1229
+ //! Snippet
1230
+ //! +++++++
1231
+ //!
1232
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1233
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1234
+ //!
1235
+ //! .. code-block:: c++
1236
+ //!
1237
+ //! #include <cub/cub.cuh>
1238
+ //!
1239
+ //! __global__ void ExampleKernel(...)
1240
+ //! {
1241
+ //! // Specialize WarpScan for type int
1242
+ //! using WarpScan = cub::WarpScan<int>;
1243
+ //!
1244
+ //! // Allocate WarpScan shared memory for 4 warps
1245
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1246
+ //!
1247
+ //! // Obtain one input item per thread
1248
+ //! int thread_data = ...
1249
+ //! int warp_id = threadIdx.x / 32;
1250
+ //! int block_valid_items = 35;
1251
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1252
+ //!
1253
+ //! // Compute exclusive warp-wide prefix max scans
1254
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScanPartial(
1255
+ //! thread_data, thread_data, cuda::maximum<>{}, warp_valid_items);
1256
+ //!
1257
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1258
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1259
+ //! warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1260
+ //! ``?, 32, 32, -35, ..., 62, -63`` and the output in the third and fourth warps would remain unmodified.
1261
+ //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined.)
1262
+ //! @endrst
1263
+ //!
1264
+ //! @tparam ScanOp
1265
+ //! **[inferred]** Binary scan operator type having member
1266
+ //! `T operator()(const T &a, const T &b)`
1267
+ //!
1268
+ //! @param[in] input
1269
+ //! Calling thread's input item
1270
+ //!
1271
+ //! @param[out] exclusive_output
1272
+ //! Calling thread's output item. May be aliased with `input`
1273
+ //!
1274
+ //! @param[in] scan_op
1275
+ //! Binary scan operator
1276
+ //!
1277
+ //! @param[in] valid_items
1278
+ //! Number of valid items in warp
1279
+ template <typename ScanOp>
1280
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScanPartial(T input, T& exclusive_output, ScanOp scan_op, int valid_items)
1281
+ {
1282
+ InternalWarpScan internal(temp_storage);
1283
+
1284
+ T inclusive_output;
1285
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1286
+
1287
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, scan_op, valid_items);
1288
+ }
1289
+
1290
+ //! @rst
1291
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1292
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1293
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1294
+
1295
+ //!
1296
+ //! * @smemwarpreuse
1297
+ //!
1298
+ //! Snippet
1299
+ //! +++++++
1300
+ //!
1301
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1302
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1303
+ //!
1304
+ //! .. code-block:: c++
1305
+ //!
1306
+ //! #include <cub/cub.cuh>
1307
+ //!
1308
+ //! __global__ void ExampleKernel(...)
1309
+ //! {
1310
+ //! // Specialize WarpScan for type int
1311
+ //! using WarpScan = cub::WarpScan<int>;
1312
+ //!
1313
+ //! // Allocate WarpScan shared memory for 4 warps
1314
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1315
+ //!
1316
+ //! // Obtain one input item per thread
1317
+ //! int thread_data = ...
1318
+ //! int warp_id = threadIdx.x / 32;
1319
+ //! int block_valid_items = 35;
1320
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1321
+ //!
1322
+ //! // Compute exclusive warp-wide prefix max scans
1323
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScanPartial(
1324
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, warp_valid_items);
1325
+ //!
1326
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1327
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1328
+ //! warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1329
+ //! ``30, 32, 32, -35, ..., 62, -63`` and the output in the third and fourth warps would remain unmodified.
1330
+ //! @endrst
1331
+ //!
1332
+ //! @tparam ScanOp
1333
+ //! **[inferred]** Binary scan operator type having member
1334
+ //! `T operator()(const T &a, const T &b)`
1335
+ //!
1336
+ //! @param[in] input
1337
+ //! Calling thread's input item
1338
+ //!
1339
+ //! @param[out] exclusive_output
1340
+ //! Calling thread's output item. May be aliased with `input`
1341
+ //!
1342
+ //! @param[in] initial_value
1343
+ //! Initial value to seed the exclusive scan
1344
+ //!
1345
+ //! @param[in] scan_op
1346
+ //! Binary scan operator
1347
+ //!
1348
+ //! @param[in] valid_items
1349
+ //! Number of valid items in warp
1350
+ template <typename ScanOp>
1351
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1352
+ ExclusiveScanPartial(T input, T& exclusive_output, T initial_value, ScanOp scan_op, int valid_items)
1353
+ {
1354
+ InternalWarpScan internal(temp_storage);
1355
+
1356
+ T inclusive_output;
1357
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1358
+
1359
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, scan_op, valid_items, initial_value);
1360
+ }
1361
+
1362
+ //! @rst
1363
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1364
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1365
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1366
+ //! Because no initial value is supplied, the ``output`` computed for *lane*\ :sub:`0` is undefined.
1367
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all valid inputs. If there are no valid
1368
+ //! inputs, the aggregate is undefined.
1369
+ //!
1370
+ //! * @smemwarpreuse
1371
+ //!
1372
+ //! Snippet
1373
+ //! +++++++
1374
+ //!
1375
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1376
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1377
+ //!
1378
+ //! .. code-block:: c++
1379
+ //!
1380
+ //! #include <cub/cub.cuh>
1381
+ //!
1382
+ //! __global__ void ExampleKernel(...)
1383
+ //! {
1384
+ //! // Specialize WarpScan for type int
1385
+ //! using WarpScan = cub::WarpScan<int>;
1386
+ //!
1387
+ //! // Allocate WarpScan shared memory for 4 warps
1388
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1389
+ //!
1390
+ //! // Obtain one input item per thread
1391
+ //! int thread_data = ...
1392
+ //! int warp_id = threadIdx.x / 32;
1393
+ //! int block_valid_items = 35;
1394
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1395
+ //!
1396
+ //! // Compute exclusive warp-wide prefix max scans
1397
+ //! int warp_aggregate;
1398
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScanPartial(
1399
+ //! thread_data, thread_data, cuda::maximum<>{}, warp_valid_items, warp_aggregate);
1400
+ //!
1401
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1402
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1403
+ //! warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1404
+ //! ``?, 32, 32, -35, ..., 62, -63``, and the output in the third and fourth warps would remain unmodified
1405
+ //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined). Furthermore, ``warp_aggregate``
1406
+ //! would be assigned ``30`` for threads in the first warp, ``34`` for threads in the second warp and
1407
+ //! undefined for the third and fourth warps.
1408
+ //! @endrst
1409
+ //!
1410
+ //! @tparam ScanOp
1411
+ //! **[inferred]** Binary scan operator type having member
1412
+ //! `T operator()(const T &a, const T &b)`
1413
+ //!
1414
+ //! @param[in] input
1415
+ //! Calling thread's input item
1416
+ //!
1417
+ //! @param[out] exclusive_output
1418
+ //! Calling thread's output item. May be aliased with `input`
1419
+ //!
1420
+ //! @param[in] scan_op
1421
+ //! Binary scan operator
1422
+ //!
1423
+ //! @param[in] valid_items
1424
+ //! Number of valid items in warp
1425
+ //!
1426
+ //! @param[out] warp_aggregate
1427
+ //! Warp-wide aggregate reduction of input items
1428
+ template <typename ScanOp>
1429
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1430
+ ExclusiveScanPartial(T input, T& exclusive_output, ScanOp scan_op, int valid_items, T& warp_aggregate)
1431
+ {
1432
+ InternalWarpScan internal(temp_storage);
1433
+
1434
+ T inclusive_output;
1435
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1436
+
1437
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, warp_aggregate, scan_op, valid_items);
1438
+ }
1439
+
1440
+ //! @rst
1441
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1442
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1443
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1444
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all valid inputs. If there are no valid
1445
+ //! inputs, the aggregate is undefined.
1446
+
1447
+ //!
1448
+ //! * @smemwarpreuse
1449
+ //!
1450
+ //! Snippet
1451
+ //! +++++++
1452
+ //!
1453
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1454
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1455
+ //!
1456
+ //! .. code-block:: c++
1457
+ //!
1458
+ //! #include <cub/cub.cuh>
1459
+ //!
1460
+ //! __global__ void ExampleKernel(...)
1461
+ //! {
1462
+ //! // Specialize WarpScan for type int
1463
+ //! using WarpScan = cub::WarpScan<int>;
1464
+ //!
1465
+ //! // Allocate WarpScan shared memory for 4 warps
1466
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1467
+ //!
1468
+ //! // Obtain one input item per thread
1469
+ //! int thread_data = ...
1470
+ //! int warp_id = threadIdx.x / 32;
1471
+ //! int block_valid_items = 35;
1472
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1473
+ //!
1474
+ //! // Compute exclusive warp-wide prefix max scans
1475
+ //! int warp_aggregate;
1476
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScanPartial(
1477
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, warp_valid_items, warp_aggregate);
1478
+ //!
1479
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1480
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1481
+ //! warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1482
+ //! ``INT_MIN, 32, 32, -35, ..., 62, -63``, and the output in the third and fourth warps would
1483
+ //! remain unmodified. Furthermore, ``warp_aggregate`` would be assigned
1484
+ //! ``30`` for threads in the first warp, ``34`` for threads in the second warp and undefined
1485
+ //! for the third and fourth warps.
1486
+ //! @endrst
1487
+ //!
1488
+ //! @tparam ScanOp
1489
+ //! **[inferred]** Binary scan operator type having member
1490
+ //! `T operator()(const T &a, const T &b)`
1491
+ //!
1492
+ //! @param[in] input
1493
+ //! Calling thread's input item
1494
+ //!
1495
+ //! @param[out] exclusive_output
1496
+ //! Calling thread's output item. May be aliased with `input`
1497
+ //!
1498
+ //! @param[in] initial_value
1499
+ //! Initial value to seed the exclusive scan
1500
+ //!
1501
+ //! @param[in] scan_op
1502
+ //! Binary scan operator
1503
+ //!
1504
+ //! @param[in] valid_items
1505
+ //! Number of valid items in warp
1506
+ //!
1507
+ //! @param[out] warp_aggregate
1508
+ //! Warp-wide aggregate reduction of input items
1509
+ //!
1510
+ template <typename ScanOp>
1511
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScanPartial(
1512
+ T input, T& exclusive_output, T initial_value, ScanOp scan_op, int valid_items, T& warp_aggregate)
1513
+ {
1514
+ InternalWarpScan internal(temp_storage);
1515
+
1516
+ T inclusive_output;
1517
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1518
+
1519
+ internal.UpdatePartial(
1520
+ input, inclusive_output, exclusive_output, warp_aggregate, scan_op, valid_items, initial_value);
1521
+ }
1522
+
1523
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document partial exclusive scans
1524
+
1525
+ //! @} end member group
1526
+ //! @name Combination (inclusive & exclusive) prefix scans
1527
+ //! @{
1528
+
1529
+ //! @rst
1530
+ //! Computes both inclusive and exclusive prefix scans using the specified binary scan functor
1531
+ //! across the calling warp. Because no initial value is supplied, the ``exclusive_output``
1532
+ //! computed for *lane*\ :sub:`0` is undefined.
1533
+ //!
1534
+ //! * @smemwarpreuse
1535
+ //!
1536
+ //! Snippet
1537
+ //! +++++++
1538
+ //!
1539
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1540
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1541
+ //!
1542
+ //! .. code-block:: c++
1543
+ //!
1544
+ //! #include <cub/cub.cuh>
1545
+ //!
1546
+ //! __global__ void ExampleKernel(...)
1547
+ //! {
1548
+ //! // Specialize WarpScan for type int
1549
+ //! using WarpScan = cub::WarpScan<int>;
1550
+ //!
1551
+ //! // Allocate WarpScan shared memory for 4 warps
1552
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1553
+ //!
1554
+ //! // Obtain one input item per thread
1555
+ //! int thread_data = ...
1556
+ //!
1557
+ //! // Compute exclusive warp-wide prefix max scans
1558
+ //! int inclusive_partial, exclusive_partial;
1559
+ //! WarpScan(temp_storage[warp_id]).Scan(thread_data,
1560
+ //! inclusive_partial,
1561
+ //! exclusive_partial,
1562
+ //! cuda::maximum<>{});
1563
+ //!
1564
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1565
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``inclusive_partial`` in the
1566
+ //! first warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
1567
+ //! ``32, 32, 34, 34, ..., 62, 62``, etc. The corresponding output ``exclusive_partial`` in the
1568
+ //! first warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1569
+ //! ``?, 32, 32, 34, ..., 60, 62``, etc.
1570
+ //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined.)
1571
+ //! @endrst
1572
+ //!
1573
+ //! @tparam ScanOp
1574
+ //! **[inferred]** Binary scan operator type having member
1575
+ //! `T operator()(const T &a, const T &b)`
1576
+ //!
1577
+ //! @param[in] input
1578
+ //! Calling thread's input item
1579
+ //!
1580
+ //! @param[out] inclusive_output
1581
+ //! Calling thread's inclusive-scan output item
1582
+ //!
1583
+ //! @param[out] exclusive_output
1584
+ //! Calling thread's exclusive-scan output item
1585
+ //!
1586
+ //! @param[in] scan_op
1587
+ //! Binary scan operator
1588
+ template <typename ScanOp>
1589
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Scan(T input, T& inclusive_output, T& exclusive_output, ScanOp scan_op)
1590
+ {
1591
+ InternalWarpScan internal(temp_storage);
1592
+
1593
+ internal.InclusiveScan(input, inclusive_output, scan_op);
1594
+
1595
+ internal.Update(input, inclusive_output, exclusive_output, scan_op, detail::bool_constant_v<IS_INTEGER>);
1596
+ }
1597
+
1598
+ //! @rst
1599
+ //! Computes both inclusive and exclusive prefix scans using the specified binary scan functor
1600
+ //! across the calling warp.
1601
+ //!
1602
+ //! * @smemwarpreuse
1603
+ //!
1604
+ //! Snippet
1605
+ //! +++++++
1606
+ //!
1607
+ //! The code snippet below illustrates four concurrent warp-wide prefix max scans within a
1608
+ //! block of 128 threads (one per each of the 32-thread warps).
1609
+ //!
1610
+ //! .. code-block:: c++
1611
+ //!
1612
+ //! #include <cub/cub.cuh>
1613
+ //!
1614
+ //! __global__ void ExampleKernel(...)
1615
+ //! {
1616
+ //! // Specialize WarpScan for type int
1617
+ //! using WarpScan = cub::WarpScan<int>;
1618
+ //!
1619
+ //! // Allocate WarpScan shared memory for 4 warps
1620
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1621
+ //!
1622
+ //! // Obtain one input item per thread
1623
+ //! int thread_data = ...
1624
+ //!
1625
+ //! // Compute inclusive warp-wide prefix max scans
1626
+ //! int warp_id = threadIdx.x / 32;
1627
+ //! int inclusive_partial, exclusive_partial;
1628
+ //! WarpScan(temp_storage[warp_id]).Scan(thread_data,
1629
+ //! inclusive_partial,
1630
+ //! exclusive_partial,
1631
+ //! INT_MIN,
1632
+ //! cuda::maximum<>{});
1633
+ //!
1634
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1635
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``inclusive_partial`` in the
1636
+ //! first warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
1637
+ //! ``32, 32, 34, 34, ..., 62, 62``, etc. The corresponding output ``exclusive_partial`` in the
1638
+ //! first warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would
1639
+ //! be ``INT_MIN, 32, 32, 34, ..., 60, 62``, etc.
1640
+ //! @endrst
1641
+ //!
1642
+ //! @tparam ScanOp
1643
+ //! **[inferred]** Binary scan operator type having member
1644
+ //! `T operator()(const T &a, const T &b)`
1645
+ //!
1646
+ //! @param[in] input
1647
+ //! Calling thread's input item
1648
+ //!
1649
+ //! @param[out] inclusive_output
1650
+ //! Calling thread's inclusive-scan output item
1651
+ //!
1652
+ //! @param[out] exclusive_output
1653
+ //! Calling thread's exclusive-scan output item
1654
+ //!
1655
+ //! @param[in] initial_value
1656
+ //! Initial value to seed the exclusive scan
1657
+ //!
1658
+ //! @param[in] scan_op
1659
+ //! Binary scan operator
1660
+ template <typename ScanOp>
1661
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1662
+ Scan(T input, T& inclusive_output, T& exclusive_output, T initial_value, ScanOp scan_op)
1663
+ {
1664
+ InternalWarpScan internal(temp_storage);
1665
+
1666
+ internal.InclusiveScan(input, inclusive_output, scan_op);
1667
+
1668
+ internal.Update(
1669
+ input, inclusive_output, exclusive_output, scan_op, initial_value, detail::bool_constant_v<IS_INTEGER>);
1670
+ }
1671
+
1672
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document partial combined scans
1673
+ //! @rst
1674
+ //! Computes both inclusive and exclusive prefix scans using the specified binary scan functor
1675
+ //! across the calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1676
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1677
+ //! Because no initial value is supplied, the ``exclusive_output``
1678
+ //! computed for *lane*\ :sub:`0` is undefined.
1679
+ //!
1680
+ //! * @smemwarpreuse
1681
+ //!
1682
+ //! Snippet
1683
+ //! +++++++
1684
+ //!
1685
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1686
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1687
+ //!
1688
+ //! .. code-block:: c++
1689
+ //!
1690
+ //! #include <cub/cub.cuh>
1691
+ //!
1692
+ //! __global__ void ExampleKernel(...)
1693
+ //! {
1694
+ //! // Specialize WarpScan for type int
1695
+ //! using WarpScan = cub::WarpScan<int>;
1696
+ //!
1697
+ //! // Allocate WarpScan shared memory for 4 warps
1698
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1699
+ //!
1700
+ //! // Obtain one input item per thread
1701
+ //! int thread_data = ...
1702
+ //! int warp_id = threadIdx.x / 32;
1703
+ //! int block_valid_items = 35;
1704
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1705
+ //!
1706
+ //! // Compute exclusive warp-wide prefix max scans
1707
+ //! int inclusive_partial, exclusive_partial;
1708
+ //! WarpScan(temp_storage[warp_id]).ScanPartial(
1709
+ //! thread_data, inclusive_partial, exclusive_partial, cuda::maximum<>{}, warp_valid_items);
1710
+ //!
1711
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1712
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``inclusive_partial`` in the
1713
+ //! first warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
1714
+ //! ``32, 32, 34, -35, ..., 62, -63``. The corresponding output ``exclusive_partial`` in the
1715
+ //! first warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1716
+ //! ``?, 32, 32, -35, ..., 62, -63``. The third and fourth warps ``inclusive_partial`` and
1717
+ //! ``exclusive_parttial`` would remain unmodified.
1718
+ //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined.)
1719
+ //! @endrst
1720
+ //!
1721
+ //! @tparam ScanOp
1722
+ //! **[inferred]** Binary scan operator type having member
1723
+ //! `T operator()(const T &a, const T &b)`
1724
+ //!
1725
+ //! @param[in] input
1726
+ //! Calling thread's input item
1727
+ //!
1728
+ //! @param[out] inclusive_output
1729
+ //! Calling thread's inclusive-scan output item
1730
+ //!
1731
+ //! @param[out] exclusive_output
1732
+ //! Calling thread's exclusive-scan output item
1733
+ //!
1734
+ //! @param[in] scan_op
1735
+ //! Binary scan operator
1736
+ //!
1737
+ //! @param[in] valid_items
1738
+ //! Number of valid items in warp
1739
+ template <typename ScanOp>
1740
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1741
+ ScanPartial(T input, T& inclusive_output, T& exclusive_output, ScanOp scan_op, int valid_items)
1742
+ {
1743
+ InternalWarpScan internal(temp_storage);
1744
+
1745
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1746
+
1747
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, scan_op, valid_items);
1748
+ }
1749
+
1750
+ //! @rst
1751
+ //! Computes both inclusive and exclusive prefix scans using the specified binary scan functor
1752
+ //! across the calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1753
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1754
+
1755
+ //!
1756
+ //! * @smemwarpreuse
1757
+ //!
1758
+ //! Snippet
1759
+ //! +++++++
1760
+ //!
1761
+ //! The code snippet below illustrates four concurrent warp-wide prefix max scans within a
1762
+ //! block of 128 threads (one per each of the 32-thread warps).
1763
+ //!
1764
+ //! .. code-block:: c++
1765
+ //!
1766
+ //! #include <cub/cub.cuh>
1767
+ //!
1768
+ //! __global__ void ExampleKernel(...)
1769
+ //! {
1770
+ //! // Specialize WarpScan for type int
1771
+ //! using WarpScan = cub::WarpScan<int>;
1772
+ //!
1773
+ //! // Allocate WarpScan shared memory for 4 warps
1774
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1775
+ //!
1776
+ //! // Obtain one input item per thread
1777
+ //! int thread_data = ...
1778
+ //! int warp_id = threadIdx.x / 32;
1779
+ //! int block_valid_items = 35;
1780
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1781
+ //!
1782
+ //! // Compute inclusive warp-wide prefix max scans
1783
+ //! int inclusive_partial, exclusive_partial;
1784
+ //! WarpScan(temp_storage[warp_id]).ScanPartial(
1785
+ //! thread_data, inclusive_partial, exclusive_partial, INT_MIN, cuda::maximum<>{}, warp_valid_items);
1786
+ //!
1787
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1788
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``inclusive_partial`` in the
1789
+ //! first warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
1790
+ //! ``32, 32, 34, -35, ..., 62, -63``. The corresponding output ``exclusive_partial`` in the
1791
+ //! first warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would
1792
+ //! be ``INT_MIN, 32, 32, -35, ..., 62, -63``. The third and fourth warps ``inclusive_partial`` and
1793
+ //! ``exclusive_parttial`` would remain unmodified.
1794
+
1795
+ //! @endrst
1796
+ //!
1797
+ //! @tparam ScanOp
1798
+ //! **[inferred]** Binary scan operator type having member
1799
+ //! `T operator()(const T &a, const T &b)`
1800
+ //!
1801
+ //! @param[in] input
1802
+ //! Calling thread's input item
1803
+ //!
1804
+ //! @param[out] inclusive_output
1805
+ //! Calling thread's inclusive-scan output item
1806
+ //!
1807
+ //! @param[out] exclusive_output
1808
+ //! Calling thread's exclusive-scan output item
1809
+ //!
1810
+ //! @param[in] initial_value
1811
+ //! Initial value to seed the exclusive scan
1812
+ //!
1813
+ //! @param[in] scan_op
1814
+ //! Binary scan operator
1815
+ //!
1816
+ //! @param[in] valid_items
1817
+ //! Number of valid items in warp
1818
+ template <typename ScanOp>
1819
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1820
+ ScanPartial(T input, T& inclusive_output, T& exclusive_output, T initial_value, ScanOp scan_op, int valid_items)
1821
+ {
1822
+ InternalWarpScan internal(temp_storage);
1823
+
1824
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1825
+
1826
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, scan_op, valid_items, initial_value);
1827
+ }
1828
+
1829
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document partial combined scans
1830
+
1831
+ //! @} end member group
1832
+ //! @name Data exchange
1833
+ //! @{
1834
+
1835
+ //! @rst
1836
+ //! Broadcast the value ``input`` from *lane*\ :sub:`src_lane` to all lanes in the warp
1837
+ //!
1838
+ //! * @smemwarpreuse
1839
+ //!
1840
+ //! Snippet
1841
+ //! +++++++
1842
+ //!
1843
+ //! The code snippet below illustrates the warp-wide broadcasts of values from *lane*\ :sub:`0`
1844
+ //! in each of four warps to all other threads in those warps.
1845
+ //!
1846
+ //! .. code-block:: c++
1847
+ //!
1848
+ //! #include <cub/cub.cuh>
1849
+ //!
1850
+ //! __global__ void ExampleKernel(...)
1851
+ //! {
1852
+ //! // Specialize WarpScan for type int
1853
+ //! using WarpScan = cub::WarpScan<int>;
1854
+ //!
1855
+ //! // Allocate WarpScan shared memory for 4 warps
1856
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1857
+ //!
1858
+ //! // Obtain one input item per thread
1859
+ //! int thread_data = ...
1860
+ //!
1861
+ //! // Broadcast from lane0 in each warp to all other threads in the warp
1862
+ //! int warp_id = threadIdx.x / 32;
1863
+ //! thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
1864
+ //!
1865
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1866
+ //! ``{0, 1, 2, 3, ..., 127}``. The corresponding output ``thread_data`` will be
1867
+ //! ``{0, 0, ..., 0}`` in warp\ :sub:`0`,
1868
+ //! ``{32, 32, ..., 32}`` in warp\ :sub:`1`,
1869
+ //! ``{64, 64, ..., 64}`` in warp\ :sub:`2`, etc.
1870
+ //! @endrst
1871
+ //!
1872
+ //! @param[in] input
1873
+ //! The value to broadcast
1874
+ //!
1875
+ //! @param[in] src_lane
1876
+ //! Which warp lane is to do the broadcasting
1877
+ _CCCL_DEVICE _CCCL_FORCEINLINE T Broadcast(T input, unsigned int src_lane)
1878
+ {
1879
+ return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
1880
+ }
1881
+
1882
+ //@} end member group
1883
+ };
1884
+
1885
+ CUB_NAMESPACE_END