cuda-cccl 0.1.3.1.0.dev1486__cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1819):
  1. cuda/cccl/__init__.py +14 -0
  2. cuda/cccl/cooperative/__init__.py +3 -0
  3. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  4. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  5. cuda/cccl/cooperative/experimental/_common.py +276 -0
  6. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  7. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  8. cuda/cccl/cooperative/experimental/_types.py +953 -0
  9. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  10. cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
  11. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  12. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  13. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  14. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  15. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  16. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  17. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
  18. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  20. cuda/cccl/headers/__init__.py +7 -0
  21. cuda/cccl/headers/include/__init__.py +1 -0
  22. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
  23. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  24. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  25. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +919 -0
  26. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
  27. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +752 -0
  28. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  29. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  32. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
  33. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  34. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
  35. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  36. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  37. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  38. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
  39. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
  40. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  41. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  42. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
  43. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  44. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  45. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  46. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  47. cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
  48. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  49. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  50. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  51. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  52. cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
  53. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  54. cuda/cccl/headers/include/cub/block/block_scan.cuh +2600 -0
  55. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  56. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  57. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  58. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  59. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  65. cuda/cccl/headers/include/cub/config.cuh +60 -0
  66. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  67. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  68. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  69. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  70. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  71. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  72. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
  73. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
  74. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  75. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  76. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  77. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  82. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  83. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  84. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  85. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +355 -0
  86. cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
  87. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  88. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  89. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  90. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  91. cuda/cccl/headers/include/cub/device/device_for.cuh +994 -0
  92. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  93. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  94. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  95. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  96. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  97. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3431 -0
  98. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1387 -0
  99. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
  100. cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
  101. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  102. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  103. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  104. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  105. cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +502 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +397 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +523 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +437 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +283 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  154. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
  155. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  156. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  157. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  158. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
  159. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  160. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  161. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  162. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
  163. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
  164. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
  165. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  166. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
  167. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  168. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  169. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  170. cuda/cccl/headers/include/cub/util_arch.cuh +163 -0
  171. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  172. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  173. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  174. cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
  175. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  176. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  177. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  178. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  179. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  180. cuda/cccl/headers/include/cub/util_type.cuh +1111 -0
  181. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  182. cuda/cccl/headers/include/cub/version.cuh +89 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  184. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  185. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  186. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  187. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
  189. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  190. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  191. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  192. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  193. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
  194. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +169 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
  201. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
  202. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  203. cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
  204. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier.h +66 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
  209. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  210. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +61 -0
  211. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  212. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  213. cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
  214. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +126 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  217. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  218. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +106 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
  225. cuda/cccl/headers/include/cuda/__execution/require.h +67 -0
  226. cuda/cccl/headers/include/cuda/__execution/tune.h +62 -0
  227. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  228. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +279 -0
  229. cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
  230. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  231. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  232. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  233. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  234. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  235. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  236. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  237. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +261 -0
  238. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +407 -0
  239. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  240. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +323 -0
  241. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +481 -0
  242. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  243. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +457 -0
  244. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  245. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +123 -0
  246. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  247. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  248. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  249. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  250. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  251. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
  252. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
  253. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
  254. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  255. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  256. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
  257. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  258. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  259. cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
  260. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
  261. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +158 -0
  262. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
  263. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
  264. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
  265. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  266. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
  267. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  268. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
  269. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  270. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  271. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  272. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  273. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  274. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  275. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  276. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  277. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  278. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  279. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  280. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  281. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  282. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  283. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  284. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  285. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  286. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  287. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  288. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
  289. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  290. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  291. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
  292. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
  293. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
  294. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  295. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  296. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  297. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
  298. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  299. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
  300. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  301. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  302. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  303. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  304. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +275 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  376. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  377. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
  378. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  379. cuda/cccl/headers/include/cuda/__stream/get_stream.h +97 -0
  380. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +165 -0
  381. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  382. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  383. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +66 -0
  384. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
  385. cuda/cccl/headers/include/cuda/access_property +26 -0
  386. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  387. cuda/cccl/headers/include/cuda/atomic +27 -0
  388. cuda/cccl/headers/include/cuda/barrier +262 -0
  389. cuda/cccl/headers/include/cuda/bit +29 -0
  390. cuda/cccl/headers/include/cuda/cmath +35 -0
  391. cuda/cccl/headers/include/cuda/discard_memory +61 -0
  392. cuda/cccl/headers/include/cuda/functional +31 -0
  393. cuda/cccl/headers/include/cuda/iterator +31 -0
  394. cuda/cccl/headers/include/cuda/latch +27 -0
  395. cuda/cccl/headers/include/cuda/mdspan +28 -0
  396. cuda/cccl/headers/include/cuda/memory +28 -0
  397. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  398. cuda/cccl/headers/include/cuda/numeric +28 -0
  399. cuda/cccl/headers/include/cuda/pipeline +579 -0
  400. cuda/cccl/headers/include/cuda/ptx +118 -0
  401. cuda/cccl/headers/include/cuda/semaphore +31 -0
  402. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +60 -0
  403. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +46 -0
  404. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +46 -0
  405. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
  406. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  407. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  408. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  409. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  410. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +79 -0
  411. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  412. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +74 -0
  413. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  414. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  415. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +129 -0
  416. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  417. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  418. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  419. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +64 -0
  420. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  421. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  422. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  423. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  424. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  425. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  426. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  427. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  428. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  429. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  430. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +51 -0
  431. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  432. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +58 -0
  433. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  434. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +50 -0
  435. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +69 -0
  436. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  437. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +188 -0
  438. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  439. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +83 -0
  440. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +72 -0
  441. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  442. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  443. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +70 -0
  444. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  445. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  446. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +88 -0
  447. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +71 -0
  448. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
  449. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  450. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +88 -0
  451. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  452. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +89 -0
  453. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +46 -0
  454. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  455. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  456. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +121 -0
  457. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  458. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  459. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +95 -0
  460. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +89 -0
  461. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +103 -0
  462. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  463. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +99 -0
  464. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +69 -0
  465. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  466. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  467. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  468. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  469. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +264 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +123 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +135 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +129 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +72 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +77 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +156 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +96 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +127 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  495. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  496. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  497. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  498. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  499. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
  500. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  501. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  502. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  503. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  504. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  505. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  506. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  507. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  508. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  509. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
  510. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
  511. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  512. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
  513. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  514. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  515. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  516. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  517. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  518. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  519. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +84 -0
  520. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
  521. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
  522. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  523. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  524. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  525. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  526. cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
  527. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  528. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1274 -0
  529. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  530. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  531. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +146 -0
  532. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
  533. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +1343 -0
  534. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +216 -0
  535. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
  536. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  537. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  538. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +129 -0
  539. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +124 -0
  540. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
  541. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +35 -0
  542. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  543. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +129 -0
  544. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  545. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  546. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1234 -0
  547. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
  548. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
  549. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  550. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  551. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  552. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  553. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  554. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +112 -0
  555. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  556. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  557. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  558. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  559. cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
  560. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +240 -0
  561. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +187 -0
  562. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +620 -0
  563. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +207 -0
  564. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +181 -0
  565. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +250 -0
  566. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +213 -0
  567. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +250 -0
  568. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +323 -0
  569. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +163 -0
  570. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +201 -0
  571. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +176 -0
  572. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +129 -0
  573. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +106 -0
  574. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +503 -0
  575. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +236 -0
  576. cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
  577. cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
  578. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +180 -0
  579. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +877 -0
  580. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
  581. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
  582. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +292 -0
  583. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +351 -0
  584. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +350 -0
  585. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +135 -0
  586. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  587. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  588. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  589. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
  590. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  591. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  592. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
  593. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  594. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
  595. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  596. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
  597. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  598. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  599. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  600. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  601. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  602. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  603. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  605. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  606. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  607. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  608. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  609. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  610. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  611. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  612. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  613. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  614. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  615. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  616. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  617. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +143 -0
  618. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  619. cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
  620. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  621. cuda/cccl/headers/include/cuda/std/__expected/expected.h +2002 -0
  622. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1078 -0
  623. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  624. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +178 -0
  625. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  626. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  627. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  628. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  629. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
  630. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
  631. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  632. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  633. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
  634. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  635. cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  637. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  638. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  639. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  640. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  641. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  642. cuda/cccl/headers/include/cuda/std/__functional/bind.h +352 -0
  643. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +88 -0
  644. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  645. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +75 -0
  646. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +75 -0
  647. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  648. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  649. cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
  650. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  651. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  652. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  653. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  654. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  655. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +214 -0
  656. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +121 -0
  657. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  658. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
  659. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  660. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  661. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  662. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  663. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  664. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +67 -0
  665. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  666. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +278 -0
  667. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  668. cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  670. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  671. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  672. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  673. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  674. cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
  675. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
  676. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  677. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  678. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  679. cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
  680. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  681. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  682. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  683. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  684. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  685. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  686. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
  687. cuda/cccl/headers/include/cuda/std/__iterator/access.h +132 -0
  688. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +230 -0
  689. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +103 -0
  690. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +264 -0
  691. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +608 -0
  692. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +469 -0
  693. cuda/cccl/headers/include/cuda/std/__iterator/data.h +63 -0
  694. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  695. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  696. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +54 -0
  697. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  698. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +98 -0
  699. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
  700. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  701. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +105 -0
  702. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +141 -0
  703. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  704. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  705. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  706. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +935 -0
  708. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  709. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +401 -0
  710. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  711. cuda/cccl/headers/include/cuda/std/__iterator/next.h +102 -0
  712. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +99 -0
  713. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +101 -0
  714. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  715. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +92 -0
  716. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  717. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  718. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +146 -0
  719. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +615 -0
  720. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  721. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  722. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +88 -0
  723. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +259 -0
  724. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  725. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  726. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
  727. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  728. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +55 -0
  729. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +140 -0
  730. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +134 -0
  731. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +328 -0
  732. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +100 -0
  733. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  734. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +74 -0
  735. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +363 -0
  736. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +765 -0
  737. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +317 -0
  738. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +310 -0
  739. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +615 -0
  740. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  741. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  742. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +190 -0
  743. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +347 -0
  744. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  745. cuda/cccl/headers/include/cuda/std/__memory/align.h +87 -0
  746. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  747. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  748. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  749. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  750. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  751. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +569 -0
  752. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  753. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  754. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +231 -0
  755. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  756. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  757. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  758. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +260 -0
  759. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  760. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +686 -0
  761. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +771 -0
  762. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
  763. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  764. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  765. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  766. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  767. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  768. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  769. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +57 -0
  770. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  771. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  772. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
  773. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  774. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  775. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  776. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
  777. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +70 -0
  778. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +61 -0
  779. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  780. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  781. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  782. cuda/cccl/headers/include/cuda/std/__ranges/access.h +304 -0
  783. cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
  784. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
  785. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  786. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  787. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  788. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +111 -0
  789. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  790. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  791. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
  792. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  793. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +271 -0
  794. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  795. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  796. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +114 -0
  797. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  798. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  799. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  800. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +343 -0
  801. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +156 -0
  802. cuda/cccl/headers/include/cuda/std/__ranges/size.h +200 -0
  803. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  804. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +263 -0
  805. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +531 -0
  806. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  807. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
  808. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  809. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  810. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  811. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  812. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +591 -0
  813. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +299 -0
  814. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  815. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  816. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  817. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  818. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  819. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  820. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  821. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +144 -0
  822. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  823. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  824. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  825. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +236 -0
  826. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
  827. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  828. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  829. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  830. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  831. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  832. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  833. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +242 -0
  834. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  835. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  836. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  837. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  838. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  839. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  840. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  841. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  842. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  843. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  844. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  845. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  846. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
  847. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  848. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  849. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  850. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  851. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  852. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  853. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  854. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  855. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  856. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  857. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  858. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  859. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  860. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  861. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  862. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  863. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  864. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  865. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  866. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  867. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  868. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  869. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  870. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  871. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  872. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  873. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  874. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  875. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  876. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  877. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  878. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  879. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +43 -0
  880. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  881. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  882. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  884. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  885. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  886. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  887. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  888. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  889. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  890. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  891. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  892. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  893. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  894. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  895. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  896. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  897. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  898. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  899. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  900. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  901. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  902. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  903. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  904. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  905. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  906. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +79 -0
  907. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  908. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  909. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  910. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  911. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  912. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  913. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
  915. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +43 -0
  916. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  917. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  918. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  919. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  920. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  921. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  922. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  923. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  924. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  925. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  926. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  927. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +203 -0
  928. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  929. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  930. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  931. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  932. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  933. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  934. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  935. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  936. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  937. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  938. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  939. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  940. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  941. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  942. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  943. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  944. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  945. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  946. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  947. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  948. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  949. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  950. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  951. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  952. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  953. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  954. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  955. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  956. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  957. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  958. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  959. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1069 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  973. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  974. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
  975. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  976. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +103 -0
  977. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  978. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  979. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  980. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  981. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +56 -0
  982. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  983. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  984. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  985. cuda/cccl/headers/include/cuda/std/__utility/move.h +75 -0
  986. cuda/cccl/headers/include/cuda/std/__utility/pair.h +808 -0
  987. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  988. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +763 -0
  989. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  990. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  991. cuda/cccl/headers/include/cuda/std/__utility/swap.h +65 -0
  992. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  993. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +425 -0
  994. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  995. cuda/cccl/headers/include/cuda/std/array +527 -0
  996. cuda/cccl/headers/include/cuda/std/atomic +823 -0
  997. cuda/cccl/headers/include/cuda/std/barrier +43 -0
  998. cuda/cccl/headers/include/cuda/std/bit +35 -0
  999. cuda/cccl/headers/include/cuda/std/bitset +1026 -0
  1000. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1001. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1002. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1003. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1004. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1005. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1006. cuda/cccl/headers/include/cuda/std/complex +25 -0
  1007. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1008. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1009. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1010. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1011. cuda/cccl/headers/include/cuda/std/cstring +111 -0
  1012. cuda/cccl/headers/include/cuda/std/ctime +147 -0
  1013. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1014. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +258 -0
  1015. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +2692 -0
  1016. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3689 -0
  1017. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +685 -0
  1018. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/complex +1610 -0
  1019. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1020. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/optional +1786 -0
  1021. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1022. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1378 -0
  1023. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2160 -0
  1024. cuda/cccl/headers/include/cuda/std/execution +27 -0
  1025. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1026. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1027. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1028. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1029. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1030. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1031. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1032. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1033. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1034. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1035. cuda/cccl/headers/include/cuda/std/numbers +335 -0
  1036. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1037. cuda/cccl/headers/include/cuda/std/optional +25 -0
  1038. cuda/cccl/headers/include/cuda/std/ranges +68 -0
  1039. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1040. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1041. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1042. cuda/cccl/headers/include/cuda/std/span +640 -0
  1043. cuda/cccl/headers/include/cuda/std/string_view +814 -0
  1044. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1045. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1046. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1047. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1048. cuda/cccl/headers/include/cuda/std/version +245 -0
  1049. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1050. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1051. cuda/cccl/headers/include/cuda/version +16 -0
  1052. cuda/cccl/headers/include/cuda/warp +28 -0
  1053. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1054. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1055. cuda/cccl/headers/include/nv/detail/__target_macros +599 -0
  1056. cuda/cccl/headers/include/nv/target +229 -0
  1057. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1058. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1059. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1060. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1061. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1062. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1063. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1064. cuda/cccl/headers/include/thrust/count.h +245 -0
  1065. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1066. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1067. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1068. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1069. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1070. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1071. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1072. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1073. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1074. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1075. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1076. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1077. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1078. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1079. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1080. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1081. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1082. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
  1083. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1084. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1085. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1086. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1087. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1088. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1089. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1090. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1091. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1092. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1093. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1094. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1095. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1096. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1097. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1098. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1099. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1100. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1101. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1102. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1103. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1104. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1105. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1106. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1107. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1108. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1109. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1110. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1111. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1112. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1113. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1114. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1115. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1116. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1117. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1118. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1119. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1120. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1121. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1122. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1123. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1124. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1125. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1126. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1127. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1128. cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
  1129. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1130. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1131. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1132. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1133. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1134. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1135. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1136. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1137. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1138. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1139. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1140. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1141. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1142. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1143. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1144. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1145. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1146. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1147. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1148. cuda/cccl/headers/include/thrust/detail/internal_functional.h +285 -0
  1149. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1150. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +92 -0
  1151. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1152. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1153. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1154. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1155. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1156. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1157. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1158. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1159. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1160. cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
  1161. cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
  1162. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1163. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1164. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1165. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1166. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1167. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
  1168. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1169. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1170. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1171. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1172. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1173. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1174. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1175. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1176. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1177. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1178. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1179. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1180. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1181. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1182. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1183. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1184. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1185. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1186. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +138 -0
  1187. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1188. cuda/cccl/headers/include/thrust/detail/transform.inl +250 -0
  1189. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1190. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1191. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +131 -0
  1192. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1193. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1194. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1195. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1196. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1197. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1198. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1199. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +60 -0
  1200. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1201. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1202. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1203. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1204. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1205. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1206. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1207. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1208. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1209. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1210. cuda/cccl/headers/include/thrust/detail/vector_base.h +630 -0
  1211. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1242 -0
  1212. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1213. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1214. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1215. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1216. cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
  1217. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1218. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1219. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1220. cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
  1221. cuda/cccl/headers/include/thrust/device_reference.h +986 -0
  1222. cuda/cccl/headers/include/thrust/device_vector.h +574 -0
  1223. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1224. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1225. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1226. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1227. cuda/cccl/headers/include/thrust/fill.h +201 -0
  1228. cuda/cccl/headers/include/thrust/find.h +382 -0
  1229. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1230. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1231. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1232. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1233. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1234. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1235. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
  1236. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1237. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1238. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1239. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1240. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1241. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1242. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1243. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1244. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1245. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1246. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1247. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1248. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1249. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1250. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1251. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1252. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +164 -0
  1253. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1254. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1255. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1256. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +245 -0
  1257. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
  1258. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1259. cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
  1260. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
  1261. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
  1262. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1263. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1264. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1265. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1266. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1267. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
  1268. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1269. cuda/cccl/headers/include/thrust/memory.h +395 -0
  1270. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1271. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1272. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1273. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1274. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1275. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1276. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
  1277. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1278. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1279. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1280. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1281. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1282. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1283. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1284. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1285. cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
  1286. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1287. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1288. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1289. cuda/cccl/headers/include/thrust/partition.h +1383 -0
  1290. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1291. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1292. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1293. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1294. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1295. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1296. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1297. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1298. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1299. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1300. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1301. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1302. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1303. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1304. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1305. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1306. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1307. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1308. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1309. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1310. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1311. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1312. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1313. cuda/cccl/headers/include/thrust/random.h +120 -0
  1314. cuda/cccl/headers/include/thrust/reduce.h +1112 -0
  1315. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1316. cuda/cccl/headers/include/thrust/replace.h +827 -0
  1317. cuda/cccl/headers/include/thrust/reverse.h +213 -0
  1318. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1319. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1320. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1321. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1322. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1323. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1324. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1325. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1326. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1327. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1328. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1329. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1330. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1331. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1332. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1333. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1334. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1335. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1336. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1337. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1338. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1339. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1340. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1341. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1342. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1343. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1344. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1345. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1346. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1347. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1348. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1349. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1350. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1351. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1352. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1353. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1354. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1355. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1356. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1357. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1358. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1359. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1360. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1361. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1362. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1363. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1364. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1365. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1366. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1367. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1368. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1369. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1370. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1371. cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
  1372. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
  1373. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1374. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1375. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +119 -0
  1376. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1377. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1378. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1379. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1380. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1381. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1382. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1383. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1384. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1385. cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
  1386. cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +60 -0
  1387. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1388. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +630 -0
  1389. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1390. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1391. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1392. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1393. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1394. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1395. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1396. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1397. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1398. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1399. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1400. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1401. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1402. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1403. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1404. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +98 -0
  1405. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1406. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1407. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1408. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1409. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1410. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1411. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1412. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1413. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1414. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1415. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1416. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1417. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +961 -0
  1418. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
  1419. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1420. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +164 -0
  1421. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
  1422. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1423. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1424. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1425. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1426. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
  1427. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1428. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
  1429. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1430. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1431. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1432. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
  1433. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1434. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1435. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
  1436. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1437. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +648 -0
  1438. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1439. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1440. cuda/cccl/headers/include/thrust/system/cuda/error.h +175 -0
  1441. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1442. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1443. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1444. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +140 -0
  1445. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1446. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1447. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1448. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1449. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1450. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1451. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1452. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1453. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1454. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1455. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1456. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1457. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1458. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1459. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1460. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1461. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1462. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1463. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1464. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1465. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1466. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1467. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
  1468. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1469. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1470. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1471. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1472. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1473. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1474. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1475. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1476. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1477. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1478. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1479. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1480. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1481. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1482. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1483. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1484. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1485. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1486. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1487. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1488. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1489. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1490. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1491. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1492. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1493. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1494. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1495. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1496. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1497. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1498. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1499. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1500. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1501. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1502. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1503. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1504. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1505. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1506. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1507. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1508. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1509. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1510. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1511. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1512. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1513. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1514. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1515. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1516. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1517. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1518. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1519. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1520. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1521. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1522. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1523. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1524. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1525. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1526. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1527. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1528. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1529. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1530. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1531. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1532. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1533. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1534. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1535. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1536. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1537. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
  1538. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1539. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1540. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1541. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1542. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1543. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1544. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1545. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1546. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1547. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +55 -0
  1548. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.inl +95 -0
  1549. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1550. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1551. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1552. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1553. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1554. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1555. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1556. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1557. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1558. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1559. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1560. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1561. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1562. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +109 -0
  1563. cuda/cccl/headers/include/thrust/system/detail/generic/transform.inl +185 -0
  1564. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1565. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1566. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1567. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +187 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1635. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1636. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1637. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1638. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1639. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1640. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1641. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1642. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1643. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1644. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1645. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1646. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1647. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1648. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1649. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1650. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1651. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1652. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1653. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1654. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1655. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1656. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1657. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1658. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1659. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1660. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1661. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1662. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1663. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1664. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1665. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1666. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1667. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1668. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1669. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1670. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1671. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1672. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1673. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1674. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1675. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1676. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1677. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1678. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1679. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1680. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1681. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1682. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1683. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1684. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +259 -0
  1685. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1686. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1687. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1688. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1689. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1690. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1691. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1692. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1693. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1694. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1695. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1696. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1697. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
  1698. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1699. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1700. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1701. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1702. cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
  1703. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1704. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1705. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1706. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1707. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1708. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1709. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1710. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1711. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1712. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1713. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1714. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1715. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1716. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1717. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1718. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1719. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1720. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1721. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1722. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1723. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1724. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1725. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1726. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1727. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1728. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1729. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1730. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1731. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1732. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1734. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1735. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1736. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1737. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1738. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1739. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1740. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1741. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1742. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1743. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1744. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1745. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1746. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1747. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1748. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1749. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1750. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1751. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1752. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1754. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1755. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1756. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1757. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1758. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1759. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1760. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1761. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1762. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
  1763. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1764. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1765. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +120 -0
  1766. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1767. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1768. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1769. cuda/cccl/headers/include/thrust/transform.h +903 -0
  1770. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1771. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1772. cuda/cccl/headers/include/thrust/tuple.h +142 -0
  1773. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1774. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +182 -0
  1775. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1776. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1777. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1778. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +306 -0
  1779. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1780. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +93 -0
  1781. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1782. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1783. cuda/cccl/headers/include/thrust/unique.h +1090 -0
  1784. cuda/cccl/headers/include/thrust/universal_allocator.h +90 -0
  1785. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1786. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1787. cuda/cccl/headers/include/thrust/version.h +93 -0
  1788. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1789. cuda/cccl/headers/include_paths.py +72 -0
  1790. cuda/cccl/parallel/__init__.py +3 -0
  1791. cuda/cccl/parallel/experimental/__init__.py +3 -0
  1792. cuda/cccl/parallel/experimental/_bindings.py +24 -0
  1793. cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
  1794. cuda/cccl/parallel/experimental/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  1795. cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
  1796. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1797. cuda/cccl/parallel/experimental/_cccl_interop.py +371 -0
  1798. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1799. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1800. cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
  1801. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
  1802. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
  1803. cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
  1804. cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
  1805. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
  1806. cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
  1807. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
  1808. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1809. cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
  1810. cuda/cccl/parallel/experimental/iterators/__init__.py +157 -0
  1811. cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
  1812. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1813. cuda/cccl/parallel/experimental/struct.py +150 -0
  1814. cuda/cccl/parallel/experimental/typing.py +27 -0
  1815. cuda/cccl/py.typed +0 -0
  1816. cuda_cccl-0.1.3.1.0.dev1486.dist-info/METADATA +29 -0
  1817. cuda_cccl-0.1.3.1.0.dev1486.dist-info/RECORD +1819 -0
  1818. cuda_cccl-0.1.3.1.0.dev1486.dist-info/WHEEL +6 -0
  1819. cuda_cccl-0.1.3.1.0.dev1486.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,1901 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data
31
+ //! items residing within device-accessible memory.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/detail/choose_offset.cuh>
46
+ #include <cub/device/dispatch/dispatch_scan.cuh>
47
+ #include <cub/device/dispatch/dispatch_scan_by_key.cuh>
48
+ #include <cub/thread/thread_operators.cuh>
49
+
50
+ #include <cuda/std/__functional/invoke.h>
51
+
52
+ CUB_NAMESPACE_BEGIN
53
+
54
+ //! @rst
55
+ //! DeviceScan provides device-wide, parallel operations for computing a
56
+ //! prefix scan across a sequence of data items residing within
57
+ //! device-accessible memory.
58
+ //!
59
+ //! Overview
60
+ //! +++++++++++++++++++++++++++++++++++++++++++++
61
+ //!
62
+ //! Given a sequence of input elements and a binary reduction operator, a
63
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output
64
+ //! sequence where each element is computed to be the reduction of the elements
65
+ //! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan
66
+ //! with the addition operator. The term *inclusive* indicates that the
67
+ //! *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input.
68
+ //! The term *exclusive* indicates the *i*\ :sup:`th` input is not
69
+ //! incorporated into the *i*\ :sup:`th` output reduction. When the input and
70
+ //! output sequences are the same, the scan is performed in-place.
71
+ //!
72
+ //! As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our
73
+ //! *"decoupled look-back"* algorithm for performing global prefix scan with
74
+ //! only a single pass through the input data, as described in our 2016 technical
75
+ //! report [1]_. The central idea is to leverage a small, constant factor of
76
+ //! redundant work in order to overlap the latencies of global prefix
77
+ //! propagation with local computation. As such, our algorithm requires only
78
+ //! ``~2n`` data movement (``n`` inputs are read, ``n`` outputs are written), and
79
+ //! typically proceeds at "memcpy" speeds. Our algorithm supports inplace operations.
80
+ //!
81
+ //! .. [1] Duane Merrill and Michael Garland. `Single-pass Parallel Prefix Scan with Decoupled Look-back
82
+ //! <https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back>`_,
83
+ //! *NVIDIA Technical Report NVR-2016-002*, 2016.
84
+ //!
85
+ //! Usage Considerations
86
+ //! +++++++++++++++++++++++++++++++++++++++++++++
87
+ //!
88
+ //! @cdp_class{DeviceScan}
89
+ //!
90
+ //! Performance
91
+ //! +++++++++++++++++++++++++++++++++++++++++++++
92
+ //!
93
+ //! @linear_performance{prefix scan}
94
+ //!
95
+ //! @endrst
96
+ struct DeviceScan
97
+ {
98
+ //! @name Exclusive scans
99
+ //! @{
100
+
101
+ //! @rst
102
+ //! Computes a device-wide exclusive prefix sum.
103
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
104
+ //!
105
+ //! - Supports non-commutative sum operators.
106
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
107
+ //! addition of floating-point types). Results for pseudo-associative
108
+ //! operators may vary from run to run. Additional details can be found in
109
+ //! the @lookback description.
110
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
111
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
112
+ //! shall not overlap in any other way.
113
+ //! - @devicestorage
114
+ //!
115
+ //! Snippet
116
+ //! +++++++++++++++++++++++++++++++++++++++++++++
117
+ //!
118
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
119
+ //! device vector.
120
+ //!
121
+ //! .. code-block:: c++
122
+ //!
123
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
124
+ //!
125
+ //! // Declare, allocate, and initialize device-accessible pointers for
126
+ //! // input and output
127
+ //! int num_items; // e.g., 7
128
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
129
+ //! int *d_out; // e.g., [ , , , , , , ]
130
+ //! ...
131
+ //!
132
+ //! // Determine temporary device storage requirements
133
+ //! void *d_temp_storage = nullptr;
134
+ //! size_t temp_storage_bytes = 0;
135
+ //! cub::DeviceScan::ExclusiveSum(
136
+ //! d_temp_storage, temp_storage_bytes,
137
+ //! d_in, d_out, num_items);
138
+ //!
139
+ //! // Allocate temporary storage
140
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
141
+ //!
142
+ //! // Run exclusive prefix sum
143
+ //! cub::DeviceScan::ExclusiveSum(
144
+ //! d_temp_storage, temp_storage_bytes,
145
+ //! d_in, d_out, num_items);
146
+ //!
147
+ //! // d_out <-- [0, 8, 14, 21, 26, 29, 29]
148
+ //!
149
+ //! @endrst
150
+ //!
151
+ //! @tparam InputIteratorT
152
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
153
+ //!
154
+ //! @tparam OutputIteratorT
155
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
156
+ //!
157
+ //! @tparam NumItemsT
158
+ //! **[inferred]** An integral type representing the number of input elements
159
+ //!
160
+ //! @param[in] d_temp_storage
161
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
162
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
163
+ //!
164
+ //! @param[in,out] temp_storage_bytes
165
+ //! Reference to size in bytes of `d_temp_storage` allocation
166
+ //!
167
+ //! @param[in] d_in
168
+ //! Random-access iterator to the input sequence of data items
169
+ //!
170
+ //! @param[out] d_out
171
+ //! Random-access iterator to the output sequence of data items
172
+ //!
173
+ //! @param[in] num_items
174
+ //! Total number of input items (i.e., the length of `d_in`)
175
+ //!
176
+ //! @param[in] stream
177
+ //! @rst
178
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
179
+ //! @endrst
180
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
181
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( // out-of-place overload: separate input and output iterators
182
+ void* d_temp_storage,
183
+ size_t& temp_storage_bytes,
184
+ InputIteratorT d_in,
185
+ OutputIteratorT d_out,
186
+ NumItemsT num_items,
187
+ cudaStream_t stream = 0)
188
+ {
189
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); // NVTX range labelled with this entry point; conditioned on d_temp_storage (presumably skipped for the size query -- confirm against the macro)
190
+
191
+ // Offset type for global indexing, chosen from NumItemsT by detail::choose_offset_t (originally noted as unsigned)
192
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
193
+ using InitT = cub::detail::it_value_t<InputIteratorT>; // seed type is derived from the input iterator, not from the output iterator
194
+
195
+ // Value-initialized seed: this is the documented ``0`` initial value of the exclusive sum
196
+ InitT init_value{};
197
+
198
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, detail::InputValue<InitT>, OffsetT>::
199
+ Dispatch(d_temp_storage,
200
+ temp_storage_bytes,
201
+ d_in,
202
+ d_out,
203
+ ::cuda::std::plus<>{}, // addition is the scan operator of a prefix sum
204
+ detail::InputValue<InitT>(init_value), // seed wrapped in detail::InputValue for the dispatch layer
205
+ num_items,
206
+ stream);
207
+ }
208
+
209
+ //! @rst
210
+ //! Computes a device-wide exclusive prefix sum in-place.
211
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``.
212
+ //!
213
+ //! - Supports non-commutative sum operators.
214
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
215
+ //! addition of floating-point types). Results for pseudo-associative
216
+ //! operators may vary from run to run. Additional details can be found in
217
+ //! the @lookback description.
218
+ //! - @devicestorage
219
+ //!
220
+ //! Snippet
221
+ //! +++++++++++++++++++++++++++++++++++++++++++++
222
+ //!
223
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
224
+ //! device vector.
225
+ //!
226
+ //! .. code-block:: c++
227
+ //!
228
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
229
+ //!
230
+ //! // Declare, allocate, and initialize device-accessible pointers for
231
+ //! // input and output
232
+ //! int num_items; // e.g., 7
233
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
234
+ //! ...
235
+ //!
236
+ //! // Determine temporary device storage requirements
237
+ //! void *d_temp_storage = nullptr;
238
+ //! size_t temp_storage_bytes = 0;
239
+ //! cub::DeviceScan::ExclusiveSum(
240
+ //! d_temp_storage, temp_storage_bytes,
241
+ //! d_data, num_items);
242
+ //!
243
+ //! // Allocate temporary storage
244
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
245
+ //!
246
+ //! // Run exclusive prefix sum
247
+ //! cub::DeviceScan::ExclusiveSum(
248
+ //! d_temp_storage, temp_storage_bytes,
249
+ //! d_data, num_items);
250
+ //!
251
+ //! // d_data <-- [0, 8, 14, 21, 26, 29, 29]
252
+ //!
253
+ //! @endrst
254
+ //!
255
+ //! @tparam IteratorT
256
+ //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
257
+ //!
258
+ //! @tparam NumItemsT
259
+ //! **[inferred]** An integral type representing the number of input elements
260
+ //!
261
+ //! @param[in] d_temp_storage
262
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
263
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
264
+ //!
265
+ //! @param[in,out] temp_storage_bytes
266
+ //! Reference to size in bytes of `d_temp_storage` allocation
267
+ //!
268
+ //! @param[in,out] d_data
269
+ //! Random-access iterator to the sequence of data items
270
+ //!
271
+ //! @param[in] num_items
272
+ //! Total number of input items (i.e., the length of `d_data`)
273
+ //!
274
+ //! @param[in] stream
275
+ //! @rst
276
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
277
+ //! @endrst
278
+ template <typename IteratorT, typename NumItemsT>
279
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( // in-place overload: a single iterator is both source and destination
280
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
281
+ {
282
+ return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); // forwards to the six-argument overload with d_data passed as both d_in and d_out
283
+ }
284
+
285
+ //! @rst
286
+ //! Computes a device-wide exclusive prefix scan using the specified
287
+ //! binary ``scan_op`` functor. The ``init_value`` value is applied as
288
+ //! the initial value, and is assigned to ``*d_out``.
289
+ //!
290
+ //! - Supports non-commutative scan operators.
291
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
292
+ //! addition of floating-point types). Results for pseudo-associative
293
+ //! operators may vary from run to run. Additional details can be found in
294
+ //! the @lookback description.
295
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
296
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
297
+ //! shall not overlap in any other way.
298
+ //! - @devicestorage
299
+ //!
300
+ //! Snippet
301
+ //! +++++++++++++++++++++++++++++++++++++++++++++
302
+ //!
303
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
304
+ //!
305
+ //! .. code-block:: c++
306
+ //!
307
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
308
+ //! #include <cuda/std/climits> // for INT_MAX
309
+ //!
310
+ //! // CustomMin functor
311
+ //! struct CustomMin
312
+ //! {
313
+ //! template <typename T>
314
+ //! __host__ __device__ __forceinline__
315
+ //! T operator()(const T &a, const T &b) const {
316
+ //! return (b < a) ? b : a;
317
+ //! }
318
+ //! };
319
+ //!
320
+ //! // Declare, allocate, and initialize device-accessible pointers for
321
+ //! // input and output
322
+ //! int num_items; // e.g., 7
323
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
324
+ //! int *d_out; // e.g., [ , , , , , , ]
325
+ //! CustomMin min_op;
326
+ //! ...
327
+ //!
328
+ //! // Determine temporary device storage requirements for exclusive
329
+ //! // prefix scan
330
+ //! void *d_temp_storage = nullptr;
331
+ //! size_t temp_storage_bytes = 0;
332
+ //! cub::DeviceScan::ExclusiveScan(
333
+ //! d_temp_storage, temp_storage_bytes,
334
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
335
+ //!
336
+ //! // Allocate temporary storage for exclusive prefix scan
337
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
338
+ //!
339
+ //! // Run exclusive prefix min-scan
340
+ //! cub::DeviceScan::ExclusiveScan(
341
+ //! d_temp_storage, temp_storage_bytes,
342
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
343
+ //!
344
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
345
+ //!
346
+ //! @endrst
347
+ //!
348
+ //! @tparam InputIteratorT
349
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
350
+ //!
351
+ //! @tparam OutputIteratorT
352
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
353
+ //!
354
+ //! @tparam ScanOpT
355
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
356
+ //!
357
+ //! @tparam InitValueT
358
+ //! **[inferred]** Type of the `init_value` used to seed the scan; the value is combined by the binary scan functor
359
+ //! having member `T operator()(const T &a, const T &b)`
360
+ //!
361
+ //! @tparam NumItemsT
362
+ //! **[inferred]** An integral type representing the number of input elements
363
+ //!
364
+ //! @param[in] d_temp_storage
365
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
366
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
367
+ //!
368
+ //! @param[in,out] temp_storage_bytes
369
+ //! Reference to size in bytes of `d_temp_storage` allocation
370
+ //!
371
+ //! @param[in] d_in
372
+ //! Random-access iterator to the input sequence of data items
373
+ //!
374
+ //! @param[out] d_out
375
+ //! Random-access iterator to the output sequence of data items
376
+ //!
377
+ //! @param[in] scan_op
378
+ //! Binary scan functor
379
+ //!
380
+ //! @param[in] init_value
381
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
382
+ //!
383
+ //! @param[in] num_items
384
+ //! Total number of input items (i.e., the length of `d_in`)
385
+ //!
386
+ //! @param[in] stream
387
+ //! @rst
388
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
389
+ //! @endrst
390
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
391
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( // out-of-place overload: caller supplies the operator and the seed by value
392
+ void* d_temp_storage,
393
+ size_t& temp_storage_bytes,
394
+ InputIteratorT d_in,
395
+ OutputIteratorT d_out,
396
+ ScanOpT scan_op,
397
+ InitValueT init_value,
398
+ NumItemsT num_items,
399
+ cudaStream_t stream = 0)
400
+ {
401
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); // NVTX range labelled with this entry point; conditioned on d_temp_storage (presumably skipped for the size query -- confirm against the macro)
402
+
403
+ // Offset type for global indexing, chosen from NumItemsT by detail::choose_offset_t (originally noted as unsigned)
404
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
405
+
406
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
407
+ d_temp_storage,
408
+ temp_storage_bytes,
409
+ d_in,
410
+ d_out,
411
+ scan_op,
412
+ detail::InputValue<InitValueT>(init_value), // by-value seed wrapped in detail::InputValue for the dispatch layer
413
+ num_items,
414
+ stream);
415
+ }
416
+
417
+ //! @rst
418
+ //! Computes a device-wide exclusive prefix scan using the specified
419
+ //! binary ``scan_op`` functor. The ``init_value`` value is applied as
420
+ //! the initial value, and is assigned to ``*d_data``.
421
+ //!
422
+ //! - Supports non-commutative scan operators.
423
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
424
+ //! addition of floating-point types). Results for pseudo-associative
425
+ //! operators may vary from run to run. Additional details can be found in
426
+ //! the @lookback description.
427
+ //! - @devicestorage
428
+ //!
429
+ //! Snippet
430
+ //! +++++++++++++++++++++++++++++++++++++++++++++
431
+ //!
432
+ //! The code snippet below illustrates the exclusive prefix min-scan of an
433
+ //! ``int`` device vector:
434
+ //!
435
+ //! .. code-block:: c++
436
+ //!
437
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
438
+ //! #include <cuda/std/climits> // for INT_MAX
439
+ //!
440
+ //! // CustomMin functor
441
+ //! struct CustomMin
442
+ //! {
443
+ //! template <typename T>
444
+ //! __host__ __device__ __forceinline__
445
+ //! T operator()(const T &a, const T &b) const {
446
+ //! return (b < a) ? b : a;
447
+ //! }
448
+ //! };
449
+ //!
450
+ //! // Declare, allocate, and initialize device-accessible pointers for
451
+ //! // input and output
452
+ //! int num_items; // e.g., 7
453
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
454
+ //! CustomMin min_op;
455
+ //! ...
456
+ //!
457
+ //! // Determine temporary device storage requirements for exclusive
458
+ //! // prefix scan
459
+ //! void *d_temp_storage = nullptr;
460
+ //! size_t temp_storage_bytes = 0;
461
+ //! cub::DeviceScan::ExclusiveScan(
462
+ //! d_temp_storage, temp_storage_bytes,
463
+ //! d_data, min_op, (int) INT_MAX, num_items);
464
+ //!
465
+ //! // Allocate temporary storage for exclusive prefix scan
466
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
467
+ //!
468
+ //! // Run exclusive prefix min-scan
469
+ //! cub::DeviceScan::ExclusiveScan(
470
+ //! d_temp_storage, temp_storage_bytes,
471
+ //! d_data, min_op, (int) INT_MAX, num_items);
472
+ //!
473
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
474
+ //!
475
+ //! @endrst
476
+ //!
477
+ //! @tparam IteratorT
478
+ //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
479
+ //!
480
+ //! @tparam ScanOpT
481
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
482
+ //!
483
+ //! @tparam InitValueT
484
+ //! **[inferred]** Type of the `init_value` used to seed the scan; the value is combined by the binary scan functor
485
+ //! having member `T operator()(const T &a, const T &b)`
486
+ //!
487
+ //! @tparam NumItemsT
488
+ //! **[inferred]** An integral type representing the number of input elements
489
+ //!
490
+ //! @param[in] d_temp_storage
491
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
492
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
493
+ //!
494
+ //! @param[in,out] temp_storage_bytes
495
+ //! Reference to size in bytes of `d_temp_storage` allocation
496
+ //!
497
+ //! @param[in,out] d_data
498
+ //! Random-access iterator to the sequence of data items
499
+ //!
500
+ //! @param[in] scan_op
501
+ //! Binary scan functor
502
+ //!
503
+ //! @param[in] init_value
504
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
505
+ //!
506
+ //! @param[in] num_items
507
+ //! Total number of input items (i.e., the length of `d_data`)
508
+ //!
509
+ //! @param[in] stream
510
+ //! @rst
511
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
512
+ //! @endrst
513
+ template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
514
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
515
+ void* d_temp_storage,
516
+ size_t& temp_storage_bytes,
517
+ IteratorT d_data, // single iterator: read as the input and overwritten with the output
518
+ ScanOpT scan_op,
519
+ InitValueT init_value,
520
+ NumItemsT num_items,
521
+ cudaStream_t stream = 0)
522
+ {
523
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); // in-place: forwards to the eight-argument overload, passing d_data as both the input and the output iterator
524
+ }
525
+
526
+ //! @rst
527
+ //! Computes a device-wide exclusive prefix scan using the specified
528
+ //! binary ``scan_op`` functor. The ``init_value`` value is provided as a future value.
529
+ //!
530
+ //! - Supports non-commutative scan operators.
531
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
532
+ //! addition of floating-point types). Results for pseudo-associative
533
+ //! operators may vary from run to run. Additional details can be found in
534
+ //! the @lookback description.
535
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
536
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
537
+ //! shall not overlap in any other way.
538
+ //! - @devicestorage
539
+ //!
540
+ //! Snippet
541
+ //! +++++++++++++++++++++++++++++++++++++++++++++
542
+ //!
543
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
544
+ //!
545
+ //! .. code-block:: c++
546
+ //!
547
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
548
+ //! #include <cuda/std/climits> // for INT_MAX
549
+ //!
550
+ //! // CustomMin functor
551
+ //! struct CustomMin
552
+ //! {
553
+ //! template <typename T>
554
+ //! __host__ __device__ __forceinline__
555
+ //! T operator()(const T &a, const T &b) const {
556
+ //! return (b < a) ? b : a;
557
+ //! }
558
+ //! };
559
+ //!
560
+ //! // Declare, allocate, and initialize device-accessible pointers for
561
+ //! // input and output
562
+ //! int num_items; // e.g., 7
563
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
564
+ //! int *d_out; // e.g., [ , , , , , , ]
565
+ //! int *d_init_iter; // e.g., INT_MAX
566
+ //! CustomMin min_op;
567
+ //!
568
+ //! auto future_init_value =
569
+ //! cub::FutureValue<int, int*>(d_init_iter);
570
+ //!
571
+ //! ...
572
+ //!
573
+ //! // Determine temporary device storage requirements for exclusive
574
+ //! // prefix scan
575
+ //! void *d_temp_storage = nullptr;
576
+ //! size_t temp_storage_bytes = 0;
577
+ //! cub::DeviceScan::ExclusiveScan(
578
+ //! d_temp_storage, temp_storage_bytes,
579
+ //! d_in, d_out, min_op, future_init_value, num_items);
580
+ //!
581
+ //! // Allocate temporary storage for exclusive prefix scan
582
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
583
+ //!
584
+ //! // Run exclusive prefix min-scan
585
+ //! cub::DeviceScan::ExclusiveScan(
586
+ //! d_temp_storage, temp_storage_bytes,
587
+ //! d_in, d_out, min_op, future_init_value, num_items);
588
+ //!
589
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
590
+ //!
591
+ //! @endrst
592
+ //!
593
+ //! @tparam InputIteratorT
594
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
595
+ //!
596
+ //! @tparam OutputIteratorT
597
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
598
+ //!
599
+ //! @tparam ScanOpT
600
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
601
+ //!
602
+ //! @tparam InitValueT
603
+ //! **[inferred]** Value type wrapped by the `init_value` future; it must be usable
604
+ //! as an operand of the binary scan functor `T operator()(const T &a, const T &b)`
605
+ //!
606
+ //! @tparam NumItemsT
607
+ //! **[inferred]** An integral type representing the number of input elements
608
+ //!
609
+ //! @param[in] d_temp_storage
610
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
611
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
612
+ //!
613
+ //! @param[in,out] temp_storage_bytes
614
+ //! Reference to size in bytes of `d_temp_storage` allocation
615
+ //!
616
+ //! @param[in] d_in
617
+ //! Pointer to the input sequence of data items
618
+ //!
619
+ //! @param[out] d_out
620
+ //! Pointer to the output sequence of data items
621
+ //!
622
+ //! @param[in] scan_op
623
+ //! Binary scan functor
624
+ //!
625
+ //! @param[in] init_value
626
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
627
+ //!
628
+ //! @param[in] num_items
629
+ //! Total number of input items (i.e., the length of `d_in`)
630
+ //!
631
+ //! @param[in] stream
632
+ //! @rst
633
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
634
+ //! @endrst
635
+ template <typename InputIteratorT,
636
+ typename OutputIteratorT,
637
+ typename ScanOpT,
638
+ typename InitValueT,
639
+ typename InitValueIterT = InitValueT*, // iterator type held by the FutureValue; defaults to a raw pointer to InitValueT
640
+ typename NumItemsT = int>
641
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
642
+ void* d_temp_storage,
643
+ size_t& temp_storage_bytes,
644
+ InputIteratorT d_in,
645
+ OutputIteratorT d_out,
646
+ ScanOpT scan_op,
647
+ FutureValue<InitValueT, InitValueIterT> init_value,
648
+ NumItemsT num_items,
649
+ cudaStream_t stream = 0)
650
+ {
651
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); // NVTX range conditioned on d_temp_storage; presumably skipped for the size-query call (nullptr) - TODO confirm against the macro definition
652
+
653
+ // Offset type used for global indexing, selected from NumItemsT by detail::choose_offset_t
654
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
655
+
656
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch( // note: InitValueIterT is not part of the DispatchScan template arguments
657
+ d_temp_storage,
658
+ temp_storage_bytes,
659
+ d_in,
660
+ d_out,
661
+ scan_op,
662
+ detail::InputValue<InitValueT>(init_value), // the FutureValue is converted to detail::InputValue before being handed to the dispatch layer
663
+ num_items,
664
+ stream);
665
+ }
666
+
667
+ //! @rst
668
+ //! Computes a device-wide exclusive prefix scan using the specified binary ``scan_op`` functor.
669
+ //! The ``init_value`` value is provided as a future value.
670
+ //!
671
+ //! - Supports non-commutative scan operators.
672
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
673
+ //! addition of floating-point types). Results for pseudo-associative
674
+ //! operators may vary from run to run. Additional details can be found in
675
+ //! the @lookback description.
676
+ //! - @devicestorage
677
+ //!
678
+ //! Snippet
679
+ //! +++++++++++++++++++++++++++++++++++++++++++++
680
+ //!
681
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
682
+ //!
683
+ //! .. code-block:: c++
684
+ //!
685
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
686
+ //! #include <cuda/std/climits> // for INT_MAX
687
+ //!
688
+ //! // CustomMin functor
689
+ //! struct CustomMin
690
+ //! {
691
+ //! template <typename T>
692
+ //! __host__ __device__ __forceinline__
693
+ //! T operator()(const T &a, const T &b) const {
694
+ //! return (b < a) ? b : a;
695
+ //! }
696
+ //! };
697
+ //!
698
+ //! // Declare, allocate, and initialize device-accessible pointers for
699
+ //! // input and output
700
+ //! int num_items; // e.g., 7
701
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
702
+ //! int *d_init_iter; // e.g., INT_MAX
703
+ //! CustomMin min_op;
704
+ //!
705
+ //! auto future_init_value =
706
+ //! cub::FutureValue<int, int*>(d_init_iter);
707
+ //!
708
+ //! ...
709
+ //!
710
+ //! // Determine temporary device storage requirements for exclusive
711
+ //! // prefix scan
712
+ //! void *d_temp_storage = nullptr;
713
+ //! size_t temp_storage_bytes = 0;
714
+ //! cub::DeviceScan::ExclusiveScan(
715
+ //! d_temp_storage, temp_storage_bytes,
716
+ //! d_data, min_op, future_init_value, num_items);
717
+ //!
718
+ //! // Allocate temporary storage for exclusive prefix scan
719
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
720
+ //!
721
+ //! // Run exclusive prefix min-scan
722
+ //! cub::DeviceScan::ExclusiveScan(
723
+ //! d_temp_storage, temp_storage_bytes,
724
+ //! d_data, min_op, future_init_value, num_items);
725
+ //!
726
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
727
+ //!
728
+ //! @endrst
729
+ //!
730
+ //! @tparam IteratorT
731
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
732
+ //!
733
+ //! @tparam ScanOpT
734
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
735
+ //!
736
+ //! @tparam InitValueT
737
+ //! **[inferred]** Value type wrapped by the `init_value` future; it must be usable
738
+ //! as an operand of the binary scan functor `T operator()(const T &a, const T &b)`
739
+ //!
740
+ //! @tparam NumItemsT
741
+ //! **[inferred]** An integral type representing the number of input elements
742
+ //!
743
+ //! @param[in] d_temp_storage
744
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
745
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
746
+ //!
747
+ //! @param[in,out] temp_storage_bytes
748
+ //! Reference to size in bytes of `d_temp_storage` allocation
749
+ //!
750
+ //! @param[in,out] d_data
751
+ //! Pointer to the sequence of data items
752
+ //!
753
+ //! @param[in] scan_op
754
+ //! Binary scan functor
755
+ //!
756
+ //! @param[in] init_value
757
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
758
+ //!
759
+ //! @param[in] num_items
760
+ //! Total number of input items (i.e., the length of `d_data`)
761
+ //!
762
+ //! @param[in] stream
763
+ //! @rst
764
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
765
+ //! @endrst
766
+ template <typename IteratorT,
767
+ typename ScanOpT,
768
+ typename InitValueT,
769
+ typename InitValueIterT = InitValueT*, // iterator type held by the FutureValue; defaults to a raw pointer to InitValueT
770
+ typename NumItemsT = int>
771
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
772
+ void* d_temp_storage,
773
+ size_t& temp_storage_bytes,
774
+ IteratorT d_data, // single iterator: read as the input and overwritten with the output
775
+ ScanOpT scan_op,
776
+ FutureValue<InitValueT, InitValueIterT> init_value,
777
+ NumItemsT num_items,
778
+ cudaStream_t stream = 0)
779
+ {
780
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); // in-place: forwards to the eight-argument overload, passing d_data as both the input and the output iterator
781
+ }
782
+
783
+ //! @} end member group
784
+ //! @name Inclusive scans
785
+ //! @{
786
+
787
+ //! @rst
788
+ //! Computes a device-wide inclusive prefix sum.
789
+ //!
790
+ //! - Supports non-commutative sum operators.
791
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
792
+ //! addition of floating-point types). Results for pseudo-associative
793
+ //! operators may vary from run to run. Additional details can be found in
794
+ //! the @lookback description.
795
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
796
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
797
+ //! shall not overlap in any other way.
798
+ //! - @devicestorage
799
+ //!
800
+ //! Snippet
801
+ //! +++++++++++++++++++++++++++++++++++++++++++++
802
+ //!
803
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
804
+ //!
805
+ //! .. code-block:: c++
806
+ //!
807
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
808
+ //!
809
+ //! // Declare, allocate, and initialize device-accessible pointers for
810
+ //! // input and output
811
+ //! int num_items; // e.g., 7
812
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
813
+ //! int *d_out; // e.g., [ , , , , , , ]
814
+ //! ...
815
+ //!
816
+ //! // Determine temporary device storage requirements for inclusive
817
+ //! // prefix sum
818
+ //! void *d_temp_storage = nullptr;
819
+ //! size_t temp_storage_bytes = 0;
820
+ //! cub::DeviceScan::InclusiveSum(
821
+ //! d_temp_storage, temp_storage_bytes,
822
+ //! d_in, d_out, num_items);
823
+ //!
824
+ //! // Allocate temporary storage for inclusive prefix sum
825
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
826
+ //!
827
+ //! // Run inclusive prefix sum
828
+ //! cub::DeviceScan::InclusiveSum(
829
+ //! d_temp_storage, temp_storage_bytes,
830
+ //! d_in, d_out, num_items);
831
+ //!
832
+ //! // d_out <-- [8, 14, 21, 26, 29, 29, 38]
833
+ //!
834
+ //! @endrst
835
+ //!
836
+ //! @tparam InputIteratorT
837
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
838
+ //!
839
+ //! @tparam OutputIteratorT
840
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
841
+ //!
842
+ //! @tparam NumItemsT
843
+ //! **[inferred]** An integral type representing the number of input elements
844
+ //!
845
+ //! @param[in] d_temp_storage
846
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
847
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
848
+ //!
849
+ //! @param[in,out] temp_storage_bytes
850
+ //! Reference to size in bytes of `d_temp_storage` allocation
851
+ //!
852
+ //! @param[in] d_in
853
+ //! Random-access iterator to the input sequence of data items
854
+ //!
855
+ //! @param[out] d_out
856
+ //! Random-access iterator to the output sequence of data items
857
+ //!
858
+ //! @param[in] num_items
859
+ //! Total number of input items (i.e., the length of `d_in`)
860
+ //!
861
+ //! @param[in] stream
862
+ //! @rst
863
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
864
+ //! @endrst
865
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
866
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
867
+ void* d_temp_storage,
868
+ size_t& temp_storage_bytes,
869
+ InputIteratorT d_in,
870
+ OutputIteratorT d_out,
871
+ NumItemsT num_items,
872
+ cudaStream_t stream = 0)
873
+ {
874
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum"); // NVTX range conditioned on d_temp_storage; presumably skipped for the size-query call (nullptr) - TODO confirm against the macro definition
875
+
876
+ // Offset type used for global indexing, selected from NumItemsT by detail::choose_offset_t
877
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
878
+
879
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, NullType, OffsetT>::Dispatch( // the sum is expressed as a scan with ::cuda::std::plus<> as the operator
880
+ d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream); // NullType{} fills the init-value argument; presumably it marks "no initial value" - confirm against DispatchScan
881
+ }
882
+
883
+ //! @rst
884
+ //! Computes a device-wide inclusive prefix sum in-place.
885
+ //!
886
+ //! - Supports non-commutative sum operators.
887
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
888
+ //! addition of floating-point types). Results for pseudo-associative
889
+ //! operators may vary from run to run. Additional details can be found in
890
+ //! the @lookback description.
891
+ //! - @devicestorage
892
+ //!
893
+ //! Snippet
894
+ //! +++++++++++++++++++++++++++++++++++++++++++++
895
+ //!
896
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
897
+ //!
898
+ //! .. code-block:: c++
899
+ //!
900
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
901
+ //!
902
+ //! // Declare, allocate, and initialize device-accessible pointers for
903
+ //! // input and output
904
+ //! int num_items; // e.g., 7
905
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
906
+ //! ...
907
+ //!
908
+ //! // Determine temporary device storage requirements for inclusive
909
+ //! // prefix sum
910
+ //! void *d_temp_storage = nullptr;
911
+ //! size_t temp_storage_bytes = 0;
912
+ //! cub::DeviceScan::InclusiveSum(
913
+ //! d_temp_storage, temp_storage_bytes,
914
+ //! d_data, num_items);
915
+ //!
916
+ //! // Allocate temporary storage for inclusive prefix sum
917
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
918
+ //!
919
+ //! // Run inclusive prefix sum
920
+ //! cub::DeviceScan::InclusiveSum(
921
+ //! d_temp_storage, temp_storage_bytes,
922
+ //! d_data, num_items);
923
+ //!
924
+ //! // d_data <-- [8, 14, 21, 26, 29, 29, 38]
925
+ //!
926
+ //! @endrst
927
+ //!
928
+ //! @tparam IteratorT
929
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
930
+ //!
931
+ //! @tparam NumItemsT
932
+ //! **[inferred]** An integral type representing the number of input elements
933
+ //!
934
+ //! @param[in] d_temp_storage
935
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
936
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
937
+ //!
938
+ //! @param[in,out] temp_storage_bytes
939
+ //! Reference to size in bytes of `d_temp_storage` allocation
940
+ //!
941
+ //! @param[in,out] d_data
942
+ //! Random-access iterator to the sequence of data items
943
+ //!
944
+ //! @param[in] num_items
945
+ //! Total number of input items (i.e., the length of `d_data`)
946
+ //!
947
+ //! @param[in] stream
948
+ //! @rst
949
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
950
+ //! @endrst
951
+ template <typename IteratorT, typename NumItemsT>
952
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
953
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
954
+ {
955
+ return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); // in-place: forwards to the six-argument overload, passing d_data as both the input and the output iterator
956
+ }
957
+
958
+ //! @rst
959
+ //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor.
960
+ //!
961
+ //! - Supports non-commutative scan operators.
962
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
963
+ //! addition of floating-point types). Results for pseudo-associative
964
+ //! operators may vary from run to run. Additional details can be found in
965
+ //! the @lookback description.
966
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
967
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
968
+ //! shall not overlap in any other way.
969
+ //! - @devicestorage
970
+ //!
971
+ //! Snippet
972
+ //! +++++++++++++++++++++++++++++++++++++++++++++
973
+ //!
974
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
975
+ //!
976
+ //! .. code-block:: c++
977
+ //!
978
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
979
+ //! #include <cuda/std/climits> // for INT_MAX
980
+ //!
981
+ //! // CustomMin functor
982
+ //! struct CustomMin
983
+ //! {
984
+ //! template <typename T>
985
+ //! __host__ __device__ __forceinline__
986
+ //! T operator()(const T &a, const T &b) const {
987
+ //! return (b < a) ? b : a;
988
+ //! }
989
+ //! };
990
+ //!
991
+ //! // Declare, allocate, and initialize device-accessible pointers for
992
+ //! // input and output
993
+ //! int num_items; // e.g., 7
994
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
995
+ //! int *d_out; // e.g., [ , , , , , , ]
996
+ //! CustomMin min_op;
997
+ //! ...
998
+ //!
999
+ //! // Determine temporary device storage requirements for inclusive
1000
+ //! // prefix scan
1001
+ //! void *d_temp_storage = nullptr;
1002
+ //! size_t temp_storage_bytes = 0;
1003
+ //! cub::DeviceScan::InclusiveScan(
1004
+ //! d_temp_storage, temp_storage_bytes,
1005
+ //! d_in, d_out, min_op, num_items);
1006
+ //!
1007
+ //! // Allocate temporary storage for inclusive prefix scan
1008
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1009
+ //!
1010
+ //! // Run inclusive prefix min-scan
1011
+ //! cub::DeviceScan::InclusiveScan(
1012
+ //! d_temp_storage, temp_storage_bytes,
1013
+ //! d_in, d_out, min_op, num_items);
1014
+ //!
1015
+ //! // d_out <-- [8, 6, 6, 5, 3, 0, 0]
1016
+ //!
1017
+ //! @endrst
1018
+ //!
1019
+ //! @tparam InputIteratorT
1020
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1021
+ //!
1022
+ //! @tparam OutputIteratorT
1023
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1024
+ //!
1025
+ //! @tparam ScanOpT
1026
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1027
+ //!
1028
+ //! @tparam NumItemsT
1029
+ //! **[inferred]** An integral type representing the number of input elements
1030
+ //!
1031
+ //! @param[in] d_temp_storage
1032
+ //! Device-accessible allocation of temporary storage.
1033
+ //! When `nullptr`, the required allocation size is written to
1034
+ //! `temp_storage_bytes` and no work is done.
1035
+ //!
1036
+ //! @param[in,out] temp_storage_bytes
1037
+ //! Reference to size in bytes of `d_temp_storage` allocation
1038
+ //!
1039
+ //! @param[in] d_in
1040
+ //! Random-access iterator to the input sequence of data items
1041
+ //!
1042
+ //! @param[out] d_out
1043
+ //! Random-access iterator to the output sequence of data items
1044
+ //!
1045
+ //! @param[in] scan_op
1046
+ //! Binary scan functor
1047
+ //!
1048
+ //! @param[in] num_items
1049
+ //! Total number of input items (i.e., the length of `d_in`)
1050
+ //!
1051
+ //! @param[in] stream
1052
+ //! @rst
1053
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1054
+ //! @endrst
1055
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
1056
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
1057
+ void* d_temp_storage,
1058
+ size_t& temp_storage_bytes,
1059
+ InputIteratorT d_in,
1060
+ OutputIteratorT d_out,
1061
+ ScanOpT scan_op,
1062
+ NumItemsT num_items,
1063
+ cudaStream_t stream = 0)
1064
+ {
1065
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan"); // NVTX range conditioned on d_temp_storage; presumably skipped for the size-query call (nullptr) - TODO confirm against the macro definition
1066
+
1067
+ // Offset type used for global indexing, selected from NumItemsT by detail::choose_offset_t
1068
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1069
+
1070
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
1071
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); // NullType() fills the init-value argument; presumably it marks "no initial value" - confirm against DispatchScan
1072
+ }
1073
+
1074
+ //! @rst
1075
+ //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor.
1076
+ //! The result of applying the ``scan_op`` binary operator to ``init_value`` value and ``*d_in``
1077
+ //! is assigned to ``*d_out``.
1078
+ //!
1079
+ //! - Supports non-commutative scan operators.
1080
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1081
+ //! addition of floating-point types). Results for pseudo-associative
1082
+ //! operators may vary from run to run. Additional details can be found in
1083
+ //! the @lookback description.
1084
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1085
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1086
+ //! shall not overlap in any other way.
1087
+ //! - @devicestorage
1088
+ //!
1089
+ //! Snippet
1090
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1091
+ //!
1092
+ //! The code snippet below illustrates the inclusive max-scan of an ``int`` device vector.
1093
+ //!
1094
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_api.cu
1095
+ //! :language: c++
1096
+ //! :dedent:
1097
+ //! :start-after: example-begin device-inclusive-scan
1098
+ //! :end-before: example-end device-inclusive-scan
1099
+ //!
1100
+ //! @endrst
1101
+ //!
1102
+ //! @tparam InputIteratorT
1103
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1104
+ //!
1105
+ //! @tparam OutputIteratorT
1106
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1107
+ //!
1108
+ //! @tparam ScanOpT
1109
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1110
+ //!
1111
+ //! @tparam InitValueT
1112
+ //! **[inferred]** Type of the `init_value`
1113
+ //!
1114
+ //! @tparam NumItemsT
1115
+ //! **[inferred]** An integral type representing the number of input elements
1116
+ //!
1117
+ //! @param[in] d_temp_storage
1118
+ //! Device-accessible allocation of temporary storage.
1119
+ //! When `nullptr`, the required allocation size is written to
1120
+ //! `temp_storage_bytes` and no work is done.
1121
+ //!
1122
+ //! @param[in,out] temp_storage_bytes
1123
+ //! Reference to the size in bytes of the `d_temp_storage` allocation
1124
+ //!
1125
+ //! @param[in] d_in
1126
+ //! Random-access iterator to the input sequence of data items
1127
+ //!
1128
+ //! @param[out] d_out
1129
+ //! Random-access iterator to the output sequence of data items
1130
+ //!
1131
+ //! @param[in] scan_op
1132
+ //! Binary scan functor
1133
+ //!
1134
+ //! @param[in] init_value
1135
+ //! Initial value to seed the inclusive scan (`scan_op(init_value, d_in[0])`
1136
+ //! is assigned to `*d_out`)
1137
+ //!
1138
+ //! @param[in] num_items
1139
+ //! Total number of input items (i.e., the length of `d_in`)
1140
+ //!
1141
+ //! @param[in] stream
1142
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream 0.
1143
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
1144
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit(
1145
+ void* d_temp_storage,
1146
+ size_t& temp_storage_bytes,
1147
+ InputIteratorT d_in,
1148
+ OutputIteratorT d_out,
1149
+ ScanOpT scan_op,
1150
+ InitValueT init_value,
1151
+ NumItemsT num_items,
1152
+ cudaStream_t stream = 0)
1153
+ {
1154
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit"); // NVTX range conditioned on d_temp_storage; presumably skipped for the size-query call (nullptr) - TODO confirm against the macro definition
1155
+
1156
+ // Offset type used for global indexing, selected from NumItemsT by detail::choose_offset_t
1157
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1158
+ using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::it_value_t<InputIteratorT>, InitValueT>; // accumulator type derived from the scan operator, the input iterator's value type and InitValueT
1159
+
1160
+ return DispatchScan<
1161
+ InputIteratorT,
1162
+ OutputIteratorT,
1163
+ ScanOpT,
1164
+ detail::InputValue<InitValueT>,
1165
+ OffsetT,
1166
+ AccumT,
1167
+ ForceInclusive::Yes>::Dispatch(d_temp_storage, // ForceInclusive::Yes: presumably keeps the scan inclusive even though an init value is supplied - confirm against DispatchScan
1168
+ temp_storage_bytes,
1169
+ d_in,
1170
+ d_out,
1171
+ scan_op,
1172
+ detail::InputValue<InitValueT>(init_value), // the plain init value is wrapped in detail::InputValue before being handed to the dispatch layer
1173
+ num_items,
1174
+ stream);
1175
+ }
1176
+
1177
+ //! @rst
1178
+ //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor.
1179
+ //!
1180
+ //! - Supports non-commutative scan operators.
1181
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1182
+ //! addition of floating-point types). Results for pseudo-associative
1183
+ //! operators may vary from run to run. Additional details can be found in
1184
+ //! the @lookback description.
1185
+ //! - @devicestorage
1186
+ //!
1187
+ //! Snippet
1188
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1189
+ //!
1190
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
1191
+ //!
1192
+ //! .. code-block:: c++
1193
+ //!
1194
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1195
+ //! #include <cuda/std/climits> // for INT_MAX
1196
+ //!
1197
+ //! // CustomMin functor
1198
+ //! struct CustomMin
1199
+ //! {
1200
+ //! template <typename T>
1201
+ //! __host__ __device__ __forceinline__
1202
+ //! T operator()(const T &a, const T &b) const {
1203
+ //! return (b < a) ? b : a;
1204
+ //! }
1205
+ //! };
1206
+ //!
1207
+ //! // Declare, allocate, and initialize device-accessible pointers for
1208
+ //! // input and output
1209
+ //! int num_items; // e.g., 7
1210
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1211
+ //! CustomMin min_op;
1212
+ //! ...
1213
+ //!
1214
+ //! // Determine temporary device storage requirements for inclusive
1215
+ //! // prefix scan
1216
+ //! void *d_temp_storage = nullptr;
1217
+ //! size_t temp_storage_bytes = 0;
1218
+ //! cub::DeviceScan::InclusiveScan(
1219
+ //! d_temp_storage, temp_storage_bytes,
1220
+ //! d_data, min_op, num_items);
1221
+ //!
1222
+ //! // Allocate temporary storage for inclusive prefix scan
1223
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1224
+ //!
1225
+ //! // Run inclusive prefix min-scan
1226
+ //! cub::DeviceScan::InclusiveScan(
1227
+ //! d_temp_storage, temp_storage_bytes,
1228
+ //! d_data, min_op, num_items);
1229
+ //!
1230
+ //! // d_data <-- [8, 6, 6, 5, 3, 0, 0]
1231
+ //!
1232
+ //! @endrst
1233
+ //!
1234
+ //! @tparam IteratorT
1235
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1236
+ //!
1237
+ //! @tparam ScanOpT
1238
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1239
+ //!
1240
+ //! @tparam NumItemsT
1241
+ //! **[inferred]** An integral type representing the number of input elements
1242
+ //!
1243
+ //! @param[in] d_temp_storage
1244
+ //! Device-accessible allocation of temporary storage.
1245
+ //! When `nullptr`, the required allocation size is written to
1246
+ //! `temp_storage_bytes` and no work is done.
1247
+ //!
1248
+ //! @param[in,out] temp_storage_bytes
1249
+ //! Reference to size in bytes of `d_temp_storage` allocation
1250
+ //!
1251
+ //! @param[in,out] d_data
1252
+ //! Random-access iterator to the sequence of data items
1253
+ //!
1254
+ //! @param[in] scan_op
1255
+ //! Binary scan functor
1256
+ //!
1257
+ //! @param[in] num_items
1258
+ //! Total number of input items (i.e., the length of `d_data`)
1259
+ //!
1260
+ //! @param[in] stream
1261
+ //! @rst
1262
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1263
+ //! @endrst
1264
+ template <typename IteratorT, typename ScanOpT, typename NumItemsT>
1265
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
1266
+ void* d_temp_storage,
1267
+ size_t& temp_storage_bytes,
1268
+ IteratorT d_data, // single iterator: read as the input and overwritten with the output
1269
+ ScanOpT scan_op,
1270
+ NumItemsT num_items,
1271
+ cudaStream_t stream = 0)
1272
+ {
1273
+ return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); // in-place: forwards to the seven-argument overload, passing d_data as both the input and the output iterator
1274
+ }
1275
+
1276
+ //! @rst
1277
+ //! Computes a device-wide exclusive prefix sum-by-key with key equality
1278
+ //! defined by ``equality_op``. The value of ``0`` is applied as the initial
1279
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1280
+ //!
1281
+ //! - Supports non-commutative sum operators.
1282
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1283
+ //! addition of floating-point types). Results for pseudo-associative
1284
+ //! operators may vary from run to run. Additional details can be found in
1285
+ //! the @lookback description.
1286
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1287
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1288
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1289
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1290
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1291
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1292
+ //! - @devicestorage
1293
+ //!
1294
+ //! Snippet
1295
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1296
+ //!
1297
+ //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector.
1298
+ //!
1299
+ //! .. code-block:: c++
1300
+ //!
1301
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1302
+ //!
1303
+ //! // Declare, allocate, and initialize device-accessible pointers for
1304
+ //! // input and output
1305
+ //! int num_items; // e.g., 7
1306
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1307
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1308
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1309
+ //! ...
1310
+ //!
1311
+ //! // Determine temporary device storage requirements
1312
+ //! void *d_temp_storage = nullptr;
1313
+ //! size_t temp_storage_bytes = 0;
1314
+ //! cub::DeviceScan::ExclusiveSumByKey(
1315
+ //! d_temp_storage, temp_storage_bytes,
1316
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1317
+ //!
1318
+ //! // Allocate temporary storage
1319
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1320
+ //!
1321
+ //! // Run exclusive prefix sum
1322
+ //! cub::DeviceScan::ExclusiveSumByKey(
1323
+ //! d_temp_storage, temp_storage_bytes,
1324
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1325
+ //!
1326
+ //! // d_values_out <-- [0, 8, 0, 7, 12, 0, 0]
1327
+ //!
1328
+ //! @endrst
1329
+ //!
1330
+ //! @tparam KeysInputIteratorT
1331
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1332
+ //!
1333
+ //! @tparam ValuesInputIteratorT
1334
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1335
+ //!
1336
+ //! @tparam ValuesOutputIteratorT
1337
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1338
+ //!
1339
+ //! @tparam EqualityOpT
1340
+ //! **[inferred]** Functor type having member
1341
+ //! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
1342
+ //!
1343
+ //! @tparam NumItemsT
1344
+ //! **[inferred]** An integral type representing the number of input elements
1345
+ //!
1346
+ //! @param[in] d_temp_storage
1347
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1348
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1349
+ //!
1350
+ //! @param[in,out] temp_storage_bytes
1351
+ //! Reference to size in bytes of `d_temp_storage` allocation
1352
+ //!
1353
+ //! @param[in] d_keys_in
1354
+ //! Random-access input iterator to the input sequence of key items
1355
+ //!
1356
+ //! @param[in] d_values_in
1357
+ //! Random-access input iterator to the input sequence of value items
1358
+ //!
1359
+ //! @param[out] d_values_out
1360
+ //! Random-access output iterator to the output sequence of value items
1361
+ //!
1362
+ //! @param[in] num_items
1363
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1364
+ //!
1365
+ //! @param[in] equality_op
1366
+ //! Binary functor that defines the equality of keys.
1367
+ //! Default is cuda::std::equal_to<>{}.
1368
+ //!
1369
+ //! @param[in] stream
1370
+ //! @rst
1371
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1372
+ //! @endrst
1373
+ template <typename KeysInputIteratorT,
1374
+ typename ValuesInputIteratorT,
1375
+ typename ValuesOutputIteratorT,
1376
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1377
+ typename NumItemsT = uint32_t>
1378
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(
1379
+ void* d_temp_storage,
1380
+ size_t& temp_storage_bytes,
1381
+ KeysInputIteratorT d_keys_in,
1382
+ ValuesInputIteratorT d_values_in,
1383
+ ValuesOutputIteratorT d_values_out,
1384
+ NumItemsT num_items,
1385
+ EqualityOpT equality_op = EqualityOpT(),
1386
+ cudaStream_t stream = 0)
1387
+ { // Exclusive sum-by-key: forwards to DispatchScanByKey with ::cuda::std::plus<> and a value-initialized seed.
1388
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey"); // NVTX range conditioned on d_temp_storage -- presumably skipped for the size-query call; confirm against the macro
1389
+
1390
+ // Unsigned integer type for global offsets, selected from NumItemsT via detail::choose_offset_t
1391
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1392
+ using InitT = cub::detail::it_value_t<ValuesInputIteratorT>; // seed type is the value type of the values input iterator
1393
+
1394
+ // Initial value: value-initialized InitT (documented above as ``0``), passed to the dispatch as the scan seed
1395
+ InitT init_value{};
1396
+
1397
+ return DispatchScanByKey<
1398
+ KeysInputIteratorT,
1399
+ ValuesInputIteratorT,
1400
+ ValuesOutputIteratorT,
1401
+ EqualityOpT,
1402
+ ::cuda::std::plus<>,
1403
+ InitT,
1404
+ OffsetT>::Dispatch(d_temp_storage,
1405
+ temp_storage_bytes,
1406
+ d_keys_in,
1407
+ d_values_in,
1408
+ d_values_out,
1409
+ equality_op,
1410
+ ::cuda::std::plus<>{}, // the scan operator is fixed to addition for the "Sum" variant
1411
+ init_value,
1412
+ num_items,
1413
+ stream);
1414
+ }
1415
+
1416
+ //! @rst
1417
+ //! Computes a device-wide exclusive prefix scan-by-key using the
1418
+ //! specified binary ``scan_op`` functor. The key equality is defined by
1419
+ //! ``equality_op``. The ``init_value`` value is applied as the initial
1420
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1421
+ //!
1422
+ //! - Supports non-commutative scan operators.
1423
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1424
+ //! addition of floating-point types). Results for pseudo-associative
1425
+ //! operators may vary from run to run. Additional details can be found in
1426
+ //! the @lookback description.
1427
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1428
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1429
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1430
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1431
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1432
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1433
+ //! - @devicestorage
1434
+ //!
1435
+ //! Snippet
1436
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1437
+ //!
1438
+ //! The code snippet below illustrates the exclusive prefix min-scan-by-key of an ``int`` device vector.
1439
+ //!
1440
+ //! .. code-block:: c++
1441
+ //!
1442
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1443
+ //! #include <cuda/std/climits> // for INT_MAX
1444
+ //!
1445
+ //! // CustomMin functor
1446
+ //! struct CustomMin
1447
+ //! {
1448
+ //! template <typename T>
1449
+ //! __host__ __device__ __forceinline__
1450
+ //! T operator()(const T &a, const T &b) const {
1451
+ //! return (b < a) ? b : a;
1452
+ //! }
1453
+ //! };
1454
+ //!
1455
+ //! // CustomEqual functor
1456
+ //! struct CustomEqual
1457
+ //! {
1458
+ //! template <typename T>
1459
+ //! __host__ __device__ __forceinline__
1460
+ //! bool operator()(const T &a, const T &b) const {
1461
+ //! return a == b;
1462
+ //! }
1463
+ //! };
1464
+ //!
1465
+ //! // Declare, allocate, and initialize device-accessible pointers for
1466
+ //! // input and output
1467
+ //! int num_items; // e.g., 7
1468
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1469
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1470
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1471
+ //! CustomMin min_op;
1472
+ //! CustomEqual equality_op;
1473
+ //! ...
1474
+ //!
1475
+ //! // Determine temporary device storage requirements for exclusive
1476
+ //! // prefix scan
1477
+ //! void *d_temp_storage = nullptr;
1478
+ //! size_t temp_storage_bytes = 0;
1479
+ //! cub::DeviceScan::ExclusiveScanByKey(
1480
+ //! d_temp_storage, temp_storage_bytes,
1481
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1482
+ //! (int) INT_MAX, num_items, equality_op);
1483
+ //!
1484
+ //! // Allocate temporary storage for exclusive prefix scan
1485
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1486
+ //!
1487
+ //! // Run exclusive prefix min-scan
1488
+ //! cub::DeviceScan::ExclusiveScanByKey(
1489
+ //! d_temp_storage, temp_storage_bytes,
1490
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1491
+ //! (int) INT_MAX, num_items, equality_op);
1492
+ //!
1493
+ //! // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0]
1494
+ //!
1495
+ //! @endrst
1496
+ //!
1497
+ //! @tparam KeysInputIteratorT
1498
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1499
+ //!
1500
+ //! @tparam ValuesInputIteratorT
1501
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1502
+ //!
1503
+ //! @tparam ValuesOutputIteratorT
1504
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1505
+ //!
1506
+ //! @tparam ScanOpT
1507
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1508
+ //!
1509
+ //! @tparam InitValueT
1510
+ //! **[inferred]** Type of `init_value`, the value that seeds the binary scan functor
1511
+ //! (a functor type having member `T operator()(const T &a, const T &b)`)
1512
+ //!
1513
+ //! @tparam EqualityOpT
1514
+ //! **[inferred]** Functor type having member
1515
+ //! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
1516
+ //!
1517
+ //! @tparam NumItemsT
1518
+ //! **[inferred]** An integral type representing the number of input elements
1519
+ //!
1520
+ //! @param[in] d_temp_storage
1521
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1522
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1523
+ //!
1524
+ //! @param[in,out] temp_storage_bytes
1525
+ //! Reference to size in bytes of `d_temp_storage` allocation
1526
+ //!
1527
+ //! @param[in] d_keys_in
1528
+ //! Random-access input iterator to the input sequence of key items
1529
+ //!
1530
+ //! @param[in] d_values_in
1531
+ //! Random-access input iterator to the input sequence of value items
1532
+ //!
1533
+ //! @param[out] d_values_out
1534
+ //! Random-access output iterator to the output sequence of value items
1535
+ //!
1536
+ //! @param[in] scan_op
1537
+ //! Binary scan functor
1538
+ //!
1539
+ //! @param[in] init_value
1540
+ //! Initial value to seed the exclusive scan (and is assigned to the
1541
+ //! beginning of each segment in `d_values_out`)
1542
+ //!
1543
+ //! @param[in] num_items
1544
+ //! Total number of input items (i.e., the length of `d_keys_in` and
1545
+ //! `d_values_in`)
1546
+ //!
1547
+ //! @param[in] equality_op
1548
+ //! Binary functor that defines the equality of keys.
1549
+ //! Default is cuda::std::equal_to<>{}.
1550
+ //!
1551
+ //! @param[in] stream
1552
+ //! @rst
1553
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1554
+ //! @endrst
1555
+ template <typename KeysInputIteratorT,
1556
+ typename ValuesInputIteratorT,
1557
+ typename ValuesOutputIteratorT,
1558
+ typename ScanOpT,
1559
+ typename InitValueT,
1560
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1561
+ typename NumItemsT = uint32_t>
1562
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(
1563
+ void* d_temp_storage,
1564
+ size_t& temp_storage_bytes,
1565
+ KeysInputIteratorT d_keys_in,
1566
+ ValuesInputIteratorT d_values_in,
1567
+ ValuesOutputIteratorT d_values_out,
1568
+ ScanOpT scan_op,
1569
+ InitValueT init_value,
1570
+ NumItemsT num_items,
1571
+ EqualityOpT equality_op = EqualityOpT(),
1572
+ cudaStream_t stream = 0)
1573
+ { // Exclusive scan-by-key: forwards the caller's scan_op and init_value unchanged to DispatchScanByKey.
1574
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey"); // NVTX range conditioned on d_temp_storage -- presumably skipped for the size-query call; confirm against the macro
1575
+
1576
+ // Unsigned integer type for global offsets, selected from NumItemsT via detail::choose_offset_t
1577
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1578
+
1579
+ return DispatchScanByKey<
1580
+ KeysInputIteratorT,
1581
+ ValuesInputIteratorT,
1582
+ ValuesOutputIteratorT,
1583
+ EqualityOpT,
1584
+ ScanOpT,
1585
+ InitValueT,
1586
+ OffsetT>::Dispatch(d_temp_storage,
1587
+ temp_storage_bytes,
1588
+ d_keys_in,
1589
+ d_values_in,
1590
+ d_values_out,
1591
+ equality_op, // note: Dispatch takes equality_op before scan_op, the reverse of this function's parameter order
1592
+ scan_op,
1593
+ init_value,
1594
+ num_items,
1595
+ stream);
1596
+ }
1597
+
1598
+ //! @rst
1599
+ //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``.
1600
+ //!
1601
+ //! - Supports non-commutative sum operators.
1602
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1603
+ //! addition of floating-point types). Results for pseudo-associative
1604
+ //! operators may vary from run to run. Additional details can be found in
1605
+ //! the @lookback description.
1606
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1607
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1608
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1609
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1610
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1611
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1612
+ //! - @devicestorage
1613
+ //!
1614
+ //! Snippet
1615
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1616
+ //!
1617
+ //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector.
1618
+ //!
1619
+ //! .. code-block:: c++
1620
+ //!
1621
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1622
+ //!
1623
+ //! // Declare, allocate, and initialize device-accessible pointers for
1624
+ //! // input and output
1625
+ //! int num_items; // e.g., 7
1626
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1627
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1628
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1629
+ //! ...
1630
+ //!
1631
+ //! // Determine temporary device storage requirements for inclusive prefix sum
1632
+ //! void *d_temp_storage = nullptr;
1633
+ //! size_t temp_storage_bytes = 0;
1634
+ //! cub::DeviceScan::InclusiveSumByKey(
1635
+ //! d_temp_storage, temp_storage_bytes,
1636
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1637
+ //!
1638
+ //! // Allocate temporary storage for inclusive prefix sum
1639
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1640
+ //!
1641
+ //! // Run inclusive prefix sum
1642
+ //! cub::DeviceScan::InclusiveSumByKey(
1643
+ //! d_temp_storage, temp_storage_bytes,
1644
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1645
+ //!
1646
+ //! // d_values_out <-- [8, 14, 7, 12, 15, 0, 9]
1647
+ //!
1648
+ //! @endrst
1649
+ //!
1650
+ //! @tparam KeysInputIteratorT
1651
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1652
+ //!
1653
+ //! @tparam ValuesInputIteratorT
1654
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1655
+ //!
1656
+ //! @tparam ValuesOutputIteratorT
1657
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1658
+ //!
1659
+ //! @tparam EqualityOpT
1660
+ //! **[inferred]** Functor type having member
1661
+ //! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
1662
+ //!
1663
+ //! @tparam NumItemsT
1664
+ //! **[inferred]** An integral type representing the number of input elements
1665
+ //!
1666
+ //! @param[in] d_temp_storage
1667
+ //! Device-accessible allocation of temporary storage.
1668
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
1669
+ //!
1670
+ //! @param[in,out] temp_storage_bytes
1671
+ //! Reference to size in bytes of `d_temp_storage` allocation
1672
+ //!
1673
+ //! @param[in] d_keys_in
1674
+ //! Random-access input iterator to the input sequence of key items
1675
+ //!
1676
+ //! @param[in] d_values_in
1677
+ //! Random-access input iterator to the input sequence of value items
1678
+ //!
1679
+ //! @param[out] d_values_out
1680
+ //! Random-access output iterator to the output sequence of value items
1681
+ //!
1682
+ //! @param[in] num_items
1683
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1684
+ //!
1685
+ //! @param[in] equality_op
1686
+ //! Binary functor that defines the equality of keys.
1687
+ //! Default is cuda::std::equal_to<>{}.
1688
+ //!
1689
+ //! @param[in] stream
1690
+ //! @rst
1691
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1692
+ //! @endrst
1693
+ template <typename KeysInputIteratorT,
1694
+ typename ValuesInputIteratorT,
1695
+ typename ValuesOutputIteratorT,
1696
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1697
+ typename NumItemsT = uint32_t>
1698
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(
1699
+ void* d_temp_storage,
1700
+ size_t& temp_storage_bytes,
1701
+ KeysInputIteratorT d_keys_in,
1702
+ ValuesInputIteratorT d_values_in,
1703
+ ValuesOutputIteratorT d_values_out,
1704
+ NumItemsT num_items,
1705
+ EqualityOpT equality_op = EqualityOpT(),
1706
+ cudaStream_t stream = 0)
1707
+ { // Inclusive sum-by-key: forwards to DispatchScanByKey with ::cuda::std::plus<> and NullType in the init-value slot.
1708
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey"); // NVTX range conditioned on d_temp_storage -- presumably skipped for the size-query call; confirm against the macro
1709
+
1710
+ // Unsigned integer type for global offsets, selected from NumItemsT via detail::choose_offset_t
1711
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1712
+
1713
+ return DispatchScanByKey<
1714
+ KeysInputIteratorT,
1715
+ ValuesInputIteratorT,
1716
+ ValuesOutputIteratorT,
1717
+ EqualityOpT,
1718
+ ::cuda::std::plus<>,
1719
+ NullType,
1720
+ OffsetT>::Dispatch(d_temp_storage,
1721
+ temp_storage_bytes,
1722
+ d_keys_in,
1723
+ d_values_in,
1724
+ d_values_out,
1725
+ equality_op,
1726
+ ::cuda::std::plus<>{}, // the scan operator is fixed to addition for the "Sum" variant
1727
+ NullType{}, // occupies the init-value slot; presumably signals "no initial value" (inclusive scan) to the dispatch -- confirm
1728
+ num_items,
1729
+ stream);
1730
+ }
1731
+
1732
+ //! @rst
1733
+ //! Computes a device-wide inclusive prefix scan-by-key using the
1734
+ //! specified binary ``scan_op`` functor. The key equality is defined by ``equality_op``.
1735
+ //!
1736
+ //! - Supports non-commutative scan operators.
1737
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1738
+ //! addition of floating-point types). Results for pseudo-associative
1739
+ //! operators may vary from run to run. Additional details can be found in
1740
+ //! the @lookback description.
1741
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1742
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1743
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1744
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1745
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1746
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1747
+ //! - @devicestorage
1748
+ //!
1749
+ //! Snippet
1750
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1751
+ //!
1752
+ //! The code snippet below illustrates the inclusive prefix min-scan-by-key of an ``int`` device vector.
1753
+ //!
1754
+ //! .. code-block:: c++
1755
+ //!
1756
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1757
+ //! #include <cuda/std/climits> // for INT_MAX
1758
+ //!
1759
+ //! // CustomMin functor
1760
+ //! struct CustomMin
1761
+ //! {
1762
+ //! template <typename T>
1763
+ //! __host__ __device__ __forceinline__
1764
+ //! T operator()(const T &a, const T &b) const {
1765
+ //! return (b < a) ? b : a;
1766
+ //! }
1767
+ //! };
1768
+ //!
1769
+ //! // CustomEqual functor
1770
+ //! struct CustomEqual
1771
+ //! {
1772
+ //! template <typename T>
1773
+ //! __host__ __device__ __forceinline__
1774
+ //! bool operator()(const T &a, const T &b) const {
1775
+ //! return a == b;
1776
+ //! }
1777
+ //! };
1778
+ //!
1779
+ //! // Declare, allocate, and initialize device-accessible pointers for
1780
+ //! // input and output
1781
+ //! int num_items; // e.g., 7
1782
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1783
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1784
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1785
+ //! CustomMin min_op;
1786
+ //! CustomEqual equality_op;
1787
+ //! ...
1788
+ //!
1789
+ //! // Determine temporary device storage requirements for inclusive prefix scan
1790
+ //! void *d_temp_storage = nullptr;
1791
+ //! size_t temp_storage_bytes = 0;
1792
+ //! cub::DeviceScan::InclusiveScanByKey(
1793
+ //! d_temp_storage, temp_storage_bytes,
1794
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
1795
+ //!
1796
+ //! // Allocate temporary storage for inclusive prefix scan
1797
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1798
+ //!
1799
+ //! // Run inclusive prefix min-scan
1800
+ //! cub::DeviceScan::InclusiveScanByKey(
1801
+ //! d_temp_storage, temp_storage_bytes,
1802
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
1803
+ //!
1804
+ //! // d_values_out <-- [8, 6, 7, 5, 3, 0, 0]
1805
+ //!
1806
+ //! @endrst
1807
+ //!
1808
+ //! @tparam KeysInputIteratorT
1809
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1810
+ //!
1811
+ //! @tparam ValuesInputIteratorT
1812
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1813
+ //!
1814
+ //! @tparam ValuesOutputIteratorT
1815
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1816
+ //!
1817
+ //! @tparam ScanOpT
1818
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1819
+ //!
1820
+ //! @tparam EqualityOpT
1821
+ //! **[inferred]** Functor type having member
1822
+ //! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
1823
+ //!
1824
+ //! @tparam NumItemsT
1825
+ //! **[inferred]** An integral type representing the number of input elements
1826
+ //!
1827
+ //! @param[in] d_temp_storage
1828
+ //! Device-accessible allocation of temporary storage.
1829
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
1830
+ //!
1831
+ //! @param[in,out] temp_storage_bytes
1832
+ //! Reference to size in bytes of `d_temp_storage` allocation
1833
+ //!
1834
+ //! @param[in] d_keys_in
1835
+ //! Random-access input iterator to the input sequence of key items
1836
+ //!
1837
+ //! @param[in] d_values_in
1838
+ //! Random-access input iterator to the input sequence of value items
1839
+ //!
1840
+ //! @param[out] d_values_out
1841
+ //! Random-access output iterator to the output sequence of value items
1842
+ //!
1843
+ //! @param[in] scan_op
1844
+ //! Binary scan functor
1845
+ //!
1846
+ //! @param[in] num_items
1847
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1848
+ //!
1849
+ //! @param[in] equality_op
1850
+ //! Binary functor that defines the equality of keys.
1851
+ //! Default is cuda::std::equal_to<>{}.
1852
+ //!
1853
+ //! @param[in] stream
1854
+ //! @rst
1855
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1856
+ //! @endrst
1857
+ template <typename KeysInputIteratorT,
1858
+ typename ValuesInputIteratorT,
1859
+ typename ValuesOutputIteratorT,
1860
+ typename ScanOpT,
1861
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1862
+ typename NumItemsT = uint32_t>
1863
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(
1864
+ void* d_temp_storage,
1865
+ size_t& temp_storage_bytes,
1866
+ KeysInputIteratorT d_keys_in,
1867
+ ValuesInputIteratorT d_values_in,
1868
+ ValuesOutputIteratorT d_values_out,
1869
+ ScanOpT scan_op,
1870
+ NumItemsT num_items,
1871
+ EqualityOpT equality_op = EqualityOpT(),
1872
+ cudaStream_t stream = 0)
1873
+ { // Inclusive scan-by-key: forwards the caller's scan_op to DispatchScanByKey with NullType in the init-value slot.
1874
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey"); // NVTX range conditioned on d_temp_storage -- presumably skipped for the size-query call; confirm against the macro
1875
+
1876
+ // Unsigned integer type for global offsets, selected from NumItemsT via detail::choose_offset_t
1877
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1878
+
1879
+ return DispatchScanByKey<
1880
+ KeysInputIteratorT,
1881
+ ValuesInputIteratorT,
1882
+ ValuesOutputIteratorT,
1883
+ EqualityOpT,
1884
+ ScanOpT,
1885
+ NullType,
1886
+ OffsetT>::Dispatch(d_temp_storage,
1887
+ temp_storage_bytes,
1888
+ d_keys_in,
1889
+ d_values_in,
1890
+ d_values_out,
1891
+ equality_op, // note: Dispatch takes equality_op before scan_op, the reverse of this function's parameter order
1892
+ scan_op,
1893
+ NullType(), // occupies the init-value slot; presumably signals "no initial value" (inclusive scan) to the dispatch -- confirm
1894
+ num_items,
1895
+ stream);
1896
+ }
1897
+
1898
+ //! @} end member group
1899
+ };
1900
+
1901
+ CUB_NAMESPACE_END