cuda-cccl 0.1.3.2.0.dev271__cp310-cp310-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1947) hide show
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +46 -0
  3. cuda/cccl/cooperative/__init__.py +3 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  5. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  6. cuda/cccl/cooperative/experimental/_common.py +273 -0
  7. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  8. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  9. cuda/cccl/cooperative/experimental/_types.py +937 -0
  10. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  11. cuda/cccl/cooperative/experimental/block/__init__.py +39 -0
  12. cuda/cccl/cooperative/experimental/block/_block_exchange.py +251 -0
  13. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  14. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  15. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  16. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  17. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  18. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +92 -0
  20. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  21. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  22. cuda/cccl/headers/__init__.py +7 -0
  23. cuda/cccl/headers/include/__init__.py +1 -0
  24. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +262 -0
  25. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  26. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  27. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
  28. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +226 -0
  29. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +730 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  32. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  33. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  34. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +632 -0
  35. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  36. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  37. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  38. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  39. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  40. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1114 -0
  41. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +341 -0
  42. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  43. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  44. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1342 -0
  45. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  46. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  47. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  48. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  49. cuda/cccl/headers/include/cub/block/block_load.cuh +1260 -0
  50. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  51. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  52. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  53. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  54. cuda/cccl/headers/include/cub/block/block_reduce.cuh +665 -0
  55. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  56. cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
  57. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  58. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  59. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  65. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  66. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  67. cuda/cccl/headers/include/cub/config.cuh +53 -0
  68. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  69. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  70. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  71. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  72. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  73. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  74. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +84 -0
  75. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  76. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  77. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  82. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  83. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  84. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  85. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  86. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  87. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  88. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  89. cuda/cccl/headers/include/cub/detail/type_traits.cuh +179 -0
  90. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  91. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  92. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  93. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  94. cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
  95. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  96. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  97. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  98. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  99. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  100. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
  101. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1898 -0
  102. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  103. cuda/cccl/headers/include/cub/device/device_scan.cuh +1899 -0
  104. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  105. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  106. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  107. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  108. cuda/cccl/headers/include/cub/device/device_transform.cuh +545 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1042 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1749 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +656 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +612 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +916 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +441 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +455 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +558 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +591 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +475 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +121 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +987 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +609 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +448 -0
  159. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  160. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +226 -0
  161. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  162. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  163. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +260 -0
  165. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  166. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  167. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  168. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +684 -0
  169. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +547 -0
  170. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  171. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  172. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +464 -0
  173. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  174. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  175. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  176. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  177. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  178. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  179. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  180. cuda/cccl/headers/include/cub/util_macro.cuh +99 -0
  181. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  182. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  183. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  184. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  185. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  186. cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
  187. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  188. cuda/cccl/headers/include/cub/version.cuh +89 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +950 -0
  194. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +713 -0
  195. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  196. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  197. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  198. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  199. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1885 -0
  200. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  201. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/copy.h +143 -0
  204. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  211. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  212. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  213. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +466 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  218. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  219. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  220. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  221. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  222. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  223. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  225. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +249 -0
  226. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  227. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  228. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  229. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  230. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  232. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__device/all_devices.h +240 -0
  235. cuda/cccl/headers/include/cuda/__device/arch_traits.h +613 -0
  236. cuda/cccl/headers/include/cuda/__device/attributes.h +721 -0
  237. cuda/cccl/headers/include/cuda/__device/device_ref.h +176 -0
  238. cuda/cccl/headers/include/cuda/__device/physical_device.h +168 -0
  239. cuda/cccl/headers/include/cuda/__driver/driver_api.h +503 -0
  240. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  241. cuda/cccl/headers/include/cuda/__event/event_ref.h +158 -0
  242. cuda/cccl/headers/include/cuda/__event/timed_event.h +118 -0
  243. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  244. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  245. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  246. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  247. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
  248. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  249. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  250. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +109 -0
  251. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  253. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  254. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  255. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +49 -0
  256. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
  257. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
  258. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  259. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +424 -0
  260. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +292 -0
  261. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
  262. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +335 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +501 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +496 -0
  265. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +452 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +94 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +539 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  285. cuda/cccl/headers/include/cuda/__memory/address_space.h +211 -0
  286. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  289. cuda/cccl/headers/include/cuda/__memory/check_address.h +106 -0
  290. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  291. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  292. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  293. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  294. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  295. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +69 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +654 -0
  299. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  300. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  301. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  302. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  303. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  304. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  412. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  413. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  414. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  415. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +97 -0
  416. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  417. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  418. cuda/cccl/headers/include/cuda/__stream/stream.h +142 -0
  419. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +296 -0
  420. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  421. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  422. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  423. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  424. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  425. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +521 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +78 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  443. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  444. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  445. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  446. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  447. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  448. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  449. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  450. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  451. cuda/cccl/headers/include/cuda/access_property +26 -0
  452. cuda/cccl/headers/include/cuda/algorithm +27 -0
  453. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  454. cuda/cccl/headers/include/cuda/atomic +27 -0
  455. cuda/cccl/headers/include/cuda/barrier +267 -0
  456. cuda/cccl/headers/include/cuda/bit +29 -0
  457. cuda/cccl/headers/include/cuda/cmath +36 -0
  458. cuda/cccl/headers/include/cuda/devices +20 -0
  459. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  460. cuda/cccl/headers/include/cuda/functional +32 -0
  461. cuda/cccl/headers/include/cuda/iterator +38 -0
  462. cuda/cccl/headers/include/cuda/latch +27 -0
  463. cuda/cccl/headers/include/cuda/mdspan +28 -0
  464. cuda/cccl/headers/include/cuda/memory +34 -0
  465. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  466. cuda/cccl/headers/include/cuda/numeric +29 -0
  467. cuda/cccl/headers/include/cuda/pipeline +578 -0
  468. cuda/cccl/headers/include/cuda/ptx +128 -0
  469. cuda/cccl/headers/include/cuda/semaphore +31 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  566. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  567. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  568. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  569. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  570. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  588. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  589. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  590. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  591. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  592. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  593. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  594. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  595. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  601. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  602. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
  603. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  604. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
  605. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +146 -0
  626. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  627. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  628. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  629. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  630. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  631. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  632. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  633. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  634. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
  635. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  636. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
  637. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  638. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  639. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  640. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  641. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  642. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  643. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  644. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  645. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  646. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  647. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
  648. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +260 -0
  649. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  650. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  656. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  657. cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
  658. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  659. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  660. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  661. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  662. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  663. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  664. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  665. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +322 -0
  666. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +321 -0
  667. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  668. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  669. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  670. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  671. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  672. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  673. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  674. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  675. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  676. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  677. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
  678. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  679. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  680. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  681. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  682. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  683. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  684. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  685. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  686. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  687. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  689. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  690. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  691. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  692. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  694. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  695. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  696. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  697. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  698. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  699. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  700. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +58 -0
  701. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  702. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  703. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  704. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +146 -0
  705. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  706. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  707. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  708. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  709. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1963 -0
  710. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  711. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  712. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +172 -0
  713. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  714. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  715. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  716. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  717. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +374 -0
  718. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  719. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +113 -0
  720. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  721. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  722. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +72 -0
  723. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  724. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  725. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  726. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  727. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  728. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  729. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  730. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  731. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  732. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  733. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  734. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  735. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  736. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  737. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  738. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  739. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  740. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  741. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  742. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  743. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  744. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  745. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  746. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  747. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  748. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  749. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  750. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  751. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  752. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  753. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  754. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  755. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  756. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  757. cuda/cccl/headers/include/cuda/std/__functional/function.h +1279 -0
  758. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  759. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  760. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  761. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  762. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  763. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
  764. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  765. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  766. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  767. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  768. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  769. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
  775. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  776. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  777. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  778. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  779. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  780. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  781. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  782. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  783. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  784. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +90 -0
  785. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  786. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  787. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  788. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  789. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  790. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  791. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  792. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  793. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  794. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  795. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  796. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +122 -0
  797. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  798. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  799. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  800. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
  801. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  802. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  803. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  804. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  805. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  807. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  808. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  809. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +150 -0
  810. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  811. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  812. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  813. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  814. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  815. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  816. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  817. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
  818. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  819. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +433 -0
  820. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
  834. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  835. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  836. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  837. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  838. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  839. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  840. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  841. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  842. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  843. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +138 -0
  844. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  846. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
  847. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  848. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  849. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  850. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +499 -0
  851. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  852. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  853. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  854. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  855. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  856. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  857. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  858. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  859. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  860. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  861. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +552 -0
  862. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  863. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  864. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  865. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  866. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  867. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  868. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
  869. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  870. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  871. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +682 -0
  872. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +767 -0
  873. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  874. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  875. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  876. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  877. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  878. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  879. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  880. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  881. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  882. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  883. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  884. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  885. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  886. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  887. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  888. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  889. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  890. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  891. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  892. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  893. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  894. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  895. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  896. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  897. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  898. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +432 -0
  899. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  900. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  901. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  902. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  903. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  904. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  905. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  906. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  907. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  908. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  909. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +314 -0
  910. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  911. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  912. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  913. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  914. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  915. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  916. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  917. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  918. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  919. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  920. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  921. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  922. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  923. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  924. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  925. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  926. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  927. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  928. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  929. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  930. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  935. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  936. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  937. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  938. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  939. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  940. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  941. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  942. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  943. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  944. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  945. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  946. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  947. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  948. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  949. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  950. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  951. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
  952. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +218 -0
  953. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  954. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  955. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  956. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  957. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  958. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  959. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +291 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  973. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  974. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  975. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  976. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  977. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  978. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  979. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  980. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  981. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  983. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  984. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1100. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1101. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1102. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1103. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1104. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  1105. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1106. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1107. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1108. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1109. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  1110. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1111. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1112. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1113. cuda/cccl/headers/include/cuda/std/__utility/pair.h +797 -0
  1114. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1115. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1116. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1117. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1118. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1119. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1120. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1121. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1122. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1123. cuda/cccl/headers/include/cuda/std/array +518 -0
  1124. cuda/cccl/headers/include/cuda/std/atomic +818 -0
  1125. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1126. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1127. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1128. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1129. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1130. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1131. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1132. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1133. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1134. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1135. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1136. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1137. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1138. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1139. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1140. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1141. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1142. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
  1143. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1722 -0
  1144. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3630 -0
  1145. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +520 -0
  1146. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1147. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1148. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1149. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2142 -0
  1150. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1151. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1152. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1153. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1154. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1155. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1156. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1157. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1158. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1159. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1160. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1161. cuda/cccl/headers/include/cuda/std/numbers +342 -0
  1162. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1163. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1164. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1165. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1166. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1167. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1168. cuda/cccl/headers/include/cuda/std/span +628 -0
  1169. cuda/cccl/headers/include/cuda/std/string_view +799 -0
  1170. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1171. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1172. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1173. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1174. cuda/cccl/headers/include/cuda/std/version +245 -0
  1175. cuda/cccl/headers/include/cuda/stream +31 -0
  1176. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1177. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1178. cuda/cccl/headers/include/cuda/utility +27 -0
  1179. cuda/cccl/headers/include/cuda/version +16 -0
  1180. cuda/cccl/headers/include/cuda/warp +28 -0
  1181. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1182. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1183. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1184. cuda/cccl/headers/include/nv/target +235 -0
  1185. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1186. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1187. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1188. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1189. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1190. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1191. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1192. cuda/cccl/headers/include/thrust/count.h +245 -0
  1193. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1194. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1195. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1196. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1197. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1198. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1199. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1200. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1201. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1202. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1203. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1204. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1205. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1206. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1207. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1208. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1209. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1210. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +79 -0
  1211. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1212. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1213. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1214. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1215. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1216. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1217. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1218. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1219. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1220. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1221. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1222. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1223. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1224. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1225. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1226. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1227. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1228. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1229. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1230. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1237. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1238. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1239. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1240. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1241. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1242. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1243. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1244. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1245. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1246. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1247. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1248. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1249. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1250. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1251. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1252. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1253. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1254. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1255. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1256. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1257. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1258. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1259. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1260. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1261. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1262. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1263. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1264. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1265. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1266. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1267. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1268. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1269. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1270. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1271. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1272. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1273. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1274. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1275. cuda/cccl/headers/include/thrust/detail/internal_functional.h +293 -0
  1276. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1277. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1278. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1279. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1280. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1281. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1282. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1283. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1284. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1285. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1286. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1287. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1288. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1289. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1290. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1291. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1292. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1293. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1294. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1295. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1296. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1297. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1298. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1299. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1300. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1301. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1302. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1303. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1304. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1305. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1306. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1307. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1308. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1309. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1310. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1311. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1312. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1313. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1314. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1315. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1316. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1317. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1318. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1320. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1321. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1322. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1324. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1325. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1330. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1331. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1332. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1333. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1334. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1335. cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
  1336. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
  1337. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1338. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1339. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1340. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1341. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1342. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1343. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1344. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1345. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1346. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1347. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1348. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1349. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1350. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1351. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1352. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1353. cuda/cccl/headers/include/thrust/find.h +382 -0
  1354. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1355. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1356. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1357. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1358. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1359. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1360. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1361. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1362. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1363. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1364. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1365. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1366. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1367. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1377. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1378. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1379. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1380. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1381. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +311 -0
  1382. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1383. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1384. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1385. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1386. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1387. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1388. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1389. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1390. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1391. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1392. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1393. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1394. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1395. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1396. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1397. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1398. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1399. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1400. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1401. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1402. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1403. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1404. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1405. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1406. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1407. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1408. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1409. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1410. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1411. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1412. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1413. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1414. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1415. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1416. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1417. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1418. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1419. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1420. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1421. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1430. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1431. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1432. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1433. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1434. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1435. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1436. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1437. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1438. cuda/cccl/headers/include/thrust/random.h +120 -0
  1439. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1440. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1441. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1442. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1443. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1444. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1445. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1446. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1447. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1448. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1449. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1450. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1451. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1452. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1453. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1454. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1455. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +158 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1501. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1502. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1503. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1504. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1505. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +609 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +92 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +782 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1738 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +415 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +92 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +73 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1755. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +157 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1822. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +157 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +54 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/.gitignore +4 -0
  1912. cuda/cccl/parallel/experimental/__init__.py +75 -0
  1913. cuda/cccl/parallel/experimental/_bindings.py +56 -0
  1914. cuda/cccl/parallel/experimental/_bindings.pyi +405 -0
  1915. cuda/cccl/parallel/experimental/_bindings_impl.pyx +1957 -0
  1916. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1917. cuda/cccl/parallel/experimental/_cccl_interop.py +396 -0
  1918. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1919. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1920. cuda/cccl/parallel/experimental/_utils/temp_storage_buffer.py +86 -0
  1921. cuda/cccl/parallel/experimental/algorithms/__init__.py +50 -0
  1922. cuda/cccl/parallel/experimental/algorithms/_histogram.py +243 -0
  1923. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +225 -0
  1924. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +312 -0
  1925. cuda/cccl/parallel/experimental/algorithms/_reduce.py +184 -0
  1926. cuda/cccl/parallel/experimental/algorithms/_scan.py +261 -0
  1927. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +257 -0
  1928. cuda/cccl/parallel/experimental/algorithms/_transform.py +308 -0
  1929. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +252 -0
  1930. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1931. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-x86_64-linux-gnu.so +0 -0
  1932. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  1933. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-x86_64-linux-gnu.so +0 -0
  1934. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  1935. cuda/cccl/parallel/experimental/iterators/__init__.py +21 -0
  1936. cuda/cccl/parallel/experimental/iterators/_factories.py +214 -0
  1937. cuda/cccl/parallel/experimental/iterators/_iterators.py +627 -0
  1938. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +207 -0
  1939. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1940. cuda/cccl/parallel/experimental/op.py +3 -0
  1941. cuda/cccl/parallel/experimental/struct.py +272 -0
  1942. cuda/cccl/parallel/experimental/typing.py +35 -0
  1943. cuda/cccl/py.typed +0 -0
  1944. cuda_cccl-0.1.3.2.0.dev271.dist-info/METADATA +40 -0
  1945. cuda_cccl-0.1.3.2.0.dev271.dist-info/RECORD +1947 -0
  1946. cuda_cccl-0.1.3.2.0.dev271.dist-info/WHEEL +5 -0
  1947. cuda_cccl-0.1.3.2.0.dev271.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,1899 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data
31
+ //! items residing within device-accessible memory.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/detail/choose_offset.cuh>
46
+ #include <cub/device/dispatch/dispatch_scan.cuh>
47
+ #include <cub/device/dispatch/dispatch_scan_by_key.cuh>
48
+ #include <cub/thread/thread_operators.cuh>
49
+
50
+ #include <cuda/std/__functional/invoke.h>
51
+
52
+ CUB_NAMESPACE_BEGIN
53
+
54
+ //! @rst
55
+ //! DeviceScan provides device-wide, parallel operations for computing a
56
+ //! prefix scan across a sequence of data items residing within
57
+ //! device-accessible memory.
58
+ //!
59
+ //! Overview
60
+ //! +++++++++++++++++++++++++++++++++++++++++++++
61
+ //!
62
+ //! Given a sequence of input elements and a binary reduction operator, a
63
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output
64
+ //! sequence where each element is computed to be the reduction of the elements
65
+ //! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan
66
+ //! with the addition operator. The term *inclusive* indicates that the
67
+ //! *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input.
68
+ //! The term *exclusive* indicates the *i*\ :sup:`th` input is not
69
+ //! incorporated into the *i*\ :sup:`th` output reduction. When the input and
70
+ //! output sequences are the same, the scan is performed in-place.
71
+ //!
72
+ //! In order to provide an efficient parallel implementation, the binary reduction operator must be associative. That
73
+ //! is, ``op(op(a, b), c)`` must be equivalent to ``op(a, op(b, c))`` for any input values ``a``, ``b``, and ``c``.
74
+ //!
75
+ //! As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our
76
+ //! *"decoupled look-back"* algorithm for performing global prefix scan with
77
+ //! only a single pass through the input data, as described in our 2016 technical
78
+ //! report [1]_. The central idea is to leverage a small, constant factor of
79
+ //! redundant work in order to overlap the latencies of global prefix
80
+ //! propagation with local computation. As such, our algorithm requires only
81
+ //! ``~2n`` data movement (``n`` inputs are read, ``n`` outputs are written), and
82
+ //! typically proceeds at "memcpy" speeds. Our algorithm supports inplace operations.
83
+ //!
84
+ //! .. [1] Duane Merrill and Michael Garland. `Single-pass Parallel Prefix Scan with Decoupled Look-back
85
+ //! <https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back>`_,
86
+ //! *NVIDIA Technical Report NVR-2016-002*, 2016.
87
+ //!
88
+ //! Usage Considerations
89
+ //! +++++++++++++++++++++++++++++++++++++++++++++
90
+ //!
91
+ //! @cdp_class{DeviceScan}
92
+ //!
93
+ //! Performance
94
+ //! +++++++++++++++++++++++++++++++++++++++++++++
95
+ //!
96
+ //! @linear_performance{prefix scan}
97
+ //!
98
+ //! @endrst
99
+ struct DeviceScan
100
+ {
101
+ //! @name Exclusive scans
102
+ //! @{
103
+
104
+ //! @rst
105
+ //! Computes a device-wide exclusive prefix sum.
106
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
107
+ //!
108
+ //! - Supports non-commutative sum operators.
109
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
110
+ //! addition of floating-point types). Results for pseudo-associative
111
+ //! operators may vary from run to run. Additional details can be found in
112
+ //! the @lookback description.
113
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
114
+ //! The ranges ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
115
+ //! shall not overlap in any other way.
116
+ //! - @devicestorage
117
+ //!
118
+ //! Snippet
119
+ //! +++++++++++++++++++++++++++++++++++++++++++++
120
+ //!
121
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
122
+ //! device vector.
123
+ //!
124
+ //! .. code-block:: c++
125
+ //!
126
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
127
+ //!
128
+ //! // Declare, allocate, and initialize device-accessible pointers for
129
+ //! // input and output
130
+ //! int num_items; // e.g., 7
131
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
132
+ //! int *d_out; // e.g., [ , , , , , , ]
133
+ //! ...
134
+ //!
135
+ //! // Determine temporary device storage requirements
136
+ //! void *d_temp_storage = nullptr;
137
+ //! size_t temp_storage_bytes = 0;
138
+ //! cub::DeviceScan::ExclusiveSum(
139
+ //! d_temp_storage, temp_storage_bytes,
140
+ //! d_in, d_out, num_items);
141
+ //!
142
+ //! // Allocate temporary storage
143
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
144
+ //!
145
+ //! // Run exclusive prefix sum
146
+ //! cub::DeviceScan::ExclusiveSum(
147
+ //! d_temp_storage, temp_storage_bytes,
148
+ //! d_in, d_out, num_items);
149
+ //!
150
+ //! // d_out <-- [0, 8, 14, 21, 26, 29, 29]
151
+ //!
152
+ //! @endrst
153
+ //!
154
+ //! @tparam InputIteratorT
155
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
156
+ //!
157
+ //! @tparam OutputIteratorT
158
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
159
+ //!
160
+ //! @tparam NumItemsT
161
+ //! **[inferred]** An integral type representing the number of input elements
162
+ //!
163
+ //! @param[in] d_temp_storage
164
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
165
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
166
+ //!
167
+ //! @param[in,out] temp_storage_bytes
168
+ //! Reference to size in bytes of `d_temp_storage` allocation
169
+ //!
170
+ //! @param[in] d_in
171
+ //! Random-access iterator to the input sequence of data items
172
+ //!
173
+ //! @param[out] d_out
174
+ //! Random-access iterator to the output sequence of data items
175
+ //!
176
+ //! @param[in] num_items
177
+ //! Total number of input items (i.e., the length of `d_in`)
178
+ //!
179
+ //! @param[in] stream
180
+ //! @rst
181
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
182
+ //! @endrst
183
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
184
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
185
+ void* d_temp_storage,
186
+ size_t& temp_storage_bytes,
187
+ InputIteratorT d_in,
188
+ OutputIteratorT d_out,
189
+ NumItemsT num_items,
190
+ cudaStream_t stream = 0)
191
+ {
192
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); // NVTX range is conditional on d_temp_storage; presumably only the real run (non-null storage) is annotated, not the size query - confirm against the macro
193
+
194
+ // Unsigned integer type for global offsets, selected from NumItemsT via detail::choose_offset_t
195
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
196
+ using InitT = cub::detail::it_value_t<InputIteratorT>; // presumably the value type of the input iterator - confirm against it_value_t
197
+
198
+ // Initial value: value-initialized InitT, i.e. the ``0`` seed that the documentation says is assigned to *d_out
199
+ InitT init_value{};
200
+
201
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, detail::InputValue<InitT>, OffsetT>:: // the prefix sum is dispatched as a scan with ::cuda::std::plus<> as the operator
202
+ Dispatch(d_temp_storage,
203
+ temp_storage_bytes,
204
+ d_in,
205
+ d_out,
206
+ ::cuda::std::plus<>{},
207
+ detail::InputValue<InitT>(init_value), // seed is handed over wrapped in detail::InputValue
208
+ num_items,
209
+ stream); // the cudaError_t produced by Dispatch is returned to the caller unchanged
210
+ }
211
+
212
+ //! @rst
213
+ //! Computes a device-wide exclusive prefix sum in-place.
214
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``.
215
+ //!
216
+ //! - Supports non-commutative sum operators.
217
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
218
+ //! addition of floating-point types). Results for pseudo-associative
219
+ //! operators may vary from run to run. Additional details can be found in
220
+ //! the @lookback description.
221
+ //! - @devicestorage
222
+ //!
223
+ //! Snippet
224
+ //! +++++++++++++++++++++++++++++++++++++++++++++
225
+ //!
226
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
227
+ //! device vector.
228
+ //!
229
+ //! .. code-block:: c++
230
+ //!
231
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
232
+ //!
233
+ //! // Declare, allocate, and initialize the device-accessible pointer for
234
+ //! // the data that is scanned in-place
235
+ //! int num_items; // e.g., 7
236
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
237
+ //! ...
238
+ //!
239
+ //! // Determine temporary device storage requirements
240
+ //! void *d_temp_storage = nullptr;
241
+ //! size_t temp_storage_bytes = 0;
242
+ //! cub::DeviceScan::ExclusiveSum(
243
+ //! d_temp_storage, temp_storage_bytes,
244
+ //! d_data, num_items);
245
+ //!
246
+ //! // Allocate temporary storage
247
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
248
+ //!
249
+ //! // Run exclusive prefix sum
250
+ //! cub::DeviceScan::ExclusiveSum(
251
+ //! d_temp_storage, temp_storage_bytes,
252
+ //! d_data, num_items);
253
+ //!
254
+ //! // d_data <-- [0, 8, 14, 21, 26, 29, 29]
255
+ //!
256
+ //! @endrst
257
+ //!
258
+ //! @tparam IteratorT
259
+ //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
260
+ //!
261
+ //! @tparam NumItemsT
262
+ //! **[inferred]** An integral type representing the number of input elements
263
+ //!
264
+ //! @param[in] d_temp_storage
265
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
266
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
267
+ //!
268
+ //! @param[in,out] temp_storage_bytes
269
+ //! Reference to size in bytes of `d_temp_storage` allocation
270
+ //!
271
+ //! @param[in,out] d_data
272
+ //! Random-access iterator to the sequence of data items
273
+ //!
274
+ //! @param[in] num_items
275
+ //! Total number of input items (i.e., the length of `d_data`)
276
+ //!
277
+ //! @param[in] stream
278
+ //! @rst
279
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
280
+ //! @endrst
281
+ template <typename IteratorT, typename NumItemsT>
282
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
283
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
284
+ {
285
+ return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
286
+ }
287
+
288
+ //! @rst
289
+ //! Computes a device-wide exclusive prefix scan using the specified
290
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
291
+ //! the initial value, and is assigned to ``*d_out``.
292
+ //!
293
+ //! - Supports non-commutative scan operators.
294
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
295
+ //! addition of floating-point types). Results for pseudo-associative
296
+ //! operators may vary from run to run. Additional details can be found in
297
+ //! the @lookback description.
298
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
299
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
300
+ //! shall not overlap in any other way.
301
+ //! - @devicestorage
302
+ //!
303
+ //! Snippet
304
+ //! +++++++++++++++++++++++++++++++++++++++++++++
305
+ //!
306
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
307
+ //!
308
+ //! .. code-block:: c++
309
+ //!
310
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
311
+ //! #include <cuda/std/climits> // for INT_MAX
312
+ //!
313
+ //! // CustomMin functor
314
+ //! struct CustomMin
315
+ //! {
316
+ //! template <typename T>
317
+ //! __host__ __device__ __forceinline__
318
+ //! T operator()(const T &a, const T &b) const {
319
+ //! return (b < a) ? b : a;
320
+ //! }
321
+ //! };
322
+ //!
323
+ //! // Declare, allocate, and initialize device-accessible pointers for
324
+ //! // input and output
325
+ //! int num_items; // e.g., 7
326
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
327
+ //! int *d_out; // e.g., [ , , , , , , ]
328
+ //! CustomMin min_op;
329
+ //! ...
330
+ //!
331
+ //! // Determine temporary device storage requirements for exclusive
332
+ //! // prefix scan
333
+ //! void *d_temp_storage = nullptr;
334
+ //! size_t temp_storage_bytes = 0;
335
+ //! cub::DeviceScan::ExclusiveScan(
336
+ //! d_temp_storage, temp_storage_bytes,
337
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
338
+ //!
339
+ //! // Allocate temporary storage for exclusive prefix scan
340
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
341
+ //!
342
+ //! // Run exclusive prefix min-scan
343
+ //! cub::DeviceScan::ExclusiveScan(
344
+ //! d_temp_storage, temp_storage_bytes,
345
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
346
+ //!
347
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
348
+ //!
349
+ //! @endrst
350
+ //!
351
+ //! @tparam InputIteratorT
352
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
353
+ //!
354
+ //! @tparam OutputIteratorT
355
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
356
+ //!
357
+ //! @tparam ScanOpT
358
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
359
+ //!
360
+ //! @tparam InitValueT
361
+ //! **[inferred]** Type of the `init_value`
362
+ //!
363
+ //! @tparam NumItemsT
364
+ //! **[inferred]** An integral type representing the number of input elements
365
+ //!
366
+ //! @param[in] d_temp_storage
367
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
368
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
369
+ //!
370
+ //! @param[in,out] temp_storage_bytes
371
+ //! Reference to size in bytes of `d_temp_storage` allocation
372
+ //!
373
+ //! @param[in] d_in
374
+ //! Random-access iterator to the input sequence of data items
375
+ //!
376
+ //! @param[out] d_out
377
+ //! Random-access iterator to the output sequence of data items
378
+ //!
379
+ //! @param[in] scan_op
380
+ //! Binary associative scan functor
381
+ //!
382
+ //! @param[in] init_value
383
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
384
+ //!
385
+ //! @param[in] num_items
386
+ //! Total number of input items (i.e., the length of `d_in`)
387
+ //!
388
+ //! @param[in] stream
389
+ //! @rst
390
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
391
+ //! @endrst
392
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
393
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
394
+ void* d_temp_storage,
395
+ size_t& temp_storage_bytes,
396
+ InputIteratorT d_in,
397
+ OutputIteratorT d_out,
398
+ ScanOpT scan_op,
399
+ InitValueT init_value,
400
+ NumItemsT num_items,
401
+ cudaStream_t stream = 0)
402
+ {
403
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");
404
+
405
+ // Unsigned integer type for global offsets
406
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
407
+
408
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
409
+ d_temp_storage,
410
+ temp_storage_bytes,
411
+ d_in,
412
+ d_out,
413
+ scan_op,
414
+ detail::InputValue<InitValueT>(init_value),
415
+ num_items,
416
+ stream);
417
+ }
418
+
419
+ //! @rst
420
+ //! Computes a device-wide exclusive prefix scan using the specified
421
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
422
+ //! the initial value, and is assigned to ``*d_data``.
423
+ //!
424
+ //! - Supports non-commutative scan operators.
425
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
426
+ //! addition of floating-point types). Results for pseudo-associative
427
+ //! operators may vary from run to run. Additional details can be found in
428
+ //! the @lookback description.
429
+ //! - @devicestorage
430
+ //!
431
+ //! Snippet
432
+ //! +++++++++++++++++++++++++++++++++++++++++++++
433
+ //!
434
+ //! The code snippet below illustrates the exclusive prefix min-scan of an
435
+ //! ``int`` device vector:
436
+ //!
437
+ //! .. code-block:: c++
438
+ //!
439
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
440
+ //! #include <cuda/std/climits> // for INT_MAX
441
+ //!
442
+ //! // CustomMin functor
443
+ //! struct CustomMin
444
+ //! {
445
+ //! template <typename T>
446
+ //! __host__ __device__ __forceinline__
447
+ //! T operator()(const T &a, const T &b) const {
448
+ //! return (b < a) ? b : a;
449
+ //! }
450
+ //! };
451
+ //!
452
+ //! // Declare, allocate, and initialize device-accessible pointers for
453
+ //! // input and output
454
+ //! int num_items; // e.g., 7
455
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
456
+ //! CustomMin min_op;
457
+ //! ...
458
+ //!
459
+ //! // Determine temporary device storage requirements for exclusive
460
+ //! // prefix scan
461
+ //! void *d_temp_storage = nullptr;
462
+ //! size_t temp_storage_bytes = 0;
463
+ //! cub::DeviceScan::ExclusiveScan(
464
+ //! d_temp_storage, temp_storage_bytes,
465
+ //! d_data, min_op, (int) INT_MAX, num_items);
466
+ //!
467
+ //! // Allocate temporary storage for exclusive prefix scan
468
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
469
+ //!
470
+ //! // Run exclusive prefix min-scan
471
+ //! cub::DeviceScan::ExclusiveScan(
472
+ //! d_temp_storage, temp_storage_bytes,
473
+ //! d_data, min_op, (int) INT_MAX, num_items);
474
+ //!
475
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
476
+ //!
477
+ //! @endrst
478
+ //!
479
+ //! @tparam IteratorT
480
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
481
+ //!
482
+ //! @tparam ScanOpT
483
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
484
+ //!
485
+ //! @tparam InitValueT
486
+ //! **[inferred]** Type of the `init_value`
487
+ //!
488
+ //! @tparam NumItemsT
489
+ //! **[inferred]** An integral type representing the number of input elements
490
+ //!
491
+ //! @param[in] d_temp_storage
492
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
493
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
494
+ //!
495
+ //! @param[in,out] temp_storage_bytes
496
+ //! Reference to size in bytes of `d_temp_storage` allocation
497
+ //!
498
+ //! @param[in,out] d_data
499
+ //! Random-access iterator to the sequence of data items
500
+ //!
501
+ //! @param[in] scan_op
502
+ //! Binary associative scan functor
503
+ //!
504
+ //! @param[in] init_value
505
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
506
+ //!
507
+ //! @param[in] num_items
508
+ //! Total number of input items (i.e., the length of `d_data`)
509
+ //!
510
+ //! @param[in] stream
511
+ //! @rst
512
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
513
+ //! @endrst
514
+ template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
515
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
516
+ void* d_temp_storage,
517
+ size_t& temp_storage_bytes,
518
+ IteratorT d_data,
519
+ ScanOpT scan_op,
520
+ InitValueT init_value,
521
+ NumItemsT num_items,
522
+ cudaStream_t stream = 0)
523
+ {
524
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
525
+ }
526
+
527
+ //! @rst
528
+ //! Computes a device-wide exclusive prefix scan using the specified
529
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is provided as a future value.
530
+ //!
531
+ //! - Supports non-commutative scan operators.
532
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
533
+ //! addition of floating-point types). Results for pseudo-associative
534
+ //! operators may vary from run to run. Additional details can be found in
535
+ //! the @lookback description.
536
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
537
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
538
+ //! shall not overlap in any other way.
539
+ //! - @devicestorage
540
+ //!
541
+ //! Snippet
542
+ //! +++++++++++++++++++++++++++++++++++++++++++++
543
+ //!
544
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
545
+ //!
546
+ //! .. code-block:: c++
547
+ //!
548
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
549
+ //! #include <cuda/std/climits> // for INT_MAX
550
+ //!
551
+ //! // CustomMin functor
552
+ //! struct CustomMin
553
+ //! {
554
+ //! template <typename T>
555
+ //! __host__ __device__ __forceinline__
556
+ //! T operator()(const T &a, const T &b) const {
557
+ //! return (b < a) ? b : a;
558
+ //! }
559
+ //! };
560
+ //!
561
+ //! // Declare, allocate, and initialize device-accessible pointers for
562
+ //! // input and output
563
+ //! int num_items; // e.g., 7
564
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
565
+ //! int *d_out; // e.g., [ , , , , , , ]
566
+ //! int *d_init_iter; // e.g., INT_MAX
567
+ //! CustomMin min_op;
568
+ //!
569
+ //! auto future_init_value =
570
+ //! cub::FutureValue<int, int*>(d_init_iter);
571
+ //!
572
+ //! ...
573
+ //!
574
+ //! // Determine temporary device storage requirements for exclusive
575
+ //! // prefix scan
576
+ //! void *d_temp_storage = nullptr;
577
+ //! size_t temp_storage_bytes = 0;
578
+ //! cub::DeviceScan::ExclusiveScan(
579
+ //! d_temp_storage, temp_storage_bytes,
580
+ //! d_in, d_out, min_op, future_init_value, num_items);
581
+ //!
582
+ //! // Allocate temporary storage for exclusive prefix scan
583
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
584
+ //!
585
+ //! // Run exclusive prefix min-scan
586
+ //! cub::DeviceScan::ExclusiveScan(
587
+ //! d_temp_storage, temp_storage_bytes,
588
+ //! d_in, d_out, min_op, future_init_value, num_items);
589
+ //!
590
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
591
+ //!
592
+ //! @endrst
593
+ //!
594
+ //! @tparam InputIteratorT
595
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
596
+ //!
597
+ //! @tparam OutputIteratorT
598
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
599
+ //!
600
+ //! @tparam ScanOpT
601
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
602
+ //!
603
+ //! @tparam InitValueT
604
+ //! **[inferred]** Type of the `init_value`
605
+ //!
606
+ //! @tparam NumItemsT
607
+ //! **[inferred]** An integral type representing the number of input elements
608
+ //!
609
+ //! @param[in] d_temp_storage
610
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
611
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
612
+ //!
613
+ //! @param[in,out] temp_storage_bytes
614
+ //! Reference to size in bytes of `d_temp_storage` allocation
615
+ //!
616
+ //! @param[in] d_in
617
+ //! Random-access iterator to the input sequence of data items
618
+ //!
619
+ //! @param[out] d_out
620
+ //! Random-access iterator to the output sequence of data items
621
+ //!
622
+ //! @param[in] scan_op
623
+ //! Binary associative scan functor
624
+ //!
625
+ //! @param[in] init_value
626
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
627
+ //!
628
+ //! @param[in] num_items
629
+ //! Total number of input items (i.e., the length of `d_in`)
630
+ //!
631
+ //! @param[in] stream
632
+ //! @rst
633
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
634
+ //! @endrst
635
+ template <typename InputIteratorT,
636
+ typename OutputIteratorT,
637
+ typename ScanOpT,
638
+ typename InitValueT,
639
+ typename InitValueIterT = InitValueT*,
640
+ typename NumItemsT = int>
641
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
642
+ void* d_temp_storage,
643
+ size_t& temp_storage_bytes,
644
+ InputIteratorT d_in,
645
+ OutputIteratorT d_out,
646
+ ScanOpT scan_op,
647
+ FutureValue<InitValueT, InitValueIterT> init_value,
648
+ NumItemsT num_items,
649
+ cudaStream_t stream = 0)
650
+ {
651
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");
652
+
653
+ // Unsigned integer type for global offsets
654
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
655
+
656
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
657
+ d_temp_storage,
658
+ temp_storage_bytes,
659
+ d_in,
660
+ d_out,
661
+ scan_op,
662
+ detail::InputValue<InitValueT>(init_value),
663
+ num_items,
664
+ stream);
665
+ }
666
+
667
+ //! @rst
668
+ //! Computes a device-wide exclusive prefix scan using the specified binary associative ``scan_op`` functor.
669
+ //! The ``init_value`` value is provided as a future value.
670
+ //!
671
+ //! - Supports non-commutative scan operators.
672
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
673
+ //! addition of floating-point types). Results for pseudo-associative
674
+ //! operators may vary from run to run. Additional details can be found in
675
+ //! the @lookback description.
676
+ //! - @devicestorage
677
+ //!
678
+ //! Snippet
679
+ //! +++++++++++++++++++++++++++++++++++++++++++++
680
+ //!
681
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
682
+ //!
683
+ //! .. code-block:: c++
684
+ //!
685
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
686
+ //! #include <cuda/std/climits> // for INT_MAX
687
+ //!
688
+ //! // CustomMin functor
689
+ //! struct CustomMin
690
+ //! {
691
+ //! template <typename T>
692
+ //! __host__ __device__ __forceinline__
693
+ //! T operator()(const T &a, const T &b) const {
694
+ //! return (b < a) ? b : a;
695
+ //! }
696
+ //! };
697
+ //!
698
+ //! // Declare, allocate, and initialize device-accessible pointers for
699
+ //! // input and output
700
+ //! int num_items; // e.g., 7
701
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
702
+ //! int *d_init_iter; // e.g., INT_MAX
703
+ //! CustomMin min_op;
704
+ //!
705
+ //! auto future_init_value =
706
+ //! cub::FutureValue<int, int*>(d_init_iter);
707
+ //!
708
+ //! ...
709
+ //!
710
+ //! // Determine temporary device storage requirements for exclusive
711
+ //! // prefix scan
712
+ //! void *d_temp_storage = nullptr;
713
+ //! size_t temp_storage_bytes = 0;
714
+ //! cub::DeviceScan::ExclusiveScan(
715
+ //! d_temp_storage, temp_storage_bytes,
716
+ //! d_data, min_op, future_init_value, num_items);
717
+ //!
718
+ //! // Allocate temporary storage for exclusive prefix scan
719
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
720
+ //!
721
+ //! // Run exclusive prefix min-scan
722
+ //! cub::DeviceScan::ExclusiveScan(
723
+ //! d_temp_storage, temp_storage_bytes,
724
+ //! d_data, min_op, future_init_value, num_items);
725
+ //!
726
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
727
+ //!
728
+ //! @endrst
729
+ //!
730
+ //! @tparam IteratorT
731
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
732
+ //!
733
+ //! @tparam ScanOpT
734
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
735
+ //!
736
+ //! @tparam InitValueT
737
+ //! **[inferred]** Type of the `init_value`
738
+ //!
739
+ //! @tparam NumItemsT
740
+ //! **[inferred]** An integral type representing the number of input elements
741
+ //!
742
+ //! @param[in] d_temp_storage
743
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
744
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
745
+ //!
746
+ //! @param[in,out] temp_storage_bytes
747
+ //! Reference to size in bytes of `d_temp_storage` allocation
748
+ //!
749
+ //! @param[in,out] d_data
750
+ //! Random-access iterator to the sequence of data items
751
+ //!
752
+ //! @param[in] scan_op
753
+ //! Binary associative scan functor
754
+ //!
755
+ //! @param[in] init_value
756
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
757
+ //!
758
+ //! @param[in] num_items
759
+ //! Total number of input items (i.e., the length of `d_data`)
760
+ //!
761
+ //! @param[in] stream
762
+ //! @rst
763
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
764
+ //! @endrst
765
+ template <typename IteratorT,
766
+ typename ScanOpT,
767
+ typename InitValueT,
768
+ typename InitValueIterT = InitValueT*,
769
+ typename NumItemsT = int>
770
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
771
+ void* d_temp_storage,
772
+ size_t& temp_storage_bytes,
773
+ IteratorT d_data,
774
+ ScanOpT scan_op,
775
+ FutureValue<InitValueT, InitValueIterT> init_value,
776
+ NumItemsT num_items,
777
+ cudaStream_t stream = 0)
778
+ {
779
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
780
+ }
781
+
782
+ //! @} end member group
783
+ //! @name Inclusive scans
784
+ //! @{
785
+
786
+ //! @rst
787
+ //! Computes a device-wide inclusive prefix sum.
788
+ //!
789
+ //! - Supports non-commutative sum operators.
790
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
791
+ //! addition of floating-point types). Results for pseudo-associative
792
+ //! operators may vary from run to run. Additional details can be found in
793
+ //! the @lookback description.
794
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
795
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
796
+ //! shall not overlap in any other way.
797
+ //! - @devicestorage
798
+ //!
799
+ //! Snippet
800
+ //! +++++++++++++++++++++++++++++++++++++++++++++
801
+ //!
802
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
803
+ //!
804
+ //! .. code-block:: c++
805
+ //!
806
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
807
+ //!
808
+ //! // Declare, allocate, and initialize device-accessible pointers for
809
+ //! // input and output
810
+ //! int num_items; // e.g., 7
811
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
812
+ //! int *d_out; // e.g., [ , , , , , , ]
813
+ //! ...
814
+ //!
815
+ //! // Determine temporary device storage requirements for inclusive
816
+ //! // prefix sum
817
+ //! void *d_temp_storage = nullptr;
818
+ //! size_t temp_storage_bytes = 0;
819
+ //! cub::DeviceScan::InclusiveSum(
820
+ //! d_temp_storage, temp_storage_bytes,
821
+ //! d_in, d_out, num_items);
822
+ //!
823
+ //! // Allocate temporary storage for inclusive prefix sum
824
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
825
+ //!
826
+ //! // Run inclusive prefix sum
827
+ //! cub::DeviceScan::InclusiveSum(
828
+ //! d_temp_storage, temp_storage_bytes,
829
+ //! d_in, d_out, num_items);
830
+ //!
831
+ //! // d_out <-- [8, 14, 21, 26, 29, 29, 38]
832
+ //!
833
+ //! @endrst
834
+ //!
835
+ //! @tparam InputIteratorT
836
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
837
+ //!
838
+ //! @tparam OutputIteratorT
839
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
840
+ //!
841
+ //! @tparam NumItemsT
842
+ //! **[inferred]** An integral type representing the number of input elements
843
+ //!
844
+ //! @param[in] d_temp_storage
845
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
846
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
847
+ //!
848
+ //! @param[in,out] temp_storage_bytes
849
+ //! Reference to size in bytes of `d_temp_storage` allocation
850
+ //!
851
+ //! @param[in] d_in
852
+ //! Random-access iterator to the input sequence of data items
853
+ //!
854
+ //! @param[out] d_out
855
+ //! Random-access iterator to the output sequence of data items
856
+ //!
857
+ //! @param[in] num_items
858
+ //! Total number of input items (i.e., the length of `d_in`)
859
+ //!
860
+ //! @param[in] stream
861
+ //! @rst
862
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
863
+ //! @endrst
864
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
865
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
866
+ void* d_temp_storage,
867
+ size_t& temp_storage_bytes,
868
+ InputIteratorT d_in,
869
+ OutputIteratorT d_out,
870
+ NumItemsT num_items,
871
+ cudaStream_t stream = 0)
872
+ {
873
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum");
874
+
875
+ // Unsigned integer type for global offsets
876
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
877
+
878
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, NullType, OffsetT>::Dispatch(
879
+ d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream);
880
+ }
881
+
882
+ //! @rst
883
+ //! Computes a device-wide inclusive prefix sum in-place.
884
+ //!
885
+ //! - Supports non-commutative sum operators.
886
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
887
+ //! addition of floating-point types). Results for pseudo-associative
888
+ //! operators may vary from run to run. Additional details can be found in
889
+ //! the @lookback description.
890
+ //! - @devicestorage
891
+ //!
892
+ //! Snippet
893
+ //! +++++++++++++++++++++++++++++++++++++++++++++
894
+ //!
895
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
896
+ //!
897
+ //! .. code-block:: c++
898
+ //!
899
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
900
+ //!
901
+ //! // Declare, allocate, and initialize device-accessible pointers for
902
+ //! // input and output
903
+ //! int num_items; // e.g., 7
904
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
905
+ //! ...
906
+ //!
907
+ //! // Determine temporary device storage requirements for inclusive
908
+ //! // prefix sum
909
+ //! void *d_temp_storage = nullptr;
910
+ //! size_t temp_storage_bytes = 0;
911
+ //! cub::DeviceScan::InclusiveSum(
912
+ //! d_temp_storage, temp_storage_bytes,
913
+ //! d_data, num_items);
914
+ //!
915
+ //! // Allocate temporary storage for inclusive prefix sum
916
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
917
+ //!
918
+ //! // Run inclusive prefix sum
919
+ //! cub::DeviceScan::InclusiveSum(
920
+ //! d_temp_storage, temp_storage_bytes,
921
+ //! d_data, num_items);
922
+ //!
923
+ //! // d_data <-- [8, 14, 21, 26, 29, 29, 38]
924
+ //!
925
+ //! @endrst
926
+ //!
927
+ //! @tparam IteratorT
928
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
929
+ //!
930
+ //! @tparam NumItemsT
931
+ //! **[inferred]** An integral type representing the number of input elements
932
+ //!
933
+ //! @param[in] d_temp_storage
934
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
935
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
936
+ //!
937
+ //! @param[in,out] temp_storage_bytes
938
+ //! Reference to size in bytes of `d_temp_storage` allocation
939
+ //!
940
+ //! @param[in,out] d_data
941
+ //! Random-access iterator to the sequence of data items
942
+ //!
943
+ //! @param[in] num_items
944
+ //! Total number of input items (i.e., the length of `d_data`)
945
+ //!
946
+ //! @param[in] stream
947
+ //! @rst
948
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
949
+ //! @endrst
950
+ template <typename IteratorT, typename NumItemsT>
951
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
952
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
953
+ {
954
+ return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
955
+ }
956
+
957
+ //! @rst
958
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
959
+ //!
960
+ //! - Supports non-commutative scan operators.
961
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
962
+ //! addition of floating-point types). Results for pseudo-associative
963
+ //! operators may vary from run to run. Additional details can be found in
964
+ //! the @lookback description.
965
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
966
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
967
+ //! shall not overlap in any other way.
968
+ //! - @devicestorage
969
+ //!
970
+ //! Snippet
971
+ //! +++++++++++++++++++++++++++++++++++++++++++++
972
+ //!
973
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
974
+ //!
975
+ //! .. code-block:: c++
976
+ //!
977
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
978
+ //! #include <cuda/std/climits> // for INT_MAX
979
+ //!
980
+ //! // CustomMin functor
981
+ //! struct CustomMin
982
+ //! {
983
+ //! template <typename T>
984
+ //! __host__ __device__ __forceinline__
985
+ //! T operator()(const T &a, const T &b) const {
986
+ //! return (b < a) ? b : a;
987
+ //! }
988
+ //! };
989
+ //!
990
+ //! // Declare, allocate, and initialize device-accessible pointers for
991
+ //! // input and output
992
+ //! int num_items; // e.g., 7
993
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
994
+ //! int *d_out; // e.g., [ , , , , , , ]
995
+ //! CustomMin min_op;
996
+ //! ...
997
+ //!
998
+ //! // Determine temporary device storage requirements for inclusive
999
+ //! // prefix scan
1000
+ //! void *d_temp_storage = nullptr;
1001
+ //! size_t temp_storage_bytes = 0;
1002
+ //! cub::DeviceScan::InclusiveScan(
1003
+ //! d_temp_storage, temp_storage_bytes,
1004
+ //! d_in, d_out, min_op, num_items);
1005
+ //!
1006
+ //! // Allocate temporary storage for inclusive prefix scan
1007
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1008
+ //!
1009
+ //! // Run inclusive prefix min-scan
1010
+ //! cub::DeviceScan::InclusiveScan(
1011
+ //! d_temp_storage, temp_storage_bytes,
1012
+ //! d_in, d_out, min_op, num_items);
1013
+ //!
1014
+ //! // d_out <-- [8, 6, 6, 5, 3, 0, 0]
1015
+ //!
1016
+ //! @endrst
1017
+ //!
1018
+ //! @tparam InputIteratorT
1019
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1020
+ //!
1021
+ //! @tparam OutputIteratorT
1022
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1023
+ //!
1024
+ //! @tparam ScanOpT
1025
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1026
+ //!
1027
+ //! @tparam NumItemsT
1028
+ //! **[inferred]** An integral type representing the number of input elements
1029
+ //!
1030
+ //! @param[in] d_temp_storage
1031
+ //! Device-accessible allocation of temporary storage.
1032
+ //! When `nullptr`, the required allocation size is written to
1033
+ //! `temp_storage_bytes` and no work is done.
1034
+ //!
1035
+ //! @param[in,out] temp_storage_bytes
1036
+ //! Reference to size in bytes of `d_temp_storage` allocation
1037
+ //!
1038
+ //! @param[in] d_in
1039
+ //! Random-access iterator to the input sequence of data items
1040
+ //!
1041
+ //! @param[out] d_out
1042
+ //! Random-access iterator to the output sequence of data items
1043
+ //!
1044
+ //! @param[in] scan_op
1045
+ //! Binary associative scan functor
1046
+ //!
1047
+ //! @param[in] num_items
1048
+ //! Total number of input items (i.e., the length of `d_in`)
1049
+ //!
1050
+ //! @param[in] stream
1051
+ //! @rst
1052
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1053
+ //! @endrst
1054
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
1055
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
1056
+ void* d_temp_storage,
1057
+ size_t& temp_storage_bytes,
1058
+ InputIteratorT d_in,
1059
+ OutputIteratorT d_out,
1060
+ ScanOpT scan_op,
1061
+ NumItemsT num_items,
1062
+ cudaStream_t stream = 0)
1063
+ {
1064
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan");
1065
+
1066
+ // Unsigned integer type for global offsets
1067
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1068
+
1069
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
1070
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream);
1071
+ }
1072
+
1073
+ //! @rst
1074
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1075
+ //! The result of applying the ``scan_op`` binary operator to ``init_value`` value and ``*d_in``
1076
+ //! is assigned to ``*d_out``.
1077
+ //!
1078
+ //! - Supports non-commutative scan operators.
1079
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1080
+ //! addition of floating-point types). Results for pseudo-associative
1081
+ //! operators may vary from run to run. Additional details can be found in
1082
+ //! the @lookback description.
1083
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1084
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1085
+ //! shall not overlap in any other way.
1086
+ //! - @devicestorage
1087
+ //!
1088
+ //! Snippet
1089
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1090
+ //!
1091
+ //! The code snippet below illustrates the inclusive max-scan of an ``int`` device vector.
1092
+ //!
1093
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_api.cu
1094
+ //! :language: c++
1095
+ //! :dedent:
1096
+ //! :start-after: example-begin device-inclusive-scan
1097
+ //! :end-before: example-end device-inclusive-scan
1098
+ //!
1099
+ //! @endrst
1100
+ //!
1101
+ //! @tparam InputIteratorT
1102
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1103
+ //!
1104
+ //! @tparam OutputIteratorT
1105
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1106
+ //!
1107
+ //! @tparam ScanOpT
1108
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1109
+ //!
1110
+ //! @tparam InitValueT
1111
+ //! **[inferred]** Type of the `init_value`
1112
+ //!
1113
+ //! @tparam NumItemsT
1114
+ //! **[inferred]** An integral type representing the number of input elements
1115
+ //!
1116
+ //! @param[in] d_temp_storage
1117
+ //! Device-accessible allocation of temporary storage.
1118
+ //! When `nullptr`, the required allocation size is written to
1119
+ //! `temp_storage_bytes` and no work is done.
1120
+ //!
1121
+ //! @param[in,out] temp_storage_bytes
1122
+ //! Reference to the size in bytes of the `d_temp_storage` allocation
1123
+ //!
1124
+ //! @param[in] d_in
1125
+ //! Random-access iterator to the input sequence of data items
1126
+ //!
1127
+ //! @param[out] d_out
1128
+ //! Random-access iterator to the output sequence of data items
1129
+ //!
1130
+ //! @param[in] scan_op
1131
+ //! Binary associative scan functor
1132
+ //!
1133
+ //! @param[in] init_value
1134
+ //! Initial value to seed the inclusive scan (`scan_op(init_value, d_in[0])`
1135
+ //! is assigned to `*d_out`)
1136
+ //!
1137
+ //! @param[in] num_items
1138
+ //! Total number of input items (i.e., the length of `d_in`)
1139
+ //!
1140
+ //! @param[in] stream
1141
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream `0`.
1142
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
1143
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit(
1144
+ void* d_temp_storage,
1145
+ size_t& temp_storage_bytes,
1146
+ InputIteratorT d_in,
1147
+ OutputIteratorT d_out,
1148
+ ScanOpT scan_op,
1149
+ InitValueT init_value,
1150
+ NumItemsT num_items,
1151
+ cudaStream_t stream = 0)
1152
+ {
1153
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit");
1154
+
1155
+ // Unsigned integer type for global offsets
1156
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1157
+ using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::it_value_t<InputIteratorT>, InitValueT>;
1158
+
1159
+ return DispatchScan<
1160
+ InputIteratorT,
1161
+ OutputIteratorT,
1162
+ ScanOpT,
1163
+ detail::InputValue<InitValueT>,
1164
+ OffsetT,
1165
+ AccumT,
1166
+ ForceInclusive::Yes>::Dispatch(d_temp_storage,
1167
+ temp_storage_bytes,
1168
+ d_in,
1169
+ d_out,
1170
+ scan_op,
1171
+ detail::InputValue<InitValueT>(init_value),
1172
+ num_items,
1173
+ stream);
1174
+ }
1175
+
1176
+ //! @rst
1177
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1178
+ //!
1179
+ //! - Supports non-commutative scan operators.
1180
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1181
+ //! addition of floating-point types). Results for pseudo-associative
1182
+ //! operators may vary from run to run. Additional details can be found in
1183
+ //! the @lookback description.
1184
+ //! - @devicestorage
1185
+ //!
1186
+ //! Snippet
1187
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1188
+ //!
1189
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
1190
+ //!
1191
+ //! .. code-block:: c++
1192
+ //!
1193
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1194
+ //! #include <cuda/std/climits> // for INT_MAX
1195
+ //!
1196
+ //! // CustomMin functor
1197
+ //! struct CustomMin
1198
+ //! {
1199
+ //! template <typename T>
1200
+ //! __host__ __device__ __forceinline__
1201
+ //! T operator()(const T &a, const T &b) const {
1202
+ //! return (b < a) ? b : a;
1203
+ //! }
1204
+ //! };
1205
+ //!
1206
+ //! // Declare, allocate, and initialize device-accessible pointers for
1207
+ //! // input and output
1208
+ //! int num_items; // e.g., 7
1209
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1210
+ //! CustomMin min_op;
1211
+ //! ...
1212
+ //!
1213
+ //! // Determine temporary device storage requirements for inclusive
1214
+ //! // prefix scan
1215
+ //! void *d_temp_storage = nullptr;
1216
+ //! size_t temp_storage_bytes = 0;
1217
+ //! cub::DeviceScan::InclusiveScan(
1218
+ //! d_temp_storage, temp_storage_bytes,
1219
+ //! d_data, min_op, num_items);
1220
+ //!
1221
+ //! // Allocate temporary storage for inclusive prefix scan
1222
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1223
+ //!
1224
+ //! // Run inclusive prefix min-scan
1225
+ //! cub::DeviceScan::InclusiveScan(
1226
+ //! d_temp_storage, temp_storage_bytes,
1227
+ //! d_data, min_op, num_items);
1228
+ //!
1229
+ //! // d_data <-- [8, 6, 6, 5, 3, 0, 0]
1230
+ //!
1231
+ //! @endrst
1232
+ //!
1233
+ //! @tparam IteratorT
1234
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1235
+ //!
1236
+ //! @tparam ScanOpT
1237
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1238
+ //!
1239
+ //! @tparam NumItemsT
1240
+ //! **[inferred]** An integral type representing the number of input elements
1241
+ //!
1242
+ //! @param[in] d_temp_storage
1243
+ //! Device-accessible allocation of temporary storage.
1244
+ //! When `nullptr`, the required allocation size is written to
1245
+ //! `temp_storage_bytes` and no work is done.
1246
+ //!
1247
+ //! @param[in,out] temp_storage_bytes
1248
+ //! Reference to size in bytes of `d_temp_storage` allocation
1249
+ //!
1250
+ //! @param[in,out] d_data
1251
+ //! Random-access iterator to the sequence of data items; read as the scan input and overwritten with the scan output
1252
+ //!
1253
+ //! @param[in] scan_op
1254
+ //! Binary associative scan functor
1255
+ //!
1256
+ //! @param[in] num_items
1257
+ //! Total number of input items (i.e., the length of `d_data`)
1258
+ //!
1259
+ //! @param[in] stream
1260
+ //! @rst
1261
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1262
+ //! @endrst
1263
+ template <typename IteratorT, typename ScanOpT, typename NumItemsT>
1264
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
1265
+ void* d_temp_storage,
1266
+ size_t& temp_storage_bytes,
1267
+ IteratorT d_data,
1268
+ ScanOpT scan_op,
1269
+ NumItemsT num_items,
1270
+ cudaStream_t stream = 0)
1271
+ {
1272
+ return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream);
1273
+ }
1274
+
1275
+ //! @rst
1276
+ //! Computes a device-wide exclusive prefix sum-by-key with key equality
1277
+ //! defined by ``equality_op``. The value of ``0`` is applied as the initial
1278
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1279
+ //!
1280
+ //! - Supports non-commutative sum operators.
1281
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1282
+ //! addition of floating-point types). Results for pseudo-associative
1283
+ //! operators may vary from run to run. Additional details can be found in
1284
+ //! the @lookback description.
1285
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1286
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1287
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1288
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1289
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1290
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1291
+ //! - @devicestorage
1292
+ //!
1293
+ //! Snippet
1294
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1295
+ //!
1296
+ //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector.
1297
+ //!
1298
+ //! .. code-block:: c++
1299
+ //!
1300
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1301
+ //!
1302
+ //! // Declare, allocate, and initialize device-accessible pointers for
1303
+ //! // input and output
1304
+ //! int num_items; // e.g., 7
1305
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1306
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1307
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1308
+ //! ...
1309
+ //!
1310
+ //! // Determine temporary device storage requirements
1311
+ //! void *d_temp_storage = nullptr;
1312
+ //! size_t temp_storage_bytes = 0;
1313
+ //! cub::DeviceScan::ExclusiveSumByKey(
1314
+ //! d_temp_storage, temp_storage_bytes,
1315
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1316
+ //!
1317
+ //! // Allocate temporary storage
1318
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1319
+ //!
1320
+ //! // Run exclusive prefix sum
1321
+ //! cub::DeviceScan::ExclusiveSumByKey(
1322
+ //! d_temp_storage, temp_storage_bytes,
1323
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1324
+ //!
1325
+ //! // d_values_out <-- [0, 8, 0, 7, 12, 0, 0]
1326
+ //!
1327
+ //! @endrst
1328
+ //!
1329
+ //! @tparam KeysInputIteratorT
1330
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1331
+ //!
1332
+ //! @tparam ValuesInputIteratorT
1333
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1334
+ //!
1335
+ //! @tparam ValuesOutputIteratorT
1336
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1337
+ //!
1338
+ //! @tparam EqualityOpT
1339
+ //! **[inferred]** Functor type having member
1340
+ //! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
1341
+ //!
1342
+ //! @tparam NumItemsT
1343
+ //! **[inferred]** An integral type representing the number of input elements
1344
+ //!
1345
+ //! @param[in] d_temp_storage
1346
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1347
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1348
+ //!
1349
+ //! @param[in,out] temp_storage_bytes
1350
+ //! Reference to size in bytes of `d_temp_storage` allocation
1351
+ //!
1352
+ //! @param[in] d_keys_in
1353
+ //! Random-access input iterator to the input sequence of key items
1354
+ //!
1355
+ //! @param[in] d_values_in
1356
+ //! Random-access input iterator to the input sequence of value items
1357
+ //!
1358
+ //! @param[out] d_values_out
1359
+ //! Random-access output iterator to the output sequence of value items
1360
+ //!
1361
+ //! @param[in] num_items
1362
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1363
+ //!
1364
+ //! @param[in] equality_op
1365
+ //! Binary functor that defines the equality of keys.
1366
+ //! Default is cuda::std::equal_to<>{}.
1367
+ //!
1368
+ //! @param[in] stream
1369
+ //! @rst
1370
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1371
+ //! @endrst
1372
+ template <typename KeysInputIteratorT,
1373
+ typename ValuesInputIteratorT,
1374
+ typename ValuesOutputIteratorT,
1375
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1376
+ typename NumItemsT = uint32_t>
1377
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(
1378
+ void* d_temp_storage,
1379
+ size_t& temp_storage_bytes,
1380
+ KeysInputIteratorT d_keys_in,
1381
+ ValuesInputIteratorT d_values_in,
1382
+ ValuesOutputIteratorT d_values_out,
1383
+ NumItemsT num_items,
1384
+ EqualityOpT equality_op = EqualityOpT(),
1385
+ cudaStream_t stream = 0)
1386
+ {
1387
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey");
1388
+
1389
+ // Unsigned integer type for global offsets
1390
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1391
+ using InitT = cub::detail::it_value_t<ValuesInputIteratorT>;
1392
+
1393
+ // Initial value
1394
+ InitT init_value{};
1395
+
1396
+ return DispatchScanByKey<
1397
+ KeysInputIteratorT,
1398
+ ValuesInputIteratorT,
1399
+ ValuesOutputIteratorT,
1400
+ EqualityOpT,
1401
+ ::cuda::std::plus<>,
1402
+ InitT,
1403
+ OffsetT>::Dispatch(d_temp_storage,
1404
+ temp_storage_bytes,
1405
+ d_keys_in,
1406
+ d_values_in,
1407
+ d_values_out,
1408
+ equality_op,
1409
+ ::cuda::std::plus<>{},
1410
+ init_value,
1411
+ num_items,
1412
+ stream);
1413
+ }
1414
+
1415
+ //! @rst
1416
+ //! Computes a device-wide exclusive prefix scan-by-key using the
1417
+ //! specified binary associative ``scan_op`` functor. The key equality is defined by
1418
+ //! ``equality_op``. The ``init_value`` value is applied as the initial
1419
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1420
+ //!
1421
+ //! - Supports non-commutative scan operators.
1422
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1423
+ //! addition of floating-point types). Results for pseudo-associative
1424
+ //! operators may vary from run to run. Additional details can be found in
1425
+ //! the @lookback description.
1426
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1427
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1428
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1429
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1430
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1431
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1432
+ //! - @devicestorage
1433
+ //!
1434
+ //! Snippet
1435
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1436
+ //!
1437
+ //! The code snippet below illustrates the exclusive prefix min-scan-by-key of an ``int`` device vector.
1438
+ //!
1439
+ //! .. code-block:: c++
1440
+ //!
1441
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1442
+ //! #include <cuda/std/climits> // for INT_MAX
1443
+ //!
1444
+ //! // CustomMin functor
1445
+ //! struct CustomMin
1446
+ //! {
1447
+ //! template <typename T>
1448
+ //! __host__ __device__ __forceinline__
1449
+ //! T operator()(const T &a, const T &b) const {
1450
+ //! return (b < a) ? b : a;
1451
+ //! }
1452
+ //! };
1453
+ //!
1454
+ //! // CustomEqual functor
1455
+ //! struct CustomEqual
1456
+ //! {
1457
+ //! template <typename T>
1458
+ //! __host__ __device__ __forceinline__
1459
+ //! bool operator()(const T &a, const T &b) const {
1460
+ //! return a == b;
1461
+ //! }
1462
+ //! };
1463
+ //!
1464
+ //! // Declare, allocate, and initialize device-accessible pointers for
1465
+ //! // input and output
1466
+ //! int num_items; // e.g., 7
1467
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1468
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1469
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1470
+ //! CustomMin min_op;
1471
+ //! CustomEqual equality_op;
1472
+ //! ...
1473
+ //!
1474
+ //! // Determine temporary device storage requirements for exclusive
1475
+ //! // prefix scan
1476
+ //! void *d_temp_storage = nullptr;
1477
+ //! size_t temp_storage_bytes = 0;
1478
+ //! cub::DeviceScan::ExclusiveScanByKey(
1479
+ //! d_temp_storage, temp_storage_bytes,
1480
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1481
+ //! (int) INT_MAX, num_items, equality_op);
1482
+ //!
1483
+ //! // Allocate temporary storage for exclusive prefix scan
1484
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1485
+ //!
1486
+ //! // Run exclusive prefix min-scan
1487
+ //! cub::DeviceScan::ExclusiveScanByKey(
1488
+ //! d_temp_storage, temp_storage_bytes,
1489
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1490
+ //! (int) INT_MAX, num_items, equality_op);
1491
+ //!
1492
+ //! // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0]
1493
+ //!
1494
+ //! @endrst
1495
+ //!
1496
+ //! @tparam KeysInputIteratorT
1497
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1498
+ //!
1499
+ //! @tparam ValuesInputIteratorT
1500
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1501
+ //!
1502
+ //! @tparam ValuesOutputIteratorT
1503
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1504
+ //!
1505
+ //! @tparam ScanOpT
1506
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1507
+ //!
1508
+ //! @tparam InitValueT
1509
+ //! **[inferred]** Type of the `init_value`
1510
+ //!
1511
+ //! @tparam EqualityOpT
1512
+ //! **[inferred]** Functor type having member
1513
+ //! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
1514
+ //!
1515
+ //! @tparam NumItemsT
1516
+ //! **[inferred]** An integral type representing the number of input elements
1517
+ //!
1518
+ //! @param[in] d_temp_storage
1519
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1520
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1521
+ //!
1522
+ //! @param[in,out] temp_storage_bytes
1523
+ //! Reference to size in bytes of `d_temp_storage` allocation
1524
+ //!
1525
+ //! @param[in] d_keys_in
1526
+ //! Random-access input iterator to the input sequence of key items
1527
+ //!
1528
+ //! @param[in] d_values_in
1529
+ //! Random-access input iterator to the input sequence of value items
1530
+ //!
1531
+ //! @param[out] d_values_out
1532
+ //! Random-access output iterator to the output sequence of value items
1533
+ //!
1534
+ //! @param[in] scan_op
1535
+ //! Binary associative scan functor
1536
+ //!
1537
+ //! @param[in] init_value
1538
+ //! Initial value to seed the exclusive scan (and is assigned to the
1539
+ //! beginning of each segment in `d_values_out`)
1540
+ //!
1541
+ //! @param[in] num_items
1542
+ //! Total number of input items (i.e., the length of `d_keys_in` and
1543
+ //! `d_values_in`)
1544
+ //!
1545
+ //! @param[in] equality_op
1546
+ //! Binary functor that defines the equality of keys.
1547
+ //! Default is cuda::std::equal_to<>{}.
1548
+ //!
1549
+ //! @param[in] stream
1550
+ //! @rst
1551
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1552
+ //! @endrst
1553
+ template <typename KeysInputIteratorT,
1554
+ typename ValuesInputIteratorT,
1555
+ typename ValuesOutputIteratorT,
1556
+ typename ScanOpT,
1557
+ typename InitValueT,
1558
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1559
+ typename NumItemsT = uint32_t>
1560
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(
1561
+ void* d_temp_storage,
1562
+ size_t& temp_storage_bytes,
1563
+ KeysInputIteratorT d_keys_in,
1564
+ ValuesInputIteratorT d_values_in,
1565
+ ValuesOutputIteratorT d_values_out,
1566
+ ScanOpT scan_op,
1567
+ InitValueT init_value,
1568
+ NumItemsT num_items,
1569
+ EqualityOpT equality_op = EqualityOpT(),
1570
+ cudaStream_t stream = 0)
1571
+ {
1572
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey");
1573
+
1574
+ // Unsigned integer type for global offsets
1575
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1576
+
1577
+ return DispatchScanByKey<
1578
+ KeysInputIteratorT,
1579
+ ValuesInputIteratorT,
1580
+ ValuesOutputIteratorT,
1581
+ EqualityOpT,
1582
+ ScanOpT,
1583
+ InitValueT,
1584
+ OffsetT>::Dispatch(d_temp_storage,
1585
+ temp_storage_bytes,
1586
+ d_keys_in,
1587
+ d_values_in,
1588
+ d_values_out,
1589
+ equality_op,
1590
+ scan_op,
1591
+ init_value,
1592
+ num_items,
1593
+ stream);
1594
+ }
1595
+
1596
+ //! @rst
1597
+ //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``.
1598
+ //!
1599
+ //! - Supports non-commutative sum operators.
1600
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1601
+ //! addition of floating-point types). Results for pseudo-associative
1602
+ //! operators may vary from run to run. Additional details can be found in
1603
+ //! the @lookback description.
1604
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1605
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1606
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1607
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1608
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1609
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1610
+ //! - @devicestorage
1611
+ //!
1612
+ //! Snippet
1613
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1614
+ //!
1615
+ //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector.
1616
+ //!
1617
+ //! .. code-block:: c++
1618
+ //!
1619
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1620
+ //!
1621
+ //! // Declare, allocate, and initialize device-accessible pointers for
1622
+ //! // input and output
1623
+ //! int num_items; // e.g., 7
1624
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1625
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1626
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1627
+ //! ...
1628
+ //!
1629
+ //! // Determine temporary device storage requirements for inclusive prefix sum
1630
+ //! void *d_temp_storage = nullptr;
1631
+ //! size_t temp_storage_bytes = 0;
1632
+ //! cub::DeviceScan::InclusiveSumByKey(
1633
+ //! d_temp_storage, temp_storage_bytes,
1634
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1635
+ //!
1636
+ //! // Allocate temporary storage for inclusive prefix sum
1637
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1638
+ //!
1639
+ //! // Run inclusive prefix sum
1640
+ //! cub::DeviceScan::InclusiveSumByKey(
1641
+ //! d_temp_storage, temp_storage_bytes,
1642
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1643
+ //!
1644
+ //! // d_values_out <-- [8, 14, 7, 12, 15, 0, 9]
1645
+ //!
1646
+ //! @endrst
1647
+ //!
1648
+ //! @tparam KeysInputIteratorT
1649
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1650
+ //!
1651
+ //! @tparam ValuesInputIteratorT
1652
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1653
+ //!
1654
+ //! @tparam ValuesOutputIteratorT
1655
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1656
+ //!
1657
+ //! @tparam EqualityOpT
1658
+ //! **[inferred]** Functor type having member
1659
+ //! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
1660
+ //!
1661
+ //! @tparam NumItemsT
1662
+ //! **[inferred]** An integral type representing the number of input elements
1663
+ //!
1664
+ //! @param[in] d_temp_storage
1665
+ //! Device-accessible allocation of temporary storage.
1666
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
1667
+ //!
1668
+ //! @param[in,out] temp_storage_bytes
1669
+ //! Reference to size in bytes of `d_temp_storage` allocation
1670
+ //!
1671
+ //! @param[in] d_keys_in
1672
+ //! Random-access input iterator to the input sequence of key items
1673
+ //!
1674
+ //! @param[in] d_values_in
1675
+ //! Random-access input iterator to the input sequence of value items
1676
+ //!
1677
+ //! @param[out] d_values_out
1678
+ //! Random-access output iterator to the output sequence of value items
1679
+ //!
1680
+ //! @param[in] num_items
1681
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1682
+ //!
1683
+ //! @param[in] equality_op
1684
+ //! Binary functor that defines the equality of keys.
1685
+ //! Default is cuda::std::equal_to<>{}.
1686
+ //!
1687
+ //! @param[in] stream
1688
+ //! @rst
1689
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1690
+ //! @endrst
1691
+ template <typename KeysInputIteratorT,
1692
+ typename ValuesInputIteratorT,
1693
+ typename ValuesOutputIteratorT,
1694
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1695
+ typename NumItemsT = uint32_t>
1696
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(
1697
+ void* d_temp_storage,
1698
+ size_t& temp_storage_bytes,
1699
+ KeysInputIteratorT d_keys_in,
1700
+ ValuesInputIteratorT d_values_in,
1701
+ ValuesOutputIteratorT d_values_out,
1702
+ NumItemsT num_items,
1703
+ EqualityOpT equality_op = EqualityOpT(),
1704
+ cudaStream_t stream = 0)
1705
+ {
1706
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey");
1707
+
1708
+ // Unsigned integer type for global offsets
1709
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1710
+
1711
+ return DispatchScanByKey<
1712
+ KeysInputIteratorT,
1713
+ ValuesInputIteratorT,
1714
+ ValuesOutputIteratorT,
1715
+ EqualityOpT,
1716
+ ::cuda::std::plus<>,
1717
+ NullType,
1718
+ OffsetT>::Dispatch(d_temp_storage,
1719
+ temp_storage_bytes,
1720
+ d_keys_in,
1721
+ d_values_in,
1722
+ d_values_out,
1723
+ equality_op,
1724
+ ::cuda::std::plus<>{},
1725
+ NullType{},
1726
+ num_items,
1727
+ stream);
1728
+ }
1729
+
1730
+ //! @rst
1731
+ //! Computes a device-wide inclusive prefix scan-by-key using the
1732
+ //! specified binary associative ``scan_op`` functor. The key equality is defined by ``equality_op``.
1733
+ //!
1734
+ //! - Supports non-commutative scan operators.
1735
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1736
+ //! addition of floating-point types). Results for pseudo-associative
1737
+ //! operators may vary from run to run. Additional details can be found in
1738
+ //! the @lookback description.
1739
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1740
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1741
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1742
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1743
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1744
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1745
+ //! - @devicestorage
1746
+ //!
1747
+ //! Snippet
1748
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1749
+ //!
1750
+ //! The code snippet below illustrates the inclusive prefix min-scan-by-key of an ``int`` device vector.
1751
+ //!
1752
+ //! .. code-block:: c++
1753
+ //!
1754
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1755
+ //! #include <cuda/std/climits> // for INT_MAX
1756
+ //!
1757
+ //! // CustomMin functor
1758
+ //! struct CustomMin
1759
+ //! {
1760
+ //! template <typename T>
1761
+ //! __host__ __device__ __forceinline__
1762
+ //! T operator()(const T &a, const T &b) const {
1763
+ //! return (b < a) ? b : a;
1764
+ //! }
1765
+ //! };
1766
+ //!
1767
+ //! // CustomEqual functor
1768
+ //! struct CustomEqual
1769
+ //! {
1770
+ //! template <typename T>
1771
+ //! __host__ __device__ __forceinline__
1772
+ //! bool operator()(const T &a, const T &b) const {
1773
+ //! return a == b;
1774
+ //! }
1775
+ //! };
1776
+ //!
1777
+ //! // Declare, allocate, and initialize device-accessible pointers for
1778
+ //! // input and output
1779
+ //! int num_items; // e.g., 7
1780
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1781
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1782
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1783
+ //! CustomMin min_op;
1784
+ //! CustomEqual equality_op;
1785
+ //! ...
1786
+ //!
1787
+ //! // Determine temporary device storage requirements for inclusive prefix scan
1788
+ //! void *d_temp_storage = nullptr;
1789
+ //! size_t temp_storage_bytes = 0;
1790
+ //! cub::DeviceScan::InclusiveScanByKey(
1791
+ //! d_temp_storage, temp_storage_bytes,
1792
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
1793
+ //!
1794
+ //! // Allocate temporary storage for inclusive prefix scan
1795
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1796
+ //!
1797
+ //! // Run inclusive prefix min-scan
1798
+ //! cub::DeviceScan::InclusiveScanByKey(
1799
+ //! d_temp_storage, temp_storage_bytes,
1800
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
1801
+ //!
1802
+ //! // d_values_out <-- [8, 6, 7, 5, 3, 0, 0]
1803
+ //!
1804
+ //! @endrst
1805
+ //!
1806
+ //! @tparam KeysInputIteratorT
1807
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1808
+ //!
1809
+ //! @tparam ValuesInputIteratorT
1810
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1811
+ //!
1812
+ //! @tparam ValuesOutputIteratorT
1813
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1814
+ //!
1815
+ //! @tparam ScanOpT
1816
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1817
+ //!
1818
+ //! @tparam EqualityOpT
1819
+ //! **[inferred]** Functor type having member
1820
+ //! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
1821
+ //!
1822
+ //! @tparam NumItemsT
1823
+ //! **[inferred]** An integral type representing the number of input elements
1824
+ //!
1825
+ //! @param[in] d_temp_storage
1826
+ //! Device-accessible allocation of temporary storage.
1827
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
1828
+ //!
1829
+ //! @param[in,out] temp_storage_bytes
1830
+ //! Reference to size in bytes of `d_temp_storage` allocation
1831
+ //!
1832
+ //! @param[in] d_keys_in
1833
+ //! Random-access input iterator to the input sequence of key items
1834
+ //!
1835
+ //! @param[in] d_values_in
1836
+ //! Random-access input iterator to the input sequence of value items
1837
+ //!
1838
+ //! @param[out] d_values_out
1839
+ //! Random-access output iterator to the output sequence of value items
1840
+ //!
1841
+ //! @param[in] scan_op
1842
+ //! Binary associative scan functor
1843
+ //!
1844
+ //! @param[in] num_items
1845
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1846
+ //!
1847
+ //! @param[in] equality_op
1848
+ //! Binary functor that defines the equality of keys.
1849
+ //! Default is cuda::std::equal_to<>{}.
1850
+ //!
1851
+ //! @param[in] stream
1852
+ //! @rst
1853
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1854
+ //! @endrst
1855
+ template <typename KeysInputIteratorT,
1856
+ typename ValuesInputIteratorT,
1857
+ typename ValuesOutputIteratorT,
1858
+ typename ScanOpT,
1859
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1860
+ typename NumItemsT = uint32_t>
1861
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(
1862
+ void* d_temp_storage,
1863
+ size_t& temp_storage_bytes,
1864
+ KeysInputIteratorT d_keys_in,
1865
+ ValuesInputIteratorT d_values_in,
1866
+ ValuesOutputIteratorT d_values_out,
1867
+ ScanOpT scan_op,
1868
+ NumItemsT num_items,
1869
+ EqualityOpT equality_op = EqualityOpT(),
1870
+ cudaStream_t stream = 0)
1871
+ {
1872
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey");
1873
+
1874
+ // Unsigned integer type for global offsets
1875
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1876
+
1877
+ return DispatchScanByKey<
1878
+ KeysInputIteratorT,
1879
+ ValuesInputIteratorT,
1880
+ ValuesOutputIteratorT,
1881
+ EqualityOpT,
1882
+ ScanOpT,
1883
+ NullType,
1884
+ OffsetT>::Dispatch(d_temp_storage,
1885
+ temp_storage_bytes,
1886
+ d_keys_in,
1887
+ d_values_in,
1888
+ d_values_out,
1889
+ equality_op,
1890
+ scan_op,
1891
+ NullType(),
1892
+ num_items,
1893
+ stream);
1894
+ }
1895
+
1896
+ //! @} end member group
1897
+ };
1898
+
1899
+ CUB_NAMESPACE_END