cuda-cccl 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1962)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +3 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  5. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  6. cuda/cccl/cooperative/experimental/_common.py +275 -0
  7. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  8. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  9. cuda/cccl/cooperative/experimental/_types.py +937 -0
  10. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  11. cuda/cccl/cooperative/experimental/block/__init__.py +39 -0
  12. cuda/cccl/cooperative/experimental/block/_block_exchange.py +251 -0
  13. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  14. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  15. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  16. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  17. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  18. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +92 -0
  20. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  21. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  22. cuda/cccl/headers/__init__.py +7 -0
  23. cuda/cccl/headers/include/__init__.py +1 -0
  24. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +262 -0
  25. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1185 -0
  26. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  27. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +927 -0
  28. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +232 -0
  29. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +730 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +289 -0
  32. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +706 -0
  33. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +558 -0
  34. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  35. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  36. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1127 -0
  37. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +585 -0
  38. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +477 -0
  39. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  40. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1120 -0
  41. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +341 -0
  42. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +609 -0
  43. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  44. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  45. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  46. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  47. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  48. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1308 -0
  49. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  50. cuda/cccl/headers/include/cub/block/block_load.cuh +1260 -0
  51. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  52. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1220 -0
  53. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2194 -0
  54. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  55. cuda/cccl/headers/include/cub/block/block_reduce.cuh +666 -0
  56. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  57. cuda/cccl/headers/include/cub/block/block_scan.cuh +2584 -0
  58. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  59. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  60. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  65. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  66. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  67. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  68. cuda/cccl/headers/include/cub/config.cuh +53 -0
  69. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  70. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  71. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  72. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  73. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  74. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +61 -0
  75. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  76. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  77. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  78. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  79. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  82. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  83. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  84. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  85. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  86. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  87. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  88. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  89. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  90. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  91. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  92. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  93. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  94. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  95. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  96. cuda/cccl/headers/include/cub/device/device_for.cuh +985 -0
  97. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  98. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  99. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  100. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  101. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  102. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  103. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2519 -0
  104. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  105. cuda/cccl/headers/include/cub/device/device_scan.cuh +2205 -0
  106. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  107. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1520 -0
  108. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  109. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  110. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  111. cuda/cccl/headers/include/cub/device/device_transform.cuh +637 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +111 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +304 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +474 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1753 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1327 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +536 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +314 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +500 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +917 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +342 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +441 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +629 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +561 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +226 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +578 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +192 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +324 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +475 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1009 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +79 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +676 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +621 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  159. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  160. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  161. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +443 -0
  162. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  163. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +454 -0
  164. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  165. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  166. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  167. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  168. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  169. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  170. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  171. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  172. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  173. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  174. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +541 -0
  175. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  176. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  177. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  178. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  179. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  180. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  181. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  182. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  183. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  184. cuda/cccl/headers/include/cub/util_device.cuh +784 -0
  185. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  186. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  187. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  188. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  189. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  190. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  191. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  192. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  193. cuda/cccl/headers/include/cub/version.cuh +89 -0
  194. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  195. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  196. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +736 -0
  197. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +407 -0
  198. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  199. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  200. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  201. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  202. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  203. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +824 -0
  204. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1886 -0
  205. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  206. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  207. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  208. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  209. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  211. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  212. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  213. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  214. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  215. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  216. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  217. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  218. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  219. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  220. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  221. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +468 -0
  222. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  223. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  224. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  225. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  226. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  227. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  228. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  229. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  230. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +249 -0
  231. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  232. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  233. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  234. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  235. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  236. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  237. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  238. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  239. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  240. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +93 -0
  241. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  242. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  243. cuda/cccl/headers/include/cuda/__device/all_devices.h +240 -0
  244. cuda/cccl/headers/include/cuda/__device/arch_traits.h +613 -0
  245. cuda/cccl/headers/include/cuda/__device/attributes.h +721 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +185 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +168 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +541 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +158 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +118 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  254. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  255. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  256. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  257. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  258. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  259. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  260. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  261. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  262. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  263. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  264. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  265. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  266. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +49 -0
  267. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +300 -0
  268. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  269. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  270. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  271. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  272. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +386 -0
  273. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +344 -0
  274. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +498 -0
  275. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +501 -0
  276. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +461 -0
  277. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  278. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +673 -0
  279. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  280. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  281. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  282. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  283. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  286. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  287. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  288. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  289. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  290. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  291. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  292. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  293. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  294. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  295. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  296. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  297. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  298. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  299. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  300. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  301. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  302. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  303. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  304. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  305. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  306. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  307. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +69 -0
  308. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  309. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +654 -0
  310. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  311. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  312. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  313. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  314. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  315. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  422. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  423. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  424. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  425. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  426. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +97 -0
  427. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  428. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  429. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  430. cuda/cccl/headers/include/cuda/__stream/stream.h +142 -0
  431. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +296 -0
  432. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  433. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  447. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  448. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  449. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  450. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  451. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  452. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  453. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  454. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  455. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  456. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  457. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  458. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  459. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  460. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  461. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  462. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  463. cuda/cccl/headers/include/cuda/access_property +26 -0
  464. cuda/cccl/headers/include/cuda/algorithm +27 -0
  465. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  466. cuda/cccl/headers/include/cuda/atomic +27 -0
  467. cuda/cccl/headers/include/cuda/barrier +267 -0
  468. cuda/cccl/headers/include/cuda/bit +29 -0
  469. cuda/cccl/headers/include/cuda/cmath +36 -0
  470. cuda/cccl/headers/include/cuda/devices +20 -0
  471. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  472. cuda/cccl/headers/include/cuda/functional +32 -0
  473. cuda/cccl/headers/include/cuda/iterator +38 -0
  474. cuda/cccl/headers/include/cuda/latch +27 -0
  475. cuda/cccl/headers/include/cuda/mdspan +28 -0
  476. cuda/cccl/headers/include/cuda/memory +34 -0
  477. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  478. cuda/cccl/headers/include/cuda/numeric +29 -0
  479. cuda/cccl/headers/include/cuda/pipeline +579 -0
  480. cuda/cccl/headers/include/cuda/ptx +128 -0
  481. cuda/cccl/headers/include/cuda/semaphore +31 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  593. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  594. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  595. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  596. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  597. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  598. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  599. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  600. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  601. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  602. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  606. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  607. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  608. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  609. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  610. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  611. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  612. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +676 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1284 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  635. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  636. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  637. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  641. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  642. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  643. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  651. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  652. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +258 -0
  653. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +260 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  678. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  679. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  680. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/complex.h +674 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +322 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +321 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  693. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  694. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  695. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  716. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  717. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  718. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  719. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  720. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  722. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  723. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  724. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  725. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  726. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  727. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  728. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  729. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +146 -0
  730. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  731. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  732. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  734. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1956 -0
  735. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  736. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  737. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +172 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +113 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  750. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  751. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  752. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  753. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  754. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  755. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  760. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  761. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  762. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  767. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  768. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  769. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  770. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  771. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  772. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  773. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/function.h +1278 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  797. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  798. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  799. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  800. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +268 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +66 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +90 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  818. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  819. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  821. cuda/cccl/headers/include/cuda/std/__internal/features.h +77 -0
  822. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +122 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  858. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  859. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  860. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  861. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  862. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  866. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  867. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +144 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +758 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +497 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  878. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  879. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +532 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  899. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  900. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  901. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  902. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  903. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  904. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  905. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  917. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  918. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  924. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +432 -0
  925. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  926. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  927. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  928. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  929. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  930. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  931. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  932. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +314 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  960. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  962. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  964. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  965. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  966. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  967. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  969. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  974. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +218 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +80 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +64 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  985. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +162 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +106 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/pair.h +796 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1148. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1149. cuda/cccl/headers/include/cuda/std/array +518 -0
  1150. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1151. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1152. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1153. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1154. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1155. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1156. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1157. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1158. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1159. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1160. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1161. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1162. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1164. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1165. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1166. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +204 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1174. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2142 -0
  1175. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1176. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1177. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1178. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1179. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1180. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1181. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1182. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1183. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1184. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1185. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1186. cuda/cccl/headers/include/cuda/std/numbers +341 -0
  1187. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1188. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1189. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1190. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1191. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1192. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1193. cuda/cccl/headers/include/cuda/std/span +628 -0
  1194. cuda/cccl/headers/include/cuda/std/string_view +799 -0
  1195. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1196. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1197. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1198. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1199. cuda/cccl/headers/include/cuda/std/version +243 -0
  1200. cuda/cccl/headers/include/cuda/stream +31 -0
  1201. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1202. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1203. cuda/cccl/headers/include/cuda/utility +27 -0
  1204. cuda/cccl/headers/include/cuda/version +16 -0
  1205. cuda/cccl/headers/include/cuda/warp +28 -0
  1206. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1207. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1208. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1209. cuda/cccl/headers/include/nv/target +235 -0
  1210. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1211. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1212. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1213. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1214. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1215. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1216. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1217. cuda/cccl/headers/include/thrust/count.h +245 -0
  1218. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1219. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1220. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1230. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1231. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1232. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1233. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1234. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1235. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +79 -0
  1236. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1237. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1238. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1239. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1240. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1252. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1253. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1254. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1255. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1256. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1257. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1258. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1259. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1260. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1261. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1262. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1263. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1264. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1265. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1266. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1267. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1268. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1269. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1271. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1272. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1273. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1274. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1275. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1276. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1277. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1278. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1279. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1280. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1281. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1282. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1283. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1284. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1285. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1286. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1287. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1288. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1289. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1290. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1291. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1292. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1293. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1294. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1295. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1296. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1297. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1298. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1299. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1301. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1302. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1303. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1304. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1305. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1306. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1307. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1308. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1309. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1310. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1311. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1312. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1313. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1314. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1315. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1316. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1317. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1318. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1319. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1320. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1321. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1322. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1323. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1324. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1325. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1326. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1327. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1328. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1329. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1330. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1331. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1332. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1333. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1334. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1335. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1336. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1337. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1338. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1339. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1340. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1341. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1342. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1343. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1344. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1345. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1346. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1347. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1348. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1349. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1350. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1351. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1352. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1353. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1354. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1355. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1356. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1357. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1358. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1359. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1360. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1361. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1362. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1363. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1364. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1365. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1366. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1367. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1368. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1369. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1370. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1371. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1372. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1373. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1374. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1375. cuda/cccl/headers/include/thrust/find.h +382 -0
  1376. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1377. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1378. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1379. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1380. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1381. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1382. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1383. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1384. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1385. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1386. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1387. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1388. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1389. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1390. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1391. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1392. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1393. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1394. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1395. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1396. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1397. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1398. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1399. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1400. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1401. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1402. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +323 -0
  1403. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1404. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1405. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1406. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1407. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1408. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1409. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1410. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1411. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1412. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1413. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1414. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1415. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1416. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1417. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1418. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1419. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1420. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1421. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1422. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1423. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1424. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1425. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1426. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1427. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1428. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1429. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1430. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1431. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1432. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1433. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1434. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1435. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1436. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1437. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1438. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1439. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1440. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1441. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1442. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1443. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1444. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1445. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1446. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1447. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1448. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1449. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1450. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1451. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1452. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1453. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1454. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1455. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1456. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1457. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1458. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1459. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1460. cuda/cccl/headers/include/thrust/random.h +120 -0
  1461. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1462. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1463. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1464. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1465. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1466. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1467. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1468. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1469. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1470. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1471. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1506. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1507. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1508. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1509. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1511. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1512. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1513. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1514. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1515. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1516. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1517. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1518. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1519. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1520. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1521. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +240 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +470 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1569. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1570. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1571. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1572. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1573. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1574. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1575. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1576. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1577. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1578. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1579. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1580. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1581. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1582. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1583. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1584. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1772. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1838. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1902. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1903. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1904. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1905. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1906. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1907. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1908. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1909. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1910. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1911. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1912. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1913. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1914. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1915. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1916. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1917. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1918. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1919. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1920. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1921. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1922. cuda/cccl/headers/include/thrust/version.h +93 -0
  1923. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1924. cuda/cccl/headers/include_paths.py +51 -0
  1925. cuda/cccl/parallel/__init__.py +9 -0
  1926. cuda/cccl/parallel/experimental/.gitignore +4 -0
  1927. cuda/cccl/parallel/experimental/__init__.py +73 -0
  1928. cuda/cccl/parallel/experimental/_bindings.py +79 -0
  1929. cuda/cccl/parallel/experimental/_bindings.pyi +405 -0
  1930. cuda/cccl/parallel/experimental/_bindings_impl.pyx +1984 -0
  1931. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1932. cuda/cccl/parallel/experimental/_cccl_interop.py +422 -0
  1933. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1934. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1935. cuda/cccl/parallel/experimental/_utils/temp_storage_buffer.py +86 -0
  1936. cuda/cccl/parallel/experimental/algorithms/__init__.py +50 -0
  1937. cuda/cccl/parallel/experimental/algorithms/_histogram.py +243 -0
  1938. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +225 -0
  1939. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +312 -0
  1940. cuda/cccl/parallel/experimental/algorithms/_reduce.py +184 -0
  1941. cuda/cccl/parallel/experimental/algorithms/_scan.py +261 -0
  1942. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +257 -0
  1943. cuda/cccl/parallel/experimental/algorithms/_transform.py +308 -0
  1944. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +252 -0
  1945. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1946. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  1947. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  1948. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  1949. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  1950. cuda/cccl/parallel/experimental/iterators/__init__.py +19 -0
  1951. cuda/cccl/parallel/experimental/iterators/_factories.py +191 -0
  1952. cuda/cccl/parallel/experimental/iterators/_iterators.py +612 -0
  1953. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +199 -0
  1954. cuda/cccl/parallel/experimental/numba_utils.py +53 -0
  1955. cuda/cccl/parallel/experimental/op.py +3 -0
  1956. cuda/cccl/parallel/experimental/struct.py +272 -0
  1957. cuda/cccl/parallel/experimental/typing.py +35 -0
  1958. cuda/cccl/py.typed +0 -0
  1959. cuda_cccl-0.1.3.2.0.dev438.dist-info/METADATA +42 -0
  1960. cuda_cccl-0.1.3.2.0.dev438.dist-info/RECORD +1962 -0
  1961. cuda_cccl-0.1.3.2.0.dev438.dist-info/WHEEL +5 -0
  1962. cuda_cccl-0.1.3.2.0.dev438.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2205 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data
31
+ //! items residing within device-accessible memory.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/detail/choose_offset.cuh>
46
+ #include <cub/detail/device_memory_resource.cuh>
47
+ #include <cub/detail/temporary_storage.cuh>
48
+ #include <cub/device/dispatch/dispatch_scan.cuh>
49
+ #include <cub/device/dispatch/dispatch_scan_by_key.cuh>
50
+ #include <cub/thread/thread_operators.cuh>
51
+
52
+ #include <cuda/__execution/determinism.h>
53
+ #include <cuda/__execution/require.h>
54
+ #include <cuda/__execution/tune.h>
55
+ #include <cuda/__memory_resource/get_memory_resource.h>
56
+ #include <cuda/__stream/get_stream.h>
57
+ #include <cuda/std/__execution/env.h>
58
+ #include <cuda/std/__functional/invoke.h>
59
+
60
+ CUB_NAMESPACE_BEGIN
61
+
62
+ namespace detail::scan
63
+ {
64
+ struct get_tuning_query_t
65
+ {};
66
+
67
+ template <class Derived>
68
+ struct tuning
69
+ {
70
+ [[nodiscard]] _CCCL_NODEBUG_API constexpr Derived query(const get_tuning_query_t&) const noexcept
71
+ {
72
+ return static_cast<const Derived&>(*this);
73
+ }
74
+ };
75
+
76
+ struct default_tuning : tuning<default_tuning>
77
+ {
78
+ template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
79
+ using fn = policy_hub<InputValueT, OutputValueT, AccumT, OffsetT, ScanOpT>;
80
+ };
81
+
82
+ } // namespace detail::scan
83
+
84
+ //! @rst
85
+ //! DeviceScan provides device-wide, parallel operations for computing a
86
+ //! prefix scan across a sequence of data items residing within
87
+ //! device-accessible memory.
88
+ //!
89
+ //! Overview
90
+ //! +++++++++++++++++++++++++++++++++++++++++++++
91
+ //!
92
+ //! Given a sequence of input elements and a binary reduction operator, a
93
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output
94
+ //! sequence where each element is computed to be the reduction of the elements
95
+ //! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan
96
+ //! with the addition operator. The term *inclusive* indicates that the
97
+ //! *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input.
98
+ //! The term *exclusive* indicates the *i*\ :sup:`th` input is not
99
+ //! incorporated into the *i*\ :sup:`th` output reduction. When the input and
100
+ //! output sequences are the same, the scan is performed in-place.
101
+ //!
102
+ //! In order to provide an efficient parallel implementation, the binary reduction operator must be associative. That
103
+ //! is, ``op(op(a, b), c)`` must be equivalent to ``op(a, op(b, c))`` for any input values ``a``, ``b``, and ``c``.
104
+ //!
105
+ //! As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our
106
+ //! *"decoupled look-back"* algorithm for performing global prefix scan with
107
+ //! only a single pass through the input data, as described in our 2016 technical
108
+ //! report [1]_. The central idea is to leverage a small, constant factor of
109
+ //! redundant work in order to overlap the latencies of global prefix
110
+ //! propagation with local computation. As such, our algorithm requires only
111
+ //! ``~2n`` data movement (``n`` inputs are read, ``n`` outputs are written), and
112
+ //! typically proceeds at "memcpy" speeds. Our algorithm supports in-place operations.
113
+ //!
114
+ //! .. [1] Duane Merrill and Michael Garland. `Single-pass Parallel Prefix Scan with Decoupled Look-back
115
+ //! <https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back>`_,
116
+ //! *NVIDIA Technical Report NVR-2016-002*, 2016.
117
+ //!
118
+ //! Usage Considerations
119
+ //! +++++++++++++++++++++++++++++++++++++++++++++
120
+ //!
121
+ //! @cdp_class{DeviceScan}
122
+ //!
123
+ //! Performance
124
+ //! +++++++++++++++++++++++++++++++++++++++++++++
125
+ //!
126
+ //! @linear_performance{prefix scan}
127
+ //!
128
+ //! @endrst
129
+ struct DeviceScan
130
+ {
131
+ template <typename TuningEnvT,
132
+ typename InputIteratorT,
133
+ typename OutputIteratorT,
134
+ typename ScanOpT,
135
+ typename InitValueT,
136
+ typename NumItemsT,
137
+ ::cuda::execution::determinism::__determinism_t Determinism,
138
+ ForceInclusive EnforceInclusive = ForceInclusive::No>
139
+ CUB_RUNTIME_FUNCTION static cudaError_t scan_impl_determinism(
140
+ void* d_temp_storage,
141
+ size_t& temp_storage_bytes,
142
+ InputIteratorT d_in,
143
+ OutputIteratorT d_out,
144
+ ScanOpT scan_op,
145
+ InitValueT init,
146
+ NumItemsT num_items,
147
+ ::cuda::execution::determinism::__determinism_holder_t<Determinism>,
148
+ cudaStream_t stream)
149
+ {
150
+ using scan_tuning_t = ::cuda::std::execution::
151
+ __query_result_or_t<TuningEnvT, detail::scan::get_tuning_query_t, detail::scan::default_tuning>;
152
+
153
+ // Unsigned integer type for global offsets
154
+ using offset_t = detail::choose_offset_t<NumItemsT>;
155
+
156
+ using accum_t =
157
+ ::cuda::std::__accumulator_t<ScanOpT,
158
+ cub::detail::it_value_t<InputIteratorT>,
159
+ ::cuda::std::_If<::cuda::std::is_same_v<InitValueT, NullType>,
160
+ cub::detail::it_value_t<InputIteratorT>,
161
+ typename InitValueT::value_type>>;
162
+
163
+ using policy_t = typename scan_tuning_t::
164
+ template fn<detail::it_value_t<InputIteratorT>, detail::it_value_t<OutputIteratorT>, accum_t, offset_t, ScanOpT>;
165
+
166
+ using dispatch_t =
167
+ DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, offset_t, accum_t, EnforceInclusive, policy_t>;
168
+
169
+ return dispatch_t::Dispatch(
170
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, static_cast<offset_t>(num_items), stream);
171
+ }
172
+
173
+ template <typename InputIteratorT,
174
+ typename OutputIteratorT,
175
+ typename ScanOpT,
176
+ typename InitValueT,
177
+ typename NumItemsT,
178
+ ForceInclusive EnforceInclusive = ForceInclusive::No,
179
+ typename EnvT>
180
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t scan_impl_env(
181
+ InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init, NumItemsT num_items, EnvT env)
182
+ {
183
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
184
+ "Determinism should be used inside requires to have an effect.");
185
+
186
+ using requirements_t =
187
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
188
+
189
+ using requested_determinism_t =
190
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
191
+ _CUDA_EXEC::determinism::__get_determinism_t,
192
+ _CUDA_EXEC::determinism::run_to_run_t>;
193
+
194
+ // Static assert to reject gpu_to_gpu determinism since it's not implemented
195
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
196
+ "gpu_to_gpu determinism is not supported");
197
+
198
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::not_guaranteed_t>,
199
+ "not_guaranteed determinism is not supported");
200
+
201
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t;
202
+
203
+ // Query relevant properties from the environment
204
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
205
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
206
+
207
+ void* d_temp_storage = nullptr;
208
+ size_t temp_storage_bytes = 0;
209
+
210
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
211
+
212
+ // Query the required temporary storage size
213
+ cudaError_t error = scan_impl_determinism<tuning_t>(
214
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, num_items, determinism_t{}, stream.get());
215
+
216
+ if (error != cudaSuccess)
217
+ {
218
+ return error;
219
+ }
220
+
221
+ // TODO(gevtushenko): use uninitialized buffer when it's available
222
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
223
+ if (error != cudaSuccess)
224
+ {
225
+ return error;
226
+ }
227
+
228
+ // Run the algorithm
229
+ error = scan_impl_determinism<tuning_t>(
230
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, num_items, determinism_t{}, stream.get());
231
+
232
+ // Try to deallocate regardless of the error to avoid memory leaks
233
+ cudaError_t deallocate_error =
234
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
235
+
236
+ if (error != cudaSuccess)
237
+ {
238
+ // Scan error takes precedence over deallocation error since it happens first
239
+ return error;
240
+ }
241
+
242
+ return deallocate_error;
243
+ }
244
+
245
+ //! @name Exclusive scans
246
+ //! @{
247
+
248
+ //! @rst
249
+ //! Computes a device-wide exclusive prefix sum.
250
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
251
+ //!
252
+ //! - Supports non-commutative sum operators.
253
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
254
+ //! addition of floating-point types). Results for pseudo-associative
255
+ //! operators may vary from run to run. Additional details can be found in
256
+ //! the @lookback description.
257
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
258
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
259
+ //! shall not overlap in any other way.
260
+ //! - @devicestorage
261
+ //!
262
+ //! Snippet
263
+ //! +++++++++++++++++++++++++++++++++++++++++++++
264
+ //!
265
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
266
+ //! device vector.
267
+ //!
268
+ //! .. code-block:: c++
269
+ //!
270
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
271
+ //!
272
+ //! // Declare, allocate, and initialize device-accessible pointers for
273
+ //! // input and output
274
+ //! int num_items; // e.g., 7
275
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
276
+ //! int *d_out; // e.g., [ , , , , , , ]
277
+ //! ...
278
+ //!
279
+ //! // Determine temporary device storage requirements
280
+ //! void *d_temp_storage = nullptr;
281
+ //! size_t temp_storage_bytes = 0;
282
+ //! cub::DeviceScan::ExclusiveSum(
283
+ //! d_temp_storage, temp_storage_bytes,
284
+ //! d_in, d_out, num_items);
285
+ //!
286
+ //! // Allocate temporary storage
287
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
288
+ //!
289
+ //! // Run exclusive prefix sum
290
+ //! cub::DeviceScan::ExclusiveSum(
291
+ //! d_temp_storage, temp_storage_bytes,
292
+ //! d_in, d_out, num_items);
293
+ //!
294
+ //! // d_out <-- [0, 8, 14, 21, 26, 29, 29]
295
+ //!
296
+ //! @endrst
297
+ //!
298
+ //! @tparam InputIteratorT
299
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
300
+ //!
301
+ //! @tparam OutputIteratorT
302
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
303
+ //!
304
+ //! @tparam NumItemsT
305
+ //! **[inferred]** An integral type representing the number of input elements
306
+ //!
307
+ //! @param[in] d_temp_storage
308
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
309
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
310
+ //!
311
+ //! @param[in,out] temp_storage_bytes
312
+ //! Reference to size in bytes of `d_temp_storage` allocation
313
+ //!
314
+ //! @param[in] d_in
315
+ //! Random-access iterator to the input sequence of data items
316
+ //!
317
+ //! @param[out] d_out
318
+ //! Random-access iterator to the output sequence of data items
319
+ //!
320
+ //! @param[in] num_items
321
+ //! Total number of input items (i.e., the length of `d_in`)
322
+ //!
323
+ //! @param[in] stream
324
+ //! @rst
325
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
326
+ //! @endrst
327
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
328
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
329
+ void* d_temp_storage,
330
+ size_t& temp_storage_bytes,
331
+ InputIteratorT d_in,
332
+ OutputIteratorT d_out,
333
+ NumItemsT num_items,
334
+ cudaStream_t stream = 0)
335
+ { // Exclusive prefix sum: forwards to DispatchScan::Dispatch with ::cuda::std::plus<> and a value-initialized seed.
336
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); // presumably active only when d_temp_storage is non-null - confirm macro
337
+
338
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
339
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
340
+ using InitT = cub::detail::it_value_t<InputIteratorT>; // seed type is derived from the input iterator
341
+
342
+ // Initial value: InitT{} is value-initialized (zero for arithmetic types), matching the documented ``0`` seed
343
+ InitT init_value{};
344
+
345
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, detail::InputValue<InitT>, OffsetT>::
346
+ Dispatch(d_temp_storage,
347
+ temp_storage_bytes,
348
+ d_in,
349
+ d_out,
350
+ ::cuda::std::plus<>{}, // the sum operator
351
+ detail::InputValue<InitT>(init_value), // seed is handed over wrapped in detail::InputValue
352
+ num_items,
353
+ stream);
354
+ }
355
+
356
+
357
+
358
+
359
+ //! @rst
360
+ //! Computes a device-wide exclusive prefix sum.
361
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
362
+ //!
363
+ //! - Supports non-commutative sum operators.
364
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
365
+ //! addition of floating-point types). Results for pseudo-associative
366
+ //! operators may vary from run to run. Additional details can be found in
367
+ //! the @lookback description.
368
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
369
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
370
+ //! shall not overlap in any other way.
371
+ //! - @devicestorage
372
+ //!
373
+ //! Preconditions
374
+ //! +++++++++++++
375
+ //!
376
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
377
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
378
+ //! shall not overlap in any other way.
379
+ //! - ``d_in`` and ``d_out`` must not be null pointers
380
+ //!
381
+ //! Snippet
382
+ //! +++++++++++++++++++++++++++++++++++++++++++++
383
+ //!
384
+ //! The code snippet below illustrates the exclusive prefix sum of a device vector of ``float`` data elements.
385
+ //!
386
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_env_api.cu
387
+ //! :language: c++
388
+ //! :dedent:
389
+ //! :start-after: example-begin exclusive-sum-env-determinism
390
+ //! :end-before: example-end exclusive-sum-env-determinism
391
+ //!
392
+ //! @endrst
393
+ //!
394
+ //! @tparam InputIteratorT
395
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
396
+ //!
397
+ //! @tparam OutputIteratorT
398
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
399
+ //!
400
+ //! @tparam NumItemsT
401
+ //! **[inferred]** An integral type representing the number of input elements
402
+ //!
403
+ //! @tparam EnvT
404
+ //! **[inferred]** Execution environment type. Default is `_CUDA_STD_EXEC::env<>`.
405
+ //!
406
+ //! @param[in] d_in
407
+ //! Random-access iterator to the input sequence of data items
408
+ //!
409
+ //! @param[out] d_out
410
+ //! Random-access iterator to the output sequence of data items
411
+ //!
412
+ //! @param[in] num_items
413
+ //! Total number of input items (i.e., the length of `d_in`)
414
+ //!
415
+ //! @param[in] env
416
+ //! @rst
417
+ //! **[optional]** Execution environment. Default is `_CUDA_STD_EXEC::env{}`.
418
+ //! @endrst
419
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT, typename EnvT = _CUDA_STD_EXEC::env<>>
420
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
421
+ ExclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
422
+ { // Environment-based exclusive prefix sum: no caller-provided temporary storage; everything is delegated to scan_impl_env.
423
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSum");
424
+
425
+ using InitT = cub::detail::it_value_t<InputIteratorT>; // seed type is derived from the input iterator
426
+
427
+ // Initial value: InitT{} is value-initialized (zero for arithmetic types), matching the documented ``0`` seed
428
+ InitT init_value{};
429
+
430
+ return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue<InitT>(init_value), num_items, env); // env is forwarded unchanged
431
+ }
432
+
433
+ //! @rst
434
+ //! Computes a device-wide exclusive prefix sum in-place.
435
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``.
436
+ //!
437
+ //! - Supports non-commutative sum operators.
438
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
439
+ //! addition of floating-point types). Results for pseudo-associative
440
+ //! operators may vary from run to run. Additional details can be found in
441
+ //! the @lookback description.
442
+ //! - @devicestorage
443
+ //!
444
+ //! Snippet
445
+ //! +++++++++++++++++++++++++++++++++++++++++++++
446
+ //!
447
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
448
+ //! device vector.
449
+ //!
450
+ //! .. code-block:: c++
451
+ //!
452
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
453
+ //!
454
+ //! // Declare, allocate, and initialize device-accessible pointers for
455
+ //! // input and output
456
+ //! int num_items; // e.g., 7
457
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
458
+ //! ...
459
+ //!
460
+ //! // Determine temporary device storage requirements
461
+ //! void *d_temp_storage = nullptr;
462
+ //! size_t temp_storage_bytes = 0;
463
+ //! cub::DeviceScan::ExclusiveSum(
464
+ //! d_temp_storage, temp_storage_bytes,
465
+ //! d_data, num_items);
466
+ //!
467
+ //! // Allocate temporary storage
468
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
469
+ //!
470
+ //! // Run exclusive prefix sum
471
+ //! cub::DeviceScan::ExclusiveSum(
472
+ //! d_temp_storage, temp_storage_bytes,
473
+ //! d_data, num_items);
474
+ //!
475
+ //! // d_data <-- [0, 8, 14, 21, 26, 29, 29]
476
+ //!
477
+ //! @endrst
478
+ //!
479
+ //! @tparam IteratorT
480
+ //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
481
+ //!
482
+ //! @tparam NumItemsT
483
+ //! **[inferred]** An integral type representing the number of input elements
484
+ //!
485
+ //! @param[in] d_temp_storage
486
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
487
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
488
+ //!
489
+ //! @param[in,out] temp_storage_bytes
490
+ //! Reference to size in bytes of `d_temp_storage` allocation
491
+ //!
492
+ //! @param[in,out] d_data
493
+ //! Random-access iterator to the sequence of data items
494
+ //!
495
+ //! @param[in] num_items
496
+ //! Total number of input items (i.e., the length of `d_data`)
497
+ //!
498
+ //! @param[in] stream
499
+ //! @rst
500
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
501
+ //! @endrst
502
+ template <typename IteratorT, typename NumItemsT>
503
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
504
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
505
+ { // In-place convenience overload: d_data is passed as both the input and the output iterator.
506
+ return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
507
+ }
508
+
509
+ //! @rst
510
+ //! Computes a device-wide exclusive prefix scan using the specified
511
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
512
+ //! the initial value, and is assigned to ``*d_out``.
513
+ //!
514
+ //! - Supports non-commutative scan operators.
515
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
516
+ //! addition of floating-point types). Results for pseudo-associative
517
+ //! operators may vary from run to run. Additional details can be found in
518
+ //! the @lookback description.
519
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
520
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
521
+ //! shall not overlap in any other way.
522
+ //! - @devicestorage
523
+ //!
524
+ //! Snippet
525
+ //! +++++++++++++++++++++++++++++++++++++++++++++
526
+ //!
527
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
528
+ //!
529
+ //! .. code-block:: c++
530
+ //!
531
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
532
+ //! #include <cuda/std/climits> // for INT_MAX
533
+ //!
534
+ //! // CustomMin functor
535
+ //! struct CustomMin
536
+ //! {
537
+ //! template <typename T>
538
+ //! __host__ __device__ __forceinline__
539
+ //! T operator()(const T &a, const T &b) const {
540
+ //! return (b < a) ? b : a;
541
+ //! }
542
+ //! };
543
+ //!
544
+ //! // Declare, allocate, and initialize device-accessible pointers for
545
+ //! // input and output
546
+ //! int num_items; // e.g., 7
547
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
548
+ //! int *d_out; // e.g., [ , , , , , , ]
549
+ //! CustomMin min_op;
550
+ //! ...
551
+ //!
552
+ //! // Determine temporary device storage requirements for exclusive
553
+ //! // prefix scan
554
+ //! void *d_temp_storage = nullptr;
555
+ //! size_t temp_storage_bytes = 0;
556
+ //! cub::DeviceScan::ExclusiveScan(
557
+ //! d_temp_storage, temp_storage_bytes,
558
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
559
+ //!
560
+ //! // Allocate temporary storage for exclusive prefix scan
561
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
562
+ //!
563
+ //! // Run exclusive prefix min-scan
564
+ //! cub::DeviceScan::ExclusiveScan(
565
+ //! d_temp_storage, temp_storage_bytes,
566
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
567
+ //!
568
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
569
+ //!
570
+ //! @endrst
571
+ //!
572
+ //! @tparam InputIteratorT
573
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
574
+ //!
575
+ //! @tparam OutputIteratorT
576
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
577
+ //!
578
+ //! @tparam ScanOpT
579
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
580
+ //!
581
+ //! @tparam InitValueT
582
+ //! **[inferred]** Type of the `init_value`
583
+ //!
584
+ //! @tparam NumItemsT
585
+ //! **[inferred]** An integral type representing the number of input elements
586
+ //!
587
+ //! @param[in] d_temp_storage
588
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
589
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
590
+ //!
591
+ //! @param[in,out] temp_storage_bytes
592
+ //! Reference to size in bytes of `d_temp_storage` allocation
593
+ //!
594
+ //! @param[in] d_in
595
+ //! Random-access iterator to the input sequence of data items
596
+ //!
597
+ //! @param[out] d_out
598
+ //! Random-access iterator to the output sequence of data items
599
+ //!
600
+ //! @param[in] scan_op
601
+ //! Binary associative scan functor
602
+ //!
603
+ //! @param[in] init_value
604
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
605
+ //!
606
+ //! @param[in] num_items
607
+ //! Total number of input items (i.e., the length of `d_in`)
608
+ //!
609
+ //! @param[in] stream
610
+ //! @rst
611
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
612
+ //! @endrst
613
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
614
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
615
+ void* d_temp_storage,
616
+ size_t& temp_storage_bytes,
617
+ InputIteratorT d_in,
618
+ OutputIteratorT d_out,
619
+ ScanOpT scan_op,
620
+ InitValueT init_value,
621
+ NumItemsT num_items,
622
+ cudaStream_t stream = 0)
623
+ { // Exclusive scan with a user-supplied operator and an immediate seed: forwards to DispatchScan::Dispatch.
624
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); // presumably active only when d_temp_storage is non-null - confirm macro
625
+
626
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
627
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
628
+
629
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
630
+ d_temp_storage,
631
+ temp_storage_bytes,
632
+ d_in,
633
+ d_out,
634
+ scan_op,
635
+ detail::InputValue<InitValueT>(init_value), // seed is handed over wrapped in detail::InputValue
636
+ num_items,
637
+ stream);
638
+ }
639
+
640
+ //! @rst
641
+ //! Computes a device-wide exclusive prefix scan using the specified
642
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
643
+ //! the initial value, and is assigned to ``*d_out``.
644
+ //!
645
+ //! - Supports non-commutative scan operators.
646
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
647
+ //! addition of floating-point types). Results for pseudo-associative
648
+ //! operators may vary from run to run. Additional details can be found in
649
+ //! the @lookback description.
650
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
651
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
652
+ //! shall not overlap in any other way.
653
+ //! - @devicestorage
654
+ //!
655
+ //! Snippet
656
+ //! +++++++++++++++++++++++++++++++++++++++++++++
657
+ //!
658
+ //! The code snippet below illustrates a user-defined exclusive-scan of a
659
+ //! device vector of ``float`` data elements.
660
+ //!
661
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_env_api.cu
662
+ //! :language: c++
663
+ //! :dedent:
664
+ //! :start-after: example-begin exclusive-scan-env-determinism
665
+ //! :end-before: example-end exclusive-scan-env-determinism
666
+ //!
667
+ //! @endrst
668
+ //!
669
+ //! @tparam InputIteratorT
670
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
671
+ //!
672
+ //! @tparam OutputIteratorT
673
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
674
+ //!
675
+ //! @tparam ScanOpT
676
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
677
+ //!
678
+ //! @tparam InitValueT
679
+ //! **[inferred]** Type of the `init_value`
680
+ //!
681
+ //! @tparam NumItemsT
682
+ //! **[inferred]** An integral type representing the number of input elements
683
+ //!
684
+ //! @tparam EnvT
685
+ //! **[inferred]** Execution environment type. Default is `_CUDA_STD_EXEC::env<>`.
686
+ //!
687
+ //! @param[in] d_in
688
+ //! Random-access iterator to the input sequence of data items
689
+ //!
690
+ //! @param[out] d_out
691
+ //! Random-access iterator to the output sequence of data items
692
+ //!
693
+ //! @param[in] scan_op
694
+ //! Binary associative scan functor
695
+ //!
696
+ //! @param[in] init_value
697
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
698
+ //!
699
+ //! @param[in] num_items
700
+ //! Total number of input items (i.e., the length of `d_in`)
701
+ //!
702
+ //! @param[in] env
703
+ //! @rst
704
+ //! **[optional]** Execution environment. Default is `_CUDA_STD_EXEC::env{}`.
705
+ //! @endrst
706
+ template <typename InputIteratorT,
707
+ typename OutputIteratorT,
708
+ typename ScanOpT,
709
+ typename InitValueT,
710
+ typename NumItemsT,
711
+ typename EnvT = _CUDA_STD_EXEC::env<>>
712
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
713
+ InputIteratorT d_in,
714
+ OutputIteratorT d_out,
715
+ ScanOpT scan_op,
716
+ InitValueT init_value,
717
+ NumItemsT num_items,
718
+ EnvT env = {})
719
+ { // Environment-based exclusive scan: no caller-provided temporary storage; everything is delegated to scan_impl_env.
720
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveScan");
721
+
722
+ return scan_impl_env(d_in, d_out, scan_op, detail::InputValue<InitValueT>(init_value), num_items, env); // seed wrapped in detail::InputValue; env forwarded unchanged
723
+ }
724
+
725
+ //! @rst
726
+ //! Computes a device-wide exclusive prefix scan using the specified
727
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
728
+ //! the initial value, and is assigned to ``*d_data``.
729
+ //!
730
+ //! - Supports non-commutative scan operators.
731
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
732
+ //! addition of floating-point types). Results for pseudo-associative
733
+ //! operators may vary from run to run. Additional details can be found in
734
+ //! the @lookback description.
735
+ //! - @devicestorage
736
+ //!
737
+ //! Snippet
738
+ //! +++++++++++++++++++++++++++++++++++++++++++++
739
+ //!
740
+ //! The code snippet below illustrates the exclusive prefix min-scan of an
741
+ //! ``int`` device vector:
742
+ //!
743
+ //! .. code-block:: c++
744
+ //!
745
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
746
+ //! #include <cuda/std/climits> // for INT_MAX
747
+ //!
748
+ //! // CustomMin functor
749
+ //! struct CustomMin
750
+ //! {
751
+ //! template <typename T>
752
+ //! __host__ __device__ __forceinline__
753
+ //! T operator()(const T &a, const T &b) const {
754
+ //! return (b < a) ? b : a;
755
+ //! }
756
+ //! };
757
+ //!
758
+ //! // Declare, allocate, and initialize device-accessible pointers for
759
+ //! // input and output
760
+ //! int num_items; // e.g., 7
761
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
762
+ //! CustomMin min_op;
763
+ //! ...
764
+ //!
765
+ //! // Determine temporary device storage requirements for exclusive
766
+ //! // prefix scan
767
+ //! void *d_temp_storage = nullptr;
768
+ //! size_t temp_storage_bytes = 0;
769
+ //! cub::DeviceScan::ExclusiveScan(
770
+ //! d_temp_storage, temp_storage_bytes,
771
+ //! d_data, min_op, (int) INT_MAX, num_items);
772
+ //!
773
+ //! // Allocate temporary storage for exclusive prefix scan
774
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
775
+ //!
776
+ //! // Run exclusive prefix min-scan
777
+ //! cub::DeviceScan::ExclusiveScan(
778
+ //! d_temp_storage, temp_storage_bytes,
779
+ //! d_data, min_op, (int) INT_MAX, num_items);
780
+ //!
781
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
782
+ //!
783
+ //! @endrst
784
+ //!
785
+ //! @tparam IteratorT
786
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
787
+ //!
788
+ //! @tparam ScanOpT
789
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
790
+ //!
791
+ //! @tparam InitValueT
792
+ //! **[inferred]** Type of the `init_value`
793
+ //!
794
+ //! @tparam NumItemsT
795
+ //! **[inferred]** An integral type representing the number of input elements
796
+ //!
797
+ //! @param[in] d_temp_storage
798
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
799
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
800
+ //!
801
+ //! @param[in,out] temp_storage_bytes
802
+ //! Reference to size in bytes of `d_temp_storage` allocation
803
+ //!
804
+ //! @param[in,out] d_data
805
+ //! Random-access iterator to the sequence of data items
806
+ //!
807
+ //! @param[in] scan_op
808
+ //! Binary associative scan functor
809
+ //!
810
+ //! @param[in] init_value
811
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
812
+ //!
813
+ //! @param[in] num_items
814
+ //! Total number of input items (i.e., the length of `d_data`)
815
+ //!
816
+ //! @param[in] stream
817
+ //! @rst
818
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
819
+ //! @endrst
820
+ template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
821
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
822
+ void* d_temp_storage,
823
+ size_t& temp_storage_bytes,
824
+ IteratorT d_data,
825
+ ScanOpT scan_op,
826
+ InitValueT init_value,
827
+ NumItemsT num_items,
828
+ cudaStream_t stream = 0)
829
+ { // In-place convenience overload: d_data is passed as both the input and the output iterator.
830
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
831
+ }
832
+
833
+ //! @rst
834
+ //! Computes a device-wide exclusive prefix scan using the specified
835
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is provided as a future value.
836
+ //!
837
+ //! - Supports non-commutative scan operators.
838
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
839
+ //! addition of floating-point types). Results for pseudo-associative
840
+ //! operators may vary from run to run. Additional details can be found in
841
+ //! the @lookback description.
842
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
843
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
844
+ //! shall not overlap in any other way.
845
+ //! - @devicestorage
846
+ //!
847
+ //! Snippet
848
+ //! +++++++++++++++++++++++++++++++++++++++++++++
849
+ //!
850
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
851
+ //!
852
+ //! .. code-block:: c++
853
+ //!
854
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
855
+ //! #include <cuda/std/climits> // for INT_MAX
856
+ //!
857
+ //! // CustomMin functor
858
+ //! struct CustomMin
859
+ //! {
860
+ //! template <typename T>
861
+ //! __host__ __device__ __forceinline__
862
+ //! T operator()(const T &a, const T &b) const {
863
+ //! return (b < a) ? b : a;
864
+ //! }
865
+ //! };
866
+ //!
867
+ //! // Declare, allocate, and initialize device-accessible pointers for
868
+ //! // input and output
869
+ //! int num_items; // e.g., 7
870
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
871
+ //! int *d_out; // e.g., [ , , , , , , ]
872
+ //! int *d_init_iter; // e.g., INT_MAX
873
+ //! CustomMin min_op;
874
+ //!
875
+ //! auto future_init_value =
876
+ //! cub::FutureValue<int, int*>(d_init_iter);
877
+ //!
878
+ //! ...
879
+ //!
880
+ //! // Determine temporary device storage requirements for exclusive
881
+ //! // prefix scan
882
+ //! void *d_temp_storage = nullptr;
883
+ //! size_t temp_storage_bytes = 0;
884
+ //! cub::DeviceScan::ExclusiveScan(
885
+ //! d_temp_storage, temp_storage_bytes,
886
+ //! d_in, d_out, min_op, future_init_value, num_items);
887
+ //!
888
+ //! // Allocate temporary storage for exclusive prefix scan
889
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
890
+ //!
891
+ //! // Run exclusive prefix min-scan
892
+ //! cub::DeviceScan::ExclusiveScan(
893
+ //! d_temp_storage, temp_storage_bytes,
894
+ //! d_in, d_out, min_op, future_init_value, num_items);
895
+ //!
896
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
897
+ //!
898
+ //! @endrst
899
+ //!
900
+ //! @tparam InputIteratorT
901
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
902
+ //!
903
+ //! @tparam OutputIteratorT
904
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
905
+ //!
906
+ //! @tparam ScanOpT
907
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
908
+ //!
909
+ //! @tparam InitValueT
910
+ //! **[inferred]** Type of the `init_value`
911
+ //!
912
+ //! @tparam NumItemsT
913
+ //! **[inferred]** An integral type representing the number of input elements
914
+ //!
915
+ //! @param[in] d_temp_storage
916
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
917
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
918
+ //!
919
+ //! @param[in,out] temp_storage_bytes
920
+ //! Reference to size in bytes of `d_temp_storage` allocation
921
+ //!
922
+ //! @param[in] d_in
923
+ //! Pointer to the input sequence of data items
924
+ //!
925
+ //! @param[out] d_out
926
+ //! Pointer to the output sequence of data items
927
+ //!
928
+ //! @param[in] scan_op
929
+ //! Binary associative scan functor
930
+ //!
931
+ //! @param[in] init_value
932
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
933
+ //!
934
+ //! @param[in] num_items
935
+ //! Total number of input items (i.e., the length of `d_in`)
936
+ //!
937
+ //! @param[in] stream
938
+ //! @rst
939
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
940
+ //! @endrst
941
+ template <typename InputIteratorT,
942
+ typename OutputIteratorT,
943
+ typename ScanOpT,
944
+ typename InitValueT,
945
+ typename InitValueIterT = InitValueT*,
946
+ typename NumItemsT = int>
947
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
948
+ void* d_temp_storage,
949
+ size_t& temp_storage_bytes,
950
+ InputIteratorT d_in,
951
+ OutputIteratorT d_out,
952
+ ScanOpT scan_op,
953
+ FutureValue<InitValueT, InitValueIterT> init_value,
954
+ NumItemsT num_items,
955
+ cudaStream_t stream = 0)
956
+ { // Exclusive scan whose seed is supplied as a FutureValue instead of an immediate value; forwards to DispatchScan::Dispatch.
957
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); // presumably active only when d_temp_storage is non-null - confirm macro
958
+
959
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
960
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
961
+
962
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
963
+ d_temp_storage,
964
+ temp_storage_bytes,
965
+ d_in,
966
+ d_out,
967
+ scan_op,
968
+ detail::InputValue<InitValueT>(init_value), // NOTE(review): InitValueIterT is not forwarded to detail::InputValue - confirm non-default iterator types work
969
+ num_items,
970
+ stream);
971
+ }
972
+
973
+ //! @rst
974
+ //! Computes a device-wide exclusive prefix scan using the specified binary associative ``scan_op`` functor.
975
+ //! The ``init_value`` value is provided as a future value.
976
+ //!
977
+ //! - Supports non-commutative scan operators.
978
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
979
+ //! addition of floating-point types). Results for pseudo-associative
980
+ //! operators may vary from run to run. Additional details can be found in
981
+ //! the @lookback description.
982
+ //! - @devicestorage
983
+ //!
984
+ //! Snippet
985
+ //! +++++++++++++++++++++++++++++++++++++++++++++
986
+ //!
987
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
988
+ //!
989
+ //! .. code-block:: c++
990
+ //!
991
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
992
+ //! #include <cuda/std/climits> // for INT_MAX
993
+ //!
994
+ //! // CustomMin functor
995
+ //! struct CustomMin
996
+ //! {
997
+ //! template <typename T>
998
+ //! __host__ __device__ __forceinline__
999
+ //! T operator()(const T &a, const T &b) const {
1000
+ //! return (b < a) ? b : a;
1001
+ //! }
1002
+ //! };
1003
+ //!
1004
+ //! // Declare, allocate, and initialize device-accessible pointers for
1005
+ //! // input and output
1006
+ //! int num_items; // e.g., 7
1007
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1008
+ //! int *d_init_iter; // e.g., INT_MAX
1009
+ //! CustomMin min_op;
1010
+ //!
1011
+ //! auto future_init_value =
1012
+ //! cub::FutureValue<int, int*>(d_init_iter);
1013
+ //!
1014
+ //! ...
1015
+ //!
1016
+ //! // Determine temporary device storage requirements for exclusive
1017
+ //! // prefix scan
1018
+ //! void *d_temp_storage = nullptr;
1019
+ //! size_t temp_storage_bytes = 0;
1020
+ //! cub::DeviceScan::ExclusiveScan(
1021
+ //! d_temp_storage, temp_storage_bytes,
1022
+ //! d_data, min_op, future_init_value, num_items);
1023
+ //!
1024
+ //! // Allocate temporary storage for exclusive prefix scan
1025
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1026
+ //!
1027
+ //! // Run exclusive prefix min-scan
1028
+ //! cub::DeviceScan::ExclusiveScan(
1029
+ //! d_temp_storage, temp_storage_bytes,
1030
+ //! d_data, min_op, future_init_value, num_items);
1031
+ //!
1032
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
1033
+ //!
1034
+ //! @endrst
1035
+ //!
1036
+ //! @tparam IteratorT
1037
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1038
+ //!
1039
+ //! @tparam ScanOpT
1040
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1041
+ //!
1042
+ //! @tparam InitValueT
1043
+ //! **[inferred]** Type of the `init_value`
1044
+ //!
1045
+ //! @tparam NumItemsT
1046
+ //! **[inferred]** An integral type representing the number of input elements
1047
+ //!
1048
+ //! @param[in] d_temp_storage
1049
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1050
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1051
+ //!
1052
+ //! @param[in,out] temp_storage_bytes
1053
+ //! Reference to size in bytes of `d_temp_storage` allocation
1054
+ //!
1055
+ //! @param[in,out] d_data
1056
+ //! Random-access iterator to the sequence of data items
1057
+ //!
1058
+ //! @param[in] scan_op
1059
+ //! Binary associative scan functor
1060
+ //!
1061
+ //! @param[in] init_value
1062
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
1063
+ //!
1064
+ //! @param[in] num_items
1065
+ //! Total number of input items (i.e., the length of `d_data`)
1066
+ //!
1067
+ //! @param[in] stream
1068
+ //! @rst
1069
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1070
+ //! @endrst
1071
+ template <typename IteratorT,
1072
+ typename ScanOpT,
1073
+ typename InitValueT,
1074
+ typename InitValueIterT = InitValueT*,
1075
+ typename NumItemsT = int>
1076
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
1077
+ void* d_temp_storage,
1078
+ size_t& temp_storage_bytes,
1079
+ IteratorT d_data,
1080
+ ScanOpT scan_op,
1081
+ FutureValue<InitValueT, InitValueIterT> init_value,
1082
+ NumItemsT num_items,
1083
+ cudaStream_t stream = 0)
1084
+ {
1085
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
1086
+ }
1087
+
1088
+ //! @} end member group
1089
+ //! @name Inclusive scans
1090
+ //! @{
1091
+
1092
+ //! @rst
1093
+ //! Computes a device-wide inclusive prefix sum.
1094
+ //!
1095
+ //! - Supports non-commutative sum operators.
1096
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1097
+ //! addition of floating-point types). Results for pseudo-associative
1098
+ //! operators may vary from run to run. Additional details can be found in
1099
+ //! the @lookback description.
1100
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1101
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1102
+ //! shall not overlap in any other way.
1103
+ //! - @devicestorage
1104
+ //!
1105
+ //! Snippet
1106
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1107
+ //!
1108
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
1109
+ //!
1110
+ //! .. code-block:: c++
1111
+ //!
1112
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1113
+ //!
1114
+ //! // Declare, allocate, and initialize device-accessible pointers for
1115
+ //! // input and output
1116
+ //! int num_items; // e.g., 7
1117
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1118
+ //! int *d_out; // e.g., [ , , , , , , ]
1119
+ //! ...
1120
+ //!
1121
+ //! // Determine temporary device storage requirements for inclusive
1122
+ //! // prefix sum
1123
+ //! void *d_temp_storage = nullptr;
1124
+ //! size_t temp_storage_bytes = 0;
1125
+ //! cub::DeviceScan::InclusiveSum(
1126
+ //! d_temp_storage, temp_storage_bytes,
1127
+ //! d_in, d_out, num_items);
1128
+ //!
1129
+ //! // Allocate temporary storage for inclusive prefix sum
1130
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1131
+ //!
1132
+ //! // Run inclusive prefix sum
1133
+ //! cub::DeviceScan::InclusiveSum(
1134
+ //! d_temp_storage, temp_storage_bytes,
1135
+ //! d_in, d_out, num_items);
1136
+ //!
1137
+ //! // d_out <-- [8, 14, 21, 26, 29, 29, 38]
1138
+ //!
1139
+ //! @endrst
1140
+ //!
1141
+ //! @tparam InputIteratorT
1142
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1143
+ //!
1144
+ //! @tparam OutputIteratorT
1145
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1146
+ //!
1147
+ //! @tparam NumItemsT
1148
+ //! **[inferred]** An integral type representing the number of input elements
1149
+ //!
1150
+ //! @param[in] d_temp_storage
1151
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1152
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1153
+ //!
1154
+ //! @param[in,out] temp_storage_bytes
1155
+ //! Reference to size in bytes of `d_temp_storage` allocation
1156
+ //!
1157
+ //! @param[in] d_in
1158
+ //! Random-access iterator to the input sequence of data items
1159
+ //!
1160
+ //! @param[out] d_out
1161
+ //! Random-access iterator to the output sequence of data items
1162
+ //!
1163
+ //! @param[in] num_items
1164
+ //! Total number of input items (i.e., the length of `d_in`)
1165
+ //!
1166
+ //! @param[in] stream
1167
+ //! @rst
1168
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1169
+ //! @endrst
1170
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
1171
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
1172
+ void* d_temp_storage,
1173
+ size_t& temp_storage_bytes,
1174
+ InputIteratorT d_in,
1175
+ OutputIteratorT d_out,
1176
+ NumItemsT num_items,
1177
+ cudaStream_t stream = 0)
1178
+ {
1179
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum");
1180
+
1181
+ // Unsigned integer type for global offsets
1182
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1183
+
1184
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, NullType, OffsetT>::Dispatch(
1185
+ d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream);
1186
+ }
1187
+
1188
+ //! @rst
1189
+ //! Computes a device-wide inclusive prefix sum in-place.
1190
+ //!
1191
+ //! - Supports non-commutative sum operators.
1192
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1193
+ //! addition of floating-point types). Results for pseudo-associative
1194
+ //! operators may vary from run to run. Additional details can be found in
1195
+ //! the @lookback description.
1196
+ //! - @devicestorage
1197
+ //!
1198
+ //! Snippet
1199
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1200
+ //!
1201
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
1202
+ //!
1203
+ //! .. code-block:: c++
1204
+ //!
1205
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1206
+ //!
1207
+ //! // Declare, allocate, and initialize device-accessible pointers for
1208
+ //! // input and output
1209
+ //! int num_items; // e.g., 7
1210
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1211
+ //! ...
1212
+ //!
1213
+ //! // Determine temporary device storage requirements for inclusive
1214
+ //! // prefix sum
1215
+ //! void *d_temp_storage = nullptr;
1216
+ //! size_t temp_storage_bytes = 0;
1217
+ //! cub::DeviceScan::InclusiveSum(
1218
+ //! d_temp_storage, temp_storage_bytes,
1219
+ //! d_data, num_items);
1220
+ //!
1221
+ //! // Allocate temporary storage for inclusive prefix sum
1222
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1223
+ //!
1224
+ //! // Run inclusive prefix sum
1225
+ //! cub::DeviceScan::InclusiveSum(
1226
+ //! d_temp_storage, temp_storage_bytes,
1227
+ //! d_data, num_items);
1228
+ //!
1229
+ //! // d_data <-- [8, 14, 21, 26, 29, 29, 38]
1230
+ //!
1231
+ //! @endrst
1232
+ //!
1233
+ //! @tparam IteratorT
1234
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1235
+ //!
1236
+ //! @tparam NumItemsT
1237
+ //! **[inferred]** An integral type representing the number of input elements
1238
+ //!
1239
+ //! @param[in] d_temp_storage
1240
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1241
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1242
+ //!
1243
+ //! @param[in,out] temp_storage_bytes
1244
+ //! Reference to size in bytes of `d_temp_storage` allocation
1245
+ //!
1246
+ //! @param[in,out] d_data
1247
+ //! Random-access iterator to the sequence of data items
1248
+ //!
1249
+ //! @param[in] num_items
1250
+ //! Total number of input items (i.e., the length of `d_data`)
1251
+ //!
1252
+ //! @param[in] stream
1253
+ //! @rst
1254
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1255
+ //! @endrst
1256
+ template <typename IteratorT, typename NumItemsT>
1257
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
1258
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
1259
+ {
1260
+ return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
1261
+ }
1262
+
1263
+ //! @rst
1264
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1265
+ //!
1266
+ //! - Supports non-commutative scan operators.
1267
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1268
+ //! addition of floating-point types). Results for pseudo-associative
1269
+ //! operators may vary from run to run. Additional details can be found in
1270
+ //! the @lookback description.
1271
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1272
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1273
+ //! shall not overlap in any other way.
1274
+ //! - @devicestorage
1275
+ //!
1276
+ //! Snippet
1277
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1278
+ //!
1279
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
1280
+ //!
1281
+ //! .. code-block:: c++
1282
+ //!
1283
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1284
+ //! #include <cuda/std/climits> // for INT_MAX
1285
+ //!
1286
+ //! // CustomMin functor
1287
+ //! struct CustomMin
1288
+ //! {
1289
+ //! template <typename T>
1290
+ //! __host__ __device__ __forceinline__
1291
+ //! T operator()(const T &a, const T &b) const {
1292
+ //! return (b < a) ? b : a;
1293
+ //! }
1294
+ //! };
1295
+ //!
1296
+ //! // Declare, allocate, and initialize device-accessible pointers for
1297
+ //! // input and output
1298
+ //! int num_items; // e.g., 7
1299
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1300
+ //! int *d_out; // e.g., [ , , , , , , ]
1301
+ //! CustomMin min_op;
1302
+ //! ...
1303
+ //!
1304
+ //! // Determine temporary device storage requirements for inclusive
1305
+ //! // prefix scan
1306
+ //! void *d_temp_storage = nullptr;
1307
+ //! size_t temp_storage_bytes = 0;
1308
+ //! cub::DeviceScan::InclusiveScan(
1309
+ //! d_temp_storage, temp_storage_bytes,
1310
+ //! d_in, d_out, min_op, num_items);
1311
+ //!
1312
+ //! // Allocate temporary storage for inclusive prefix scan
1313
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1314
+ //!
1315
+ //! // Run inclusive prefix min-scan
1316
+ //! cub::DeviceScan::InclusiveScan(
1317
+ //! d_temp_storage, temp_storage_bytes,
1318
+ //! d_in, d_out, min_op, num_items);
1319
+ //!
1320
+ //! // d_out <-- [8, 6, 6, 5, 3, 0, 0]
1321
+ //!
1322
+ //! @endrst
1323
+ //!
1324
+ //! @tparam InputIteratorT
1325
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1326
+ //!
1327
+ //! @tparam OutputIteratorT
1328
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1329
+ //!
1330
+ //! @tparam ScanOpT
1331
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1332
+ //!
1333
+ //! @tparam NumItemsT
1334
+ //! **[inferred]** An integral type representing the number of input elements
1335
+ //!
1336
+ //! @param[in] d_temp_storage
1337
+ //! Device-accessible allocation of temporary storage.
1338
+ //! When `nullptr`, the required allocation size is written to
1339
+ //! `temp_storage_bytes` and no work is done.
1340
+ //!
1341
+ //! @param[in,out] temp_storage_bytes
1342
+ //! Reference to size in bytes of `d_temp_storage` allocation
1343
+ //!
1344
+ //! @param[in] d_in
1345
+ //! Random-access iterator to the input sequence of data items
1346
+ //!
1347
+ //! @param[out] d_out
1348
+ //! Random-access iterator to the output sequence of data items
1349
+ //!
1350
+ //! @param[in] scan_op
1351
+ //! Binary associative scan functor
1352
+ //!
1353
+ //! @param[in] num_items
1354
+ //! Total number of input items (i.e., the length of `d_in`)
1355
+ //!
1356
+ //! @param[in] stream
1357
+ //! @rst
1358
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1359
+ //! @endrst
1360
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
1361
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
1362
+ void* d_temp_storage,
1363
+ size_t& temp_storage_bytes,
1364
+ InputIteratorT d_in,
1365
+ OutputIteratorT d_out,
1366
+ ScanOpT scan_op,
1367
+ NumItemsT num_items,
1368
+ cudaStream_t stream = 0)
1369
+ {
1370
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan");
1371
+
1372
+ // Unsigned integer type for global offsets
1373
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1374
+
1375
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
1376
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream);
1377
+ }
1378
+
1379
+ //! @rst
1380
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1381
+ //! The result of applying the ``scan_op`` binary operator to ``init_value`` value and ``*d_in``
1382
+ //! is assigned to ``*d_out``.
1383
+ //!
1384
+ //! - Supports non-commutative scan operators.
1385
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1386
+ //! addition of floating-point types). Results for pseudo-associative
1387
+ //! operators may vary from run to run. Additional details can be found in
1388
+ //! the @lookback description.
1389
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1390
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1391
+ //! shall not overlap in any other way.
1392
+ //! - @devicestorage
1393
+ //!
1394
+ //! Snippet
1395
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1396
+ //!
1397
+ //! The code snippet below illustrates the inclusive max-scan of an ``int`` device vector.
1398
+ //!
1399
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_api.cu
1400
+ //! :language: c++
1401
+ //! :dedent:
1402
+ //! :start-after: example-begin device-inclusive-scan
1403
+ //! :end-before: example-end device-inclusive-scan
1404
+ //!
1405
+ //! @endrst
1406
+ //!
1407
+ //! @tparam InputIteratorT
1408
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1409
+ //!
1410
+ //! @tparam OutputIteratorT
1411
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1412
+ //!
1413
+ //! @tparam ScanOpT
1414
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1415
+ //!
1416
+ //! @tparam InitValueT
1417
+ //! **[inferred]** Type of the `init_value`
1418
+ //!
1419
+ //! @tparam NumItemsT
1420
+ //! **[inferred]** An integral type representing the number of input elements
1421
+ //!
1422
+ //! @param[in] d_temp_storage
1423
+ //! Device-accessible allocation of temporary storage.
1424
+ //! When `nullptr`, the required allocation size is written to
1425
+ //! `temp_storage_bytes` and no work is done.
1426
+ //!
1427
+ //! @param[in,out] temp_storage_bytes
1428
+ //! Reference to the size in bytes of the `d_temp_storage` allocation
1429
+ //!
1430
+ //! @param[in] d_in
1431
+ //! Random-access iterator to the input sequence of data items
1432
+ //!
1433
+ //! @param[out] d_out
1434
+ //! Random-access iterator to the output sequence of data items
1435
+ //!
1436
+ //! @param[in] scan_op
1437
+ //! Binary associative scan functor
1438
+ //!
1439
+ //! @param[in] init_value
1440
+ //! Initial value to seed the inclusive scan (`scan_op(init_value, d_in[0])`
1441
+ //! is assigned to `*d_out`)
1442
+ //!
1443
+ //! @param[in] num_items
1444
+ //! Total number of input items (i.e., the length of `d_in`)
1445
+ //!
1446
+ //! @param[in] stream
1447
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream 0.
1448
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
1449
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit(
1450
+ void* d_temp_storage,
1451
+ size_t& temp_storage_bytes,
1452
+ InputIteratorT d_in,
1453
+ OutputIteratorT d_out,
1454
+ ScanOpT scan_op,
1455
+ InitValueT init_value,
1456
+ NumItemsT num_items,
1457
+ cudaStream_t stream = 0)
1458
+ {
1459
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit");
1460
+
1461
+ // Unsigned integer type for global offsets
1462
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1463
+ using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::it_value_t<InputIteratorT>, InitValueT>;
1464
+
1465
+ return DispatchScan<
1466
+ InputIteratorT,
1467
+ OutputIteratorT,
1468
+ ScanOpT,
1469
+ detail::InputValue<InitValueT>,
1470
+ OffsetT,
1471
+ AccumT,
1472
+ ForceInclusive::Yes>::Dispatch(d_temp_storage,
1473
+ temp_storage_bytes,
1474
+ d_in,
1475
+ d_out,
1476
+ scan_op,
1477
+ detail::InputValue<InitValueT>(init_value),
1478
+ num_items,
1479
+ stream);
1480
+ }
1481
+
1482
+ //! @rst
1483
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1484
+ //!
1485
+ //! - Supports non-commutative scan operators.
1486
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1487
+ //! addition of floating-point types). Results for pseudo-associative
1488
+ //! operators may vary from run to run. Additional details can be found in
1489
+ //! the @lookback description.
1490
+ //! - @devicestorage
1491
+ //!
1492
+ //! Snippet
1493
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1494
+ //!
1495
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
1496
+ //!
1497
+ //! .. code-block:: c++
1498
+ //!
1499
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1500
+ //! #include <cuda/std/climits> // for INT_MAX
1501
+ //!
1502
+ //! // CustomMin functor
1503
+ //! struct CustomMin
1504
+ //! {
1505
+ //! template <typename T>
1506
+ //! __host__ __device__ __forceinline__
1507
+ //! T operator()(const T &a, const T &b) const {
1508
+ //! return (b < a) ? b : a;
1509
+ //! }
1510
+ //! };
1511
+ //!
1512
+ //! // Declare, allocate, and initialize device-accessible pointers for
1513
+ //! // input and output
1514
+ //! int num_items; // e.g., 7
1515
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1516
+ //! CustomMin min_op;
1517
+ //! ...
1518
+ //!
1519
+ //! // Determine temporary device storage requirements for inclusive
1520
+ //! // prefix scan
1521
+ //! void *d_temp_storage = nullptr;
1522
+ //! size_t temp_storage_bytes = 0;
1523
+ //! cub::DeviceScan::InclusiveScan(
1524
+ //! d_temp_storage, temp_storage_bytes,
1525
+ //! d_data, min_op, num_items);
1526
+ //!
1527
+ //! // Allocate temporary storage for inclusive prefix scan
1528
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1529
+ //!
1530
+ //! // Run inclusive prefix min-scan
1531
+ //! cub::DeviceScan::InclusiveScan(
1532
+ //! d_temp_storage, temp_storage_bytes,
1533
+ //! d_data, min_op, num_items);
1534
+ //!
1535
+ //! // d_data <-- [8, 6, 6, 5, 3, 0, 0]
1536
+ //!
1537
+ //! @endrst
1538
+ //!
1539
+ //! @tparam IteratorT
1540
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1541
+ //!
1542
+ //! @tparam ScanOpT
1543
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1544
+ //!
1545
+ //! @tparam NumItemsT
1546
+ //! **[inferred]** An integral type representing the number of input elements
1547
+ //!
1548
+ //! @param[in] d_temp_storage
1549
+ //! Device-accessible allocation of temporary storage.
1550
+ //! When `nullptr`, the required allocation size is written to
1551
+ //! `temp_storage_bytes` and no work is done.
1552
+ //!
1553
+ //! @param[in,out] temp_storage_bytes
1554
+ //! Reference to size in bytes of `d_temp_storage` allocation
1555
+ //!
1556
+ //! @param[in,out] d_data
1557
+ //! Random-access iterator to the sequence of data items
1558
+ //!
1559
+ //! @param[in] scan_op
1560
+ //! Binary associative scan functor
1561
+ //!
1562
+ //! @param[in] num_items
1563
+ //! Total number of input items (i.e., the length of `d_data`)
1564
+ //!
1565
+ //! @param[in] stream
1566
+ //! @rst
1567
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1568
+ //! @endrst
1569
+ template <typename IteratorT, typename ScanOpT, typename NumItemsT>
1570
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
1571
+ void* d_temp_storage,
1572
+ size_t& temp_storage_bytes,
1573
+ IteratorT d_data,
1574
+ ScanOpT scan_op,
1575
+ NumItemsT num_items,
1576
+ cudaStream_t stream = 0)
1577
+ {
1578
+ return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream);
1579
+ }
1580
+
1581
+ //! @rst
1582
+ //! Computes a device-wide exclusive prefix sum-by-key with key equality
1583
+ //! defined by ``equality_op``. The value of ``0`` is applied as the initial
1584
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1585
+ //!
1586
+ //! - Supports non-commutative sum operators.
1587
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1588
+ //! addition of floating-point types). Results for pseudo-associative
1589
+ //! operators may vary from run to run. Additional details can be found in
1590
+ //! the @lookback description.
1591
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1592
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1593
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1594
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1595
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1596
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1597
+ //! - @devicestorage
1598
+ //!
1599
+ //! Snippet
1600
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1601
+ //!
1602
+ //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector.
1603
+ //!
1604
+ //! .. code-block:: c++
1605
+ //!
1606
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1607
+ //!
1608
+ //! // Declare, allocate, and initialize device-accessible pointers for
1609
+ //! // input and output
1610
+ //! int num_items; // e.g., 7
1611
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1612
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1613
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1614
+ //! ...
1615
+ //!
1616
+ //! // Determine temporary device storage requirements
1617
+ //! void *d_temp_storage = nullptr;
1618
+ //! size_t temp_storage_bytes = 0;
1619
+ //! cub::DeviceScan::ExclusiveSumByKey(
1620
+ //! d_temp_storage, temp_storage_bytes,
1621
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1622
+ //!
1623
+ //! // Allocate temporary storage
1624
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1625
+ //!
1626
+ //! // Run exclusive prefix sum
1627
+ //! cub::DeviceScan::ExclusiveSumByKey(
1628
+ //! d_temp_storage, temp_storage_bytes,
1629
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1630
+ //!
1631
+ //! // d_values_out <-- [0, 8, 0, 7, 12, 0, 0]
1632
+ //!
1633
+ //! @endrst
1634
+ //!
1635
+ //! @tparam KeysInputIteratorT
1636
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1637
+ //!
1638
+ //! @tparam ValuesInputIteratorT
1639
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1640
+ //!
1641
+ //! @tparam ValuesOutputIteratorT
1642
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1643
+ //!
1644
+ //! @tparam EqualityOpT
1645
+ //! **[inferred]** Functor type having member
1646
+ //! `bool operator()(const T &a, const T &b)` for binary operations that defines the equality of keys
1647
+ //!
1648
+ //! @tparam NumItemsT
1649
+ //! **[inferred]** An integral type representing the number of input elements
1650
+ //!
1651
+ //! @param[in] d_temp_storage
1652
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1653
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1654
+ //!
1655
+ //! @param[in,out] temp_storage_bytes
1656
+ //! Reference to size in bytes of `d_temp_storage` allocation
1657
+ //!
1658
+ //! @param[in] d_keys_in
1659
+ //! Random-access input iterator to the input sequence of key items
1660
+ //!
1661
+ //! @param[in] d_values_in
1662
+ //! Random-access input iterator to the input sequence of value items
1663
+ //!
1664
+ //! @param[out] d_values_out
1665
+ //! Random-access output iterator to the output sequence of value items
1666
+ //!
1667
+ //! @param[in] num_items
1668
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1669
+ //!
1670
+ //! @param[in] equality_op
1671
+ //! Binary functor that defines the equality of keys.
1672
+ //! Default is cuda::std::equal_to<>{}.
1673
+ //!
1674
+ //! @param[in] stream
1675
+ //! @rst
1676
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1677
+ //! @endrst
1678
+ template <typename KeysInputIteratorT,
1679
+ typename ValuesInputIteratorT,
1680
+ typename ValuesOutputIteratorT,
1681
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1682
+ typename NumItemsT = uint32_t>
1683
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(
1684
+ void* d_temp_storage,
1685
+ size_t& temp_storage_bytes,
1686
+ KeysInputIteratorT d_keys_in,
1687
+ ValuesInputIteratorT d_values_in,
1688
+ ValuesOutputIteratorT d_values_out,
1689
+ NumItemsT num_items,
1690
+ EqualityOpT equality_op = EqualityOpT(),
1691
+ cudaStream_t stream = 0)
1692
+ {
1693
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey");
1694
+
1695
+ // Unsigned integer type for global offsets
1696
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1697
+ using InitT = cub::detail::it_value_t<ValuesInputIteratorT>;
1698
+
1699
+ // Initial value
1700
+ InitT init_value{};
1701
+
1702
+ return DispatchScanByKey<
1703
+ KeysInputIteratorT,
1704
+ ValuesInputIteratorT,
1705
+ ValuesOutputIteratorT,
1706
+ EqualityOpT,
1707
+ ::cuda::std::plus<>,
1708
+ InitT,
1709
+ OffsetT>::Dispatch(d_temp_storage,
1710
+ temp_storage_bytes,
1711
+ d_keys_in,
1712
+ d_values_in,
1713
+ d_values_out,
1714
+ equality_op,
1715
+ ::cuda::std::plus<>{},
1716
+ init_value,
1717
+ num_items,
1718
+ stream);
1719
+ }
1720
+
1721
+ //! @rst
1722
+ //! Computes a device-wide exclusive prefix scan-by-key using the
1723
+ //! specified binary associative ``scan_op`` functor. The key equality is defined by
1724
+ //! ``equality_op``. The ``init_value`` value is applied as the initial
1725
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1726
+ //!
1727
+ //! - Supports non-commutative scan operators.
1728
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1729
+ //! addition of floating-point types). Results for pseudo-associative
1730
+ //! operators may vary from run to run. Additional details can be found in
1731
+ //! the @lookback description.
1732
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1733
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1734
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1735
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1736
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1737
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1738
+ //! - @devicestorage
1739
+ //!
1740
+ //! Snippet
1741
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1742
+ //!
1743
+ //! The code snippet below illustrates the exclusive prefix min-scan-by-key of an ``int`` device vector.
1744
+ //!
1745
+ //! .. code-block:: c++
1746
+ //!
1747
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1748
+ //! #include <cuda/std/climits> // for INT_MAX
1749
+ //!
1750
+ //! // CustomMin functor
1751
+ //! struct CustomMin
1752
+ //! {
1753
+ //! template <typename T>
1754
+ //! __host__ __device__ __forceinline__
1755
+ //! T operator()(const T &a, const T &b) const {
1756
+ //! return (b < a) ? b : a;
1757
+ //! }
1758
+ //! };
1759
+ //!
1760
+ //! // CustomEqual functor
1761
+ //! struct CustomEqual
1762
+ //! {
1763
+ //! template <typename T>
1764
+ //! __host__ __device__ __forceinline__
1765
+ //! bool operator()(const T &a, const T &b) const {
1766
+ //! return a == b;
1767
+ //! }
1768
+ //! };
1769
+ //!
1770
+ //! // Declare, allocate, and initialize device-accessible pointers for
1771
+ //! // input and output
1772
+ //! int num_items; // e.g., 7
1773
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1774
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1775
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1776
+ //! CustomMin min_op;
1777
+ //! CustomEqual equality_op;
1778
+ //! ...
1779
+ //!
1780
+ //! // Determine temporary device storage requirements for exclusive
1781
+ //! // prefix scan
1782
+ //! void *d_temp_storage = nullptr;
1783
+ //! size_t temp_storage_bytes = 0;
1784
+ //! cub::DeviceScan::ExclusiveScanByKey(
1785
+ //! d_temp_storage, temp_storage_bytes,
1786
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1787
+ //! (int) INT_MAX, num_items, equality_op);
1788
+ //!
1789
+ //! // Allocate temporary storage for exclusive prefix scan
1790
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1791
+ //!
1792
+ //! // Run exclusive prefix min-scan
1793
+ //! cub::DeviceScan::ExclusiveScanByKey(
1794
+ //! d_temp_storage, temp_storage_bytes,
1795
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1796
+ //! (int) INT_MAX, num_items, equality_op);
1797
+ //!
1798
+ //! // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0]
1799
+ //!
1800
+ //! @endrst
1801
+ //!
1802
+ //! @tparam KeysInputIteratorT
1803
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1804
+ //!
1805
+ //! @tparam ValuesInputIteratorT
1806
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1807
+ //!
1808
+ //! @tparam ValuesOutputIteratorT
1809
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1810
+ //!
1811
+ //! @tparam ScanOpT
1812
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1813
+ //!
1814
+ //! @tparam InitValueT
1815
+ //! **[inferred]** Type of the `init_value`
1816
+ //!
1817
+ //! @tparam EqualityOpT
1818
+ //! **[inferred]** Functor type having member
1819
+ //! `bool operator()(const T &a, const T &b)` for binary operations that defines the equality of keys
1820
+ //!
1821
+ //! @tparam NumItemsT
1822
+ //! **[inferred]** An integral type representing the number of input elements
1823
+ //!
1824
+ //! @param[in] d_temp_storage
1825
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1826
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1827
+ //!
1828
+ //! @param[in,out] temp_storage_bytes
1829
+ //! Reference to size in bytes of `d_temp_storage` allocation
1830
+ //!
1831
+ //! @param[in] d_keys_in
1832
+ //! Random-access input iterator to the input sequence of key items
1833
+ //!
1834
+ //! @param[in] d_values_in
1835
+ //! Random-access input iterator to the input sequence of value items
1836
+ //!
1837
+ //! @param[out] d_values_out
1838
+ //! Random-access output iterator to the output sequence of value items
1839
+ //!
1840
+ //! @param[in] scan_op
1841
+ //! Binary associative scan functor
1842
+ //!
1843
+ //! @param[in] init_value
1844
+ //! Initial value to seed the exclusive scan (and is assigned to the
1845
+ //! beginning of each segment in `d_values_out`)
1846
+ //!
1847
+ //! @param[in] num_items
1848
+ //! Total number of input items (i.e., the length of `d_keys_in` and
1849
+ //! `d_values_in`)
1850
+ //!
1851
+ //! @param[in] equality_op
1852
+ //! Binary functor that defines the equality of keys.
1853
+ //! Default is cuda::std::equal_to<>{}.
1854
+ //!
1855
+ //! @param[in] stream
1856
+ //! @rst
1857
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1858
+ //! @endrst
1859
+ template <typename KeysInputIteratorT,
1860
+ typename ValuesInputIteratorT,
1861
+ typename ValuesOutputIteratorT,
1862
+ typename ScanOpT,
1863
+ typename InitValueT,
1864
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1865
+ typename NumItemsT = uint32_t>
1866
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(
1867
+ void* d_temp_storage,
1868
+ size_t& temp_storage_bytes,
1869
+ KeysInputIteratorT d_keys_in,
1870
+ ValuesInputIteratorT d_values_in,
1871
+ ValuesOutputIteratorT d_values_out,
1872
+ ScanOpT scan_op,
1873
+ InitValueT init_value,
1874
+ NumItemsT num_items,
1875
+ EqualityOpT equality_op = EqualityOpT(),
1876
+ cudaStream_t stream = 0)
1877
+ {
1878
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey"); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null (i.e. not for the size-query call) -- confirm against the macro definition
1879
+
1880
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
1881
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1882
+
1883
+ return DispatchScanByKey< // thin wrapper: forwards all arguments to the by-key dispatch layer and returns its cudaError_t
1884
+ KeysInputIteratorT,
1885
+ ValuesInputIteratorT,
1886
+ ValuesOutputIteratorT,
1887
+ EqualityOpT, // note: the dispatch layer lists the equality functor before the scan functor, unlike this function's parameter order
1888
+ ScanOpT,
1889
+ InitValueT, // type of the caller-supplied seed value for the exclusive scan
1890
+ OffsetT>::Dispatch(d_temp_storage,
1891
+ temp_storage_bytes,
1892
+ d_keys_in,
1893
+ d_values_in,
1894
+ d_values_out,
1895
+ equality_op, // same reordering as in the template argument list above
1896
+ scan_op,
1897
+ init_value,
1898
+ num_items, // passed as NumItemsT; NOTE(review): presumably converted to OffsetT by Dispatch's parameter type -- confirm
1899
+ stream);
1900
+ }
1901
+
1902
+ //! @rst
1903
+ //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``.
1904
+ //!
1905
+ //! - Supports non-commutative sum operators.
1906
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1907
+ //! addition of floating-point types). Results for pseudo-associative
1908
+ //! operators may vary from run to run. Additional details can be found in
1909
+ //! the @lookback description.
1910
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1911
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1912
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1913
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1914
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1915
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1916
+ //! - @devicestorage
1917
+ //!
1918
+ //! Snippet
1919
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1920
+ //!
1921
+ //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector.
1922
+ //!
1923
+ //! .. code-block:: c++
1924
+ //!
1925
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1926
+ //!
1927
+ //! // Declare, allocate, and initialize device-accessible pointers for
1928
+ //! // input and output
1929
+ //! int num_items; // e.g., 7
1930
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1931
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1932
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1933
+ //! ...
1934
+ //!
1935
+ //! // Determine temporary device storage requirements for inclusive prefix sum
1936
+ //! void *d_temp_storage = nullptr;
1937
+ //! size_t temp_storage_bytes = 0;
1938
+ //! cub::DeviceScan::InclusiveSumByKey(
1939
+ //! d_temp_storage, temp_storage_bytes,
1940
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1941
+ //!
1942
+ //! // Allocate temporary storage for inclusive prefix sum
1943
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1944
+ //!
1945
+ //! // Run inclusive prefix sum
1946
+ //! cub::DeviceScan::InclusiveSumByKey(
1947
+ //! d_temp_storage, temp_storage_bytes,
1948
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1949
+ //!
1950
+ //! // d_values_out <-- [8, 14, 7, 12, 15, 0, 9]
1951
+ //!
1952
+ //! @endrst
1953
+ //!
1954
+ //! @tparam KeysInputIteratorT
1955
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1956
+ //!
1957
+ //! @tparam ValuesInputIteratorT
1958
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1959
+ //!
1960
+ //! @tparam ValuesOutputIteratorT
1961
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1962
+ //!
1963
+ //! @tparam EqualityOpT
1964
+ //! **[inferred]** Functor type having member
1965
+ //! `bool operator()(const T &a, const T &b)` that defines the equality of keys
1966
+ //!
1967
+ //! @tparam NumItemsT
1968
+ //! **[inferred]** An integral type representing the number of input elements
1969
+ //!
1970
+ //! @param[in] d_temp_storage
1971
+ //! Device-accessible allocation of temporary storage.
1972
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
1973
+ //!
1974
+ //! @param[in,out] temp_storage_bytes
1975
+ //! Reference to size in bytes of `d_temp_storage` allocation
1976
+ //!
1977
+ //! @param[in] d_keys_in
1978
+ //! Random-access input iterator to the input sequence of key items
1979
+ //!
1980
+ //! @param[in] d_values_in
1981
+ //! Random-access input iterator to the input sequence of value items
1982
+ //!
1983
+ //! @param[out] d_values_out
1984
+ //! Random-access output iterator to the output sequence of value items
1985
+ //!
1986
+ //! @param[in] num_items
1987
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1988
+ //!
1989
+ //! @param[in] equality_op
1990
+ //! Binary functor that defines the equality of keys.
1991
+ //! Default is cuda::std::equal_to<>{}.
1992
+ //!
1993
+ //! @param[in] stream
1994
+ //! @rst
1995
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1996
+ //! @endrst
1997
+ template <typename KeysInputIteratorT,
1998
+ typename ValuesInputIteratorT,
1999
+ typename ValuesOutputIteratorT,
2000
+ typename EqualityOpT = ::cuda::std::equal_to<>,
2001
+ typename NumItemsT = uint32_t>
2002
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(
2003
+ void* d_temp_storage,
2004
+ size_t& temp_storage_bytes,
2005
+ KeysInputIteratorT d_keys_in,
2006
+ ValuesInputIteratorT d_values_in,
2007
+ ValuesOutputIteratorT d_values_out,
2008
+ NumItemsT num_items,
2009
+ EqualityOpT equality_op = EqualityOpT(),
2010
+ cudaStream_t stream = 0)
2011
+ {
2012
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey"); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null (i.e. not for the size-query call) -- confirm against the macro definition
2013
+
2014
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
2015
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2016
+
2017
+ return DispatchScanByKey< // thin wrapper: forwards all arguments to the by-key dispatch layer and returns its cudaError_t
2018
+ KeysInputIteratorT,
2019
+ ValuesInputIteratorT,
2020
+ ValuesOutputIteratorT,
2021
+ EqualityOpT,
2022
+ ::cuda::std::plus<>, // the scan operator is fixed to addition for the sum variant
2023
+ NullType, // no caller-supplied init value; NOTE(review): presumably NullType in this slot selects the inclusive scan -- confirm in DispatchScanByKey
2024
+ OffsetT>::Dispatch(d_temp_storage,
2025
+ temp_storage_bytes,
2026
+ d_keys_in,
2027
+ d_values_in,
2028
+ d_values_out,
2029
+ equality_op,
2030
+ ::cuda::std::plus<>{}, // default-constructed addition functor matching the template argument above
2031
+ NullType{}, // placeholder occupying the init-value argument position
2032
+ num_items, // passed as NumItemsT; NOTE(review): presumably converted to OffsetT by Dispatch's parameter type -- confirm
2033
+ stream);
2034
+ }
2035
+
2036
+ //! @rst
2037
+ //! Computes a device-wide inclusive prefix scan-by-key using the
2038
+ //! specified binary associative ``scan_op`` functor. The key equality is defined by ``equality_op``.
2039
+ //!
2040
+ //! - Supports non-commutative scan operators.
2041
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
2042
+ //! addition of floating-point types). Results for pseudo-associative
2043
+ //! operators may vary from run to run. Additional details can be found in
2044
+ //! the @lookback description.
2045
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
2046
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
2047
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
2048
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
2049
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
2050
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
2051
+ //! - @devicestorage
2052
+ //!
2053
+ //! Snippet
2054
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2055
+ //!
2056
+ //! The code snippet below illustrates the inclusive prefix min-scan-by-key of an ``int`` device vector.
2057
+ //!
2058
+ //! .. code-block:: c++
2059
+ //!
2060
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
2061
+ //! #include <cuda/std/climits> // for INT_MAX
2062
+ //!
2063
+ //! // CustomMin functor
2064
+ //! struct CustomMin
2065
+ //! {
2066
+ //! template <typename T>
2067
+ //! __host__ __device__ __forceinline__
2068
+ //! T operator()(const T &a, const T &b) const {
2069
+ //! return (b < a) ? b : a;
2070
+ //! }
2071
+ //! };
2072
+ //!
2073
+ //! // CustomEqual functor
2074
+ //! struct CustomEqual
2075
+ //! {
2076
+ //! template <typename T>
2077
+ //! __host__ __device__ __forceinline__
2078
+ //! bool operator()(const T &a, const T &b) const {
2079
+ //! return a == b;
2080
+ //! }
2081
+ //! };
2082
+ //!
2083
+ //! // Declare, allocate, and initialize device-accessible pointers for
2084
+ //! // input and output
2085
+ //! int num_items; // e.g., 7
2086
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
2087
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2088
+ //! int *d_values_out; // e.g., [ , , , , , , ]
2089
+ //! CustomMin min_op;
2090
+ //! CustomEqual equality_op;
2091
+ //! ...
2092
+ //!
2093
+ //! // Determine temporary device storage requirements for inclusive prefix scan
2094
+ //! void *d_temp_storage = nullptr;
2095
+ //! size_t temp_storage_bytes = 0;
2096
+ //! cub::DeviceScan::InclusiveScanByKey(
2097
+ //! d_temp_storage, temp_storage_bytes,
2098
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
2099
+ //!
2100
+ //! // Allocate temporary storage for inclusive prefix scan
2101
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2102
+ //!
2103
+ //! // Run inclusive prefix min-scan
2104
+ //! cub::DeviceScan::InclusiveScanByKey(
2105
+ //! d_temp_storage, temp_storage_bytes,
2106
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
2107
+ //!
2108
+ //! // d_values_out <-- [8, 6, 7, 5, 3, 0, 0]
2109
+ //!
2110
+ //! @endrst
2111
+ //!
2112
+ //! @tparam KeysInputIteratorT
2113
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
2114
+ //!
2115
+ //! @tparam ValuesInputIteratorT
2116
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
2117
+ //!
2118
+ //! @tparam ValuesOutputIteratorT
2119
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
2120
+ //!
2121
+ //! @tparam ScanOpT
2122
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
2123
+ //!
2124
+ //! @tparam EqualityOpT
2125
+ //! **[inferred]** Functor type having member
2126
+ //! `bool operator()(const T &a, const T &b)` that defines the equality of keys
2127
+ //!
2128
+ //! @tparam NumItemsT
2129
+ //! **[inferred]** An integral type representing the number of input elements
2130
+ //!
2131
+ //! @param[in] d_temp_storage
2132
+ //! Device-accessible allocation of temporary storage.
2133
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
2134
+ //!
2135
+ //! @param[in,out] temp_storage_bytes
2136
+ //! Reference to size in bytes of `d_temp_storage` allocation
2137
+ //!
2138
+ //! @param[in] d_keys_in
2139
+ //! Random-access input iterator to the input sequence of key items
2140
+ //!
2141
+ //! @param[in] d_values_in
2142
+ //! Random-access input iterator to the input sequence of value items
2143
+ //!
2144
+ //! @param[out] d_values_out
2145
+ //! Random-access output iterator to the output sequence of value items
2146
+ //!
2147
+ //! @param[in] scan_op
2148
+ //! Binary associative scan functor
2149
+ //!
2150
+ //! @param[in] num_items
2151
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
2152
+ //!
2153
+ //! @param[in] equality_op
2154
+ //! Binary functor that defines the equality of keys.
2155
+ //! Default is cuda::std::equal_to<>{}.
2156
+ //!
2157
+ //! @param[in] stream
2158
+ //! @rst
2159
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2160
+ //! @endrst
2161
+ template <typename KeysInputIteratorT,
2162
+ typename ValuesInputIteratorT,
2163
+ typename ValuesOutputIteratorT,
2164
+ typename ScanOpT,
2165
+ typename EqualityOpT = ::cuda::std::equal_to<>,
2166
+ typename NumItemsT = uint32_t>
2167
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(
2168
+ void* d_temp_storage,
2169
+ size_t& temp_storage_bytes,
2170
+ KeysInputIteratorT d_keys_in,
2171
+ ValuesInputIteratorT d_values_in,
2172
+ ValuesOutputIteratorT d_values_out,
2173
+ ScanOpT scan_op,
2174
+ NumItemsT num_items,
2175
+ EqualityOpT equality_op = EqualityOpT(),
2176
+ cudaStream_t stream = 0)
2177
+ {
2178
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey"); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null (i.e. not for the size-query call) -- confirm against the macro definition
2179
+
2180
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
2181
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2182
+
2183
+ return DispatchScanByKey< // thin wrapper: forwards all arguments to the by-key dispatch layer and returns its cudaError_t
2184
+ KeysInputIteratorT,
2185
+ ValuesInputIteratorT,
2186
+ ValuesOutputIteratorT,
2187
+ EqualityOpT, // note: the dispatch layer lists the equality functor before the scan functor, unlike this function's parameter order
2188
+ ScanOpT,
2189
+ NullType, // no caller-supplied init value; NOTE(review): presumably NullType in this slot selects the inclusive scan -- confirm in DispatchScanByKey
2190
+ OffsetT>::Dispatch(d_temp_storage,
2191
+ temp_storage_bytes,
2192
+ d_keys_in,
2193
+ d_values_in,
2194
+ d_values_out,
2195
+ equality_op, // same reordering as in the template argument list above
2196
+ scan_op,
2197
+ NullType(), // placeholder occupying the init-value argument position
2198
+ num_items, // passed as NumItemsT; NOTE(review): presumably converted to OffsetT by Dispatch's parameter type -- confirm
2199
+ stream);
2200
+ }
2201
+
2202
+ //! @} end member group
2203
+ };
2204
+
2205
+ CUB_NAMESPACE_END