cuda-cccl 0.3.4__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. (The original page links to more details.)

Files changed (1926)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +677 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +722 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +761 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +282 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +702 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +552 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1095 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +562 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1088 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +320 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +605 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1399 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1203 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +400 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1242 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +416 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1203 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2132 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +126 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +642 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2287 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +322 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1223 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +216 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +214 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  52. cuda/cccl/headers/include/cub/config.cuh +29 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +86 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +140 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +98 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +66 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +41 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +39 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +71 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +79 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +39 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2497 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2187 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1406 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +172 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1026 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +449 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1719 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1283 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +504 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +312 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +491 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +577 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +951 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +818 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +339 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +455 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +541 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +521 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +497 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +801 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +557 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +163 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +255 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +52 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1063 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +468 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +594 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +456 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +178 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +296 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +324 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +175 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +141 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +759 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +151 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +489 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +96 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1093 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  177. cuda/cccl/headers/include/cub/version.cuh +65 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +713 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +928 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1866 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +529 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  208. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  209. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  211. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  212. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  213. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  214. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  216. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  217. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  218. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  219. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  220. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  223. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  224. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  225. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  226. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  227. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  228. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  230. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  231. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  232. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  233. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  234. cuda/cccl/headers/include/cuda/__driver/driver_api.h +848 -0
  235. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  236. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  237. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  238. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  239. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  240. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  241. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  242. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  243. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  244. cuda/cccl/headers/include/cuda/__functional/maximum.h +76 -0
  245. cuda/cccl/headers/include/cuda/__functional/minimum.h +76 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  250. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  251. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  253. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  254. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  255. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +492 -0
  256. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  257. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  258. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  259. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  260. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  261. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  264. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +532 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +81 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +103 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +58 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  301. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  302. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
  303. cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
  304. cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
  305. cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
  306. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  307. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +159 -0
  308. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +316 -0
  309. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  310. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  311. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  312. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  313. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
  314. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  315. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  422. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  423. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  424. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  425. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  426. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  427. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  428. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  429. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  430. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  431. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  432. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +44 -0
  433. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  434. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  435. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  436. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +591 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  447. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  448. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  449. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  450. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  451. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  452. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  453. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  454. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +163 -0
  455. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  456. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  457. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  458. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  459. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  460. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  461. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  462. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  463. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  464. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  465. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  466. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  467. cuda/cccl/headers/include/cuda/access_property +26 -0
  468. cuda/cccl/headers/include/cuda/algorithm +27 -0
  469. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  470. cuda/cccl/headers/include/cuda/atomic +27 -0
  471. cuda/cccl/headers/include/cuda/barrier +293 -0
  472. cuda/cccl/headers/include/cuda/bit +29 -0
  473. cuda/cccl/headers/include/cuda/cmath +37 -0
  474. cuda/cccl/headers/include/cuda/devices +33 -0
  475. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  476. cuda/cccl/headers/include/cuda/functional +32 -0
  477. cuda/cccl/headers/include/cuda/iterator +39 -0
  478. cuda/cccl/headers/include/cuda/latch +27 -0
  479. cuda/cccl/headers/include/cuda/mdspan +28 -0
  480. cuda/cccl/headers/include/cuda/memory +36 -0
  481. cuda/cccl/headers/include/cuda/memory_resource +40 -0
  482. cuda/cccl/headers/include/cuda/numeric +31 -0
  483. cuda/cccl/headers/include/cuda/pipeline +580 -0
  484. cuda/cccl/headers/include/cuda/ptx +129 -0
  485. cuda/cccl/headers/include/cuda/semaphore +31 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4437 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  593. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  594. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  595. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  596. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  597. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  598. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  599. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  600. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  601. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  602. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  603. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  604. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  605. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  606. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  607. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  608. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  609. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  610. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  611. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  612. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  613. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  614. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  615. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  616. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +645 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +130 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +354 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  635. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  636. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  637. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  638. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +289 -0
  639. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  640. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  641. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  642. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  643. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  644. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  645. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  646. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  647. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  648. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  650. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  651. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  652. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  653. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  654. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  655. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  656. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  657. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  658. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  660. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  661. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +204 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +185 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  678. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  679. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  680. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  681. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  682. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  683. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  684. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  685. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  686. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  687. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  688. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  693. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  694. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  695. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  696. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  697. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  698. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  699. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  700. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  701. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  702. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  703. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +367 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  716. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  717. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  718. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  719. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  720. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  721. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  722. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  723. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  724. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  725. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  726. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  727. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  728. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  729. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  730. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  731. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  732. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  733. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  734. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  735. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +118 -0
  736. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
  737. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  739. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  740. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  741. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  742. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  743. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  744. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  745. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  750. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  751. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  752. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  753. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  754. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  755. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  756. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  757. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  758. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  759. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  760. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  761. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  762. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  763. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  764. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  765. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  766. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  767. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  768. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  769. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  770. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  771. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  772. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  773. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  774. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  775. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  776. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  777. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  778. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  779. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  780. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  781. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  797. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  798. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  799. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  800. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  801. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  802. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  803. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  804. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  805. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  806. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  807. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  808. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  818. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  819. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  820. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  821. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  822. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  823. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  824. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  825. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  826. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  827. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  828. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  829. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  830. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  831. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  832. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  833. cuda/cccl/headers/include/cuda/std/__internal/features.h +86 -0
  834. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  858. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  859. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  860. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  861. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  862. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  864. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  865. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  866. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  867. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  868. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  869. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  870. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  871. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  872. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  873. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  874. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  875. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +77 -0
  876. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  877. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +137 -0
  878. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  879. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +316 -0
  880. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  881. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  882. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  884. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +753 -0
  885. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  886. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  887. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +603 -0
  888. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  889. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  890. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  891. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  899. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +526 -0
  900. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  901. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  902. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  903. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +242 -0
  904. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  905. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  906. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  907. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  909. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +679 -0
  910. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  911. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  912. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  913. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  914. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  915. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  916. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  917. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  918. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  919. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  920. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  921. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  922. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  923. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  924. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  925. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  926. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  927. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  928. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  929. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  930. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  931. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  932. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  933. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  934. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  935. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  936. cuda/cccl/headers/include/cuda/std/__optional/optional.h +860 -0
  937. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  938. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  939. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  940. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  941. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  942. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  943. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  944. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  945. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  946. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  947. cuda/cccl/headers/include/cuda/std/__random_ +31 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  960. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  961. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +408 -0
  962. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  963. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  964. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  965. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  966. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  967. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  968. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  969. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  970. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  971. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  972. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  973. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  974. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  976. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  977. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  978. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  979. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  980. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  981. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  982. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  983. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  984. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  986. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  987. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  988. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  989. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  990. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  991. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  992. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  993. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  994. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  995. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  996. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  997. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  998. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  999. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1000. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1001. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1002. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1003. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1004. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1005. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1006. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1007. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1008. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1150. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1151. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1152. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1153. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1154. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1155. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1156. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1157. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1158. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1159. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1160. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1161. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1162. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1163. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1164. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1165. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1166. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1167. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1168. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1169. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1170. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1171. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1172. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1173. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1174. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1175. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1176. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1177. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1178. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1179. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1180. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1181. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1182. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1183. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1184. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1185. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1186. cuda/cccl/headers/include/cuda/std/array +518 -0
  1187. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1188. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1189. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1190. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1191. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1192. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1193. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1194. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1195. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1196. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1197. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1198. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1199. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1200. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1201. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1202. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1203. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1204. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1205. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1206. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1718 -0
  1207. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1208. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1209. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1210. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1211. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1212. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1213. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1214. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1215. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1216. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1217. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1218. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1219. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1220. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1221. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1222. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1223. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1224. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1225. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1226. cuda/cccl/headers/include/cuda/std/span +628 -0
  1227. cuda/cccl/headers/include/cuda/std/string_view +923 -0
  1228. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1229. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1230. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1231. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1232. cuda/cccl/headers/include/cuda/std/version +240 -0
  1233. cuda/cccl/headers/include/cuda/stream +31 -0
  1234. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1235. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1236. cuda/cccl/headers/include/cuda/utility +28 -0
  1237. cuda/cccl/headers/include/cuda/version +16 -0
  1238. cuda/cccl/headers/include/cuda/warp +28 -0
  1239. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1240. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1241. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1242. cuda/cccl/headers/include/nv/target +236 -0
  1243. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1244. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1245. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1246. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1247. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1248. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1249. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1250. cuda/cccl/headers/include/thrust/count.h +245 -0
  1251. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1252. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1253. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +624 -0
  1254. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +191 -0
  1255. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1256. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1257. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1258. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1259. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1260. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1261. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1262. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +95 -0
  1263. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1264. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1265. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +250 -0
  1266. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +58 -0
  1267. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +869 -0
  1268. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +583 -0
  1269. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +227 -0
  1270. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +155 -0
  1271. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +190 -0
  1272. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +167 -0
  1273. cuda/cccl/headers/include/thrust/detail/complex/clog.h +217 -0
  1274. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +204 -0
  1275. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1276. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1277. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +76 -0
  1278. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +222 -0
  1279. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +162 -0
  1280. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +172 -0
  1281. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +168 -0
  1282. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +202 -0
  1283. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +127 -0
  1284. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +132 -0
  1285. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1286. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1287. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1288. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1289. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1290. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1291. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1292. cuda/cccl/headers/include/thrust/detail/config/namespace.h +161 -0
  1293. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1294. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1295. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +226 -0
  1296. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +271 -0
  1297. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1298. cuda/cccl/headers/include/thrust/detail/copy.inl +139 -0
  1299. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1300. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1301. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1302. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1303. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1304. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1305. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1306. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1307. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1308. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1309. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1310. cuda/cccl/headers/include/thrust/detail/fill.inl +98 -0
  1311. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1312. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1313. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1314. cuda/cccl/headers/include/thrust/detail/functional/actor.h +211 -0
  1315. cuda/cccl/headers/include/thrust/detail/functional/operators.h +383 -0
  1316. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1317. cuda/cccl/headers/include/thrust/detail/generate.inl +98 -0
  1318. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1319. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1320. cuda/cccl/headers/include/thrust/detail/internal_functional.h +329 -0
  1321. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1322. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1323. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1324. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1325. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1326. cuda/cccl/headers/include/thrust/detail/mismatch.inl +106 -0
  1327. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1328. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1329. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1330. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1331. cuda/cccl/headers/include/thrust/detail/random_bijection.h +175 -0
  1332. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1333. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1334. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1335. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +186 -0
  1336. cuda/cccl/headers/include/thrust/detail/reduce.inl +395 -0
  1337. cuda/cccl/headers/include/thrust/detail/reference.h +518 -0
  1338. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1339. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1340. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1341. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1342. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1343. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1344. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1345. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1346. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1347. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1348. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1349. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1350. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1351. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1352. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1353. cuda/cccl/headers/include/thrust/detail/temporary_array.h +149 -0
  1354. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +119 -0
  1355. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +92 -0
  1356. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1357. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1358. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1359. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1360. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1361. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1362. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1363. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1364. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1365. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1366. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1367. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1368. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1369. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +328 -0
  1370. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1371. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1372. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +98 -0
  1373. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1374. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1375. cuda/cccl/headers/include/thrust/detail/vector_base.h +611 -0
  1376. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1208 -0
  1377. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1378. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1379. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1380. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1381. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1382. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1383. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1384. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1385. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1386. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1387. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1388. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1389. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1390. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1391. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1392. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1393. cuda/cccl/headers/include/thrust/find.h +382 -0
  1394. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1395. cuda/cccl/headers/include/thrust/functional.h +393 -0
  1396. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1397. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1398. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1399. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1400. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1401. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1402. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1403. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1404. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1405. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1406. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +80 -0
  1407. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1408. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1409. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1410. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +181 -0
  1411. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +57 -0
  1412. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1413. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1414. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1415. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +170 -0
  1416. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1417. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1418. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1419. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1420. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1421. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1422. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1423. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1424. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1425. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1426. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1427. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1428. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1429. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1430. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +349 -0
  1431. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1432. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1433. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1434. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1435. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1436. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1437. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1438. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1439. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1440. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1441. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1442. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1443. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1444. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1445. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1446. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1447. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1448. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1449. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1450. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1451. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1452. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1453. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1454. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1455. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1456. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1457. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1458. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1459. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1460. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1461. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1462. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1463. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +155 -0
  1464. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1465. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1466. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1467. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1468. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1469. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1470. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1471. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1472. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1473. cuda/cccl/headers/include/thrust/random/normal_distribution.h +255 -0
  1474. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1475. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1476. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +256 -0
  1477. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1478. cuda/cccl/headers/include/thrust/random.h +118 -0
  1479. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1480. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1481. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1482. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1483. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1484. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1485. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1486. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1487. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1488. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1489. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1506. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1507. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1508. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1509. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1510. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1511. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1512. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1513. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1514. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1515. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1516. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1517. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1518. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1519. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1520. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1521. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1522. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1523. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1524. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1525. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1526. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1527. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1528. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1530. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1531. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1533. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1534. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1535. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1536. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1537. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1538. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1539. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +215 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +282 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +163 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +586 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +73 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +231 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +472 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +82 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +58 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1569. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +204 -0
  1570. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1571. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1572. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1573. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1574. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +780 -0
  1575. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +997 -0
  1576. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1577. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1578. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1579. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +338 -0
  1580. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +411 -0
  1581. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +89 -0
  1582. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1583. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1732 -0
  1584. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +468 -0
  1585. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1586. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1587. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +130 -0
  1588. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1589. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1590. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1591. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1592. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +111 -0
  1593. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +100 -0
  1594. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +286 -0
  1595. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +306 -0
  1596. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1597. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1598. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1599. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1600. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1601. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1602. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +381 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +143 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +64 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +249 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +62 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +205 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +124 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +103 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +280 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +173 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +52 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +52 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +80 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +111 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +79 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +134 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +108 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +297 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +96 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +354 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +113 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +104 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1740. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1742. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1743. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1744. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1745. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1746. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1747. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1748. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1749. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1750. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1751. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1752. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1753. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +73 -0
  1754. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1755. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +83 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +62 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +49 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +189 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +51 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +55 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1793. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +114 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +70 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1807. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1808. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1809. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1810. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1811. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1812. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1813. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1814. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1815. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +78 -0
  1816. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1817. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +120 -0
  1818. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1819. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1820. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1821. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1822. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +272 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +50 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +54 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1844. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1845. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1846. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1847. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1848. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1849. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1850. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +257 -0
  1851. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +153 -0
  1852. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1853. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1854. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1855. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +332 -0
  1856. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1857. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1858. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1859. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1860. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1861. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1862. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1863. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1864. cuda/cccl/headers/include/thrust/version.h +93 -0
  1865. cuda/cccl/headers/include/thrust/zip_function.h +150 -0
  1866. cuda/cccl/headers/include_paths.py +51 -0
  1867. cuda/cccl/parallel/__init__.py +9 -0
  1868. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1869. cuda/cccl/py.typed +0 -0
  1870. cuda/compute/__init__.py +83 -0
  1871. cuda/compute/_bindings.py +79 -0
  1872. cuda/compute/_bindings.pyi +498 -0
  1873. cuda/compute/_bindings_impl.pyx +2415 -0
  1874. cuda/compute/_caching.py +71 -0
  1875. cuda/compute/_cccl_interop.py +422 -0
  1876. cuda/compute/_utils/__init__.py +0 -0
  1877. cuda/compute/_utils/protocols.py +132 -0
  1878. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1879. cuda/compute/algorithms/__init__.py +58 -0
  1880. cuda/compute/algorithms/_histogram.py +243 -0
  1881. cuda/compute/algorithms/_reduce.py +182 -0
  1882. cuda/compute/algorithms/_scan.py +331 -0
  1883. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1884. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1885. cuda/compute/algorithms/_sort/_merge_sort.py +225 -0
  1886. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1887. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1888. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1889. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1890. cuda/compute/algorithms/_transform.py +329 -0
  1891. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1892. cuda/compute/cccl/.gitkeep +0 -0
  1893. cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1894. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1895. cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1896. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1897. cuda/compute/iterators/__init__.py +21 -0
  1898. cuda/compute/iterators/_factories.py +219 -0
  1899. cuda/compute/iterators/_iterators.py +817 -0
  1900. cuda/compute/iterators/_zip_iterator.py +199 -0
  1901. cuda/compute/numba_utils.py +53 -0
  1902. cuda/compute/op.py +3 -0
  1903. cuda/compute/struct.py +272 -0
  1904. cuda/compute/typing.py +37 -0
  1905. cuda/coop/__init__.py +8 -0
  1906. cuda/coop/_caching.py +48 -0
  1907. cuda/coop/_common.py +275 -0
  1908. cuda/coop/_nvrtc.py +92 -0
  1909. cuda/coop/_scan_op.py +181 -0
  1910. cuda/coop/_types.py +937 -0
  1911. cuda/coop/_typing.py +107 -0
  1912. cuda/coop/block/__init__.py +39 -0
  1913. cuda/coop/block/_block_exchange.py +251 -0
  1914. cuda/coop/block/_block_load_store.py +215 -0
  1915. cuda/coop/block/_block_merge_sort.py +125 -0
  1916. cuda/coop/block/_block_radix_sort.py +214 -0
  1917. cuda/coop/block/_block_reduce.py +294 -0
  1918. cuda/coop/block/_block_scan.py +983 -0
  1919. cuda/coop/warp/__init__.py +9 -0
  1920. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1921. cuda/coop/warp/_warp_reduce.py +153 -0
  1922. cuda/coop/warp/_warp_scan.py +78 -0
  1923. cuda_cccl-0.3.4.dist-info/METADATA +78 -0
  1924. cuda_cccl-0.3.4.dist-info/RECORD +1926 -0
  1925. cuda_cccl-0.3.4.dist-info/WHEEL +5 -0
  1926. cuda_cccl-0.3.4.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,986 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ // SPDX-License-Identifier: BSD-3
3
+
4
+ #pragma once
5
+
6
+ #include <cub/config.cuh>
7
+
8
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
9
+ # pragma GCC system_header
10
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
11
+ # pragma clang system_header
12
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
13
+ # pragma system_header
14
+ #endif // no system header
15
+
16
+ #include <cub/agent/agent_scan_by_key.cuh>
17
+ #include <cub/agent/single_pass_scan_operators.cuh>
18
+ #include <cub/block/block_load.cuh>
19
+ #include <cub/block/block_scan.cuh>
20
+ #include <cub/block/block_store.cuh>
21
+ #include <cub/thread/thread_operators.cuh>
22
+ #include <cub/util_device.cuh>
23
+ #include <cub/util_math.cuh>
24
+ #include <cub/util_type.cuh>
25
+
26
+ CUB_NAMESPACE_BEGIN
27
+
28
+ namespace detail::scan_by_key
29
+ {
30
+ enum class primitive_accum
31
+ {
32
+ no,
33
+ yes
34
+ };
35
+ enum class primitive_op
36
+ {
37
+ no,
38
+ yes
39
+ };
40
+ enum class val_size
41
+ {
42
+ _1,
43
+ _2,
44
+ _4,
45
+ _8,
46
+ _16,
47
+ unknown
48
+ };
49
+ enum class key_size
50
+ {
51
+ _1,
52
+ _2,
53
+ _4,
54
+ _8,
55
+ _16,
56
+ unknown
57
+ };
58
+
59
+ template <class AccumT>
60
+ constexpr primitive_accum is_primitive_accum()
61
+ {
62
+ return is_primitive<AccumT>::value ? primitive_accum::yes : primitive_accum::no;
63
+ }
64
+
65
+ template <class ScanOpT>
66
+ constexpr primitive_op is_primitive_op()
67
+ {
68
+ return basic_binary_op_t<ScanOpT>::value ? primitive_op::yes : primitive_op::no;
69
+ }
70
+
71
+ template <class ValueT>
72
+ constexpr val_size classify_val_size()
73
+ {
74
+ return sizeof(ValueT) == 1 ? val_size::_1
75
+ : sizeof(ValueT) == 2 ? val_size::_2
76
+ : sizeof(ValueT) == 4 ? val_size::_4
77
+ : sizeof(ValueT) == 8 ? val_size::_8
78
+ : sizeof(ValueT) == 16
79
+ ? val_size::_16
80
+ : val_size::unknown;
81
+ }
82
+
83
+ template <class KeyT>
84
+ constexpr key_size classify_key_size()
85
+ {
86
+ return sizeof(KeyT) == 1 ? key_size::_1
87
+ : sizeof(KeyT) == 2 ? key_size::_2
88
+ : sizeof(KeyT) == 4 ? key_size::_4
89
+ : sizeof(KeyT) == 8 ? key_size::_8
90
+ : sizeof(KeyT) == 16
91
+ ? key_size::_16
92
+ : key_size::unknown;
93
+ }
94
+
95
+ template <class KeyT,
96
+ class ValueT,
97
+ primitive_op PrimitiveOp,
98
+ key_size KeySize = classify_key_size<KeyT>(),
99
+ val_size ValueSize = classify_val_size<ValueT>(),
100
+ primitive_accum PrimitiveAccumulator = is_primitive_accum<ValueT>()>
101
+ struct sm80_tuning;
102
+
103
+ template <class KeyT, class ValueT>
104
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_1, primitive_accum::yes>
105
+ {
106
+ static constexpr int threads = 128;
107
+ static constexpr int items = 12;
108
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
109
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT;
110
+ using delay_constructor = no_delay_constructor_t<795>;
111
+ };
112
+
113
+ template <class KeyT, class ValueT>
114
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_2, primitive_accum::yes>
115
+ {
116
+ static constexpr int threads = 288;
117
+ static constexpr int items = 12;
118
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
119
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
120
+ using delay_constructor = no_delay_constructor_t<825>;
121
+ };
122
+
123
+ template <class KeyT, class ValueT>
124
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_4, primitive_accum::yes>
125
+ {
126
+ static constexpr int threads = 256;
127
+ static constexpr int items = 15;
128
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
129
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
130
+ using delay_constructor = no_delay_constructor_t<640>;
131
+ };
132
+
133
+ template <class KeyT, class ValueT>
134
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_8, primitive_accum::yes>
135
+ {
136
+ static constexpr int threads = 192;
137
+ static constexpr int items = 10;
138
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
139
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
140
+ using delay_constructor = fixed_delay_constructor_t<124, 1040>;
141
+ };
142
+
143
+ #if _CCCL_HAS_INT128()
144
+ template <class KeyT>
145
+ struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_1, val_size::_16, primitive_accum::no>
146
+ {
147
+ static constexpr int threads = 128;
148
+ static constexpr int items = 19;
149
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
150
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
151
+ using delay_constructor = no_delay_constructor_t<1095>;
152
+ };
153
+
154
+ template <class KeyT>
155
+ struct sm80_tuning<KeyT, __uint128_t, primitive_op::yes, key_size::_1, val_size::_16, primitive_accum::no>
156
+ : sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_1, val_size::_16, primitive_accum::no>
157
+ {};
158
+ #endif
159
+
160
+ template <class KeyT, class ValueT>
161
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_1, primitive_accum::yes>
162
+ {
163
+ static constexpr int threads = 256;
164
+ static constexpr int items = 8;
165
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
166
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT;
167
+ using delay_constructor = no_delay_constructor_t<1070>;
168
+ };
169
+
170
+ template <class KeyT, class ValueT>
171
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_2, primitive_accum::yes>
172
+ {
173
+ static constexpr int threads = 320;
174
+ static constexpr int items = 14;
175
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
176
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
177
+ using delay_constructor = no_delay_constructor_t<625>;
178
+ };
179
+
180
+ template <class KeyT, class ValueT>
181
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_4, primitive_accum::yes>
182
+ {
183
+ static constexpr int threads = 256;
184
+ static constexpr int items = 15;
185
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
186
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
187
+ using delay_constructor = no_delay_constructor_t<1055>;
188
+ };
189
+
190
+ template <class KeyT, class ValueT>
191
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_8, primitive_accum::yes>
192
+ {
193
+ static constexpr int threads = 160;
194
+ static constexpr int items = 17;
195
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
196
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
197
+ using delay_constructor = fixed_delay_constructor_t<160, 695>;
198
+ };
199
+
200
+ #if _CCCL_HAS_INT128()
201
+ template <class KeyT>
202
+ struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_2, val_size::_16, primitive_accum::no>
203
+ {
204
+ static constexpr int threads = 160;
205
+ static constexpr int items = 14;
206
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
207
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
208
+ using delay_constructor = no_delay_constructor_t<1105>;
209
+ };
210
+
211
+ template <class KeyT>
212
+ struct sm80_tuning<KeyT, __uint128_t, primitive_op::yes, key_size::_2, val_size::_16, primitive_accum::no>
213
+ : sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_2, val_size::_16, primitive_accum::no>
214
+ {};
215
+ #endif
216
+
217
+ template <class KeyT, class ValueT>
218
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_1, primitive_accum::yes>
219
+ {
220
+ static constexpr int threads = 128;
221
+ static constexpr int items = 12;
222
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
223
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT;
224
+ using delay_constructor = no_delay_constructor_t<1130>;
225
+ };
226
+
227
+ template <class KeyT, class ValueT>
228
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_2, primitive_accum::yes>
229
+ {
230
+ static constexpr int threads = 256;
231
+ static constexpr int items = 12;
232
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
233
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
234
+ using delay_constructor = no_delay_constructor_t<1130>;
235
+ };
236
+
237
+ template <class KeyT, class ValueT>
238
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_4, primitive_accum::yes>
239
+ {
240
+ static constexpr int threads = 256;
241
+ static constexpr int items = 15;
242
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
243
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
244
+ using delay_constructor = no_delay_constructor_t<1140>;
245
+ };
246
+
247
+ template <class KeyT, class ValueT>
248
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_8, primitive_accum::yes>
249
+ {
250
+ static constexpr int threads = 256;
251
+ static constexpr int items = 9;
252
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
253
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
254
+ using delay_constructor = fixed_delay_constructor_t<888, 635>;
255
+ };
256
+
257
+ #if _CCCL_HAS_INT128()
258
+ template <class KeyT>
259
+ struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_4, val_size::_16, primitive_accum::no>
260
+ {
261
+ static constexpr int threads = 128;
262
+ static constexpr int items = 17;
263
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
264
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
265
+ using delay_constructor = no_delay_constructor_t<1100>;
266
+ };
267
+
268
+ template <class KeyT>
269
+ struct sm80_tuning<KeyT, __uint128_t, primitive_op::yes, key_size::_4, val_size::_16, primitive_accum::no>
270
+ : sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_4, val_size::_16, primitive_accum::no>
271
+ {};
272
+ #endif
273
+
274
+ template <class KeyT, class ValueT>
275
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_1, primitive_accum::yes>
276
+ {
277
+ static constexpr int threads = 128;
278
+ static constexpr int items = 11;
279
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
280
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
281
+ using delay_constructor = no_delay_constructor_t<1120>;
282
+ };
283
+
284
+ template <class KeyT, class ValueT>
285
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_2, primitive_accum::yes>
286
+ {
287
+ static constexpr int threads = 256;
288
+ static constexpr int items = 10;
289
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
290
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
291
+ using delay_constructor = no_delay_constructor_t<1115>;
292
+ };
293
+
294
+ template <class KeyT, class ValueT>
295
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_4, primitive_accum::yes>
296
+ {
297
+ static constexpr int threads = 224;
298
+ static constexpr int items = 13;
299
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
300
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
301
+ using delay_constructor = fixed_delay_constructor_t<24, 1060>;
302
+ };
303
+
304
+ template <class KeyT, class ValueT>
305
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_8, primitive_accum::yes>
306
+ {
307
+ static constexpr int threads = 224;
308
+ static constexpr int items = 10;
309
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
310
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
311
+ using delay_constructor = no_delay_constructor_t<1160>;
312
+ };
313
+
314
+ #if _CCCL_HAS_INT128()
315
+ template <class KeyT>
316
+ struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_8, val_size::_16, primitive_accum::no>
317
+ {
318
+ static constexpr int threads = 320;
319
+ static constexpr int items = 8;
320
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
321
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
322
+ using delay_constructor = no_delay_constructor_t<220>;
323
+ };
324
+
325
+ template <class KeyT>
326
+ struct sm80_tuning<KeyT, __uint128_t, primitive_op::yes, key_size::_8, val_size::_16, primitive_accum::no>
327
+ : sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_8, val_size::_16, primitive_accum::no>
328
+ {};
329
+ #endif
330
+
331
+ template <class KeyT, class ValueT>
332
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_1, primitive_accum::yes>
333
+ {
334
+ static constexpr int threads = 192;
335
+ static constexpr int items = 7;
336
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
337
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
338
+ using delay_constructor = fixed_delay_constructor_t<144, 1120>;
339
+ };
340
+
341
+ template <class KeyT, class ValueT>
342
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_2, primitive_accum::yes>
343
+ {
344
+ static constexpr int threads = 192;
345
+ static constexpr int items = 7;
346
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
347
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
348
+ using delay_constructor = fixed_delay_constructor_t<364, 780>;
349
+ };
350
+
351
+ template <class KeyT, class ValueT>
352
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_4, primitive_accum::yes>
353
+ { // sm80_tuning (consumed by policy_hub::Policy800): primitive op, key_size::_16, val_size::_4, primitive accumulator.
354
+ static constexpr int threads = 256; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
355
+ static constexpr int items = 7; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
356
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
357
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
358
+ using delay_constructor = no_delay_constructor_t<1170>; // last AgentScanByKeyPolicy argument
359
+ };
360
+
361
+ template <class KeyT, class ValueT>
362
+ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_8, primitive_accum::yes>
363
+ { // sm80_tuning (consumed by policy_hub::Policy800): primitive op, key_size::_16, val_size::_8, primitive accumulator.
364
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
365
+ static constexpr int items = 15; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
366
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
367
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
368
+ using delay_constructor = no_delay_constructor_t<1030>; // last AgentScanByKeyPolicy argument
369
+ };
370
+
371
+ #if _CCCL_HAS_INT128() // guard for the specializations below, which name __int128_t / __uint128_t
372
+ template <class KeyT>
373
+ struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_16, val_size::_16, primitive_accum::no>
374
+ { // sm80_tuning for __int128_t values: primitive op, key_size::_16, val_size::_16, primitive_accum::no.
375
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
376
+ static constexpr int items = 15; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
377
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
378
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
379
+ using delay_constructor = no_delay_constructor_t<1160>; // last AgentScanByKeyPolicy argument
380
+ };
381
+
382
+ template <class KeyT>
383
+ struct sm80_tuning<KeyT, __uint128_t, primitive_op::yes, key_size::_16, val_size::_16, primitive_accum::no>
384
+ : sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_16, val_size::_16, primitive_accum::no>
385
+ {}; // __uint128_t values inherit every member of the __int128_t tuning unchanged
386
+ #endif // _CCCL_HAS_INT128()
387
+
388
+ template <class KeyT,
389
+ class ValueT,
390
+ primitive_op PrimitiveOp,
391
+ key_size KeySize = classify_key_size<KeyT>(), // defaulted from KeyT
392
+ val_size ValueSize = classify_val_size<ValueT>(), // defaulted from ValueT
393
+ primitive_accum PrimitiveAccumulator = is_primitive_accum<ValueT>()> // defaulted from ValueT
394
+ struct sm90_tuning; // primary template is only declared; members come solely from the specializations that follow
395
+
396
+ template <class KeyT, class ValueT>
397
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_1, primitive_accum::yes>
398
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_1, val_size::_1, primitive accumulator.
399
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
400
+ static constexpr int items = 12; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
401
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // direct load/store here, unlike the WARP_TRANSPOSE used by most entries
402
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT;
403
+ using delay_constructor = no_delay_constructor_t<650>; // last AgentScanByKeyPolicy argument
404
+ };
405
+
406
+ template <class KeyT, class ValueT>
407
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_2, primitive_accum::yes>
408
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_1, val_size::_2, primitive accumulator.
409
+ static constexpr int threads = 256; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
410
+ static constexpr int items = 16; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
411
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
412
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
413
+ using delay_constructor = fixed_delay_constructor_t<124, 995>; // last AgentScanByKeyPolicy argument
414
+ };
415
+
416
+ template <class KeyT, class ValueT>
417
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_4, primitive_accum::yes>
418
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_1, val_size::_4, primitive accumulator.
419
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
420
+ static constexpr int items = 15; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
421
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
422
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
423
+ using delay_constructor = fixed_delay_constructor_t<488, 545>; // last AgentScanByKeyPolicy argument
424
+ };
425
+
426
+ template <class KeyT, class ValueT>
427
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_8, primitive_accum::yes>
428
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_1, val_size::_8, primitive accumulator.
429
+ static constexpr int threads = 224; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
430
+ static constexpr int items = 10; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
431
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
432
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
433
+ using delay_constructor = fixed_delay_constructor_t<488, 1070>; // last AgentScanByKeyPolicy argument
434
+ };
435
+
436
+ #if _CCCL_HAS_INT128() // guard for the specializations below, which name __int128_t / __uint128_t
437
+ template <class KeyT>
438
+ struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_1, val_size::_16, primitive_accum::no>
439
+ { // sm90_tuning for __int128_t values: primitive op, key_size::_1, val_size::_16, primitive_accum::no.
440
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
441
+ static constexpr int items = 23; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
442
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
443
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
444
+ using delay_constructor = fixed_delay_constructor_t<936, 1105>; // last AgentScanByKeyPolicy argument
445
+ };
446
+
447
+ template <class KeyT>
448
+ struct sm90_tuning<KeyT, __uint128_t, primitive_op::yes, key_size::_1, val_size::_16, primitive_accum::no>
449
+ : sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_1, val_size::_16, primitive_accum::no>
450
+ {}; // __uint128_t values inherit every member of the __int128_t tuning unchanged
451
+ #endif // _CCCL_HAS_INT128()
452
+
453
+ template <class KeyT, class ValueT>
454
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_1, primitive_accum::yes>
455
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_2, val_size::_1, primitive accumulator.
456
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
457
+ static constexpr int items = 12; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
458
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // direct load/store here, unlike the WARP_TRANSPOSE used by most entries
459
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT;
460
+ using delay_constructor = fixed_delay_constructor_t<136, 785>; // last AgentScanByKeyPolicy argument
461
+ };
462
+
463
+ template <class KeyT, class ValueT>
464
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_2, primitive_accum::yes>
465
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_2, val_size::_2, primitive accumulator.
466
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
467
+ static constexpr int items = 20; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
468
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
469
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
470
+ using delay_constructor = no_delay_constructor_t<445>; // last AgentScanByKeyPolicy argument
471
+ };
472
+
473
+ template <class KeyT, class ValueT>
474
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_4, primitive_accum::yes>
475
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_2, val_size::_4, primitive accumulator.
476
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
477
+ static constexpr int items = 22; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
478
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
479
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
480
+ using delay_constructor = fixed_delay_constructor_t<312, 865>; // last AgentScanByKeyPolicy argument
481
+ };
482
+
483
+ template <class KeyT, class ValueT>
484
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_8, primitive_accum::yes>
485
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_2, val_size::_8, primitive accumulator.
486
+ static constexpr int threads = 224; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
487
+ static constexpr int items = 10; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
488
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
489
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
490
+ using delay_constructor = fixed_delay_constructor_t<352, 1170>; // last AgentScanByKeyPolicy argument
491
+ };
492
+
493
+ #if _CCCL_HAS_INT128() // guard for the specializations below, which name __int128_t / __uint128_t
494
+ template <class KeyT>
495
+ struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_2, val_size::_16, primitive_accum::no>
496
+ { // sm90_tuning for __int128_t values: primitive op, key_size::_2, val_size::_16, primitive_accum::no.
497
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
498
+ static constexpr int items = 23; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
499
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
500
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
501
+ using delay_constructor = fixed_delay_constructor_t<504, 1190>; // last AgentScanByKeyPolicy argument
502
+ };
503
+
504
+ template <class KeyT>
505
+ struct sm90_tuning<KeyT, __uint128_t, primitive_op::yes, key_size::_2, val_size::_16, primitive_accum::no>
506
+ : sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_2, val_size::_16, primitive_accum::no>
507
+ {}; // __uint128_t values inherit every member of the __int128_t tuning unchanged
508
+ #endif // _CCCL_HAS_INT128()
509
+
510
+ template <class KeyT, class ValueT>
511
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_1, primitive_accum::yes>
512
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_4, val_size::_1, primitive accumulator.
513
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
514
+ static constexpr int items = 12; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
515
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // direct load/store here, unlike the WARP_TRANSPOSE used by most entries
516
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT;
517
+ using delay_constructor = no_delay_constructor_t<850>; // last AgentScanByKeyPolicy argument
518
+ };
519
+
520
+ template <class KeyT, class ValueT>
521
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_2, primitive_accum::yes>
522
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_4, val_size::_2, primitive accumulator.
523
+ static constexpr int threads = 256; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
524
+ static constexpr int items = 14; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
525
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
526
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
527
+ using delay_constructor = fixed_delay_constructor_t<128, 965>; // last AgentScanByKeyPolicy argument
528
+ };
529
+
530
+ template <class KeyT, class ValueT>
531
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_4, primitive_accum::yes>
532
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_4, val_size::_4, primitive accumulator.
533
+ static constexpr int threads = 288; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
534
+ static constexpr int items = 14; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
535
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
536
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
537
+ using delay_constructor = fixed_delay_constructor_t<700, 1005>; // last AgentScanByKeyPolicy argument
538
+ };
539
+
540
+ template <class KeyT, class ValueT>
541
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_8, primitive_accum::yes>
542
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_4, val_size::_8, primitive accumulator.
543
+ static constexpr int threads = 224; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
544
+ static constexpr int items = 14; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
545
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
546
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
547
+ using delay_constructor = fixed_delay_constructor_t<556, 1195>; // last AgentScanByKeyPolicy argument
548
+ };
549
+
550
+ #if _CCCL_HAS_INT128() // guard for the specializations below, which name __int128_t / __uint128_t
551
+ template <class KeyT>
552
+ struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_4, val_size::_16, primitive_accum::no>
553
+ { // sm90_tuning for __int128_t values: primitive op, key_size::_4, val_size::_16, primitive_accum::no.
554
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
555
+ static constexpr int items = 23; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
556
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
557
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
558
+ using delay_constructor = fixed_delay_constructor_t<512, 1030>; // last AgentScanByKeyPolicy argument
559
+ };
560
+
561
+ template <class KeyT>
562
+ struct sm90_tuning<KeyT, __uint128_t, primitive_op::yes, key_size::_4, val_size::_16, primitive_accum::no>
563
+ : sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_4, val_size::_16, primitive_accum::no>
564
+ {}; // __uint128_t values inherit every member of the __int128_t tuning unchanged
565
+ #endif // _CCCL_HAS_INT128()
566
+
567
+ template <class KeyT, class ValueT>
568
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_1, primitive_accum::yes>
569
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_8, val_size::_1, primitive accumulator.
570
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
571
+ static constexpr int items = 12; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
572
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // direct load/store here, unlike the WARP_TRANSPOSE used by most entries
573
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT;
574
+ using delay_constructor = fixed_delay_constructor_t<504, 1010>; // last AgentScanByKeyPolicy argument
575
+ };
576
+
577
+ template <class KeyT, class ValueT>
578
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_2, primitive_accum::yes>
579
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_8, val_size::_2, primitive accumulator.
580
+ static constexpr int threads = 224; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
581
+ static constexpr int items = 10; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
582
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
583
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
584
+ using delay_constructor = fixed_delay_constructor_t<420, 970>; // last AgentScanByKeyPolicy argument
585
+ };
586
+
587
+ template <class KeyT, class ValueT>
588
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_4, primitive_accum::yes>
589
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_8, val_size::_4, primitive accumulator.
590
+ static constexpr int threads = 192; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
591
+ static constexpr int items = 10; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
592
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
593
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
594
+ using delay_constructor = fixed_delay_constructor_t<500, 1125>; // last AgentScanByKeyPolicy argument
595
+ };
596
+
597
+ template <class KeyT, class ValueT>
598
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_8, primitive_accum::yes>
599
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_8, val_size::_8, primitive accumulator.
600
+ static constexpr int threads = 224; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
601
+ static constexpr int items = 11; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
602
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
603
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
604
+ using delay_constructor = fixed_delay_constructor_t<600, 930>; // last AgentScanByKeyPolicy argument
605
+ };
606
+
607
+ #if _CCCL_HAS_INT128() // guard for the specializations below, which name __int128_t / __uint128_t
608
+ template <class KeyT>
609
+ struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_8, val_size::_16, primitive_accum::no>
610
+ { // sm90_tuning for __int128_t values: primitive op, key_size::_8, val_size::_16, primitive_accum::no.
611
+ static constexpr int threads = 192; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
612
+ static constexpr int items = 15; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
613
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
614
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
615
+ using delay_constructor = fixed_delay_constructor_t<364, 1085>; // last AgentScanByKeyPolicy argument
616
+ };
617
+
618
+ template <class KeyT>
619
+ struct sm90_tuning<KeyT, __uint128_t, primitive_op::yes, key_size::_8, val_size::_16, primitive_accum::no>
620
+ : sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_8, val_size::_16, primitive_accum::no>
621
+ {}; // __uint128_t values inherit every member of the __int128_t tuning unchanged
622
+ #endif // _CCCL_HAS_INT128()
623
+
624
+ template <class KeyT, class ValueT>
625
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_1, primitive_accum::yes>
626
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_16, val_size::_1, primitive accumulator.
627
+ static constexpr int threads = 192; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
628
+ static constexpr int items = 7; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
629
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
630
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
631
+ using delay_constructor = fixed_delay_constructor_t<500, 975>; // last AgentScanByKeyPolicy argument
632
+ };
633
+
634
+ template <class KeyT, class ValueT>
635
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_2, primitive_accum::yes>
636
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_16, val_size::_2, primitive accumulator.
637
+ static constexpr int threads = 224; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
638
+ static constexpr int items = 10; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
639
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
640
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
641
+ using delay_constructor = fixed_delay_constructor_t<164, 1075>; // last AgentScanByKeyPolicy argument
642
+ };
643
+
644
+ template <class KeyT, class ValueT>
645
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_4, primitive_accum::yes>
646
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_16, val_size::_4, primitive accumulator.
647
+ static constexpr int threads = 256; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
648
+ static constexpr int items = 9; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
649
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
650
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
651
+ using delay_constructor = fixed_delay_constructor_t<268, 1120>; // last AgentScanByKeyPolicy argument
652
+ };
653
+
654
+ template <class KeyT, class ValueT>
655
+ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_8, primitive_accum::yes>
656
+ { // sm90_tuning (consumed by policy_hub::Policy900): primitive op, key_size::_16, val_size::_8, primitive accumulator.
657
+ static constexpr int threads = 192; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
658
+ static constexpr int items = 9; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
659
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
660
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
661
+ using delay_constructor = fixed_delay_constructor_t<320, 1200>; // last AgentScanByKeyPolicy argument
662
+ };
663
+
664
+ #if _CCCL_HAS_INT128() // guard for the specializations below, which name __int128_t / __uint128_t
665
+ template <class KeyT>
666
+ struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_16, val_size::_16, primitive_accum::no>
667
+ { // sm90_tuning for __int128_t values: primitive op, key_size::_16, val_size::_16, primitive_accum::no.
668
+ static constexpr int threads = 128; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
669
+ static constexpr int items = 23; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
670
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
671
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
672
+ using delay_constructor = fixed_delay_constructor_t<364, 1050>; // last AgentScanByKeyPolicy argument
673
+ };
674
+
675
+ template <class KeyT>
676
+ struct sm90_tuning<KeyT, __uint128_t, primitive_op::yes, key_size::_16, val_size::_16, primitive_accum::no>
677
+ : sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_16, val_size::_16, primitive_accum::no>
678
+ {}; // __uint128_t values inherit every member of the __int128_t tuning unchanged
679
+ #endif // _CCCL_HAS_INT128()
680
+
681
+ template <class KeyT,
682
+ class ValueT,
683
+ primitive_op PrimitiveOp,
684
+ key_size KeySize = classify_key_size<KeyT>(), // defaulted from KeyT
685
+ val_size ValueSize = classify_val_size<ValueT>(), // defaulted from ValueT
686
+ primitive_accum PrimitiveAccumulator = is_primitive_accum<ValueT>()> // defaulted from ValueT
687
+ struct sm100_tuning; // primary template is only declared; members come solely from the specializations that follow
688
+
689
+ // key_size = 8 bits
690
+ template <class KeyT, class ValueT>
691
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_1, primitive_accum::yes>
692
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_1, val_size::_1, primitive accumulator.
693
+ // ipt_13.tpb_288.ns_420.dcid_0.l2w_745.trp_1.ld_0 1.030222 0.998162 1.027506 1.068348
694
+ static constexpr int items = 13; // ipt_13 in the tuning string
695
+ static constexpr int threads = 288; // tpb_288 in the tuning string
696
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
697
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
698
+ using delay_constructor = no_delay_constructor_t<745>; // argument equals l2w_745 in the tuning string
699
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0 in the tuning string
700
+ };
701
+
702
+ template <class KeyT, class ValueT>
703
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_2, primitive_accum::yes>
704
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_1, val_size::_2, primitive accumulator.
705
+ // ipt_13.tpb_288.ns_388.dcid_1.l2w_570.trp_1.ld_0 1.228612 1.0 1.216841 1.416167
706
+ static constexpr int items = 13; // ipt_13 in the tuning string
707
+ static constexpr int threads = 288; // tpb_288 in the tuning string
708
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
709
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
710
+ using delay_constructor = fixed_delay_constructor_t<388, 570>; // arguments equal ns_388 and l2w_570 (dcid_1)
711
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0 in the tuning string
712
+ };
713
+
714
+ template <class KeyT, class ValueT>
715
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_4, primitive_accum::yes>
716
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_1, val_size::_4, primitive accumulator.
717
+ // ipt_19.tpb_224.ns_1028.dcid_5.l2w_910.trp_1.ld_1 1.163440 1.0 1.146400 1.260684
718
+ static constexpr int items = 19; // ipt_19 in the tuning string
719
+ static constexpr int threads = 224; // tpb_224 in the tuning string
720
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
721
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
722
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<1028, 910>; // arguments equal ns_1028 and l2w_910 (dcid_5)
723
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA; // ld_1 in the tuning string
724
+ };
725
+
726
+ template <class KeyT, class ValueT>
727
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_8, primitive_accum::yes>
728
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_1, val_size::_8, primitive accumulator.
729
+ // ipt_18.tpb_192.ns_432.dcid_1.l2w_1035.trp_1.ld_1 1.177638 0.985417 1.157164 1.296477
730
+ static constexpr int items = 18; // ipt_18 in the tuning string
731
+ static constexpr int threads = 192; // tpb_192 in the tuning string
732
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
733
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
734
+ using delay_constructor = fixed_delay_constructor_t<432, 1035>; // arguments equal ns_432 and l2w_1035 (dcid_1)
735
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA; // ld_1 in the tuning string
736
+ };
737
+
738
+ // key_size = 16 bits
739
+ template <class KeyT, class ValueT>
740
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_1, primitive_accum::yes>
741
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_2, val_size::_1, primitive accumulator.
742
+ // ipt_12.tpb_384.ns_1900.dcid_0.l2w_840.trp_1.ld_0 1.010828 0.985782 1.007993 1.048859
743
+ static constexpr int items = 12; // ipt_12 in the tuning string
744
+ static constexpr int threads = 384; // tpb_384 in the tuning string
745
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
746
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
747
+ using delay_constructor = no_delay_constructor_t<1900>; // NOTE(review): argument equals ns_1900, not l2w_840; the key_size::_1/val_size::_1 entry passes its l2w_ value instead - confirm which is intended
748
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0 in the tuning string
749
+ };
750
+
751
+ template <class KeyT, class ValueT>
752
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_2, primitive_accum::yes>
753
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_2, val_size::_2, primitive accumulator.
754
+ // ipt_14.tpb_160.ns_1736.dcid_7.l2w_170.trp_1.ld_0 1.095207 1.065061 1.100302 1.142857
755
+ static constexpr int items = 14; // ipt_14 in the tuning string
756
+ static constexpr int threads = 160; // tpb_160 in the tuning string
757
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
758
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
759
+ using delay_constructor = exponential_backon_constructor_t<1736, 170>; // arguments equal ns_1736 and l2w_170 (dcid_7)
760
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0 in the tuning string
761
+ };
762
+
763
+ template <class KeyT, class ValueT>
764
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_4, primitive_accum::yes>
765
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_2, val_size::_4, primitive accumulator.
766
+ // ipt_14.tpb_160.ns_336.dcid_1.l2w_805.trp_1.ld_0 1.119313 1.095238 1.122013 1.148681
767
+ static constexpr int items = 14; // ipt_14 in the tuning string
768
+ static constexpr int threads = 160; // tpb_160 in the tuning string
769
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
770
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
771
+ using delay_constructor = fixed_delay_constructor_t<336, 805>; // arguments equal ns_336 and l2w_805 (dcid_1)
772
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0 in the tuning string
773
+ };
774
+
775
+ template <class KeyT, class ValueT>
776
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_8, primitive_accum::yes>
777
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_2, val_size::_8, primitive accumulator. NOTE(review): unlike the sibling sm100 entries, no benchmark tuning string is recorded here.
778
+ static constexpr int items = 13; // 2nd AgentScanByKeyPolicy argument (items_per_thread slot)
779
+ static constexpr int threads = 224; // 1st AgentScanByKeyPolicy argument (presumably threads per block)
780
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
781
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
782
+ using delay_constructor = exponential_backoff_constructor_t<348, 735>; // last AgentScanByKeyPolicy argument
783
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA; // 4th AgentScanByKeyPolicy argument in Policy1000
784
+ };
785
+
786
+ // key_size = 32 bits
787
+ template <class KeyT, class ValueT>
788
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_1, primitive_accum::yes>
789
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_4, val_size::_1, primitive accumulator.
790
+ // todo(gonidlelis): Significant regression. Search more workloads.
791
+ // ipt_20.tpb_224.ns_1436.dcid_7.l2w_155.trp_1.ld_1 1.135878 0.866667 1.106600 1.339708
792
+ static constexpr int items = 20; // ipt_20 in the tuning string
793
+ static constexpr int threads = 224; // tpb_224 in the tuning string
794
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
795
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
796
+ using delay_constructor = exponential_backon_constructor_t<1436, 155>; // arguments equal ns_1436 and l2w_155 (dcid_7)
797
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA; // ld_1 in the tuning string
798
+ };
799
+
800
+ template <class KeyT, class ValueT>
801
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_2, primitive_accum::yes>
802
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_4, val_size::_2, primitive accumulator.
803
+ // ipt_13.tpb_288.ns_620.dcid_7.l2w_925.trp_1.ld_2 1.050929 1.000000 1.047178 1.115809
804
+ static constexpr int items = 13; // ipt_13 in the tuning string
805
+ static constexpr int threads = 288; // tpb_288 in the tuning string
806
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
807
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
808
+ using delay_constructor = exponential_backon_constructor_t<620, 925>; // arguments equal ns_620 and l2w_925 (dcid_7)
809
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA; // NOTE(review): tuning string says ld_2, while entries tagged ld_1 also use LOAD_CA - confirm the ld_ mapping
810
+ };
811
+
812
+ template <class KeyT, class ValueT>
813
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_4, primitive_accum::yes>
814
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_4, val_size::_4, primitive accumulator.
815
+ // ipt_20.tpb_224.ns_1856.dcid_5.l2w_280.trp_1.ld_1 1.247248 1.000000 1.220196 1.446328
816
+ static constexpr int items = 20; // ipt_20 in the tuning string
817
+ static constexpr int threads = 224; // tpb_224 in the tuning string
818
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
819
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
820
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<1856, 280>; // arguments equal ns_1856 and l2w_280 (dcid_5)
821
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA; // ld_1 in the tuning string
822
+ };
823
+
824
+ template <class KeyT, class ValueT>
825
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_8, primitive_accum::yes>
826
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_4, val_size::_8, primitive accumulator.
827
+ // ipt_14.tpb_224.ns_464.dcid_2.l2w_680.trp_1.ld_1 1.070831 1.002088 1.064736 1.105437
828
+ static constexpr int items = 14; // ipt_14 in the tuning string
829
+ static constexpr int threads = 224; // tpb_224 in the tuning string
830
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
831
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
832
+ using delay_constructor = exponential_backoff_constructor_t<464, 680>; // arguments equal ns_464 and l2w_680 (dcid_2); second argument was previously transposed to 860
833
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA; // ld_1 in the tuning string
834
+ };
835
+
836
+ // key_size = 64 bits
837
+ template <class KeyT, class ValueT>
838
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_1, primitive_accum::yes>
839
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_8, val_size::_1, primitive accumulator.
840
+ // ipt_12.tpb_160.ns_532.dcid_0.l2w_850.trp_1.ld_0 1.041966 1.000000 1.037010 1.078399
841
+ static constexpr int items = 12; // ipt_12 in the tuning string
842
+ static constexpr int threads = 160; // tpb_160 in the tuning string
843
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
844
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
845
+ using delay_constructor = no_delay_constructor_t<532>; // NOTE(review): argument equals ns_532, not l2w_850; the key_size::_1/val_size::_1 entry passes its l2w_ value instead - confirm which is intended
846
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0 in the tuning string
847
+ };
848
+
849
+ template <class KeyT, class ValueT>
850
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_2, primitive_accum::yes>
851
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_8, val_size::_2, primitive accumulator.
852
+ // todo(gonidlelis): Significant regression. Search more workloads.
853
+ // ipt_15.tpb_288.ns_988.dcid_7.l2w_335.trp_1.ld_0 1.064413 0.866667 1.045946 1.116803
854
+ static constexpr int items = 15; // ipt_15 in the tuning string
855
+ static constexpr int threads = 288; // tpb_288 in the tuning string
856
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
857
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
858
+ using delay_constructor = exponential_backon_constructor_t<988, 335>; // arguments equal ns_988 and l2w_335 (dcid_7)
859
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0 in the tuning string
860
+ };
861
+
862
+ template <class KeyT, class ValueT>
863
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_4, primitive_accum::yes>
864
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_8, val_size::_4, primitive accumulator.
865
+ // ipt_22.tpb_160.ns_1032.dcid_5.l2w_505.trp_1.ld_2 1.184805 1.000000 1.164843 1.338536
866
+ static constexpr int items = 22; // ipt_22 in the tuning string
867
+ static constexpr int threads = 160; // tpb_160 in the tuning string
868
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
869
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
870
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<1032, 505>; // arguments equal ns_1032 and l2w_505 (dcid_5)
871
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA; // NOTE(review): tuning string says ld_2, while entries tagged ld_1 also use LOAD_CA - confirm the ld_ mapping
872
+ };
873
+
874
+ template <class KeyT, class ValueT>
875
+ struct sm100_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_8, primitive_accum::yes>
876
+ { // sm100_tuning (consumed by policy_hub::Policy1000): primitive op, key_size::_8, val_size::_8, primitive accumulator.
877
+ // ipt_23.tpb_256.ns_1232.dcid_0.l2w_810.trp_1.ld_0 1.067631 1.000000 1.059607 1.135646
878
+ static constexpr int items = 23; // ipt_23 in the tuning string
879
+ static constexpr int threads = 256; // tpb_256 in the tuning string
880
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
881
+ static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
882
+ using delay_constructor = no_delay_constructor_t<1232>; // NOTE(review): argument equals ns_1232, not l2w_810; the key_size::_1/val_size::_1 entry passes its l2w_ value instead - confirm which is intended
883
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0 in the tuning string
884
+ };
885
+
886
+ template <typename KeysInputIteratorT, typename AccumT, typename ValueT, typename ScanOpT>
887
+ struct policy_hub
888
+ { // Defines a chain of ChainedPolicy levels (500 -> 520 -> 800 -> 860 -> 900 -> 1000), each exposing a ScanByKeyPolicyT.
889
+ using key_t = it_value_t<KeysInputIteratorT>; // key type derived from the keys input iterator
890
+ static constexpr int max_input_bytes = static_cast<int>((::cuda::std::max) (sizeof(key_t), sizeof(AccumT))); // larger of key size and accumulator size
891
+ static constexpr int combined_input_bytes = static_cast<int>(sizeof(key_t) + sizeof(AccumT)); // key size plus accumulator size
892
+
893
+ struct Policy500 : ChainedPolicy<500, Policy500, Policy500> // names itself as predecessor, i.e. the end of the chain
894
+ {
895
+ static constexpr int nominal_4b_items_per_thread = 6;
896
+ static constexpr int items_per_thread = // fixed at 6 while both inputs are at most 8 bytes, otherwise derived from the combined size
897
+ max_input_bytes <= 8 ? 6 : Nominal4BItemsToItemsCombined(nominal_4b_items_per_thread, combined_input_bytes);
898
+
899
+ using ScanByKeyPolicyT =
900
+ AgentScanByKeyPolicy<128,
901
+ items_per_thread,
902
+ BLOCK_LOAD_WARP_TRANSPOSE,
903
+ LOAD_CA,
904
+ BLOCK_SCAN_WARP_SCANS,
905
+ BLOCK_STORE_WARP_TRANSPOSE,
906
+ default_reduce_by_key_delay_constructor_t<AccumT, int>>;
907
+ };
908
+
909
+ template <CacheLoadModifier LoadModifier, typename DelayConstructurValueT> // shared untuned policy, parameterized on load modifier and the delay constructor's value type
910
+ struct DefaultPolicy
911
+ {
912
+ static constexpr int nominal_4b_items_per_thread = 9;
913
+ static constexpr int items_per_thread = // fixed at 9 while both inputs are at most 8 bytes, otherwise derived from the combined size
914
+ max_input_bytes <= 8 ? 9 : Nominal4BItemsToItemsCombined(nominal_4b_items_per_thread, combined_input_bytes);
915
+
916
+ using ScanByKeyPolicyT =
917
+ AgentScanByKeyPolicy<256,
918
+ items_per_thread,
919
+ BLOCK_LOAD_WARP_TRANSPOSE,
920
+ LoadModifier,
921
+ BLOCK_SCAN_WARP_SCANS,
922
+ BLOCK_STORE_WARP_TRANSPOSE,
923
+ default_reduce_by_key_delay_constructor_t<DelayConstructurValueT, int>>;
924
+ };
925
+
926
+ struct Policy520 // inherits ScanByKeyPolicyT from DefaultPolicy<LOAD_CA, AccumT>
927
+ : DefaultPolicy<LOAD_CA, AccumT>
928
+ , ChainedPolicy<520, Policy520, Policy500>
929
+ {};
930
+
931
+ // Use values from tuning if a specialization exists, otherwise pick the default
932
+ template <typename Tuning>
933
+ static auto select_agent_policy(int) // preferred overload for the literal 0 argument; only viable when Tuning has the members named below
934
+ -> AgentScanByKeyPolicy<Tuning::threads,
935
+ Tuning::items,
936
+ Tuning::load_algorithm,
937
+ LOAD_DEFAULT,
938
+ BLOCK_SCAN_WARP_SCANS,
939
+ Tuning::store_algorithm,
940
+ typename Tuning::delay_constructor>;
941
+
942
+ template <typename Tuning>
943
+ // FIXME(bgruber): should we rather use `AccumT` instead of `ValueT` like the other default policies?
944
+ static auto select_agent_policy(long) -> typename DefaultPolicy<LOAD_DEFAULT, ValueT>::ScanByKeyPolicyT; // fallback overload: does not touch Tuning's members
945
+
946
+ struct Policy800 : ChainedPolicy<800, Policy800, Policy520>
947
+ {
948
+ using ScanByKeyPolicyT = decltype(select_agent_policy<sm80_tuning<key_t, ValueT, is_primitive_op<ScanOpT>()>>(0)); // declarations are only used in decltype; never called
949
+ };
950
+
951
+ struct Policy860 // same default policy as Policy520, chained after Policy800
952
+ : DefaultPolicy<LOAD_CA, AccumT>
953
+ , ChainedPolicy<860, Policy860, Policy800>
954
+ {};
955
+
956
+ struct Policy900 : ChainedPolicy<900, Policy900, Policy860>
957
+ {
958
+ using ScanByKeyPolicyT = decltype(select_agent_policy<sm90_tuning<key_t, ValueT, is_primitive_op<ScanOpT>()>>(0));
959
+ };
960
+
961
+ struct Policy1000 : ChainedPolicy<1000, Policy1000, Policy900>
962
+ {
963
+ // Use values from tuning if a specialization exists, otherwise pick Policy900
964
+ template <typename Tuning>
965
+ static auto select_agent_policy100(int) // differs from select_agent_policy by taking the load modifier from Tuning::load_modifier
966
+ -> AgentScanByKeyPolicy<Tuning::threads,
967
+ Tuning::items,
968
+ Tuning::load_algorithm,
969
+ Tuning::load_modifier,
970
+ BLOCK_SCAN_WARP_SCANS,
971
+ Tuning::store_algorithm,
972
+ typename Tuning::delay_constructor>;
973
+
974
+ template <typename Tuning>
975
+ // FIXME(bgruber): should we rather use `AccumT` instead of `ValueT` like the other default policies? (Applies transitively here: this fallback reuses Policy900, whose own fallback is DefaultPolicy<LOAD_DEFAULT, ValueT>.)
976
+ static auto select_agent_policy100(long) -> typename Policy900::ScanByKeyPolicyT;
977
+
978
+ using ScanByKeyPolicyT =
979
+ decltype(select_agent_policy100<sm100_tuning<key_t, ValueT, is_primitive_op<ScanOpT>()>>(0));
980
+ };
981
+
982
+ using MaxPolicy = Policy1000; // head of the chain
983
+ };
984
+ } // namespace detail::scan_by_key
985
+
986
+ CUB_NAMESPACE_END