cuda-cccl 0.4.3__cp312-cp312-manylinux_2_26_x86_64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (2024)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +699 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +365 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +721 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +756 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +277 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +715 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +546 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1092 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +564 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_segmented_scan.cuh +292 -0
  24. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1090 -0
  25. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  26. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  27. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  28. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +599 -0
  29. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1384 -0
  30. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  31. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1200 -0
  32. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  33. cuda/cccl/headers/include/cub/block/block_histogram.cuh +396 -0
  34. cuda/cccl/headers/include/cub/block/block_load.cuh +1269 -0
  35. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +437 -0
  36. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1215 -0
  38. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2129 -0
  39. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +124 -0
  40. cuda/cccl/headers/include/cub/block/block_reduce.cuh +661 -0
  41. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  42. cuda/cccl/headers/include/cub/block/block_scan.cuh +2168 -0
  43. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +319 -0
  44. cuda/cccl/headers/include/cub/block/block_store.cuh +1238 -0
  45. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +209 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +207 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  52. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  53. cuda/cccl/headers/include/cub/config.cuh +29 -0
  54. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  55. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  56. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  57. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  58. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  59. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  60. cuda/cccl/headers/include/cub/detail/env_dispatch.cuh +87 -0
  61. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  62. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +87 -0
  63. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +149 -0
  64. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +103 -0
  65. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/array.cuh +41 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/json.cuh +39 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/object.cuh +71 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/string.cuh +79 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json/value.cuh +95 -0
  71. cuda/cccl/headers/include/cub/detail/ptx-json-parser.cuh +39 -0
  72. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  73. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  74. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  75. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  76. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  77. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  78. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  79. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  80. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  81. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  82. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  83. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  84. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  85. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  86. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  87. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  88. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2303 -0
  89. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  90. cuda/cccl/headers/include/cub/device/device_scan.cuh +2152 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1635 -0
  93. cuda/cccl/headers/include/cub/device/device_segmented_scan.cuh +1398 -0
  94. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  95. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  96. cuda/cccl/headers/include/cub/device/device_topk.cuh +521 -0
  97. cuda/cccl/headers/include/cub/device/device_transform.cuh +666 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +50 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +349 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +160 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1849 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +317 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +429 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1066 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +830 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +479 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +256 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +447 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +545 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +638 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_reduce.cuh +410 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_scan.cuh +278 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +899 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +831 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +321 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +454 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +527 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +472 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +669 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +553 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +584 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +178 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +262 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_scan.cuh +77 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1049 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/common.cuh +97 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +268 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +108 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1045 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +681 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +571 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_scan.cuh +108 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +476 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  159. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  160. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  161. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +175 -0
  162. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  163. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  165. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +293 -0
  166. cuda/cccl/headers/include/cub/thread/thread_load.cuh +353 -0
  167. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  168. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  169. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  170. cuda/cccl/headers/include/cub/thread/thread_search.cuh +214 -0
  171. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  172. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  173. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  174. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  175. cuda/cccl/headers/include/cub/util_arch.cuh +176 -0
  176. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  177. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  178. cuda/cccl/headers/include/cub/util_device.cuh +838 -0
  179. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  180. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  181. cuda/cccl/headers/include/cub/util_namespace.cuh +152 -0
  182. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  183. cuda/cccl/headers/include/cub/util_ptx.cuh +483 -0
  184. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +93 -0
  185. cuda/cccl/headers/include/cub/util_type.cuh +1084 -0
  186. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  187. cuda/cccl/headers/include/cub/version.cuh +65 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +567 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +922 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  194. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  195. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  196. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  197. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  198. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1863 -0
  199. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  200. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  201. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/copy.h +199 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/fill.h +110 -0
  204. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +171 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +216 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  211. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  212. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  213. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +528 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  218. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +198 -0
  219. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  220. cuda/cccl/headers/include/cuda/__bit/bitmask.h +89 -0
  221. cuda/cccl/headers/include/cuda/__cccl_config +38 -0
  222. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +123 -0
  223. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ilog.h +194 -0
  225. cuda/cccl/headers/include/cuda/__cmath/ipow.h +111 -0
  226. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  227. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +145 -0
  228. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  229. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  230. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  232. cuda/cccl/headers/include/cuda/__cmath/sincos.h +134 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  235. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  236. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  237. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  238. cuda/cccl/headers/include/cuda/__container/buffer.h +891 -0
  239. cuda/cccl/headers/include/cuda/__container/heterogeneous_iterator.h +436 -0
  240. cuda/cccl/headers/include/cuda/__container/uninitialized_async_buffer.h +416 -0
  241. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  242. cuda/cccl/headers/include/cuda/__device/arch_id.h +194 -0
  243. cuda/cccl/headers/include/cuda/__device/arch_traits.h +553 -0
  244. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  245. cuda/cccl/headers/include/cuda/__device/compute_capability.h +172 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +168 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +178 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +1041 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  254. cuda/cccl/headers/include/cuda/__execution/policy.h +53 -0
  255. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  256. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  257. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  258. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  259. cuda/cccl/headers/include/cuda/__functional/maximum.h +77 -0
  260. cuda/cccl/headers/include/cuda/__functional/minimum.h +77 -0
  261. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  262. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  263. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  264. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  265. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  266. cuda/cccl/headers/include/cuda/__fwd/execution_policy.h +47 -0
  267. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  268. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  269. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  270. cuda/cccl/headers/include/cuda/__hierarchy/dimensions.h +162 -0
  271. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_dimensions.h +986 -0
  272. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_levels.h +494 -0
  273. cuda/cccl/headers/include/cuda/__hierarchy/level_dimensions.h +225 -0
  274. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  275. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +490 -0
  276. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  277. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  278. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  279. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  280. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  281. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  282. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  283. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  284. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +147 -0
  285. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  286. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +555 -0
  287. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +589 -0
  288. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  289. cuda/cccl/headers/include/cuda/__launch/configuration.h +754 -0
  290. cuda/cccl/headers/include/cuda/__launch/host_launch.h +115 -0
  291. cuda/cccl/headers/include/cuda/__launch/launch.h +334 -0
  292. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +531 -0
  293. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +239 -0
  294. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  295. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +118 -0
  296. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_accessor.h +208 -0
  297. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_mdspan.h +129 -0
  298. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  299. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  300. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +77 -0
  301. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  302. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  303. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  304. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  305. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  306. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  307. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  308. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +102 -0
  309. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  310. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +57 -0
  311. cuda/cccl/headers/include/cuda/__memory/address_space.h +256 -0
  312. cuda/cccl/headers/include/cuda/__memory/align_down.h +77 -0
  313. cuda/cccl/headers/include/cuda/__memory/align_up.h +77 -0
  314. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  315. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  316. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  317. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  318. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +60 -0
  319. cuda/cccl/headers/include/cuda/__memory/is_pointer_accessible.h +278 -0
  320. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +92 -0
  321. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  322. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +125 -0
  323. cuda/cccl/headers/include/cuda/__memory_pool/device_memory_pool.h +166 -0
  324. cuda/cccl/headers/include/cuda/__memory_pool/managed_memory_pool.h +161 -0
  325. cuda/cccl/headers/include/cuda/__memory_pool/memory_pool_base.h +644 -0
  326. cuda/cccl/headers/include/cuda/__memory_pool/pinned_memory_pool.h +218 -0
  327. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +882 -0
  328. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  329. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  330. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  331. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +141 -0
  332. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +130 -0
  333. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +135 -0
  334. cuda/cccl/headers/include/cuda/__memory_resource/shared_resource.h +261 -0
  335. cuda/cccl/headers/include/cuda/__memory_resource/synchronous_resource_adapter.h +136 -0
  336. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +331 -0
  337. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  338. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  339. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  340. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  341. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +359 -0
  342. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  343. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +245 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +977 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +302 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +631 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_inval.h +26 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/setmaxnreg.h +58 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +120 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +91 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +693 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +50 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +11437 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +6513 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6726 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +40 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4767 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +48 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +886 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  422. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  423. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  424. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  425. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  426. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  427. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_inval.h +41 -0
  428. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  429. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  430. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  431. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  432. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  433. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  434. cuda/cccl/headers/include/cuda/__ptx/instructions/setmaxnreg.h +41 -0
  435. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  436. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  437. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  438. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  439. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  440. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  441. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  442. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  443. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  444. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  445. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  446. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  447. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  448. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  449. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  450. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  451. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  452. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  453. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  454. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  455. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  456. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +178 -0
  457. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  458. cuda/cccl/headers/include/cuda/__random/pcg_engine.h +398 -0
  459. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  460. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  461. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  462. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  463. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  464. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  465. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +49 -0
  466. cuda/cccl/headers/include/cuda/__stream/invalid_stream.h +47 -0
  467. cuda/cccl/headers/include/cuda/__stream/launch_transform.h +193 -0
  468. cuda/cccl/headers/include/cuda/__stream/stream.h +145 -0
  469. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +356 -0
  470. cuda/cccl/headers/include/cuda/__tma/make_tma_descriptor.h +657 -0
  471. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  472. cuda/cccl/headers/include/cuda/__type_traits/is_instantiable_with.h +47 -0
  473. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  474. cuda/cccl/headers/include/cuda/__type_traits/vector_type.h +355 -0
  475. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  476. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  477. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  478. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  479. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  480. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  481. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  482. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +611 -0
  483. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +170 -0
  484. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +147 -0
  485. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  486. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  487. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  488. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +256 -0
  489. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  490. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  491. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  492. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  493. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +183 -0
  494. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  495. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  496. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  497. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  498. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  499. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  500. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  501. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  502. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  503. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  504. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  505. cuda/cccl/headers/include/cuda/access_property +26 -0
  506. cuda/cccl/headers/include/cuda/algorithm +28 -0
  507. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  508. cuda/cccl/headers/include/cuda/atomic +27 -0
  509. cuda/cccl/headers/include/cuda/barrier +293 -0
  510. cuda/cccl/headers/include/cuda/bit +29 -0
  511. cuda/cccl/headers/include/cuda/buffer +27 -0
  512. cuda/cccl/headers/include/cuda/cmath +38 -0
  513. cuda/cccl/headers/include/cuda/devices +33 -0
  514. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  515. cuda/cccl/headers/include/cuda/functional +32 -0
  516. cuda/cccl/headers/include/cuda/hierarchy +28 -0
  517. cuda/cccl/headers/include/cuda/iterator +39 -0
  518. cuda/cccl/headers/include/cuda/latch +27 -0
  519. cuda/cccl/headers/include/cuda/launch +28 -0
  520. cuda/cccl/headers/include/cuda/mdspan +29 -0
  521. cuda/cccl/headers/include/cuda/memory +37 -0
  522. cuda/cccl/headers/include/cuda/memory_pool +27 -0
  523. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  524. cuda/cccl/headers/include/cuda/numeric +31 -0
  525. cuda/cccl/headers/include/cuda/pipeline +580 -0
  526. cuda/cccl/headers/include/cuda/ptx +131 -0
  527. cuda/cccl/headers/include/cuda/semaphore +31 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/inplace_merge.h +293 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +91 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  582. cuda/cccl/headers/include/cuda/std/__algorithm/nth_element.h +309 -0
  583. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  584. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  585. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  586. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  587. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  588. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  589. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  590. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  591. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if.h +78 -0
  592. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if_not.h +85 -0
  593. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  594. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  595. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  596. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +97 -0
  597. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  598. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  599. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  600. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  601. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  602. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  603. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  605. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  606. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  607. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  608. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  609. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  610. cuda/cccl/headers/include/cuda/std/__algorithm/sample.h +116 -0
  611. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  612. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  613. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  614. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  615. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  616. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  617. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  618. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  619. cuda/cccl/headers/include/cuda/std/__algorithm/shuffle.h +71 -0
  620. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  621. cuda/cccl/headers/include/cuda/std/__algorithm/sort.h +1097 -0
  622. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  623. cuda/cccl/headers/include/cuda/std/__algorithm/stable_partition.h +359 -0
  624. cuda/cccl/headers/include/cuda/std/__algorithm/stable_sort.h +321 -0
  625. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  626. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  627. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  628. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  629. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  630. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  631. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  632. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  633. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  634. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  635. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  637. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  638. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4436 -0
  639. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  640. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  641. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  642. cuda/cccl/headers/include/cuda/std/__atomic/order.h +158 -0
  643. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  644. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  645. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  646. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +242 -0
  647. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +103 -0
  648. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  649. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  650. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  651. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  652. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  653. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  654. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  655. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  656. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  657. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__bit/blsr.h +51 -0
  659. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  660. cuda/cccl/headers/include/cuda/std/__bit/countl.h +191 -0
  661. cuda/cccl/headers/include/cuda/std/__bit/countr.h +202 -0
  662. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  663. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  664. cuda/cccl/headers/include/cuda/std/__bit/integral.h +125 -0
  665. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +172 -0
  666. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  667. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +185 -0
  668. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  670. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  671. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  672. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +494 -0
  673. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +213 -0
  674. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  675. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +87 -0
  677. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  678. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +197 -0
  679. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +355 -0
  680. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  681. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  682. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +139 -0
  683. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +59 -0
  684. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  685. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  686. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  687. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1288 -0
  689. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +312 -0
  690. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +363 -0
  691. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  692. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  693. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  694. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  695. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  696. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  697. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  698. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +171 -0
  699. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  700. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +192 -0
  701. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  702. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  703. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  704. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  705. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  706. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  708. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  710. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  711. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  712. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  713. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  714. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  715. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  716. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  717. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  718. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  719. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  720. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  721. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  722. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  723. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  724. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  725. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +203 -0
  726. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +184 -0
  727. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  728. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  729. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  730. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  731. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  732. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  733. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  734. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  735. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  736. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  737. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  738. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  739. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  740. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  741. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +403 -0
  742. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +119 -0
  743. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +522 -0
  744. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  745. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  746. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  747. cuda/cccl/headers/include/cuda/std/__complex/math.h +161 -0
  748. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  749. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  750. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  751. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  752. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  753. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  754. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  755. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  756. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  757. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  758. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  759. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  760. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +385 -0
  761. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  762. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  763. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  764. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  765. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  766. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  767. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  768. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  769. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  770. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  771. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  772. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  773. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  774. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  775. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  776. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  777. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  778. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  779. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  780. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  781. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  782. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  783. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  784. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  785. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +110 -0
  786. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +108 -0
  787. cuda/cccl/headers/include/cuda/std/__exception/format_error.h +62 -0
  788. cuda/cccl/headers/include/cuda/std/__exception/msg_storage.h +41 -0
  789. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +74 -0
  790. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  792. cuda/cccl/headers/include/cuda/std/__execution/policy.h +90 -0
  793. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  794. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  795. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1051 -0
  796. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  797. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  798. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  799. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  800. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  801. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  802. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +375 -0
  803. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  804. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  805. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  807. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  808. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  809. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  810. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  811. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  812. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  813. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  814. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  815. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  816. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  817. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  818. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  819. cuda/cccl/headers/include/cuda/std/__format/format_context.h +93 -0
  820. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  821. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  822. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  823. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1265 -0
  824. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  825. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  826. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  827. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  828. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  829. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  830. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  831. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  832. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  833. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  834. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  835. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  836. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  837. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +81 -0
  838. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +74 -0
  839. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  840. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  841. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  842. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +76 -0
  843. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  844. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  845. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  846. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +298 -0
  847. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  848. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  849. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  850. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  851. cuda/cccl/headers/include/cuda/std/__functional/operations.h +535 -0
  852. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  853. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  854. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  855. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  856. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +114 -0
  857. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  858. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  859. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  860. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  861. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  862. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  863. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  864. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  865. cuda/cccl/headers/include/cuda/std/__fwd/execution_policy.h +73 -0
  866. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  867. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  868. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  869. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  870. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  871. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  872. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  873. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  874. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  875. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  876. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  877. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  878. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  879. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  880. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  881. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  882. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  883. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  884. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  885. cuda/cccl/headers/include/cuda/std/__internal/atomic.h +55 -0
  886. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  887. cuda/cccl/headers/include/cuda/std/__internal/features.h +104 -0
  888. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +132 -0
  889. cuda/cccl/headers/include/cuda/std/__internal/pstl_config.h +32 -0
  890. cuda/cccl/headers/include/cuda/std/__internal/thread_api.h +58 -0
  891. cuda/cccl/headers/include/cuda/std/__internal/version.h +52 -0
  892. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  893. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +227 -0
  894. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +164 -0
  895. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  896. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  897. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +465 -0
  898. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  899. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  900. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +124 -0
  901. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  902. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +100 -0
  904. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  905. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  906. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  907. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  908. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  909. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  910. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  911. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  912. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  913. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  914. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  915. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  916. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  917. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  918. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  919. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  920. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  921. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  923. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  924. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  925. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  926. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  927. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  928. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  929. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  930. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  931. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  932. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +76 -0
  933. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  934. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +136 -0
  935. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  936. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +315 -0
  937. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  938. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  939. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  940. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +348 -0
  941. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +749 -0
  942. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  943. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  944. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +598 -0
  945. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +515 -0
  946. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +190 -0
  947. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +187 -0
  948. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +339 -0
  949. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +90 -0
  950. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  951. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +82 -0
  952. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  953. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +327 -0
  954. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  955. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  956. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +517 -0
  957. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +59 -0
  958. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  959. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  960. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +241 -0
  961. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  962. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  964. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +99 -0
  966. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  967. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  968. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  970. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  971. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  972. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  973. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  974. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  976. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  978. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  979. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  980. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  981. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  982. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  983. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  984. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  985. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  986. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  987. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  988. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  989. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  990. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  991. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  992. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  993. cuda/cccl/headers/include/cuda/std/__optional/optional.h +861 -0
  994. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +439 -0
  995. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  996. cuda/cccl/headers/include/cuda/std/__pstl/cuda/for_each_n.h +97 -0
  997. cuda/cccl/headers/include/cuda/std/__pstl/dispatch.h +123 -0
  998. cuda/cccl/headers/include/cuda/std/__pstl/for_each.h +71 -0
  999. cuda/cccl/headers/include/cuda/std/__pstl/for_each_n.h +68 -0
  1000. cuda/cccl/headers/include/cuda/std/__random/bernoulli_distribution.h +173 -0
  1001. cuda/cccl/headers/include/cuda/std/__random/binomial_distribution.h +254 -0
  1002. cuda/cccl/headers/include/cuda/std/__random/cauchy_distribution.h +192 -0
  1003. cuda/cccl/headers/include/cuda/std/__random/chi_squared_distribution.h +179 -0
  1004. cuda/cccl/headers/include/cuda/std/__random/exponential_distribution.h +187 -0
  1005. cuda/cccl/headers/include/cuda/std/__random/extreme_value_distribution.h +196 -0
  1006. cuda/cccl/headers/include/cuda/std/__random/fisher_f_distribution.h +196 -0
  1007. cuda/cccl/headers/include/cuda/std/__random/gamma_distribution.h +257 -0
  1008. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  1009. cuda/cccl/headers/include/cuda/std/__random/geometric_distribution.h +179 -0
  1010. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  1011. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +70 -0
  1012. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  1013. cuda/cccl/headers/include/cuda/std/__random/lognormal_distribution.h +174 -0
  1014. cuda/cccl/headers/include/cuda/std/__random/negative_binomial_distribution.h +212 -0
  1015. cuda/cccl/headers/include/cuda/std/__random/normal_distribution.h +232 -0
  1016. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  1017. cuda/cccl/headers/include/cuda/std/__random/poisson_distribution.h +338 -0
  1018. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  1019. cuda/cccl/headers/include/cuda/std/__random/student_t_distribution.h +186 -0
  1020. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +341 -0
  1021. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +192 -0
  1022. cuda/cccl/headers/include/cuda/std/__random/weibull_distribution.h +189 -0
  1023. cuda/cccl/headers/include/cuda/std/__random_ +47 -0
  1024. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  1025. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  1026. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +889 -0
  1027. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  1028. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  1029. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  1030. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  1031. cuda/cccl/headers/include/cuda/std/__ranges/drop_view.h +389 -0
  1032. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  1033. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  1034. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  1035. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  1036. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  1037. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +264 -0
  1038. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +409 -0
  1039. cuda/cccl/headers/include/cuda/std/__ranges/non_propagating_cache.h +210 -0
  1040. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +163 -0
  1041. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +111 -0
  1042. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  1043. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  1044. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  1045. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +346 -0
  1046. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  1047. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  1048. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +510 -0
  1049. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +472 -0
  1050. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  1051. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  1052. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +52 -0
  1053. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  1054. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  1055. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  1056. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  1057. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +190 -0
  1058. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +580 -0
  1059. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  1060. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  1061. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  1062. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  1063. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  1064. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  1065. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  1067. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  1068. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  1069. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  1070. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  1071. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  1072. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  1073. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  1074. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  1075. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  1076. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  1077. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1078. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1079. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1080. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1081. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1082. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1083. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1084. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1085. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1086. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +155 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +49 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +63 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1150. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1151. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1152. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1153. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1154. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1155. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1156. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1157. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +72 -0
  1158. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1159. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1160. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1161. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1162. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1163. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1164. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1165. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1166. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1167. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1168. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1169. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1170. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1171. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1172. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1173. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1174. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1175. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1176. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1177. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1178. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1179. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1180. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1181. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1182. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1183. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1184. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1185. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1186. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1187. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1188. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1189. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1190. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1191. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1192. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1193. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1194. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1195. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1196. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1197. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1198. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1199. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1200. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1201. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1202. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1203. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1204. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1205. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1206. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1207. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1208. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1209. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1210. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1211. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1212. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1213. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1214. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1215. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1216. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1217. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1218. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1219. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1220. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1221. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1222. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1223. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1224. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1225. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1226. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1227. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1228. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +73 -0
  1229. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1230. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +114 -0
  1231. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1232. cuda/cccl/headers/include/cuda/std/__utility/ctad_support.h +27 -0
  1233. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1234. cuda/cccl/headers/include/cuda/std/__utility/delegate_constructors.h +51 -0
  1235. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  1236. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1237. cuda/cccl/headers/include/cuda/std/__utility/forward.h +82 -0
  1238. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +82 -0
  1239. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1240. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1241. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1242. cuda/cccl/headers/include/cuda/std/__utility/move.h +126 -0
  1243. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1244. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1245. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +425 -0
  1246. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1247. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1248. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1249. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1250. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1251. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1252. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1253. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1254. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1255. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1256. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1257. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1258. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1259. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1260. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1261. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1262. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1263. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1264. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1265. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1266. cuda/cccl/headers/include/cuda/std/algorithm +138 -0
  1267. cuda/cccl/headers/include/cuda/std/array +519 -0
  1268. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1269. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1270. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1271. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1272. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1273. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1274. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1275. cuda/cccl/headers/include/cuda/std/charconv +31 -0
  1276. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1277. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1278. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1279. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1280. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1281. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1282. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1283. cuda/cccl/headers/include/cuda/std/cstdlib +31 -0
  1284. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1285. cuda/cccl/headers/include/cuda/std/ctime +155 -0
  1286. cuda/cccl/headers/include/cuda/std/detail/__config +22 -0
  1287. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1288. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1289. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1290. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1291. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1292. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1293. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1294. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1295. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1296. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1297. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1298. cuda/cccl/headers/include/cuda/std/memory +40 -0
  1299. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1300. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1301. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1302. cuda/cccl/headers/include/cuda/std/ranges +70 -0
  1303. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1304. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1305. cuda/cccl/headers/include/cuda/std/source_location +107 -0
  1306. cuda/cccl/headers/include/cuda/std/span +599 -0
  1307. cuda/cccl/headers/include/cuda/std/string_view +924 -0
  1308. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1309. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1310. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1311. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1312. cuda/cccl/headers/include/cuda/std/version +240 -0
  1313. cuda/cccl/headers/include/cuda/stream +32 -0
  1314. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1315. cuda/cccl/headers/include/cuda/tma +25 -0
  1316. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1317. cuda/cccl/headers/include/cuda/utility +28 -0
  1318. cuda/cccl/headers/include/cuda/version +16 -0
  1319. cuda/cccl/headers/include/cuda/warp +28 -0
  1320. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1321. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1322. cuda/cccl/headers/include/nv/detail/__target_macros +739 -0
  1323. cuda/cccl/headers/include/nv/target +241 -0
  1324. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1325. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1326. cuda/cccl/headers/include/thrust/advance.h +60 -0
  1327. cuda/cccl/headers/include/thrust/allocate_unique.h +301 -0
  1328. cuda/cccl/headers/include/thrust/binary_search.h +1911 -0
  1329. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1330. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1331. cuda/cccl/headers/include/thrust/count.h +245 -0
  1332. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1333. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1334. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +629 -0
  1335. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +193 -0
  1336. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1337. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1338. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1339. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1340. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1341. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1342. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1343. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +96 -0
  1344. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1345. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1346. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +210 -0
  1347. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +877 -0
  1348. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +591 -0
  1349. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +234 -0
  1350. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +162 -0
  1351. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +194 -0
  1352. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +170 -0
  1353. cuda/cccl/headers/include/thrust/detail/complex/clog.h +222 -0
  1354. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +209 -0
  1355. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1356. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +53 -0
  1357. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +75 -0
  1358. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1359. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +169 -0
  1360. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1361. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1362. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +212 -0
  1363. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +136 -0
  1364. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +130 -0
  1365. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1366. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1367. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1368. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1369. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1370. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1371. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1372. cuda/cccl/headers/include/thrust/detail/config/namespace.h +164 -0
  1373. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1374. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1375. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +227 -0
  1376. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +272 -0
  1377. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1378. cuda/cccl/headers/include/thrust/detail/copy.inl +146 -0
  1379. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1380. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1381. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1382. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1383. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1384. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1385. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1386. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1387. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1388. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1389. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1390. cuda/cccl/headers/include/thrust/detail/fill.inl +97 -0
  1391. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1392. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1393. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1394. cuda/cccl/headers/include/thrust/detail/functional/actor.h +213 -0
  1395. cuda/cccl/headers/include/thrust/detail/functional/operators.h +384 -0
  1396. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1397. cuda/cccl/headers/include/thrust/detail/generate.inl +97 -0
  1398. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1399. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1400. cuda/cccl/headers/include/thrust/detail/internal_functional.h +335 -0
  1401. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1402. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1403. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1404. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +208 -0
  1405. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1406. cuda/cccl/headers/include/thrust/detail/mismatch.inl +107 -0
  1407. cuda/cccl/headers/include/thrust/detail/nvtx_policy.h +41 -0
  1408. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1409. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1410. cuda/cccl/headers/include/thrust/detail/pointer.h +313 -0
  1411. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1412. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1413. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1414. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1415. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1416. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +191 -0
  1417. cuda/cccl/headers/include/thrust/detail/reduce.inl +396 -0
  1418. cuda/cccl/headers/include/thrust/detail/reference.h +521 -0
  1419. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1420. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1421. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1422. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1423. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1424. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1425. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1426. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1427. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1428. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1429. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1430. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1431. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1432. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1433. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1434. cuda/cccl/headers/include/thrust/detail/temporary_array.h +150 -0
  1435. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +121 -0
  1436. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +93 -0
  1437. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1438. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1439. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1440. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1441. cuda/cccl/headers/include/thrust/detail/type_deduction.h +61 -0
  1442. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +50 -0
  1443. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1444. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1445. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1446. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1447. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1448. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +48 -0
  1449. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +91 -0
  1450. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1451. cuda/cccl/headers/include/thrust/detail/type_traits.h +143 -0
  1452. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1453. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +97 -0
  1454. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1455. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1456. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1457. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1216 -0
  1458. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1459. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1460. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1461. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1462. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1463. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1464. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1465. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1466. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1467. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1468. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1469. cuda/cccl/headers/include/thrust/distance.h +44 -0
  1470. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1471. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1472. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1473. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1474. cuda/cccl/headers/include/thrust/find.h +382 -0
  1475. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1476. cuda/cccl/headers/include/thrust/functional.h +399 -0
  1477. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1478. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1479. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1480. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1481. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1482. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +338 -0
  1483. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1484. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1485. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1486. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1487. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +83 -0
  1488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1491. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +184 -0
  1492. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +55 -0
  1493. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1494. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1495. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1496. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +171 -0
  1497. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1498. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1499. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1500. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1501. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1502. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1503. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1504. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1505. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1506. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1507. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +190 -0
  1508. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1509. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1510. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1511. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +353 -0
  1512. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1513. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +365 -0
  1514. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1515. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1516. cuda/cccl/headers/include/thrust/merge.h +726 -0
  1517. cuda/cccl/headers/include/thrust/mismatch.h +262 -0
  1518. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1519. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1520. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1521. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1522. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1523. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1524. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1525. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1526. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1527. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1528. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1529. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1530. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1531. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1532. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1533. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1534. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1535. cuda/cccl/headers/include/thrust/partition.h +1392 -0
  1536. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1537. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1538. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1539. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1540. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1541. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1542. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1543. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1544. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +157 -0
  1545. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1546. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1547. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1548. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1549. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1550. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +186 -0
  1551. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1552. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1553. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1554. cuda/cccl/headers/include/thrust/random/normal_distribution.h +256 -0
  1555. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1556. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1557. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +257 -0
  1558. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1559. cuda/cccl/headers/include/thrust/random.h +118 -0
  1560. cuda/cccl/headers/include/thrust/reduce.h +1114 -0
  1561. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1562. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1563. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1564. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1565. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1566. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1567. cuda/cccl/headers/include/thrust/set_operations.h +3027 -0
  1568. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1569. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1570. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1571. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1572. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1573. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1574. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1575. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1576. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1577. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1578. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1579. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1580. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1581. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1582. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1583. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1584. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1586. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1587. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1588. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1590. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1591. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1592. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1593. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1594. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1595. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1596. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1597. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1598. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1600. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1601. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1602. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1604. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1605. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1606. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1607. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1608. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1611. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1612. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1615. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1616. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1617. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1618. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1619. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1620. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1621. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1622. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +218 -0
  1623. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1624. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1626. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1627. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1628. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +280 -0
  1629. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +162 -0
  1630. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +578 -0
  1631. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1632. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1633. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +230 -0
  1634. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1635. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1636. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1637. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +473 -0
  1638. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1639. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1640. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1641. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1642. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +59 -0
  1643. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1644. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +77 -0
  1645. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1646. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1647. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1648. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1649. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1650. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +205 -0
  1651. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1652. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1653. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1654. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1655. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +774 -0
  1656. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +994 -0
  1657. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1658. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1659. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1660. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +340 -0
  1661. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +412 -0
  1662. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +90 -0
  1663. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1722 -0
  1665. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +473 -0
  1666. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +99 -0
  1667. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +62 -0
  1668. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1669. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1670. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1671. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1672. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1673. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1674. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +102 -0
  1675. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +288 -0
  1676. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +307 -0
  1677. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1678. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1679. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1680. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1681. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1682. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1683. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +370 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +145 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +65 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +246 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +64 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +61 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +67 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +208 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +125 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +105 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +281 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +176 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +53 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +81 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +112 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +80 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1772. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1773. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1774. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1775. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1776. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1777. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1778. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +109 -0
  1779. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1780. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1781. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1782. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1783. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1784. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1785. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1786. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1788. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1789. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1790. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1791. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1792. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +298 -0
  1794. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1795. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1796. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +97 -0
  1797. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1798. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1799. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1800. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1801. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1802. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1804. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1805. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1806. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +353 -0
  1807. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1808. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1809. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1810. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1811. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1812. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1813. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1814. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1815. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1816. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1817. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1818. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +112 -0
  1819. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +105 -0
  1820. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1821. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +74 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1838. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1839. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1840. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1841. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1842. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1843. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1844. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +84 -0
  1845. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1846. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +63 -0
  1848. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +48 -0
  1849. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1850. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1851. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1852. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1853. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +216 -0
  1854. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1855. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1856. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1857. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1858. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1859. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1860. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1861. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1862. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1863. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1864. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1865. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1866. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1867. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +52 -0
  1868. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +56 -0
  1869. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1870. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1871. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1872. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1873. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1874. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +117 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +72 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +79 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +121 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1902. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1903. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1904. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1906. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1907. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1908. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1909. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +273 -0
  1910. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1911. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1912. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1913. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1914. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1915. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1916. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1917. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1918. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +49 -0
  1919. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +55 -0
  1920. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1921. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1922. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1923. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1924. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1925. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1926. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1927. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1928. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1929. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1930. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1931. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +256 -0
  1932. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +156 -0
  1933. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1934. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1935. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1936. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +333 -0
  1937. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1938. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1939. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1940. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1941. cuda/cccl/headers/include/thrust/unique.h +1089 -0
  1942. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1943. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1944. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1945. cuda/cccl/headers/include/thrust/version.h +93 -0
  1946. cuda/cccl/headers/include/thrust/zip_function.h +149 -0
  1947. cuda/cccl/headers/include_paths.py +51 -0
  1948. cuda/cccl/headers/lib/cmake/cccl/cccl-config-version.cmake +25 -0
  1949. cuda/cccl/headers/lib/cmake/cccl/cccl-config.cmake +143 -0
  1950. cuda/cccl/headers/lib/cmake/cub/cub-config-version.cmake +29 -0
  1951. cuda/cccl/headers/lib/cmake/cub/cub-config.cmake +172 -0
  1952. cuda/cccl/headers/lib/cmake/cub/cub-header-search.cmake +15 -0
  1953. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config-version.cmake +37 -0
  1954. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config.cmake +297 -0
  1955. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-header-search.cmake +15 -0
  1956. cuda/cccl/headers/lib/cmake/thrust/FindTBB.cmake +498 -0
  1957. cuda/cccl/headers/lib/cmake/thrust/README.md +258 -0
  1958. cuda/cccl/headers/lib/cmake/thrust/thrust-config-version.cmake +37 -0
  1959. cuda/cccl/headers/lib/cmake/thrust/thrust-config.cmake +983 -0
  1960. cuda/cccl/headers/lib/cmake/thrust/thrust-header-search.cmake +15 -0
  1961. cuda/cccl/parallel/__init__.py +9 -0
  1962. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1963. cuda/cccl/py.typed +0 -0
  1964. cuda/compute/__init__.py +91 -0
  1965. cuda/compute/_bindings.py +79 -0
  1966. cuda/compute/_bindings.pyi +516 -0
  1967. cuda/compute/_bindings_impl.pyx +2470 -0
  1968. cuda/compute/_caching.py +83 -0
  1969. cuda/compute/_cccl_interop.py +354 -0
  1970. cuda/compute/_odr_helpers.py +238 -0
  1971. cuda/compute/_utils/__init__.py +0 -0
  1972. cuda/compute/_utils/protocols.py +145 -0
  1973. cuda/compute/_utils/temp_storage_buffer.py +87 -0
  1974. cuda/compute/algorithms/__init__.py +62 -0
  1975. cuda/compute/algorithms/_histogram.py +243 -0
  1976. cuda/compute/algorithms/_reduce.py +205 -0
  1977. cuda/compute/algorithms/_scan.py +344 -0
  1978. cuda/compute/algorithms/_segmented_reduce.py +265 -0
  1979. cuda/compute/algorithms/_select.py +196 -0
  1980. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1981. cuda/compute/algorithms/_sort/_merge_sort.py +235 -0
  1982. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1983. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1984. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1985. cuda/compute/algorithms/_three_way_partition.py +292 -0
  1986. cuda/compute/algorithms/_transform.py +317 -0
  1987. cuda/compute/algorithms/_unique_by_key.py +259 -0
  1988. cuda/compute/cccl/.gitkeep +0 -0
  1989. cuda/compute/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1990. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1991. cuda/compute/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1992. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1993. cuda/compute/determinism.py +3 -0
  1994. cuda/compute/iterators/__init__.py +23 -0
  1995. cuda/compute/iterators/_factories.py +251 -0
  1996. cuda/compute/iterators/_iterators.py +680 -0
  1997. cuda/compute/iterators/_permutation_iterator.py +266 -0
  1998. cuda/compute/iterators/_zip_iterator.py +268 -0
  1999. cuda/compute/numba_utils.py +54 -0
  2000. cuda/compute/op.py +140 -0
  2001. cuda/compute/struct.py +520 -0
  2002. cuda/compute/typing.py +36 -0
  2003. cuda/coop/__init__.py +8 -0
  2004. cuda/coop/_caching.py +48 -0
  2005. cuda/coop/_common.py +275 -0
  2006. cuda/coop/_nvrtc.py +92 -0
  2007. cuda/coop/_scan_op.py +181 -0
  2008. cuda/coop/_types.py +937 -0
  2009. cuda/coop/_typing.py +107 -0
  2010. cuda/coop/block/__init__.py +39 -0
  2011. cuda/coop/block/_block_exchange.py +251 -0
  2012. cuda/coop/block/_block_load_store.py +215 -0
  2013. cuda/coop/block/_block_merge_sort.py +125 -0
  2014. cuda/coop/block/_block_radix_sort.py +214 -0
  2015. cuda/coop/block/_block_reduce.py +294 -0
  2016. cuda/coop/block/_block_scan.py +983 -0
  2017. cuda/coop/warp/__init__.py +9 -0
  2018. cuda/coop/warp/_warp_merge_sort.py +92 -0
  2019. cuda/coop/warp/_warp_reduce.py +153 -0
  2020. cuda/coop/warp/_warp_scan.py +78 -0
  2021. cuda_cccl-0.4.3.dist-info/METADATA +84 -0
  2022. cuda_cccl-0.4.3.dist-info/RECORD +2024 -0
  2023. cuda_cccl-0.4.3.dist-info/WHEEL +5 -0
  2024. cuda_cccl-0.4.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,1563 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ // SPDX-License-Identifier: BSD-3
3
+
4
+ #pragma once
5
+
6
+ #include <cub/config.cuh>
7
+
8
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
9
+ # pragma GCC system_header
10
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
11
+ # pragma clang system_header
12
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
13
+ # pragma system_header
14
+ #endif // no system header
15
+
16
+ #include <cub/agent/agent_select_if.cuh>
17
+ #include <cub/agent/single_pass_scan_operators.cuh>
18
+ #include <cub/block/block_load.cuh>
19
+ #include <cub/block/block_scan.cuh>
20
+ #include <cub/util_device.cuh>
21
+ #include <cub/util_math.cuh>
22
+ #include <cub/util_type.cuh>
23
+
24
+ #include <cuda/std/__algorithm/clamp.h>
25
+ #include <cuda/std/__type_traits/is_same.h>
26
+
27
+ CUB_NAMESPACE_BEGIN
28
+
29
+ namespace detail::select
30
+ {
31
+ enum class may_alias // Two-state tag (no / yes); not used by the tuning structs visible in this part of the file. NOTE(review): presumably says whether input and output may alias - confirm.
32
+ {
33
+ no,
34
+ yes
35
+ };
36
+
37
+ enum class flagged // Tuning-selector tag, second template parameter of sm80_tuning; the "select::flagged" / "partition::flagged" sections below use flagged::yes.
38
+ {
39
+ no,
40
+ yes
41
+ };
42
+ enum class keep_rejects // Tuning-selector tag, third template parameter of sm80_tuning; the "partition::*" sections below use keep_rejects::yes.
43
+ {
44
+ no,
45
+ yes
46
+ };
47
+ enum class primitive // Tuning-selector tag, fifth template parameter of sm80_tuning; the generic specializations use primitive::yes, the __int128_t / __uint128_t ones use primitive::no.
48
+ {
49
+ no,
50
+ yes
51
+ };
52
+ enum class offset_size // Tuning-selector bucket for the offset type. NOTE(review): _4 / _8 presumably denote the size in bytes - confirm against the code that computes it.
53
+ {
54
+ _4,
55
+ _8,
56
+ unknown // fallback bucket for anything that is neither _4 nor _8
57
+ };
58
+ enum class input_size // Tuning-selector bucket for the input type. NOTE(review): values look like sizes in bytes (the __int128_t specializations below use _16) - confirm.
59
+ {
60
+ _1,
61
+ _2,
62
+ _4,
63
+ _8,
64
+ _16,
65
+ unknown // fallback bucket for any other size
66
+ };
67
+ enum class distinct_partitions // Two-state tag (no / yes); not used by the tuning structs visible in this part of the file. NOTE(review): meaning must be confirmed at its point of use.
68
+ {
69
+ no,
70
+ yes
71
+ };
72
+
73
+ template <class InputT, flagged, keep_rejects, offset_size OffsetSize, primitive, input_size InputSize> // selector keys: input type, flagged, keep_rejects, offset bucket, primitive, input-size bucket
74
+ struct sm80_tuning; // Primary template is only declared here; the specializations below supply threads, items, load_algorithm and delay_constructor.
75
+
76
+ template <class Input> // flagged::no + keep_rejects::no: by analogy with the section labels further down this is the select::if family (NOTE(review): confirm)
77
+ struct sm80_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_1>
78
+ {
79
+ static constexpr int threads = 992; // NOTE(review): presumably threads per block - confirm
80
+ static constexpr int items = 20; // NOTE(review): presumably items per thread - confirm
81
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
82
+ using delay_constructor = detail::no_delay_constructor_t<395>; // template argument is a tuned constant; its meaning is defined with no_delay_constructor_t
83
+ };
84
+
85
+ template <class Input> // flagged::no + keep_rejects::no (presumably the select::if family - confirm), input_size::_2
86
+ struct sm80_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_2>
87
+ {
88
+ static constexpr int threads = 576; // NOTE(review): presumably threads per block - confirm
89
+ static constexpr int items = 14; // NOTE(review): presumably items per thread - confirm
90
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
91
+ using delay_constructor = detail::no_delay_constructor_t<870>; // tuned constant; meaning defined with no_delay_constructor_t
92
+ };
93
+
94
+ template <class Input> // flagged::no + keep_rejects::no (presumably the select::if family - confirm), input_size::_4
95
+ struct sm80_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_4>
96
+ {
97
+ static constexpr int threads = 256; // NOTE(review): presumably threads per block - confirm
98
+ static constexpr int items = 18; // NOTE(review): presumably items per thread - confirm
99
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
100
+ using delay_constructor = detail::no_delay_constructor_t<1130>; // tuned constant; meaning defined with no_delay_constructor_t
101
+ };
102
+
103
+ template <class Input> // flagged::no + keep_rejects::no (presumably the select::if family - confirm), input_size::_8
104
+ struct sm80_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_8>
105
+ {
106
+ static constexpr int threads = 192; // NOTE(review): presumably threads per block - confirm
107
+ static constexpr int items = 10; // NOTE(review): presumably items per thread - confirm
108
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
109
+ using delay_constructor = detail::fixed_delay_constructor_t<832, 1165>; // two tuned constants; their meaning is defined with fixed_delay_constructor_t
110
+ };
111
+
112
+ #if _CCCL_HAS_INT128()
113
+ template <> // full specialization for __int128_t; sits inside the _CCCL_HAS_INT128() guard
114
+ struct sm80_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
115
+ {
116
+ static constexpr int threads = 384; // NOTE(review): presumably threads per block - confirm
117
+ static constexpr int items = 4; // NOTE(review): presumably items per thread - confirm
118
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
119
+ using delay_constructor = detail::no_delay_constructor_t<1140>; // tuned constant; meaning defined with no_delay_constructor_t
120
+ };
121
+
122
+ template <> // __uint128_t reuses the __int128_t tuning for the same selector keys by inheriting from it
123
+ struct sm80_tuning<__uint128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
124
+ : sm80_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
125
+ {}; // no members of its own
126
+ #endif
127
+
128
+ // select::flagged
129
+ template <class Input> // select::flagged family (flagged::yes, keep_rejects::no), input_size::_1
130
+ struct sm80_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_1>
131
+ {
132
+ static constexpr int threads = 224; // NOTE(review): presumably threads per block - confirm
133
+ static constexpr int items = 20; // NOTE(review): presumably items per thread - confirm
134
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
135
+ using delay_constructor = detail::no_delay_constructor_t<735>; // tuned constant; meaning defined with no_delay_constructor_t
136
+ };
137
+
138
+ template <class Input> // select::flagged family (flagged::yes, keep_rejects::no), input_size::_2
139
+ struct sm80_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_2>
140
+ {
141
+ static constexpr int threads = 256; // NOTE(review): presumably threads per block - confirm
142
+ static constexpr int items = 20; // NOTE(review): presumably items per thread - confirm
143
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
144
+ using delay_constructor = detail::no_delay_constructor_t<1155>; // tuned constant; meaning defined with no_delay_constructor_t
145
+ };
146
+
147
+ template <class Input> // select::flagged family (flagged::yes, keep_rejects::no), input_size::_4
148
+ struct sm80_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_4>
149
+ {
150
+ static constexpr int threads = 320; // NOTE(review): presumably threads per block - confirm
151
+ static constexpr int items = 10; // NOTE(review): presumably items per thread - confirm
152
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
153
+ using delay_constructor = detail::fixed_delay_constructor_t<124, 1115>; // two tuned constants; their meaning is defined with fixed_delay_constructor_t
154
+ };
155
+
156
+ template <class Input> // select::flagged family (flagged::yes, keep_rejects::no), input_size::_8
157
+ struct sm80_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_8>
158
+ {
159
+ static constexpr int threads = 384; // NOTE(review): presumably threads per block - confirm
160
+ static constexpr int items = 6; // NOTE(review): presumably items per thread - confirm
161
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
162
+ using delay_constructor = detail::no_delay_constructor_t<1130>; // tuned constant; meaning defined with no_delay_constructor_t
163
+ };
164
+
165
+ #if _CCCL_HAS_INT128()
166
+ template <> // select::flagged family, full specialization for __int128_t; sits inside the _CCCL_HAS_INT128() guard
167
+ struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
168
+ {
169
+ static constexpr int threads = 256; // NOTE(review): presumably threads per block - confirm
170
+ static constexpr int items = 5; // NOTE(review): presumably items per thread - confirm
171
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
172
+ using delay_constructor = detail::fixed_delay_constructor_t<464, 1025>; // two tuned constants; their meaning is defined with fixed_delay_constructor_t
173
+ };
174
+
175
+ template <> // select::flagged family: __uint128_t reuses the __int128_t tuning by inheriting from it
176
+ struct sm80_tuning<__uint128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
177
+ : sm80_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
178
+ {}; // no members of its own
179
+ #endif
180
+
181
+ // partition::if
182
+ template <class Input> // partition::if family (flagged::no, keep_rejects::yes), input_size::_1
183
+ struct sm80_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_1>
184
+ {
185
+ static constexpr int threads = 512; // NOTE(review): presumably threads per block - confirm
186
+ static constexpr int items = 20; // NOTE(review): presumably items per thread - confirm
187
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
188
+ using delay_constructor = detail::no_delay_constructor_t<510>; // tuned constant; meaning defined with no_delay_constructor_t
189
+ };
190
+
191
+ template <class Input> // partition::if family (flagged::no, keep_rejects::yes), input_size::_2
192
+ struct sm80_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_2>
193
+ {
194
+ static constexpr int threads = 224; // NOTE(review): presumably threads per block - confirm
195
+ static constexpr int items = 18; // NOTE(review): presumably items per thread - confirm
196
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
197
+ using delay_constructor = detail::no_delay_constructor_t<1045>; // tuned constant; meaning defined with no_delay_constructor_t
198
+ };
199
+
200
+ template <class Input> // partition::if family (flagged::no, keep_rejects::yes), input_size::_4
201
+ struct sm80_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_4>
202
+ {
203
+ static constexpr int threads = 192; // NOTE(review): presumably threads per block - confirm
204
+ static constexpr int items = 15; // NOTE(review): presumably items per thread - confirm
205
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
206
+ using delay_constructor = detail::no_delay_constructor_t<1040>; // tuned constant; meaning defined with no_delay_constructor_t
207
+ };
208
+
209
+ template <class Input>
210
+ struct sm80_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_8>
211
+ {
212
+ static constexpr int threads = 192;
213
+ static constexpr int items = 10;
214
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
215
+ using delay_constructor = detail::fixed_delay_constructor_t<68, 1160>;
216
+ };
217
+
218
+ #if _CCCL_HAS_INT128()
219
+ template <>
220
+ struct sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
221
+ {
222
+ static constexpr int threads = 256;
223
+ static constexpr int items = 5;
224
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
225
+ using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>;
226
+ };
227
+
228
+ template <>
229
+ struct sm80_tuning<__uint128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
230
+ : sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
231
+ {};
232
+ #endif
233
+
234
+ // partition::flagged
235
+ template <class Input>
236
+ struct sm80_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_1>
237
+ {
238
+ static constexpr int threads = 512;
239
+ static constexpr int items = 20;
240
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
241
+ using delay_constructor = detail::no_delay_constructor_t<595>;
242
+ };
243
+
244
+ template <class Input>
245
+ struct sm80_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_2>
246
+ {
247
+ static constexpr int threads = 224;
248
+ static constexpr int items = 18;
249
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
250
+ using delay_constructor = detail::no_delay_constructor_t<1105>;
251
+ };
252
+
253
+ template <class Input>
254
+ struct sm80_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_4>
255
+ {
256
+ static constexpr int threads = 192;
257
+ static constexpr int items = 12;
258
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
259
+ using delay_constructor = detail::fixed_delay_constructor_t<912, 1025>;
260
+ };
261
+
262
+ template <class Input>
263
+ struct sm80_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_8>
264
+ {
265
+ static constexpr int threads = 192;
266
+ static constexpr int items = 12;
267
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
268
+ using delay_constructor = detail::fixed_delay_constructor_t<884, 1130>;
269
+ };
270
+
271
+ #if _CCCL_HAS_INT128()
272
+ template <>
273
+ struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
274
+ {
275
+ static constexpr int threads = 256;
276
+ static constexpr int items = 5;
277
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
278
+ using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>;
279
+ };
280
+
281
+ template <>
282
+ struct sm80_tuning<__uint128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
283
+ : sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
284
+ {};
285
+ #endif
286
+
287
+ template <class InputT, flagged, keep_rejects, offset_size OffsetSize, primitive, input_size InputSize>
288
+ struct sm90_tuning;
289
+
290
+ // select::if
291
+ template <class Input>
292
+ struct sm90_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_1>
293
+ {
294
+ static constexpr int threads = 256;
295
+ static constexpr int items = 22;
296
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
297
+ using delay_constructor = detail::no_delay_constructor_t<580>;
298
+ };
299
+
300
+ template <class Input>
301
+ struct sm90_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_2>
302
+ {
303
+ static constexpr int threads = 256;
304
+ static constexpr int items = 22;
305
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
306
+ using delay_constructor = detail::fixed_delay_constructor_t<320, 605>;
307
+ };
308
+
309
+ template <class Input>
310
+ struct sm90_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_4>
311
+ {
312
+ static constexpr int threads = 384;
313
+ static constexpr int items = 17;
314
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
315
+ using delay_constructor = detail::fixed_delay_constructor_t<76, 1150>;
316
+ };
317
+
318
+ template <class Input>
319
+ struct sm90_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_8>
320
+ {
321
+ static constexpr int threads = 384;
322
+ static constexpr int items = 11;
323
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
324
+ using delay_constructor = detail::fixed_delay_constructor_t<380, 1140>;
325
+ };
326
+
327
+ #if _CCCL_HAS_INT128()
328
+ template <>
329
+ struct sm90_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
330
+ {
331
+ static constexpr int threads = 512;
332
+ static constexpr int items = 5;
333
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
334
+ using delay_constructor = detail::fixed_delay_constructor_t<460, 1145>;
335
+ };
336
+
337
+ template <>
338
+ struct sm90_tuning<__uint128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
339
+ : sm90_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
340
+ {};
341
+ #endif
342
+
343
+ // select::flagged
344
+ template <class Input>
345
+ struct sm90_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_1>
346
+ {
347
+ static constexpr int threads = 448;
348
+ static constexpr int items = 20;
349
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
350
+ using delay_constructor = detail::no_delay_constructor_t<715>;
351
+ };
352
+
353
+ template <class Input>
354
+ struct sm90_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_2>
355
+ {
356
+ static constexpr int threads = 448;
357
+ static constexpr int items = 20;
358
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
359
+ using delay_constructor = detail::fixed_delay_constructor_t<504, 765>;
360
+ };
361
+
362
+ template <class Input>
363
+ struct sm90_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_4>
364
+ {
365
+ static constexpr int threads = 384;
366
+ static constexpr int items = 15;
367
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
368
+ using delay_constructor = detail::fixed_delay_constructor_t<415, 1125>;
369
+ };
370
+
371
+ template <class Input>
372
+ struct sm90_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primitive::yes, input_size::_8>
373
+ {
374
+ static constexpr int threads = 384;
375
+ static constexpr int items = 11;
376
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
377
+ using delay_constructor = detail::fixed_delay_constructor_t<360, 1170>;
378
+ };
379
+
380
+ #if _CCCL_HAS_INT128()
381
+ template <>
382
+ struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
383
+ {
384
+ static constexpr int threads = 512;
385
+ static constexpr int items = 3;
386
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
387
+ using delay_constructor = detail::fixed_delay_constructor_t<284, 1130>;
388
+ };
389
+
390
+ template <>
391
+ struct sm90_tuning<__uint128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
392
+ : sm90_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
393
+ {};
394
+ #endif
395
+
396
+ // partition::if
397
+ template <class Input>
398
+ struct sm90_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_1>
399
+ {
400
+ static constexpr int threads = 384;
401
+ static constexpr int items = 20;
402
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
403
+ using delay_constructor = detail::fixed_delay_constructor_t<908, 995>;
404
+ };
405
+
406
+ template <class Input>
407
+ struct sm90_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_2>
408
+ {
409
+ static constexpr int threads = 320;
410
+ static constexpr int items = 14;
411
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
412
+ using delay_constructor = detail::fixed_delay_constructor_t<500, 560>;
413
+ };
414
+
415
+ template <class Input>
416
+ struct sm90_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_4>
417
+ {
418
+ static constexpr int threads = 256;
419
+ static constexpr int items = 14;
420
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
421
+ using delay_constructor = detail::fixed_delay_constructor_t<536, 1055>;
422
+ };
423
+
424
+ template <class Input>
425
+ struct sm90_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_8>
426
+ {
427
+ static constexpr int threads = 128;
428
+ static constexpr int items = 12;
429
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
430
+ using delay_constructor = detail::fixed_delay_constructor_t<512, 1075>;
431
+ };
432
+
433
+ #if _CCCL_HAS_INT128()
434
+ template <>
435
+ struct sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
436
+ {
437
+ static constexpr int threads = 192;
438
+ static constexpr int items = 5;
439
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
440
+ using delay_constructor = detail::fixed_delay_constructor_t<1616, 1115>;
441
+ };
442
+
443
+ template <>
444
+ struct sm90_tuning<__uint128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
445
+ : sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
446
+ {};
447
+ #endif
448
+
449
+ // partition::flagged
450
+ template <class Input>
451
+ struct sm90_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_1>
452
+ {
453
+ static constexpr int threads = 256;
454
+ static constexpr int items = 20;
455
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
456
+ using delay_constructor = detail::fixed_delay_constructor_t<580, 850>;
457
+ };
458
+
459
+ template <class Input>
460
+ struct sm90_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_2>
461
+ {
462
+ static constexpr int threads = 512;
463
+ static constexpr int items = 20;
464
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
465
+ using delay_constructor = detail::fixed_delay_constructor_t<388, 1055>;
466
+ };
467
+
468
+ template <class Input>
469
+ struct sm90_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_4>
470
+ {
471
+ static constexpr int threads = 256;
472
+ static constexpr int items = 20;
473
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
474
+ using delay_constructor = detail::fixed_delay_constructor_t<72, 1165>;
475
+ };
476
+
477
+ template <class Input>
478
+ struct sm90_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::yes, input_size::_8>
479
+ {
480
+ static constexpr int threads = 224;
481
+ static constexpr int items = 6;
482
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
483
+ using delay_constructor = detail::fixed_delay_constructor_t<532, 1180>;
484
+ };
485
+
486
+ #if _CCCL_HAS_INT128()
487
+ template <>
488
+ struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
489
+ {
490
+ static constexpr int threads = 160;
491
+ static constexpr int items = 5;
492
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
493
+ using delay_constructor = detail::fixed_delay_constructor_t<720, 1105>;
494
+ };
495
+
496
+ template <>
497
+ struct sm90_tuning<__uint128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
498
+ : sm90_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
499
+ {};
500
+ #endif
501
+
502
+ template <class InputT,
503
+ flagged,
504
+ keep_rejects,
505
+ offset_size OffsetSize,
506
+ primitive,
507
+ input_size InputSize,
508
+ may_alias,
509
+ distinct_partitions DistinctPartitions>
510
+ struct sm100_tuning;
511
+
512
+ // select::if
513
+ template <class Input, distinct_partitions DistinctPartitions>
514
+ struct sm100_tuning<Input,
515
+ flagged::no,
516
+ keep_rejects::no,
517
+ offset_size::_4,
518
+ primitive::yes,
519
+ input_size::_1,
520
+ may_alias::no,
521
+ DistinctPartitions>
522
+ {
523
+ // trp_0.ld_0.ipt_22.tpb_384.ns_0.dcid_2.l2w_915 1.099232 0.980183 1.096778 1.545455
524
+ static constexpr int threads = 384;
525
+ static constexpr int nominal_4b_items = 22;
526
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
527
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
528
+ using delay_constructor = detail::exponential_backoff_constructor_t<0, 915>;
529
+ };
530
+
531
+ template <class Input, distinct_partitions DistinctPartitions>
532
+ struct sm100_tuning<Input,
533
+ flagged::no,
534
+ keep_rejects::no,
535
+ offset_size::_4,
536
+ primitive::yes,
537
+ input_size::_1,
538
+ may_alias::yes,
539
+ DistinctPartitions>
540
+ {
541
+ // trp_1.ld_0.ipt_20.tpb_448.ns_596.dcid_6.l2w_295 1.214635 1.001421 1.207023 1.307692
542
+ static constexpr int threads = 448;
543
+ static constexpr int nominal_4b_items = 20;
544
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
545
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
546
+ using delay_constructor = detail::exponential_backon_jitter_constructor_t<596, 295>;
547
+ };
548
+
549
+ // TODO(gonidelis): for large input sizes, select.unique regresses a lot and select.if regresses a bit.
549
+ // Find a better tuning.
551
+ // template <class Input, distinct_partitions DistinctPartitions>
552
+ // struct sm100_tuning<Input,
553
+ // flagged::no,
554
+ // keep_rejects::no,
555
+ // offset_size::_4,
556
+ // primitive::yes,
557
+ // input_size::_2,
558
+ // may_alias::no,
559
+ // DistinctPartitions>
560
+ // {
561
+ // // trp_1.ld_0.ipt_20.tpb_256.ns_516.dcid_7.l2w_685 1.065598 0.937984 1.067343 1.452153
562
+ // static constexpr int threads = 256;
563
+ // static constexpr int nominal_4b_items = 20;
564
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
565
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
566
+ // using delay_constructor = detail::exponential_backon_constructor_t<516, 685>;
567
+ // };
568
+
569
+ // template <class Input, distinct_partitions DistinctPartitions>
570
+ // struct sm100_tuning<Input,
571
+ // flagged::no,
572
+ // keep_rejects::no,
573
+ // offset_size::_4,
574
+ // primitive::yes,
575
+ // input_size::_2,
576
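+ // may_alias::no, // NOTE(review): same arguments as the commented-out specialization above; presumably one of the two was meant to be may_alias::yes -- confirm before re-enabling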
577
+ // DistinctPartitions>
578
+ // {
579
+ // // trp_1.ld_0.ipt_20.tpb_384.ns_1060.dcid_5.l2w_375 1.109871 0.973142 1.105415 1.459135
580
+ // static constexpr int threads = 384;
581
+ // static constexpr int nominal_4b_items = 20;
582
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
583
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
584
+ // using delay_constructor = detail::exponential_backon_jitter_window_constructor_t<1060, 375>;
585
+ // };
586
+
587
+ template <class Input, distinct_partitions DistinctPartitions>
588
+ struct sm100_tuning<Input,
589
+ flagged::no,
590
+ keep_rejects::no,
591
+ offset_size::_4,
592
+ primitive::yes,
593
+ input_size::_4,
594
+ may_alias::no,
595
+ DistinctPartitions>
596
+ {
597
+ // trp_1.ld_0.ipt_15.tpb_384.ns_1508.dcid_5.l2w_585 1.201993 0.920103 1.185134 1.441805
598
+ static constexpr int threads = 384;
599
+ static constexpr int nominal_4b_items = 15;
600
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
601
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
602
+ using delay_constructor = detail::exponential_backon_jitter_window_constructor_t<1508, 585>;
603
+ };
604
+
605
+ // TODO(gonidelis): for large input sizes, select.unique regresses a lot and select.if regresses a bit.
605
+ // Find a better tuning.
607
+ // template <class Input, distinct_partitions DistinctPartitions>
608
+ // struct sm100_tuning<Input,
609
+ // flagged::no,
610
+ // keep_rejects::no,
611
+ // offset_size::_4,
612
+ // primitive::yes,
613
+ // input_size::_4,
614
+ // may_alias::yes,
615
+ // DistinctPartitions>
616
+ // {
617
+ // // trp_1.ld_0.ipt_19.tpb_512.ns_928.dcid_7.l2w_770 1.258815 1.000000 1.235251 1.444884
618
+ // static constexpr int threads = 512;
619
+ // static constexpr int nominal_4b_items = 19;
620
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
621
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
622
+ // using delay_constructor = detail::exponential_backon_constructor_t<928, 770>;
623
+ // };
624
+
625
+ // template <class Input, may_alias MayAlias, distinct_partitions DistinctPartitions>
626
+ // struct sm100_tuning<Input,
627
+ // flagged::no,
628
+ // keep_rejects::no,
629
+ // offset_size::_4,
630
+ // primitive::yes,
631
+ // input_size::_8,
632
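+ // may_alias::yes, // FIXME: conflicts with the `MayAlias` argument on the next line (9 arguments, sm100_tuning takes 8); drop one of the two before re-enabling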
633
+ // MayAlias,
634
+ // DistinctPartitions>
635
+ // {
636
+ // // trp_1.ld_0.ipt_23.tpb_384.ns_1140.dcid_7.l2w_520 1.081506 0.955298 1.088848 1.248971
637
+ // static constexpr int threads = 384;
638
+ // static constexpr int nominal_4b_items = 23;
639
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
640
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
641
+ // using delay_constructor = detail::exponential_backon_constructor_t<1140, 520>;
642
+ // };
643
+
644
+ // TODO(gonidelis): Tune for I128.
645
+ #if _CCCL_HAS_INT128()
646
+ // template <>
647
+ // struct sm100_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
648
+ // {
649
+ // // static constexpr int threads = 512;
650
+ // // static constexpr int nominal_4b_items = 5;
651
+
652
+ // // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
653
+
654
+ // // using delay_constructor = detail::fixed_delay_constructor_t<460, 1145>;
655
+ // };
656
+
657
+ // template <>
658
+ // struct sm100_tuning<__uint128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
659
+ // {
660
+ // // static constexpr int threads = 512;
661
+ // // static constexpr int nominal_4b_items = 5;
662
+
663
+ // // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
664
+
665
+ // // using delay_constructor = detail::fixed_delay_constructor_t<460, 1145>;
666
+ // };
667
+ #endif
668
+
669
+ // select::flagged
670
+ template <class Input, distinct_partitions DistinctPartitions>
671
+ struct sm100_tuning<Input,
672
+ flagged::yes,
673
+ keep_rejects::no,
674
+ offset_size::_4,
675
+ primitive::yes,
676
+ input_size::_1,
677
+ may_alias::no,
678
+ DistinctPartitions>
679
+ {
680
+ // trp_0.ld_0.ipt_20.tpb_896.ns_84.dcid_7.l2w_480 1.254262 0.846154 1.222437 1.462665
681
+ static constexpr int threads = 896;
682
+ static constexpr int nominal_4b_items = 20;
683
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
684
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
685
+ using delay_constructor = detail::exponential_backon_constructor_t<84, 480>;
686
+ };
687
+
688
+ template <class Input, distinct_partitions DistinctPartitions>
689
+ struct sm100_tuning<Input,
690
+ flagged::yes,
691
+ keep_rejects::no,
692
+ offset_size::_4,
693
+ primitive::yes,
694
+ input_size::_1,
695
+ may_alias::yes,
696
+ DistinctPartitions>
697
+ {
698
+ // trp_0.ld_0.ipt_20.tpb_1024.ns_360.dcid_6.l2w_380 1.274174 0.748441 1.227123 1.610039
699
+ static constexpr int threads = 1024;
700
+ static constexpr int nominal_4b_items = 20;
701
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
702
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
703
+ using delay_constructor = detail::exponential_backon_jitter_constructor_t<360, 380>;
704
+ };
705
+
706
+ template <class Input, distinct_partitions DistinctPartitions>
707
+ struct sm100_tuning<Input,
708
+ flagged::yes,
709
+ keep_rejects::no,
710
+ offset_size::_4,
711
+ primitive::yes,
712
+ input_size::_2,
713
+ may_alias::no,
714
+ DistinctPartitions>
715
+ {
716
+ // trp_0.ld_0.ipt_22.tpb_256.ns_1292.dcid_5.l2w_750 1.283400 1.002841 1.267822 1.445913
717
+ static constexpr int threads = 256;
718
+ static constexpr int nominal_4b_items = 22;
719
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
720
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
721
+ using delay_constructor = detail::exponential_backon_jitter_window_constructor_t<1292, 750>;
722
+ };
723
+
724
+ template <class Input, distinct_partitions DistinctPartitions>
725
+ struct sm100_tuning<Input,
726
+ flagged::yes,
727
+ keep_rejects::no,
728
+ offset_size::_4,
729
+ primitive::yes,
730
+ input_size::_2,
731
+ may_alias::yes,
732
+ DistinctPartitions>
733
+ {
734
+ // trp_1.ld_0.ipt_20.tpb_448.ns_136.dcid_2.l2w_760 1.318819 0.994090 1.289173 1.551415
735
+ static constexpr int threads = 448;
736
+ static constexpr int nominal_4b_items = 20;
737
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
738
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
739
+ using delay_constructor = detail::exponential_backoff_constructor_t<136, 760>;
740
+ };
741
+
742
+ template <class Input, distinct_partitions DistinctPartitions>
743
+ struct sm100_tuning<Input,
744
+ flagged::yes,
745
+ keep_rejects::no,
746
+ offset_size::_4,
747
+ primitive::yes,
748
+ input_size::_4,
749
+ may_alias::no,
750
+ DistinctPartitions>
751
+ {
752
+ // trp_0.ld_0.ipt_14.tpb_512.ns_844.dcid_6.l2w_675 1.207911 1.068001 1.208890 1.455636
753
+ static constexpr int threads = 512;
754
+ static constexpr int nominal_4b_items = 14;
755
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
756
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
757
+ using delay_constructor = detail::exponential_backon_jitter_constructor_t<844, 675>;
758
+ };
759
+
760
+ template <class Input, distinct_partitions DistinctPartitions>
761
+ struct sm100_tuning<Input,
762
+ flagged::yes,
763
+ keep_rejects::no,
764
+ offset_size::_4,
765
+ primitive::yes,
766
+ input_size::_4,
767
+ may_alias::yes,
768
+ DistinctPartitions>
769
+ {
770
+ // trp_1.ld_0.ipt_14.tpb_384.ns_524.dcid_7.l2w_635 1.256212 1.004808 1.241086 1.373337
771
+ static constexpr int threads = 384;
772
+ static constexpr int nominal_4b_items = 14;
773
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
774
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
775
+ using delay_constructor = detail::exponential_backon_constructor_t<524, 635>;
776
+ };
777
+
778
+ template <class Input, distinct_partitions DistinctPartitions>
779
+ struct sm100_tuning<Input,
780
+ flagged::yes,
781
+ keep_rejects::no,
782
+ offset_size::_4,
783
+ primitive::yes,
784
+ input_size::_8,
785
+ may_alias::no,
786
+ DistinctPartitions>
787
+ {
788
+ // trp_0.ld_1.ipt_22.tpb_320.ns_660.dcid_7.l2w_1030 1.162087 0.997167 1.154955 1.395010
789
+ static constexpr int threads = 320;
790
+ static constexpr int nominal_4b_items = 22;
791
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
792
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA;
793
+ using delay_constructor = detail::exponential_backon_constructor_t<660, 1030>;
794
+ };
795
+
796
+ template <class Input, distinct_partitions DistinctPartitions>
797
+ struct sm100_tuning<Input,
798
+ flagged::yes,
799
+ keep_rejects::no,
800
+ offset_size::_4,
801
+ primitive::yes,
802
+ input_size::_8,
803
+ may_alias::yes,
804
+ DistinctPartitions>
805
+ {
806
+ // trp_1.ld_1.ipt_21.tpb_384.ns_1316.dcid_5.l2w_990 1.221365 1.019231 1.213141 1.372951
807
+ static constexpr int threads = 384;
808
+ static constexpr int nominal_4b_items = 21;
809
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
810
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA;
811
+ using delay_constructor = detail::exponential_backon_jitter_window_constructor_t<1316, 990>;
812
+ };
813
+
814
+ // TODO(gonidelis): Tune for I128.
815
+ #if _CCCL_HAS_INT128()
816
+ // template <>
817
+ // struct sm100_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
818
+ // {
819
+ // static constexpr int threads = 512;
820
+ // static constexpr int nominal_4b_items = 3;
821
+
822
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
823
+
824
+ // using delay_constructor = detail::fixed_delay_constructor_t<284, 1130>;
825
+ // };
826
+
827
+ // template <>
828
+ // struct sm100_tuning<__uint128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
829
+ // {
830
+ // static constexpr int threads = 512;
831
+ // static constexpr int nominal_4b_items = 3;
832
+
833
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
834
+
835
+ // using delay_constructor = detail::fixed_delay_constructor_t<284, 1130>;
836
+ // };
837
+ #endif
838
+
839
+ // partition::if
840
+ template <class Input>
841
+ struct sm100_tuning<Input,
842
+ flagged::no,
843
+ keep_rejects::yes,
844
+ offset_size::_4,
845
+ primitive::yes,
846
+ input_size::_1,
847
+ may_alias::no,
848
+ distinct_partitions::yes>
849
+ {
850
+ // trp_0.ld_0.ipt_15.tpb_608.ns_676.dcid_7.l2w_500 1.171303 1.042818 1.175890 1.456731
851
+ static constexpr int nominal_4b_items = 15;
852
+ static constexpr int threads = 608;
853
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
854
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
855
+ using delay_constructor = exponential_backon_constructor_t<676, 500>;
856
+ };
857
+
858
+ template <class Input>
859
+ struct sm100_tuning<Input,
860
+ flagged::no,
861
+ keep_rejects::yes,
862
+ offset_size::_4,
863
+ primitive::yes,
864
+ input_size::_2,
865
+ may_alias::no,
866
+ distinct_partitions::yes>
867
+ {
868
+ // trp_0.ld_0.ipt_22.tpb_320.ns_1756.dcid_6.l2w_615 1.206387 1.079118 1.202408 1.307692
869
+ static constexpr int nominal_4b_items = 22;
870
+ static constexpr int threads = 320;
871
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
872
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
873
+ using delay_constructor = exponential_backon_jitter_constructor_t<1756, 615>;
874
+ };
875
+
876
+ template <class Input>
877
+ struct sm100_tuning<Input,
878
+ flagged::no,
879
+ keep_rejects::yes,
880
+ offset_size::_4,
881
+ primitive::yes,
882
+ input_size::_4,
883
+ may_alias::no,
884
+ distinct_partitions::yes>
885
+ {
886
+ // trp_1.ld_0.ipt_19.tpb_320.ns_716.dcid_5.l2w_570 1.177521 1.123348 1.177703 1.307692
887
+ static constexpr int nominal_4b_items = 19;
888
+ static constexpr int threads = 320;
889
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
890
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
891
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<716, 570>;
892
+ };
893
+
894
+ // TODO(gonidelis): Tuning regresses for large input sizes. Find a better tuning.
895
+ // template <class Input>
896
+ // struct sm100_tuning<Input,
897
+ // flagged::no,
898
+ // keep_rejects::yes,
899
+ // offset_size::_4,
900
+ // primitive::yes,
901
+ // input_size::_8,
902
+ // may_alias::no,
903
+ // distinct_partitions::yes>
904
+ // {
905
+ // // trp_1.ld_0.ipt_20.tpb_416.ns_1672.dcid_7.l2w_1050 1.086221 0.977775 1.090731 1.257618
906
+ // static constexpr int nominal_4b_items = 20;
907
+ // static constexpr int threads = 416;
908
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
909
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
910
+ // using delay_constructor = exponential_backon_constructor_t<1672, 1050>;
911
+ // };
912
+
913
+ template <class Input>
914
+ struct sm100_tuning<Input,
915
+ flagged::no,
916
+ keep_rejects::yes,
917
+ offset_size::_8,
918
+ primitive::yes,
919
+ input_size::_1,
920
+ may_alias::no,
921
+ distinct_partitions::yes>
922
+ {
923
+ // trp_0.ld_0.ipt_22.tpb_576.ns_368.dcid_7.l2w_680 1.191750 0.990521 1.175654 1.433174
924
+ static constexpr int nominal_4b_items = 22;
925
+ static constexpr int threads = 576;
926
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
927
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
928
+ using delay_constructor = exponential_backon_constructor_t<368, 680>;
929
+ };
930
+
931
+ template <class Input>
932
+ struct sm100_tuning<Input,
933
+ flagged::no,
934
+ keep_rejects::yes,
935
+ offset_size::_8,
936
+ primitive::yes,
937
+ input_size::_2,
938
+ may_alias::no,
939
+ distinct_partitions::yes>
940
+ {
941
+ // trp_1.ld_0.ipt_20.tpb_608.ns_516.dcid_7.l2w_635 1.244961 0.848558 1.212567 1.461538
942
+ static constexpr int nominal_4b_items = 20;
943
+ static constexpr int threads = 608;
944
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
945
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
946
+ using delay_constructor = exponential_backon_jitter_constructor_t<516, 635>;
947
+ };
948
+
949
+ template <class Input>
950
+ struct sm100_tuning<Input,
951
+ flagged::no,
952
+ keep_rejects::yes,
953
+ offset_size::_8,
954
+ primitive::yes,
955
+ input_size::_4,
956
+ may_alias::no,
957
+ distinct_partitions::yes>
958
+ {
959
+ // trp_1.ld_0.ipt_18.tpb_608.ns_1712.dcid_5.l2w_825 1.255078 0.990588 1.231055 1.421176
960
+ static constexpr int nominal_4b_items = 18;
961
+ static constexpr int threads = 608;
962
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
963
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
964
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<1712, 825>;
965
+ };
966
+
967
+ // TODO(gonidelis): Tuning regresses for large input sizes. Find a better tuning.
968
+ // template <class Input>
969
+ // struct sm100_tuning<Input,
970
+ // flagged::no,
971
+ // keep_rejects::yes,
972
+ // offset_size::_8,
973
+ // primitive::yes,
974
+ // input_size::_8,
975
+ // may_alias::no,
976
+ // distinct_partitions::yes>
977
+ // {
978
+ // // trp_1.ld_0.ipt_14.tpb_512.ns_1468.dcid_7.l2w_820 1.111830 1.011070 1.119481 1.245868
979
+ // static constexpr int nominal_4b_items = 14;
980
+ // static constexpr int threads = 512;
981
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
982
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
983
+ // using delay_constructor = exponential_backon_jitter_constructor_t<1468, 820>;
984
+ // };
985
+
986
+ template <class Input>
987
+ struct sm100_tuning<Input,
988
+ flagged::no, // FlagT is NullType (see is_flagged)
989
+ keep_rejects::yes, // Impl == SelectImpl::Partition
990
+ offset_size::_4, // sizeof(OffsetT) == 4
991
+ primitive::yes, // detail::is_primitive_v<Input> is true
992
+ input_size::_1, // sizeof(Input) == 1
993
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
994
+ distinct_partitions::no> // DistinctPartitions == false
995
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
996
+ // trp_0.ld_0.ipt_22.tpb_224.ns_68.dcid_2.l2w_990 1.151989 1.064433 1.146707 1.305288
997
+ static constexpr int nominal_4b_items = 22; // ipt_22; converted with Nominal4BItemsToItems<InputT> by Policy1000
998
+ static constexpr int threads = 224; // tpb_224
999
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1000
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1001
+ using delay_constructor = exponential_backoff_constructor_t<68, 990>; // dcid_2, ns_68, l2w_990
1002
+ };
1003
+
1004
+ template <class Input>
1005
+ struct sm100_tuning<Input,
1006
+ flagged::no, // FlagT is NullType (see is_flagged)
1007
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1008
+ offset_size::_4, // sizeof(OffsetT) == 4
1009
+ primitive::yes, // detail::is_primitive_v<Input> is true
1010
+ input_size::_2, // sizeof(Input) == 2
1011
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1012
+ distinct_partitions::no> // DistinctPartitions == false
1013
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1014
+ // trp_0.ld_0.ipt_22.tpb_320.ns_560.dcid_5.l2w_640 1.205538 1.080520 1.201709 1.307692
1015
+ static constexpr int nominal_4b_items = 22; // ipt_22; converted with Nominal4BItemsToItems<InputT> by Policy1000
1016
+ static constexpr int threads = 320; // tpb_320
1017
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1018
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1019
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<560, 640>; // dcid_5, ns_560, l2w_640
1020
+ };
1021
+
1022
+ template <class Input>
1023
+ struct sm100_tuning<Input,
1024
+ flagged::no, // FlagT is NullType (see is_flagged)
1025
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1026
+ offset_size::_4, // sizeof(OffsetT) == 4
1027
+ primitive::yes, // detail::is_primitive_v<Input> is true
1028
+ input_size::_4, // sizeof(Input) == 4
1029
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1030
+ distinct_partitions::no> // DistinctPartitions == false
1031
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1032
+ // trp_1.ld_0.ipt_19.tpb_608.ns_724.dcid_5.l2w_970 1.196592 0.982227 1.177984 1.310843
1033
+ static constexpr int nominal_4b_items = 19; // ipt_19; converted with Nominal4BItemsToItems<InputT> by Policy1000
1034
+ static constexpr int threads = 608; // tpb_608
1035
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; // trp_1
1036
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1037
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<724, 970>; // dcid_5, ns_724, l2w_970
1038
+ };
1039
+
1040
+ // template <class Input>
1041
+ // struct sm100_tuning<Input,
1042
+ // flagged::no,
1043
+ // keep_rejects::yes,
1044
+ // offset_size::_4,
1045
+ // primitive::yes,
1046
+ // input_size::_8,
1047
+ // may_alias::no,
1048
+ // distinct_partitions::no>
1049
+ // {
1050
+ // // trp_1.ld_0.ipt_23.tpb_416.ns_1608.dcid_2.l2w_560 1.099752 0.977393 1.106477 1.259336
1051
+ // static constexpr int nominal_4b_items = 23;
1052
+ // static constexpr int threads = 416;
1053
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
1054
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
1055
+ // using delay_constructor = exponential_backoff_constructor_t<1608, 560>;
1056
+ // };
1057
+
1058
+ template <class Input>
1059
+ struct sm100_tuning<Input,
1060
+ flagged::no, // FlagT is NullType (see is_flagged)
1061
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1062
+ offset_size::_8, // sizeof(OffsetT) == 8
1063
+ primitive::yes, // detail::is_primitive_v<Input> is true
1064
+ input_size::_1, // sizeof(Input) == 1
1065
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1066
+ distinct_partitions::no> // DistinctPartitions == false
1067
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1068
+ // trp_0.ld_0.ipt_20.tpb_608.ns_1016.dcid_6.l2w_545 1.239144 1.002404 1.225460 1.444711
1069
+ static constexpr int nominal_4b_items = 20; // ipt_20; converted with Nominal4BItemsToItems<InputT> by Policy1000
1070
+ static constexpr int threads = 608; // tpb_608
1071
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1072
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1073
+ using delay_constructor = exponential_backon_jitter_constructor_t<1016, 545>; // dcid_6, ns_1016, l2w_545
1074
+ };
1075
+
1076
+ template <class Input>
1077
+ struct sm100_tuning<Input,
1078
+ flagged::no, // FlagT is NullType (see is_flagged)
1079
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1080
+ offset_size::_8, // sizeof(OffsetT) == 8
1081
+ primitive::yes, // detail::is_primitive_v<Input> is true
1082
+ input_size::_2, // sizeof(Input) == 2
1083
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1084
+ distinct_partitions::no> // DistinctPartitions == false
1085
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1086
+ // trp_1.ld_0.ipt_22.tpb_288.ns_124.dcid_2.l2w_690 1.202783 1.000000 1.183737 1.311755
1087
+ static constexpr int nominal_4b_items = 22; // ipt_22; converted with Nominal4BItemsToItems<InputT> by Policy1000
1088
+ static constexpr int threads = 288; // tpb_288
1089
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; // trp_1
1090
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1091
+ using delay_constructor = exponential_backoff_constructor_t<124, 690>; // dcid_2, ns_124, l2w_690
1092
+ };
1093
+
1094
+ template <class Input>
1095
+ struct sm100_tuning<Input,
1096
+ flagged::no, // FlagT is NullType (see is_flagged)
1097
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1098
+ offset_size::_8, // sizeof(OffsetT) == 8
1099
+ primitive::yes, // detail::is_primitive_v<Input> is true
1100
+ input_size::_4, // sizeof(Input) == 4
1101
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1102
+ distinct_partitions::no> // DistinctPartitions == false
1103
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1104
+ // trp_1.ld_0.ipt_19.tpb_608.ns_1884.dcid_6.l2w_950 1.250302 0.988124 1.225191 1.392931
1105
+ static constexpr int nominal_4b_items = 19; // ipt_19; converted with Nominal4BItemsToItems<InputT> by Policy1000
1106
+ static constexpr int threads = 608; // tpb_608
1107
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; // trp_1
1108
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1109
+ using delay_constructor = exponential_backon_jitter_constructor_t<1884, 950>; // dcid_6, ns_1884, l2w_950
1110
+ };
1111
+
1112
+ template <class Input>
1113
+ struct sm100_tuning<Input,
1114
+ flagged::no, // FlagT is NullType (see is_flagged)
1115
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1116
+ offset_size::_8, // sizeof(OffsetT) == 8
1117
+ primitive::yes, // detail::is_primitive_v<Input> is true
1118
+ input_size::_8, // sizeof(Input) == 8
1119
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1120
+ distinct_partitions::no> // DistinctPartitions == false
1121
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1122
+ // trp_1.ld_0.ipt_23.tpb_416.ns_0.dcid_2.l2w_1200 1.156864 1.011990 1.152368 1.266667
1123
+ static constexpr int nominal_4b_items = 23; // ipt_23; converted with Nominal4BItemsToItems<InputT> by Policy1000
1124
+ static constexpr int threads = 416; // tpb_416
1125
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; // trp_1
1126
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1127
+ using delay_constructor = exponential_backoff_constructor_t<0, 1200>; // dcid_2, ns_0, l2w_1200
1128
+ };
1129
+
1130
+ // partition::flagged
1131
+ template <class Input>
1132
+ struct sm100_tuning<Input,
1133
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1134
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1135
+ offset_size::_4, // sizeof(OffsetT) == 4
1136
+ primitive::yes, // detail::is_primitive_v<Input> is true
1137
+ input_size::_1, // sizeof(Input) == 1
1138
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1139
+ distinct_partitions::yes> // DistinctPartitions == true
1140
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1141
+ // trp_0.ld_0.ipt_20.tpb_448.ns_964.dcid_7.l2w_385 1.111204 1.036205 1.111986 1.275210
1142
+ static constexpr int nominal_4b_items = 20; // ipt_20; converted with Nominal4BItemsToItems<InputT> by Policy1000
1143
+ static constexpr int threads = 448; // tpb_448
1144
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1145
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1146
+ using delay_constructor = exponential_backon_constructor_t<964, 385>; // dcid_7, ns_964, l2w_385
1147
+ };
1148
+
1149
+ // todo(gonidelis): Tunings regress for large input sizes. Find better tunings.
1150
+ // template <class Input>
1151
+ // struct sm100_tuning<Input,
1152
+ // flagged::yes,
1153
+ // keep_rejects::yes,
1154
+ // offset_size::_4,
1155
+ // primitive::yes,
1156
+ // input_size::_2,
1157
+ // may_alias::no,
1158
+ // distinct_partitions::yes>
1159
+ // {
1160
+ // // trp_0.ld_0.ipt_18.tpb_256.ns_300.dcid_6.l2w_820 1.107466 0.923750 1.126995 1.346591
1161
+ // static constexpr int nominal_4b_items = 18;
1162
+ // static constexpr int threads = 256;
1163
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
1164
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
1165
+ // using delay_constructor = exponential_backon_jitter_constructor_t<300, 820>;
1166
+ // };
1167
+
1168
+ // template <class Input>
1169
+ // struct sm100_tuning<Input,
1170
+ // flagged::yes,
1171
+ // keep_rejects::yes,
1172
+ // offset_size::_4,
1173
+ // primitive::yes,
1174
+ // input_size::_4,
1175
+ // may_alias::no,
1176
+ // distinct_partitions::yes>
1177
+ // {
1178
+ // // trp_0.ld_0.ipt_19.tpb_256.ns_1608.dcid_7.l2w_675 1.097548 0.964114 1.109189 1.283333
1179
+ // static constexpr int nominal_4b_items = 19;
1180
+ // static constexpr int threads = 256;
1181
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
1182
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
1183
+ // using delay_constructor = exponential_backon_constructor_t<1608, 675>;
1184
+ // };
1185
+
1186
+ template <class Input>
1187
+ struct sm100_tuning<Input,
1188
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1189
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1190
+ offset_size::_4, // sizeof(OffsetT) == 4
1191
+ primitive::yes, // detail::is_primitive_v<Input> is true
1192
+ input_size::_8, // sizeof(Input) == 8
1193
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1194
+ distinct_partitions::yes> // DistinctPartitions == true
1195
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1196
+ // trp_0.ld_0.ipt_21.tpb_384.ns_300.dcid_7.l2w_580 1.239128 1.019324 1.238373 1.347458
1197
+ static constexpr int nominal_4b_items = 21; // ipt_21; converted with Nominal4BItemsToItems<InputT> by Policy1000
1198
+ static constexpr int threads = 384; // tpb_384
1199
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1200
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1201
+ using delay_constructor = exponential_backon_constructor_t<300, 580>; // dcid_7, ns_300, l2w_580
1202
+ };
1203
+
1204
+ template <class Input>
1205
+ struct sm100_tuning<Input,
1206
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1207
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1208
+ offset_size::_8, // sizeof(OffsetT) == 8
1209
+ primitive::yes, // detail::is_primitive_v<Input> is true
1210
+ input_size::_1, // sizeof(Input) == 1
1211
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1212
+ distinct_partitions::yes> // DistinctPartitions == true
1213
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1214
+ // trp_0.ld_1.ipt_20.tpb_448.ns_240.dcid_6.l2w_845 1.097180 0.990453 1.091667 1.452153
1215
+ static constexpr int nominal_4b_items = 20; // ipt_20; converted with Nominal4BItemsToItems<InputT> by Policy1000
1216
+ static constexpr int threads = 448; // tpb_448
1217
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1218
+ static constexpr CacheLoadModifier load_modifier = LOAD_CA; // ld_1
1219
+ using delay_constructor = exponential_backon_jitter_constructor_t<240, 845>; // dcid_6, ns_240, l2w_845
1220
+ };
1221
+
1222
+ template <class Input>
1223
+ struct sm100_tuning<Input,
1224
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1225
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1226
+ offset_size::_8, // sizeof(OffsetT) == 8
1227
+ primitive::yes, // detail::is_primitive_v<Input> is true
1228
+ input_size::_2, // sizeof(Input) == 2
1229
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1230
+ distinct_partitions::yes> // DistinctPartitions == true
1231
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1232
+ // trp_0.ld_0.ipt_14.tpb_320.ns_1428.dcid_7.l2w_830 1.380164 1.133333 1.367514 1.628793
1233
+ static constexpr int nominal_4b_items = 14; // ipt_14; converted with Nominal4BItemsToItems<InputT> by Policy1000
1234
+ static constexpr int threads = 320; // tpb_320
1235
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1236
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1237
+ using delay_constructor = exponential_backon_constructor_t<1428, 830>; // dcid_7, ns_1428, l2w_830
1238
+ };
1239
+
1240
+ template <class Input>
1241
+ struct sm100_tuning<Input,
1242
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1243
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1244
+ offset_size::_8, // sizeof(OffsetT) == 8
1245
+ primitive::yes, // detail::is_primitive_v<Input> is true
1246
+ input_size::_4, // sizeof(Input) == 4
1247
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1248
+ distinct_partitions::yes> // DistinctPartitions == true
1249
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1250
+ // trp_0.ld_0.ipt_14.tpb_640.ns_1204.dcid_5.l2w_635 1.155209 1.000000 1.143742 1.380659
1251
+ static constexpr int nominal_4b_items = 14; // ipt_14; converted with Nominal4BItemsToItems<InputT> by Policy1000
1252
+ static constexpr int threads = 640; // tpb_640
1253
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1254
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1255
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<1204, 635>; // dcid_5, ns_1204, l2w_635
1256
+ };
1257
+
1258
+ template <class Input>
1259
+ struct sm100_tuning<Input,
1260
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1261
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1262
+ offset_size::_8, // sizeof(OffsetT) == 8
1263
+ primitive::yes, // detail::is_primitive_v<Input> is true
1264
+ input_size::_8, // sizeof(Input) == 8
1265
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1266
+ distinct_partitions::yes> // DistinctPartitions == true
1267
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1268
+ // trp_0.ld_0.ipt_19.tpb_384.ns_1016.dcid_7.l2w_875 1.227540 1.181818 1.223936 1.261954
1269
+ static constexpr int nominal_4b_items = 19; // ipt_19; converted with Nominal4BItemsToItems<InputT> by Policy1000
1270
+ static constexpr int threads = 384; // tpb_384
1271
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1272
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1273
+ using delay_constructor = exponential_backon_constructor_t<1016, 875>; // dcid_7, ns_1016, l2w_875
1274
+ };
1275
+
1276
+ template <class Input>
1277
+ struct sm100_tuning<Input,
1278
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1279
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1280
+ offset_size::_4, // sizeof(OffsetT) == 4
1281
+ primitive::yes, // detail::is_primitive_v<Input> is true
1282
+ input_size::_1, // sizeof(Input) == 1
1283
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1284
+ distinct_partitions::no> // DistinctPartitions == false
1285
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1286
+ // trp_0.ld_0.ipt_24.tpb_256.ns_2024.dcid_5.l2w_835 1.146782 1.001841 1.149438 1.439904
1287
+ static constexpr int nominal_4b_items = 24; // ipt_24; converted with Nominal4BItemsToItems<InputT> by Policy1000
1288
+ static constexpr int threads = 256; // tpb_256
1289
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1290
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1291
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<2024, 835>; // dcid_5, ns_2024, l2w_835
1292
+ };
1293
+
1294
+ // todo(gonidelis): Tuning regresses for large input size. Find better tuning.
1295
+ // template <class Input>
1296
+ // struct sm100_tuning<Input,
1297
+ // flagged::yes,
1298
+ // keep_rejects::yes,
1299
+ // offset_size::_4,
1300
+ // primitive::yes,
1301
+ // input_size::_2,
1302
+ // may_alias::no,
1303
+ // distinct_partitions::no>
1304
+ // {
1305
+ // // trp_0.ld_0.ipt_18.tpb_256.ns_1832.dcid_5.l2w_590 1.128674 0.984403 1.150806 1.355932
1306
+ // static constexpr int nominal_4b_items = 18;
1307
+ // static constexpr int threads = 256;
1308
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
1309
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
1310
+ // using delay_constructor = exponential_backon_jitter_window_constructor_t<1832, 590>;
1311
+ // };
1312
+
1313
+ template <class Input>
1314
+ struct sm100_tuning<Input,
1315
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1316
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1317
+ offset_size::_4, // sizeof(OffsetT) == 4
1318
+ primitive::yes, // detail::is_primitive_v<Input> is true
1319
+ input_size::_4, // sizeof(Input) == 4
1320
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1321
+ distinct_partitions::no> // DistinctPartitions == false
1322
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1323
+ // trp_0.ld_0.ipt_11.tpb_448.ns_476.dcid_7.l2w_665 1.173664 1.035556 1.186114 1.393153
1324
+ static constexpr int nominal_4b_items = 11; // ipt_11; converted with Nominal4BItemsToItems<InputT> by Policy1000
1325
+ static constexpr int threads = 448; // tpb_448
1326
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1327
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1328
+ using delay_constructor = exponential_backon_constructor_t<476, 665>; // dcid_7, ns_476, l2w_665
1329
+ };
1330
+
1331
+ template <class Input>
1332
+ struct sm100_tuning<Input,
1333
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1334
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1335
+ offset_size::_4, // sizeof(OffsetT) == 4
1336
+ primitive::yes, // detail::is_primitive_v<Input> is true
1337
+ input_size::_8, // sizeof(Input) == 8
1338
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1339
+ distinct_partitions::no> // DistinctPartitions == false
1340
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1341
+ // trp_0.ld_0.ipt_20.tpb_384.ns_1420.dcid_5.l2w_525 (39_new/2.db) 1.157326 1.110920 1.162458 1.259336
1342
+ static constexpr int nominal_4b_items = 20; // ipt_20; converted with Nominal4BItemsToItems<InputT> by Policy1000
1343
+ static constexpr int threads = 384; // tpb_384
1344
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1345
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1346
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<1420, 525>; // dcid_5, ns_1420, l2w_525
1347
+ };
1348
+
1349
+ template <class Input>
1350
+ struct sm100_tuning<Input,
1351
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1352
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1353
+ offset_size::_8, // sizeof(OffsetT) == 8
1354
+ primitive::yes, // detail::is_primitive_v<Input> is true
1355
+ input_size::_1, // sizeof(Input) == 1
1356
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1357
+ distinct_partitions::no> // DistinctPartitions == false
1358
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1359
+ // trp_0.ld_0.ipt_12.tpb_256.ns_0.dcid_5.l2w_850 1.150864 1.005760 1.157687 1.395833
1360
+ static constexpr int nominal_4b_items = 12; // ipt_12; converted with Nominal4BItemsToItems<InputT> by Policy1000
1361
+ static constexpr int threads = 256; // tpb_256
1362
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1363
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1364
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<0, 850>; // dcid_5, ns_0, l2w_850
1365
+ };
1366
+
1367
+ template <class Input>
1368
+ struct sm100_tuning<Input,
1369
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1370
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1371
+ offset_size::_8, // sizeof(OffsetT) == 8
1372
+ primitive::yes, // detail::is_primitive_v<Input> is true
1373
+ input_size::_2, // sizeof(Input) == 2
1374
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1375
+ distinct_partitions::no> // DistinctPartitions == false
1376
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1377
+ // trp_0.ld_0.ipt_12.tpb_256.ns_1552.dcid_7.l2w_730 1.374892 1.171831 1.360076 1.513390
1378
+ static constexpr int nominal_4b_items = 12; // ipt_12; converted with Nominal4BItemsToItems<InputT> by Policy1000
1379
+ static constexpr int threads = 256; // tpb_256
1380
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1381
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1382
+ using delay_constructor = exponential_backon_constructor_t<1552, 730>; // dcid_7, ns_1552, l2w_730
1383
+ };
1384
+
1385
+ template <class Input>
1386
+ struct sm100_tuning<Input,
1387
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1388
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1389
+ offset_size::_8, // sizeof(OffsetT) == 8
1390
+ primitive::yes, // detail::is_primitive_v<Input> is true
1391
+ input_size::_4, // sizeof(Input) == 4
1392
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1393
+ distinct_partitions::no> // DistinctPartitions == false
1394
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1395
+ // trp_0.ld_0.ipt_14.tpb_352.ns_1444.dcid_5.l2w_655 1.183452 1.000000 1.177224 1.402083
1396
+ static constexpr int nominal_4b_items = 14; // ipt_14; converted with Nominal4BItemsToItems<InputT> by Policy1000
1397
+ static constexpr int threads = 352; // tpb_352
1398
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1399
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1400
+ using delay_constructor = exponential_backon_jitter_window_constructor_t<1444, 655>; // dcid_5, ns_1444, l2w_655
1401
+ };
1402
+
1403
+ template <class Input>
1404
+ struct sm100_tuning<Input,
1405
+ flagged::yes, // FlagT is not NullType (see is_flagged)
1406
+ keep_rejects::yes, // Impl == SelectImpl::Partition
1407
+ offset_size::_8, // sizeof(OffsetT) == 8
1408
+ primitive::yes, // detail::is_primitive_v<Input> is true
1409
+ input_size::_8, // sizeof(Input) == 8
1410
+ may_alias::no, // Impl != SelectImpl::SelectPotentiallyInPlace
1411
+ distinct_partitions::no> // DistinctPartitions == false
1412
+ { // sm100 tuning: members below are read by policy_hub::Policy1000 when this key matches
1413
+ // trp_0.ld_0.ipt_11.tpb_512.ns_536.dcid_2.l2w_845 1.248969 1.184659 1.251631 1.360795
1414
+ static constexpr int nominal_4b_items = 11; // ipt_11; converted with Nominal4BItemsToItems<InputT> by Policy1000
1415
+ static constexpr int threads = 512; // tpb_512
1416
+ static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; // trp_0
1417
+ static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; // ld_0
1418
+ using delay_constructor = exponential_backoff_constructor_t<536, 845>; // dcid_2, ns_536, l2w_845
1419
+ };
1420
+
1421
+ template <class InputT> // InputT: the type being classified
1422
+ constexpr primitive is_primitive() // compile-time tuning key: is InputT a primitive type?
1423
+ {
1424
+ return detail::is_primitive_v<InputT> ? primitive::yes : primitive::no; // lifts the bool trait into the primitive enum
1425
+ }
1426
+
1427
+ template <class FlagT> // FlagT: flag type; NullType stands for "no flags"
1428
+ constexpr flagged is_flagged() // compile-time tuning key: was a flag type supplied?
1429
+ {
1430
+ return ::cuda::std::is_same_v<FlagT, NullType> ? flagged::no : flagged::yes; // only an exact NullType yields flagged::no
1431
+ }
1432
+
1433
+ template <bool KeepRejects> // policy_hub passes Impl == SelectImpl::Partition here
1434
+ constexpr keep_rejects are_rejects_kept() // compile-time tuning key: bool -> keep_rejects enum
1435
+ {
1436
+ return KeepRejects ? keep_rejects::yes : keep_rejects::no; // true -> yes, false -> no
1437
+ }
1438
+
1439
+ template <class InputT> // InputT: the type whose size is classified
1440
+ constexpr input_size classify_input_size() // compile-time tuning key derived from sizeof(InputT)
1441
+ {
1442
+ return sizeof(InputT) == 1 ? input_size::_1 // 1 byte
1443
+ : sizeof(InputT) == 2 ? input_size::_2 // 2 bytes
1444
+ : sizeof(InputT) == 4 ? input_size::_4 // 4 bytes
1445
+ : sizeof(InputT) == 8 ? input_size::_8 // 8 bytes
1446
+ : sizeof(InputT) == 16 // 16 bytes
1447
+ ? input_size::_16
1448
+ : input_size::unknown; // any other size gets the catch-all value
1449
+ }
1450
+
1451
+ template <class OffsetT> // OffsetT: the offset type whose size is classified
1452
+ constexpr offset_size classify_offset_size() // compile-time tuning key derived from sizeof(OffsetT)
1453
+ {
1454
+ return sizeof(OffsetT) == 4 ? offset_size::_4 : sizeof(OffsetT) == 8 ? offset_size::_8 : offset_size::unknown; // only 4- and 8-byte offsets are distinguished
1455
+ }
1456
+
1457
+ template <bool Alias> // policy_hub passes Impl == SelectImpl::SelectPotentiallyInPlace here
1458
+ constexpr may_alias should_alias() // compile-time tuning key: bool -> may_alias enum
1459
+ {
1460
+ return Alias ? may_alias::yes : may_alias::no; // true -> yes, false -> no
1461
+ }
1462
+
1463
+ template <bool DistinctPartitions> // forwarded unchanged from policy_hub's template parameter
1464
+ constexpr distinct_partitions is_distinct_partitions() // compile-time tuning key: bool -> distinct_partitions enum
1465
+ {
1466
+ return DistinctPartitions ? distinct_partitions::yes : distinct_partitions::no; // true -> yes, false -> no
1467
+ }
1468
+
1469
+ template <class InputT, class FlagT, class OffsetT, bool DistinctPartitions, SelectImpl Impl>
1470
+ struct policy_hub // chain of per-architecture policies, each exposing a SelectIfPolicyT; MaxPolicy names the newest entry
1471
+ {
1472
+ static constexpr bool may_alias = Impl == SelectImpl::SelectPotentiallyInPlace; // true only for the potentially-in-place implementation
1473
+ static constexpr bool keep_rejects = Impl == SelectImpl::Partition; // true only for the partition implementation
1474
+ // Fallback agent policy: 128 threads, direct block loads, warp scans, fixed delay <350, 450>; only the load modifier varies.
1475
+ template <CacheLoadModifier LoadModifier>
1476
+ struct DefaultPolicy
1477
+ {
1478
+ static constexpr int nominal_4B_items_per_thread = 10; // baseline for a 4-byte InputT
1479
+ static constexpr int items_per_thread = // 10 * 4 / sizeof(InputT), clamped to [1, 10]
1480
+ ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread);
1481
+ using SelectIfPolicyT =
1482
+ AgentSelectIfPolicy<128,
1483
+ items_per_thread,
1484
+ BLOCK_LOAD_DIRECT,
1485
+ LoadModifier,
1486
+ BLOCK_SCAN_WARP_SCANS,
1487
+ detail::fixed_delay_constructor_t<350, 450>>;
1488
+ };
1489
+ // Chain entry 500: default policy; LOAD_CA when may_alias is set, LOAD_LDG otherwise. Names itself as its own predecessor.
1490
+ struct Policy500
1491
+ : DefaultPolicy<may_alias ? LOAD_CA : LOAD_LDG>
1492
+ , ChainedPolicy<500, Policy500, Policy500>
1493
+ {};
1494
+ // Overload pair used by Policy800/Policy900 via decltype(select_agent_policy<Tuning>(0)); declarations only, never defined.
1495
+ // Use values from tuning if a specialization exists, otherwise pick the default
1496
+ template <typename Tuning>
1497
+ static auto select_agent_policy(int) // exact match for the literal 0; drops out when Tuning lacks any member named below
1498
+ -> AgentSelectIfPolicy<Tuning::threads,
1499
+ Tuning::items,
1500
+ Tuning::load_algorithm,
1501
+ LOAD_DEFAULT, // fixed here; a Tuning::load_modifier is not consulted by this overload
1502
+ BLOCK_SCAN_WARP_SCANS,
1503
+ typename Tuning::delay_constructor>;
1504
+ template <typename Tuning>
1505
+ static auto select_agent_policy(long) -> typename DefaultPolicy<LOAD_DEFAULT>::SelectIfPolicyT; // fallback; needs an int -> long conversion, so it ranks below the overload above
1506
+ // Chain entry 800: sm80_tuning values if usable, else DefaultPolicy<LOAD_DEFAULT>.
1507
+ struct Policy800 : ChainedPolicy<800, Policy800, Policy500>
1508
+ {
1509
+ using SelectIfPolicyT =
1510
+ decltype(select_agent_policy<sm80_tuning<InputT,
1511
+ is_flagged<FlagT>(),
1512
+ are_rejects_kept<keep_rejects>(),
1513
+ offset_size::_4, // before SM100, we only tuned for int32
1514
+ is_primitive<InputT>(),
1515
+ classify_input_size<InputT>()>>(0));
1516
+ };
1517
+ // Chain entry 860: no tuning lookup; same default policy selection as Policy500.
1518
+ struct Policy860
1519
+ : DefaultPolicy<may_alias ? LOAD_CA : LOAD_LDG>
1520
+ , ChainedPolicy<860, Policy860, Policy800>
1521
+ {};
1522
+ // Chain entry 900: sm90_tuning values if usable, else DefaultPolicy<LOAD_DEFAULT>.
1523
+ struct Policy900 : ChainedPolicy<900, Policy900, Policy860>
1524
+ {
1525
+ using SelectIfPolicyT =
1526
+ decltype(select_agent_policy<sm90_tuning<InputT,
1527
+ is_flagged<FlagT>(),
1528
+ are_rejects_kept<keep_rejects>(),
1529
+ offset_size::_4, // before SM100, we only tuned for int32
1530
+ is_primitive<InputT>(),
1531
+ classify_input_size<InputT>()>>(0));
1532
+ };
1533
+ // Chain entry 1000: sm100_tuning values if usable, else whatever Policy900 selected.
1534
+ struct Policy1000 : ChainedPolicy<1000, Policy1000, Policy900>
1535
+ {
1536
+ // Use values from tuning if a specialization exists, otherwise pick Policy900
1537
+ template <typename Tuning>
1538
+ static auto select_agent_policy100(int) // exact match for the literal 0; drops out when Tuning lacks any member named below
1539
+ -> AgentSelectIfPolicy<Tuning::threads,
1540
+ Nominal4BItemsToItems<InputT>(Tuning::nominal_4b_items), // sm100 tunings store a nominal 4-byte count, converted here
1541
+ Tuning::load_algorithm,
1542
+ Tuning::load_modifier, // unlike select_agent_policy, the tuned load modifier is honoured
1543
+ BLOCK_SCAN_WARP_SCANS,
1544
+ typename Tuning::delay_constructor>;
1545
+ template <typename Tuning>
1546
+ static auto select_agent_policy100(long) -> typename Policy900::SelectIfPolicyT; // fallback overload
1547
+ // The sm100 key adds the offset size, may_alias and distinct_partitions dimensions to the pre-SM100 key.
1548
+ using SelectIfPolicyT =
1549
+ decltype(select_agent_policy100<sm100_tuning<InputT,
1550
+ is_flagged<FlagT>(),
1551
+ are_rejects_kept<keep_rejects>(),
1552
+ classify_offset_size<OffsetT>(),
1553
+ is_primitive<InputT>(),
1554
+ classify_input_size<InputT>(),
1555
+ should_alias<may_alias>(),
1556
+ is_distinct_partitions<DistinctPartitions>()>>(0));
1557
+ };
1558
+ // Newest entry of the chain.
1559
+ using MaxPolicy = Policy1000;
1560
+ };
1561
+ } // namespace detail::select
1562
+
1563
+ CUB_NAMESPACE_END