cuda-cccl 0.3.0-cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2-cp310-cp310-manylinux_2_24_aarch64.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of cuda-cccl might be problematic.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cuda/__device/arch_traits.h

@@ -11,7 +11,7 @@
  #ifndef _CUDA___DEVICE_ARCH_TRAITS_H
  #define _CUDA___DEVICE_ARCH_TRAITS_H

- #include <cuda/__cccl_config>
+ #include <cuda/std/detail/__config>

  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
  # pragma GCC system_header
@@ -21,116 +21,93 @@
  # pragma system_header
  #endif // no system header

- #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
- # include <cuda/__device/attributes.h>
+ #if _CCCL_HAS_CTK()
+
+ # include <cuda/__device/arch_id.h>
+ # include <cuda/__device/compute_capability.h>
+ # include <cuda/__fwd/devices.h>
  # include <cuda/std/__exception/cuda_error.h>
+ # include <cuda/std/__type_traits/always_false.h>
+ # include <cuda/std/cstdint>
  # include <cuda/std/limits>

  # include <cuda/std/__cccl/prologue.h>

  _CCCL_BEGIN_NAMESPACE_CUDA
- namespace arch
- {
-
- inline constexpr int __arch_specific_id_multiplier = 100000;

- // @brief Architecture identifier
- // This type identifies an architecture. It has more possible entries than just numeric values of the compute
- // capability. For example, sm_90 and sm_90a have the same compute capability, but the identifier is different.
- enum class id : int
- {
- sm_60 = 60,
- sm_61 = 61,
- sm_70 = 70,
- sm_75 = 75,
- sm_80 = 80,
- sm_86 = 86,
- sm_89 = 89,
- sm_90 = 90,
- sm_100 = 100,
- sm_103 = 103,
- sm_110 = 110,
- sm_120 = 120,
- sm_90a = 90 * __arch_specific_id_multiplier,
- sm_100a = 100 * __arch_specific_id_multiplier,
- sm_103a = 103 * __arch_specific_id_multiplier,
- sm_110a = 110 * __arch_specific_id_multiplier,
- sm_120a = 120 * __arch_specific_id_multiplier,
- };
-
- // @brief Architecture traits
- // This type contains information about an architecture that is constant across devices of that architecture.
- struct traits_t
+ //! @brief Architecture traits
+ //! This type contains information about an architecture that is constant across devices of that architecture.
+ struct arch_traits_t
  {
  // Maximum number of threads per block
- const int max_threads_per_block = 1024;
+ int max_threads_per_block;

  // Maximum x-dimension of a block
- const int max_block_dim_x = 1024;
+ int max_block_dim_x;

  // Maximum y-dimension of a block
- const int max_block_dim_y = 1024;
+ int max_block_dim_y;

  // Maximum z-dimension of a block
- const int max_block_dim_z = 64;
+ int max_block_dim_z;

  // Maximum x-dimension of a grid
- const int max_grid_dim_x = ::cuda::std::numeric_limits<int32_t>::max();
+ int max_grid_dim_x;

  // Maximum y-dimension of a grid
- const int max_grid_dim_y = 64 * 1024 - 1;
+ int max_grid_dim_y;

  // Maximum z-dimension of a grid
- const int max_grid_dim_z = 64 * 1024 - 1;
+ int max_grid_dim_z;

  // Maximum amount of shared memory available to a thread block in bytes
- const int max_shared_memory_per_block = 48 * 1024;
+ ::cuda::std::size_t max_shared_memory_per_block;

  // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
- const int total_constant_memory = 64 * 1024;
+ ::cuda::std::size_t total_constant_memory;

  // Warp size in threads
- const int warp_size = 32;
+ int warp_size;

  // Maximum number of concurrent grids on the device
- const int max_resident_grids = 128;
+ int max_resident_grids;

  // true if the device can concurrently copy memory between host and device
  // while executing a kernel, or false if not
- const bool gpu_overlap = true;
+ bool gpu_overlap;

  // true if the device can map host memory into CUDA address space
- const bool can_map_host_memory = true;
+ bool can_map_host_memory;

  // true if the device supports executing multiple kernels within the same
  // context simultaneously, or false if not. It is not guaranteed that multiple
  // kernels will be resident on the device concurrently so this feature should
  // not be relied upon for correctness.
- const bool concurrent_kernels = true;
+ bool concurrent_kernels;

  // true if the device supports stream priorities, or false if not
- const bool stream_priorities_supported = true;
+ bool stream_priorities_supported;

  // true if device supports caching globals in L1 cache, false if not
- const bool global_l1_cache_supported = true;
+ bool global_l1_cache_supported;

  // true if device supports caching locals in L1 cache, false if not
- const bool local_l1_cache_supported = true;
+ bool local_l1_cache_supported;

  // TODO: We might want to have these per-arch
  // Maximum number of 32-bit registers available to a thread block
- const int max_registers_per_block = 64 * 1024;
+ int max_registers_per_block;

  // Maximum number of 32-bit registers available to a multiprocessor; this
  // number is shared by all thread blocks simultaneously resident on a
  // multiprocessor
- const int max_registers_per_multiprocessor = 64 * 1024;
+ int max_registers_per_multiprocessor;

  // Maximum number of 32-bit registers available to a thread
- const int max_registers_per_thread = 255;
+ int max_registers_per_thread;

  // Identifier for the architecture
- id arch_id;
+ ::cuda::arch_id arch_id;

  // Major compute capability version number
  int compute_capability_major;
@@ -139,12 +116,12 @@ struct traits_t
  int compute_capability_minor;

  // Compute capability version number in 100 * major + 10 * minor format
- int compute_capability;
+ ::cuda::compute_capability compute_capability;

  // Maximum amount of shared memory available to a multiprocessor in bytes;
  // this amount is shared by all thread blocks simultaneously resident on a
  // multiprocessor
- int max_shared_memory_per_multiprocessor;
+ ::cuda::std::size_t max_shared_memory_per_multiprocessor;

  // Maximum number of thread blocks that can reside on a multiprocessor
  int max_blocks_per_multiprocessor;
@@ -156,11 +133,11 @@ struct traits_t
  int max_warps_per_multiprocessor;

  // Shared memory reserved by CUDA driver per block in bytes
- int reserved_shared_memory_per_block;
+ ::cuda::std::size_t reserved_shared_memory_per_block;

  // Maximum per block shared memory size on the device. This value can be opted
  // into when using dynamic_shared_memory with NonPortableSize set to true
- int max_shared_memory_per_block_optin;
+ ::cuda::std::size_t max_shared_memory_per_block_optin;

  // TODO: Do we want these?:
  // true if architecture supports clusters
@@ -179,65 +156,81 @@ struct traits_t
  bool tma_supported;
  };

- // @brief Architecture traits
- // Template function that returns the traits for an architecture with a given id.
- template <id _Id>
- [[nodiscard]] _CCCL_HOST_DEVICE constexpr traits_t traits();
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t __common_arch_traits(arch_id __arch_id) noexcept
+ {
+ const compute_capability __cc{__arch_id};
+
+ arch_traits_t __traits{};
+ __traits.max_threads_per_block = 1024;
+ __traits.max_block_dim_x = 1024;
+ __traits.max_block_dim_y = 1024;
+ __traits.max_block_dim_z = 64;
+ __traits.max_grid_dim_x = ::cuda::std::numeric_limits<::cuda::std::int32_t>::max();
+ __traits.max_grid_dim_y = 64 * 1024 - 1;
+ __traits.max_grid_dim_z = 64 * 1024 - 1;
+ __traits.max_shared_memory_per_block = 48 * 1024;
+ __traits.total_constant_memory = 64 * 1024;
+ __traits.warp_size = 32;
+ __traits.max_resident_grids = 128;
+ __traits.gpu_overlap = true;
+ __traits.can_map_host_memory = true;
+ __traits.concurrent_kernels = true;
+ __traits.stream_priorities_supported = true;
+ __traits.global_l1_cache_supported = true;
+ __traits.local_l1_cache_supported = true;
+ __traits.max_registers_per_block = 64 * 1024;
+ __traits.max_registers_per_multiprocessor = 64 * 1024;
+ __traits.max_registers_per_thread = 255;
+ __traits.arch_id = __arch_id;
+ __traits.compute_capability_major = __cc.major();
+ __traits.compute_capability_minor = __cc.minor();
+ __traits.compute_capability = __cc;
+ // __traits.max_shared_memory_per_multiprocessor; // set up individually
+ // __traits.max_blocks_per_multiprocessor; // set up individually
+ // __traits.max_threads_per_multiprocessor; // set up individually
+ // __traits.max_warps_per_multiprocessor; // set up individually
+ __traits.reserved_shared_memory_per_block = (__cc >= compute_capability{80}) ? 1024 : 0;
+ // __traits.max_shared_memory_per_block_optin; // set up individually
+ __traits.cluster_supported = (__cc >= compute_capability{90});
+ __traits.redux_intrinisic = (__cc >= compute_capability{80});
+ __traits.elect_intrinsic = (__cc >= compute_capability{90});
+ __traits.cp_async_supported = (__cc >= compute_capability{80});
+ __traits.tma_supported = (__cc >= compute_capability{90});
+ return __traits;
+ }
+
+ //! @brief Gets the architecture traits for the given architecture id \c _Id.
+ template <arch_id _Id>
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits() noexcept;

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_60>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_60>() noexcept
  {
- traits_t __traits{};
- __traits.arch_id = id::sm_60;
- __traits.compute_capability_major = 6;
- __traits.compute_capability_minor = 0;
- __traits.compute_capability = 60;
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_60);
  __traits.max_shared_memory_per_multiprocessor = 64 * 1024;
  __traits.max_blocks_per_multiprocessor = 32;
  __traits.max_threads_per_multiprocessor = 2048;
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
- __traits.reserved_shared_memory_per_block = 0;
  __traits.max_shared_memory_per_block_optin = 48 * 1024;
-
- __traits.cluster_supported = false;
- __traits.redux_intrinisic = false;
- __traits.elect_intrinsic = false;
- __traits.cp_async_supported = false;
- __traits.tma_supported = false;
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_61>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_61>() noexcept
  {
- traits_t __traits{};
- __traits.arch_id = id::sm_61;
- __traits.compute_capability_major = 6;
- __traits.compute_capability_minor = 1;
- __traits.compute_capability = 61;
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_61);
  __traits.max_shared_memory_per_multiprocessor = 96 * 1024;
  __traits.max_blocks_per_multiprocessor = 32;
  __traits.max_threads_per_multiprocessor = 2048;
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
- __traits.reserved_shared_memory_per_block = 0;
  __traits.max_shared_memory_per_block_optin = 48 * 1024;
-
- __traits.cluster_supported = false;
- __traits.redux_intrinisic = false;
- __traits.elect_intrinsic = false;
- __traits.cp_async_supported = false;
- __traits.tma_supported = false;
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_70>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_70>() noexcept
  {
- traits_t __traits{};
- __traits.arch_id = id::sm_70;
- __traits.compute_capability_major = 7;
- __traits.compute_capability_minor = 0;
- __traits.compute_capability = 70;
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_70);
  __traits.max_shared_memory_per_multiprocessor = 96 * 1024;
  __traits.max_blocks_per_multiprocessor = 32;
  __traits.max_threads_per_multiprocessor = 2048;
@@ -245,369 +238,300 @@ template <>
  __traits.reserved_shared_memory_per_block = 0;
  __traits.max_shared_memory_per_block_optin =
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
-
- __traits.cluster_supported = false;
- __traits.redux_intrinisic = false;
- __traits.elect_intrinsic = false;
- __traits.cp_async_supported = false;
- __traits.tma_supported = false;
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_75>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_75>() noexcept
  {
- traits_t __traits{};
- __traits.arch_id = id::sm_75;
- __traits.compute_capability_major = 7;
- __traits.compute_capability_minor = 5;
- __traits.compute_capability = 75;
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_75);
  __traits.max_shared_memory_per_multiprocessor = 64 * 1024;
  __traits.max_blocks_per_multiprocessor = 16;
  __traits.max_threads_per_multiprocessor = 1024;
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
- __traits.reserved_shared_memory_per_block = 0;
  __traits.max_shared_memory_per_block_optin =
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
-
- __traits.cluster_supported = false;
- __traits.redux_intrinisic = false;
- __traits.elect_intrinsic = false;
- __traits.cp_async_supported = false;
- __traits.tma_supported = false;
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_80>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_80>() noexcept
  {
- traits_t __traits{};
- __traits.arch_id = id::sm_80;
- __traits.compute_capability_major = 8;
- __traits.compute_capability_minor = 0;
- __traits.compute_capability = 80;
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_80);
  __traits.max_shared_memory_per_multiprocessor = 164 * 1024;
  __traits.max_blocks_per_multiprocessor = 32;
  __traits.max_threads_per_multiprocessor = 2048;
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
- __traits.reserved_shared_memory_per_block = 1024;
  __traits.max_shared_memory_per_block_optin =
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
-
- __traits.cluster_supported = false;
- __traits.redux_intrinisic = true;
- __traits.elect_intrinsic = false;
- __traits.cp_async_supported = true;
- __traits.tma_supported = false;
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_86>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_86>() noexcept
  {
- traits_t __traits{};
- __traits.arch_id = id::sm_86;
- __traits.compute_capability_major = 8;
- __traits.compute_capability_minor = 6;
- __traits.compute_capability = 86;
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_86);
  __traits.max_shared_memory_per_multiprocessor = 100 * 1024;
  __traits.max_blocks_per_multiprocessor = 16;
  __traits.max_threads_per_multiprocessor = 1536;
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
- __traits.reserved_shared_memory_per_block = 1024;
  __traits.max_shared_memory_per_block_optin =
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
+ return __traits;
+ };

- __traits.cluster_supported = false;
- __traits.redux_intrinisic = true;
- __traits.elect_intrinsic = false;
- __traits.cp_async_supported = true;
- __traits.tma_supported = false;
+ template <>
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_87>() noexcept
+ {
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_87);
+ __traits.max_shared_memory_per_multiprocessor = 164 * 1024;
+ __traits.max_blocks_per_multiprocessor = 16;
+ __traits.max_threads_per_multiprocessor = 1536;
+ __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
+ __traits.max_shared_memory_per_block_optin =
+ __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
+ return __traits;
+ };
+
+ template <>
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_88>() noexcept
+ {
+ auto __traits = ::cuda::arch_traits<arch_id::sm_86>();
+ __traits.arch_id = arch_id::sm_88;
+ __traits.compute_capability_major = 8;
+ __traits.compute_capability_minor = 8;
+ __traits.compute_capability = compute_capability{88};
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_89>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_89>() noexcept
  {
- traits_t __traits{};
- __traits.arch_id = id::sm_89;
- __traits.compute_capability_major = 8;
- __traits.compute_capability_minor = 9;
- __traits.compute_capability = 89;
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_89);
  __traits.max_shared_memory_per_multiprocessor = 100 * 1024;
  __traits.max_blocks_per_multiprocessor = 24;
  __traits.max_threads_per_multiprocessor = 1536;
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
- __traits.reserved_shared_memory_per_block = 1024;
  __traits.max_shared_memory_per_block_optin =
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
-
- __traits.cluster_supported = false;
- __traits.redux_intrinisic = true;
- __traits.elect_intrinsic = false;
- __traits.cp_async_supported = true;
- __traits.tma_supported = false;
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_90>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_90>() noexcept
  {
- traits_t __traits{};
- __traits.arch_id = id::sm_90;
- __traits.compute_capability_major = 9;
- __traits.compute_capability_minor = 0;
- __traits.compute_capability = 90;
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_90);
  __traits.max_shared_memory_per_multiprocessor = 228 * 1024;
  __traits.max_blocks_per_multiprocessor = 32;
  __traits.max_threads_per_multiprocessor = 2048;
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
- __traits.reserved_shared_memory_per_block = 1024;
  __traits.max_shared_memory_per_block_optin =
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
-
- __traits.cluster_supported = true;
- __traits.redux_intrinisic = true;
- __traits.elect_intrinsic = true;
- __traits.cp_async_supported = true;
- __traits.tma_supported = true;
  return __traits;
  };

  // No sm_90a specific fields for now.
  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_90a>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_90a>() noexcept
  {
- return ::cuda::arch::traits<id::sm_90>();
+ auto __traits = ::cuda::arch_traits<arch_id::sm_90>();
+ __traits.arch_id = arch_id::sm_90a;
+ return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_100>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_100>() noexcept
  {
- traits_t __traits{};
- __traits.arch_id = id::sm_100;
- __traits.compute_capability_major = 10;
- __traits.compute_capability_minor = 0;
- __traits.compute_capability = 100;
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_90);
  __traits.max_shared_memory_per_multiprocessor = 228 * 1024;
  __traits.max_blocks_per_multiprocessor = 32;
  __traits.max_threads_per_multiprocessor = 2048;
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
- __traits.reserved_shared_memory_per_block = 1024;
  __traits.max_shared_memory_per_block_optin =
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
-
- __traits.cluster_supported = true;
- __traits.redux_intrinisic = true;
- __traits.elect_intrinsic = true;
- __traits.cp_async_supported = true;
- __traits.tma_supported = true;
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_100a>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_100a>() noexcept
  {
- return ::cuda::arch::traits<id::sm_100>();
+ auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
+ __traits.arch_id = arch_id::sm_100a;
+ return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_103>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_103>() noexcept
  {
- traits_t __traits = ::cuda::arch::traits<id::sm_100>();
- __traits.arch_id = id::sm_103;
+ auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
+ __traits.arch_id = arch_id::sm_103;
  __traits.compute_capability_major = 10;
  __traits.compute_capability_minor = 3;
- __traits.compute_capability = 103;
+ __traits.compute_capability = compute_capability{103};
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_103a>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_103a>() noexcept
  {
- return ::cuda::arch::traits<id::sm_103>();
+ auto __traits = ::cuda::arch_traits<arch_id::sm_103>();
+ __traits.arch_id = arch_id::sm_103a;
+ return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_110>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_110>() noexcept
  {
- traits_t __traits = ::cuda::arch::traits<id::sm_100>();
- __traits.arch_id = id::sm_110;
- __traits.compute_capability_major = 11;
- __traits.compute_capability_minor = 0;
- __traits.compute_capability = 110;
+ auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
+ __traits.arch_id = arch_id::sm_110;
+ __traits.compute_capability_major = 11;
+ __traits.compute_capability_minor = 0;
+ __traits.compute_capability = compute_capability{110};
+ __traits.max_blocks_per_multiprocessor = 24;
+ __traits.max_threads_per_multiprocessor = 1536;
+ __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_110a>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_110a>() noexcept
  {
- return ::cuda::arch::traits<id::sm_110>();
+ auto __traits = ::cuda::arch_traits<arch_id::sm_110>();
+ __traits.arch_id = arch_id::sm_110a;
+ return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_120>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_120>() noexcept
  {
- traits_t __traits{};
- __traits.arch_id = id::sm_120;
- __traits.compute_capability_major = 12;
- __traits.compute_capability_minor = 0;
- __traits.compute_capability = 120;
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_120);
  __traits.max_shared_memory_per_multiprocessor = 100 * 1024;
- __traits.max_blocks_per_multiprocessor = 32;
+ __traits.max_blocks_per_multiprocessor = 24;
  __traits.max_threads_per_multiprocessor = 1536;
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
- __traits.reserved_shared_memory_per_block = 1024;
  __traits.max_shared_memory_per_block_optin =
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
+ return __traits;
+ };

- __traits.cluster_supported = true;
- __traits.redux_intrinisic = true;
- __traits.elect_intrinsic = true;
- __traits.cp_async_supported = true;
- __traits.tma_supported = true;
+ template <>
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_120a>() noexcept
+ {
+ auto __traits = ::cuda::arch_traits<arch_id::sm_120>();
+ __traits.arch_id = arch_id::sm_120a;
  return __traits;
  };

  template <>
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_120a>()
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_121>() noexcept
  {
- return ::cuda::arch::traits<id::sm_120>();
+ auto __traits = ::cuda::arch_traits<arch_id::sm_120>();
+ __traits.arch_id = arch_id::sm_121;
+ __traits.compute_capability_major = 12;
+ __traits.compute_capability_minor = 1;
+ __traits.compute_capability = compute_capability{121};
+ return __traits;
  };

- inline constexpr int __highest_known_arch = 120;
+ template <>
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_121a>() noexcept
+ {
+ auto __traits = ::cuda::arch_traits<arch_id::sm_121>();
+ __traits.arch_id = arch_id::sm_121a;
+ return __traits;
+ };

- [[nodiscard]] _CCCL_API inline constexpr traits_t traits_for_id(id __id)
+ //! @brief Gets the architecture traits for the given architecture id \c __id.
+ //!
+ //! @throws \c cuda::cuda_error if the \c __id is not a known architecture.
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits_for(arch_id __id)
  {
  switch (__id)
  {
- case id::sm_60:
- return ::cuda::arch::traits<id::sm_60>();
- case id::sm_61:
- return ::cuda::arch::traits<id::sm_61>();
- case id::sm_70:
- return ::cuda::arch::traits<id::sm_70>();
- case id::sm_75:
- return ::cuda::arch::traits<id::sm_75>();
- case id::sm_80:
- return ::cuda::arch::traits<id::sm_80>();
- case id::sm_86:
- return ::cuda::arch::traits<id::sm_86>();
- case id::sm_89:
- return ::cuda::arch::traits<id::sm_89>();
- case id::sm_90:
- return ::cuda::arch::traits<id::sm_90>();
- case id::sm_90a:
- return ::cuda::arch::traits<id::sm_90a>();
- case id::sm_100:
- return ::cuda::arch::traits<id::sm_100>();
- case id::sm_100a:
- return ::cuda::arch::traits<id::sm_100a>();
- case id::sm_103:
- return ::cuda::arch::traits<id::sm_103>();
- case id::sm_103a:
- return ::cuda::arch::traits<id::sm_103a>();
- case id::sm_110:
- return ::cuda::arch::traits<id::sm_110>();
- case id::sm_110a:
- return ::cuda::arch::traits<id::sm_110a>();
- case id::sm_120:
- return ::cuda::arch::traits<id::sm_120>();
- case id::sm_120a:
- return ::cuda::arch::traits<id::sm_120a>();
+ case arch_id::sm_60:
+ return ::cuda::arch_traits<arch_id::sm_60>();
+ case arch_id::sm_61:
+ return ::cuda::arch_traits<arch_id::sm_61>();
+ case arch_id::sm_70:
+ return ::cuda::arch_traits<arch_id::sm_70>();
+ case arch_id::sm_75:
+ return ::cuda::arch_traits<arch_id::sm_75>();
+ case arch_id::sm_80:
+ return ::cuda::arch_traits<arch_id::sm_80>();
+ case arch_id::sm_86:
+ return ::cuda::arch_traits<arch_id::sm_86>();
+ case arch_id::sm_87:
+ return ::cuda::arch_traits<arch_id::sm_87>();
+ case arch_id::sm_88:
+ return ::cuda::arch_traits<arch_id::sm_88>();
+ case arch_id::sm_89:
+ return ::cuda::arch_traits<arch_id::sm_89>();
+ case arch_id::sm_90:
+ return ::cuda::arch_traits<arch_id::sm_90>();
+ case arch_id::sm_90a:
+ return ::cuda::arch_traits<arch_id::sm_90a>();
+ case arch_id::sm_100:
+ return ::cuda::arch_traits<arch_id::sm_100>();
+ case arch_id::sm_100a:
+ return ::cuda::arch_traits<arch_id::sm_100a>();
+ case arch_id::sm_103:
+ return ::cuda::arch_traits<arch_id::sm_103>();
+ case arch_id::sm_103a:
+ return ::cuda::arch_traits<arch_id::sm_103a>();
+ case arch_id::sm_110:
+ return ::cuda::arch_traits<arch_id::sm_110>();
+ case arch_id::sm_110a:
+ return ::cuda::arch_traits<arch_id::sm_110a>();
+ case arch_id::sm_120:
+ return ::cuda::arch_traits<arch_id::sm_120>();
+ case arch_id::sm_120a:
+ return ::cuda::arch_traits<arch_id::sm_120a>();
+ case arch_id::sm_121:
+ return ::cuda::arch_traits<arch_id::sm_121>();
+ case arch_id::sm_121a:
+ return ::cuda::arch_traits<arch_id::sm_121a>();
  default:
  ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture");
  break;
  }
  }

- [[nodiscard]] _CCCL_API inline constexpr id id_for_compute_capability(int compute_capability)
+ //! @brief Gets the architecture traits for the given compute capability \c __cc.
+ //!
+ //! @throws \c cuda::cuda_error if the \c __cc doesn't have a corresponding architecture id.
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits_for(compute_capability __cc)
  {
- if (compute_capability < 60 || compute_capability > __highest_known_arch)
- {
- ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
- }
- return static_cast<id>(compute_capability);
+ return ::cuda::arch_traits_for(::cuda::to_arch_id(__cc));
  }

- [[nodiscard]] _CCCL_API inline constexpr traits_t traits_for_compute_capability(int compute_capability)
- {
- return ::cuda::arch::traits_for_id(::cuda::arch::id_for_compute_capability(compute_capability));
- }
+ _CCCL_END_NAMESPACE_CUDA

- _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
- {
- switch (value)
- {
- case 90:
- return id::sm_90a;
- case 100:
- return id::sm_100a;
- case 103:
- return id::sm_103a;
- case 110:
- return id::sm_110a;
- case 120:
- return id::sm_120a;
- default:
- ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
- break;
- }
- }
+ # if _CCCL_CUDA_COMPILATION()

- //! @brief Provides architecture traits of the architecture matching __CUDA_ARCH__ macro
- [[nodiscard]] _CCCL_DEVICE inline constexpr arch::traits_t current_traits()
- {
- // fixme: this doesn't work with nvc++ -cuda
- # ifdef __CUDA_ARCH__
- # ifdef __CUDA_ARCH_SPECIFIC__
- return ::cuda::arch::traits_for_id(::cuda::arch::__special_id_for_compute_capability(__CUDA_ARCH_SPECIFIC__ / 10));
- # else
- return ::cuda::arch::traits_for_compute_capability(__CUDA_ARCH__ / 10);
- # endif // __CUDA_ARCH_SPECIFIC__
- # else // __CUDA_ARCH__
- // Should be unreachable in __device__ function
- return ::cuda::arch::traits_t{};
- # endif // __CUDA_ARCH__
- }
+ _CCCL_BEGIN_NAMESPACE_CUDA_DEVICE

- [[nodiscard]] inline constexpr arch::traits_t
- __arch_traits_might_be_unknown(int __device, unsigned int __compute_capability)
+ //! @brief Returns the \c cuda::arch_trait_t of the architecture that is currently being compiled.
+ //!
+ //! If the current architecture is not a known architecture from \c cuda::arch_id enumeration, the compilation
+ //! will fail.
+ //!
+ //! @note This API cannot be used in constexpr context when compiling with nvc++ in CUDA mode.
+ template <class _Dummy = void>
+ [[nodiscard]] _CCCL_DEVICE_API _CCCL_TARGET_CONSTEXPR ::cuda::arch_traits_t current_arch_traits() noexcept
  {
- if (__compute_capability <= arch::__highest_known_arch)
- {
- return ::cuda::arch::traits_for_compute_capability(__compute_capability);
- }
- else
- {
- // If the architecture is unknown, we need to craft the arch_traits from attributes
- arch::traits_t __traits{};
- __traits.compute_capability_major = __compute_capability / 10;
- __traits.compute_capability_minor = __compute_capability % 10;
- __traits.compute_capability = __compute_capability;
- __traits.max_shared_memory_per_multiprocessor =
- ::cuda::device_attributes::max_shared_memory_per_multiprocessor(__device);
- __traits.max_blocks_per_multiprocessor = ::cuda::device_attributes::max_blocks_per_multiprocessor(__device);
- __traits.max_threads_per_multiprocessor = ::cuda::device_attributes::max_threads_per_multiprocessor(__device);
- __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
- __traits.reserved_shared_memory_per_block = ::cuda::device_attributes::reserved_shared_memory_per_block(__device);
- __traits.max_shared_memory_per_block_optin =
- __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
-
- __traits.cluster_supported = __compute_capability >= 90;
- __traits.redux_intrinisic = __compute_capability >= 80;
- __traits.elect_intrinsic = __compute_capability >= 90;
- __traits.cp_async_supported = __compute_capability >= 80;
- __traits.tma_supported = __compute_capability >= 90;
- return __traits;
- }
+ # if _CCCL_DEVICE_COMPILATION()
+ return ::cuda::arch_traits_for(::cuda::device::current_arch_id<_Dummy>());
+ # else // ^^^ _CCCL_DEVICE_COMPILATION() ^^^ / vvv !_CCCL_DEVICE_COMPILATION() vvv
+ return {};
+ # endif // ^^^ !_CCCL_DEVICE_COMPILATION() ^^^
  }
- } // namespace arch

- _CCCL_END_NAMESPACE_CUDA
+ _CCCL_END_NAMESPACE_CUDA_DEVICE
+
+ # endif // _CCCL_CUDA_COMPILATION

  # include <cuda/std/__cccl/epilogue.h>

- #endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+ #endif // _CCCL_HAS_CTK()

  #endif // _CUDA___DEVICE_ARCH_TRAITS_H
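The hunks above replace the old cuda::arch namespace (cuda::arch::traits<cuda::arch::id::sm_XX>(), traits_for_id, traits_for_compute_capability) with top-level cuda::arch_traits<cuda::arch_id::sm_XX>(), cuda::arch_traits_for(arch_id), and cuda::arch_traits_for(compute_capability), and move the current-architecture query into cuda::device::current_arch_traits(). The snippet below is a minimal, hypothetical host-side sketch of the renamed API assembled only from declarations visible in this diff; the include path is the internal header under diff rather than a documented public entry point, and building it assumes a CUDA Toolkit installation so that _CCCL_HAS_CTK() holds.

#include <cuda/__device/arch_traits.h> // internal header shown in this diff; assumed reachable, not a documented entry point
#include <cstddef>
#include <cstdio>

int main()
{
  // 0.3.0 spelling: cuda::arch::traits<cuda::arch::id::sm_90>()
  // 0.3.2 spelling: cuda::arch_traits<cuda::arch_id::sm_90>()
  constexpr auto sm90 = cuda::arch_traits<cuda::arch_id::sm_90>();
  static_assert(sm90.cluster_supported, "per this diff, sm_90 sets cluster_supported");

  // Lookup by compute capability now takes the strongly typed cuda::compute_capability
  // instead of a raw int (the old id_for_compute_capability/traits_for_compute_capability are gone).
  const auto sm86 = cuda::arch_traits_for(cuda::compute_capability{86});

  std::printf("sm_90: %d threads/block, %zu bytes smem/SM; sm_86: %d blocks/SM\n",
              sm90.max_threads_per_block,
              static_cast<std::size_t>(sm90.max_shared_memory_per_multiprocessor),
              sm86.max_blocks_per_multiprocessor);
  return 0;
}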