cuda-cccl 0.3.0-cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2-cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic; see the registry's advisory page for more details.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA___DEVICE_ATTRIBUTES_H
12
12
  #define _CUDA___DEVICE_ATTRIBUTES_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -23,29 +23,27 @@
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
+ # include <cuda/__device/compute_capability.h>
26
27
  # include <cuda/__device/device_ref.h>
27
28
  # include <cuda/__driver/driver_api.h>
28
- # include <cuda/std/__cccl/attributes.h>
29
- # include <cuda/std/__cuda/api_wrapper.h>
29
+ # include <cuda/__fwd/devices.h>
30
+ # include <cuda/std/__cstddef/types.h>
30
31
 
31
32
  # include <cuda/std/__cccl/prologue.h>
32
33
 
33
34
  _CCCL_BEGIN_NAMESPACE_CUDA
34
35
 
35
- namespace __detail
36
- {
37
-
38
36
  template <::cudaDeviceAttr _Attr, typename _Type>
39
37
  struct __dev_attr_impl
40
38
  {
41
39
  using type = _Type;
42
40
 
43
- [[nodiscard]] constexpr operator ::cudaDeviceAttr() const noexcept
41
+ [[nodiscard]] _CCCL_HOST_API constexpr operator ::cudaDeviceAttr() const noexcept
44
42
  {
45
43
  return _Attr;
46
44
  }
47
45
 
48
- [[nodiscard]] type operator()(device_ref __dev) const
46
+ [[nodiscard]] _CCCL_HOST_API type operator()(device_ref __dev) const
49
47
  {
50
48
  return static_cast<type>(::cuda::__driver::__deviceGetAttribute(
51
49
  static_cast<::CUdevice_attribute>(_Attr), ::cuda::__driver::__deviceGet(__dev.get())));
@@ -55,13 +53,36 @@ struct __dev_attr_impl
55
53
  template <::cudaDeviceAttr _Attr>
56
54
  struct __dev_attr : __dev_attr_impl<_Attr, int>
57
55
  {};
58
-
56
+ template <>
57
+ struct __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock> //
58
+ : __dev_attr_impl<::cudaDevAttrMaxSharedMemoryPerBlock, ::cuda::std::size_t>
59
+ {};
60
+ template <>
61
+ struct __dev_attr<::cudaDevAttrTotalConstantMemory> //
62
+ : __dev_attr_impl<::cudaDevAttrTotalConstantMemory, ::cuda::std::size_t>
63
+ {};
64
+ template <>
65
+ struct __dev_attr<::cudaDevAttrMaxPitch> //
66
+ : __dev_attr_impl<::cudaDevAttrMaxPitch, ::cuda::std::size_t>
67
+ {};
68
+ template <>
69
+ struct __dev_attr<::cudaDevAttrMaxTexture2DLinearPitch> //
70
+ : __dev_attr_impl<::cudaDevAttrMaxTexture2DLinearPitch, ::cuda::std::size_t>
71
+ {};
59
72
  // TODO: give this a strong type for kilohertz
60
73
  template <>
61
74
  struct __dev_attr<::cudaDevAttrClockRate> //
62
75
  : __dev_attr_impl<::cudaDevAttrClockRate, int>
63
76
  {};
64
77
  template <>
78
+ struct __dev_attr<::cudaDevAttrTextureAlignment> //
79
+ : __dev_attr_impl<::cudaDevAttrTextureAlignment, ::cuda::std::size_t>
80
+ {};
81
+ template <>
82
+ struct __dev_attr<::cudaDevAttrTexturePitchAlignment> //
83
+ : __dev_attr_impl<::cudaDevAttrTexturePitchAlignment, ::cuda::std::size_t>
84
+ {};
85
+ template <>
65
86
  struct __dev_attr<::cudaDevAttrGpuOverlap> //
66
87
  : __dev_attr_impl<::cudaDevAttrGpuOverlap, bool>
67
88
  {};
@@ -107,10 +128,9 @@ template <>
107
128
  struct __dev_attr<::cudaDevAttrGlobalMemoryBusWidth> //
108
129
  : __dev_attr_impl<::cudaDevAttrGlobalMemoryBusWidth, int>
109
130
  {};
110
- // TODO: give this a strong type for bytes
111
131
  template <>
112
132
  struct __dev_attr<::cudaDevAttrL2CacheSize> //
113
- : __dev_attr_impl<::cudaDevAttrL2CacheSize, int>
133
+ : __dev_attr_impl<::cudaDevAttrL2CacheSize, ::cuda::std::size_t>
114
134
  {};
115
135
  template <>
116
136
  struct __dev_attr<::cudaDevAttrUnifiedAddressing> //
@@ -129,6 +149,10 @@ struct __dev_attr<::cudaDevAttrLocalL1CacheSupported> //
129
149
  : __dev_attr_impl<::cudaDevAttrLocalL1CacheSupported, bool>
130
150
  {};
131
151
  template <>
152
+ struct __dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor> //
153
+ : __dev_attr_impl<::cudaDevAttrMaxSharedMemoryPerMultiprocessor, ::cuda::std::size_t>
154
+ {};
155
+ template <>
132
156
  struct __dev_attr<::cudaDevAttrManagedMemory> //
133
157
  : __dev_attr_impl<::cudaDevAttrManagedMemory, bool>
134
158
  {};
@@ -173,6 +197,22 @@ struct __dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost> //
173
197
  : __dev_attr_impl<::cudaDevAttrDirectManagedMemAccessFromHost, bool>
174
198
  {};
175
199
  template <>
200
+ struct __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin> //
201
+ : __dev_attr_impl<::cudaDevAttrMaxSharedMemoryPerBlockOptin, ::cuda::std::size_t>
202
+ {};
203
+ template <>
204
+ struct __dev_attr<::cudaDevAttrMaxPersistingL2CacheSize> //
205
+ : __dev_attr_impl<::cudaDevAttrMaxPersistingL2CacheSize, ::cuda::std::size_t>
206
+ {};
207
+ template <>
208
+ struct __dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize> //
209
+ : __dev_attr_impl<::cudaDevAttrMaxAccessPolicyWindowSize, ::cuda::std::size_t>
210
+ {};
211
+ template <>
212
+ struct __dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock> //
213
+ : __dev_attr_impl<::cudaDevAttrReservedSharedMemoryPerBlock, ::cuda::std::size_t>
214
+ {};
215
+ template <>
176
216
  struct __dev_attr<::cudaDevAttrSparseCudaArraySupported> //
177
217
  : __dev_attr_impl<::cudaDevAttrSparseCudaArraySupported, bool>
178
218
  {};
@@ -239,463 +279,460 @@ struct __dev_attr<::cudaDevAttrNumaConfig> //
239
279
  };
240
280
  # endif // _CCCL_CTK_AT_LEAST(12, 2)
241
281
 
242
- } // namespace __detail
243
-
244
282
  namespace device_attributes
245
283
  {
246
284
  // Maximum number of threads per block
247
- using max_threads_per_block_t = __detail::__dev_attr<::cudaDevAttrMaxThreadsPerBlock>;
285
+ using max_threads_per_block_t = __dev_attr<::cudaDevAttrMaxThreadsPerBlock>;
248
286
  static constexpr max_threads_per_block_t max_threads_per_block{};
249
287
 
250
288
  // Maximum x-dimension of a block
251
- using max_block_dim_x_t = __detail::__dev_attr<::cudaDevAttrMaxBlockDimX>;
289
+ using max_block_dim_x_t = __dev_attr<::cudaDevAttrMaxBlockDimX>;
252
290
  static constexpr max_block_dim_x_t max_block_dim_x{};
253
291
 
254
292
  // Maximum y-dimension of a block
255
- using max_block_dim_y_t = __detail::__dev_attr<::cudaDevAttrMaxBlockDimY>;
293
+ using max_block_dim_y_t = __dev_attr<::cudaDevAttrMaxBlockDimY>;
256
294
  static constexpr max_block_dim_y_t max_block_dim_y{};
257
295
 
258
296
  // Maximum z-dimension of a block
259
- using max_block_dim_z_t = __detail::__dev_attr<::cudaDevAttrMaxBlockDimZ>;
297
+ using max_block_dim_z_t = __dev_attr<::cudaDevAttrMaxBlockDimZ>;
260
298
  static constexpr max_block_dim_z_t max_block_dim_z{};
261
299
 
262
300
  // Maximum x-dimension of a grid
263
- using max_grid_dim_x_t = __detail::__dev_attr<::cudaDevAttrMaxGridDimX>;
301
+ using max_grid_dim_x_t = __dev_attr<::cudaDevAttrMaxGridDimX>;
264
302
  static constexpr max_grid_dim_x_t max_grid_dim_x{};
265
303
 
266
304
  // Maximum y-dimension of a grid
267
- using max_grid_dim_y_t = __detail::__dev_attr<::cudaDevAttrMaxGridDimY>;
305
+ using max_grid_dim_y_t = __dev_attr<::cudaDevAttrMaxGridDimY>;
268
306
  static constexpr max_grid_dim_y_t max_grid_dim_y{};
269
307
 
270
308
  // Maximum z-dimension of a grid
271
- using max_grid_dim_z_t = __detail::__dev_attr<::cudaDevAttrMaxGridDimZ>;
309
+ using max_grid_dim_z_t = __dev_attr<::cudaDevAttrMaxGridDimZ>;
272
310
  static constexpr max_grid_dim_z_t max_grid_dim_z{};
273
311
 
274
312
  // Maximum amount of shared memory available to a thread block in bytes
275
- using max_shared_memory_per_block_t = __detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock>;
313
+ using max_shared_memory_per_block_t = __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock>;
276
314
  static constexpr max_shared_memory_per_block_t max_shared_memory_per_block{};
277
315
 
278
316
  // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
279
- using total_constant_memory_t = __detail::__dev_attr<::cudaDevAttrTotalConstantMemory>;
317
+ using total_constant_memory_t = __dev_attr<::cudaDevAttrTotalConstantMemory>;
280
318
  static constexpr total_constant_memory_t total_constant_memory{};
281
319
 
282
320
  // Warp size in threads
283
- using warp_size_t = __detail::__dev_attr<::cudaDevAttrWarpSize>;
321
+ using warp_size_t = __dev_attr<::cudaDevAttrWarpSize>;
284
322
  static constexpr warp_size_t warp_size{};
285
323
 
286
324
  // Maximum pitch in bytes allowed by the memory copy functions that involve
287
325
  // memory regions allocated through cudaMallocPitch()
288
- using max_pitch_t = __detail::__dev_attr<::cudaDevAttrMaxPitch>;
326
+ using max_pitch_t = __dev_attr<::cudaDevAttrMaxPitch>;
289
327
  static constexpr max_pitch_t max_pitch{};
290
328
 
291
329
  // Maximum 1D texture width
292
- using max_texture_1d_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture1DWidth>;
330
+ using max_texture_1d_width_t = __dev_attr<::cudaDevAttrMaxTexture1DWidth>;
293
331
  static constexpr max_texture_1d_width_t max_texture_1d_width{};
294
332
 
295
333
  // Maximum width for a 1D texture bound to linear memory
296
- using max_texture_1d_linear_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture1DLinearWidth>;
334
+ using max_texture_1d_linear_width_t = __dev_attr<::cudaDevAttrMaxTexture1DLinearWidth>;
297
335
  static constexpr max_texture_1d_linear_width_t max_texture_1d_linear_width{};
298
336
 
299
337
  // Maximum mipmapped 1D texture width
300
- using max_texture_1d_mipmapped_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture1DMipmappedWidth>;
338
+ using max_texture_1d_mipmapped_width_t = __dev_attr<::cudaDevAttrMaxTexture1DMipmappedWidth>;
301
339
  static constexpr max_texture_1d_mipmapped_width_t max_texture_1d_mipmapped_width{};
302
340
 
303
341
  // Maximum 2D texture width
304
- using max_texture_2d_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DWidth>;
342
+ using max_texture_2d_width_t = __dev_attr<::cudaDevAttrMaxTexture2DWidth>;
305
343
  static constexpr max_texture_2d_width_t max_texture_2d_width{};
306
344
 
307
345
  // Maximum 2D texture height
308
- using max_texture_2d_height_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DHeight>;
346
+ using max_texture_2d_height_t = __dev_attr<::cudaDevAttrMaxTexture2DHeight>;
309
347
  static constexpr max_texture_2d_height_t max_texture_2d_height{};
310
348
 
311
349
  // Maximum width for a 2D texture bound to linear memory
312
- using max_texture_2d_linear_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearWidth>;
350
+ using max_texture_2d_linear_width_t = __dev_attr<::cudaDevAttrMaxTexture2DLinearWidth>;
313
351
  static constexpr max_texture_2d_linear_width_t max_texture_2d_linear_width{};
314
352
 
315
353
  // Maximum height for a 2D texture bound to linear memory
316
- using max_texture_2d_linear_height_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearHeight>;
354
+ using max_texture_2d_linear_height_t = __dev_attr<::cudaDevAttrMaxTexture2DLinearHeight>;
317
355
  static constexpr max_texture_2d_linear_height_t max_texture_2d_linear_height{};
318
356
 
319
357
  // Maximum pitch in bytes for a 2D texture bound to linear memory
320
- using max_texture_2d_linear_pitch_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearPitch>;
358
+ using max_texture_2d_linear_pitch_t = __dev_attr<::cudaDevAttrMaxTexture2DLinearPitch>;
321
359
  static constexpr max_texture_2d_linear_pitch_t max_texture_2d_linear_pitch{};
322
360
 
323
361
  // Maximum mipmapped 2D texture width
324
- using max_texture_2d_mipmapped_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DMipmappedWidth>;
362
+ using max_texture_2d_mipmapped_width_t = __dev_attr<::cudaDevAttrMaxTexture2DMipmappedWidth>;
325
363
  static constexpr max_texture_2d_mipmapped_width_t max_texture_2d_mipmapped_width{};
326
364
 
327
365
  // Maximum mipmapped 2D texture height
328
- using max_texture_2d_mipmapped_height_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DMipmappedHeight>;
366
+ using max_texture_2d_mipmapped_height_t = __dev_attr<::cudaDevAttrMaxTexture2DMipmappedHeight>;
329
367
  static constexpr max_texture_2d_mipmapped_height_t max_texture_2d_mipmapped_height{};
330
368
 
331
369
  // Maximum 3D texture width
332
- using max_texture_3d_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DWidth>;
370
+ using max_texture_3d_width_t = __dev_attr<::cudaDevAttrMaxTexture3DWidth>;
333
371
  static constexpr max_texture_3d_width_t max_texture_3d_width{};
334
372
 
335
373
  // Maximum 3D texture height
336
- using max_texture_3d_height_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DHeight>;
374
+ using max_texture_3d_height_t = __dev_attr<::cudaDevAttrMaxTexture3DHeight>;
337
375
  static constexpr max_texture_3d_height_t max_texture_3d_height{};
338
376
 
339
377
  // Maximum 3D texture depth
340
- using max_texture_3d_depth_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DDepth>;
378
+ using max_texture_3d_depth_t = __dev_attr<::cudaDevAttrMaxTexture3DDepth>;
341
379
  static constexpr max_texture_3d_depth_t max_texture_3d_depth{};
342
380
 
343
381
  // Alternate maximum 3D texture width, 0 if no alternate maximum 3D texture size is supported
344
- using max_texture_3d_width_alt_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DWidthAlt>;
382
+ using max_texture_3d_width_alt_t = __dev_attr<::cudaDevAttrMaxTexture3DWidthAlt>;
345
383
  static constexpr max_texture_3d_width_alt_t max_texture_3d_width_alt{};
346
384
 
347
385
  // Alternate maximum 3D texture height, 0 if no alternate maximum 3D texture size is supported
348
- using max_texture_3d_height_alt_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DHeightAlt>;
386
+ using max_texture_3d_height_alt_t = __dev_attr<::cudaDevAttrMaxTexture3DHeightAlt>;
349
387
  static constexpr max_texture_3d_height_alt_t max_texture_3d_height_alt{};
350
388
 
351
389
  // Alternate maximum 3D texture depth, 0 if no alternate maximum 3D texture size is supported
352
- using max_texture_3d_depth_alt_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DDepthAlt>;
390
+ using max_texture_3d_depth_alt_t = __dev_attr<::cudaDevAttrMaxTexture3DDepthAlt>;
353
391
  static constexpr max_texture_3d_depth_alt_t max_texture_3d_depth_alt{};
354
392
 
355
393
  // Maximum cubemap texture width or height
356
- using max_texture_cubemap_width_t = __detail::__dev_attr<::cudaDevAttrMaxTextureCubemapWidth>;
394
+ using max_texture_cubemap_width_t = __dev_attr<::cudaDevAttrMaxTextureCubemapWidth>;
357
395
  static constexpr max_texture_cubemap_width_t max_texture_cubemap_width{};
358
396
 
359
397
  // Maximum 1D layered texture width
360
- using max_texture_1d_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture1DLayeredWidth>;
398
+ using max_texture_1d_layered_width_t = __dev_attr<::cudaDevAttrMaxTexture1DLayeredWidth>;
361
399
  static constexpr max_texture_1d_layered_width_t max_texture_1d_layered_width{};
362
400
 
363
401
  // Maximum layers in a 1D layered texture
364
- using max_texture_1d_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxTexture1DLayeredLayers>;
402
+ using max_texture_1d_layered_layers_t = __dev_attr<::cudaDevAttrMaxTexture1DLayeredLayers>;
365
403
  static constexpr max_texture_1d_layered_layers_t max_texture_1d_layered_layers{};
366
404
 
367
405
  // Maximum 2D layered texture width
368
- using max_texture_2d_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredWidth>;
406
+ using max_texture_2d_layered_width_t = __dev_attr<::cudaDevAttrMaxTexture2DLayeredWidth>;
369
407
  static constexpr max_texture_2d_layered_width_t max_texture_2d_layered_width{};
370
408
 
371
409
  // Maximum 2D layered texture height
372
- using max_texture_2d_layered_height_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredHeight>;
410
+ using max_texture_2d_layered_height_t = __dev_attr<::cudaDevAttrMaxTexture2DLayeredHeight>;
373
411
  static constexpr max_texture_2d_layered_height_t max_texture_2d_layered_height{};
374
412
 
375
413
  // Maximum layers in a 2D layered texture
376
- using max_texture_2d_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredLayers>;
414
+ using max_texture_2d_layered_layers_t = __dev_attr<::cudaDevAttrMaxTexture2DLayeredLayers>;
377
415
  static constexpr max_texture_2d_layered_layers_t max_texture_2d_layered_layers{};
378
416
 
379
417
  // Maximum cubemap layered texture width or height
380
- using max_texture_cubemap_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxTextureCubemapLayeredWidth>;
418
+ using max_texture_cubemap_layered_width_t = __dev_attr<::cudaDevAttrMaxTextureCubemapLayeredWidth>;
381
419
  static constexpr max_texture_cubemap_layered_width_t max_texture_cubemap_layered_width{};
382
420
 
383
421
  // Maximum layers in a cubemap layered texture
384
- using max_texture_cubemap_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxTextureCubemapLayeredLayers>;
422
+ using max_texture_cubemap_layered_layers_t = __dev_attr<::cudaDevAttrMaxTextureCubemapLayeredLayers>;
385
423
  static constexpr max_texture_cubemap_layered_layers_t max_texture_cubemap_layered_layers{};
386
424
 
387
425
  // Maximum 1D surface width
388
- using max_surface_1d_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurface1DWidth>;
426
+ using max_surface_1d_width_t = __dev_attr<::cudaDevAttrMaxSurface1DWidth>;
389
427
  static constexpr max_surface_1d_width_t max_surface_1d_width{};
390
428
 
391
429
  // Maximum 2D surface width
392
- using max_surface_2d_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurface2DWidth>;
430
+ using max_surface_2d_width_t = __dev_attr<::cudaDevAttrMaxSurface2DWidth>;
393
431
  static constexpr max_surface_2d_width_t max_surface_2d_width{};
394
432
 
395
433
  // Maximum 2D surface height
396
- using max_surface_2d_height_t = __detail::__dev_attr<::cudaDevAttrMaxSurface2DHeight>;
434
+ using max_surface_2d_height_t = __dev_attr<::cudaDevAttrMaxSurface2DHeight>;
397
435
  static constexpr max_surface_2d_height_t max_surface_2d_height{};
398
436
 
399
437
  // Maximum 3D surface width
400
- using max_surface_3d_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurface3DWidth>;
438
+ using max_surface_3d_width_t = __dev_attr<::cudaDevAttrMaxSurface3DWidth>;
401
439
  static constexpr max_surface_3d_width_t max_surface_3d_width{};
402
440
 
403
441
  // Maximum 3D surface height
404
- using max_surface_3d_height_t = __detail::__dev_attr<::cudaDevAttrMaxSurface3DHeight>;
442
+ using max_surface_3d_height_t = __dev_attr<::cudaDevAttrMaxSurface3DHeight>;
405
443
  static constexpr max_surface_3d_height_t max_surface_3d_height{};
406
444
 
407
445
  // Maximum 3D surface depth
408
- using max_surface_3d_depth_t = __detail::__dev_attr<::cudaDevAttrMaxSurface3DDepth>;
446
+ using max_surface_3d_depth_t = __dev_attr<::cudaDevAttrMaxSurface3DDepth>;
409
447
  static constexpr max_surface_3d_depth_t max_surface_3d_depth{};
410
448
 
411
449
  // Maximum 1D layered surface width
412
- using max_surface_1d_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurface1DLayeredWidth>;
450
+ using max_surface_1d_layered_width_t = __dev_attr<::cudaDevAttrMaxSurface1DLayeredWidth>;
413
451
  static constexpr max_surface_1d_layered_width_t max_surface_1d_layered_width{};
414
452
 
415
453
  // Maximum layers in a 1D layered surface
416
- using max_surface_1d_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxSurface1DLayeredLayers>;
454
+ using max_surface_1d_layered_layers_t = __dev_attr<::cudaDevAttrMaxSurface1DLayeredLayers>;
417
455
  static constexpr max_surface_1d_layered_layers_t max_surface_1d_layered_layers{};
418
456
 
419
457
  // Maximum 2D layered surface width
420
- using max_surface_2d_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredWidth>;
458
+ using max_surface_2d_layered_width_t = __dev_attr<::cudaDevAttrMaxSurface2DLayeredWidth>;
421
459
  static constexpr max_surface_2d_layered_width_t max_surface_2d_layered_width{};
422
460
 
423
461
  // Maximum 2D layered surface height
424
- using max_surface_2d_layered_height_t = __detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredHeight>;
462
+ using max_surface_2d_layered_height_t = __dev_attr<::cudaDevAttrMaxSurface2DLayeredHeight>;
425
463
  static constexpr max_surface_2d_layered_height_t max_surface_2d_layered_height{};
426
464
 
427
465
  // Maximum layers in a 2D layered surface
428
- using max_surface_2d_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredLayers>;
466
+ using max_surface_2d_layered_layers_t = __dev_attr<::cudaDevAttrMaxSurface2DLayeredLayers>;
429
467
  static constexpr max_surface_2d_layered_layers_t max_surface_2d_layered_layers{};
430
468
 
431
469
  // Maximum cubemap surface width
432
- using max_surface_cubemap_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapWidth>;
470
+ using max_surface_cubemap_width_t = __dev_attr<::cudaDevAttrMaxSurfaceCubemapWidth>;
433
471
  static constexpr max_surface_cubemap_width_t max_surface_cubemap_width{};
434
472
 
435
473
  // Maximum cubemap layered surface width
436
- using max_surface_cubemap_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredWidth>;
474
+ using max_surface_cubemap_layered_width_t = __dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredWidth>;
437
475
  static constexpr max_surface_cubemap_layered_width_t max_surface_cubemap_layered_width{};
438
476
 
439
477
  // Maximum layers in a cubemap layered surface
440
- using max_surface_cubemap_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredLayers>;
478
+ using max_surface_cubemap_layered_layers_t = __dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredLayers>;
441
479
  static constexpr max_surface_cubemap_layered_layers_t max_surface_cubemap_layered_layers{};
442
480
 
443
481
  // Maximum number of 32-bit registers available to a thread block
444
- using max_registers_per_block_t = __detail::__dev_attr<::cudaDevAttrMaxRegistersPerBlock>;
482
+ using max_registers_per_block_t = __dev_attr<::cudaDevAttrMaxRegistersPerBlock>;
445
483
  static constexpr max_registers_per_block_t max_registers_per_block{};
446
484
 
447
485
  // Peak clock frequency in kilohertz
448
- using clock_rate_t = __detail::__dev_attr<::cudaDevAttrClockRate>;
486
+ using clock_rate_t = __dev_attr<::cudaDevAttrClockRate>;
449
487
  static constexpr clock_rate_t clock_rate{};
450
488
 
451
489
  // Alignment requirement; texture base addresses aligned to textureAlign bytes
452
490
  // do not need an offset applied to texture fetches
453
- using texture_alignment_t = __detail::__dev_attr<::cudaDevAttrTextureAlignment>;
491
+ using texture_alignment_t = __dev_attr<::cudaDevAttrTextureAlignment>;
454
492
  static constexpr texture_alignment_t texture_alignment{};
455
493
 
456
494
  // Pitch alignment requirement for 2D texture references bound to pitched memory
457
- using texture_pitch_alignment_t = __detail::__dev_attr<::cudaDevAttrTexturePitchAlignment>;
495
+ using texture_pitch_alignment_t = __dev_attr<::cudaDevAttrTexturePitchAlignment>;
458
496
  static constexpr texture_pitch_alignment_t texture_pitch_alignment{};
459
497
 
460
498
  // true if the device can concurrently copy memory between host and device
461
499
  // while executing a kernel, or false if not
462
- using gpu_overlap_t = __detail::__dev_attr<::cudaDevAttrGpuOverlap>;
500
+ using gpu_overlap_t = __dev_attr<::cudaDevAttrGpuOverlap>;
463
501
  static constexpr gpu_overlap_t gpu_overlap{};
464
502
 
465
503
  // Number of multiprocessors on the device
466
- using multiprocessor_count_t = __detail::__dev_attr<::cudaDevAttrMultiProcessorCount>;
504
+ using multiprocessor_count_t = __dev_attr<::cudaDevAttrMultiProcessorCount>;
467
505
  static constexpr multiprocessor_count_t multiprocessor_count{};
468
506
 
469
507
  // true if there is a run time limit for kernels executed on the device, or
470
508
  // false if not
471
- using kernel_exec_timeout_t = __detail::__dev_attr<::cudaDevAttrKernelExecTimeout>;
509
+ using kernel_exec_timeout_t = __dev_attr<::cudaDevAttrKernelExecTimeout>;
472
510
  static constexpr kernel_exec_timeout_t kernel_exec_timeout{};
473
511
 
474
512
  // true if the device is integrated with the memory subsystem, or false if not
475
- using integrated_t = __detail::__dev_attr<::cudaDevAttrIntegrated>;
513
+ using integrated_t = __dev_attr<::cudaDevAttrIntegrated>;
476
514
  static constexpr integrated_t integrated{};
477
515
 
478
516
  // true if the device can map host memory into CUDA address space
479
- using can_map_host_memory_t = __detail::__dev_attr<::cudaDevAttrCanMapHostMemory>;
517
+ using can_map_host_memory_t = __dev_attr<::cudaDevAttrCanMapHostMemory>;
480
518
  static constexpr can_map_host_memory_t can_map_host_memory{};
481
519
 
482
520
  // Compute mode is the compute mode that the device is currently in.
483
- using compute_mode_t = __detail::__dev_attr<::cudaDevAttrComputeMode>;
521
+ using compute_mode_t = __dev_attr<::cudaDevAttrComputeMode>;
484
522
  static constexpr compute_mode_t compute_mode{};
485
523
 
486
524
  // true if the device supports executing multiple kernels within the same
487
525
  // context simultaneously, or false if not. It is not guaranteed that multiple
488
526
  // kernels will be resident on the device concurrently so this feature should
489
527
  // not be relied upon for correctness.
490
- using concurrent_kernels_t = __detail::__dev_attr<::cudaDevAttrConcurrentKernels>;
528
+ using concurrent_kernels_t = __dev_attr<::cudaDevAttrConcurrentKernels>;
491
529
  static constexpr concurrent_kernels_t concurrent_kernels{};
492
530
 
493
531
  // true if error correction is enabled on the device, 0 if error correction is
494
532
  // disabled or not supported by the device
495
- using ecc_enabled_t = __detail::__dev_attr<::cudaDevAttrEccEnabled>;
533
+ using ecc_enabled_t = __dev_attr<::cudaDevAttrEccEnabled>;
496
534
  static constexpr ecc_enabled_t ecc_enabled{};
497
535
 
498
536
  // PCI bus identifier of the device
499
- using pci_bus_id_t = __detail::__dev_attr<::cudaDevAttrPciBusId>;
537
+ using pci_bus_id_t = __dev_attr<::cudaDevAttrPciBusId>;
500
538
  static constexpr pci_bus_id_t pci_bus_id{};
501
539
 
502
540
  // PCI device (also known as slot) identifier of the device
503
- using pci_device_id_t = __detail::__dev_attr<::cudaDevAttrPciDeviceId>;
541
+ using pci_device_id_t = __dev_attr<::cudaDevAttrPciDeviceId>;
504
542
  static constexpr pci_device_id_t pci_device_id{};
505
543
 
506
544
  // true if the device is using a TCC driver. TCC is only available on Tesla
507
545
  // hardware running Windows Vista or later.
508
- using tcc_driver_t = __detail::__dev_attr<::cudaDevAttrTccDriver>;
546
+ using tcc_driver_t = __dev_attr<::cudaDevAttrTccDriver>;
509
547
  static constexpr tcc_driver_t tcc_driver{};
510
548
 
511
549
  // Peak memory clock frequency in kilohertz
512
- using memory_clock_rate_t = __detail::__dev_attr<::cudaDevAttrMemoryClockRate>;
550
+ using memory_clock_rate_t = __dev_attr<::cudaDevAttrMemoryClockRate>;
513
551
  static constexpr memory_clock_rate_t memory_clock_rate{};
514
552
 
515
553
  // Global memory bus width in bits
516
- using global_memory_bus_width_t = __detail::__dev_attr<::cudaDevAttrGlobalMemoryBusWidth>;
554
+ using global_memory_bus_width_t = __dev_attr<::cudaDevAttrGlobalMemoryBusWidth>;
517
555
  static constexpr global_memory_bus_width_t global_memory_bus_width{};
518
556
 
519
557
  // Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
520
- using l2_cache_size_t = __detail::__dev_attr<::cudaDevAttrL2CacheSize>;
558
+ using l2_cache_size_t = __dev_attr<::cudaDevAttrL2CacheSize>;
521
559
  static constexpr l2_cache_size_t l2_cache_size{};
522
560
 
523
561
  // Maximum resident threads per multiprocessor
524
- using max_threads_per_multiprocessor_t = __detail::__dev_attr<::cudaDevAttrMaxThreadsPerMultiProcessor>;
562
+ using max_threads_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxThreadsPerMultiProcessor>;
525
563
  static constexpr max_threads_per_multiprocessor_t max_threads_per_multiprocessor{};
526
564
 
527
565
  // true if the device shares a unified address space with the host, or false
528
566
  // if not
529
- using unified_addressing_t = __detail::__dev_attr<::cudaDevAttrUnifiedAddressing>;
567
+ using unified_addressing_t = __dev_attr<::cudaDevAttrUnifiedAddressing>;
530
568
  static constexpr unified_addressing_t unified_addressing{};
531
569
 
532
570
  // Major compute capability version number
533
- using compute_capability_major_t = __detail::__dev_attr<::cudaDevAttrComputeCapabilityMajor>;
571
+ using compute_capability_major_t = __dev_attr<::cudaDevAttrComputeCapabilityMajor>;
534
572
  static constexpr compute_capability_major_t compute_capability_major{};
535
573
 
536
574
  // Minor compute capability version number
537
- using compute_capability_minor_t = __detail::__dev_attr<::cudaDevAttrComputeCapabilityMinor>;
575
+ using compute_capability_minor_t = __dev_attr<::cudaDevAttrComputeCapabilityMinor>;
538
576
  static constexpr compute_capability_minor_t compute_capability_minor{};
539
577
 
540
578
  // true if the device supports stream priorities, or false if not
541
- using stream_priorities_supported_t = __detail::__dev_attr<::cudaDevAttrStreamPrioritiesSupported>;
579
+ using stream_priorities_supported_t = __dev_attr<::cudaDevAttrStreamPrioritiesSupported>;
542
580
  static constexpr stream_priorities_supported_t stream_priorities_supported{};
543
581
 
544
582
  // true if device supports caching globals in L1 cache, false if not
545
- using global_l1_cache_supported_t = __detail::__dev_attr<::cudaDevAttrGlobalL1CacheSupported>;
583
+ using global_l1_cache_supported_t = __dev_attr<::cudaDevAttrGlobalL1CacheSupported>;
546
584
  static constexpr global_l1_cache_supported_t global_l1_cache_supported{};
547
585
 
548
586
  // true if device supports caching locals in L1 cache, false if not
549
- using local_l1_cache_supported_t = __detail::__dev_attr<::cudaDevAttrLocalL1CacheSupported>;
587
+ using local_l1_cache_supported_t = __dev_attr<::cudaDevAttrLocalL1CacheSupported>;
550
588
  static constexpr local_l1_cache_supported_t local_l1_cache_supported{};
551
589
 
552
590
  // Maximum amount of shared memory available to a multiprocessor in bytes;
553
591
  // this amount is shared by all thread blocks simultaneously resident on a
554
592
  // multiprocessor
555
- using max_shared_memory_per_multiprocessor_t = __detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor>;
593
+ using max_shared_memory_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor>;
556
594
  static constexpr max_shared_memory_per_multiprocessor_t max_shared_memory_per_multiprocessor{};
557
595
 
558
596
  // Maximum number of 32-bit registers available to a multiprocessor; this
559
597
  // number is shared by all thread blocks simultaneously resident on a
560
598
  // multiprocessor
561
- using max_registers_per_multiprocessor_t = __detail::__dev_attr<::cudaDevAttrMaxRegistersPerMultiprocessor>;
599
+ using max_registers_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxRegistersPerMultiprocessor>;
562
600
  static constexpr max_registers_per_multiprocessor_t max_registers_per_multiprocessor{};
563
601
 
564
602
  // true if device supports allocating managed memory, false if not
565
- using managed_memory_t = __detail::__dev_attr<::cudaDevAttrManagedMemory>;
603
+ using managed_memory_t = __dev_attr<::cudaDevAttrManagedMemory>;
566
604
  static constexpr managed_memory_t managed_memory{};
567
605
 
568
606
  // true if device is on a multi-GPU board, false if not
569
- using is_multi_gpu_board_t = __detail::__dev_attr<::cudaDevAttrIsMultiGpuBoard>;
607
+ using is_multi_gpu_board_t = __dev_attr<::cudaDevAttrIsMultiGpuBoard>;
570
608
  static constexpr is_multi_gpu_board_t is_multi_gpu_board{};
571
609
 
572
610
  // Unique identifier for a group of devices on the same multi-GPU board
573
- using multi_gpu_board_group_id_t = __detail::__dev_attr<::cudaDevAttrMultiGpuBoardGroupID>;
611
+ using multi_gpu_board_group_id_t = __dev_attr<::cudaDevAttrMultiGpuBoardGroupID>;
574
612
  static constexpr multi_gpu_board_group_id_t multi_gpu_board_group_id{};
575
613
 
576
614
  // true if the link between the device and the host supports native atomic
577
615
  // operations
578
- using host_native_atomic_supported_t = __detail::__dev_attr<::cudaDevAttrHostNativeAtomicSupported>;
616
+ using host_native_atomic_supported_t = __dev_attr<::cudaDevAttrHostNativeAtomicSupported>;
579
617
  static constexpr host_native_atomic_supported_t host_native_atomic_supported{};
580
618
 
581
619
  // Ratio of single precision performance (in floating-point operations per
582
620
  // second) to double precision performance
583
- using single_to_double_precision_perf_ratio_t = __detail::__dev_attr<::cudaDevAttrSingleToDoublePrecisionPerfRatio>;
621
+ using single_to_double_precision_perf_ratio_t = __dev_attr<::cudaDevAttrSingleToDoublePrecisionPerfRatio>;
584
622
  static constexpr single_to_double_precision_perf_ratio_t single_to_double_precision_perf_ratio{};
585
623
 
586
624
  // true if the device supports coherently accessing pageable memory without
587
625
  // calling cudaHostRegister on it, and false otherwise
588
- using pageable_memory_access_t = __detail::__dev_attr<::cudaDevAttrPageableMemoryAccess>;
626
+ using pageable_memory_access_t = __dev_attr<::cudaDevAttrPageableMemoryAccess>;
589
627
  static constexpr pageable_memory_access_t pageable_memory_access{};
590
628
 
591
629
  // true if the device can coherently access managed memory concurrently with
592
630
  // the CPU, and false otherwise
593
- using concurrent_managed_access_t = __detail::__dev_attr<::cudaDevAttrConcurrentManagedAccess>;
631
+ using concurrent_managed_access_t = __dev_attr<::cudaDevAttrConcurrentManagedAccess>;
594
632
  static constexpr concurrent_managed_access_t concurrent_managed_access{};
595
633
 
596
634
  // true if the device supports Compute Preemption, false if not
597
- using compute_preemption_supported_t = __detail::__dev_attr<::cudaDevAttrComputePreemptionSupported>;
635
+ using compute_preemption_supported_t = __dev_attr<::cudaDevAttrComputePreemptionSupported>;
598
636
  static constexpr compute_preemption_supported_t compute_preemption_supported{};
599
637
 
600
638
  // true if the device can access host registered memory at the same virtual
601
639
  // address as the CPU, and false otherwise
602
- using can_use_host_pointer_for_registered_mem_t = __detail::__dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem>;
640
+ using can_use_host_pointer_for_registered_mem_t = __dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem>;
603
641
  static constexpr can_use_host_pointer_for_registered_mem_t can_use_host_pointer_for_registered_mem{};
604
642
 
605
643
  // true if the device supports launching cooperative kernels via
606
644
  // cudaLaunchCooperativeKernel, and false otherwise
607
- using cooperative_launch_t = __detail::__dev_attr<::cudaDevAttrCooperativeLaunch>;
645
+ using cooperative_launch_t = __dev_attr<::cudaDevAttrCooperativeLaunch>;
608
646
  static constexpr cooperative_launch_t cooperative_launch{};
609
647
 
610
648
  // true if the device supports flushing of outstanding remote writes, and
611
649
  // false otherwise
612
- using can_flush_remote_writes_t = __detail::__dev_attr<::cudaDevAttrCanFlushRemoteWrites>;
650
+ using can_flush_remote_writes_t = __dev_attr<::cudaDevAttrCanFlushRemoteWrites>;
613
651
  static constexpr can_flush_remote_writes_t can_flush_remote_writes{};
614
652
 
615
653
  // true if the device supports host memory registration via cudaHostRegister,
616
654
  // and false otherwise
617
- using host_register_supported_t = __detail::__dev_attr<::cudaDevAttrHostRegisterSupported>;
655
+ using host_register_supported_t = __dev_attr<::cudaDevAttrHostRegisterSupported>;
618
656
  static constexpr host_register_supported_t host_register_supported{};
619
657
 
620
658
  // true if the device accesses pageable memory via the host's page tables, and
621
659
  // false otherwise
622
- using pageable_memory_access_uses_host_page_tables_t =
623
- __detail::__dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>;
660
+ using pageable_memory_access_uses_host_page_tables_t = __dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>;
624
661
  static constexpr pageable_memory_access_uses_host_page_tables_t pageable_memory_access_uses_host_page_tables{};
625
662
 
626
663
  // true if the host can directly access managed memory on the device without
627
664
  // migration, and false otherwise
628
- using direct_managed_mem_access_from_host_t = __detail::__dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost>;
665
+ using direct_managed_mem_access_from_host_t = __dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost>;
629
666
  static constexpr direct_managed_mem_access_from_host_t direct_managed_mem_access_from_host{};
630
667
 
631
668
  // Maximum per block shared memory size on the device. This value can be opted
632
669
  // into when using dynamic_shared_memory with NonPortableSize set to true
633
- using max_shared_memory_per_block_optin_t = __detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin>;
670
+ using max_shared_memory_per_block_optin_t = __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin>;
634
671
  static constexpr max_shared_memory_per_block_optin_t max_shared_memory_per_block_optin{};
635
672
 
636
673
  // Maximum number of thread blocks that can reside on a multiprocessor
637
- using max_blocks_per_multiprocessor_t = __detail::__dev_attr<::cudaDevAttrMaxBlocksPerMultiprocessor>;
674
+ using max_blocks_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxBlocksPerMultiprocessor>;
638
675
  static constexpr max_blocks_per_multiprocessor_t max_blocks_per_multiprocessor{};
639
676
 
640
677
  // Maximum L2 persisting lines capacity setting in bytes
641
- using max_persisting_l2_cache_size_t = __detail::__dev_attr<::cudaDevAttrMaxPersistingL2CacheSize>;
678
+ using max_persisting_l2_cache_size_t = __dev_attr<::cudaDevAttrMaxPersistingL2CacheSize>;
642
679
  static constexpr max_persisting_l2_cache_size_t max_persisting_l2_cache_size{};
643
680
 
644
681
  // Maximum value of cudaAccessPolicyWindow::num_bytes
645
- using max_access_policy_window_size_t = __detail::__dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize>;
682
+ using max_access_policy_window_size_t = __dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize>;
646
683
  static constexpr max_access_policy_window_size_t max_access_policy_window_size{};
647
684
 
648
685
  // Shared memory reserved by CUDA driver per block in bytes
649
- using reserved_shared_memory_per_block_t = __detail::__dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock>;
686
+ using reserved_shared_memory_per_block_t = __dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock>;
650
687
  static constexpr reserved_shared_memory_per_block_t reserved_shared_memory_per_block{};
651
688
 
652
689
  // true if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
653
- using sparse_cuda_array_supported_t = __detail::__dev_attr<::cudaDevAttrSparseCudaArraySupported>;
690
+ using sparse_cuda_array_supported_t = __dev_attr<::cudaDevAttrSparseCudaArraySupported>;
654
691
  static constexpr sparse_cuda_array_supported_t sparse_cuda_array_supported{};
655
692
 
656
693
  // Device supports using the cudaHostRegister flag cudaHostRegisterReadOnly to
657
694
  // register memory that must be mapped as read-only to the GPU
658
- using host_register_read_only_supported_t = __detail::__dev_attr<::cudaDevAttrHostRegisterReadOnlySupported>;
695
+ using host_register_read_only_supported_t = __dev_attr<::cudaDevAttrHostRegisterReadOnlySupported>;
659
696
  static constexpr host_register_read_only_supported_t host_register_read_only_supported{};
660
697
 
661
698
  // true if the device supports using the cudaMallocAsync and cudaMemPool
662
699
  // family of APIs, and false otherwise
663
- using memory_pools_supported_t = __detail::__dev_attr<::cudaDevAttrMemoryPoolsSupported>;
700
+ using memory_pools_supported_t = __dev_attr<::cudaDevAttrMemoryPoolsSupported>;
664
701
  static constexpr memory_pools_supported_t memory_pools_supported{};
665
702
 
666
703
  // true if the device supports GPUDirect RDMA APIs, and false otherwise
667
- using gpu_direct_rdma_supported_t = __detail::__dev_attr<::cudaDevAttrGPUDirectRDMASupported>;
704
+ using gpu_direct_rdma_supported_t = __dev_attr<::cudaDevAttrGPUDirectRDMASupported>;
668
705
  static constexpr gpu_direct_rdma_supported_t gpu_direct_rdma_supported{};
669
706
 
670
707
  // bitmask to be interpreted according to the
671
708
  // cudaFlushGPUDirectRDMAWritesOptions enum
672
- using gpu_direct_rdma_flush_writes_options_t = __detail::__dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions>;
709
+ using gpu_direct_rdma_flush_writes_options_t = __dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions>;
673
710
  static constexpr gpu_direct_rdma_flush_writes_options_t gpu_direct_rdma_flush_writes_options{};
674
711
 
675
712
  // see the cudaGPUDirectRDMAWritesOrdering enum for numerical values
676
- using gpu_direct_rdma_writes_ordering_t = __detail::__dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering>;
713
+ using gpu_direct_rdma_writes_ordering_t = __dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering>;
677
714
  static constexpr gpu_direct_rdma_writes_ordering_t gpu_direct_rdma_writes_ordering{};
678
715
 
679
716
  // Bitmask of handle types supported with mempool based IPC
680
- using memory_pool_supported_handle_types_t = __detail::__dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes>;
717
+ using memory_pool_supported_handle_types_t = __dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes>;
681
718
  static constexpr memory_pool_supported_handle_types_t memory_pool_supported_handle_types{};
682
719
 
683
720
  // true if the device supports deferred mapping CUDA arrays and CUDA mipmapped
684
721
  // arrays.
685
- using deferred_mapping_cuda_array_supported_t = __detail::__dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported>;
722
+ using deferred_mapping_cuda_array_supported_t = __dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported>;
686
723
  static constexpr deferred_mapping_cuda_array_supported_t deferred_mapping_cuda_array_supported{};
687
724
 
688
725
  // true if the device supports IPC Events, false otherwise.
689
- using ipc_event_support_t = __detail::__dev_attr<::cudaDevAttrIpcEventSupport>;
726
+ using ipc_event_support_t = __dev_attr<::cudaDevAttrIpcEventSupport>;
690
727
  static constexpr ipc_event_support_t ipc_event_support{};
691
728
 
692
729
  # if _CCCL_CTK_AT_LEAST(12, 2)
693
730
  // NUMA configuration of a device: value is of type cudaDeviceNumaConfig enum
694
- using numa_config_t = __detail::__dev_attr<::cudaDevAttrNumaConfig>;
731
+ using numa_config_t = __dev_attr<::cudaDevAttrNumaConfig>;
695
732
  static constexpr numa_config_t numa_config{};
696
733
 
697
734
  // NUMA node ID of the GPU memory
698
- using numa_id_t = __detail::__dev_attr<::cudaDevAttrNumaId>;
735
+ using numa_id_t = __dev_attr<::cudaDevAttrNumaId>;
699
736
  static constexpr numa_id_t numa_id{};
700
737
  # endif // _CCCL_CTK_AT_LEAST(12, 2)
701
738
 
@@ -703,15 +740,29 @@ static constexpr numa_id_t numa_id{};
703
740
  // capability in a single query
704
741
  struct compute_capability_t
705
742
  {
706
- [[nodiscard]] int operator()(device_ref __dev_id) const
743
+ using type = ::cuda::compute_capability;
744
+
745
+ [[nodiscard]] _CCCL_HOST_API type operator()(device_ref __dev_id) const
707
746
  {
708
- return 10 * ::cuda::device_attributes::compute_capability_major(__dev_id)
709
- + ::cuda::device_attributes::compute_capability_minor(__dev_id);
747
+ return type{::cuda::device_attributes::compute_capability_major(__dev_id),
748
+ ::cuda::device_attributes::compute_capability_minor(__dev_id)};
710
749
  }
711
750
  };
712
751
  static constexpr compute_capability_t compute_capability{};
713
752
  } // namespace device_attributes
714
753
 
754
+ //! @brief For a given attribute, type of the attribute value.
755
+ //!
756
+ //! @par Example
757
+ //! @code
758
+ //! using threads_per_block_t = device::attr_result_t<device_attributes::max_threads_per_block>;
759
+ //! static_assert(std::is_same_v<threads_per_block_t, int>);
760
+ //! @endcode
761
+ //!
762
+ //! @sa device_attributes
763
+ template <::cudaDeviceAttr _Attr>
764
+ using device_attribute_result_t = typename __dev_attr<_Attr>::type;
765
+
715
766
  _CCCL_END_NAMESPACE_CUDA
716
767
 
717
768
  # include <cuda/std/__cccl/epilogue.h>