cuda-cccl 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl

This diff compares the contents of two publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. (The original page linked to more details here; the link target was not preserved in this extract.)

Files changed (185)
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -21,118 +21,93 @@
21
21
  # pragma system_header
22
22
  #endif // no system header
23
23
 
24
- #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
24
+ #if _CCCL_HAS_CTK()
25
25
 
26
- # include <cuda/__device/attributes.h>
26
+ # include <cuda/__device/arch_id.h>
27
+ # include <cuda/__device/compute_capability.h>
27
28
  # include <cuda/__fwd/devices.h>
28
29
  # include <cuda/std/__exception/cuda_error.h>
30
+ # include <cuda/std/__type_traits/always_false.h>
31
+ # include <cuda/std/cstdint>
29
32
  # include <cuda/std/limits>
30
33
 
31
34
  # include <cuda/std/__cccl/prologue.h>
32
35
 
33
36
  _CCCL_BEGIN_NAMESPACE_CUDA
34
- namespace arch
35
- {
36
-
37
- inline constexpr int __arch_specific_id_multiplier = 100000;
38
-
39
- // @brief Architecture identifier
40
- // This type identifies an architecture. It has more possible entries than just numeric values of the compute
41
- // capability. For example, sm_90 and sm_90a have the same compute capability, but the identifier is different.
42
- enum class id : int
43
- {
44
- sm_60 = 60,
45
- sm_61 = 61,
46
- sm_70 = 70,
47
- sm_75 = 75,
48
- sm_80 = 80,
49
- sm_86 = 86,
50
- sm_89 = 89,
51
- sm_90 = 90,
52
- sm_100 = 100,
53
- sm_103 = 103,
54
- sm_110 = 110,
55
- sm_120 = 120,
56
- sm_90a = 90 * __arch_specific_id_multiplier,
57
- sm_100a = 100 * __arch_specific_id_multiplier,
58
- sm_103a = 103 * __arch_specific_id_multiplier,
59
- sm_110a = 110 * __arch_specific_id_multiplier,
60
- sm_120a = 120 * __arch_specific_id_multiplier,
61
- };
62
37
 
63
38
  //! @brief Architecture traits
64
39
  //! This type contains information about an architecture that is constant across devices of that architecture.
65
- struct traits_t
40
+ struct arch_traits_t
66
41
  {
67
42
  // Maximum number of threads per block
68
- int max_threads_per_block = 1024;
43
+ int max_threads_per_block;
69
44
 
70
45
  // Maximum x-dimension of a block
71
- int max_block_dim_x = 1024;
46
+ int max_block_dim_x;
72
47
 
73
48
  // Maximum y-dimension of a block
74
- int max_block_dim_y = 1024;
49
+ int max_block_dim_y;
75
50
 
76
51
  // Maximum z-dimension of a block
77
- int max_block_dim_z = 64;
52
+ int max_block_dim_z;
78
53
 
79
54
  // Maximum x-dimension of a grid
80
- int max_grid_dim_x = ::cuda::std::numeric_limits<int32_t>::max();
55
+ int max_grid_dim_x;
81
56
 
82
57
  // Maximum y-dimension of a grid
83
- int max_grid_dim_y = 64 * 1024 - 1;
58
+ int max_grid_dim_y;
84
59
 
85
60
  // Maximum z-dimension of a grid
86
- int max_grid_dim_z = 64 * 1024 - 1;
61
+ int max_grid_dim_z;
87
62
 
88
63
  // Maximum amount of shared memory available to a thread block in bytes
89
- ::cuda::std::size_t max_shared_memory_per_block = 48 * 1024;
64
+ ::cuda::std::size_t max_shared_memory_per_block;
90
65
 
91
66
  // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
92
- ::cuda::std::size_t total_constant_memory = 64 * 1024;
67
+ ::cuda::std::size_t total_constant_memory;
93
68
 
94
69
  // Warp size in threads
95
- int warp_size = 32;
70
+ int warp_size;
96
71
 
97
72
  // Maximum number of concurrent grids on the device
98
- int max_resident_grids = 128;
73
+ int max_resident_grids;
99
74
 
100
75
  // true if the device can concurrently copy memory between host and device
101
76
  // while executing a kernel, or false if not
102
- bool gpu_overlap = true;
77
+ bool gpu_overlap;
103
78
 
104
79
  // true if the device can map host memory into CUDA address space
105
- bool can_map_host_memory = true;
80
+ bool can_map_host_memory;
106
81
 
107
82
  // true if the device supports executing multiple kernels within the same
108
83
  // context simultaneously, or false if not. It is not guaranteed that multiple
109
84
  // kernels will be resident on the device concurrently so this feature should
110
85
  // not be relied upon for correctness.
111
- bool concurrent_kernels = true;
86
+ bool concurrent_kernels;
112
87
 
113
88
  // true if the device supports stream priorities, or false if not
114
- bool stream_priorities_supported = true;
89
+ bool stream_priorities_supported;
115
90
 
116
91
  // true if device supports caching globals in L1 cache, false if not
117
- bool global_l1_cache_supported = true;
92
+ bool global_l1_cache_supported;
118
93
 
119
94
  // true if device supports caching locals in L1 cache, false if not
120
- bool local_l1_cache_supported = true;
95
+ bool local_l1_cache_supported;
121
96
 
122
97
  // TODO: We might want to have these per-arch
123
98
  // Maximum number of 32-bit registers available to a thread block
124
- int max_registers_per_block = 64 * 1024;
99
+ int max_registers_per_block;
125
100
 
126
101
  // Maximum number of 32-bit registers available to a multiprocessor; this
127
102
  // number is shared by all thread blocks simultaneously resident on a
128
103
  // multiprocessor
129
- int max_registers_per_multiprocessor = 64 * 1024;
104
+ int max_registers_per_multiprocessor;
130
105
 
131
106
  // Maximum number of 32-bit registers available to a thread
132
- int max_registers_per_thread = 255;
107
+ int max_registers_per_thread;
133
108
 
134
109
  // Identifier for the architecture
135
- id arch_id;
110
+ ::cuda::arch_id arch_id;
136
111
 
137
112
  // Major compute capability version number
138
113
  int compute_capability_major;
@@ -141,7 +116,7 @@ struct traits_t
141
116
  int compute_capability_minor;
142
117
 
143
118
  // Compute capability version number in 100 * major + 10 * minor format
144
- int compute_capability;
119
+ ::cuda::compute_capability compute_capability;
145
120
 
146
121
  // Maximum amount of shared memory available to a multiprocessor in bytes;
147
122
  // this amount is shared by all thread blocks simultaneously resident on a
@@ -181,65 +156,81 @@ struct traits_t
181
156
  bool tma_supported;
182
157
  };
183
158
 
184
- // @brief Architecture traits
185
- // Template function that returns the traits for an architecture with a given id.
186
- template <id _Id>
187
- [[nodiscard]] _CCCL_API constexpr traits_t traits();
159
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t __common_arch_traits(arch_id __arch_id) noexcept
160
+ {
161
+ const compute_capability __cc{__arch_id};
162
+
163
+ arch_traits_t __traits{};
164
+ __traits.max_threads_per_block = 1024;
165
+ __traits.max_block_dim_x = 1024;
166
+ __traits.max_block_dim_y = 1024;
167
+ __traits.max_block_dim_z = 64;
168
+ __traits.max_grid_dim_x = ::cuda::std::numeric_limits<::cuda::std::int32_t>::max();
169
+ __traits.max_grid_dim_y = 64 * 1024 - 1;
170
+ __traits.max_grid_dim_z = 64 * 1024 - 1;
171
+ __traits.max_shared_memory_per_block = 48 * 1024;
172
+ __traits.total_constant_memory = 64 * 1024;
173
+ __traits.warp_size = 32;
174
+ __traits.max_resident_grids = 128;
175
+ __traits.gpu_overlap = true;
176
+ __traits.can_map_host_memory = true;
177
+ __traits.concurrent_kernels = true;
178
+ __traits.stream_priorities_supported = true;
179
+ __traits.global_l1_cache_supported = true;
180
+ __traits.local_l1_cache_supported = true;
181
+ __traits.max_registers_per_block = 64 * 1024;
182
+ __traits.max_registers_per_multiprocessor = 64 * 1024;
183
+ __traits.max_registers_per_thread = 255;
184
+ __traits.arch_id = __arch_id;
185
+ __traits.compute_capability_major = __cc.major();
186
+ __traits.compute_capability_minor = __cc.minor();
187
+ __traits.compute_capability = __cc;
188
+ // __traits.max_shared_memory_per_multiprocessor; // set up individually
189
+ // __traits.max_blocks_per_multiprocessor; // set up individually
190
+ // __traits.max_threads_per_multiprocessor; // set up individually
191
+ // __traits.max_warps_per_multiprocessor; // set up individually
192
+ __traits.reserved_shared_memory_per_block = (__cc >= compute_capability{80}) ? 1024 : 0;
193
+ // __traits.max_shared_memory_per_block_optin; // set up individually
194
+ __traits.cluster_supported = (__cc >= compute_capability{90});
195
+ __traits.redux_intrinisic = (__cc >= compute_capability{80});
196
+ __traits.elect_intrinsic = (__cc >= compute_capability{90});
197
+ __traits.cp_async_supported = (__cc >= compute_capability{80});
198
+ __traits.tma_supported = (__cc >= compute_capability{90});
199
+ return __traits;
200
+ }
201
+
202
+ //! @brief Gets the architecture traits for the given architecture id \c _Id.
203
+ template <arch_id _Id>
204
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits() noexcept;
188
205
 
189
206
  template <>
190
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_60>()
207
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_60>() noexcept
191
208
  {
192
- traits_t __traits{};
193
- __traits.arch_id = id::sm_60;
194
- __traits.compute_capability_major = 6;
195
- __traits.compute_capability_minor = 0;
196
- __traits.compute_capability = 60;
209
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_60);
197
210
  __traits.max_shared_memory_per_multiprocessor = 64 * 1024;
198
211
  __traits.max_blocks_per_multiprocessor = 32;
199
212
  __traits.max_threads_per_multiprocessor = 2048;
200
213
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
201
- __traits.reserved_shared_memory_per_block = 0;
202
214
  __traits.max_shared_memory_per_block_optin = 48 * 1024;
203
-
204
- __traits.cluster_supported = false;
205
- __traits.redux_intrinisic = false;
206
- __traits.elect_intrinsic = false;
207
- __traits.cp_async_supported = false;
208
- __traits.tma_supported = false;
209
215
  return __traits;
210
216
  };
211
217
 
212
218
  template <>
213
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_61>()
219
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_61>() noexcept
214
220
  {
215
- traits_t __traits{};
216
- __traits.arch_id = id::sm_61;
217
- __traits.compute_capability_major = 6;
218
- __traits.compute_capability_minor = 1;
219
- __traits.compute_capability = 61;
221
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_61);
220
222
  __traits.max_shared_memory_per_multiprocessor = 96 * 1024;
221
223
  __traits.max_blocks_per_multiprocessor = 32;
222
224
  __traits.max_threads_per_multiprocessor = 2048;
223
225
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
224
- __traits.reserved_shared_memory_per_block = 0;
225
226
  __traits.max_shared_memory_per_block_optin = 48 * 1024;
226
-
227
- __traits.cluster_supported = false;
228
- __traits.redux_intrinisic = false;
229
- __traits.elect_intrinsic = false;
230
- __traits.cp_async_supported = false;
231
- __traits.tma_supported = false;
232
227
  return __traits;
233
228
  };
234
229
 
235
230
  template <>
236
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_70>()
231
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_70>() noexcept
237
232
  {
238
- traits_t __traits{};
239
- __traits.arch_id = id::sm_70;
240
- __traits.compute_capability_major = 7;
241
- __traits.compute_capability_minor = 0;
242
- __traits.compute_capability = 70;
233
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_70);
243
234
  __traits.max_shared_memory_per_multiprocessor = 96 * 1024;
244
235
  __traits.max_blocks_per_multiprocessor = 32;
245
236
  __traits.max_threads_per_multiprocessor = 2048;
@@ -247,369 +238,300 @@ template <>
247
238
  __traits.reserved_shared_memory_per_block = 0;
248
239
  __traits.max_shared_memory_per_block_optin =
249
240
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
250
-
251
- __traits.cluster_supported = false;
252
- __traits.redux_intrinisic = false;
253
- __traits.elect_intrinsic = false;
254
- __traits.cp_async_supported = false;
255
- __traits.tma_supported = false;
256
241
  return __traits;
257
242
  };
258
243
 
259
244
  template <>
260
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_75>()
245
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_75>() noexcept
261
246
  {
262
- traits_t __traits{};
263
- __traits.arch_id = id::sm_75;
264
- __traits.compute_capability_major = 7;
265
- __traits.compute_capability_minor = 5;
266
- __traits.compute_capability = 75;
247
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_75);
267
248
  __traits.max_shared_memory_per_multiprocessor = 64 * 1024;
268
249
  __traits.max_blocks_per_multiprocessor = 16;
269
250
  __traits.max_threads_per_multiprocessor = 1024;
270
251
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
271
- __traits.reserved_shared_memory_per_block = 0;
272
252
  __traits.max_shared_memory_per_block_optin =
273
253
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
274
-
275
- __traits.cluster_supported = false;
276
- __traits.redux_intrinisic = false;
277
- __traits.elect_intrinsic = false;
278
- __traits.cp_async_supported = false;
279
- __traits.tma_supported = false;
280
254
  return __traits;
281
255
  };
282
256
 
283
257
  template <>
284
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_80>()
258
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_80>() noexcept
285
259
  {
286
- traits_t __traits{};
287
- __traits.arch_id = id::sm_80;
288
- __traits.compute_capability_major = 8;
289
- __traits.compute_capability_minor = 0;
290
- __traits.compute_capability = 80;
260
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_80);
291
261
  __traits.max_shared_memory_per_multiprocessor = 164 * 1024;
292
262
  __traits.max_blocks_per_multiprocessor = 32;
293
263
  __traits.max_threads_per_multiprocessor = 2048;
294
264
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
295
- __traits.reserved_shared_memory_per_block = 1024;
296
265
  __traits.max_shared_memory_per_block_optin =
297
266
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
298
-
299
- __traits.cluster_supported = false;
300
- __traits.redux_intrinisic = true;
301
- __traits.elect_intrinsic = false;
302
- __traits.cp_async_supported = true;
303
- __traits.tma_supported = false;
304
267
  return __traits;
305
268
  };
306
269
 
307
270
  template <>
308
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_86>()
271
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_86>() noexcept
309
272
  {
310
- traits_t __traits{};
311
- __traits.arch_id = id::sm_86;
312
- __traits.compute_capability_major = 8;
313
- __traits.compute_capability_minor = 6;
314
- __traits.compute_capability = 86;
273
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_86);
315
274
  __traits.max_shared_memory_per_multiprocessor = 100 * 1024;
316
275
  __traits.max_blocks_per_multiprocessor = 16;
317
276
  __traits.max_threads_per_multiprocessor = 1536;
318
277
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
319
- __traits.reserved_shared_memory_per_block = 1024;
320
278
  __traits.max_shared_memory_per_block_optin =
321
279
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
280
+ return __traits;
281
+ };
322
282
 
323
- __traits.cluster_supported = false;
324
- __traits.redux_intrinisic = true;
325
- __traits.elect_intrinsic = false;
326
- __traits.cp_async_supported = true;
327
- __traits.tma_supported = false;
283
+ template <>
284
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_87>() noexcept
285
+ {
286
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_87);
287
+ __traits.max_shared_memory_per_multiprocessor = 164 * 1024;
288
+ __traits.max_blocks_per_multiprocessor = 16;
289
+ __traits.max_threads_per_multiprocessor = 1536;
290
+ __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
291
+ __traits.max_shared_memory_per_block_optin =
292
+ __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
328
293
  return __traits;
329
294
  };
330
295
 
331
296
  template <>
332
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_89>()
297
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_88>() noexcept
333
298
  {
334
- traits_t __traits{};
335
- __traits.arch_id = id::sm_89;
336
- __traits.compute_capability_major = 8;
337
- __traits.compute_capability_minor = 9;
338
- __traits.compute_capability = 89;
299
+ auto __traits = ::cuda::arch_traits<arch_id::sm_86>();
300
+ __traits.arch_id = arch_id::sm_88;
301
+ __traits.compute_capability_major = 8;
302
+ __traits.compute_capability_minor = 8;
303
+ __traits.compute_capability = compute_capability{88};
304
+ return __traits;
305
+ };
306
+
307
+ template <>
308
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_89>() noexcept
309
+ {
310
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_89);
339
311
  __traits.max_shared_memory_per_multiprocessor = 100 * 1024;
340
312
  __traits.max_blocks_per_multiprocessor = 24;
341
313
  __traits.max_threads_per_multiprocessor = 1536;
342
314
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
343
- __traits.reserved_shared_memory_per_block = 1024;
344
315
  __traits.max_shared_memory_per_block_optin =
345
316
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
346
-
347
- __traits.cluster_supported = false;
348
- __traits.redux_intrinisic = true;
349
- __traits.elect_intrinsic = false;
350
- __traits.cp_async_supported = true;
351
- __traits.tma_supported = false;
352
317
  return __traits;
353
318
  };
354
319
 
355
320
  template <>
356
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_90>()
321
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_90>() noexcept
357
322
  {
358
- traits_t __traits{};
359
- __traits.arch_id = id::sm_90;
360
- __traits.compute_capability_major = 9;
361
- __traits.compute_capability_minor = 0;
362
- __traits.compute_capability = 90;
323
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_90);
363
324
  __traits.max_shared_memory_per_multiprocessor = 228 * 1024;
364
325
  __traits.max_blocks_per_multiprocessor = 32;
365
326
  __traits.max_threads_per_multiprocessor = 2048;
366
327
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
367
- __traits.reserved_shared_memory_per_block = 1024;
368
328
  __traits.max_shared_memory_per_block_optin =
369
329
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
370
-
371
- __traits.cluster_supported = true;
372
- __traits.redux_intrinisic = true;
373
- __traits.elect_intrinsic = true;
374
- __traits.cp_async_supported = true;
375
- __traits.tma_supported = true;
376
330
  return __traits;
377
331
  };
378
332
 
379
333
  // No sm_90a specific fields for now.
380
334
  template <>
381
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_90a>()
335
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_90a>() noexcept
382
336
  {
383
- return ::cuda::arch::traits<id::sm_90>();
337
+ auto __traits = ::cuda::arch_traits<arch_id::sm_90>();
338
+ __traits.arch_id = arch_id::sm_90a;
339
+ return __traits;
384
340
  };
385
341
 
386
342
  template <>
387
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_100>()
343
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_100>() noexcept
388
344
  {
389
- traits_t __traits{};
390
- __traits.arch_id = id::sm_100;
391
- __traits.compute_capability_major = 10;
392
- __traits.compute_capability_minor = 0;
393
- __traits.compute_capability = 100;
345
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_90);
394
346
  __traits.max_shared_memory_per_multiprocessor = 228 * 1024;
395
347
  __traits.max_blocks_per_multiprocessor = 32;
396
348
  __traits.max_threads_per_multiprocessor = 2048;
397
349
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
398
- __traits.reserved_shared_memory_per_block = 1024;
399
350
  __traits.max_shared_memory_per_block_optin =
400
351
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
401
-
402
- __traits.cluster_supported = true;
403
- __traits.redux_intrinisic = true;
404
- __traits.elect_intrinsic = true;
405
- __traits.cp_async_supported = true;
406
- __traits.tma_supported = true;
407
352
  return __traits;
408
353
  };
409
354
 
410
355
  template <>
411
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_100a>()
356
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_100a>() noexcept
412
357
  {
413
- return ::cuda::arch::traits<id::sm_100>();
358
+ auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
359
+ __traits.arch_id = arch_id::sm_100a;
360
+ return __traits;
414
361
  };
415
362
 
416
363
  template <>
417
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_103>()
364
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_103>() noexcept
418
365
  {
419
- traits_t __traits = ::cuda::arch::traits<id::sm_100>();
420
- __traits.arch_id = id::sm_103;
366
+ auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
367
+ __traits.arch_id = arch_id::sm_103;
421
368
  __traits.compute_capability_major = 10;
422
369
  __traits.compute_capability_minor = 3;
423
- __traits.compute_capability = 103;
370
+ __traits.compute_capability = compute_capability{103};
424
371
  return __traits;
425
372
  };
426
373
 
427
374
  template <>
428
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_103a>()
375
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_103a>() noexcept
429
376
  {
430
- return ::cuda::arch::traits<id::sm_103>();
377
+ auto __traits = ::cuda::arch_traits<arch_id::sm_103>();
378
+ __traits.arch_id = arch_id::sm_103a;
379
+ return __traits;
431
380
  };
432
381
 
433
382
  template <>
434
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_110>()
383
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_110>() noexcept
435
384
  {
436
- traits_t __traits = ::cuda::arch::traits<id::sm_100>();
437
- __traits.arch_id = id::sm_110;
438
- __traits.compute_capability_major = 11;
439
- __traits.compute_capability_minor = 0;
440
- __traits.compute_capability = 110;
385
+ auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
386
+ __traits.arch_id = arch_id::sm_110;
387
+ __traits.compute_capability_major = 11;
388
+ __traits.compute_capability_minor = 0;
389
+ __traits.compute_capability = compute_capability{110};
390
+ __traits.max_blocks_per_multiprocessor = 24;
391
+ __traits.max_threads_per_multiprocessor = 1536;
392
+ __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
441
393
  return __traits;
442
394
  };
443
395
 
444
396
  template <>
445
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_110a>()
397
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_110a>() noexcept
446
398
  {
447
- return ::cuda::arch::traits<id::sm_110>();
399
+ auto __traits = ::cuda::arch_traits<arch_id::sm_110>();
400
+ __traits.arch_id = arch_id::sm_110a;
401
+ return __traits;
448
402
  };
449
403
 
450
404
  template <>
451
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_120>()
405
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_120>() noexcept
452
406
  {
453
- traits_t __traits{};
454
- __traits.arch_id = id::sm_120;
455
- __traits.compute_capability_major = 12;
456
- __traits.compute_capability_minor = 0;
457
- __traits.compute_capability = 120;
407
+ auto __traits = ::cuda::__common_arch_traits(arch_id::sm_120);
458
408
  __traits.max_shared_memory_per_multiprocessor = 100 * 1024;
459
- __traits.max_blocks_per_multiprocessor = 32;
409
+ __traits.max_blocks_per_multiprocessor = 24;
460
410
  __traits.max_threads_per_multiprocessor = 1536;
461
411
  __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
462
- __traits.reserved_shared_memory_per_block = 1024;
463
412
  __traits.max_shared_memory_per_block_optin =
464
413
  __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
414
+ return __traits;
415
+ };
465
416
 
466
- __traits.cluster_supported = true;
467
- __traits.redux_intrinisic = true;
468
- __traits.elect_intrinsic = true;
469
- __traits.cp_async_supported = true;
470
- __traits.tma_supported = true;
417
+ template <>
418
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_120a>() noexcept
419
+ {
420
+ auto __traits = ::cuda::arch_traits<arch_id::sm_120>();
421
+ __traits.arch_id = arch_id::sm_120a;
471
422
  return __traits;
472
423
  };
473
424
 
474
425
  template <>
475
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_120a>()
426
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_121>() noexcept
476
427
  {
477
- return ::cuda::arch::traits<id::sm_120>();
428
+ auto __traits = ::cuda::arch_traits<arch_id::sm_120>();
429
+ __traits.arch_id = arch_id::sm_121;
430
+ __traits.compute_capability_major = 12;
431
+ __traits.compute_capability_minor = 1;
432
+ __traits.compute_capability = compute_capability{121};
433
+ return __traits;
478
434
  };
479
435
 
480
- inline constexpr int __highest_known_arch = 120;
436
+ template <>
437
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_121a>() noexcept
438
+ {
439
+ auto __traits = ::cuda::arch_traits<arch_id::sm_121>();
440
+ __traits.arch_id = arch_id::sm_121a;
441
+ return __traits;
442
+ };
481
443
 
482
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits_for_id(id __id)
444
+ //! @brief Gets the architecture traits for the given architecture id \c __id.
445
+ //!
446
+ //! @throws \c cuda::cuda_error if the \c __id is not a known architecture.
447
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits_for(arch_id __id)
483
448
  {
484
449
  switch (__id)
485
450
  {
486
- case id::sm_60:
487
- return ::cuda::arch::traits<id::sm_60>();
488
- case id::sm_61:
489
- return ::cuda::arch::traits<id::sm_61>();
490
- case id::sm_70:
491
- return ::cuda::arch::traits<id::sm_70>();
492
- case id::sm_75:
493
- return ::cuda::arch::traits<id::sm_75>();
494
- case id::sm_80:
495
- return ::cuda::arch::traits<id::sm_80>();
496
- case id::sm_86:
497
- return ::cuda::arch::traits<id::sm_86>();
498
- case id::sm_89:
499
- return ::cuda::arch::traits<id::sm_89>();
500
- case id::sm_90:
501
- return ::cuda::arch::traits<id::sm_90>();
502
- case id::sm_90a:
503
- return ::cuda::arch::traits<id::sm_90a>();
504
- case id::sm_100:
505
- return ::cuda::arch::traits<id::sm_100>();
506
- case id::sm_100a:
507
- return ::cuda::arch::traits<id::sm_100a>();
508
- case id::sm_103:
509
- return ::cuda::arch::traits<id::sm_103>();
510
- case id::sm_103a:
511
- return ::cuda::arch::traits<id::sm_103a>();
512
- case id::sm_110:
513
- return ::cuda::arch::traits<id::sm_110>();
514
- case id::sm_110a:
515
- return ::cuda::arch::traits<id::sm_110a>();
516
- case id::sm_120:
517
- return ::cuda::arch::traits<id::sm_120>();
518
- case id::sm_120a:
519
- return ::cuda::arch::traits<id::sm_120a>();
451
+ case arch_id::sm_60:
452
+ return ::cuda::arch_traits<arch_id::sm_60>();
453
+ case arch_id::sm_61:
454
+ return ::cuda::arch_traits<arch_id::sm_61>();
455
+ case arch_id::sm_70:
456
+ return ::cuda::arch_traits<arch_id::sm_70>();
457
+ case arch_id::sm_75:
458
+ return ::cuda::arch_traits<arch_id::sm_75>();
459
+ case arch_id::sm_80:
460
+ return ::cuda::arch_traits<arch_id::sm_80>();
461
+ case arch_id::sm_86:
462
+ return ::cuda::arch_traits<arch_id::sm_86>();
463
+ case arch_id::sm_87:
464
+ return ::cuda::arch_traits<arch_id::sm_87>();
465
+ case arch_id::sm_88:
466
+ return ::cuda::arch_traits<arch_id::sm_88>();
467
+ case arch_id::sm_89:
468
+ return ::cuda::arch_traits<arch_id::sm_89>();
469
+ case arch_id::sm_90:
470
+ return ::cuda::arch_traits<arch_id::sm_90>();
471
+ case arch_id::sm_90a:
472
+ return ::cuda::arch_traits<arch_id::sm_90a>();
473
+ case arch_id::sm_100:
474
+ return ::cuda::arch_traits<arch_id::sm_100>();
475
+ case arch_id::sm_100a:
476
+ return ::cuda::arch_traits<arch_id::sm_100a>();
477
+ case arch_id::sm_103:
478
+ return ::cuda::arch_traits<arch_id::sm_103>();
479
+ case arch_id::sm_103a:
480
+ return ::cuda::arch_traits<arch_id::sm_103a>();
481
+ case arch_id::sm_110:
482
+ return ::cuda::arch_traits<arch_id::sm_110>();
483
+ case arch_id::sm_110a:
484
+ return ::cuda::arch_traits<arch_id::sm_110a>();
485
+ case arch_id::sm_120:
486
+ return ::cuda::arch_traits<arch_id::sm_120>();
487
+ case arch_id::sm_120a:
488
+ return ::cuda::arch_traits<arch_id::sm_120a>();
489
+ case arch_id::sm_121:
490
+ return ::cuda::arch_traits<arch_id::sm_121>();
491
+ case arch_id::sm_121a:
492
+ return ::cuda::arch_traits<arch_id::sm_121a>();
520
493
  default:
521
494
  ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture");
522
495
  break;
523
496
  }
524
497
  }
525
498
 
526
- [[nodiscard]] _CCCL_API inline constexpr id id_for_compute_capability(int compute_capability)
499
+ //! @brief Gets the architecture traits for the given compute capability \c __cc.
500
+ //!
501
+ //! @throws \c cuda::cuda_error if the \c __cc doesn't have a corresponding architecture id.
502
+ [[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits_for(compute_capability __cc)
527
503
  {
528
- if (compute_capability < 60 || compute_capability > __highest_known_arch)
529
- {
530
- ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
531
- }
532
- return static_cast<id>(compute_capability);
504
+ return ::cuda::arch_traits_for(::cuda::to_arch_id(__cc));
533
505
  }
534
506
 
535
- [[nodiscard]] _CCCL_API inline constexpr traits_t traits_for_compute_capability(int compute_capability)
536
- {
537
- return ::cuda::arch::traits_for_id(::cuda::arch::id_for_compute_capability(compute_capability));
538
- }
507
+ _CCCL_END_NAMESPACE_CUDA
539
508
 
540
- [[nodiscard]] _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
541
- {
542
- switch (value)
543
- {
544
- case 90:
545
- return id::sm_90a;
546
- case 100:
547
- return id::sm_100a;
548
- case 103:
549
- return id::sm_103a;
550
- case 110:
551
- return id::sm_110a;
552
- case 120:
553
- return id::sm_120a;
554
- default:
555
- ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
556
- break;
557
- }
558
- }
509
+ # if _CCCL_CUDA_COMPILATION()
559
510
 
560
- //! @brief Provides architecture traits of the architecture matching __CUDA_ARCH__ macro
561
- [[nodiscard]] _CCCL_DEVICE_API inline constexpr arch::traits_t current_traits()
562
- {
563
- // fixme: this doesn't work with nvc++ -cuda
564
- # ifdef __CUDA_ARCH__
565
- # ifdef __CUDA_ARCH_SPECIFIC__
566
- return ::cuda::arch::traits_for_id(::cuda::arch::__special_id_for_compute_capability(__CUDA_ARCH_SPECIFIC__ / 10));
567
- # else
568
- return ::cuda::arch::traits_for_compute_capability(__CUDA_ARCH__ / 10);
569
- # endif // __CUDA_ARCH_SPECIFIC__
570
- # else // __CUDA_ARCH__
571
- // Should be unreachable in __device__ function
572
- return ::cuda::arch::traits_t{};
573
- # endif // __CUDA_ARCH__
574
- }
511
+ _CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
575
512
 
576
- [[nodiscard]] _CCCL_HOST_API inline constexpr arch::traits_t
577
- __arch_traits_might_be_unknown(int __device, unsigned int __compute_capability)
513
+ //! @brief Returns the \c cuda::arch_traits_t of the architecture that is currently being compiled.
514
+ //!
515
+ //! If the current architecture is not a known architecture from \c cuda::arch_id enumeration, the compilation
516
+ //! will fail.
517
+ //!
518
+ //! @note This API cannot be used in constexpr context when compiling with nvc++ in CUDA mode.
519
+ template <class _Dummy = void>
520
+ [[nodiscard]] _CCCL_DEVICE_API _CCCL_TARGET_CONSTEXPR ::cuda::arch_traits_t current_arch_traits() noexcept
578
521
  {
579
- if (__compute_capability <= arch::__highest_known_arch)
580
- {
581
- return ::cuda::arch::traits_for_compute_capability(__compute_capability);
582
- }
583
- else
584
- {
585
- // If the architecture is unknown, we need to craft the arch_traits from attributes
586
- arch::traits_t __traits{};
587
- __traits.compute_capability_major = __compute_capability / 10;
588
- __traits.compute_capability_minor = __compute_capability % 10;
589
- __traits.compute_capability = __compute_capability;
590
- __traits.max_shared_memory_per_multiprocessor =
591
- ::cuda::device_attributes::max_shared_memory_per_multiprocessor(__device);
592
- __traits.max_blocks_per_multiprocessor = ::cuda::device_attributes::max_blocks_per_multiprocessor(__device);
593
- __traits.max_threads_per_multiprocessor = ::cuda::device_attributes::max_threads_per_multiprocessor(__device);
594
- __traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
595
- __traits.reserved_shared_memory_per_block = ::cuda::device_attributes::reserved_shared_memory_per_block(__device);
596
- __traits.max_shared_memory_per_block_optin =
597
- __traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
598
-
599
- __traits.cluster_supported = __compute_capability >= 90;
600
- __traits.redux_intrinisic = __compute_capability >= 80;
601
- __traits.elect_intrinsic = __compute_capability >= 90;
602
- __traits.cp_async_supported = __compute_capability >= 80;
603
- __traits.tma_supported = __compute_capability >= 90;
604
- return __traits;
605
- }
522
+ # if _CCCL_DEVICE_COMPILATION()
523
+ return ::cuda::arch_traits_for(::cuda::device::current_arch_id<_Dummy>());
524
+ # else // ^^^ _CCCL_DEVICE_COMPILATION() ^^^ / vvv !_CCCL_DEVICE_COMPILATION() vvv
525
+ return {};
526
+ # endif // ^^^ !_CCCL_DEVICE_COMPILATION() ^^^
606
527
  }
607
- } // namespace arch
608
528
 
609
- _CCCL_END_NAMESPACE_CUDA
529
+ _CCCL_END_NAMESPACE_CUDA_DEVICE
530
+
531
+ # endif // _CCCL_CUDA_COMPILATION
610
532
 
611
533
  # include <cuda/std/__cccl/epilogue.h>
612
534
 
613
- #endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
535
+ #endif // _CCCL_HAS_CTK()
614
536
 
615
537
  #endif // _CUDA___DEVICE_ARCH_TRAITS_H