cuda-cccl 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
- cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
- cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
- cuda/cccl/headers/include/cuda/__event/event.h +26 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +3 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +146 -11
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/compute/__init__.py +2 -0
- cuda/compute/_bindings.pyi +43 -1
- cuda/compute/_bindings_impl.pyx +156 -7
- cuda/compute/algorithms/_scan.py +108 -36
- cuda/compute/algorithms/_transform.py +32 -11
- cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/iterators/__init__.py +2 -0
- cuda/compute/iterators/_factories.py +28 -0
- cuda/compute/iterators/_iterators.py +206 -1
- cuda/compute/numba_utils.py +2 -2
- cuda/compute/typing.py +2 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -21,118 +21,93 @@
|
|
|
21
21
|
# pragma system_header
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
|
-
#if _CCCL_HAS_CTK()
|
|
24
|
+
#if _CCCL_HAS_CTK()
|
|
25
25
|
|
|
26
|
-
# include <cuda/__device/
|
|
26
|
+
# include <cuda/__device/arch_id.h>
|
|
27
|
+
# include <cuda/__device/compute_capability.h>
|
|
27
28
|
# include <cuda/__fwd/devices.h>
|
|
28
29
|
# include <cuda/std/__exception/cuda_error.h>
|
|
30
|
+
# include <cuda/std/__type_traits/always_false.h>
|
|
31
|
+
# include <cuda/std/cstdint>
|
|
29
32
|
# include <cuda/std/limits>
|
|
30
33
|
|
|
31
34
|
# include <cuda/std/__cccl/prologue.h>
|
|
32
35
|
|
|
33
36
|
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
34
|
-
namespace arch
|
|
35
|
-
{
|
|
36
|
-
|
|
37
|
-
inline constexpr int __arch_specific_id_multiplier = 100000;
|
|
38
|
-
|
|
39
|
-
// @brief Architecture identifier
|
|
40
|
-
// This type identifies an architecture. It has more possible entries than just numeric values of the compute
|
|
41
|
-
// capability. For example, sm_90 and sm_90a have the same compute capability, but the identifier is different.
|
|
42
|
-
enum class id : int
|
|
43
|
-
{
|
|
44
|
-
sm_60 = 60,
|
|
45
|
-
sm_61 = 61,
|
|
46
|
-
sm_70 = 70,
|
|
47
|
-
sm_75 = 75,
|
|
48
|
-
sm_80 = 80,
|
|
49
|
-
sm_86 = 86,
|
|
50
|
-
sm_89 = 89,
|
|
51
|
-
sm_90 = 90,
|
|
52
|
-
sm_100 = 100,
|
|
53
|
-
sm_103 = 103,
|
|
54
|
-
sm_110 = 110,
|
|
55
|
-
sm_120 = 120,
|
|
56
|
-
sm_90a = 90 * __arch_specific_id_multiplier,
|
|
57
|
-
sm_100a = 100 * __arch_specific_id_multiplier,
|
|
58
|
-
sm_103a = 103 * __arch_specific_id_multiplier,
|
|
59
|
-
sm_110a = 110 * __arch_specific_id_multiplier,
|
|
60
|
-
sm_120a = 120 * __arch_specific_id_multiplier,
|
|
61
|
-
};
|
|
62
37
|
|
|
63
38
|
//! @brief Architecture traits
|
|
64
39
|
//! This type contains information about an architecture that is constant across devices of that architecture.
|
|
65
|
-
struct
|
|
40
|
+
struct arch_traits_t
|
|
66
41
|
{
|
|
67
42
|
// Maximum number of threads per block
|
|
68
|
-
int max_threads_per_block
|
|
43
|
+
int max_threads_per_block;
|
|
69
44
|
|
|
70
45
|
// Maximum x-dimension of a block
|
|
71
|
-
int max_block_dim_x
|
|
46
|
+
int max_block_dim_x;
|
|
72
47
|
|
|
73
48
|
// Maximum y-dimension of a block
|
|
74
|
-
int max_block_dim_y
|
|
49
|
+
int max_block_dim_y;
|
|
75
50
|
|
|
76
51
|
// Maximum z-dimension of a block
|
|
77
|
-
int max_block_dim_z
|
|
52
|
+
int max_block_dim_z;
|
|
78
53
|
|
|
79
54
|
// Maximum x-dimension of a grid
|
|
80
|
-
int max_grid_dim_x
|
|
55
|
+
int max_grid_dim_x;
|
|
81
56
|
|
|
82
57
|
// Maximum y-dimension of a grid
|
|
83
|
-
int max_grid_dim_y
|
|
58
|
+
int max_grid_dim_y;
|
|
84
59
|
|
|
85
60
|
// Maximum z-dimension of a grid
|
|
86
|
-
int max_grid_dim_z
|
|
61
|
+
int max_grid_dim_z;
|
|
87
62
|
|
|
88
63
|
// Maximum amount of shared memory available to a thread block in bytes
|
|
89
|
-
::cuda::std::size_t max_shared_memory_per_block
|
|
64
|
+
::cuda::std::size_t max_shared_memory_per_block;
|
|
90
65
|
|
|
91
66
|
// Memory available on device for __constant__ variables in a CUDA C kernel in bytes
|
|
92
|
-
::cuda::std::size_t total_constant_memory
|
|
67
|
+
::cuda::std::size_t total_constant_memory;
|
|
93
68
|
|
|
94
69
|
// Warp size in threads
|
|
95
|
-
int warp_size
|
|
70
|
+
int warp_size;
|
|
96
71
|
|
|
97
72
|
// Maximum number of concurrent grids on the device
|
|
98
|
-
int max_resident_grids
|
|
73
|
+
int max_resident_grids;
|
|
99
74
|
|
|
100
75
|
// true if the device can concurrently copy memory between host and device
|
|
101
76
|
// while executing a kernel, or false if not
|
|
102
|
-
bool gpu_overlap
|
|
77
|
+
bool gpu_overlap;
|
|
103
78
|
|
|
104
79
|
// true if the device can map host memory into CUDA address space
|
|
105
|
-
bool can_map_host_memory
|
|
80
|
+
bool can_map_host_memory;
|
|
106
81
|
|
|
107
82
|
// true if the device supports executing multiple kernels within the same
|
|
108
83
|
// context simultaneously, or false if not. It is not guaranteed that multiple
|
|
109
84
|
// kernels will be resident on the device concurrently so this feature should
|
|
110
85
|
// not be relied upon for correctness.
|
|
111
|
-
bool concurrent_kernels
|
|
86
|
+
bool concurrent_kernels;
|
|
112
87
|
|
|
113
88
|
// true if the device supports stream priorities, or false if not
|
|
114
|
-
bool stream_priorities_supported
|
|
89
|
+
bool stream_priorities_supported;
|
|
115
90
|
|
|
116
91
|
// true if device supports caching globals in L1 cache, false if not
|
|
117
|
-
bool global_l1_cache_supported
|
|
92
|
+
bool global_l1_cache_supported;
|
|
118
93
|
|
|
119
94
|
// true if device supports caching locals in L1 cache, false if not
|
|
120
|
-
bool local_l1_cache_supported
|
|
95
|
+
bool local_l1_cache_supported;
|
|
121
96
|
|
|
122
97
|
// TODO: We might want to have these per-arch
|
|
123
98
|
// Maximum number of 32-bit registers available to a thread block
|
|
124
|
-
int max_registers_per_block
|
|
99
|
+
int max_registers_per_block;
|
|
125
100
|
|
|
126
101
|
// Maximum number of 32-bit registers available to a multiprocessor; this
|
|
127
102
|
// number is shared by all thread blocks simultaneously resident on a
|
|
128
103
|
// multiprocessor
|
|
129
|
-
int max_registers_per_multiprocessor
|
|
104
|
+
int max_registers_per_multiprocessor;
|
|
130
105
|
|
|
131
106
|
// Maximum number of 32-bit registers available to a thread
|
|
132
|
-
int max_registers_per_thread
|
|
107
|
+
int max_registers_per_thread;
|
|
133
108
|
|
|
134
109
|
// Identifier for the architecture
|
|
135
|
-
|
|
110
|
+
::cuda::arch_id arch_id;
|
|
136
111
|
|
|
137
112
|
// Major compute capability version number
|
|
138
113
|
int compute_capability_major;
|
|
@@ -141,7 +116,7 @@ struct traits_t
|
|
|
141
116
|
int compute_capability_minor;
|
|
142
117
|
|
|
143
118
|
// Compute capability version number in 10 * major + minor format (e.g. 60 for 6.0, 88 for 8.8)
|
|
144
|
-
|
|
119
|
+
::cuda::compute_capability compute_capability;
|
|
145
120
|
|
|
146
121
|
// Maximum amount of shared memory available to a multiprocessor in bytes;
|
|
147
122
|
// this amount is shared by all thread blocks simultaneously resident on a
|
|
@@ -181,65 +156,81 @@ struct traits_t
|
|
|
181
156
|
bool tma_supported;
|
|
182
157
|
};
|
|
183
158
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
159
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t __common_arch_traits(arch_id __arch_id) noexcept
|
|
160
|
+
{
|
|
161
|
+
const compute_capability __cc{__arch_id};
|
|
162
|
+
|
|
163
|
+
arch_traits_t __traits{};
|
|
164
|
+
__traits.max_threads_per_block = 1024;
|
|
165
|
+
__traits.max_block_dim_x = 1024;
|
|
166
|
+
__traits.max_block_dim_y = 1024;
|
|
167
|
+
__traits.max_block_dim_z = 64;
|
|
168
|
+
__traits.max_grid_dim_x = ::cuda::std::numeric_limits<::cuda::std::int32_t>::max();
|
|
169
|
+
__traits.max_grid_dim_y = 64 * 1024 - 1;
|
|
170
|
+
__traits.max_grid_dim_z = 64 * 1024 - 1;
|
|
171
|
+
__traits.max_shared_memory_per_block = 48 * 1024;
|
|
172
|
+
__traits.total_constant_memory = 64 * 1024;
|
|
173
|
+
__traits.warp_size = 32;
|
|
174
|
+
__traits.max_resident_grids = 128;
|
|
175
|
+
__traits.gpu_overlap = true;
|
|
176
|
+
__traits.can_map_host_memory = true;
|
|
177
|
+
__traits.concurrent_kernels = true;
|
|
178
|
+
__traits.stream_priorities_supported = true;
|
|
179
|
+
__traits.global_l1_cache_supported = true;
|
|
180
|
+
__traits.local_l1_cache_supported = true;
|
|
181
|
+
__traits.max_registers_per_block = 64 * 1024;
|
|
182
|
+
__traits.max_registers_per_multiprocessor = 64 * 1024;
|
|
183
|
+
__traits.max_registers_per_thread = 255;
|
|
184
|
+
__traits.arch_id = __arch_id;
|
|
185
|
+
__traits.compute_capability_major = __cc.major();
|
|
186
|
+
__traits.compute_capability_minor = __cc.minor();
|
|
187
|
+
__traits.compute_capability = __cc;
|
|
188
|
+
// __traits.max_shared_memory_per_multiprocessor; // set up individually
|
|
189
|
+
// __traits.max_blocks_per_multiprocessor; // set up individually
|
|
190
|
+
// __traits.max_threads_per_multiprocessor; // set up individually
|
|
191
|
+
// __traits.max_warps_per_multiprocessor; // set up individually
|
|
192
|
+
__traits.reserved_shared_memory_per_block = (__cc >= compute_capability{80}) ? 1024 : 0;
|
|
193
|
+
// __traits.max_shared_memory_per_block_optin; // set up individually
|
|
194
|
+
__traits.cluster_supported = (__cc >= compute_capability{90});
|
|
195
|
+
__traits.redux_intrinisic = (__cc >= compute_capability{80});
|
|
196
|
+
__traits.elect_intrinsic = (__cc >= compute_capability{90});
|
|
197
|
+
__traits.cp_async_supported = (__cc >= compute_capability{80});
|
|
198
|
+
__traits.tma_supported = (__cc >= compute_capability{90});
|
|
199
|
+
return __traits;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
//! @brief Gets the architecture traits for the given architecture id \c _Id.
|
|
203
|
+
template <arch_id _Id>
|
|
204
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits() noexcept;
|
|
188
205
|
|
|
189
206
|
template <>
|
|
190
|
-
[[nodiscard]] _CCCL_API
|
|
207
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_60>() noexcept
|
|
191
208
|
{
|
|
192
|
-
|
|
193
|
-
__traits.arch_id = id::sm_60;
|
|
194
|
-
__traits.compute_capability_major = 6;
|
|
195
|
-
__traits.compute_capability_minor = 0;
|
|
196
|
-
__traits.compute_capability = 60;
|
|
209
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_60);
|
|
197
210
|
__traits.max_shared_memory_per_multiprocessor = 64 * 1024;
|
|
198
211
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
199
212
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
200
213
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
201
|
-
__traits.reserved_shared_memory_per_block = 0;
|
|
202
214
|
__traits.max_shared_memory_per_block_optin = 48 * 1024;
|
|
203
|
-
|
|
204
|
-
__traits.cluster_supported = false;
|
|
205
|
-
__traits.redux_intrinisic = false;
|
|
206
|
-
__traits.elect_intrinsic = false;
|
|
207
|
-
__traits.cp_async_supported = false;
|
|
208
|
-
__traits.tma_supported = false;
|
|
209
215
|
return __traits;
|
|
210
216
|
};
|
|
211
217
|
|
|
212
218
|
template <>
|
|
213
|
-
[[nodiscard]] _CCCL_API
|
|
219
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_61>() noexcept
|
|
214
220
|
{
|
|
215
|
-
|
|
216
|
-
__traits.arch_id = id::sm_61;
|
|
217
|
-
__traits.compute_capability_major = 6;
|
|
218
|
-
__traits.compute_capability_minor = 1;
|
|
219
|
-
__traits.compute_capability = 61;
|
|
221
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_61);
|
|
220
222
|
__traits.max_shared_memory_per_multiprocessor = 96 * 1024;
|
|
221
223
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
222
224
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
223
225
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
224
|
-
__traits.reserved_shared_memory_per_block = 0;
|
|
225
226
|
__traits.max_shared_memory_per_block_optin = 48 * 1024;
|
|
226
|
-
|
|
227
|
-
__traits.cluster_supported = false;
|
|
228
|
-
__traits.redux_intrinisic = false;
|
|
229
|
-
__traits.elect_intrinsic = false;
|
|
230
|
-
__traits.cp_async_supported = false;
|
|
231
|
-
__traits.tma_supported = false;
|
|
232
227
|
return __traits;
|
|
233
228
|
};
|
|
234
229
|
|
|
235
230
|
template <>
|
|
236
|
-
[[nodiscard]] _CCCL_API
|
|
231
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_70>() noexcept
|
|
237
232
|
{
|
|
238
|
-
|
|
239
|
-
__traits.arch_id = id::sm_70;
|
|
240
|
-
__traits.compute_capability_major = 7;
|
|
241
|
-
__traits.compute_capability_minor = 0;
|
|
242
|
-
__traits.compute_capability = 70;
|
|
233
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_70);
|
|
243
234
|
__traits.max_shared_memory_per_multiprocessor = 96 * 1024;
|
|
244
235
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
245
236
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
@@ -247,369 +238,300 @@ template <>
|
|
|
247
238
|
__traits.reserved_shared_memory_per_block = 0;
|
|
248
239
|
__traits.max_shared_memory_per_block_optin =
|
|
249
240
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
250
|
-
|
|
251
|
-
__traits.cluster_supported = false;
|
|
252
|
-
__traits.redux_intrinisic = false;
|
|
253
|
-
__traits.elect_intrinsic = false;
|
|
254
|
-
__traits.cp_async_supported = false;
|
|
255
|
-
__traits.tma_supported = false;
|
|
256
241
|
return __traits;
|
|
257
242
|
};
|
|
258
243
|
|
|
259
244
|
template <>
|
|
260
|
-
[[nodiscard]] _CCCL_API
|
|
245
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_75>() noexcept
|
|
261
246
|
{
|
|
262
|
-
|
|
263
|
-
__traits.arch_id = id::sm_75;
|
|
264
|
-
__traits.compute_capability_major = 7;
|
|
265
|
-
__traits.compute_capability_minor = 5;
|
|
266
|
-
__traits.compute_capability = 75;
|
|
247
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_75);
|
|
267
248
|
__traits.max_shared_memory_per_multiprocessor = 64 * 1024;
|
|
268
249
|
__traits.max_blocks_per_multiprocessor = 16;
|
|
269
250
|
__traits.max_threads_per_multiprocessor = 1024;
|
|
270
251
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
271
|
-
__traits.reserved_shared_memory_per_block = 0;
|
|
272
252
|
__traits.max_shared_memory_per_block_optin =
|
|
273
253
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
274
|
-
|
|
275
|
-
__traits.cluster_supported = false;
|
|
276
|
-
__traits.redux_intrinisic = false;
|
|
277
|
-
__traits.elect_intrinsic = false;
|
|
278
|
-
__traits.cp_async_supported = false;
|
|
279
|
-
__traits.tma_supported = false;
|
|
280
254
|
return __traits;
|
|
281
255
|
};
|
|
282
256
|
|
|
283
257
|
template <>
|
|
284
|
-
[[nodiscard]] _CCCL_API
|
|
258
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_80>() noexcept
|
|
285
259
|
{
|
|
286
|
-
|
|
287
|
-
__traits.arch_id = id::sm_80;
|
|
288
|
-
__traits.compute_capability_major = 8;
|
|
289
|
-
__traits.compute_capability_minor = 0;
|
|
290
|
-
__traits.compute_capability = 80;
|
|
260
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_80);
|
|
291
261
|
__traits.max_shared_memory_per_multiprocessor = 164 * 1024;
|
|
292
262
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
293
263
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
294
264
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
295
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
296
265
|
__traits.max_shared_memory_per_block_optin =
|
|
297
266
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
298
|
-
|
|
299
|
-
__traits.cluster_supported = false;
|
|
300
|
-
__traits.redux_intrinisic = true;
|
|
301
|
-
__traits.elect_intrinsic = false;
|
|
302
|
-
__traits.cp_async_supported = true;
|
|
303
|
-
__traits.tma_supported = false;
|
|
304
267
|
return __traits;
|
|
305
268
|
};
|
|
306
269
|
|
|
307
270
|
template <>
|
|
308
|
-
[[nodiscard]] _CCCL_API
|
|
271
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_86>() noexcept
|
|
309
272
|
{
|
|
310
|
-
|
|
311
|
-
__traits.arch_id = id::sm_86;
|
|
312
|
-
__traits.compute_capability_major = 8;
|
|
313
|
-
__traits.compute_capability_minor = 6;
|
|
314
|
-
__traits.compute_capability = 86;
|
|
273
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_86);
|
|
315
274
|
__traits.max_shared_memory_per_multiprocessor = 100 * 1024;
|
|
316
275
|
__traits.max_blocks_per_multiprocessor = 16;
|
|
317
276
|
__traits.max_threads_per_multiprocessor = 1536;
|
|
318
277
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
319
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
320
278
|
__traits.max_shared_memory_per_block_optin =
|
|
321
279
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
280
|
+
return __traits;
|
|
281
|
+
};
|
|
322
282
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
__traits
|
|
327
|
-
__traits.
|
|
283
|
+
template <>
|
|
284
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_87>() noexcept
|
|
285
|
+
{
|
|
286
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_87);
|
|
287
|
+
__traits.max_shared_memory_per_multiprocessor = 164 * 1024;
|
|
288
|
+
__traits.max_blocks_per_multiprocessor = 16;
|
|
289
|
+
__traits.max_threads_per_multiprocessor = 1536;
|
|
290
|
+
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
291
|
+
__traits.max_shared_memory_per_block_optin =
|
|
292
|
+
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
328
293
|
return __traits;
|
|
329
294
|
};
|
|
330
295
|
|
|
331
296
|
template <>
|
|
332
|
-
[[nodiscard]] _CCCL_API
|
|
297
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_88>() noexcept
|
|
333
298
|
{
|
|
334
|
-
|
|
335
|
-
__traits.arch_id
|
|
336
|
-
__traits.compute_capability_major
|
|
337
|
-
__traits.compute_capability_minor
|
|
338
|
-
__traits.compute_capability
|
|
299
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_86>();
|
|
300
|
+
__traits.arch_id = arch_id::sm_88;
|
|
301
|
+
__traits.compute_capability_major = 8;
|
|
302
|
+
__traits.compute_capability_minor = 8;
|
|
303
|
+
__traits.compute_capability = compute_capability{88};
|
|
304
|
+
return __traits;
|
|
305
|
+
};
|
|
306
|
+
|
|
307
|
+
template <>
|
|
308
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_89>() noexcept
|
|
309
|
+
{
|
|
310
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_89);
|
|
339
311
|
__traits.max_shared_memory_per_multiprocessor = 100 * 1024;
|
|
340
312
|
__traits.max_blocks_per_multiprocessor = 24;
|
|
341
313
|
__traits.max_threads_per_multiprocessor = 1536;
|
|
342
314
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
343
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
344
315
|
__traits.max_shared_memory_per_block_optin =
|
|
345
316
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
346
|
-
|
|
347
|
-
__traits.cluster_supported = false;
|
|
348
|
-
__traits.redux_intrinisic = true;
|
|
349
|
-
__traits.elect_intrinsic = false;
|
|
350
|
-
__traits.cp_async_supported = true;
|
|
351
|
-
__traits.tma_supported = false;
|
|
352
317
|
return __traits;
|
|
353
318
|
};
|
|
354
319
|
|
|
355
320
|
template <>
|
|
356
|
-
[[nodiscard]] _CCCL_API
|
|
321
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_90>() noexcept
|
|
357
322
|
{
|
|
358
|
-
|
|
359
|
-
__traits.arch_id = id::sm_90;
|
|
360
|
-
__traits.compute_capability_major = 9;
|
|
361
|
-
__traits.compute_capability_minor = 0;
|
|
362
|
-
__traits.compute_capability = 90;
|
|
323
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_90);
|
|
363
324
|
__traits.max_shared_memory_per_multiprocessor = 228 * 1024;
|
|
364
325
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
365
326
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
366
327
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
367
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
368
328
|
__traits.max_shared_memory_per_block_optin =
|
|
369
329
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
370
|
-
|
|
371
|
-
__traits.cluster_supported = true;
|
|
372
|
-
__traits.redux_intrinisic = true;
|
|
373
|
-
__traits.elect_intrinsic = true;
|
|
374
|
-
__traits.cp_async_supported = true;
|
|
375
|
-
__traits.tma_supported = true;
|
|
376
330
|
return __traits;
|
|
377
331
|
};
|
|
378
332
|
|
|
379
333
|
// No sm_90a specific fields for now.
|
|
380
334
|
template <>
|
|
381
|
-
[[nodiscard]] _CCCL_API
|
|
335
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_90a>() noexcept
|
|
382
336
|
{
|
|
383
|
-
|
|
337
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_90>();
|
|
338
|
+
__traits.arch_id = arch_id::sm_90a;
|
|
339
|
+
return __traits;
|
|
384
340
|
};
|
|
385
341
|
|
|
386
342
|
template <>
|
|
387
|
-
[[nodiscard]] _CCCL_API
|
|
343
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_100>() noexcept
|
|
388
344
|
{
|
|
389
|
-
|
|
390
|
-
__traits.arch_id = id::sm_100;
|
|
391
|
-
__traits.compute_capability_major = 10;
|
|
392
|
-
__traits.compute_capability_minor = 0;
|
|
393
|
-
__traits.compute_capability = 100;
|
|
345
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_100); // pass this specialization's own id, as the sm_89/sm_90/sm_120 specializations do
|
|
394
346
|
__traits.max_shared_memory_per_multiprocessor = 228 * 1024;
|
|
395
347
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
396
348
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
397
349
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
398
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
399
350
|
__traits.max_shared_memory_per_block_optin =
|
|
400
351
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
401
|
-
|
|
402
|
-
__traits.cluster_supported = true;
|
|
403
|
-
__traits.redux_intrinisic = true;
|
|
404
|
-
__traits.elect_intrinsic = true;
|
|
405
|
-
__traits.cp_async_supported = true;
|
|
406
|
-
__traits.tma_supported = true;
|
|
407
352
|
return __traits;
|
|
408
353
|
};
|
|
409
354
|
|
|
410
355
|
template <>
|
|
411
|
-
[[nodiscard]] _CCCL_API
|
|
356
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_100a>() noexcept
|
|
412
357
|
{
|
|
413
|
-
|
|
358
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
|
|
359
|
+
__traits.arch_id = arch_id::sm_100a;
|
|
360
|
+
return __traits;
|
|
414
361
|
};
|
|
415
362
|
|
|
416
363
|
template <>
|
|
417
|
-
[[nodiscard]] _CCCL_API
|
|
364
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_103>() noexcept
|
|
418
365
|
{
|
|
419
|
-
|
|
420
|
-
__traits.arch_id =
|
|
366
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
|
|
367
|
+
__traits.arch_id = arch_id::sm_103;
|
|
421
368
|
__traits.compute_capability_major = 10;
|
|
422
369
|
__traits.compute_capability_minor = 3;
|
|
423
|
-
__traits.compute_capability = 103;
|
|
370
|
+
__traits.compute_capability = compute_capability{103};
|
|
424
371
|
return __traits;
|
|
425
372
|
};
|
|
426
373
|
|
|
427
374
|
template <>
|
|
428
|
-
[[nodiscard]] _CCCL_API
|
|
375
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_103a>() noexcept
|
|
429
376
|
{
|
|
430
|
-
|
|
377
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_103>();
|
|
378
|
+
__traits.arch_id = arch_id::sm_103a;
|
|
379
|
+
return __traits;
|
|
431
380
|
};
|
|
432
381
|
|
|
433
382
|
template <>
|
|
434
|
-
[[nodiscard]] _CCCL_API
|
|
383
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_110>() noexcept
|
|
435
384
|
{
|
|
436
|
-
|
|
437
|
-
__traits.arch_id
|
|
438
|
-
__traits.compute_capability_major
|
|
439
|
-
__traits.compute_capability_minor
|
|
440
|
-
__traits.compute_capability
|
|
385
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
|
|
386
|
+
__traits.arch_id = arch_id::sm_110;
|
|
387
|
+
__traits.compute_capability_major = 11;
|
|
388
|
+
__traits.compute_capability_minor = 0;
|
|
389
|
+
__traits.compute_capability = compute_capability{110};
|
|
390
|
+
__traits.max_blocks_per_multiprocessor = 24;
|
|
391
|
+
__traits.max_threads_per_multiprocessor = 1536;
|
|
392
|
+
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
441
393
|
return __traits;
|
|
442
394
|
};
|
|
443
395
|
|
|
444
396
|
template <>
|
|
445
|
-
[[nodiscard]] _CCCL_API
|
|
397
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_110a>() noexcept
|
|
446
398
|
{
|
|
447
|
-
|
|
399
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_110>();
|
|
400
|
+
__traits.arch_id = arch_id::sm_110a;
|
|
401
|
+
return __traits;
|
|
448
402
|
};
|
|
449
403
|
|
|
450
404
|
template <>
|
|
451
|
-
[[nodiscard]]
|
|
405
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_120>() noexcept
|
|
452
406
|
{
|
|
453
|
-
|
|
454
|
-
__traits.arch_id = id::sm_120;
|
|
455
|
-
__traits.compute_capability_major = 12;
|
|
456
|
-
__traits.compute_capability_minor = 0;
|
|
457
|
-
__traits.compute_capability = 120;
|
|
407
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_120);
|
|
458
408
|
__traits.max_shared_memory_per_multiprocessor = 100 * 1024;
|
|
459
|
-
__traits.max_blocks_per_multiprocessor =
|
|
409
|
+
__traits.max_blocks_per_multiprocessor = 24;
|
|
460
410
|
__traits.max_threads_per_multiprocessor = 1536;
|
|
461
411
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
462
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
463
412
|
__traits.max_shared_memory_per_block_optin =
|
|
464
413
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
414
|
+
return __traits;
|
|
415
|
+
};
|
|
465
416
|
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
__traits
|
|
470
|
-
__traits.
|
|
417
|
+
template <>
|
|
418
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_120a>() noexcept
|
|
419
|
+
{
|
|
420
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_120>();
|
|
421
|
+
__traits.arch_id = arch_id::sm_120a;
|
|
471
422
|
return __traits;
|
|
472
423
|
};
|
|
473
424
|
|
|
474
425
|
template <>
|
|
475
|
-
[[nodiscard]] _CCCL_API
|
|
426
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_121>() noexcept
|
|
476
427
|
{
|
|
477
|
-
|
|
428
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_120>();
|
|
429
|
+
__traits.arch_id = arch_id::sm_121;
|
|
430
|
+
__traits.compute_capability_major = 12;
|
|
431
|
+
__traits.compute_capability_minor = 1;
|
|
432
|
+
__traits.compute_capability = compute_capability{121};
|
|
433
|
+
return __traits;
|
|
478
434
|
};
|
|
479
435
|
|
|
480
|
-
|
|
436
|
+
template <>
|
|
437
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_121a>() noexcept
|
|
438
|
+
{
|
|
439
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_121>();
|
|
440
|
+
__traits.arch_id = arch_id::sm_121a;
|
|
441
|
+
return __traits;
|
|
442
|
+
};
|
|
481
443
|
|
|
482
|
-
|
|
444
|
+
//! @brief Gets the architecture traits for the given architecture id \c __id.
|
|
445
|
+
//!
|
|
446
|
+
//! @throws \c cuda::cuda_error if the \c __id is not a known architecture.
|
|
447
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits_for(arch_id __id)
|
|
483
448
|
{
|
|
484
449
|
switch (__id)
|
|
485
450
|
{
|
|
486
|
-
case
|
|
487
|
-
return ::cuda::
|
|
488
|
-
case
|
|
489
|
-
return ::cuda::
|
|
490
|
-
case
|
|
491
|
-
return ::cuda::
|
|
492
|
-
case
|
|
493
|
-
return ::cuda::
|
|
494
|
-
case
|
|
495
|
-
return ::cuda::
|
|
496
|
-
case
|
|
497
|
-
return ::cuda::
|
|
498
|
-
case
|
|
499
|
-
return ::cuda::
|
|
500
|
-
case
|
|
501
|
-
return ::cuda::
|
|
502
|
-
case
|
|
503
|
-
return ::cuda::
|
|
504
|
-
case
|
|
505
|
-
return ::cuda::
|
|
506
|
-
case
|
|
507
|
-
return ::cuda::
|
|
508
|
-
case
|
|
509
|
-
return ::cuda::
|
|
510
|
-
case
|
|
511
|
-
return ::cuda::
|
|
512
|
-
case
|
|
513
|
-
return ::cuda::
|
|
514
|
-
case
|
|
515
|
-
return ::cuda::
|
|
516
|
-
case
|
|
517
|
-
return ::cuda::
|
|
518
|
-
case
|
|
519
|
-
return ::cuda::
|
|
451
|
+
case arch_id::sm_60:
|
|
452
|
+
return ::cuda::arch_traits<arch_id::sm_60>();
|
|
453
|
+
case arch_id::sm_61:
|
|
454
|
+
return ::cuda::arch_traits<arch_id::sm_61>();
|
|
455
|
+
case arch_id::sm_70:
|
|
456
|
+
return ::cuda::arch_traits<arch_id::sm_70>();
|
|
457
|
+
case arch_id::sm_75:
|
|
458
|
+
return ::cuda::arch_traits<arch_id::sm_75>();
|
|
459
|
+
case arch_id::sm_80:
|
|
460
|
+
return ::cuda::arch_traits<arch_id::sm_80>();
|
|
461
|
+
case arch_id::sm_86:
|
|
462
|
+
return ::cuda::arch_traits<arch_id::sm_86>();
|
|
463
|
+
case arch_id::sm_87:
|
|
464
|
+
return ::cuda::arch_traits<arch_id::sm_87>();
|
|
465
|
+
case arch_id::sm_88:
|
|
466
|
+
return ::cuda::arch_traits<arch_id::sm_88>();
|
|
467
|
+
case arch_id::sm_89:
|
|
468
|
+
return ::cuda::arch_traits<arch_id::sm_89>();
|
|
469
|
+
case arch_id::sm_90:
|
|
470
|
+
return ::cuda::arch_traits<arch_id::sm_90>();
|
|
471
|
+
case arch_id::sm_90a:
|
|
472
|
+
return ::cuda::arch_traits<arch_id::sm_90a>();
|
|
473
|
+
case arch_id::sm_100:
|
|
474
|
+
return ::cuda::arch_traits<arch_id::sm_100>();
|
|
475
|
+
case arch_id::sm_100a:
|
|
476
|
+
return ::cuda::arch_traits<arch_id::sm_100a>();
|
|
477
|
+
case arch_id::sm_103:
|
|
478
|
+
return ::cuda::arch_traits<arch_id::sm_103>();
|
|
479
|
+
case arch_id::sm_103a:
|
|
480
|
+
return ::cuda::arch_traits<arch_id::sm_103a>();
|
|
481
|
+
case arch_id::sm_110:
|
|
482
|
+
return ::cuda::arch_traits<arch_id::sm_110>();
|
|
483
|
+
case arch_id::sm_110a:
|
|
484
|
+
return ::cuda::arch_traits<arch_id::sm_110a>();
|
|
485
|
+
case arch_id::sm_120:
|
|
486
|
+
return ::cuda::arch_traits<arch_id::sm_120>();
|
|
487
|
+
case arch_id::sm_120a:
|
|
488
|
+
return ::cuda::arch_traits<arch_id::sm_120a>();
|
|
489
|
+
case arch_id::sm_121:
|
|
490
|
+
return ::cuda::arch_traits<arch_id::sm_121>();
|
|
491
|
+
case arch_id::sm_121a:
|
|
492
|
+
return ::cuda::arch_traits<arch_id::sm_121a>();
|
|
520
493
|
default:
|
|
521
494
|
::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture");
|
|
522
495
|
break;
|
|
523
496
|
}
|
|
524
497
|
}
|
|
525
498
|
|
|
526
|
-
|
|
499
|
+
//! @brief Gets the architecture traits for the given compute capability \c __cc.
|
|
500
|
+
//!
|
|
501
|
+
//! @throws \c cuda::cuda_error if the \c __cc doesn't have a corresponding architecture id.
|
|
502
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits_for(compute_capability __cc)
|
|
527
503
|
{
|
|
528
|
-
|
|
529
|
-
{
|
|
530
|
-
::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
|
|
531
|
-
}
|
|
532
|
-
return static_cast<id>(compute_capability);
|
|
504
|
+
return ::cuda::arch_traits_for(::cuda::to_arch_id(__cc));
|
|
533
505
|
}
|
|
534
506
|
|
|
535
|
-
|
|
536
|
-
{
|
|
537
|
-
return ::cuda::arch::traits_for_id(::cuda::arch::id_for_compute_capability(compute_capability));
|
|
538
|
-
}
|
|
507
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
539
508
|
|
|
540
|
-
|
|
541
|
-
{
|
|
542
|
-
switch (value)
|
|
543
|
-
{
|
|
544
|
-
case 90:
|
|
545
|
-
return id::sm_90a;
|
|
546
|
-
case 100:
|
|
547
|
-
return id::sm_100a;
|
|
548
|
-
case 103:
|
|
549
|
-
return id::sm_103a;
|
|
550
|
-
case 110:
|
|
551
|
-
return id::sm_110a;
|
|
552
|
-
case 120:
|
|
553
|
-
return id::sm_120a;
|
|
554
|
-
default:
|
|
555
|
-
::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
|
|
556
|
-
break;
|
|
557
|
-
}
|
|
558
|
-
}
|
|
509
|
+
# if _CCCL_CUDA_COMPILATION()
|
|
559
510
|
|
|
560
|
-
|
|
561
|
-
[[nodiscard]] _CCCL_DEVICE_API inline constexpr arch::traits_t current_traits()
|
|
562
|
-
{
|
|
563
|
-
// fixme: this doesn't work with nvc++ -cuda
|
|
564
|
-
# ifdef __CUDA_ARCH__
|
|
565
|
-
# ifdef __CUDA_ARCH_SPECIFIC__
|
|
566
|
-
return ::cuda::arch::traits_for_id(::cuda::arch::__special_id_for_compute_capability(__CUDA_ARCH_SPECIFIC__ / 10));
|
|
567
|
-
# else
|
|
568
|
-
return ::cuda::arch::traits_for_compute_capability(__CUDA_ARCH__ / 10);
|
|
569
|
-
# endif // __CUDA_ARCH_SPECIFIC__
|
|
570
|
-
# else // __CUDA_ARCH__
|
|
571
|
-
// Should be unreachable in __device__ function
|
|
572
|
-
return ::cuda::arch::traits_t{};
|
|
573
|
-
# endif // __CUDA_ARCH__
|
|
574
|
-
}
|
|
511
|
+
_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
|
|
575
512
|
|
|
576
|
-
|
|
577
|
-
|
|
513
|
+
//! @brief Returns the \c cuda::arch_traits_t of the architecture that is currently being compiled for.
|
|
514
|
+
//!
|
|
515
|
+
//! If the current architecture is not a known architecture from \c cuda::arch_id enumeration, the compilation
|
|
516
|
+
//! will fail.
|
|
517
|
+
//!
|
|
518
|
+
//! @note This API cannot be used in constexpr context when compiling with nvc++ in CUDA mode.
|
|
519
|
+
template <class _Dummy = void>
|
|
520
|
+
[[nodiscard]] _CCCL_DEVICE_API _CCCL_TARGET_CONSTEXPR ::cuda::arch_traits_t current_arch_traits() noexcept
|
|
578
521
|
{
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
}
|
|
583
|
-
|
|
584
|
-
{
|
|
585
|
-
// If the architecture is unknown, we need to craft the arch_traits from attributes
|
|
586
|
-
arch::traits_t __traits{};
|
|
587
|
-
__traits.compute_capability_major = __compute_capability / 10;
|
|
588
|
-
__traits.compute_capability_minor = __compute_capability % 10;
|
|
589
|
-
__traits.compute_capability = __compute_capability;
|
|
590
|
-
__traits.max_shared_memory_per_multiprocessor =
|
|
591
|
-
::cuda::device_attributes::max_shared_memory_per_multiprocessor(__device);
|
|
592
|
-
__traits.max_blocks_per_multiprocessor = ::cuda::device_attributes::max_blocks_per_multiprocessor(__device);
|
|
593
|
-
__traits.max_threads_per_multiprocessor = ::cuda::device_attributes::max_threads_per_multiprocessor(__device);
|
|
594
|
-
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
595
|
-
__traits.reserved_shared_memory_per_block = ::cuda::device_attributes::reserved_shared_memory_per_block(__device);
|
|
596
|
-
__traits.max_shared_memory_per_block_optin =
|
|
597
|
-
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
598
|
-
|
|
599
|
-
__traits.cluster_supported = __compute_capability >= 90;
|
|
600
|
-
__traits.redux_intrinisic = __compute_capability >= 80;
|
|
601
|
-
__traits.elect_intrinsic = __compute_capability >= 90;
|
|
602
|
-
__traits.cp_async_supported = __compute_capability >= 80;
|
|
603
|
-
__traits.tma_supported = __compute_capability >= 90;
|
|
604
|
-
return __traits;
|
|
605
|
-
}
|
|
522
|
+
# if _CCCL_DEVICE_COMPILATION()
|
|
523
|
+
return ::cuda::arch_traits_for(::cuda::device::current_arch_id<_Dummy>());
|
|
524
|
+
# else // ^^^ _CCCL_DEVICE_COMPILATION() ^^^ / vvv !_CCCL_DEVICE_COMPILATION() vvv
|
|
525
|
+
return {};
|
|
526
|
+
# endif // ^^^ !_CCCL_DEVICE_COMPILATION() ^^^
|
|
606
527
|
}
|
|
607
|
-
} // namespace arch
|
|
608
528
|
|
|
609
|
-
|
|
529
|
+
_CCCL_END_NAMESPACE_CUDA_DEVICE
|
|
530
|
+
|
|
531
|
+
# endif // _CCCL_CUDA_COMPILATION
|
|
610
532
|
|
|
611
533
|
# include <cuda/std/__cccl/epilogue.h>
|
|
612
534
|
|
|
613
|
-
#endif // _CCCL_HAS_CTK()
|
|
535
|
+
#endif // _CCCL_HAS_CTK()
|
|
614
536
|
|
|
615
537
|
#endif // _CUDA___DEVICE_ARCH_TRAITS_H
|