cuda-cccl 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cuda-cccl has been flagged as possibly problematic.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh:

@@ -50,8 +50,8 @@

 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/std/__algorithm/clamp.h>
-#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/__bit/integral.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/is_integral.h>

@@ -630,7 +630,7 @@ struct WarpScanShfl
 ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();

 // Find index of first set bit
-int segment_first_lane = ::cuda::std::
+int segment_first_lane = ::cuda::std::__bit_log2(ballot);

 // Iterate scan steps
 _CCCL_PRAGMA_UNROLL_FULL()
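The warp_scan_shfl.cuh change swaps the tail of the segment-head computation for `__bit_log2`: the ballot of segment flags is ANDed with the `%lanemask_le` mask (lanes at or below the caller), and the index of the highest surviving set bit is the first lane of the caller's segment. Below is a host-side sketch of that bit trick using C++20 `<bit>`; the `lanemask_le` and `bit_log2` helpers are illustrative stand-ins for the PTX special register and the CCCL bit helpers, not the CCCL code itself.

```cpp
// Host-side sketch: nearest flagged lane at or below a given lane via a masked ballot.
#include <bit>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Index of the most significant set bit; mask must be non-zero.
int bit_log2(std::uint32_t mask) { return std::bit_width(mask) - 1; }

// Mask of lanes with index <= lane (what %lanemask_le provides on the GPU).
std::uint32_t lanemask_le(int lane) {
  return lane == 31 ? 0xFFFFFFFFu : ((1u << (lane + 1)) - 1u);
}

int main() {
  // Segment heads flagged at lanes 0, 9 and 20.
  std::uint32_t ballot = 0b0000'0000'0001'0000'0000'0010'0000'0001u;
  for (int lane : {5, 9, 31}) {
    std::uint32_t at_or_below = ballot & lanemask_le(lane);
    std::printf("lane %2d -> segment first lane %d\n", lane, bit_log2(at_or_below));
  }
}
```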
cuda/cccl/headers/include/cuda/__algorithm/copy.h:

@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_COPY_H
 #define __CUDA___ALGORITHM_COPY_H

-#include <cuda/
+#include <cuda/std/detail/__config>

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 # pragma GCC system_header

@@ -38,11 +38,11 @@ enum class source_access_order
 {
 # if _CCCL_CTK_AT_LEAST(13, 0)
   //! @brief Access source in stream order
-  stream = cudaMemcpySrcAccessOrderStream,
+  stream = ::cudaMemcpySrcAccessOrderStream,
   //! @brief Access source during the copy call, source can be destroyed after the API returns
-  during_api_call = cudaMemcpySrcAccessOrderDuringApiCall,
+  during_api_call = ::cudaMemcpySrcAccessOrderDuringApiCall,
   //! @brief Access source in any order, the order can change across CUDA releases
-  any = cudaMemcpySrcAccessOrderAny,
+  any = ::cudaMemcpySrcAccessOrderAny,
 # else
   any = 0x3,
 # endif // _CCCL_CTK_BELOW(13, 0)
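The only change to `source_access_order` is the leading `::` on the CUDA runtime enumerators. Inside a nested namespace an unqualified name is looked up in the nearest enclosing scopes first, so a same-named symbol introduced anywhere in the library's namespaces could silently win; the global-scope qualifier pins lookup to the runtime constants. A minimal, self-contained illustration of that lookup difference follows; all names in it are invented for the example, not CCCL or CUDA symbols.

```cpp
// Why "::name" matters for enumerator initializers inside nested namespaces.
#include <cstdio>

// Stand-in for a constant defined at global scope by a C header (e.g. the CUDA runtime).
enum GlobalOrder { globalOrderStream = 1 };

namespace lib {
// A colliding name in an enclosing namespace wins unqualified lookup.
constexpr int globalOrderStream = 42;

namespace detail {
enum class source_access_order {
  unqualified = globalOrderStream,   // finds lib::globalOrderStream (42)
  qualified   = ::globalOrderStream, // always the global constant (1)
};
} // namespace detail
} // namespace lib

int main() {
  std::printf("unqualified = %d, qualified = %d\n",
              static_cast<int>(lib::detail::source_access_order::unqualified),
              static_cast<int>(lib::detail::source_access_order::qualified));
}
```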
cuda/cccl/headers/include/cuda/__device/all_devices.h:

@@ -11,7 +11,7 @@
 #ifndef _CUDA___DEVICE_ALL_DEVICES_H
 #define _CUDA___DEVICE_ALL_DEVICES_H

-#include <cuda/
+#include <cuda/std/detail/__config>

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 # pragma GCC system_header

@@ -22,10 +22,12 @@
 #endif // no system header

 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
+# include <cuda/__device/device_ref.h>
 # include <cuda/__device/physical_device.h>
-# include <cuda/
-# include <cuda/
-# include <cuda/std/
+# include <cuda/__driver/driver_api.h>
+# include <cuda/__fwd/devices.h>
+# include <cuda/std/__cstddef/types.h>
 # include <cuda/std/span>

 # include <vector>

@@ -33,132 +35,62 @@
 # include <cuda/std/__cccl/prologue.h>

 _CCCL_BEGIN_NAMESPACE_CUDA
-namespace __detail
-{
-//! @brief A random-access range of all available CUDA devices
-class all_devices
-{
-public:
-  using size_type = ::std::vector<physical_device>::size_type;
-  using iterator = ::std::vector<physical_device>::const_iterator;
-  using const_iterator = ::std::vector<physical_device>::const_iterator;
-
-  all_devices() = default;
-
-  [[nodiscard]] const physical_device& operator[](size_type __i) const;
-
-  [[nodiscard]] size_type size() const;

-
-
-  [[nodiscard]] iterator end() const noexcept;
-
-  operator ::cuda::std::span<const device_ref>() const;
-
-private:
-  struct __initializer_iterator;
-
-  static const ::std::vector<physical_device>& __devices();
-};
-
-//! @brief An iterator used to in-place construct `device` objects in a
-//! std::vector.
-//!
-//! Since `device` objects are not movable or copyable, we need to construct them
-//! in-place with a proxy object that can be implicitly converted to a `device`
-//! object.
-struct all_devices::__initializer_iterator
+[[nodiscard]] _CCCL_HOST_API inline ::std::vector<device_ref> __make_devices()
 {
-
-
-
-  using difference_type = int;
-  using pointer = __emplace_device;
-
-  int __id_;
-
-  __emplace_device operator*() const noexcept
+  ::std::vector<device_ref> __ret{};
+  __ret.reserve(::cuda::__physical_devices().size());
+  for (::cuda::std::size_t __i = 0; __i < ::cuda::__physical_devices().size(); ++__i)
   {
-
+    __ret.emplace_back(static_cast<int>(__i));
   }
+  return __ret;
+}

-
-
-
-}
+[[nodiscard]] inline ::cuda::std::span<const device_ref> __devices()
+{
+  static const auto __devices = ::cuda::__make_devices();
+  return ::cuda::std::span<const device_ref>{__devices.data(), __devices.size()};
+}

-
-
-
-
-
+//! @brief A random-access range of all available CUDA devices
+class __all_devices
+{
+public:
+  using value_type = ::cuda::std::span<const device_ref>::value_type;
+  using size_type = ::cuda::std::span<const device_ref>::size_type;
+  using iterator = ::cuda::std::span<const device_ref>::iterator;
+
+  _CCCL_HIDE_FROM_ABI __all_devices() = default;
+  __all_devices(const __all_devices&) = delete;
+  __all_devices(__all_devices&&) = delete;
+  __all_devices& operator=(const __all_devices&) = delete;
+  __all_devices& operator=(__all_devices&&) = delete;

-
+  [[nodiscard]] _CCCL_HOST_API device_ref operator[](size_type __i) const
   {
-
-
-
+    if (__i >= size())
+    {
+      ::cuda::std::__throw_out_of_range("device index out of range");
+    }
+    return ::cuda::__devices()[__i];
   }

-
+  [[nodiscard]] _CCCL_HOST_API size_type size() const
   {
-    return
+    return ::cuda::__devices().size();
   }

-
+  [[nodiscard]] _CCCL_HOST_API iterator begin() const
   {
-    return
+    return ::cuda::__devices().begin();
   }
-};

-  [[nodiscard]]
-  {
-    if (__id_ >= size())
+  [[nodiscard]] _CCCL_HOST_API iterator end() const
   {
-
-    {
-      ::cuda::std::__throw_out_of_range("device was requested but no CUDA devices found");
-    }
-    else
-    {
-      ::cuda::std::__throw_out_of_range(
-        (::std::string("device index out of range: ") + ::std::to_string(__id_)).c_str());
-    }
+    return ::cuda::__devices().end();
   }
-
-}
-
-[[nodiscard]] inline all_devices::size_type all_devices::size() const
-{
-  return __devices().size();
-}
-
-[[nodiscard]] inline all_devices::iterator all_devices::begin() const noexcept
-{
-  return __devices().begin();
-}
-
-[[nodiscard]] inline all_devices::iterator all_devices::end() const noexcept
-{
-  return __devices().end();
-}
-
-inline all_devices::operator ::cuda::std::span<const device_ref>() const
-{
-  static const ::std::vector<device_ref> __refs(begin(), end());
-  return ::cuda::std::span<const device_ref>(__refs);
-}
-
-inline const ::std::vector<physical_device>& all_devices::__devices()
-{
-  static const ::std::vector<physical_device> __devices = [] {
-    int __count = 0;
-    _CCCL_TRY_CUDA_API(::cudaGetDeviceCount, "failed to get the count of CUDA devices", &__count);
-    return ::std::vector<physical_device>{__initializer_iterator{0}, __initializer_iterator{__count}};
-  }();
-  return __devices;
-}
-} // namespace __detail
+};

 //! @brief A range of all available CUDA devices
 //!

@@ -174,7 +106,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
 //! struct iterator;
 //! using const_iterator = iterator;
 //!
-//! [[nodiscard]]
+//! [[nodiscard]] device_ref operator[](size_type i) const noexcept;
 //!
 //! [[nodiscard]] size_type size() const;
 //!

@@ -186,7 +118,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
 //!
 //! @par
 //! `__all_devices::iterator` is a random access iterator with a `reference`
-//! type of `const
+//! type of `const device_ref&`.
 //!
 //! @par Example
 //! @code

@@ -197,39 +129,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
 //! @sa
 //! * device
 //! * device_ref
-inline constexpr
-
-inline const arch::traits_t& device_ref::arch_traits() const
-{
-  return devices[get()].arch_traits();
-}
-
-[[nodiscard]] inline ::std::vector<device_ref> device_ref::peer_devices() const
-{
-  ::std::vector<device_ref> __result;
-  __result.reserve(devices.size());
-
-  for (const physical_device& __other_dev : devices)
-  {
-    // Exclude the device this API is called on. The main use case for this API
-    // is enable/disable peer access. While enable peer access can be called on
-    // device on which memory resides, disable peer access will error-out.
-    // Usage of the peer access control is smoother when *this is excluded,
-    // while it can be easily added with .push_back() on the vector if a full
-    // group of peers is needed (for cases other than peer access control)
-    if (__other_dev != *this)
-    {
-      // While in almost all practical applications peer access should be symmetrical,
-      // it is possible to build a system with one directional peer access, check
-      // both ways here just to be safe
-      if (has_peer_access_to(__other_dev) && __other_dev.has_peer_access_to(*this))
-      {
-        __result.push_back(__other_dev);
-      }
-    }
-  }
-  return __result;
-}
+inline constexpr __all_devices devices{};

 _CCCL_END_NAMESPACE_CUDA
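The all_devices.h rewrite drops the vector-of-`physical_device` based `__detail::all_devices` class and its in-place-construction iterator, and instead builds a `device_ref` list once behind a function-local static, exposes it as a span, and wraps that span in the trivially constructible `__all_devices` range behind the `cuda::devices` global. A simplified, self-contained sketch of that pattern in plain C++20 follows; `device_ref`, the hard-coded device count, and the helper names are stand-ins rather than the CCCL implementation, which queries the CUDA driver.

```cpp
// Sketch: lazily built device list exposed through a span-backed, stateless range.
#include <cstddef>
#include <cstdio>
#include <span>
#include <stdexcept>
#include <vector>

struct device_ref {  // stand-in for cuda::device_ref
  int id;
};

std::span<const device_ref> all_device_refs() {
  // Function-local static: built once, thread-safe, on first use.
  static const std::vector<device_ref> refs = [] {
    int count = 4;  // hypothetical; the real header asks the CUDA driver for the count
    std::vector<device_ref> v;
    v.reserve(count);
    for (int i = 0; i < count; ++i) v.push_back(device_ref{i});
    return v;
  }();
  return {refs.data(), refs.size()};
}

struct all_devices_range {
  device_ref operator[](std::size_t i) const {
    if (i >= size()) throw std::out_of_range("device index out of range");
    return all_device_refs()[i];
  }
  std::size_t size() const { return all_device_refs().size(); }
  auto begin() const { return all_device_refs().begin(); }
  auto end() const { return all_device_refs().end(); }
};

// The range holds no state, so a constexpr global instance costs nothing.
inline constexpr all_devices_range devices{};

int main() {
  std::printf("%zu devices\n", devices.size());
  for (device_ref d : devices) std::printf("device %d\n", d.id);
}
```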
cuda/cccl/headers/include/cuda/__device/arch_traits.h:

@@ -11,7 +11,7 @@
 #ifndef _CUDA___DEVICE_ARCH_TRAITS_H
 #define _CUDA___DEVICE_ARCH_TRAITS_H

-#include <cuda/
+#include <cuda/std/detail/__config>

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 # pragma GCC system_header

@@ -22,7 +22,9 @@
 #endif // no system header

 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
 # include <cuda/__device/attributes.h>
+# include <cuda/__fwd/devices.h>
 # include <cuda/std/__exception/cuda_error.h>
 # include <cuda/std/limits>

@@ -58,76 +60,76 @@ enum class id : int
   sm_120a = 120 * __arch_specific_id_multiplier,
 };

-
-
+//! @brief Architecture traits
+//! This type contains information about an architecture that is constant across devices of that architecture.
 struct traits_t
 {
   // Maximum number of threads per block
-
+  int max_threads_per_block = 1024;

   // Maximum x-dimension of a block
-
+  int max_block_dim_x = 1024;

   // Maximum y-dimension of a block
-
+  int max_block_dim_y = 1024;

   // Maximum z-dimension of a block
-
+  int max_block_dim_z = 64;

   // Maximum x-dimension of a grid
-
+  int max_grid_dim_x = ::cuda::std::numeric_limits<int32_t>::max();

   // Maximum y-dimension of a grid
-
+  int max_grid_dim_y = 64 * 1024 - 1;

   // Maximum z-dimension of a grid
-
+  int max_grid_dim_z = 64 * 1024 - 1;

   // Maximum amount of shared memory available to a thread block in bytes
-
+  ::cuda::std::size_t max_shared_memory_per_block = 48 * 1024;

   // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
-
+  ::cuda::std::size_t total_constant_memory = 64 * 1024;

   // Warp size in threads
-
+  int warp_size = 32;

   // Maximum number of concurrent grids on the device
-
+  int max_resident_grids = 128;

   // true if the device can concurrently copy memory between host and device
   // while executing a kernel, or false if not
-
+  bool gpu_overlap = true;

   // true if the device can map host memory into CUDA address space
-
+  bool can_map_host_memory = true;

   // true if the device supports executing multiple kernels within the same
   // context simultaneously, or false if not. It is not guaranteed that multiple
   // kernels will be resident on the device concurrently so this feature should
   // not be relied upon for correctness.
-
+  bool concurrent_kernels = true;

   // true if the device supports stream priorities, or false if not
-
+  bool stream_priorities_supported = true;

   // true if device supports caching globals in L1 cache, false if not
-
+  bool global_l1_cache_supported = true;

   // true if device supports caching locals in L1 cache, false if not
-
+  bool local_l1_cache_supported = true;

   // TODO: We might want to have these per-arch
   // Maximum number of 32-bit registers available to a thread block
-
+  int max_registers_per_block = 64 * 1024;

   // Maximum number of 32-bit registers available to a multiprocessor; this
   // number is shared by all thread blocks simultaneously resident on a
   // multiprocessor
-
+  int max_registers_per_multiprocessor = 64 * 1024;

   // Maximum number of 32-bit registers available to a thread
-
+  int max_registers_per_thread = 255;

   // Identifier for the architecture
   id arch_id;

@@ -144,7 +146,7 @@ struct traits_t
   // Maximum amount of shared memory available to a multiprocessor in bytes;
   // this amount is shared by all thread blocks simultaneously resident on a
   // multiprocessor
-
+  ::cuda::std::size_t max_shared_memory_per_multiprocessor;

   // Maximum number of thread blocks that can reside on a multiprocessor
   int max_blocks_per_multiprocessor;

@@ -156,11 +158,11 @@ struct traits_t
   int max_warps_per_multiprocessor;

   // Shared memory reserved by CUDA driver per block in bytes
-
+  ::cuda::std::size_t reserved_shared_memory_per_block;

   // Maximum per block shared memory size on the device. This value can be opted
   // into when using dynamic_shared_memory with NonPortableSize set to true
-
+  ::cuda::std::size_t max_shared_memory_per_block_optin;

   // TODO: Do we want these?:
   // true if architecture supports clusters
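The `traits_t` hunks replace bare member declarations with default member initializers for the values that are the same on every supported architecture, while the per-architecture members stay uninitialized and are filled in by the `traits<id>()` specializations shown further down, either from scratch or by copying a related architecture's traits. A condensed sketch of that pattern follows; the names and the shared-memory number are illustrative placeholders, not real hardware limits or CCCL code.

```cpp
// Sketch: shared defaults in the struct, per-architecture overrides in constexpr factories.
#include <cstdio>

enum class arch_id { sm_100, sm_103 };

struct traits_t {
  int max_threads_per_block = 1024;  // identical across architectures
  int warp_size = 32;                // identical across architectures
  arch_id id{};
  int max_shared_memory_per_multiprocessor = 0;  // architecture-specific, filled per arch
};

template <arch_id Id>
constexpr traits_t traits();

template <>
constexpr traits_t traits<arch_id::sm_100>() {
  traits_t t{};                      // start from the shared defaults
  t.id = arch_id::sm_100;
  t.max_shared_memory_per_multiprocessor = 228 * 1024;  // illustrative value only
  return t;
}

template <>
constexpr traits_t traits<arch_id::sm_103>() {
  traits_t t = traits<arch_id::sm_100>();  // inherit from a related architecture
  t.id = arch_id::sm_103;
  return t;
}

int main() {
  constexpr traits_t t = traits<arch_id::sm_103>();
  std::printf("warp=%d smem/sm=%d\n", t.warp_size, t.max_shared_memory_per_multiprocessor);
}
```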
@@ -182,10 +184,10 @@ struct traits_t
 // @brief Architecture traits
 // Template function that returns the traits for an architecture with a given id.
 template <id _Id>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API constexpr traits_t traits();

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_60>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_60;

@@ -208,7 +210,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_61>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_61;

@@ -231,7 +233,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_70>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_70;

@@ -255,7 +257,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_75>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_75;

@@ -279,7 +281,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_80>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_80;

@@ -303,7 +305,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_86>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_86;

@@ -327,7 +329,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_89>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_89;

@@ -351,7 +353,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_90>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_90;

@@ -376,13 +378,13 @@ template <>

 // No sm_90a specific fields for now.
 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_90a>()
 {
   return ::cuda::arch::traits<id::sm_90>();
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_100>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_100;

@@ -406,13 +408,13 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_100a>()
 {
   return ::cuda::arch::traits<id::sm_100>();
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_103>()
 {
   traits_t __traits = ::cuda::arch::traits<id::sm_100>();
   __traits.arch_id = id::sm_103;

@@ -423,13 +425,13 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_103a>()
 {
   return ::cuda::arch::traits<id::sm_103>();
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_110>()
 {
   traits_t __traits = ::cuda::arch::traits<id::sm_100>();
   __traits.arch_id = id::sm_110;

@@ -440,7 +442,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_110a>()
 {
   return ::cuda::arch::traits<id::sm_110>();
 };

@@ -470,7 +472,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_120a>()
 {
   return ::cuda::arch::traits<id::sm_120>();
 };

@@ -516,7 +518,7 @@ inline constexpr int __highest_known_arch = 120;
     case id::sm_120a:
       return ::cuda::arch::traits<id::sm_120a>();
     default:
-      ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Traits requested for an unknown architecture");
+      ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture");
       break;
   }
 }

@@ -525,7 +527,7 @@ inline constexpr int __highest_known_arch = 120;
 {
   if (compute_capability < 60 || compute_capability > __highest_known_arch)
   {
-    ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
+    ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
   }
   return static_cast<id>(compute_capability);
 }

@@ -535,7 +537,7 @@ inline constexpr int __highest_known_arch = 120;
   return ::cuda::arch::traits_for_id(::cuda::arch::id_for_compute_capability(compute_capability));
 }

-_CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
+[[nodiscard]] _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
 {
   switch (value)
   {
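Taken together, these hunks keep the same dispatch shape while tightening attributes and qualification: a runtime compute capability is range-checked, cast to the `id` enumeration, and routed through a switch to the matching `traits<id>()` specialization, throwing `cudaErrorInvalidValue` (now spelled with the global `::` qualifier) for unknown values. Below is a small stand-alone sketch of that flow with invented names and a deliberately abbreviated architecture list; it is not the CCCL implementation.

```cpp
// Sketch: compute capability -> id enum -> per-architecture traits, with validation.
#include <cstdio>
#include <stdexcept>

enum class id : int { sm_80 = 80, sm_86 = 86, sm_90 = 90 };

struct traits_t { id arch; int warp_size = 32; };

template <id I>
constexpr traits_t traits() { return traits_t{I}; }

constexpr id id_for_compute_capability(int cc) {
  if (cc < 80 || cc > 90) throw std::invalid_argument("compute capability out of range");
  return static_cast<id>(cc);
}

constexpr traits_t traits_for_id(id i) {
  switch (i) {
    case id::sm_80: return traits<id::sm_80>();
    case id::sm_86: return traits<id::sm_86>();
    case id::sm_90: return traits<id::sm_90>();
  }
  throw std::invalid_argument("traits requested for an unknown architecture");
}

int main() {
  constexpr traits_t t = traits_for_id(id_for_compute_capability(86));
  std::printf("sm_%d warp=%d\n", static_cast<int>(t.arch), t.warp_size);
}
```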
@@ -550,13 +552,13 @@ _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
     case 120:
       return id::sm_120a;
     default:
-      ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
+      ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
       break;
   }
 }

 //! @brief Provides architecture traits of the architecture matching __CUDA_ARCH__ macro
-[[nodiscard]]
+[[nodiscard]] _CCCL_DEVICE_API inline constexpr arch::traits_t current_traits()
 {
   // fixme: this doesn't work with nvc++ -cuda
 # ifdef __CUDA_ARCH__

@@ -571,7 +573,7 @@
 # endif // __CUDA_ARCH__
 }

-[[nodiscard]] inline constexpr arch::traits_t
+[[nodiscard]] _CCCL_HOST_API inline constexpr arch::traits_t
 __arch_traits_might_be_unknown(int __device, unsigned int __compute_capability)
 {
   if (__compute_capability <= arch::__highest_known_arch
|