cuda-cccl 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
- cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
- cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
- cuda/cccl/headers/include/cuda/__event/event.h +26 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +3 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +146 -11
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/compute/__init__.py +2 -0
- cuda/compute/_bindings.pyi +43 -1
- cuda/compute/_bindings_impl.pyx +156 -7
- cuda/compute/algorithms/_scan.py +108 -36
- cuda/compute/algorithms/_transform.py +32 -11
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/iterators/__init__.py +2 -0
- cuda/compute/iterators/_factories.py +28 -0
- cuda/compute/iterators/_iterators.py +206 -1
- cuda/compute/numba_utils.py +2 -2
- cuda/compute/typing.py +2 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -23,6 +23,7 @@
|
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
|
+
# include <cuda/__device/compute_capability.h>
|
|
26
27
|
# include <cuda/__device/device_ref.h>
|
|
27
28
|
# include <cuda/__driver/driver_api.h>
|
|
28
29
|
# include <cuda/__fwd/devices.h>
|
|
@@ -739,12 +740,12 @@ static constexpr numa_id_t numa_id{};
|
|
|
739
740
|
// capability in a single query
|
|
740
741
|
struct compute_capability_t
|
|
741
742
|
{
|
|
742
|
-
using type =
|
|
743
|
+
using type = ::cuda::compute_capability;
|
|
743
744
|
|
|
744
745
|
[[nodiscard]] _CCCL_HOST_API type operator()(device_ref __dev_id) const
|
|
745
746
|
{
|
|
746
|
-
return
|
|
747
|
-
|
|
747
|
+
return type{::cuda::device_attributes::compute_capability_major(__dev_id),
|
|
748
|
+
::cuda::device_attributes::compute_capability_minor(__dev_id)};
|
|
748
749
|
}
|
|
749
750
|
};
|
|
750
751
|
static constexpr compute_capability_t compute_capability{};
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
|
2
|
+
//
|
|
3
|
+
// Part of libcu++, the C++ Standard Library for your entire system,
|
|
4
|
+
// under the Apache License v2.0 with LLVM Exceptions.
|
|
5
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
6
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
|
|
8
|
+
//
|
|
9
|
+
//===----------------------------------------------------------------------===//
|
|
10
|
+
|
|
11
|
+
#ifndef _CUDA___DEVICE_COMPUTE_CAPABILITY_H
|
|
12
|
+
#define _CUDA___DEVICE_COMPUTE_CAPABILITY_H
|
|
13
|
+
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
|
+
|
|
16
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
|
+
# pragma GCC system_header
|
|
18
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
19
|
+
# pragma clang system_header
|
|
20
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
21
|
+
# pragma system_header
|
|
22
|
+
#endif // no system header
|
|
23
|
+
|
|
24
|
+
#include <cuda/__fwd/devices.h>
|
|
25
|
+
#include <cuda/std/__utility/to_underlying.h>
|
|
26
|
+
|
|
27
|
+
#include <cuda/std/__cccl/prologue.h>
|
|
28
|
+
|
|
29
|
+
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
30
|
+
|
|
31
|
+
//! @brief Type representing the CUDA compute capability.
|
|
32
|
+
class compute_capability
|
|
33
|
+
{
|
|
34
|
+
int __cc_{}; //!< The stored compute capability in format 10 * major + minor.
|
|
35
|
+
|
|
36
|
+
public:
|
|
37
|
+
_CCCL_HIDE_FROM_ABI constexpr compute_capability() noexcept = default;
|
|
38
|
+
|
|
39
|
+
//! @brief Constructs the object from compute capability \c __cc. The expected format is 10 * major + minor.
|
|
40
|
+
//!
|
|
41
|
+
//! @param __cc Compute capability.
|
|
42
|
+
_CCCL_API explicit constexpr compute_capability(int __cc) noexcept
|
|
43
|
+
: __cc_{__cc}
|
|
44
|
+
{}
|
|
45
|
+
|
|
46
|
+
//! @brief Constructs the object by combining the \c __major and \c __minor compute capability.
|
|
47
|
+
//!
|
|
48
|
+
//! @param __major The major compute capability.
|
|
49
|
+
//! @param __minor The minor compute capability. Must be less than 10.
|
|
50
|
+
_CCCL_API constexpr compute_capability(int __major, int __minor) noexcept
|
|
51
|
+
: __cc_{10 * __major + __minor}
|
|
52
|
+
{
|
|
53
|
+
_CCCL_ASSERT(__minor < 10, "invalid minor compute capability");
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
//! @brief Constructs the object from the architecture id.
|
|
57
|
+
//!
|
|
58
|
+
//! @param __arch_id The architecture id.
|
|
59
|
+
_CCCL_API explicit constexpr compute_capability(arch_id __arch_id) noexcept
|
|
60
|
+
{
|
|
61
|
+
const auto __val = ::cuda::std::to_underlying(__arch_id);
|
|
62
|
+
if (__val > __arch_specific_id_multiplier)
|
|
63
|
+
{
|
|
64
|
+
__cc_ = __val / __arch_specific_id_multiplier;
|
|
65
|
+
}
|
|
66
|
+
else
|
|
67
|
+
{
|
|
68
|
+
__cc_ = __val;
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
_CCCL_HIDE_FROM_ABI constexpr compute_capability(const compute_capability&) noexcept = default;
|
|
73
|
+
|
|
74
|
+
_CCCL_HIDE_FROM_ABI constexpr compute_capability& operator=(const compute_capability& __other) noexcept = default;
|
|
75
|
+
|
|
76
|
+
//! @brief Gets the stored compute capability.
|
|
77
|
+
//!
|
|
78
|
+
//! @return The stored compute capability in format 10 * major + minor.
|
|
79
|
+
[[nodiscard]] _CCCL_API constexpr int get() const noexcept
|
|
80
|
+
{
|
|
81
|
+
return __cc_;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
//! @brief Gets the major compute capability.
|
|
85
|
+
//!
|
|
86
|
+
//! @return Major compute capability.
|
|
87
|
+
[[nodiscard]] _CCCL_API constexpr int major() const noexcept
|
|
88
|
+
{
|
|
89
|
+
return __cc_ / 10;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
//! @brief Gets the minor compute capability.
|
|
93
|
+
//!
|
|
94
|
+
//! @return Minor compute capability. The value is always less than 10.
|
|
95
|
+
[[nodiscard]] _CCCL_API constexpr int minor() const noexcept
|
|
96
|
+
{
|
|
97
|
+
return __cc_ % 10;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
//! @brief Conversion operator to \c int.
|
|
101
|
+
//!
|
|
102
|
+
//! @return The stored compute capability in format 10 * major + minor.
|
|
103
|
+
_CCCL_API explicit constexpr operator int() const noexcept
|
|
104
|
+
{
|
|
105
|
+
return __cc_;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
//! @brief Equality operator.
|
|
109
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator==(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
110
|
+
{
|
|
111
|
+
return __lhs.__cc_ == __rhs.__cc_;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
//! @brief Inequality operator.
|
|
115
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator!=(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
116
|
+
{
|
|
117
|
+
return __lhs.__cc_ != __rhs.__cc_;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
//! @brief Less than operator.
|
|
121
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator<(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
122
|
+
{
|
|
123
|
+
return __lhs.__cc_ < __rhs.__cc_;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
//! @brief Less than or equal to operator.
|
|
127
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator<=(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
128
|
+
{
|
|
129
|
+
return __lhs.__cc_ <= __rhs.__cc_;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
//! @brief Greater than operator.
|
|
133
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator>(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
134
|
+
{
|
|
135
|
+
return __lhs.__cc_ > __rhs.__cc_;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
//! @brief Greater than or equal to operator.
|
|
139
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator>=(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
140
|
+
{
|
|
141
|
+
return __lhs.__cc_ >= __rhs.__cc_;
|
|
142
|
+
}
|
|
143
|
+
};
|
|
144
|
+
|
|
145
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
146
|
+
|
|
147
|
+
#if _CCCL_CUDA_COMPILATION()
|
|
148
|
+
|
|
149
|
+
_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
|
|
150
|
+
|
|
151
|
+
//! @brief Returns the \c cuda::compute_capability that is currently being compiled.
|
|
152
|
+
//!
|
|
153
|
+
//! @note This API cannot be used in constexpr context when compiling with nvc++ in CUDA mode.
|
|
154
|
+
[[nodiscard]] _CCCL_DEVICE_API _CCCL_TARGET_CONSTEXPR ::cuda::compute_capability current_compute_capability() noexcept
|
|
155
|
+
{
|
|
156
|
+
# if _CCCL_CUDA_COMPILER(NVHPC)
|
|
157
|
+
return ::cuda::compute_capability{__builtin_current_device_sm()};
|
|
158
|
+
# elif _CCCL_DEVICE_COMPILATION()
|
|
159
|
+
return ::cuda::compute_capability{__CUDA_ARCH__ / 10};
|
|
160
|
+
# else // ^^^ _CCCL_DEVICE_COMPILATION() ^^^ / vvv !_CCCL_DEVICE_COMPILATION() vvv
|
|
161
|
+
return {};
|
|
162
|
+
# endif // ^^^ !_CCCL_DEVICE_COMPILATION() ^^^
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
_CCCL_END_NAMESPACE_CUDA_DEVICE
|
|
166
|
+
|
|
167
|
+
#endif // _CCCL_CUDA_COMPILATION()
|
|
168
|
+
|
|
169
|
+
#include <cuda/std/__cccl/epilogue.h>
|
|
170
|
+
|
|
171
|
+
#endif // _CUDA___DEVICE_COMPUTE_CAPABILITY_H
|
|
@@ -133,16 +133,6 @@ public:
|
|
|
133
133
|
::cuda::__driver::__deviceGet(get()), ::cuda::__driver::__deviceGet(__other_dev.get()));
|
|
134
134
|
}
|
|
135
135
|
|
|
136
|
-
//! @brief Retrieve architecture traits of this device.
|
|
137
|
-
//!
|
|
138
|
-
//! Architecture traits object contains information about certain traits
|
|
139
|
-
//! that are shared by all devices belonging to given architecture.
|
|
140
|
-
//!
|
|
141
|
-
//! @return A reference to `arch_traits_t` object containing architecture traits of this device
|
|
142
|
-
[[nodiscard]] _CCCL_HOST_API const arch::traits_t& arch_traits() const; // implemented in
|
|
143
|
-
// <cuda/__device/physical_device.h> to avoid
|
|
144
|
-
// circular dependency
|
|
145
|
-
|
|
146
136
|
// TODO this might return some more complex type in the future
|
|
147
137
|
// TODO we might want to include the calling device, depends on what we decide
|
|
148
138
|
// peer access APIs
|
|
@@ -23,16 +23,15 @@
|
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
|
-
# include <cuda/__device/arch_traits.h>
|
|
27
26
|
# include <cuda/__device/device_ref.h>
|
|
28
27
|
# include <cuda/__driver/driver_api.h>
|
|
29
28
|
# include <cuda/__fwd/devices.h>
|
|
29
|
+
# include <cuda/std/__cccl/memory_wrapper.h>
|
|
30
30
|
# include <cuda/std/__cstddef/types.h>
|
|
31
31
|
# include <cuda/std/span>
|
|
32
32
|
# include <cuda/std/string_view>
|
|
33
33
|
|
|
34
34
|
# include <cassert>
|
|
35
|
-
# include <memory>
|
|
36
35
|
# include <mutex>
|
|
37
36
|
# include <vector>
|
|
38
37
|
|
|
@@ -53,10 +52,6 @@ class __physical_device
|
|
|
53
52
|
|
|
54
53
|
::CUdevice __device_{};
|
|
55
54
|
|
|
56
|
-
// TODO We should have some of the attributes just return from the arch traits
|
|
57
|
-
::std::once_flag __traits_once_flag_{};
|
|
58
|
-
arch::traits_t __traits_{};
|
|
59
|
-
|
|
60
55
|
::std::once_flag __primary_ctx_once_flag_{};
|
|
61
56
|
::CUcontext __primary_ctx_{};
|
|
62
57
|
|
|
@@ -90,21 +85,6 @@ public:
|
|
|
90
85
|
return __primary_ctx_;
|
|
91
86
|
}
|
|
92
87
|
|
|
93
|
-
//! @brief Retrieve architecture traits of this device.
|
|
94
|
-
//!
|
|
95
|
-
//! Architecture traits object contains information about certain traits
|
|
96
|
-
//! that are shared by all devices belonging to given architecture.
|
|
97
|
-
//!
|
|
98
|
-
//! @return A reference to `arch_traits_t` object containing architecture traits of this device
|
|
99
|
-
[[nodiscard]] _CCCL_HOST_API const arch::traits_t& __arch_traits()
|
|
100
|
-
{
|
|
101
|
-
::std::call_once(__traits_once_flag_, [this]() {
|
|
102
|
-
const auto __id = ::cuda::__driver::__cudevice_to_ordinal(__device_);
|
|
103
|
-
__traits_ = ::cuda::arch::__arch_traits_might_be_unknown(__id, device_attributes::compute_capability(__id));
|
|
104
|
-
});
|
|
105
|
-
return __traits_;
|
|
106
|
-
}
|
|
107
|
-
|
|
108
88
|
[[nodiscard]] _CCCL_HOST_API ::cuda::std::string_view __name()
|
|
109
89
|
{
|
|
110
90
|
::std::call_once(__name_once_flag_, [this]() {
|
|
@@ -178,11 +158,6 @@ _CCCL_HOST_API inline void device_ref::init() const
|
|
|
178
158
|
return ::cuda::__physical_devices()[__id_].__name();
|
|
179
159
|
}
|
|
180
160
|
|
|
181
|
-
[[nodiscard]] _CCCL_HOST_API inline const arch::traits_t& device_ref::arch_traits() const
|
|
182
|
-
{
|
|
183
|
-
return ::cuda::__physical_devices()[__id_].__arch_traits();
|
|
184
|
-
}
|
|
185
|
-
|
|
186
161
|
[[nodiscard]] _CCCL_HOST_API inline ::cuda::std::span<const device_ref> device_ref::peers() const
|
|
187
162
|
{
|
|
188
163
|
return ::cuda::__physical_devices()[__id_].__peers();
|
|
@@ -28,8 +28,8 @@
|
|
|
28
28
|
# include <cuda/__event/event_ref.h>
|
|
29
29
|
# include <cuda/__runtime/ensure_current_context.h>
|
|
30
30
|
# include <cuda/__utility/no_init.h>
|
|
31
|
+
# include <cuda/std/__utility/to_underlying.h>
|
|
31
32
|
# include <cuda/std/cstddef>
|
|
32
|
-
# include <cuda/std/utility>
|
|
33
33
|
|
|
34
34
|
# include <cuda/std/__cccl/prologue.h>
|
|
35
35
|
|
|
@@ -37,38 +37,43 @@ _CCCL_BEGIN_NAMESPACE_CUDA
|
|
|
37
37
|
|
|
38
38
|
class timed_event;
|
|
39
39
|
|
|
40
|
+
//! @brief Flags to use when creating the event.
|
|
41
|
+
enum class event_flags : unsigned
|
|
42
|
+
{
|
|
43
|
+
none = cudaEventDefault,
|
|
44
|
+
blocking_sync = cudaEventBlockingSync,
|
|
45
|
+
interprocess = cudaEventInterprocess,
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
[[nodiscard]] _CCCL_HOST_API constexpr event_flags operator|(event_flags __lhs, event_flags __rhs) noexcept
|
|
49
|
+
{
|
|
50
|
+
return static_cast<event_flags>(::cuda::std::to_underlying(__lhs) | ::cuda::std::to_underlying(__rhs));
|
|
51
|
+
}
|
|
52
|
+
|
|
40
53
|
//! @brief An owning wrapper for an untimed `cudaEvent_t`.
|
|
41
54
|
class event : public event_ref
|
|
42
55
|
{
|
|
43
56
|
friend class timed_event;
|
|
44
57
|
|
|
45
58
|
public:
|
|
46
|
-
//! @brief Flags to use when creating the event.
|
|
47
|
-
enum class flags : unsigned
|
|
48
|
-
{
|
|
49
|
-
none = cudaEventDefault,
|
|
50
|
-
blocking_sync = cudaEventBlockingSync,
|
|
51
|
-
interprocess = cudaEventInterprocess,
|
|
52
|
-
};
|
|
53
|
-
|
|
54
59
|
//! @brief Construct a new `event` object with timing disabled, and record
|
|
55
60
|
//! the event in the specified stream.
|
|
56
61
|
//!
|
|
57
62
|
//! @throws cuda_error if the event creation fails.
|
|
58
|
-
explicit event(stream_ref __stream,
|
|
63
|
+
_CCCL_HOST_API explicit event(stream_ref __stream, event_flags __flags = event_flags::none);
|
|
59
64
|
|
|
60
65
|
//! @brief Construct a new `event` object with timing disabled. The event can only be recorded on streams from the
|
|
61
66
|
//! specified device.
|
|
62
67
|
//!
|
|
63
68
|
//! @throws cuda_error if the event creation fails.
|
|
64
|
-
explicit event(device_ref __device,
|
|
65
|
-
: event(__device,
|
|
69
|
+
_CCCL_HOST_API explicit event(device_ref __device, event_flags __flags = event_flags::none)
|
|
70
|
+
: event(__device, ::cuda::std::to_underlying(__flags) | cudaEventDisableTiming)
|
|
66
71
|
{}
|
|
67
72
|
|
|
68
73
|
//! @brief Construct a new `event` object into the moved-from state.
|
|
69
74
|
//!
|
|
70
75
|
//! @post `get()` returns `cudaEvent_t()`.
|
|
71
|
-
explicit constexpr event(no_init_t) noexcept
|
|
76
|
+
_CCCL_HOST_API explicit constexpr event(no_init_t) noexcept
|
|
72
77
|
: event_ref(::cudaEvent_t{})
|
|
73
78
|
{}
|
|
74
79
|
|
|
@@ -77,7 +82,7 @@ public:
|
|
|
77
82
|
//! @param __other
|
|
78
83
|
//!
|
|
79
84
|
//! @post `__other` is in a moved-from state.
|
|
80
|
-
constexpr event(event&& __other) noexcept
|
|
85
|
+
_CCCL_HOST_API constexpr event(event&& __other) noexcept
|
|
81
86
|
: event_ref(::cuda::std::exchange(__other.__event_, {}))
|
|
82
87
|
{}
|
|
83
88
|
|
|
@@ -87,7 +92,7 @@ public:
|
|
|
87
92
|
//! @brief Destroy the `event` object
|
|
88
93
|
//!
|
|
89
94
|
//! @note If the event fails to be destroyed, the error is silently ignored.
|
|
90
|
-
~event()
|
|
95
|
+
_CCCL_HOST_API ~event()
|
|
91
96
|
{
|
|
92
97
|
if (__event_ != nullptr)
|
|
93
98
|
{
|
|
@@ -102,7 +107,7 @@ public:
|
|
|
102
107
|
//! @param __other
|
|
103
108
|
//!
|
|
104
109
|
//! @post `__other` is in a moved-from state.
|
|
105
|
-
event& operator=(event&& __other) noexcept
|
|
110
|
+
_CCCL_HOST_API event& operator=(event&& __other) noexcept
|
|
106
111
|
{
|
|
107
112
|
event __tmp(::cuda::std::move(__other));
|
|
108
113
|
::cuda::std::swap(__event_, __tmp.__event_);
|
|
@@ -119,7 +124,7 @@ public:
|
|
|
119
124
|
//! @return event The constructed `event` object
|
|
120
125
|
//!
|
|
121
126
|
//! @note The constructed `event` object takes ownership of the native handle.
|
|
122
|
-
[[nodiscard]] static event from_native_handle(::cudaEvent_t __evnt) noexcept
|
|
127
|
+
[[nodiscard]] static _CCCL_HOST_API event from_native_handle(::cudaEvent_t __evnt) noexcept
|
|
123
128
|
{
|
|
124
129
|
return event(__evnt);
|
|
125
130
|
}
|
|
@@ -135,26 +140,21 @@ public:
|
|
|
135
140
|
//! @return cudaEvent_t The native handle being held by the `event` object.
|
|
136
141
|
//!
|
|
137
142
|
//! @post The event object is in a moved-from state.
|
|
138
|
-
[[nodiscard]] constexpr ::cudaEvent_t release() noexcept
|
|
143
|
+
[[nodiscard]] _CCCL_HOST_API constexpr ::cudaEvent_t release() noexcept
|
|
139
144
|
{
|
|
140
145
|
return ::cuda::std::exchange(__event_, {});
|
|
141
146
|
}
|
|
142
147
|
|
|
143
|
-
[[nodiscard]] friend constexpr flags operator|(flags __lhs, flags __rhs) noexcept
|
|
144
|
-
{
|
|
145
|
-
return static_cast<flags>(static_cast<unsigned>(__lhs) | static_cast<unsigned>(__rhs));
|
|
146
|
-
}
|
|
147
|
-
|
|
148
148
|
private:
|
|
149
149
|
// Use `event::from_native_handle(e)` to construct an owning `event`
|
|
150
150
|
// object from a `cudaEvent_t` handle.
|
|
151
|
-
explicit constexpr event(::cudaEvent_t __evnt) noexcept
|
|
151
|
+
_CCCL_HOST_API explicit constexpr event(::cudaEvent_t __evnt) noexcept
|
|
152
152
|
: event_ref(__evnt)
|
|
153
153
|
{}
|
|
154
154
|
|
|
155
|
-
explicit event(stream_ref __stream, unsigned __flags);
|
|
155
|
+
_CCCL_HOST_API explicit event(stream_ref __stream, unsigned __flags);
|
|
156
156
|
|
|
157
|
-
explicit event(device_ref __device, unsigned __flags)
|
|
157
|
+
_CCCL_HOST_API explicit event(device_ref __device, unsigned __flags)
|
|
158
158
|
: event_ref(::cudaEvent_t{})
|
|
159
159
|
{
|
|
160
160
|
[[maybe_unused]] __ensure_current_context __ctx_setter(__device);
|
|
@@ -56,7 +56,7 @@ public:
|
|
|
56
56
|
//!
|
|
57
57
|
//! @note: It is the callers responsibility to ensure the `event_ref` does not
|
|
58
58
|
//! outlive the event denoted by the `cudaEvent_t` handle.
|
|
59
|
-
constexpr event_ref(::cudaEvent_t __evnt) noexcept
|
|
59
|
+
_CCCL_HOST_API constexpr event_ref(::cudaEvent_t __evnt) noexcept
|
|
60
60
|
: __event_(__evnt)
|
|
61
61
|
{}
|
|
62
62
|
|
|
@@ -108,7 +108,7 @@ public:
|
|
|
108
108
|
//! @brief Retrieve the native `cudaEvent_t` handle.
|
|
109
109
|
//!
|
|
110
110
|
//! @return cudaEvent_t The native handle being held by the event_ref object.
|
|
111
|
-
[[nodiscard]] constexpr ::cudaEvent_t get() const noexcept
|
|
111
|
+
[[nodiscard]] _CCCL_HOST_API constexpr ::cudaEvent_t get() const noexcept
|
|
112
112
|
{
|
|
113
113
|
return __event_;
|
|
114
114
|
}
|
|
@@ -116,7 +116,7 @@ public:
|
|
|
116
116
|
//! @brief Checks if the `event_ref` is valid
|
|
117
117
|
//!
|
|
118
118
|
//! @return true if the `event_ref` is valid, false otherwise.
|
|
119
|
-
[[nodiscard]] explicit constexpr operator bool() const noexcept
|
|
119
|
+
[[nodiscard]] _CCCL_HOST_API explicit constexpr operator bool() const noexcept
|
|
120
120
|
{
|
|
121
121
|
return __event_ != nullptr;
|
|
122
122
|
}
|
|
@@ -129,7 +129,7 @@ public:
|
|
|
129
129
|
//! @param __lhs The first `event_ref` to compare
|
|
130
130
|
//! @param __rhs The second `event_ref` to compare
|
|
131
131
|
//! @return true if `lhs` and `rhs` refer to the same `cudaEvent_t` object.
|
|
132
|
-
[[nodiscard]] friend constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept
|
|
132
|
+
[[nodiscard]] friend _CCCL_HOST_API constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept
|
|
133
133
|
{
|
|
134
134
|
return __lhs.__event_ == __rhs.__event_;
|
|
135
135
|
}
|
|
@@ -142,7 +142,7 @@ public:
|
|
|
142
142
|
//! @param __lhs The first `event_ref` to compare
|
|
143
143
|
//! @param __rhs The second `event_ref` to compare
|
|
144
144
|
//! @return true if `lhs` and `rhs` refer to different `cudaEvent_t` objects.
|
|
145
|
-
[[nodiscard]] friend constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept
|
|
145
|
+
[[nodiscard]] friend _CCCL_HOST_API constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept
|
|
146
146
|
{
|
|
147
147
|
return __lhs.__event_ != __rhs.__event_;
|
|
148
148
|
}
|
|
@@ -31,6 +31,7 @@
|
|
|
31
31
|
# include <cuda/__event/event.h>
|
|
32
32
|
# include <cuda/__utility/no_init.h>
|
|
33
33
|
# include <cuda/std/__chrono/duration.h>
|
|
34
|
+
# include <cuda/std/__utility/to_underlying.h>
|
|
34
35
|
# include <cuda/std/cstddef>
|
|
35
36
|
|
|
36
37
|
# include <cuda/std/__cccl/prologue.h>
|
|
@@ -45,20 +46,20 @@ public:
|
|
|
45
46
|
//! and record the event on the specified stream.
|
|
46
47
|
//!
|
|
47
48
|
//! @throws cuda_error if the event creation fails.
|
|
48
|
-
explicit timed_event(stream_ref __stream,
|
|
49
|
+
_CCCL_HOST_API explicit timed_event(stream_ref __stream, event_flags __flags = event_flags::none);
|
|
49
50
|
|
|
50
51
|
//! @brief Construct a new `timed_event` object with the specified flags. The event can only be recorded on streams
|
|
51
52
|
//! from the specified device.
|
|
52
53
|
//!
|
|
53
54
|
//! @throws cuda_error if the event creation fails.
|
|
54
|
-
explicit timed_event(device_ref __device,
|
|
55
|
-
: event(__device,
|
|
55
|
+
_CCCL_HOST_API explicit timed_event(device_ref __device, event_flags __flags = event_flags::none)
|
|
56
|
+
: event(__device, ::cuda::std::to_underlying(__flags))
|
|
56
57
|
{}
|
|
57
58
|
|
|
58
59
|
//! @brief Construct a new `timed_event` object into the moved-from state.
|
|
59
60
|
//!
|
|
60
61
|
//! @post `get()` returns `cudaEvent_t()`.
|
|
61
|
-
explicit constexpr timed_event(no_init_t) noexcept
|
|
62
|
+
_CCCL_HOST_API explicit constexpr timed_event(no_init_t) noexcept
|
|
62
63
|
: event(no_init)
|
|
63
64
|
{}
|
|
64
65
|
|
|
@@ -74,7 +75,7 @@ public:
|
|
|
74
75
|
//! @return timed_event The constructed `timed_event` object
|
|
75
76
|
//!
|
|
76
77
|
//! @note The constructed `timed_event` object takes ownership of the native handle.
|
|
77
|
-
[[nodiscard]] static timed_event from_native_handle(::cudaEvent_t __evnt) noexcept
|
|
78
|
+
[[nodiscard]] static _CCCL_HOST_API timed_event from_native_handle(::cudaEvent_t __evnt) noexcept
|
|
78
79
|
{
|
|
79
80
|
return timed_event(__evnt);
|
|
80
81
|
}
|
|
@@ -95,7 +96,8 @@ public:
|
|
|
95
96
|
//! @return cuda::std::chrono::nanoseconds The elapsed time in nanoseconds.
|
|
96
97
|
//!
|
|
97
98
|
//! @note The elapsed time has a resolution of approximately 0.5 microseconds.
|
|
98
|
-
[[nodiscard]] friend ::cuda::std::chrono::nanoseconds
|
|
99
|
+
[[nodiscard]] friend _CCCL_HOST_API ::cuda::std::chrono::nanoseconds
|
|
100
|
+
operator-(const timed_event& __end, const timed_event& __start)
|
|
99
101
|
{
|
|
100
102
|
const auto __ms = ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get());
|
|
101
103
|
return ::cuda::std::chrono::nanoseconds(static_cast<::cuda::std::chrono::nanoseconds::rep>(__ms * 1'000'000.0));
|
|
@@ -104,7 +106,7 @@ public:
|
|
|
104
106
|
private:
|
|
105
107
|
// Use `timed_event::from_native_handle(e)` to construct an owning `timed_event`
|
|
106
108
|
// object from a `cudaEvent_t` handle.
|
|
107
|
-
explicit constexpr timed_event(::cudaEvent_t __evnt) noexcept
|
|
109
|
+
_CCCL_HOST_API explicit constexpr timed_event(::cudaEvent_t __evnt) noexcept
|
|
108
110
|
: event(__evnt)
|
|
109
111
|
{}
|
|
110
112
|
};
|
|
@@ -31,11 +31,11 @@ class __physical_device;
|
|
|
31
31
|
class device_ref;
|
|
32
32
|
template <::cudaDeviceAttr _Attr>
|
|
33
33
|
struct __dev_attr;
|
|
34
|
+
struct arch_traits_t;
|
|
35
|
+
class compute_capability;
|
|
36
|
+
enum class arch_id : int;
|
|
34
37
|
|
|
35
|
-
|
|
36
|
-
{
|
|
37
|
-
struct traits_t;
|
|
38
|
-
} // namespace arch
|
|
38
|
+
inline constexpr int __arch_specific_id_multiplier = 100000;
|
|
39
39
|
|
|
40
40
|
_CCCL_END_NAMESPACE_CUDA
|
|
41
41
|
|