cuda-cccl 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
- cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
- cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
- cuda/cccl/headers/include/cuda/__event/event.h +26 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +3 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +146 -11
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/compute/__init__.py +2 -0
- cuda/compute/_bindings.pyi +43 -1
- cuda/compute/_bindings_impl.pyx +156 -7
- cuda/compute/algorithms/_scan.py +108 -36
- cuda/compute/algorithms/_transform.py +32 -11
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/iterators/__init__.py +2 -0
- cuda/compute/iterators/_factories.py +28 -0
- cuda/compute/iterators/_iterators.py +206 -1
- cuda/compute/numba_utils.py +2 -2
- cuda/compute/typing.py +2 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -22,6 +22,16 @@
|
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
24
|
#include <cuda/__mdspan/restrict_accessor.h>
|
|
25
|
+
#include <cuda/std/__concepts/concept_macros.h>
|
|
26
|
+
#include <cuda/std/__fwd/array.h>
|
|
27
|
+
#include <cuda/std/__fwd/span.h>
|
|
28
|
+
#include <cuda/std/__type_traits/extent.h>
|
|
29
|
+
#include <cuda/std/__type_traits/is_convertible.h>
|
|
30
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
31
|
+
#include <cuda/std/__type_traits/rank.h>
|
|
32
|
+
#include <cuda/std/__type_traits/remove_all_extents.h>
|
|
33
|
+
#include <cuda/std/__type_traits/remove_pointer.h>
|
|
34
|
+
#include <cuda/std/__type_traits/remove_reference.h>
|
|
25
35
|
#include <cuda/std/mdspan>
|
|
26
36
|
|
|
27
37
|
#include <cuda/std/__cccl/prologue.h>
|
|
@@ -32,7 +42,63 @@ template <typename _ElementType,
|
|
|
32
42
|
typename _Extents,
|
|
33
43
|
typename _LayoutPolicy = ::cuda::std::layout_right,
|
|
34
44
|
typename _AccessorPolicy = ::cuda::std::default_accessor<_ElementType>>
|
|
35
|
-
|
|
45
|
+
class restrict_mdspan
|
|
46
|
+
: public ::cuda::std::mdspan<_ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>>
|
|
47
|
+
{
|
|
48
|
+
public:
|
|
49
|
+
_LIBCUDACXX_DELEGATE_CONSTRUCTORS(
|
|
50
|
+
restrict_mdspan, ::cuda::std::mdspan, _ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>);
|
|
51
|
+
|
|
52
|
+
_CCCL_API friend constexpr void swap(restrict_mdspan& __x, restrict_mdspan& __y) noexcept
|
|
53
|
+
{
|
|
54
|
+
swap(static_cast<__base&>(__x), static_cast<__base&>(__y));
|
|
55
|
+
}
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
_CCCL_TEMPLATE(class _ElementType, class... _OtherIndexTypes)
|
|
59
|
+
_CCCL_REQUIRES((sizeof...(_OtherIndexTypes) > 0)
|
|
60
|
+
_CCCL_AND(::cuda::std::is_convertible_v<_OtherIndexTypes, size_t>&&... && true))
|
|
61
|
+
_CCCL_HOST_DEVICE explicit restrict_mdspan(_ElementType*, _OtherIndexTypes...)
|
|
62
|
+
-> restrict_mdspan<_ElementType, ::cuda::std::extents<size_t, ::cuda::std::__maybe_static_ext<_OtherIndexTypes>...>>;
|
|
63
|
+
|
|
64
|
+
_CCCL_TEMPLATE(class _Pointer)
|
|
65
|
+
_CCCL_REQUIRES(::cuda::std::is_pointer_v<::cuda::std::remove_reference_t<_Pointer>>)
|
|
66
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_Pointer&&)
|
|
67
|
+
-> restrict_mdspan<::cuda::std::remove_pointer_t<::cuda::std::remove_reference_t<_Pointer>>,
|
|
68
|
+
::cuda::std::extents<size_t>>;
|
|
69
|
+
|
|
70
|
+
_CCCL_TEMPLATE(class _CArray)
|
|
71
|
+
_CCCL_REQUIRES(::cuda::std::is_array_v<_CArray> _CCCL_AND(::cuda::std::rank_v<_CArray> == 1))
|
|
72
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_CArray&)
|
|
73
|
+
-> restrict_mdspan<::cuda::std::remove_all_extents_t<_CArray>,
|
|
74
|
+
::cuda::std::extents<size_t, ::cuda::std::extent_v<_CArray, 0>>>;
|
|
75
|
+
|
|
76
|
+
template <class _ElementType, class _OtherIndexType, size_t _Size>
|
|
77
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const ::cuda::std::array<_OtherIndexType, _Size>&)
|
|
78
|
+
-> restrict_mdspan<_ElementType, ::cuda::std::dextents<size_t, _Size>>;
|
|
79
|
+
|
|
80
|
+
template <class _ElementType, class _OtherIndexType, size_t _Size>
|
|
81
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, ::cuda::std::span<_OtherIndexType, _Size>)
|
|
82
|
+
-> restrict_mdspan<_ElementType, ::cuda::std::dextents<size_t, _Size>>;
|
|
83
|
+
|
|
84
|
+
// This one is necessary because all the constructors take `data_handle_type`s, not
|
|
85
|
+
// `_ElementType*`s, and `data_handle_type` is taken from `accessor_type::data_handle_type`, which
|
|
86
|
+
// seems to throw off automatic deduction guides.
|
|
87
|
+
template <class _ElementType, class _OtherIndexType, size_t... _ExtentsPack>
|
|
88
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const ::cuda::std::extents<_OtherIndexType, _ExtentsPack...>&)
|
|
89
|
+
-> restrict_mdspan<_ElementType, ::cuda::std::extents<_OtherIndexType, _ExtentsPack...>>;
|
|
90
|
+
|
|
91
|
+
template <class _ElementType, class _MappingType>
|
|
92
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const _MappingType&)
|
|
93
|
+
-> restrict_mdspan<_ElementType, typename _MappingType::extents_type, typename _MappingType::layout_type>;
|
|
94
|
+
|
|
95
|
+
template <class _MappingType, class _AccessorType>
|
|
96
|
+
_CCCL_HOST_DEVICE
|
|
97
|
+
restrict_mdspan(const typename _AccessorType::data_handle_type, const _MappingType&, const _AccessorType&)
|
|
98
|
+
-> restrict_mdspan<typename _AccessorType::element_type,
|
|
99
|
+
typename _MappingType::extents_type,
|
|
100
|
+
typename _MappingType::layout_type,
|
|
101
|
+
_AccessorType>;
|
|
36
102
|
|
|
37
103
|
/***********************************************************************************************************************
|
|
38
104
|
* Accessibility Traits
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
|
2
|
+
//
|
|
3
|
+
// Part of libcu++, the C++ Standard Library for your entire system,
|
|
4
|
+
// under the Apache License v2.0 with LLVM Exceptions.
|
|
5
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
6
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
|
|
8
|
+
//
|
|
9
|
+
//===----------------------------------------------------------------------===//
|
|
10
|
+
|
|
11
|
+
#ifndef _CUDA___MEMORY_POINTER_IN_RANGE_H
|
|
12
|
+
#define _CUDA___MEMORY_POINTER_IN_RANGE_H
|
|
13
|
+
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
|
+
|
|
16
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
|
+
# pragma GCC system_header
|
|
18
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
19
|
+
# pragma clang system_header
|
|
20
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
21
|
+
# pragma system_header
|
|
22
|
+
#endif // no system header
|
|
23
|
+
|
|
24
|
+
#include <cuda/std/__type_traits/is_constant_evaluated.h>
|
|
25
|
+
#include <cuda/std/cstdint>
|
|
26
|
+
#if _CCCL_HOST_COMPILATION()
|
|
27
|
+
# include <functional>
|
|
28
|
+
#endif // _CCCL_HOST_COMPILATION()
|
|
29
|
+
|
|
30
|
+
#include <cuda/std/__cccl/prologue.h>
|
|
31
|
+
|
|
32
|
+
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
33
|
+
|
|
34
|
+
// Pointers comparison <, <=, >=, > is undefined behavior in C++ (https://eel.is/c++draft/expr.rel#4) when pointers
|
|
35
|
+
// don't belong to the same object or array.
|
|
36
|
+
// - Even when a platform guarantees flat address space, the compiler can leverage UB for optimization purposes.
|
|
37
|
+
// - However, the compiler treats ::std::less<> and the other functional comparison operators in a special way, ensuring a total ordering.
|
|
38
|
+
// - For device code, we can convert pointers to uintptr_t and compare them.
|
|
39
|
+
//
|
|
40
|
+
// References:
|
|
41
|
+
// - https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p3234r0.html
|
|
42
|
+
// - https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2865r2.pdf
|
|
43
|
+
// - https://www.boost.org/doc/libs/develop/libs/core/doc/html/core/pointer_in_range.html
|
|
44
|
+
// - https://pvs-studio.com/en/blog/posts/cpp/1199/
|
|
45
|
+
// - https://releases.llvm.org/20.1.0/tools/clang/docs/ReleaseNotes.html#resolutions-to-c-defect-reports
|
|
46
|
+
|
|
47
|
+
#if _CCCL_HOST_COMPILATION()
|
|
48
|
+
|
|
49
|
+
template <typename _Tp>
|
|
50
|
+
[[nodiscard]] _CCCL_API bool __ptr_in_range_host(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
|
|
51
|
+
{
|
|
52
|
+
_CCCL_ASSERT(::std::greater_equal<>{}(__end, __start), "__ptr_in_range_host: __end must be greater than __start");
|
|
53
|
+
return ::std::greater_equal<>{}(__ptr, __start) && ::std::less<>{}(__ptr, __end);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
#endif // _CCCL_HOST_COMPILATION()
|
|
57
|
+
|
|
58
|
+
#if _CCCL_DEVICE_COMPILATION()
|
|
59
|
+
|
|
60
|
+
template <typename _Tp>
|
|
61
|
+
[[nodiscard]] _CCCL_API bool __ptr_in_range_device(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
|
|
62
|
+
{
|
|
63
|
+
using uintptr_t = ::cuda::std::uintptr_t;
|
|
64
|
+
auto __end_ptr = reinterpret_cast<uintptr_t>(__end);
|
|
65
|
+
auto __start_ptr = reinterpret_cast<uintptr_t>(__start);
|
|
66
|
+
auto __ptr_ptr = reinterpret_cast<uintptr_t>(__ptr);
|
|
67
|
+
_CCCL_ASSERT(__end_ptr >= __start_ptr, "__ptr_in_range_device: __end must be greater than __start");
|
|
68
|
+
return __ptr_ptr >= __start_ptr && __ptr_ptr < __end_ptr;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
#endif // _CCCL_DEVICE_COMPILATION()
|
|
72
|
+
|
|
73
|
+
template <typename _Tp>
|
|
74
|
+
[[nodiscard]] _CCCL_API constexpr bool ptr_in_range(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
|
|
75
|
+
{
|
|
76
|
+
if (::cuda::std::__cccl_default_is_constant_evaluated())
|
|
77
|
+
{
|
|
78
|
+
_CCCL_ASSERT(__end >= __start, "ptr_in_range: __end must be greater than __start");
|
|
79
|
+
return __ptr >= __start && __ptr < __end; // UB is not possible in a constant expression
|
|
80
|
+
}
|
|
81
|
+
else
|
|
82
|
+
{
|
|
83
|
+
NV_IF_ELSE_TARGET(NV_IS_HOST,
|
|
84
|
+
(return ::cuda::__ptr_in_range_host(__ptr, __start, __end);),
|
|
85
|
+
(return ::cuda::__ptr_in_range_device(__ptr, __start, __end);));
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
90
|
+
|
|
91
|
+
#include <cuda/std/__cccl/epilogue.h>
|
|
92
|
+
|
|
93
|
+
#endif // _CUDA___MEMORY_POINTER_IN_RANGE_H
|
|
@@ -8,8 +8,8 @@
|
|
|
8
8
|
//
|
|
9
9
|
//===----------------------------------------------------------------------===//
|
|
10
10
|
|
|
11
|
-
#ifndef
|
|
12
|
-
#define
|
|
11
|
+
#ifndef _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
|
|
12
|
+
#define _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
|
|
13
13
|
|
|
14
14
|
#include <cuda/std/detail/__config>
|
|
15
15
|
|
|
@@ -23,11 +23,11 @@
|
|
|
23
23
|
|
|
24
24
|
#include <cuda/__memory_resource/properties.h>
|
|
25
25
|
#include <cuda/__memory_resource/resource.h>
|
|
26
|
+
#include <cuda/__stream/stream_ref.h>
|
|
26
27
|
#include <cuda/std/__concepts/equality_comparable.h>
|
|
27
28
|
#include <cuda/std/__execution/env.h>
|
|
28
29
|
#include <cuda/std/__type_traits/is_same.h>
|
|
29
30
|
#include <cuda/std/__type_traits/remove_cvref.h>
|
|
30
|
-
#include <cuda/stream_ref>
|
|
31
31
|
|
|
32
32
|
#include <cuda/std/__cccl/prologue.h>
|
|
33
33
|
|
|
@@ -79,4 +79,4 @@ _CCCL_END_NAMESPACE_CUDA_MR
|
|
|
79
79
|
|
|
80
80
|
#include <cuda/std/__cccl/epilogue.h>
|
|
81
81
|
|
|
82
|
-
#endif //
|
|
82
|
+
#endif // _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
|
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
# pragma system_header
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
|
+
#include <cuda/std/__type_traits/decay.h>
|
|
24
25
|
#include <cuda/std/__type_traits/type_set.h>
|
|
25
26
|
#include <cuda/std/cstddef>
|
|
26
27
|
|
|
@@ -62,6 +63,49 @@ template <class... _Properties>
|
|
|
62
63
|
inline constexpr bool __contains_execution_space_property =
|
|
63
64
|
__is_host_accessible<_Properties...> || __is_device_accessible<_Properties...>;
|
|
64
65
|
|
|
66
|
+
//! @brief A type representing a list of memory resource properties
|
|
67
|
+
//! @tparam _Properties The properties to be included in the list
|
|
68
|
+
//! It has a member template `rebind` that allows constructing a type by combining
|
|
69
|
+
//! a template and type arguments with the properties from this list. The properties
|
|
70
|
+
//! are appended after the type arguments in the resulting type.
|
|
71
|
+
template <class... _Properties>
|
|
72
|
+
struct properties_list
|
|
73
|
+
{
|
|
74
|
+
//! @brief A type alias for a type template instantiated with the properties
|
|
75
|
+
//! from this list appended to the type arguments.
|
|
76
|
+
template <template <class...> class _Fn, class... _ExtraArgs>
|
|
77
|
+
using rebind = _Fn<_ExtraArgs..., _Properties...>;
|
|
78
|
+
|
|
79
|
+
template <class _QueryProperty>
|
|
80
|
+
_CCCL_HOST_API static constexpr bool has_property([[maybe_unused]] _QueryProperty)
|
|
81
|
+
{
|
|
82
|
+
return ::cuda::std::__type_set_contains_v<::cuda::std::__make_type_set<_Properties...>, _QueryProperty>;
|
|
83
|
+
}
|
|
84
|
+
};
|
|
85
|
+
|
|
86
|
+
template <class _Tp>
|
|
87
|
+
inline constexpr bool __is_queries_list = false;
|
|
88
|
+
|
|
89
|
+
template <class... _Tp>
|
|
90
|
+
inline constexpr bool __is_queries_list<properties_list<_Tp...>> = true;
|
|
91
|
+
|
|
92
|
+
template <typename _Tp>
|
|
93
|
+
_CCCL_CONCEPT __has_default_queries =
|
|
94
|
+
_CCCL_REQUIRES_EXPR((_Tp))(requires(__is_queries_list<typename ::cuda::std::decay_t<_Tp>::default_queries>));
|
|
95
|
+
|
|
96
|
+
template <typename _Resource, bool _HasDefaultQueries = __has_default_queries<_Resource>>
|
|
97
|
+
struct __copy_default_queries;
|
|
98
|
+
|
|
99
|
+
template <typename _Resource>
|
|
100
|
+
struct __copy_default_queries<_Resource, true>
|
|
101
|
+
{
|
|
102
|
+
using default_queries = typename _Resource::default_queries;
|
|
103
|
+
};
|
|
104
|
+
|
|
105
|
+
template <typename _Resource>
|
|
106
|
+
struct __copy_default_queries<_Resource, false>
|
|
107
|
+
{};
|
|
108
|
+
|
|
65
109
|
_CCCL_END_NAMESPACE_CUDA_MR
|
|
66
110
|
|
|
67
111
|
#include <cuda/std/__cccl/epilogue.h>
|
|
@@ -22,6 +22,7 @@
|
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
24
|
#include <cuda/__memory_resource/get_property.h>
|
|
25
|
+
#include <cuda/__stream/stream_ref.h>
|
|
25
26
|
#include <cuda/std/__concepts/concept_macros.h>
|
|
26
27
|
#include <cuda/std/__concepts/convertible_to.h>
|
|
27
28
|
#include <cuda/std/__concepts/equality_comparable.h>
|
|
@@ -29,7 +30,6 @@
|
|
|
29
30
|
#include <cuda/std/__tuple_dir/sfinae_helpers.h>
|
|
30
31
|
#include <cuda/std/__type_traits/decay.h>
|
|
31
32
|
#include <cuda/std/__type_traits/fold.h>
|
|
32
|
-
#include <cuda/stream_ref>
|
|
33
33
|
|
|
34
34
|
#include <cuda/std/__cccl/prologue.h>
|
|
35
35
|
|
|
@@ -26,6 +26,7 @@
|
|
|
26
26
|
# include <cuda/__memory_resource/get_property.h>
|
|
27
27
|
# include <cuda/__memory_resource/properties.h>
|
|
28
28
|
# include <cuda/__memory_resource/resource.h>
|
|
29
|
+
# include <cuda/__stream/stream_ref.h>
|
|
29
30
|
# include <cuda/std/__concepts/concept_macros.h>
|
|
30
31
|
# include <cuda/std/__memory/addressof.h>
|
|
31
32
|
# include <cuda/std/__type_traits/is_base_of.h>
|
|
@@ -34,7 +35,6 @@
|
|
|
34
35
|
# include <cuda/std/__utility/exchange.h>
|
|
35
36
|
# include <cuda/std/__utility/move.h>
|
|
36
37
|
# include <cuda/std/cstddef>
|
|
37
|
-
# include <cuda/stream_ref>
|
|
38
38
|
|
|
39
39
|
# include <cuda/std/__cccl/prologue.h>
|
|
40
40
|
|
|
@@ -161,10 +161,7 @@ struct _Resource_vtable_builder
|
|
|
161
161
|
template <class _Resource>
|
|
162
162
|
static void _Dealloc(void* __object, void* __ptr, size_t __bytes, size_t __alignment) noexcept
|
|
163
163
|
{
|
|
164
|
-
|
|
165
|
-
// deallocate_sync functions to be noexcept. Comment out the check for now until
|
|
166
|
-
// we can fix RMM.
|
|
167
|
-
// static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate(__ptr, __bytes, __alignment)));
|
|
164
|
+
static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate_sync(__ptr, __bytes, __alignment)));
|
|
168
165
|
return static_cast<_Resource*>(__object)->deallocate_sync(__ptr, __bytes, __alignment);
|
|
169
166
|
}
|
|
170
167
|
|
|
@@ -176,8 +173,9 @@ struct _Resource_vtable_builder
|
|
|
176
173
|
|
|
177
174
|
template <class _Resource>
|
|
178
175
|
static void
|
|
179
|
-
_Dealloc_async(void* __object, void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream)
|
|
176
|
+
_Dealloc_async(void* __object, void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream) noexcept
|
|
180
177
|
{
|
|
178
|
+
static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate(__stream, __ptr, __bytes, __alignment)));
|
|
181
179
|
return static_cast<_Resource*>(__object)->deallocate(__stream, __ptr, __bytes, __alignment);
|
|
182
180
|
}
|
|
183
181
|
|
|
@@ -653,8 +653,9 @@
|
|
|
653
653
|
#ifndef NVTX3_CPP_DEFINITIONS_V1_0
|
|
654
654
|
# define NVTX3_CPP_DEFINITIONS_V1_0
|
|
655
655
|
|
|
656
|
+
# include <cuda/std/__cccl/memory_wrapper.h>
|
|
657
|
+
|
|
656
658
|
# include <cstddef>
|
|
657
|
-
# include <memory>
|
|
658
659
|
# include <string>
|
|
659
660
|
# include <type_traits>
|
|
660
661
|
# include <utility>
|
|
@@ -32,6 +32,7 @@
|
|
|
32
32
|
# ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
33
33
|
|
|
34
34
|
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
35
|
+
|
|
35
36
|
class stream_ref;
|
|
36
37
|
|
|
37
38
|
//! @brief RAII helper which on construction sets the current context to the specified one.
|
|
@@ -45,7 +46,7 @@ struct [[maybe_unused]] __ensure_current_context
|
|
|
45
46
|
//! @param new_device The device to switch the context to
|
|
46
47
|
//!
|
|
47
48
|
//! @throws cuda_error if the context switch fails
|
|
48
|
-
explicit __ensure_current_context(device_ref __new_device)
|
|
49
|
+
_CCCL_HOST_API explicit __ensure_current_context(device_ref __new_device)
|
|
49
50
|
{
|
|
50
51
|
auto __ctx = ::cuda::__physical_devices()[__new_device.get()].__primary_context();
|
|
51
52
|
::cuda::__driver::__ctxPush(__ctx);
|
|
@@ -57,7 +58,7 @@ struct [[maybe_unused]] __ensure_current_context
|
|
|
57
58
|
//! @param ctx The context to switch to
|
|
58
59
|
//!
|
|
59
60
|
//! @throws cuda_error if the context switch fails
|
|
60
|
-
explicit __ensure_current_context(::CUcontext __ctx)
|
|
61
|
+
_CCCL_HOST_API explicit __ensure_current_context(::CUcontext __ctx)
|
|
61
62
|
{
|
|
62
63
|
::cuda::__driver::__ctxPush(__ctx);
|
|
63
64
|
}
|
|
@@ -68,7 +69,7 @@ struct [[maybe_unused]] __ensure_current_context
|
|
|
68
69
|
//! @param stream Stream indicating the context to switch to
|
|
69
70
|
//!
|
|
70
71
|
//! @throws cuda_error if the context switch fails
|
|
71
|
-
explicit __ensure_current_context(stream_ref __stream);
|
|
72
|
+
_CCCL_HOST_API explicit __ensure_current_context(stream_ref __stream);
|
|
72
73
|
|
|
73
74
|
__ensure_current_context(__ensure_current_context&&) = delete;
|
|
74
75
|
__ensure_current_context(__ensure_current_context const&) = delete;
|
|
@@ -80,7 +81,7 @@ struct [[maybe_unused]] __ensure_current_context
|
|
|
80
81
|
//!
|
|
81
82
|
//! @throws cuda_error if the device switch fails. If the destructor is called
|
|
82
83
|
//! during stack unwinding, the program is automatically terminated.
|
|
83
|
-
~__ensure_current_context() noexcept(false)
|
|
84
|
+
_CCCL_HOST_API ~__ensure_current_context() noexcept(false)
|
|
84
85
|
{
|
|
85
86
|
// TODO would it make sense to assert here that we pushed and popped the same thing?
|
|
86
87
|
::cuda::__driver::__ctxPop();
|
|
@@ -43,7 +43,7 @@ struct stream : stream_ref
|
|
|
43
43
|
//! Priority is defaulted to stream::default_priority
|
|
44
44
|
//!
|
|
45
45
|
//! @throws cuda_error if stream creation fails
|
|
46
|
-
explicit stream(device_ref __dev, int __priority = default_priority)
|
|
46
|
+
_CCCL_HOST_API explicit stream(device_ref __dev, int __priority = default_priority)
|
|
47
47
|
: stream_ref(__detail::__invalid_stream)
|
|
48
48
|
{
|
|
49
49
|
[[maybe_unused]] __ensure_current_context __ctx_setter(__dev);
|
|
@@ -54,7 +54,7 @@ struct stream : stream_ref
|
|
|
54
54
|
//!
|
|
55
55
|
//! @post `stream()` returns an invalid stream handle
|
|
56
56
|
// Can't be constexpr because __invalid_stream isn't
|
|
57
|
-
explicit stream(no_init_t) noexcept
|
|
57
|
+
_CCCL_HOST_API explicit stream(no_init_t) noexcept
|
|
58
58
|
: stream_ref(__detail::__invalid_stream)
|
|
59
59
|
{}
|
|
60
60
|
|
|
@@ -63,7 +63,7 @@ struct stream : stream_ref
|
|
|
63
63
|
//! @param __other
|
|
64
64
|
//!
|
|
65
65
|
//! @post `__other` is in moved-from state.
|
|
66
|
-
stream(stream&& __other) noexcept
|
|
66
|
+
_CCCL_HOST_API stream(stream&& __other) noexcept
|
|
67
67
|
: stream(::cuda::std::exchange(__other.__stream, __detail::__invalid_stream))
|
|
68
68
|
{}
|
|
69
69
|
|
|
@@ -72,7 +72,7 @@ struct stream : stream_ref
|
|
|
72
72
|
//! Destroy the `stream` object
|
|
73
73
|
//!
|
|
74
74
|
//! @note If the stream fails to be destroyed, the error is silently ignored.
|
|
75
|
-
~stream()
|
|
75
|
+
_CCCL_HOST_API ~stream()
|
|
76
76
|
{
|
|
77
77
|
if (__stream != __detail::__invalid_stream)
|
|
78
78
|
{
|
|
@@ -87,7 +87,7 @@ struct stream : stream_ref
|
|
|
87
87
|
//! @param __other
|
|
88
88
|
//!
|
|
89
89
|
//! @post `__other` is in a moved-from state.
|
|
90
|
-
stream& operator=(stream&& __other) noexcept
|
|
90
|
+
_CCCL_HOST_API stream& operator=(stream&& __other) noexcept
|
|
91
91
|
{
|
|
92
92
|
stream __tmp(::cuda::std::move(__other));
|
|
93
93
|
::cuda::std::swap(__stream, __tmp.__stream);
|
|
@@ -103,7 +103,7 @@ struct stream : stream_ref
|
|
|
103
103
|
//! @return stream The constructed `stream` object
|
|
104
104
|
//!
|
|
105
105
|
//! @note The constructed `stream` object takes ownership of the native handle.
|
|
106
|
-
[[nodiscard]] static stream from_native_handle(::cudaStream_t __handle)
|
|
106
|
+
[[nodiscard]] static _CCCL_HOST_API stream from_native_handle(::cudaStream_t __handle)
|
|
107
107
|
{
|
|
108
108
|
return stream(__handle);
|
|
109
109
|
}
|
|
@@ -119,7 +119,7 @@ struct stream : stream_ref
|
|
|
119
119
|
//! @return cudaStream_t The native handle being held by the `stream` object.
|
|
120
120
|
//!
|
|
121
121
|
//! @post The stream object is in a moved-from state.
|
|
122
|
-
[[nodiscard]] ::cudaStream_t release()
|
|
122
|
+
[[nodiscard]] _CCCL_HOST_API ::cudaStream_t release()
|
|
123
123
|
{
|
|
124
124
|
return ::cuda::std::exchange(__stream, __detail::__invalid_stream);
|
|
125
125
|
}
|
|
@@ -127,7 +127,7 @@ struct stream : stream_ref
|
|
|
127
127
|
private:
|
|
128
128
|
// Use `stream::from_native_handle(s)` to construct an owning `stream`
|
|
129
129
|
// object from a `cudaStream_t` handle.
|
|
130
|
-
explicit stream(::cudaStream_t __handle)
|
|
130
|
+
_CCCL_HOST_API explicit stream(::cudaStream_t __handle)
|
|
131
131
|
: stream_ref(__handle)
|
|
132
132
|
{}
|
|
133
133
|
};
|
|
@@ -30,6 +30,7 @@
|
|
|
30
30
|
# include <cuda/__runtime/ensure_current_context.h>
|
|
31
31
|
# include <cuda/__utility/no_init.h>
|
|
32
32
|
# include <cuda/std/__exception/cuda_error.h>
|
|
33
|
+
# include <cuda/std/__utility/to_underlying.h>
|
|
33
34
|
# include <cuda/std/cstddef>
|
|
34
35
|
|
|
35
36
|
# include <cuda/std/__cccl/prologue.h>
|
|
@@ -61,9 +62,10 @@ public:
|
|
|
61
62
|
//!
|
|
62
63
|
//! For behavior of the default stream,
|
|
63
64
|
//! @see //! https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
65
|
+
CCCL_DEPRECATED_BECAUSE("Using the default/null stream is generally discouraged. If you need to use it, please "
|
|
66
|
+
"construct a "
|
|
67
|
+
"stream_ref from cudaStream_t{nullptr}") _CCCL_HIDE_FROM_ABI
|
|
68
|
+
stream_ref() = default;
|
|
67
69
|
|
|
68
70
|
//! @brief Constructs a `stream_ref` from a `cudaStream_t` handle.
|
|
69
71
|
//!
|
|
@@ -124,8 +126,7 @@ public:
|
|
|
124
126
|
//! @brief Deprecated. Use sync() instead.
|
|
125
127
|
//!
|
|
126
128
|
//! @deprecated Use sync() instead.
|
|
127
|
-
|
|
128
|
-
void wait() const
|
|
129
|
+
CCCL_DEPRECATED_BECAUSE("Use sync() instead.") _CCCL_HOST_API void wait() const
|
|
129
130
|
{
|
|
130
131
|
sync();
|
|
131
132
|
}
|
|
@@ -184,7 +185,7 @@ public:
|
|
|
184
185
|
//! @throws cuda::cuda_error if the query fails.
|
|
185
186
|
//!
|
|
186
187
|
//! @return `true` if all operations have completed, or `false` if not.
|
|
187
|
-
[[
|
|
188
|
+
[[nodiscard]] CCCL_DEPRECATED_BECAUSE("Use is_done() instead.") _CCCL_HOST_API bool ready() const
|
|
188
189
|
{
|
|
189
190
|
return is_done();
|
|
190
191
|
}
|
|
@@ -216,7 +217,7 @@ public:
|
|
|
216
217
|
//! @return A new event that was recorded into this stream
|
|
217
218
|
//!
|
|
218
219
|
//! @throws cuda_error if event creation or record failed
|
|
219
|
-
[[nodiscard]] _CCCL_HOST_API event record_event(
|
|
220
|
+
[[nodiscard]] _CCCL_HOST_API event record_event(event_flags __flags = event_flags::none) const
|
|
220
221
|
{
|
|
221
222
|
return event(*this, __flags);
|
|
222
223
|
}
|
|
@@ -226,7 +227,7 @@ public:
|
|
|
226
227
|
//! @return A new timed event that was recorded into this stream
|
|
227
228
|
//!
|
|
228
229
|
//! @throws cuda_error if event creation or record failed
|
|
229
|
-
[[nodiscard]] _CCCL_HOST_API timed_event record_timed_event(
|
|
230
|
+
[[nodiscard]] _CCCL_HOST_API timed_event record_timed_event(event_flags __flags = event_flags::none) const
|
|
230
231
|
{
|
|
231
232
|
return timed_event(*this, __flags);
|
|
232
233
|
}
|
|
@@ -237,7 +238,7 @@ public:
|
|
|
237
238
|
//! returned
|
|
238
239
|
//!
|
|
239
240
|
//! @throws cuda_error if device check fails
|
|
240
|
-
_CCCL_HOST_API device_ref device() const
|
|
241
|
+
[[nodiscard]] _CCCL_HOST_API device_ref device() const
|
|
241
242
|
{
|
|
242
243
|
::CUdevice __device{};
|
|
243
244
|
# if _CCCL_CTK_AT_LEAST(13, 0)
|
|
@@ -260,7 +261,7 @@ public:
|
|
|
260
261
|
}
|
|
261
262
|
};
|
|
262
263
|
|
|
263
|
-
inline void event_ref::record(stream_ref __stream) const
|
|
264
|
+
_CCCL_HOST_API inline void event_ref::record(stream_ref __stream) const
|
|
264
265
|
{
|
|
265
266
|
_CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::record no event set");
|
|
266
267
|
_CCCL_ASSERT(__stream.get() != nullptr, "cuda::event_ref::record invalid stream passed");
|
|
@@ -268,26 +269,26 @@ inline void event_ref::record(stream_ref __stream) const
|
|
|
268
269
|
::cuda::__driver::__eventRecord(__event_, __stream.get());
|
|
269
270
|
}
|
|
270
271
|
|
|
271
|
-
inline event::event(stream_ref __stream,
|
|
272
|
-
: event(__stream,
|
|
272
|
+
_CCCL_HOST_API inline event::event(stream_ref __stream, event_flags __flags)
|
|
273
|
+
: event(__stream, ::cuda::std::to_underlying(__flags) | cudaEventDisableTiming)
|
|
273
274
|
{
|
|
274
275
|
record(__stream);
|
|
275
276
|
}
|
|
276
277
|
|
|
277
|
-
inline event::event(stream_ref __stream, unsigned __flags)
|
|
278
|
+
_CCCL_HOST_API inline event::event(stream_ref __stream, unsigned __flags)
|
|
278
279
|
: event_ref(::cudaEvent_t{})
|
|
279
280
|
{
|
|
280
281
|
[[maybe_unused]] __ensure_current_context __ctx_setter(__stream);
|
|
281
282
|
__event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
|
|
282
283
|
}
|
|
283
284
|
|
|
284
|
-
inline timed_event::timed_event(stream_ref __stream,
|
|
285
|
-
: event(__stream,
|
|
285
|
+
_CCCL_HOST_API inline timed_event::timed_event(stream_ref __stream, event_flags __flags)
|
|
286
|
+
: event(__stream, ::cuda::std::to_underlying(__flags))
|
|
286
287
|
{
|
|
287
288
|
record(__stream);
|
|
288
289
|
}
|
|
289
290
|
|
|
290
|
-
inline __ensure_current_context::__ensure_current_context(stream_ref __stream)
|
|
291
|
+
_CCCL_HOST_API inline __ensure_current_context::__ensure_current_context(stream_ref __stream)
|
|
291
292
|
{
|
|
292
293
|
auto __ctx = __driver::__streamGetCtx(__stream.get());
|
|
293
294
|
::cuda::__driver::__ctxPush(__ctx);
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
|
2
|
+
//
|
|
3
|
+
// Part of libcu++, the C++ Standard Library for your entire system,
|
|
4
|
+
// under the Apache License v2.0 with LLVM Exceptions.
|
|
5
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
6
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
|
|
8
|
+
//
|
|
9
|
+
//===----------------------------------------------------------------------===//
|
|
10
|
+
|
|
11
|
+
#ifndef _CUDA___UTILITY_IN_RANGE_H
|
|
12
|
+
#define _CUDA___UTILITY_IN_RANGE_H
|
|
13
|
+
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
|
+
|
|
16
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
|
+
# pragma GCC system_header
|
|
18
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
19
|
+
# pragma clang system_header
|
|
20
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
21
|
+
# pragma system_header
|
|
22
|
+
#endif // no system header
|
|
23
|
+
|
|
24
|
+
#include <cuda/__type_traits/is_floating_point.h>
|
|
25
|
+
#include <cuda/std/__cmath/isnan.h>
|
|
26
|
+
#include <cuda/std/__concepts/concept_macros.h>
|
|
27
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
28
|
+
#include <cuda/std/__type_traits/is_extended_floating_point.h>
|
|
29
|
+
#include <cuda/std/__type_traits/is_integer.h>
|
|
30
|
+
#include <cuda/std/__type_traits/is_unsigned_integer.h>
|
|
31
|
+
|
|
32
|
+
#include <cuda/std/__cccl/prologue.h>
|
|
33
|
+
|
|
34
|
+
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
35
|
+
|
|
36
|
+
_CCCL_TEMPLATE(typename _Tp)
|
|
37
|
+
_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> || ::cuda::std::is_floating_point_v<_Tp>
|
|
38
|
+
|| ::cuda::std::__is_extended_floating_point_v<_Tp>)
|
|
39
|
+
[[nodiscard]] _CCCL_API constexpr bool in_range(_Tp __v, _Tp __start, _Tp __end) noexcept
|
|
40
|
+
{
|
|
41
|
+
_CCCL_ASSERT(::cuda::std::isnan(__start) || ::cuda::std::isnan(__end) || __end >= __start,
|
|
42
|
+
"in_range: __end must be greater than or equal to __start");
|
|
43
|
+
if constexpr (::cuda::std::__cccl_is_unsigned_integer_v<_Tp>)
|
|
44
|
+
{
|
|
45
|
+
// if __end > __start, we know that the range is always positive. Similarly, __v is positive if unsigned.
|
|
46
|
+
// this optimization is useful when __start and __end are compile-time constants, or when in_range is used multiple
|
|
47
|
+
// times with the same range
|
|
48
|
+
using _Up = ::cuda::std::conditional_t<(sizeof(_Tp) <= sizeof(unsigned)), unsigned, _Tp>; // at least 32-bit
|
|
49
|
+
const auto __start1 = static_cast<_Up>(__start);
|
|
50
|
+
const auto __end1 = static_cast<_Up>(__end);
|
|
51
|
+
const auto __v1 = static_cast<_Up>(__v);
|
|
52
|
+
const auto __range = __end1 - __start1;
|
|
53
|
+
return (__v1 - __start1) <= __range;
|
|
54
|
+
}
|
|
55
|
+
else
|
|
56
|
+
{
|
|
57
|
+
return __v >= __start && __v <= __end;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
62
|
+
|
|
63
|
+
#include <cuda/std/__cccl/epilogue.h>
|
|
64
|
+
|
|
65
|
+
#endif // _CUDA___UTILITY_IN_RANGE_H
|
|
@@ -22,9 +22,12 @@
|
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
24
|
#include <cuda/__device/all_devices.h>
|
|
25
|
+
#include <cuda/__device/arch_id.h>
|
|
25
26
|
#include <cuda/__device/arch_traits.h>
|
|
26
27
|
#include <cuda/__device/attributes.h>
|
|
28
|
+
#include <cuda/__device/compute_capability.h>
|
|
27
29
|
#include <cuda/__device/device_ref.h>
|
|
28
30
|
#include <cuda/__device/physical_device.h>
|
|
31
|
+
#include <cuda/version>
|
|
29
32
|
|
|
30
33
|
#endif // _CUDA_DEVICES
|
|
@@ -52,12 +52,12 @@ __equal_range(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp
|
|
|
52
52
|
{
|
|
53
53
|
auto __half_len = ::cuda::std::__half_positive(__len);
|
|
54
54
|
_Iter __mid = _IterOps<_AlgPolicy>::next(__first, __half_len);
|
|
55
|
-
if (::cuda::std::
|
|
55
|
+
if (::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj, *__mid), __value))
|
|
56
56
|
{
|
|
57
57
|
__first = ++__mid;
|
|
58
58
|
__len -= __half_len + 1;
|
|
59
59
|
}
|
|
60
|
-
else if (::cuda::std::
|
|
60
|
+
else if (::cuda::std::invoke(__comp, __value, ::cuda::std::invoke(__proj, *__mid)))
|
|
61
61
|
{
|
|
62
62
|
__end = __mid;
|
|
63
63
|
__len = __half_len;
|