cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -21,11 +21,12 @@
|
|
|
21
21
|
# pragma system_header
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
|
-
#if _CCCL_HAS_CTK()
|
|
24
|
+
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
26
|
# include <cuda/std/__exception/cuda_error.h>
|
|
27
27
|
# include <cuda/std/__internal/namespaces.h>
|
|
28
28
|
# include <cuda/std/__type_traits/always_false.h>
|
|
29
|
+
# include <cuda/std/__type_traits/is_same.h>
|
|
29
30
|
|
|
30
31
|
# include <cuda.h>
|
|
31
32
|
|
|
@@ -41,31 +42,45 @@ _CCCL_BEGIN_NAMESPACE_CUDA_DRIVER
|
|
|
41
42
|
reinterpret_cast<decltype(::versioned_fn_name)*>( \
|
|
42
43
|
::cuda::__driver::__get_driver_entry_point(#function_name, major, minor))
|
|
43
44
|
|
|
45
|
+
// cudaGetDriverEntryPoint function is deprecated
|
|
44
46
|
_CCCL_SUPPRESS_DEPRECATED_PUSH
|
|
45
47
|
|
|
46
|
-
//! @brief
|
|
47
|
-
|
|
48
|
-
//! For minor version compatibility request the 12.0 version of everything for now, unless requested otherwise
|
|
49
|
-
[[nodiscard]] _CCCL_HOST_API inline void*
|
|
50
|
-
__get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12, [[maybe_unused]] int __minor = 0)
|
|
48
|
+
//! @brief Gets the cuGetProcAddress function pointer.
|
|
49
|
+
[[nodiscard]] _CCCL_HOST_API inline auto __getProcAddressFn() -> decltype(cuGetProcAddress)*
|
|
51
50
|
{
|
|
52
|
-
// TODO switch to dlopen of libcuda.so instead of the below
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
51
|
+
// TODO switch to dlopen of libcuda.so instead of the below
|
|
52
|
+
void* __fn;
|
|
53
|
+
::cudaDriverEntryPointQueryResult __result;
|
|
54
|
+
::cudaError_t __status = ::cudaGetDriverEntryPoint("cuGetProcAddress", &__fn, ::cudaEnableDefault, &__result);
|
|
55
|
+
if (__status != ::cudaSuccess || __result != ::cudaDriverEntryPointSuccess)
|
|
56
|
+
{
|
|
57
|
+
::cuda::__throw_cuda_error(::cudaErrorUnknown, "Failed to get cuGetProcAddress");
|
|
58
|
+
}
|
|
59
|
+
return reinterpret_cast<decltype(cuGetProcAddress)*>(__fn);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
_CCCL_SUPPRESS_DEPRECATED_POP
|
|
64
63
|
|
|
64
|
+
//! @brief Gets the driver entry point.
|
|
65
|
+
//!
|
|
66
|
+
//! @param __get_proc_addr_fn Pointer to cuGetProcAddress function.
|
|
67
|
+
//! @param __name Name of the symbol to get the driver entry point for.
|
|
68
|
+
//! @param __major The major CTK version to get the symbol version for.
|
|
69
|
+
//! @param __minor The minor CTK version to get the symbol version for.
|
|
70
|
+
//!
|
|
71
|
+
//! @return The address of the symbol.
|
|
72
|
+
//!
|
|
73
|
+
//! @throws @c cuda::cuda_error if the symbol cannot be obtained.
|
|
74
|
+
[[nodiscard]] _CCCL_HOST_API inline void* __get_driver_entry_point_impl(
|
|
75
|
+
decltype(cuGetProcAddress)* __get_proc_addr_fn,
|
|
76
|
+
const char* __name,
|
|
77
|
+
[[maybe_unused]] int __major,
|
|
78
|
+
[[maybe_unused]] int __minor)
|
|
79
|
+
{
|
|
65
80
|
void* __fn;
|
|
66
81
|
::CUdriverProcAddressQueryResult __result;
|
|
67
82
|
::CUresult __status =
|
|
68
|
-
|
|
83
|
+
__get_proc_addr_fn(__name, &__fn, __major * 1000 + __minor * 10, ::CU_GET_PROC_ADDRESS_DEFAULT, &__result);
|
|
69
84
|
if (__status != ::CUDA_SUCCESS || __result != ::CU_GET_PROC_ADDRESS_SUCCESS)
|
|
70
85
|
{
|
|
71
86
|
if (__status == ::CUDA_ERROR_INVALID_VALUE)
|
|
@@ -84,8 +99,13 @@ __get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12,
|
|
|
84
99
|
return __fn;
|
|
85
100
|
}
|
|
86
101
|
|
|
87
|
-
|
|
88
|
-
|
|
102
|
+
//! @brief CUDA Driver API call wrapper. Calls a given CUDA Driver API and checks the return value.
|
|
103
|
+
//!
|
|
104
|
+
//! @param __fn A CUDA Driver function.
|
|
105
|
+
//! @param __err_msg Error message describing the call if the call fails.
|
|
106
|
+
//! @param __args The arguments to the @c __fn call.
|
|
107
|
+
//!
|
|
108
|
+
//! @throws @c cuda::cuda_error if the function call doesn't return CUDA_SUCCESS.
|
|
89
109
|
template <typename Fn, typename... Args>
|
|
90
110
|
_CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args... __args)
|
|
91
111
|
{
|
|
@@ -96,6 +116,48 @@ _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args
|
|
|
96
116
|
}
|
|
97
117
|
}
|
|
98
118
|
|
|
119
|
+
//! @brief Initializes the CUDA Driver.
|
|
120
|
+
//!
|
|
121
|
+
//! @param __get_proc_addr_fn The pointer to cuGetProcAddress function.
|
|
122
|
+
//!
|
|
123
|
+
//! @return A dummy bool value.
|
|
124
|
+
//!
|
|
125
|
+
//! @warning This function should be called only once from __get_driver_entry_point function.
|
|
126
|
+
[[nodiscard]] _CCCL_HOST_API inline bool __init(decltype(cuGetProcAddress)* __get_proc_addr_fn)
|
|
127
|
+
{
|
|
128
|
+
auto __driver_fn = reinterpret_cast<decltype(::cuInit)*>(
|
|
129
|
+
::cuda::__driver::__get_driver_entry_point_impl(__get_proc_addr_fn, "cuInit", 12, 0));
|
|
130
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to initialize CUDA Driver", 0);
|
|
131
|
+
return true;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
//! @brief Get a driver function pointer for a given API name and optionally specific CUDA version. This function also
|
|
135
|
+
//! initializes the CUDA Driver.
|
|
136
|
+
//!
|
|
137
|
+
//! @param __name Name of the symbol to get the driver entry point for.
|
|
138
|
+
//! @param __major The major CTK version to get the symbol version for. Defaults to 12.
|
|
139
|
+
//! @param __minor The minor CTK version to get the symbol version for. Defaults to 0.
|
|
140
|
+
//!
|
|
141
|
+
//! @return The address of the symbol.
|
|
142
|
+
//!
|
|
143
|
+
//! @throws @c cuda::cuda_error if the symbol cannot be obtained or the CUDA driver failed to initialize.
|
|
144
|
+
[[nodiscard]] _CCCL_HOST_API inline void*
|
|
145
|
+
__get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12, [[maybe_unused]] int __minor = 0)
|
|
146
|
+
{
|
|
147
|
+
// Get cuGetProcAddress function and call cuInit(0) only on the first call
|
|
148
|
+
static auto __get_proc_addr_fn = ::cuda::__driver::__getProcAddressFn();
|
|
149
|
+
[[maybe_unused]] static auto __init = ::cuda::__driver::__init(__get_proc_addr_fn);
|
|
150
|
+
return ::cuda::__driver::__get_driver_entry_point_impl(__get_proc_addr_fn, __name, __major, __minor);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
//! @brief Converts CUdevice to ordinal device id.
|
|
154
|
+
//!
|
|
155
|
+
//! @note Currently, CUdevice value is the same as the ordinal device id. But that might change in the future.
|
|
156
|
+
[[nodiscard]] _CCCL_HOST_API inline int __cudevice_to_ordinal(::CUdevice __dev) noexcept
|
|
157
|
+
{
|
|
158
|
+
return static_cast<int>(__dev);
|
|
159
|
+
}
|
|
160
|
+
|
|
99
161
|
// Version management
|
|
100
162
|
|
|
101
163
|
[[nodiscard]] _CCCL_HOST_API inline int __getVersion()
|
|
@@ -119,6 +181,22 @@ _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args
|
|
|
119
181
|
return __result;
|
|
120
182
|
}
|
|
121
183
|
|
|
184
|
+
[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __deviceGetAttribute(::CUdevice_attribute __attr, ::CUdevice __device)
|
|
185
|
+
{
|
|
186
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetAttribute);
|
|
187
|
+
int __result;
|
|
188
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get device attribute", &__result, __attr, __device);
|
|
189
|
+
return __result;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
[[nodiscard]] _CCCL_HOST_API inline int __deviceGetCount()
|
|
193
|
+
{
|
|
194
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetCount);
|
|
195
|
+
int __result;
|
|
196
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get device count", &__result);
|
|
197
|
+
return __result;
|
|
198
|
+
}
|
|
199
|
+
|
|
122
200
|
_CCCL_HOST_API inline void __deviceGetName(char* __name_out, int __len, int __ordinal)
|
|
123
201
|
{
|
|
124
202
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetName);
|
|
@@ -138,11 +216,10 @@ _CCCL_HOST_API inline void __deviceGetName(char* __name_out, int __len, int __or
|
|
|
138
216
|
return __result;
|
|
139
217
|
}
|
|
140
218
|
|
|
141
|
-
_CCCL_HOST_API inline
|
|
219
|
+
[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __primaryCtxReleaseNoThrow(::CUdevice __dev)
|
|
142
220
|
{
|
|
143
221
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxRelease);
|
|
144
|
-
|
|
145
|
-
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to release context for a device", __dev);
|
|
222
|
+
return static_cast<::cudaError_t>(__driver_fn(__dev));
|
|
146
223
|
}
|
|
147
224
|
|
|
148
225
|
[[nodiscard]] _CCCL_HOST_API inline bool __isPrimaryCtxActive(::CUdevice __dev)
|
|
@@ -178,6 +255,14 @@ _CCCL_HOST_API inline ::CUcontext __ctxPop()
|
|
|
178
255
|
return __result;
|
|
179
256
|
}
|
|
180
257
|
|
|
258
|
+
[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __ctxGetDevice()
|
|
259
|
+
{
|
|
260
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuCtxGetDevice);
|
|
261
|
+
::CUdevice __result{};
|
|
262
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get current context", &__result);
|
|
263
|
+
return __result;
|
|
264
|
+
}
|
|
265
|
+
|
|
181
266
|
// Memory management
|
|
182
267
|
|
|
183
268
|
_CCCL_HOST_API inline void __memcpyAsync(void* __dst, const void* __src, size_t __count, ::CUstream __stream)
|
|
@@ -239,8 +324,174 @@ _CCCL_HOST_API void __memsetAsync(void* __dst, _Tp __value, size_t __count, ::CU
|
|
|
239
324
|
}
|
|
240
325
|
}
|
|
241
326
|
|
|
327
|
+
_CCCL_HOST_API inline ::cudaError_t __mempoolCreateNoThrow(::CUmemoryPool* __pool, ::CUmemPoolProps* __props)
|
|
328
|
+
{
|
|
329
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolCreate);
|
|
330
|
+
return static_cast<::cudaError_t>(__driver_fn(__pool, __props));
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
_CCCL_HOST_API inline void __mempoolSetAttribute(::CUmemoryPool __pool, ::CUmemPool_attribute __attr, void* __value)
|
|
334
|
+
{
|
|
335
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolSetAttribute);
|
|
336
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to set attribute for a memory pool", __pool, __attr, __value);
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
_CCCL_HOST_API inline size_t __mempoolGetAttribute(::CUmemoryPool __pool, ::CUmemPool_attribute __attr)
|
|
340
|
+
{
|
|
341
|
+
size_t __value = 0;
|
|
342
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolGetAttribute);
|
|
343
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get attribute for a memory pool", __pool, __attr, &__value);
|
|
344
|
+
return __value;
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
_CCCL_HOST_API inline void __mempoolDestroy(::CUmemoryPool __pool)
|
|
348
|
+
{
|
|
349
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolDestroy);
|
|
350
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to destroy a memory pool", __pool);
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
_CCCL_HOST_API inline ::CUdeviceptr
|
|
354
|
+
__mallocFromPoolAsync(::cuda::std::size_t __bytes, ::CUmemoryPool __pool, ::CUstream __stream)
|
|
355
|
+
{
|
|
356
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocFromPoolAsync);
|
|
357
|
+
::CUdeviceptr __result = 0;
|
|
358
|
+
::cuda::__driver::__call_driver_fn(
|
|
359
|
+
__driver_fn, "Failed to allocate memory from a memory pool", &__result, __bytes, __pool, __stream);
|
|
360
|
+
return __result;
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
_CCCL_HOST_API inline void __mempoolTrimTo(::CUmemoryPool __pool, ::cuda::std::size_t __min_bytes_to_keep)
|
|
364
|
+
{
|
|
365
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolTrimTo);
|
|
366
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to trim a memory pool", __pool, __min_bytes_to_keep);
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
_CCCL_HOST_API inline ::cudaError_t __freeAsyncNoThrow(::CUdeviceptr __dptr, ::CUstream __stream)
|
|
370
|
+
{
|
|
371
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFreeAsync);
|
|
372
|
+
return static_cast<::cudaError_t>(__driver_fn(__dptr, __stream));
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
_CCCL_HOST_API inline void __mempoolSetAccess(::CUmemoryPool __pool, ::CUmemAccessDesc* __descs, ::size_t __count)
|
|
376
|
+
{
|
|
377
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolSetAccess);
|
|
378
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to set access of a memory pool", __pool, __descs, __count);
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
_CCCL_HOST_API inline ::CUmemAccess_flags __mempoolGetAccess(::CUmemoryPool __pool, ::CUmemLocation* __location)
|
|
382
|
+
{
|
|
383
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolGetAccess);
|
|
384
|
+
::CUmemAccess_flags __flags;
|
|
385
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get access of a memory pool", &__flags, __pool, __location);
|
|
386
|
+
return __flags;
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
# if _CCCL_CTK_AT_LEAST(13, 0)
|
|
390
|
+
_CCCL_HOST_API inline ::CUmemoryPool
|
|
391
|
+
__getDefaultMemPool(CUmemLocation __location, CUmemAllocationType_enum __allocation_type)
|
|
392
|
+
{
|
|
393
|
+
static auto __driver_fn =
|
|
394
|
+
_CCCLRT_GET_DRIVER_FUNCTION_VERSIONED(cuMemGetDefaultMemPool, cuMemGetDefaultMemPool, 13, 0);
|
|
395
|
+
::CUmemoryPool __result = nullptr;
|
|
396
|
+
::cuda::__driver::__call_driver_fn(
|
|
397
|
+
__driver_fn, "Failed to get default memory pool", &__result, &__location, __allocation_type);
|
|
398
|
+
return __result;
|
|
399
|
+
}
|
|
400
|
+
# endif // _CCCL_CTK_AT_LEAST(13, 0)
|
|
401
|
+
|
|
402
|
+
_CCCL_HOST_API inline ::CUdeviceptr __mallocManaged(::cuda::std::size_t __bytes, unsigned int __flags)
|
|
403
|
+
{
|
|
404
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocManaged);
|
|
405
|
+
::CUdeviceptr __result = 0;
|
|
406
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to allocate managed memory", &__result, __bytes, __flags);
|
|
407
|
+
return __result;
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
_CCCL_HOST_API inline void* __mallocHost(::cuda::std::size_t __bytes)
|
|
411
|
+
{
|
|
412
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocHost);
|
|
413
|
+
void* __result = nullptr;
|
|
414
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to allocate host memory", &__result, __bytes);
|
|
415
|
+
return __result;
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
_CCCL_HOST_API inline ::cudaError_t __freeNoThrow(::CUdeviceptr __dptr)
|
|
419
|
+
{
|
|
420
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFree);
|
|
421
|
+
return static_cast<::cudaError_t>(__driver_fn(__dptr));
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
_CCCL_HOST_API inline ::cudaError_t __freeHostNoThrow(void* __dptr)
|
|
425
|
+
{
|
|
426
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFreeHost);
|
|
427
|
+
return static_cast<::cudaError_t>(__driver_fn(__dptr));
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
// Unified Addressing
|
|
431
|
+
|
|
432
|
+
// TODO: we don't want to have these functions here, refactoring expected
|
|
433
|
+
template <::CUpointer_attribute _Attr>
|
|
434
|
+
[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __pointer_attribute_value_type_t_impl() noexcept
|
|
435
|
+
{
|
|
436
|
+
if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_CONTEXT)
|
|
437
|
+
{
|
|
438
|
+
return ::CUcontext{};
|
|
439
|
+
}
|
|
440
|
+
else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE)
|
|
441
|
+
{
|
|
442
|
+
return ::CUmemorytype{};
|
|
443
|
+
}
|
|
444
|
+
else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER || _Attr == ::CU_POINTER_ATTRIBUTE_HOST_POINTER)
|
|
445
|
+
{
|
|
446
|
+
return static_cast<void*>(nullptr);
|
|
447
|
+
}
|
|
448
|
+
else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_IS_MANAGED || _Attr == ::CU_POINTER_ATTRIBUTE_MAPPED)
|
|
449
|
+
{
|
|
450
|
+
return bool{};
|
|
451
|
+
}
|
|
452
|
+
else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL)
|
|
453
|
+
{
|
|
454
|
+
return int{};
|
|
455
|
+
}
|
|
456
|
+
else
|
|
457
|
+
{
|
|
458
|
+
static_assert(::cuda::std::__always_false_v<decltype(_Attr)>, "not implemented attribute");
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
template <::CUpointer_attribute _Attr>
|
|
463
|
+
using __pointer_attribute_value_type_t = decltype(::cuda::__driver::__pointer_attribute_value_type_t_impl<_Attr>());
|
|
464
|
+
|
|
465
|
+
template <::CUpointer_attribute _Attr>
|
|
466
|
+
[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t
|
|
467
|
+
__pointerGetAttributeNoThrow(__pointer_attribute_value_type_t<_Attr>& __result, const void* __ptr)
|
|
468
|
+
{
|
|
469
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuPointerGetAttribute);
|
|
470
|
+
::cudaError_t __status{};
|
|
471
|
+
if constexpr (::cuda::std::is_same_v<__pointer_attribute_value_type_t<_Attr>, bool>)
|
|
472
|
+
{
|
|
473
|
+
int __result2{};
|
|
474
|
+
__status = static_cast<::cudaError_t>(__driver_fn(&__result2, _Attr, reinterpret_cast<::CUdeviceptr>(__ptr)));
|
|
475
|
+
__result = static_cast<bool>(__result2);
|
|
476
|
+
}
|
|
477
|
+
else
|
|
478
|
+
{
|
|
479
|
+
__status =
|
|
480
|
+
static_cast<::cudaError_t>(__driver_fn((void*) &__result, _Attr, reinterpret_cast<::CUdeviceptr>(__ptr)));
|
|
481
|
+
}
|
|
482
|
+
return __status;
|
|
483
|
+
}
|
|
484
|
+
|
|
242
485
|
// Stream management
|
|
243
486
|
|
|
487
|
+
[[nodiscard]] _CCCL_HOST_API inline ::CUstream __streamCreateWithPriority(unsigned __flags, int __priority)
|
|
488
|
+
{
|
|
489
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamCreateWithPriority);
|
|
490
|
+
::CUstream __stream;
|
|
491
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to create a stream", &__stream, __flags, __priority);
|
|
492
|
+
return __stream;
|
|
493
|
+
}
|
|
494
|
+
|
|
244
495
|
_CCCL_HOST_API inline void __streamSynchronize(::CUstream __stream)
|
|
245
496
|
{
|
|
246
497
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamSynchronize);
|
|
@@ -294,6 +545,17 @@ struct __ctx_from_stream
|
|
|
294
545
|
}
|
|
295
546
|
# endif // _CCCL_CTK_AT_LEAST(12, 5)
|
|
296
547
|
|
|
548
|
+
// TODO: make this available since CUDA 12.8
|
|
549
|
+
# if _CCCL_CTK_AT_LEAST(13, 0)
|
|
550
|
+
[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __streamGetDevice(::CUstream __stream)
|
|
551
|
+
{
|
|
552
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION_VERSIONED(cuStreamGetDevice, cuStreamGetDevice, 12, 8);
|
|
553
|
+
::CUdevice __result{};
|
|
554
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get the device of the stream", __stream, &__result);
|
|
555
|
+
return __result;
|
|
556
|
+
}
|
|
557
|
+
# endif // _CCCL_CTK_AT_LEAST(13, 0)
|
|
558
|
+
|
|
297
559
|
_CCCL_HOST_API inline void __streamWaitEvent(::CUstream __stream, ::CUevent __evnt)
|
|
298
560
|
{
|
|
299
561
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamWaitEvent);
|
|
@@ -323,31 +585,52 @@ _CCCL_HOST_API inline void __streamWaitEvent(::CUstream __stream, ::CUevent __ev
|
|
|
323
585
|
return __id;
|
|
324
586
|
}
|
|
325
587
|
|
|
326
|
-
// Event management
|
|
327
|
-
|
|
328
|
-
_CCCL_HOST_API inline void __eventRecord(::CUevent __evnt, ::CUstream __stream)
|
|
329
|
-
{
|
|
330
|
-
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventRecord);
|
|
331
|
-
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to record CUDA event", __evnt, __stream);
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
// Destroy calls return error codes to let the calling code decide if the error should be ignored
|
|
335
588
|
[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __streamDestroyNoThrow(::CUstream __stream)
|
|
336
589
|
{
|
|
337
590
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamDestroy);
|
|
338
591
|
return static_cast<::cudaError_t>(__driver_fn(__stream));
|
|
339
592
|
}
|
|
340
593
|
|
|
594
|
+
// Event management
|
|
595
|
+
|
|
596
|
+
[[nodiscard]] _CCCL_HOST_API inline ::CUevent __eventCreate(unsigned __flags)
|
|
597
|
+
{
|
|
598
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventCreate);
|
|
599
|
+
::CUevent __evnt;
|
|
600
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to create a CUDA event", &__evnt, __flags);
|
|
601
|
+
return __evnt;
|
|
602
|
+
}
|
|
603
|
+
|
|
341
604
|
[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __eventDestroyNoThrow(::CUevent __evnt)
|
|
342
605
|
{
|
|
343
606
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventDestroy);
|
|
344
607
|
return static_cast<::cudaError_t>(__driver_fn(__evnt));
|
|
345
608
|
}
|
|
346
609
|
|
|
347
|
-
_CCCL_HOST_API inline
|
|
610
|
+
[[nodiscard]] _CCCL_HOST_API inline float __eventElapsedTime(::CUevent __start, ::CUevent __end)
|
|
348
611
|
{
|
|
349
612
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventElapsedTime);
|
|
350
|
-
|
|
613
|
+
float __result;
|
|
614
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get event elapsed time", &__result, __start, __end);
|
|
615
|
+
return __result;
|
|
616
|
+
}
|
|
617
|
+
|
|
618
|
+
[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __eventQueryNoThrow(::CUevent __evnt)
|
|
619
|
+
{
|
|
620
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventQuery);
|
|
621
|
+
return static_cast<::cudaError_t>(__driver_fn(__evnt));
|
|
622
|
+
}
|
|
623
|
+
|
|
624
|
+
_CCCL_HOST_API inline void __eventRecord(::CUevent __evnt, ::CUstream __stream)
|
|
625
|
+
{
|
|
626
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventRecord);
|
|
627
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to record an event", __evnt, __stream);
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
_CCCL_HOST_API inline void __eventSynchronize(::CUevent __evnt)
|
|
631
|
+
{
|
|
632
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventSynchronize);
|
|
633
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to synchronize an event", __evnt);
|
|
351
634
|
}
|
|
352
635
|
|
|
353
636
|
// Library management
|
|
@@ -491,6 +774,17 @@ __graphKernelNodeSetAttribute(::CUgraphNode __node, ::CUkernelNodeAttrID __id, c
|
|
|
491
774
|
_CUDA_DRIVER::__call_driver_fn(__driver_fn, "Failed to set kernel node parameters", __node, __id, &__value);
|
|
492
775
|
}
|
|
493
776
|
|
|
777
|
+
// Peer Context Memory Access
|
|
778
|
+
|
|
779
|
+
[[nodiscard]] _CCCL_HOST_API inline bool __deviceCanAccessPeer(::CUdevice __dev, ::CUdevice __peer_dev)
|
|
780
|
+
{
|
|
781
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceCanAccessPeer);
|
|
782
|
+
int __result;
|
|
783
|
+
_CUDA_DRIVER::__call_driver_fn(
|
|
784
|
+
__driver_fn, "Failed to query if device can access peer's memory", &__result, __dev, __peer_dev);
|
|
785
|
+
return static_cast<bool>(__result);
|
|
786
|
+
}
|
|
787
|
+
|
|
494
788
|
// Green contexts
|
|
495
789
|
|
|
496
790
|
# if _CCCL_CTK_AT_LEAST(12, 5)
|
|
@@ -536,6 +830,6 @@ _CCCL_END_NAMESPACE_CUDA_DRIVER
|
|
|
536
830
|
|
|
537
831
|
# include <cuda/std/__cccl/epilogue.h>
|
|
538
832
|
|
|
539
|
-
#endif // _CCCL_HAS_CTK()
|
|
833
|
+
#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
540
834
|
|
|
541
835
|
#endif // _CUDA___DRIVER_DRIVER_API_H
|
|
@@ -23,10 +23,11 @@
|
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
|
+
# include <cuda/__device/device_ref.h>
|
|
27
|
+
# include <cuda/__driver/driver_api.h>
|
|
26
28
|
# include <cuda/__event/event_ref.h>
|
|
27
29
|
# include <cuda/__runtime/ensure_current_context.h>
|
|
28
30
|
# include <cuda/__utility/no_init.h>
|
|
29
|
-
# include <cuda/std/__cuda/api_wrapper.h>
|
|
30
31
|
# include <cuda/std/cstddef>
|
|
31
32
|
# include <cuda/std/utility>
|
|
32
33
|
|
|
@@ -43,11 +44,11 @@ class event : public event_ref
|
|
|
43
44
|
|
|
44
45
|
public:
|
|
45
46
|
//! @brief Flags to use when creating the event.
|
|
46
|
-
enum class flags : unsigned
|
|
47
|
+
enum class flags : unsigned
|
|
47
48
|
{
|
|
48
49
|
none = cudaEventDefault,
|
|
49
50
|
blocking_sync = cudaEventBlockingSync,
|
|
50
|
-
interprocess = cudaEventInterprocess
|
|
51
|
+
interprocess = cudaEventInterprocess,
|
|
51
52
|
};
|
|
52
53
|
|
|
53
54
|
//! @brief Construct a new `event` object with timing disabled, and record
|
|
@@ -141,7 +142,7 @@ public:
|
|
|
141
142
|
|
|
142
143
|
[[nodiscard]] friend constexpr flags operator|(flags __lhs, flags __rhs) noexcept
|
|
143
144
|
{
|
|
144
|
-
return static_cast<flags>(static_cast<unsigned
|
|
145
|
+
return static_cast<flags>(static_cast<unsigned>(__lhs) | static_cast<unsigned>(__rhs));
|
|
145
146
|
}
|
|
146
147
|
|
|
147
148
|
private:
|
|
@@ -151,14 +152,13 @@ private:
|
|
|
151
152
|
: event_ref(__evnt)
|
|
152
153
|
{}
|
|
153
154
|
|
|
154
|
-
explicit event(stream_ref __stream, unsigned
|
|
155
|
+
explicit event(stream_ref __stream, unsigned __flags);
|
|
155
156
|
|
|
156
|
-
explicit event(device_ref __device, unsigned
|
|
157
|
+
explicit event(device_ref __device, unsigned __flags)
|
|
157
158
|
: event_ref(::cudaEvent_t{})
|
|
158
159
|
{
|
|
159
160
|
[[maybe_unused]] __ensure_current_context __ctx_setter(__device);
|
|
160
|
-
|
|
161
|
-
::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast<unsigned int>(__flags));
|
|
161
|
+
__event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
|
|
162
162
|
}
|
|
163
163
|
};
|
|
164
164
|
|
|
@@ -24,7 +24,6 @@
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
26
|
# include <cuda/__driver/driver_api.h>
|
|
27
|
-
# include <cuda/std/__cuda/api_wrapper.h>
|
|
28
27
|
# include <cuda/std/cassert>
|
|
29
28
|
# include <cuda/std/cstddef>
|
|
30
29
|
# include <cuda/std/utility>
|
|
@@ -80,7 +79,7 @@ public:
|
|
|
80
79
|
_CCCL_HOST_API void sync() const
|
|
81
80
|
{
|
|
82
81
|
_CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::sync no event set");
|
|
83
|
-
|
|
82
|
+
::cuda::__driver::__eventSynchronize(__event_);
|
|
84
83
|
}
|
|
85
84
|
|
|
86
85
|
//! @brief Checks if all the work in the stream prior to the record of the event has completed.
|
|
@@ -91,12 +90,12 @@ public:
|
|
|
91
90
|
[[nodiscard]] _CCCL_HOST_API bool is_done() const
|
|
92
91
|
{
|
|
93
92
|
_CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::sync no event set");
|
|
94
|
-
cudaError_t __status = ::
|
|
95
|
-
if (__status == cudaSuccess)
|
|
93
|
+
::cudaError_t __status = ::cuda::__driver::__eventQueryNoThrow(__event_);
|
|
94
|
+
if (__status == ::cudaSuccess)
|
|
96
95
|
{
|
|
97
96
|
return true;
|
|
98
97
|
}
|
|
99
|
-
else if (__status == cudaErrorNotReady)
|
|
98
|
+
else if (__status == ::cudaErrorNotReady)
|
|
100
99
|
{
|
|
101
100
|
return false;
|
|
102
101
|
}
|
|
@@ -26,10 +26,11 @@
|
|
|
26
26
|
|
|
27
27
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
28
28
|
|
|
29
|
+
# include <cuda/__device/device_ref.h>
|
|
30
|
+
# include <cuda/__driver/driver_api.h>
|
|
29
31
|
# include <cuda/__event/event.h>
|
|
30
32
|
# include <cuda/__utility/no_init.h>
|
|
31
33
|
# include <cuda/std/__chrono/duration.h>
|
|
32
|
-
# include <cuda/std/__cuda/api_wrapper.h>
|
|
33
34
|
# include <cuda/std/cstddef>
|
|
34
35
|
|
|
35
36
|
# include <cuda/std/__cccl/prologue.h>
|
|
@@ -51,7 +52,7 @@ public:
|
|
|
51
52
|
//!
|
|
52
53
|
//! @throws cuda_error if the event creation fails.
|
|
53
54
|
explicit timed_event(device_ref __device, flags __flags = flags::none)
|
|
54
|
-
: event(__device, static_cast<unsigned
|
|
55
|
+
: event(__device, static_cast<unsigned>(__flags))
|
|
55
56
|
{}
|
|
56
57
|
|
|
57
58
|
//! @brief Construct a new `timed_event` object into the moved-from state.
|
|
@@ -96,8 +97,7 @@ public:
|
|
|
96
97
|
//! @note The elapsed time has a resolution of approximately 0.5 microseconds.
|
|
97
98
|
[[nodiscard]] friend ::cuda::std::chrono::nanoseconds operator-(const timed_event& __end, const timed_event& __start)
|
|
98
99
|
{
|
|
99
|
-
|
|
100
|
-
::cuda::__driver::__eventElapsedTime(__start.get(), __end.get(), &__ms);
|
|
100
|
+
const auto __ms = ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get());
|
|
101
101
|
return ::cuda::std::chrono::nanoseconds(static_cast<::cuda::std::chrono::nanoseconds::rep>(__ms * 1'000'000.0));
|
|
102
102
|
}
|
|
103
103
|
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
|
2
|
+
//
|
|
3
|
+
// Part of libcu++, the C++ Standard Library for your entire system,
|
|
4
|
+
// under the Apache License v2.0 with LLVM Exceptions.
|
|
5
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
6
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
|
|
8
|
+
//
|
|
9
|
+
//===----------------------------------------------------------------------===//
|
|
10
|
+
|
|
11
|
+
#ifndef _CUDA___FWD_DEVICES_H
|
|
12
|
+
#define _CUDA___FWD_DEVICES_H
|
|
13
|
+
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
|
+
|
|
16
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
|
+
# pragma GCC system_header
|
|
18
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
19
|
+
# pragma clang system_header
|
|
20
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
21
|
+
# pragma system_header
|
|
22
|
+
#endif // no system header
|
|
23
|
+
|
|
24
|
+
#include <cuda/std/__fwd/span.h>
|
|
25
|
+
|
|
26
|
+
#include <cuda/std/__cccl/prologue.h>
|
|
27
|
+
|
|
28
|
+
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
29
|
+
|
|
30
|
+
class __physical_device;
|
|
31
|
+
class device_ref;
|
|
32
|
+
template <::cudaDeviceAttr _Attr>
|
|
33
|
+
struct __dev_attr;
|
|
34
|
+
|
|
35
|
+
namespace arch
|
|
36
|
+
{
|
|
37
|
+
struct traits_t;
|
|
38
|
+
} // namespace arch
|
|
39
|
+
|
|
40
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
41
|
+
|
|
42
|
+
#include <cuda/std/__cccl/epilogue.h>
|
|
43
|
+
|
|
44
|
+
#endif // _CUDA___FWD_DEVICES_H
|
|
@@ -42,6 +42,15 @@ inline constexpr bool __is_zip_function = false;
|
|
|
42
42
|
template <class _Fn>
|
|
43
43
|
inline constexpr bool __is_zip_function<zip_function<_Fn>> = true;
|
|
44
44
|
|
|
45
|
+
template <class _Fn, class... _Iterators>
|
|
46
|
+
class zip_transform_iterator;
|
|
47
|
+
|
|
48
|
+
template <class>
|
|
49
|
+
inline constexpr bool __is_zip_transform_iterator = false;
|
|
50
|
+
|
|
51
|
+
template <class _Fn, class... _Iterators>
|
|
52
|
+
inline constexpr bool __is_zip_transform_iterator<zip_transform_iterator<_Fn, _Iterators...>> = true;
|
|
53
|
+
|
|
45
54
|
_CCCL_END_NAMESPACE_CUDA
|
|
46
55
|
|
|
47
56
|
#include <cuda/std/__cccl/epilogue.h>
|
|
@@ -159,11 +159,11 @@ public:
|
|
|
159
159
|
::cuda::std::ranges::__movable_box<_OutputFn> __output_func_{};
|
|
160
160
|
|
|
161
161
|
using iterator_concept = ::cuda::std::conditional_t<
|
|
162
|
-
::cuda::std::
|
|
162
|
+
::cuda::std::__has_random_access_traversal<_Iter>,
|
|
163
163
|
::cuda::std::random_access_iterator_tag,
|
|
164
|
-
::cuda::std::conditional_t<::cuda::std::
|
|
164
|
+
::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal<_Iter>,
|
|
165
165
|
::cuda::std::bidirectional_iterator_tag,
|
|
166
|
-
::cuda::std::conditional_t<::cuda::std::
|
|
166
|
+
::cuda::std::conditional_t<::cuda::std::__has_forward_traversal<_Iter>,
|
|
167
167
|
::cuda::std::forward_iterator_tag,
|
|
168
168
|
::cuda::std::output_iterator_tag>>>;
|
|
169
169
|
using iterator_category = ::cuda::std::output_iterator_tag;
|