cuda-cccl 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +3 -3
- cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
- cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
- cuda/cccl/headers/include/cuda/__device/device_ref.h +3 -10
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +225 -33
- cuda/cccl/headers/include/cuda/__event/event.h +7 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +3 -4
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +4 -0
- cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
- cuda/cccl/parallel/experimental/_bindings_impl.pyx +140 -0
- cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +59 -57
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -47,7 +47,6 @@
|
|
|
47
47
|
// for backward compatibility
|
|
48
48
|
#include <cub/util_temporary_storage.cuh>
|
|
49
49
|
|
|
50
|
-
#include <cuda/std/__cuda/ensure_current_device.h> // IWYU pragma: export
|
|
51
50
|
#include <cuda/std/__type_traits/conditional.h>
|
|
52
51
|
#include <cuda/std/__utility/forward.h>
|
|
53
52
|
#include <cuda/std/array>
|
|
@@ -104,7 +103,34 @@ CUB_RUNTIME_FUNCTION inline int CurrentDevice()
|
|
|
104
103
|
|
|
105
104
|
//! @brief RAII helper which saves the current device and switches to the specified device on construction and switches
|
|
106
105
|
//! to the saved device on destruction.
|
|
107
|
-
|
|
106
|
+
class SwitchDevice
|
|
107
|
+
{
|
|
108
|
+
int target_device_;
|
|
109
|
+
int original_device_;
|
|
110
|
+
|
|
111
|
+
public:
|
|
112
|
+
//! @brief Queries the current device and if that is different than @p target_device sets the current device to
|
|
113
|
+
//! @p target_device
|
|
114
|
+
SwitchDevice(const int target_device)
|
|
115
|
+
: target_device_(target_device)
|
|
116
|
+
{
|
|
117
|
+
CubDebug(cudaGetDevice(&original_device_));
|
|
118
|
+
if (original_device_ != target_device_)
|
|
119
|
+
{
|
|
120
|
+
CubDebug(cudaSetDevice(target_device_));
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
//! @brief If the @p original_device was not equal to @p target_device sets the current device back to
|
|
125
|
+
//! @p original_device
|
|
126
|
+
~SwitchDevice()
|
|
127
|
+
{
|
|
128
|
+
if (original_device_ != target_device_)
|
|
129
|
+
{
|
|
130
|
+
CubDebug(cudaSetDevice(original_device_));
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
};
|
|
108
134
|
|
|
109
135
|
# endif // _CCCL_DOXYGEN_INVOKED
|
|
110
136
|
|
|
@@ -684,16 +710,31 @@ struct KernelConfig
|
|
|
684
710
|
return launcher_factory.MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
|
|
685
711
|
}
|
|
686
712
|
};
|
|
687
|
-
|
|
688
713
|
} // namespace detail
|
|
689
714
|
#endif // !_CCCL_COMPILER(NVRTC)
|
|
690
715
|
|
|
716
|
+
namespace detail
|
|
717
|
+
{
|
|
718
|
+
template <typename T>
|
|
719
|
+
struct get_active_policy
|
|
720
|
+
{
|
|
721
|
+
using type = typename T::ActivePolicy;
|
|
722
|
+
};
|
|
723
|
+
} // namespace detail
|
|
724
|
+
|
|
691
725
|
/// Helper for dispatching into a policy chain
|
|
692
726
|
template <int PolicyPtxVersion, typename PolicyT, typename PrevPolicyT>
|
|
693
727
|
struct ChainedPolicy
|
|
694
728
|
{
|
|
729
|
+
private:
|
|
730
|
+
static constexpr bool have_previous_policy = !::cuda::std::is_same_v<PolicyT, PrevPolicyT>;
|
|
731
|
+
|
|
732
|
+
public:
|
|
695
733
|
/// The policy for the active compiler pass
|
|
696
|
-
using ActivePolicy =
|
|
734
|
+
using ActivePolicy =
|
|
735
|
+
typename ::cuda::std::_If<(CUB_PTX_ARCH < PolicyPtxVersion && have_previous_policy),
|
|
736
|
+
detail::get_active_policy<PrevPolicyT>,
|
|
737
|
+
::cuda::std::type_identity<PolicyT>>::type;
|
|
697
738
|
|
|
698
739
|
#if !_CCCL_COMPILER(NVRTC)
|
|
699
740
|
/// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
|
|
@@ -708,9 +749,12 @@ struct ChainedPolicy
|
|
|
708
749
|
# elif defined(NV_TARGET_SM_INTEGER_LIST)
|
|
709
750
|
return runtime_to_compiletime<10, NV_TARGET_SM_INTEGER_LIST>(device_ptx_version, op);
|
|
710
751
|
# else
|
|
711
|
-
if (
|
|
752
|
+
if constexpr (have_previous_policy)
|
|
712
753
|
{
|
|
713
|
-
|
|
754
|
+
if (device_ptx_version < PolicyPtxVersion)
|
|
755
|
+
{
|
|
756
|
+
return PrevPolicyT::Invoke(device_ptx_version, op);
|
|
757
|
+
}
|
|
714
758
|
}
|
|
715
759
|
return op.template Invoke<PolicyT>();
|
|
716
760
|
# endif
|
|
@@ -738,7 +782,7 @@ private:
|
|
|
738
782
|
template <int DevicePtxVersion, typename FunctorT>
|
|
739
783
|
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op)
|
|
740
784
|
{
|
|
741
|
-
if constexpr (DevicePtxVersion < PolicyPtxVersion)
|
|
785
|
+
if constexpr (DevicePtxVersion < PolicyPtxVersion && have_previous_policy)
|
|
742
786
|
{
|
|
743
787
|
return PrevPolicyT::template invoke_static<DevicePtxVersion>(op);
|
|
744
788
|
}
|
|
@@ -749,34 +793,6 @@ private:
|
|
|
749
793
|
}
|
|
750
794
|
#endif // !_CCCL_COMPILER(NVRTC)
|
|
751
795
|
};
|
|
752
|
-
|
|
753
|
-
/// Helper for dispatching into a policy chain (end-of-chain specialization)
|
|
754
|
-
template <int PolicyPtxVersion, typename PolicyT>
|
|
755
|
-
struct ChainedPolicy<PolicyPtxVersion, PolicyT, PolicyT>
|
|
756
|
-
{
|
|
757
|
-
template <int, typename, typename>
|
|
758
|
-
friend struct ChainedPolicy; // befriend primary template, so it can call invoke_static
|
|
759
|
-
|
|
760
|
-
/// The policy for the active compiler pass
|
|
761
|
-
using ActivePolicy = PolicyT;
|
|
762
|
-
|
|
763
|
-
#if !_CCCL_COMPILER(NVRTC)
|
|
764
|
-
/// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
|
|
765
|
-
template <typename FunctorT>
|
|
766
|
-
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op)
|
|
767
|
-
{
|
|
768
|
-
return op.template Invoke<PolicyT>();
|
|
769
|
-
}
|
|
770
|
-
|
|
771
|
-
private:
|
|
772
|
-
template <int, typename FunctorT>
|
|
773
|
-
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op)
|
|
774
|
-
{
|
|
775
|
-
return op.template Invoke<PolicyT>();
|
|
776
|
-
}
|
|
777
|
-
#endif // !_CCCL_COMPILER(NVRTC)
|
|
778
|
-
};
|
|
779
|
-
|
|
780
796
|
CUB_NAMESPACE_END
|
|
781
797
|
|
|
782
798
|
#if _CCCL_HAS_CUDA_COMPILER() && !_CCCL_COMPILER(NVRTC)
|
|
@@ -38,11 +38,11 @@ enum class source_access_order
|
|
|
38
38
|
{
|
|
39
39
|
# if _CCCL_CTK_AT_LEAST(13, 0)
|
|
40
40
|
//! @brief Access source in stream order
|
|
41
|
-
stream = cudaMemcpySrcAccessOrderStream,
|
|
41
|
+
stream = ::cudaMemcpySrcAccessOrderStream,
|
|
42
42
|
//! @brief Access source during the copy call, source can be destroyed after the API returns
|
|
43
|
-
during_api_call = cudaMemcpySrcAccessOrderDuringApiCall,
|
|
43
|
+
during_api_call = ::cudaMemcpySrcAccessOrderDuringApiCall,
|
|
44
44
|
//! @brief Access source in any order, the order can change across CUDA releases
|
|
45
|
-
any = cudaMemcpySrcAccessOrderAny,
|
|
45
|
+
any = ::cudaMemcpySrcAccessOrderAny,
|
|
46
46
|
# else
|
|
47
47
|
any = 0x3,
|
|
48
48
|
# endif // _CCCL_CTK_BELOW(13, 0)
|
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
# include <cuda/__device/physical_device.h>
|
|
26
|
-
# include <cuda/
|
|
26
|
+
# include <cuda/__driver/driver_api.h>
|
|
27
27
|
# include <cuda/std/cassert>
|
|
28
28
|
# include <cuda/std/detail/libcxx/include/stdexcept>
|
|
29
29
|
# include <cuda/std/span>
|
|
@@ -151,11 +151,8 @@ inline all_devices::operator ::cuda::std::span<const device_ref>() const
|
|
|
151
151
|
|
|
152
152
|
inline const ::std::vector<physical_device>& all_devices::__devices()
|
|
153
153
|
{
|
|
154
|
-
static const ::std::vector<physical_device> __devices
|
|
155
|
-
|
|
156
|
-
_CCCL_TRY_CUDA_API(::cudaGetDeviceCount, "failed to get the count of CUDA devices", &__count);
|
|
157
|
-
return ::std::vector<physical_device>{__initializer_iterator{0}, __initializer_iterator{__count}};
|
|
158
|
-
}();
|
|
154
|
+
static const ::std::vector<physical_device> __devices{
|
|
155
|
+
__initializer_iterator{0}, __initializer_iterator{::cuda::__driver::__deviceGetCount()}};
|
|
159
156
|
return __devices;
|
|
160
157
|
}
|
|
161
158
|
} // namespace __detail
|
|
@@ -516,7 +516,7 @@ inline constexpr int __highest_known_arch = 120;
|
|
|
516
516
|
case id::sm_120a:
|
|
517
517
|
return ::cuda::arch::traits<id::sm_120a>();
|
|
518
518
|
default:
|
|
519
|
-
::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Traits requested for an unknown architecture");
|
|
519
|
+
::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture");
|
|
520
520
|
break;
|
|
521
521
|
}
|
|
522
522
|
}
|
|
@@ -525,7 +525,7 @@ inline constexpr int __highest_known_arch = 120;
|
|
|
525
525
|
{
|
|
526
526
|
if (compute_capability < 60 || compute_capability > __highest_known_arch)
|
|
527
527
|
{
|
|
528
|
-
::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
|
|
528
|
+
::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
|
|
529
529
|
}
|
|
530
530
|
return static_cast<id>(compute_capability);
|
|
531
531
|
}
|
|
@@ -550,7 +550,7 @@ _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
|
|
|
550
550
|
case 120:
|
|
551
551
|
return id::sm_120a;
|
|
552
552
|
default:
|
|
553
|
-
::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
|
|
553
|
+
::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
|
|
554
554
|
break;
|
|
555
555
|
}
|
|
556
556
|
}
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
26
|
# include <cuda/__device/device_ref.h>
|
|
27
|
+
# include <cuda/__driver/driver_api.h>
|
|
27
28
|
# include <cuda/std/__cccl/attributes.h>
|
|
28
29
|
# include <cuda/std/__cuda/api_wrapper.h>
|
|
29
30
|
|
|
@@ -44,11 +45,10 @@ struct __dev_attr_impl
|
|
|
44
45
|
return _Attr;
|
|
45
46
|
}
|
|
46
47
|
|
|
47
|
-
[[nodiscard]] type operator()(device_ref
|
|
48
|
+
[[nodiscard]] type operator()(device_ref __dev) const
|
|
48
49
|
{
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
return static_cast<type>(__value);
|
|
50
|
+
return static_cast<type>(::cuda::__driver::__deviceGetAttribute(
|
|
51
|
+
static_cast<::CUdevice_attribute>(_Attr), ::cuda::__driver::__deviceGet(__dev.get())));
|
|
52
52
|
}
|
|
53
53
|
};
|
|
54
54
|
|
|
@@ -81,9 +81,9 @@ template <>
|
|
|
81
81
|
struct __dev_attr<::cudaDevAttrComputeMode> //
|
|
82
82
|
: __dev_attr_impl<::cudaDevAttrComputeMode, ::cudaComputeMode>
|
|
83
83
|
{
|
|
84
|
-
static constexpr type default_mode = cudaComputeModeDefault;
|
|
85
|
-
static constexpr type prohibited_mode = cudaComputeModeProhibited;
|
|
86
|
-
static constexpr type exclusive_process_mode = cudaComputeModeExclusiveProcess;
|
|
84
|
+
static constexpr type default_mode = ::cudaComputeModeDefault;
|
|
85
|
+
static constexpr type prohibited_mode = ::cudaComputeModeProhibited;
|
|
86
|
+
static constexpr type exclusive_process_mode = ::cudaComputeModeExclusiveProcess;
|
|
87
87
|
};
|
|
88
88
|
template <>
|
|
89
89
|
struct __dev_attr<::cudaDevAttrConcurrentKernels> //
|
|
@@ -24,7 +24,6 @@
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
# include <cuda/__driver/driver_api.h>
|
|
26
26
|
# include <cuda/__runtime/types.h>
|
|
27
|
-
# include <cuda/std/__cuda/api_wrapper.h>
|
|
28
27
|
|
|
29
28
|
# include <string>
|
|
30
29
|
# include <vector>
|
|
@@ -143,16 +142,10 @@ public:
|
|
|
143
142
|
//!
|
|
144
143
|
//! @param __other_dev Device to query the peer access
|
|
145
144
|
//! @return true if its possible for this device to access the specified device's memory
|
|
146
|
-
bool has_peer_access_to(device_ref __other_dev) const
|
|
145
|
+
[[nodiscard]] bool has_peer_access_to(device_ref __other_dev) const
|
|
147
146
|
{
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
::cudaDeviceCanAccessPeer,
|
|
151
|
-
"Could not query if device can be peer accessed",
|
|
152
|
-
&__can_access,
|
|
153
|
-
get(),
|
|
154
|
-
__other_dev.get());
|
|
155
|
-
return __can_access;
|
|
147
|
+
return ::cuda::__driver::__deviceCanAccessPeer(
|
|
148
|
+
::cuda::__driver::__deviceGet(get()), ::cuda::__driver::__deviceGet(__other_dev.get()));
|
|
156
149
|
}
|
|
157
150
|
|
|
158
151
|
//! @brief Retrieve architecture traits of this device.
|
|
@@ -21,11 +21,12 @@
|
|
|
21
21
|
# pragma system_header
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
|
-
#if _CCCL_HAS_CTK()
|
|
24
|
+
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
26
|
# include <cuda/std/__exception/cuda_error.h>
|
|
27
27
|
# include <cuda/std/__internal/namespaces.h>
|
|
28
28
|
# include <cuda/std/__type_traits/always_false.h>
|
|
29
|
+
# include <cuda/std/__type_traits/is_same.h>
|
|
29
30
|
|
|
30
31
|
# include <cuda.h>
|
|
31
32
|
|
|
@@ -41,31 +42,45 @@ _CCCL_BEGIN_NAMESPACE_CUDA_DRIVER
|
|
|
41
42
|
reinterpret_cast<decltype(::versioned_fn_name)*>( \
|
|
42
43
|
::cuda::__driver::__get_driver_entry_point(#function_name, major, minor))
|
|
43
44
|
|
|
45
|
+
// cudaGetDriverEntryPoint function is deprecated
|
|
44
46
|
_CCCL_SUPPRESS_DEPRECATED_PUSH
|
|
45
47
|
|
|
46
|
-
//! @brief
|
|
47
|
-
|
|
48
|
-
//! For minor version compatibility request the 12.0 version of everything for now, unless requested otherwise
|
|
49
|
-
[[nodiscard]] _CCCL_HOST_API inline void*
|
|
50
|
-
__get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12, [[maybe_unused]] int __minor = 0)
|
|
48
|
+
//! @brief Gets the cuGetProcAddress function pointer.
|
|
49
|
+
[[nodiscard]] _CCCL_HOST_API inline auto __getProcAddressFn() -> decltype(cuGetProcAddress)*
|
|
51
50
|
{
|
|
52
|
-
// TODO switch to dlopen of libcuda.so instead of the below
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
51
|
+
// TODO switch to dlopen of libcuda.so instead of the below
|
|
52
|
+
void* __fn;
|
|
53
|
+
::cudaDriverEntryPointQueryResult __result;
|
|
54
|
+
::cudaError_t __status = ::cudaGetDriverEntryPoint("cuGetProcAddress", &__fn, ::cudaEnableDefault, &__result);
|
|
55
|
+
if (__status != ::cudaSuccess || __result != ::cudaDriverEntryPointSuccess)
|
|
56
|
+
{
|
|
57
|
+
::cuda::__throw_cuda_error(::cudaErrorUnknown, "Failed to get cuGetProcAddress");
|
|
58
|
+
}
|
|
59
|
+
return reinterpret_cast<decltype(cuGetProcAddress)*>(__fn);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
_CCCL_SUPPRESS_DEPRECATED_POP
|
|
64
63
|
|
|
64
|
+
//! @brief Gets the driver entry point.
|
|
65
|
+
//!
|
|
66
|
+
//! @param __get_proc_addr_fn Pointer to cuGetProcAddress function.
|
|
67
|
+
//! @param __name Name of the symbol to get the driver entry point for.
|
|
68
|
+
//! @param __major The major CTK version to get the symbol version for.
|
|
69
|
+
//! @param __minor The minor CTK version to get the symbol version for.
|
|
70
|
+
//!
|
|
71
|
+
//! @return The address of the symbol.
|
|
72
|
+
//!
|
|
73
|
+
//! @throws @c cuda::cuda_error if the symbol cannot be obtained.
|
|
74
|
+
[[nodiscard]] _CCCL_HOST_API inline void* __get_driver_entry_point_impl(
|
|
75
|
+
decltype(cuGetProcAddress)* __get_proc_addr_fn,
|
|
76
|
+
const char* __name,
|
|
77
|
+
[[maybe_unused]] int __major,
|
|
78
|
+
[[maybe_unused]] int __minor)
|
|
79
|
+
{
|
|
65
80
|
void* __fn;
|
|
66
81
|
::CUdriverProcAddressQueryResult __result;
|
|
67
82
|
::CUresult __status =
|
|
68
|
-
|
|
83
|
+
__get_proc_addr_fn(__name, &__fn, __major * 1000 + __minor * 10, ::CU_GET_PROC_ADDRESS_DEFAULT, &__result);
|
|
69
84
|
if (__status != ::CUDA_SUCCESS || __result != ::CU_GET_PROC_ADDRESS_SUCCESS)
|
|
70
85
|
{
|
|
71
86
|
if (__status == ::CUDA_ERROR_INVALID_VALUE)
|
|
@@ -84,8 +99,13 @@ __get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12,
|
|
|
84
99
|
return __fn;
|
|
85
100
|
}
|
|
86
101
|
|
|
87
|
-
|
|
88
|
-
|
|
102
|
+
//! @brief CUDA Driver API call wrapper. Calls a given CUDA Driver API and checks the return value.
|
|
103
|
+
//!
|
|
104
|
+
//! @param __fn A CUDA Driver function.
|
|
105
|
+
//! @param __err_msg Error message describing the call if the call fails.
|
|
106
|
+
//! @param __args The arguments to the @c __fn call.
|
|
107
|
+
//!
|
|
108
|
+
//! @throws @c cuda::cuda_error if the function call doesn't return CUDA_SUCCESS.
|
|
89
109
|
template <typename Fn, typename... Args>
|
|
90
110
|
_CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args... __args)
|
|
91
111
|
{
|
|
@@ -96,6 +116,48 @@ _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args
|
|
|
96
116
|
}
|
|
97
117
|
}
|
|
98
118
|
|
|
119
|
+
//! @brief Initializes the CUDA Driver.
|
|
120
|
+
//!
|
|
121
|
+
//! @param __get_proc_addr_fn The pointer to cuGetProcAddress function.
|
|
122
|
+
//!
|
|
123
|
+
//! @return A dummy bool value.
|
|
124
|
+
//!
|
|
125
|
+
//! @warning This function should be called only once from __get_driver_entry_point function.
|
|
126
|
+
[[nodiscard]] _CCCL_HOST_API inline bool __init(decltype(cuGetProcAddress)* __get_proc_addr_fn)
|
|
127
|
+
{
|
|
128
|
+
auto __driver_fn = reinterpret_cast<decltype(::cuInit)*>(
|
|
129
|
+
::cuda::__driver::__get_driver_entry_point_impl(__get_proc_addr_fn, "cuInit", 12, 0));
|
|
130
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to initialize CUDA Driver", 0);
|
|
131
|
+
return true;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
//! @brief Get a driver function pointer for a given API name and optionally specific CUDA version. This function also
|
|
135
|
+
//! initializes the CUDA Driver.
|
|
136
|
+
//!
|
|
137
|
+
//! @param __name Name of the symbol to get the driver entry point for.
|
|
138
|
+
//! @param __major The major CTK version to get the symbol version for. Defaults to 12.
|
|
139
|
+
//! @param __minor The minor CTK version to get the symbol version for. Defaults to 0.
|
|
140
|
+
//!
|
|
141
|
+
//! @return The address of the symbol.
|
|
142
|
+
//!
|
|
143
|
+
//! @throws @c cuda::cuda_error if the symbol cannot be obtained or the CUDA driver failed to initialize.
|
|
144
|
+
[[nodiscard]] _CCCL_HOST_API inline void*
|
|
145
|
+
__get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12, [[maybe_unused]] int __minor = 0)
|
|
146
|
+
{
|
|
147
|
+
// Get cuGetProcAddress function and call cuInit(0) only on the first call
|
|
148
|
+
static auto __get_proc_addr_fn = ::cuda::__driver::__getProcAddressFn();
|
|
149
|
+
[[maybe_unused]] static auto __init = ::cuda::__driver::__init(__get_proc_addr_fn);
|
|
150
|
+
return ::cuda::__driver::__get_driver_entry_point_impl(__get_proc_addr_fn, __name, __major, __minor);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
//! @brief Converts CUdevice to ordinal device id.
|
|
154
|
+
//!
|
|
155
|
+
//! @note Currently, CUdevice value is the same as the ordinal device id. But that might change in the future.
|
|
156
|
+
[[nodiscard]] _CCCL_HOST_API inline int __cudevice_to_ordinal(::CUdevice __dev) noexcept
|
|
157
|
+
{
|
|
158
|
+
return static_cast<int>(__dev);
|
|
159
|
+
}
|
|
160
|
+
|
|
99
161
|
// Version management
|
|
100
162
|
|
|
101
163
|
[[nodiscard]] _CCCL_HOST_API inline int __getVersion()
|
|
@@ -119,6 +181,22 @@ _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args
|
|
|
119
181
|
return __result;
|
|
120
182
|
}
|
|
121
183
|
|
|
184
|
+
[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __deviceGetAttribute(::CUdevice_attribute __attr, ::CUdevice __device)
|
|
185
|
+
{
|
|
186
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetAttribute);
|
|
187
|
+
int __result;
|
|
188
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get device attribute", &__result, __attr, __device);
|
|
189
|
+
return __result;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
[[nodiscard]] _CCCL_HOST_API inline int __deviceGetCount()
|
|
193
|
+
{
|
|
194
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetCount);
|
|
195
|
+
int __result;
|
|
196
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get device count", &__result);
|
|
197
|
+
return __result;
|
|
198
|
+
}
|
|
199
|
+
|
|
122
200
|
_CCCL_HOST_API inline void __deviceGetName(char* __name_out, int __len, int __ordinal)
|
|
123
201
|
{
|
|
124
202
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetName);
|
|
@@ -178,6 +256,14 @@ _CCCL_HOST_API inline ::CUcontext __ctxPop()
|
|
|
178
256
|
return __result;
|
|
179
257
|
}
|
|
180
258
|
|
|
259
|
+
[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __ctxGetDevice()
|
|
260
|
+
{
|
|
261
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuCtxGetDevice);
|
|
262
|
+
::CUdevice __result{};
|
|
263
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get current context", &__result);
|
|
264
|
+
return __result;
|
|
265
|
+
}
|
|
266
|
+
|
|
181
267
|
// Memory management
|
|
182
268
|
|
|
183
269
|
_CCCL_HOST_API inline void __memcpyAsync(void* __dst, const void* __src, size_t __count, ::CUstream __stream)
|
|
@@ -239,8 +325,71 @@ _CCCL_HOST_API void __memsetAsync(void* __dst, _Tp __value, size_t __count, ::CU
|
|
|
239
325
|
}
|
|
240
326
|
}
|
|
241
327
|
|
|
328
|
+
// Unified Addressing
|
|
329
|
+
|
|
330
|
+
// TODO: we don't want to have these functions here, refactoring expected
|
|
331
|
+
template <::CUpointer_attribute _Attr>
|
|
332
|
+
[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __pointer_attribute_value_type_t_impl() noexcept
|
|
333
|
+
{
|
|
334
|
+
if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_CONTEXT)
|
|
335
|
+
{
|
|
336
|
+
return ::CUcontext{};
|
|
337
|
+
}
|
|
338
|
+
else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE)
|
|
339
|
+
{
|
|
340
|
+
return ::CUmemorytype{};
|
|
341
|
+
}
|
|
342
|
+
else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER || _Attr == ::CU_POINTER_ATTRIBUTE_HOST_POINTER)
|
|
343
|
+
{
|
|
344
|
+
return static_cast<void*>(nullptr);
|
|
345
|
+
}
|
|
346
|
+
else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_IS_MANAGED || _Attr == ::CU_POINTER_ATTRIBUTE_MAPPED)
|
|
347
|
+
{
|
|
348
|
+
return bool{};
|
|
349
|
+
}
|
|
350
|
+
else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL)
|
|
351
|
+
{
|
|
352
|
+
return int{};
|
|
353
|
+
}
|
|
354
|
+
else
|
|
355
|
+
{
|
|
356
|
+
static_assert(::cuda::std::__always_false_v<decltype(_Attr)>, "not implemented attribute");
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
template <::CUpointer_attribute _Attr>
|
|
361
|
+
using __pointer_attribute_value_type_t = decltype(::cuda::__driver::__pointer_attribute_value_type_t_impl<_Attr>());
|
|
362
|
+
|
|
363
|
+
template <::CUpointer_attribute _Attr>
|
|
364
|
+
[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t
|
|
365
|
+
__pointerGetAttributeNoThrow(__pointer_attribute_value_type_t<_Attr>& __result, const void* __ptr)
|
|
366
|
+
{
|
|
367
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuPointerGetAttribute);
|
|
368
|
+
::cudaError_t __status{};
|
|
369
|
+
if constexpr (::cuda::std::is_same_v<__pointer_attribute_value_type_t<_Attr>, bool>)
|
|
370
|
+
{
|
|
371
|
+
int __result2{};
|
|
372
|
+
__status = static_cast<::cudaError_t>(__driver_fn(&__result2, _Attr, reinterpret_cast<::CUdeviceptr>(__ptr)));
|
|
373
|
+
__result = static_cast<bool>(__result2);
|
|
374
|
+
}
|
|
375
|
+
else
|
|
376
|
+
{
|
|
377
|
+
__status =
|
|
378
|
+
static_cast<::cudaError_t>(__driver_fn((void*) &__result, _Attr, reinterpret_cast<::CUdeviceptr>(__ptr)));
|
|
379
|
+
}
|
|
380
|
+
return __status;
|
|
381
|
+
}
|
|
382
|
+
|
|
242
383
|
// Stream management
|
|
243
384
|
|
|
385
|
+
[[nodiscard]] _CCCL_HOST_API inline ::CUstream __streamCreateWithPriority(unsigned __flags, int __priority)
|
|
386
|
+
{
|
|
387
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamCreateWithPriority);
|
|
388
|
+
::CUstream __stream;
|
|
389
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to create a stream", &__stream, __flags, __priority);
|
|
390
|
+
return __stream;
|
|
391
|
+
}
|
|
392
|
+
|
|
244
393
|
_CCCL_HOST_API inline void __streamSynchronize(::CUstream __stream)
|
|
245
394
|
{
|
|
246
395
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamSynchronize);
|
|
@@ -294,6 +443,17 @@ struct __ctx_from_stream
|
|
|
294
443
|
}
|
|
295
444
|
# endif // _CCCL_CTK_AT_LEAST(12, 5)
|
|
296
445
|
|
|
446
|
+
// TODO: make this available since CUDA 12.8
|
|
447
|
+
# if _CCCL_CTK_AT_LEAST(13, 0)
|
|
448
|
+
[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __streamGetDevice(::CUstream __stream)
|
|
449
|
+
{
|
|
450
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION_VERSIONED(cuStreamGetDevice, cuStreamGetDevice, 12, 8);
|
|
451
|
+
::CUdevice __result{};
|
|
452
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get the device of the stream", __stream, &__result);
|
|
453
|
+
return __result;
|
|
454
|
+
}
|
|
455
|
+
# endif // _CCCL_CTK_AT_LEAST(13, 0)
|
|
456
|
+
|
|
297
457
|
_CCCL_HOST_API inline void __streamWaitEvent(::CUstream __stream, ::CUevent __evnt)
|
|
298
458
|
{
|
|
299
459
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamWaitEvent);
|
|
@@ -323,31 +483,52 @@ _CCCL_HOST_API inline void __streamWaitEvent(::CUstream __stream, ::CUevent __ev
|
|
|
323
483
|
return __id;
|
|
324
484
|
}
|
|
325
485
|
|
|
326
|
-
// Event management
|
|
327
|
-
|
|
328
|
-
_CCCL_HOST_API inline void __eventRecord(::CUevent __evnt, ::CUstream __stream)
|
|
329
|
-
{
|
|
330
|
-
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventRecord);
|
|
331
|
-
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to record CUDA event", __evnt, __stream);
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
// Destroy calls return error codes to let the calling code decide if the error should be ignored
|
|
335
486
|
[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __streamDestroyNoThrow(::CUstream __stream)
|
|
336
487
|
{
|
|
337
488
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamDestroy);
|
|
338
489
|
return static_cast<::cudaError_t>(__driver_fn(__stream));
|
|
339
490
|
}
|
|
340
491
|
|
|
492
|
+
// Event management
|
|
493
|
+
|
|
494
|
+
[[nodiscard]] _CCCL_HOST_API inline ::CUevent __eventCreate(unsigned __flags)
|
|
495
|
+
{
|
|
496
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventCreate);
|
|
497
|
+
::CUevent __evnt;
|
|
498
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to create a CUDA event", &__evnt, __flags);
|
|
499
|
+
return __evnt;
|
|
500
|
+
}
|
|
501
|
+
|
|
341
502
|
[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __eventDestroyNoThrow(::CUevent __evnt)
|
|
342
503
|
{
|
|
343
504
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventDestroy);
|
|
344
505
|
return static_cast<::cudaError_t>(__driver_fn(__evnt));
|
|
345
506
|
}
|
|
346
507
|
|
|
347
|
-
_CCCL_HOST_API inline
|
|
508
|
+
[[nodiscard]] _CCCL_HOST_API inline float __eventElapsedTime(::CUevent __start, ::CUevent __end)
|
|
348
509
|
{
|
|
349
510
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventElapsedTime);
|
|
350
|
-
|
|
511
|
+
float __result;
|
|
512
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get event elapsed time", &__result, __start, __end);
|
|
513
|
+
return __result;
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __eventQueryNoThrow(::CUevent __evnt)
|
|
517
|
+
{
|
|
518
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventQuery);
|
|
519
|
+
return static_cast<::cudaError_t>(__driver_fn(__evnt));
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
_CCCL_HOST_API inline void __eventRecord(::CUevent __evnt, ::CUstream __stream)
|
|
523
|
+
{
|
|
524
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventRecord);
|
|
525
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to record an event", __evnt, __stream);
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
_CCCL_HOST_API inline void __eventSynchronize(::CUevent __evnt)
|
|
529
|
+
{
|
|
530
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventSynchronize);
|
|
531
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to synchronize an event", __evnt);
|
|
351
532
|
}
|
|
352
533
|
|
|
353
534
|
// Library management
|
|
@@ -491,6 +672,17 @@ __graphKernelNodeSetAttribute(::CUgraphNode __node, ::CUkernelNodeAttrID __id, c
|
|
|
491
672
|
_CUDA_DRIVER::__call_driver_fn(__driver_fn, "Failed to set kernel node parameters", __node, __id, &__value);
|
|
492
673
|
}
|
|
493
674
|
|
|
675
|
+
// Peer Context Memory Access
|
|
676
|
+
|
|
677
|
+
[[nodiscard]] _CCCL_HOST_API inline bool __deviceCanAccessPeer(::CUdevice __dev, ::CUdevice __peer_dev)
|
|
678
|
+
{
|
|
679
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceCanAccessPeer);
|
|
680
|
+
int __result;
|
|
681
|
+
_CUDA_DRIVER::__call_driver_fn(
|
|
682
|
+
__driver_fn, "Failed to query if device can access peer's memory", &__result, __dev, __peer_dev);
|
|
683
|
+
return static_cast<bool>(__result);
|
|
684
|
+
}
|
|
685
|
+
|
|
494
686
|
// Green contexts
|
|
495
687
|
|
|
496
688
|
# if _CCCL_CTK_AT_LEAST(12, 5)
|
|
@@ -536,6 +728,6 @@ _CCCL_END_NAMESPACE_CUDA_DRIVER
|
|
|
536
728
|
|
|
537
729
|
# include <cuda/std/__cccl/epilogue.h>
|
|
538
730
|
|
|
539
|
-
#endif // _CCCL_HAS_CTK()
|
|
731
|
+
#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
540
732
|
|
|
541
733
|
#endif // _CUDA___DRIVER_DRIVER_API_H
|
|
@@ -23,10 +23,10 @@
|
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
|
+
# include <cuda/__driver/driver_api.h>
|
|
26
27
|
# include <cuda/__event/event_ref.h>
|
|
27
28
|
# include <cuda/__runtime/ensure_current_context.h>
|
|
28
29
|
# include <cuda/__utility/no_init.h>
|
|
29
|
-
# include <cuda/std/__cuda/api_wrapper.h>
|
|
30
30
|
# include <cuda/std/cstddef>
|
|
31
31
|
# include <cuda/std/utility>
|
|
32
32
|
|
|
@@ -43,11 +43,11 @@ class event : public event_ref
|
|
|
43
43
|
|
|
44
44
|
public:
|
|
45
45
|
//! @brief Flags to use when creating the event.
|
|
46
|
-
enum class flags : unsigned
|
|
46
|
+
enum class flags : unsigned
|
|
47
47
|
{
|
|
48
48
|
none = cudaEventDefault,
|
|
49
49
|
blocking_sync = cudaEventBlockingSync,
|
|
50
|
-
interprocess = cudaEventInterprocess
|
|
50
|
+
interprocess = cudaEventInterprocess,
|
|
51
51
|
};
|
|
52
52
|
|
|
53
53
|
//! @brief Construct a new `event` object with timing disabled, and record
|
|
@@ -141,7 +141,7 @@ public:
|
|
|
141
141
|
|
|
142
142
|
[[nodiscard]] friend constexpr flags operator|(flags __lhs, flags __rhs) noexcept
|
|
143
143
|
{
|
|
144
|
-
return static_cast<flags>(static_cast<unsigned
|
|
144
|
+
return static_cast<flags>(static_cast<unsigned>(__lhs) | static_cast<unsigned>(__rhs));
|
|
145
145
|
}
|
|
146
146
|
|
|
147
147
|
private:
|
|
@@ -151,14 +151,13 @@ private:
|
|
|
151
151
|
: event_ref(__evnt)
|
|
152
152
|
{}
|
|
153
153
|
|
|
154
|
-
explicit event(stream_ref __stream, unsigned
|
|
154
|
+
explicit event(stream_ref __stream, unsigned __flags);
|
|
155
155
|
|
|
156
|
-
explicit event(device_ref __device, unsigned
|
|
156
|
+
explicit event(device_ref __device, unsigned __flags)
|
|
157
157
|
: event_ref(::cudaEvent_t{})
|
|
158
158
|
{
|
|
159
159
|
[[maybe_unused]] __ensure_current_context __ctx_setter(__device);
|
|
160
|
-
|
|
161
|
-
::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast<unsigned int>(__flags));
|
|
160
|
+
__event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
|
|
162
161
|
}
|
|
163
162
|
};
|
|
164
163
|
|