cuda-cccl 0.1.3.2.0.dev438__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (60)
  1. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
  2. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
  3. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  4. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  5. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  6. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  7. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  8. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
  9. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +8 -0
  10. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  11. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  12. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
  13. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  14. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  15. cuda/cccl/headers/include/cuda/__algorithm/copy.h +3 -3
  16. cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
  17. cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
  18. cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
  19. cuda/cccl/headers/include/cuda/__device/device_ref.h +3 -10
  20. cuda/cccl/headers/include/cuda/__driver/driver_api.h +225 -33
  21. cuda/cccl/headers/include/cuda/__event/event.h +7 -8
  22. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  23. cuda/cccl/headers/include/cuda/__event/timed_event.h +3 -4
  24. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  25. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  26. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  27. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  28. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  29. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  30. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
  31. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  32. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  33. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  34. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  35. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  36. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  37. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  38. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  39. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  40. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  41. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  42. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  43. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  44. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  45. cuda/cccl/parallel/experimental/__init__.py +4 -0
  46. cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
  47. cuda/cccl/parallel/experimental/_bindings_impl.pyx +140 -0
  48. cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
  49. cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
  50. cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
  51. cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
  52. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  53. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  54. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  55. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  56. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +1 -1
  57. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +59 -57
  58. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  59. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
  60. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -47,7 +47,6 @@
47
47
  // for backward compatibility
48
48
  #include <cub/util_temporary_storage.cuh>
49
49
 
50
- #include <cuda/std/__cuda/ensure_current_device.h> // IWYU pragma: export
51
50
  #include <cuda/std/__type_traits/conditional.h>
52
51
  #include <cuda/std/__utility/forward.h>
53
52
  #include <cuda/std/array>
@@ -104,7 +103,34 @@ CUB_RUNTIME_FUNCTION inline int CurrentDevice()
104
103
 
105
104
  //! @brief RAII helper which saves the current device and switches to the specified device on construction and switches
106
105
  //! to the saved device on destruction.
107
- using SwitchDevice = ::cuda::__ensure_current_device;
106
+ class SwitchDevice
107
+ {
108
+ int target_device_;
109
+ int original_device_;
110
+
111
+ public:
112
+ //! @brief Queries the current device and if that is different than @p target_device sets the current device to
113
+ //! @p target_device
114
+ SwitchDevice(const int target_device)
115
+ : target_device_(target_device)
116
+ {
117
+ CubDebug(cudaGetDevice(&original_device_));
118
+ if (original_device_ != target_device_)
119
+ {
120
+ CubDebug(cudaSetDevice(target_device_));
121
+ }
122
+ }
123
+
124
+ //! @brief If the @p original_device was not equal to @p target_device sets the current device back to
125
+ //! @p original_device
126
+ ~SwitchDevice()
127
+ {
128
+ if (original_device_ != target_device_)
129
+ {
130
+ CubDebug(cudaSetDevice(original_device_));
131
+ }
132
+ }
133
+ };
108
134
 
109
135
  # endif // _CCCL_DOXYGEN_INVOKED
110
136
 
@@ -684,16 +710,31 @@ struct KernelConfig
684
710
  return launcher_factory.MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
685
711
  }
686
712
  };
687
-
688
713
  } // namespace detail
689
714
  #endif // !_CCCL_COMPILER(NVRTC)
690
715
 
716
+ namespace detail
717
+ {
718
+ template <typename T>
719
+ struct get_active_policy
720
+ {
721
+ using type = typename T::ActivePolicy;
722
+ };
723
+ } // namespace detail
724
+
691
725
  /// Helper for dispatching into a policy chain
692
726
  template <int PolicyPtxVersion, typename PolicyT, typename PrevPolicyT>
693
727
  struct ChainedPolicy
694
728
  {
729
+ private:
730
+ static constexpr bool have_previous_policy = !::cuda::std::is_same_v<PolicyT, PrevPolicyT>;
731
+
732
+ public:
695
733
  /// The policy for the active compiler pass
696
- using ActivePolicy = ::cuda::std::_If<(CUB_PTX_ARCH < PolicyPtxVersion), typename PrevPolicyT::ActivePolicy, PolicyT>;
734
+ using ActivePolicy =
735
+ typename ::cuda::std::_If<(CUB_PTX_ARCH < PolicyPtxVersion && have_previous_policy),
736
+ detail::get_active_policy<PrevPolicyT>,
737
+ ::cuda::std::type_identity<PolicyT>>::type;
697
738
 
698
739
  #if !_CCCL_COMPILER(NVRTC)
699
740
  /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
@@ -708,9 +749,12 @@ struct ChainedPolicy
708
749
  # elif defined(NV_TARGET_SM_INTEGER_LIST)
709
750
  return runtime_to_compiletime<10, NV_TARGET_SM_INTEGER_LIST>(device_ptx_version, op);
710
751
  # else
711
- if (device_ptx_version < PolicyPtxVersion)
752
+ if constexpr (have_previous_policy)
712
753
  {
713
- return PrevPolicyT::Invoke(device_ptx_version, op);
754
+ if (device_ptx_version < PolicyPtxVersion)
755
+ {
756
+ return PrevPolicyT::Invoke(device_ptx_version, op);
757
+ }
714
758
  }
715
759
  return op.template Invoke<PolicyT>();
716
760
  # endif
@@ -738,7 +782,7 @@ private:
738
782
  template <int DevicePtxVersion, typename FunctorT>
739
783
  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op)
740
784
  {
741
- if constexpr (DevicePtxVersion < PolicyPtxVersion)
785
+ if constexpr (DevicePtxVersion < PolicyPtxVersion && have_previous_policy)
742
786
  {
743
787
  return PrevPolicyT::template invoke_static<DevicePtxVersion>(op);
744
788
  }
@@ -749,34 +793,6 @@ private:
749
793
  }
750
794
  #endif // !_CCCL_COMPILER(NVRTC)
751
795
  };
752
-
753
- /// Helper for dispatching into a policy chain (end-of-chain specialization)
754
- template <int PolicyPtxVersion, typename PolicyT>
755
- struct ChainedPolicy<PolicyPtxVersion, PolicyT, PolicyT>
756
- {
757
- template <int, typename, typename>
758
- friend struct ChainedPolicy; // befriend primary template, so it can call invoke_static
759
-
760
- /// The policy for the active compiler pass
761
- using ActivePolicy = PolicyT;
762
-
763
- #if !_CCCL_COMPILER(NVRTC)
764
- /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
765
- template <typename FunctorT>
766
- CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op)
767
- {
768
- return op.template Invoke<PolicyT>();
769
- }
770
-
771
- private:
772
- template <int, typename FunctorT>
773
- CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op)
774
- {
775
- return op.template Invoke<PolicyT>();
776
- }
777
- #endif // !_CCCL_COMPILER(NVRTC)
778
- };
779
-
780
796
  CUB_NAMESPACE_END
781
797
 
782
798
  #if _CCCL_HAS_CUDA_COMPILER() && !_CCCL_COMPILER(NVRTC)
@@ -38,11 +38,11 @@ enum class source_access_order
38
38
  {
39
39
  # if _CCCL_CTK_AT_LEAST(13, 0)
40
40
  //! @brief Access source in stream order
41
- stream = cudaMemcpySrcAccessOrderStream,
41
+ stream = ::cudaMemcpySrcAccessOrderStream,
42
42
  //! @brief Access source during the copy call, source can be destroyed after the API returns
43
- during_api_call = cudaMemcpySrcAccessOrderDuringApiCall,
43
+ during_api_call = ::cudaMemcpySrcAccessOrderDuringApiCall,
44
44
  //! @brief Access source in any order, the order can change across CUDA releases
45
- any = cudaMemcpySrcAccessOrderAny,
45
+ any = ::cudaMemcpySrcAccessOrderAny,
46
46
  # else
47
47
  any = 0x3,
48
48
  # endif // _CCCL_CTK_BELOW(13, 0)
@@ -23,7 +23,7 @@
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
  # include <cuda/__device/physical_device.h>
26
- # include <cuda/std/__cuda/api_wrapper.h>
26
+ # include <cuda/__driver/driver_api.h>
27
27
  # include <cuda/std/cassert>
28
28
  # include <cuda/std/detail/libcxx/include/stdexcept>
29
29
  # include <cuda/std/span>
@@ -151,11 +151,8 @@ inline all_devices::operator ::cuda::std::span<const device_ref>() const
151
151
 
152
152
  inline const ::std::vector<physical_device>& all_devices::__devices()
153
153
  {
154
- static const ::std::vector<physical_device> __devices = [] {
155
- int __count = 0;
156
- _CCCL_TRY_CUDA_API(::cudaGetDeviceCount, "failed to get the count of CUDA devices", &__count);
157
- return ::std::vector<physical_device>{__initializer_iterator{0}, __initializer_iterator{__count}};
158
- }();
154
+ static const ::std::vector<physical_device> __devices{
155
+ __initializer_iterator{0}, __initializer_iterator{::cuda::__driver::__deviceGetCount()}};
159
156
  return __devices;
160
157
  }
161
158
  } // namespace __detail
@@ -516,7 +516,7 @@ inline constexpr int __highest_known_arch = 120;
516
516
  case id::sm_120a:
517
517
  return ::cuda::arch::traits<id::sm_120a>();
518
518
  default:
519
- ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Traits requested for an unknown architecture");
519
+ ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture");
520
520
  break;
521
521
  }
522
522
  }
@@ -525,7 +525,7 @@ inline constexpr int __highest_known_arch = 120;
525
525
  {
526
526
  if (compute_capability < 60 || compute_capability > __highest_known_arch)
527
527
  {
528
- ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
528
+ ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
529
529
  }
530
530
  return static_cast<id>(compute_capability);
531
531
  }
@@ -550,7 +550,7 @@ _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
550
550
  case 120:
551
551
  return id::sm_120a;
552
552
  default:
553
- ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
553
+ ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
554
554
  break;
555
555
  }
556
556
  }
@@ -24,6 +24,7 @@
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
26
  # include <cuda/__device/device_ref.h>
27
+ # include <cuda/__driver/driver_api.h>
27
28
  # include <cuda/std/__cccl/attributes.h>
28
29
  # include <cuda/std/__cuda/api_wrapper.h>
29
30
 
@@ -44,11 +45,10 @@ struct __dev_attr_impl
44
45
  return _Attr;
45
46
  }
46
47
 
47
- [[nodiscard]] type operator()(device_ref __dev_id) const
48
+ [[nodiscard]] type operator()(device_ref __dev) const
48
49
  {
49
- int __value = 0;
50
- _CCCL_TRY_CUDA_API(::cudaDeviceGetAttribute, "failed to get device attribute", &__value, _Attr, __dev_id.get());
51
- return static_cast<type>(__value);
50
+ return static_cast<type>(::cuda::__driver::__deviceGetAttribute(
51
+ static_cast<::CUdevice_attribute>(_Attr), ::cuda::__driver::__deviceGet(__dev.get())));
52
52
  }
53
53
  };
54
54
 
@@ -81,9 +81,9 @@ template <>
81
81
  struct __dev_attr<::cudaDevAttrComputeMode> //
82
82
  : __dev_attr_impl<::cudaDevAttrComputeMode, ::cudaComputeMode>
83
83
  {
84
- static constexpr type default_mode = cudaComputeModeDefault;
85
- static constexpr type prohibited_mode = cudaComputeModeProhibited;
86
- static constexpr type exclusive_process_mode = cudaComputeModeExclusiveProcess;
84
+ static constexpr type default_mode = ::cudaComputeModeDefault;
85
+ static constexpr type prohibited_mode = ::cudaComputeModeProhibited;
86
+ static constexpr type exclusive_process_mode = ::cudaComputeModeExclusiveProcess;
87
87
  };
88
88
  template <>
89
89
  struct __dev_attr<::cudaDevAttrConcurrentKernels> //
@@ -24,7 +24,6 @@
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
  # include <cuda/__driver/driver_api.h>
26
26
  # include <cuda/__runtime/types.h>
27
- # include <cuda/std/__cuda/api_wrapper.h>
28
27
 
29
28
  # include <string>
30
29
  # include <vector>
@@ -143,16 +142,10 @@ public:
143
142
  //!
144
143
  //! @param __other_dev Device to query the peer access
145
144
  //! @return true if its possible for this device to access the specified device's memory
146
- bool has_peer_access_to(device_ref __other_dev) const
145
+ [[nodiscard]] bool has_peer_access_to(device_ref __other_dev) const
147
146
  {
148
- int __can_access;
149
- _CCCL_TRY_CUDA_API(
150
- ::cudaDeviceCanAccessPeer,
151
- "Could not query if device can be peer accessed",
152
- &__can_access,
153
- get(),
154
- __other_dev.get());
155
- return __can_access;
147
+ return ::cuda::__driver::__deviceCanAccessPeer(
148
+ ::cuda::__driver::__deviceGet(get()), ::cuda::__driver::__deviceGet(__other_dev.get()));
156
149
  }
157
150
 
158
151
  //! @brief Retrieve architecture traits of this device.
@@ -21,11 +21,12 @@
21
21
  # pragma system_header
22
22
  #endif // no system header
23
23
 
24
- #if _CCCL_HAS_CTK()
24
+ #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
26
  # include <cuda/std/__exception/cuda_error.h>
27
27
  # include <cuda/std/__internal/namespaces.h>
28
28
  # include <cuda/std/__type_traits/always_false.h>
29
+ # include <cuda/std/__type_traits/is_same.h>
29
30
 
30
31
  # include <cuda.h>
31
32
 
@@ -41,31 +42,45 @@ _CCCL_BEGIN_NAMESPACE_CUDA_DRIVER
41
42
  reinterpret_cast<decltype(::versioned_fn_name)*>( \
42
43
  ::cuda::__driver::__get_driver_entry_point(#function_name, major, minor))
43
44
 
45
+ // cudaGetDriverEntryPoint function is deprecated
44
46
  _CCCL_SUPPRESS_DEPRECATED_PUSH
45
47
 
46
- //! @brief Get a driver function pointer for a given API name and optionally specific CUDA version
47
- //!
48
- //! For minor version compatibility request the 12.0 version of everything for now, unless requested otherwise
49
- [[nodiscard]] _CCCL_HOST_API inline void*
50
- __get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12, [[maybe_unused]] int __minor = 0)
48
+ //! @brief Gets the cuGetProcAddress function pointer.
49
+ [[nodiscard]] _CCCL_HOST_API inline auto __getProcAddressFn() -> decltype(cuGetProcAddress)*
51
50
  {
52
- // TODO switch to dlopen of libcuda.so instead of the below and maybe pair it with cuInit to avoid checking for two
53
- // initializations
54
- static auto __get_driver_entry_point_fn = reinterpret_cast<decltype(cuGetProcAddress)*>([]() {
55
- void* __fn;
56
- ::cudaDriverEntryPointQueryResult __result;
57
- ::cudaError_t __status = ::cudaGetDriverEntryPoint("cuGetProcAddress", &__fn, ::cudaEnableDefault, &__result);
58
- if (__status != ::cudaSuccess || __result != ::cudaDriverEntryPointSuccess)
59
- {
60
- ::cuda::__throw_cuda_error(::cudaErrorUnknown, "Failed to get cuGetProcAddress");
61
- }
62
- return __fn;
63
- }());
51
+ // TODO switch to dlopen of libcuda.so instead of the below
52
+ void* __fn;
53
+ ::cudaDriverEntryPointQueryResult __result;
54
+ ::cudaError_t __status = ::cudaGetDriverEntryPoint("cuGetProcAddress", &__fn, ::cudaEnableDefault, &__result);
55
+ if (__status != ::cudaSuccess || __result != ::cudaDriverEntryPointSuccess)
56
+ {
57
+ ::cuda::__throw_cuda_error(::cudaErrorUnknown, "Failed to get cuGetProcAddress");
58
+ }
59
+ return reinterpret_cast<decltype(cuGetProcAddress)*>(__fn);
60
+ }
61
+
62
+ _CCCL_SUPPRESS_DEPRECATED_POP
64
63
 
64
+ //! @brief Gets the driver entry point.
65
+ //!
66
+ //! @param __get_proc_addr_fn Pointer to cuGetProcAddress function.
67
+ //! @param __name Name of the symbol to get the driver entry point for.
68
+ //! @param __major The major CTK version to get the symbol version for.
69
+ //! @param __minor The minor CTK version to get the symbol version for.
70
+ //!
71
+ //! @return The address of the symbol.
72
+ //!
73
+ //! @throws @c cuda::cuda_error if the symbol cannot be obtained.
74
+ [[nodiscard]] _CCCL_HOST_API inline void* __get_driver_entry_point_impl(
75
+ decltype(cuGetProcAddress)* __get_proc_addr_fn,
76
+ const char* __name,
77
+ [[maybe_unused]] int __major,
78
+ [[maybe_unused]] int __minor)
79
+ {
65
80
  void* __fn;
66
81
  ::CUdriverProcAddressQueryResult __result;
67
82
  ::CUresult __status =
68
- __get_driver_entry_point_fn(__name, &__fn, __major * 1000 + __minor * 10, ::CU_GET_PROC_ADDRESS_DEFAULT, &__result);
83
+ __get_proc_addr_fn(__name, &__fn, __major * 1000 + __minor * 10, ::CU_GET_PROC_ADDRESS_DEFAULT, &__result);
69
84
  if (__status != ::CUDA_SUCCESS || __result != ::CU_GET_PROC_ADDRESS_SUCCESS)
70
85
  {
71
86
  if (__status == ::CUDA_ERROR_INVALID_VALUE)
@@ -84,8 +99,13 @@ __get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12,
84
99
  return __fn;
85
100
  }
86
101
 
87
- _CCCL_SUPPRESS_DEPRECATED_POP
88
-
102
+ //! @brief CUDA Driver API call wrapper. Calls a given CUDA Driver API and checks the return value.
103
+ //!
104
+ //! @param __fn A CUDA Driver function.
105
+ //! @param __err_msg Error message describing the call if the call fails.
106
+ //! @param __args The arguments to the @c __fn call.
107
+ //!
108
+ //! @throws @c cuda::cuda_error if the function call doesn't return CUDA_SUCCESS.
89
109
  template <typename Fn, typename... Args>
90
110
  _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args... __args)
91
111
  {
@@ -96,6 +116,48 @@ _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args
96
116
  }
97
117
  }
98
118
 
119
+ //! @brief Initializes the CUDA Driver.
120
+ //!
121
+ //! @param __get_proc_addr_fn The pointer to cuGetProcAddress function.
122
+ //!
123
+ //! @return A dummy bool value.
124
+ //!
125
+ //! @warning This function should be called only once from __get_driver_entry_point function.
126
+ [[nodiscard]] _CCCL_HOST_API inline bool __init(decltype(cuGetProcAddress)* __get_proc_addr_fn)
127
+ {
128
+ auto __driver_fn = reinterpret_cast<decltype(::cuInit)*>(
129
+ ::cuda::__driver::__get_driver_entry_point_impl(__get_proc_addr_fn, "cuInit", 12, 0));
130
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to initialize CUDA Driver", 0);
131
+ return true;
132
+ }
133
+
134
+ //! @brief Get a driver function pointer for a given API name and optionally specific CUDA version. This function also
135
+ //! initializes the CUDA Driver.
136
+ //!
137
+ //! @param __name Name of the symbol to get the driver entry point for.
138
+ //! @param __major The major CTK version to get the symbol version for. Defaults to 12.
139
+ //! @param __minor The minor CTK version to get the symbol version for. Defaults to 0.
140
+ //!
141
+ //! @return The address of the symbol.
142
+ //!
143
+ //! @throws @c cuda::cuda_error if the symbol cannot be obtained or the CUDA driver failed to initialize.
144
+ [[nodiscard]] _CCCL_HOST_API inline void*
145
+ __get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12, [[maybe_unused]] int __minor = 0)
146
+ {
147
+ // Get cuGetProcAddress function and call cuInit(0) only on the first call
148
+ static auto __get_proc_addr_fn = ::cuda::__driver::__getProcAddressFn();
149
+ [[maybe_unused]] static auto __init = ::cuda::__driver::__init(__get_proc_addr_fn);
150
+ return ::cuda::__driver::__get_driver_entry_point_impl(__get_proc_addr_fn, __name, __major, __minor);
151
+ }
152
+
153
+ //! @brief Converts CUdevice to ordinal device id.
154
+ //!
155
+ //! @note Currently, CUdevice value is the same as the ordinal device id. But that might change in the future.
156
+ [[nodiscard]] _CCCL_HOST_API inline int __cudevice_to_ordinal(::CUdevice __dev) noexcept
157
+ {
158
+ return static_cast<int>(__dev);
159
+ }
160
+
99
161
  // Version management
100
162
 
101
163
  [[nodiscard]] _CCCL_HOST_API inline int __getVersion()
@@ -119,6 +181,22 @@ _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args
119
181
  return __result;
120
182
  }
121
183
 
184
+ [[nodiscard]] _CCCL_HOST_API inline ::CUdevice __deviceGetAttribute(::CUdevice_attribute __attr, ::CUdevice __device)
185
+ {
186
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetAttribute);
187
+ int __result;
188
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get device attribute", &__result, __attr, __device);
189
+ return __result;
190
+ }
191
+
192
+ [[nodiscard]] _CCCL_HOST_API inline int __deviceGetCount()
193
+ {
194
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetCount);
195
+ int __result;
196
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get device count", &__result);
197
+ return __result;
198
+ }
199
+
122
200
  _CCCL_HOST_API inline void __deviceGetName(char* __name_out, int __len, int __ordinal)
123
201
  {
124
202
  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetName);
@@ -178,6 +256,14 @@ _CCCL_HOST_API inline ::CUcontext __ctxPop()
178
256
  return __result;
179
257
  }
180
258
 
259
+ [[nodiscard]] _CCCL_HOST_API inline ::CUdevice __ctxGetDevice()
260
+ {
261
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuCtxGetDevice);
262
+ ::CUdevice __result{};
263
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get current context", &__result);
264
+ return __result;
265
+ }
266
+
181
267
  // Memory management
182
268
 
183
269
  _CCCL_HOST_API inline void __memcpyAsync(void* __dst, const void* __src, size_t __count, ::CUstream __stream)
@@ -239,8 +325,71 @@ _CCCL_HOST_API void __memsetAsync(void* __dst, _Tp __value, size_t __count, ::CU
239
325
  }
240
326
  }
241
327
 
328
+ // Unified Addressing
329
+
330
+ // TODO: we don't want to have these functions here, refactoring expected
331
+ template <::CUpointer_attribute _Attr>
332
+ [[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __pointer_attribute_value_type_t_impl() noexcept
333
+ {
334
+ if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_CONTEXT)
335
+ {
336
+ return ::CUcontext{};
337
+ }
338
+ else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE)
339
+ {
340
+ return ::CUmemorytype{};
341
+ }
342
+ else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER || _Attr == ::CU_POINTER_ATTRIBUTE_HOST_POINTER)
343
+ {
344
+ return static_cast<void*>(nullptr);
345
+ }
346
+ else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_IS_MANAGED || _Attr == ::CU_POINTER_ATTRIBUTE_MAPPED)
347
+ {
348
+ return bool{};
349
+ }
350
+ else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL)
351
+ {
352
+ return int{};
353
+ }
354
+ else
355
+ {
356
+ static_assert(::cuda::std::__always_false_v<decltype(_Attr)>, "not implemented attribute");
357
+ }
358
+ }
359
+
360
+ template <::CUpointer_attribute _Attr>
361
+ using __pointer_attribute_value_type_t = decltype(::cuda::__driver::__pointer_attribute_value_type_t_impl<_Attr>());
362
+
363
+ template <::CUpointer_attribute _Attr>
364
+ [[nodiscard]] _CCCL_HOST_API inline ::cudaError_t
365
+ __pointerGetAttributeNoThrow(__pointer_attribute_value_type_t<_Attr>& __result, const void* __ptr)
366
+ {
367
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuPointerGetAttribute);
368
+ ::cudaError_t __status{};
369
+ if constexpr (::cuda::std::is_same_v<__pointer_attribute_value_type_t<_Attr>, bool>)
370
+ {
371
+ int __result2{};
372
+ __status = static_cast<::cudaError_t>(__driver_fn(&__result2, _Attr, reinterpret_cast<::CUdeviceptr>(__ptr)));
373
+ __result = static_cast<bool>(__result2);
374
+ }
375
+ else
376
+ {
377
+ __status =
378
+ static_cast<::cudaError_t>(__driver_fn((void*) &__result, _Attr, reinterpret_cast<::CUdeviceptr>(__ptr)));
379
+ }
380
+ return __status;
381
+ }
382
+
242
383
  // Stream management
243
384
 
385
+ [[nodiscard]] _CCCL_HOST_API inline ::CUstream __streamCreateWithPriority(unsigned __flags, int __priority)
386
+ {
387
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamCreateWithPriority);
388
+ ::CUstream __stream;
389
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to create a stream", &__stream, __flags, __priority);
390
+ return __stream;
391
+ }
392
+
244
393
  _CCCL_HOST_API inline void __streamSynchronize(::CUstream __stream)
245
394
  {
246
395
  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamSynchronize);
@@ -294,6 +443,17 @@ struct __ctx_from_stream
294
443
  }
295
444
  # endif // _CCCL_CTK_AT_LEAST(12, 5)
296
445
 
446
+ // TODO: make this available since CUDA 12.8
447
+ # if _CCCL_CTK_AT_LEAST(13, 0)
448
+ [[nodiscard]] _CCCL_HOST_API inline ::CUdevice __streamGetDevice(::CUstream __stream)
449
+ {
450
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION_VERSIONED(cuStreamGetDevice, cuStreamGetDevice, 12, 8);
451
+ ::CUdevice __result{};
452
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get the device of the stream", __stream, &__result);
453
+ return __result;
454
+ }
455
+ # endif // _CCCL_CTK_AT_LEAST(13, 0)
456
+
297
457
  _CCCL_HOST_API inline void __streamWaitEvent(::CUstream __stream, ::CUevent __evnt)
298
458
  {
299
459
  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamWaitEvent);
@@ -323,31 +483,52 @@ _CCCL_HOST_API inline void __streamWaitEvent(::CUstream __stream, ::CUevent __ev
323
483
  return __id;
324
484
  }
325
485
 
326
- // Event management
327
-
328
- _CCCL_HOST_API inline void __eventRecord(::CUevent __evnt, ::CUstream __stream)
329
- {
330
- static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventRecord);
331
- ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to record CUDA event", __evnt, __stream);
332
- }
333
-
334
- // Destroy calls return error codes to let the calling code decide if the error should be ignored
335
486
  [[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __streamDestroyNoThrow(::CUstream __stream)
336
487
  {
337
488
  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamDestroy);
338
489
  return static_cast<::cudaError_t>(__driver_fn(__stream));
339
490
  }
340
491
 
492
+ // Event management
493
+
494
+ [[nodiscard]] _CCCL_HOST_API inline ::CUevent __eventCreate(unsigned __flags)
495
+ {
496
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventCreate);
497
+ ::CUevent __evnt;
498
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to create a CUDA event", &__evnt, __flags);
499
+ return __evnt;
500
+ }
501
+
341
502
  [[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __eventDestroyNoThrow(::CUevent __evnt)
342
503
  {
343
504
  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventDestroy);
344
505
  return static_cast<::cudaError_t>(__driver_fn(__evnt));
345
506
  }
346
507
 
347
- _CCCL_HOST_API inline void __eventElapsedTime(::CUevent __start, ::CUevent __end, float* __ms)
508
+ [[nodiscard]] _CCCL_HOST_API inline float __eventElapsedTime(::CUevent __start, ::CUevent __end)
348
509
  {
349
510
  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventElapsedTime);
350
- ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get CUDA event elapsed time", __ms, __start, __end);
511
+ float __result;
512
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get event elapsed time", &__result, __start, __end);
513
+ return __result;
514
+ }
515
+
516
+ [[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __eventQueryNoThrow(::CUevent __evnt)
517
+ {
518
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventQuery);
519
+ return static_cast<::cudaError_t>(__driver_fn(__evnt));
520
+ }
521
+
522
+ _CCCL_HOST_API inline void __eventRecord(::CUevent __evnt, ::CUstream __stream)
523
+ {
524
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventRecord);
525
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to record an event", __evnt, __stream);
526
+ }
527
+
528
+ _CCCL_HOST_API inline void __eventSynchronize(::CUevent __evnt)
529
+ {
530
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventSynchronize);
531
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to synchronize an event", __evnt);
351
532
  }
352
533
 
353
534
  // Library management
@@ -491,6 +672,17 @@ __graphKernelNodeSetAttribute(::CUgraphNode __node, ::CUkernelNodeAttrID __id, c
491
672
  _CUDA_DRIVER::__call_driver_fn(__driver_fn, "Failed to set kernel node parameters", __node, __id, &__value);
492
673
  }
493
674
 
675
+ // Peer Context Memory Access
676
+
677
+ [[nodiscard]] _CCCL_HOST_API inline bool __deviceCanAccessPeer(::CUdevice __dev, ::CUdevice __peer_dev)
678
+ {
679
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceCanAccessPeer);
680
+ int __result;
681
+ _CUDA_DRIVER::__call_driver_fn(
682
+ __driver_fn, "Failed to query if device can access peer's memory", &__result, __dev, __peer_dev);
683
+ return static_cast<bool>(__result);
684
+ }
685
+
494
686
  // Green contexts
495
687
 
496
688
  # if _CCCL_CTK_AT_LEAST(12, 5)
@@ -536,6 +728,6 @@ _CCCL_END_NAMESPACE_CUDA_DRIVER
536
728
 
537
729
  # include <cuda/std/__cccl/epilogue.h>
538
730
 
539
- #endif // _CCCL_HAS_CTK()
731
+ #endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
540
732
 
541
733
  #endif // _CUDA___DRIVER_DRIVER_API_H
@@ -23,10 +23,10 @@
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
+ # include <cuda/__driver/driver_api.h>
26
27
  # include <cuda/__event/event_ref.h>
27
28
  # include <cuda/__runtime/ensure_current_context.h>
28
29
  # include <cuda/__utility/no_init.h>
29
- # include <cuda/std/__cuda/api_wrapper.h>
30
30
  # include <cuda/std/cstddef>
31
31
  # include <cuda/std/utility>
32
32
 
@@ -43,11 +43,11 @@ class event : public event_ref
43
43
 
44
44
  public:
45
45
  //! @brief Flags to use when creating the event.
46
- enum class flags : unsigned int
46
+ enum class flags : unsigned
47
47
  {
48
48
  none = cudaEventDefault,
49
49
  blocking_sync = cudaEventBlockingSync,
50
- interprocess = cudaEventInterprocess
50
+ interprocess = cudaEventInterprocess,
51
51
  };
52
52
 
53
53
  //! @brief Construct a new `event` object with timing disabled, and record
@@ -141,7 +141,7 @@ public:
141
141
 
142
142
  [[nodiscard]] friend constexpr flags operator|(flags __lhs, flags __rhs) noexcept
143
143
  {
144
- return static_cast<flags>(static_cast<unsigned int>(__lhs) | static_cast<unsigned int>(__rhs));
144
+ return static_cast<flags>(static_cast<unsigned>(__lhs) | static_cast<unsigned>(__rhs));
145
145
  }
146
146
 
147
147
  private:
@@ -151,14 +151,13 @@ private:
151
151
  : event_ref(__evnt)
152
152
  {}
153
153
 
154
- explicit event(stream_ref __stream, unsigned int __flags);
154
+ explicit event(stream_ref __stream, unsigned __flags);
155
155
 
156
- explicit event(device_ref __device, unsigned int __flags)
156
+ explicit event(device_ref __device, unsigned __flags)
157
157
  : event_ref(::cudaEvent_t{})
158
158
  {
159
159
  [[maybe_unused]] __ensure_current_context __ctx_setter(__device);
160
- _CCCL_TRY_CUDA_API(
161
- ::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast<unsigned int>(__flags));
160
+ __event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
162
161
  }
163
162
  };
164
163