PyPI - cuda-cccl - Versions diffs - 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (185) hide show

cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h CHANGED Viewed

@@ -22,6 +22,16 @@
 #endif // no system header
 #include <cuda/__mdspan/restrict_accessor.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__fwd/array.h>
+#include <cuda/std/__fwd/span.h>
+#include <cuda/std/__type_traits/extent.h>
+#include <cuda/std/__type_traits/is_convertible.h>
+#include <cuda/std/__type_traits/is_pointer.h>
+#include <cuda/std/__type_traits/rank.h>
+#include <cuda/std/__type_traits/remove_all_extents.h>
+#include <cuda/std/__type_traits/remove_pointer.h>
+#include <cuda/std/__type_traits/remove_reference.h>
 #include <cuda/std/mdspan>
 #include <cuda/std/__cccl/prologue.h>
@@ -32,7 +42,63 @@ template <typename _ElementType,
           typename _Extents,
           typename _LayoutPolicy   = ::cuda::std::layout_right,
           typename _AccessorPolicy = ::cuda::std::default_accessor<_ElementType>>
-using restrict_mdspan = ::cuda::std::mdspan<_ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>>;
+class restrict_mdspan
+    : public ::cuda::std::mdspan<_ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>>
+{
+public:
+  _LIBCUDACXX_DELEGATE_CONSTRUCTORS(
+    restrict_mdspan, ::cuda::std::mdspan, _ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>);
+  _CCCL_API friend constexpr void swap(restrict_mdspan& __x, restrict_mdspan& __y) noexcept
+  {
+    swap(static_cast<__base&>(__x), static_cast<__base&>(__y));
+  }
+};
+_CCCL_TEMPLATE(class _ElementType, class... _OtherIndexTypes)
+_CCCL_REQUIRES((sizeof...(_OtherIndexTypes) > 0)
+                 _CCCL_AND(::cuda::std::is_convertible_v<_OtherIndexTypes, size_t>&&... && true))
+_CCCL_HOST_DEVICE explicit restrict_mdspan(_ElementType*, _OtherIndexTypes...)
+  -> restrict_mdspan<_ElementType, ::cuda::std::extents<size_t, ::cuda::std::__maybe_static_ext<_OtherIndexTypes>...>>;
+_CCCL_TEMPLATE(class _Pointer)
+_CCCL_REQUIRES(::cuda::std::is_pointer_v<::cuda::std::remove_reference_t<_Pointer>>)
+_CCCL_HOST_DEVICE restrict_mdspan(_Pointer&&)
+  -> restrict_mdspan<::cuda::std::remove_pointer_t<::cuda::std::remove_reference_t<_Pointer>>,
+                     ::cuda::std::extents<size_t>>;
+_CCCL_TEMPLATE(class _CArray)
+_CCCL_REQUIRES(::cuda::std::is_array_v<_CArray> _CCCL_AND(::cuda::std::rank_v<_CArray> == 1))
+_CCCL_HOST_DEVICE restrict_mdspan(_CArray&)
+  -> restrict_mdspan<::cuda::std::remove_all_extents_t<_CArray>,
+                     ::cuda::std::extents<size_t, ::cuda::std::extent_v<_CArray, 0>>>;
+template <class _ElementType, class _OtherIndexType, size_t _Size>
+_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const ::cuda::std::array<_OtherIndexType, _Size>&)
+  -> restrict_mdspan<_ElementType, ::cuda::std::dextents<size_t, _Size>>;
+template <class _ElementType, class _OtherIndexType, size_t _Size>
+_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, ::cuda::std::span<_OtherIndexType, _Size>)
+  -> restrict_mdspan<_ElementType, ::cuda::std::dextents<size_t, _Size>>;
+// This one is necessary because all the constructors take `data_handle_type`s, not
+// `_ElementType*`s, and `data_handle_type` is taken from `accessor_type::data_handle_type`, which
+// seems to throw off automatic deduction guides.
+template <class _ElementType, class _OtherIndexType, size_t... _ExtentsPack>
+_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const ::cuda::std::extents<_OtherIndexType, _ExtentsPack...>&)
+  -> restrict_mdspan<_ElementType, ::cuda::std::extents<_OtherIndexType, _ExtentsPack...>>;
+template <class _ElementType, class _MappingType>
+_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const _MappingType&)
+  -> restrict_mdspan<_ElementType, typename _MappingType::extents_type, typename _MappingType::layout_type>;
+template <class _MappingType, class _AccessorType>
+_CCCL_HOST_DEVICE
+restrict_mdspan(const typename _AccessorType::data_handle_type, const _MappingType&, const _AccessorType&)
+  -> restrict_mdspan<typename _AccessorType::element_type,
+                     typename _MappingType::extents_type,
+                     typename _MappingType::layout_type,
+                     _AccessorType>;
 /***********************************************************************************************************************
  * Accessibility Traits

cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h ADDED Viewed

@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#ifndef _CUDA___MEMORY_POINTER_IN_RANGE_H
+#define _CUDA___MEMORY_POINTER_IN_RANGE_H
+#include <cuda/std/detail/__config>
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/cstdint>
+#if _CCCL_HOST_COMPILATION()
+#  include <functional>
+#endif // _CCCL_HOST_COMPILATION()
+#include <cuda/std/__cccl/prologue.h>
+_CCCL_BEGIN_NAMESPACE_CUDA
+// Pointer comparison with <, <=, >=, > is undefined behavior in C++ (https://eel.is/c++draft/expr.rel#4) when the
+// pointers don't belong to the same object or array.
+// - Even when a platform guarantees a flat address space, the compiler can leverage UB for optimization purposes.
+// - However, the compiler treats ::std::less<> and the other functional operators in a special way, ensuring a total
+//   ordering.
+// - For device code, we can convert pointers to uintptr_t and compare them.
+//
+// References:
+// - https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p3234r0.html
+// - https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2865r2.pdf
+// - https://www.boost.org/doc/libs/develop/libs/core/doc/html/core/pointer_in_range.html
+// - https://pvs-studio.com/en/blog/posts/cpp/1199/
+// - https://releases.llvm.org/20.1.0/tools/clang/docs/ReleaseNotes.html#resolutions-to-c-defect-reports
+#if _CCCL_HOST_COMPILATION()
+//! @brief Host-side implementation of `ptr_in_range`: tests whether `__ptr` lies in `[__start, __end)`.
+//!
+//! All comparisons go through the transparent `::std::greater_equal<>` / `::std::less<>` function objects instead
+//! of the built-in relational operators, so pointers that do not belong to the same object can be compared.
+//!
+//! @param __ptr   The pointer to test
+//! @param __start Beginning of the range (inclusive)
+//! @param __end   End of the range (exclusive); must not precede `__start`
+//! @return `true` if `__start <= __ptr < __end`, `false` otherwise
+template <typename _Tp>
+[[nodiscard]] _CCCL_API bool __ptr_in_range_host(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
+{
+  // An empty range (__end == __start) passes the check, hence "greater than or equal to".
+  _CCCL_ASSERT(::std::greater_equal<>{}(__end, __start),
+               "__ptr_in_range_host: __end must be greater than or equal to __start");
+  return ::std::greater_equal<>{}(__ptr, __start) && ::std::less<>{}(__ptr, __end);
+}
+#endif // _CCCL_HOST_COMPILATION()
+#if _CCCL_DEVICE_COMPILATION()
+//! @brief Device-side implementation of `ptr_in_range`: tests whether `__ptr` lies in `[__start, __end)`.
+//!
+//! The three pointers are converted to `uintptr_t` and compared as integers, which avoids relational comparison
+//! of pointers that do not belong to the same object.
+//!
+//! @param __ptr   The pointer to test
+//! @param __start Beginning of the range (inclusive)
+//! @param __end   End of the range (exclusive); must not precede `__start`
+//! @return `true` if `__start <= __ptr < __end`, `false` otherwise
+template <typename _Tp>
+[[nodiscard]] _CCCL_API bool __ptr_in_range_device(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
+{
+  using uintptr_t  = ::cuda::std::uintptr_t;
+  auto __end_ptr   = reinterpret_cast<uintptr_t>(__end);
+  auto __start_ptr = reinterpret_cast<uintptr_t>(__start);
+  auto __ptr_ptr   = reinterpret_cast<uintptr_t>(__ptr);
+  // An empty range (__end == __start) passes the check, hence "greater than or equal to".
+  _CCCL_ASSERT(__end_ptr >= __start_ptr,
+               "__ptr_in_range_device: __end must be greater than or equal to __start");
+  return __ptr_ptr >= __start_ptr && __ptr_ptr < __end_ptr;
+}
+#endif // _CCCL_DEVICE_COMPILATION()
+//! @brief Tests whether `__ptr` lies in the half-open range `[__start, __end)`.
+//!
+//! During constant evaluation the built-in relational operators are used directly. Otherwise the call is
+//! dispatched to `__ptr_in_range_host` or `__ptr_in_range_device`, depending on the compilation target.
+//!
+//! @param __ptr   The pointer to test
+//! @param __start Beginning of the range (inclusive)
+//! @param __end   End of the range (exclusive); must not precede `__start`
+//! @return `true` if `__start <= __ptr < __end`, `false` otherwise
+template <typename _Tp>
+[[nodiscard]] _CCCL_API constexpr bool ptr_in_range(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
+{
+  if (::cuda::std::__cccl_default_is_constant_evaluated())
+  {
+    // An empty range (__end == __start) passes the check, hence "greater than or equal to".
+    _CCCL_ASSERT(__end >= __start, "ptr_in_range: __end must be greater than or equal to __start");
+    return __ptr >= __start && __ptr < __end; // UB is not possible in a constant expression
+  }
+  else
+  {
+    NV_IF_ELSE_TARGET(NV_IS_HOST,
+                      (return ::cuda::__ptr_in_range_host(__ptr, __start, __end);),
+                      (return ::cuda::__ptr_in_range_device(__ptr, __start, __end);));
+  }
+}
+_CCCL_END_NAMESPACE_CUDA
+#include <cuda/std/__cccl/epilogue.h>
+#endif // _CUDA___MEMORY_POINTER_IN_RANGE_H

cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h CHANGED Viewed

@@ -8,8 +8,8 @@
 //
 //===----------------------------------------------------------------------===//
-#ifndef _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_CUH
-#define _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_CUH
+#ifndef _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
+#define _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
 #include <cuda/std/detail/__config>
@@ -23,11 +23,11 @@
 #include <cuda/__memory_resource/properties.h>
 #include <cuda/__memory_resource/resource.h>
+#include <cuda/__stream/stream_ref.h>
 #include <cuda/std/__concepts/equality_comparable.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/__type_traits/remove_cvref.h>
-#include <cuda/stream_ref>
 #include <cuda/std/__cccl/prologue.h>
@@ -79,4 +79,4 @@ _CCCL_END_NAMESPACE_CUDA_MR
 #include <cuda/std/__cccl/epilogue.h>
-#endif //_CUDAX__MEMORY_RESOURCE_GET_MEMORY_RESOURCE_CUH
+#endif // _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H

cuda/cccl/headers/include/cuda/__memory_resource/properties.h CHANGED Viewed

@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
+#include <cuda/std/__type_traits/decay.h>
 #include <cuda/std/__type_traits/type_set.h>
 #include <cuda/std/cstddef>
@@ -62,6 +63,49 @@ template <class... _Properties>
 inline constexpr bool __contains_execution_space_property =
   __is_host_accessible<_Properties...> || __is_device_accessible<_Properties...>;
+//! @brief Compile-time list of memory resource properties.
+//! @tparam _Properties The properties held by the list
+//!
+//! The list can be asked whether it holds a given property through `has_property`, and it can hand its
+//! properties over to another template through the `rebind` member alias. In the rebound type the properties
+//! come after the explicitly supplied type arguments.
+template <class... _Properties>
+struct properties_list
+{
+  //! @brief Evaluates `__type_set_contains_v` for `_QueryProperty` against the set built from `_Properties...`.
+  //! The argument only serves to deduce `_QueryProperty`; its value is never read.
+  template <class _QueryProperty>
+  _CCCL_HOST_API static constexpr bool has_property([[maybe_unused]] _QueryProperty)
+  {
+    using __property_set = ::cuda::std::__make_type_set<_Properties...>;
+    return ::cuda::std::__type_set_contains_v<__property_set, _QueryProperty>;
+  }
+  //! @brief Instantiates `_Fn` with `_ExtraArgs...` followed by the properties of this list.
+  template <template <class...> class _Fn, class... _ExtraArgs>
+  using rebind = _Fn<_ExtraArgs..., _Properties...>;
+};
+// Trait that is true only for specializations of `properties_list`.
+template <class _Ty>
+inline constexpr bool __is_queries_list = false;
+template <class... _Props>
+inline constexpr bool __is_queries_list<properties_list<_Props...>> = true;
+// Satisfied when the decayed type has a nested `default_queries` type that is a `properties_list`.
+template <typename _Resource>
+_CCCL_CONCEPT __has_default_queries = _CCCL_REQUIRES_EXPR((_Resource))(
+  requires(__is_queries_list<typename ::cuda::std::decay_t<_Resource>::default_queries>));
+// Helper that re-exports `_Res::default_queries` when the resource provides one and is empty otherwise.
+template <typename _Res, bool _HasDefaultQueries = __has_default_queries<_Res>>
+struct __copy_default_queries;
+// No `default_queries` available: nothing to expose.
+template <typename _Res>
+struct __copy_default_queries<_Res, false>
+{};
+// `default_queries` available: forward it under the same name.
+template <typename _Res>
+struct __copy_default_queries<_Res, true>
+{
+  using default_queries = typename _Res::default_queries;
+};
 _CCCL_END_NAMESPACE_CUDA_MR
 #include <cuda/std/__cccl/epilogue.h>

cuda/cccl/headers/include/cuda/__memory_resource/resource.h CHANGED Viewed

@@ -22,6 +22,7 @@
 #endif // no system header
 #include <cuda/__memory_resource/get_property.h>
+#include <cuda/__stream/stream_ref.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__concepts/convertible_to.h>
 #include <cuda/std/__concepts/equality_comparable.h>
@@ -29,7 +30,6 @@
 #include <cuda/std/__tuple_dir/sfinae_helpers.h>
 #include <cuda/std/__type_traits/decay.h>
 #include <cuda/std/__type_traits/fold.h>
-#include <cuda/stream_ref>
 #include <cuda/std/__cccl/prologue.h>

cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h CHANGED Viewed

@@ -26,6 +26,7 @@
 #  include <cuda/__memory_resource/get_property.h>
 #  include <cuda/__memory_resource/properties.h>
 #  include <cuda/__memory_resource/resource.h>
+#  include <cuda/__stream/stream_ref.h>
 #  include <cuda/std/__concepts/concept_macros.h>
 #  include <cuda/std/__memory/addressof.h>
 #  include <cuda/std/__type_traits/is_base_of.h>
@@ -34,7 +35,6 @@
 #  include <cuda/std/__utility/exchange.h>
 #  include <cuda/std/__utility/move.h>
 #  include <cuda/std/cstddef>
-#  include <cuda/stream_ref>
 #  include <cuda/std/__cccl/prologue.h>
@@ -161,10 +161,7 @@ struct _Resource_vtable_builder
   template <class _Resource>
   static void _Dealloc(void* __object, void* __ptr, size_t __bytes, size_t __alignment) noexcept
   {
-    // TODO: this breaks RMM because their memory resources do not declare their
-    // deallocate_sync functions to be noexcept. Comment out the check for now until
-    // we can fix RMM.
-    // static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate(__ptr, __bytes, __alignment)));
+    static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate_sync(__ptr, __bytes, __alignment)));
     return static_cast<_Resource*>(__object)->deallocate_sync(__ptr, __bytes, __alignment);
   }
@@ -176,8 +173,9 @@ struct _Resource_vtable_builder
   template <class _Resource>
   static void
-  _Dealloc_async(void* __object, void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream)
+  _Dealloc_async(void* __object, void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream) noexcept
   {
+    static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate(__stream, __ptr, __bytes, __alignment)));
     return static_cast<_Resource*>(__object)->deallocate(__stream, __ptr, __bytes, __alignment);
   }

cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h CHANGED Viewed

@@ -653,8 +653,9 @@
 #ifndef NVTX3_CPP_DEFINITIONS_V1_0
 #  define NVTX3_CPP_DEFINITIONS_V1_0
+#  include <cuda/std/__cccl/memory_wrapper.h>
 #  include <cstddef>
-#  include <memory>
 #  include <string>
 #  include <type_traits>
 #  include <utility>

cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h CHANGED Viewed

@@ -32,6 +32,7 @@
 #  ifndef _CCCL_DOXYGEN_INVOKED // Do not document
 _CCCL_BEGIN_NAMESPACE_CUDA
 class stream_ref;
 //! @brief RAII helper which on construction sets the current context to the specified one.
@@ -45,7 +46,7 @@ struct [[maybe_unused]] __ensure_current_context
   //! @param new_device The device to switch the context to
   //!
   //! @throws cuda_error if the context switch fails
-  explicit __ensure_current_context(device_ref __new_device)
+  _CCCL_HOST_API explicit __ensure_current_context(device_ref __new_device)
   {
     auto __ctx = ::cuda::__physical_devices()[__new_device.get()].__primary_context();
     ::cuda::__driver::__ctxPush(__ctx);
@@ -57,7 +58,7 @@ struct [[maybe_unused]] __ensure_current_context
   //! @param ctx The context to switch to
   //!
   //! @throws cuda_error if the context switch fails
-  explicit __ensure_current_context(::CUcontext __ctx)
+  _CCCL_HOST_API explicit __ensure_current_context(::CUcontext __ctx)
   {
     ::cuda::__driver::__ctxPush(__ctx);
   }
@@ -68,7 +69,7 @@ struct [[maybe_unused]] __ensure_current_context
   //! @param stream Stream indicating the context to switch to
   //!
   //! @throws cuda_error if the context switch fails
-  explicit __ensure_current_context(stream_ref __stream);
+  _CCCL_HOST_API explicit __ensure_current_context(stream_ref __stream);
   __ensure_current_context(__ensure_current_context&&)                 = delete;
   __ensure_current_context(__ensure_current_context const&)            = delete;
@@ -80,7 +81,7 @@ struct [[maybe_unused]] __ensure_current_context
   //!
   //! @throws cuda_error if the device switch fails. If the destructor is called
   //!         during stack unwinding, the program is automatically terminated.
-  ~__ensure_current_context() noexcept(false)
+  _CCCL_HOST_API ~__ensure_current_context() noexcept(false)
   {
     // TODO would it make sense to assert here that we pushed and popped the same thing?
     ::cuda::__driver::__ctxPop();

cuda/cccl/headers/include/cuda/__stream/stream.h CHANGED Viewed

@@ -43,7 +43,7 @@ struct stream : stream_ref
   //! Priority is defaulted to stream::default_priority
   //!
   //! @throws cuda_error if stream creation fails
-  explicit stream(device_ref __dev, int __priority = default_priority)
+  _CCCL_HOST_API explicit stream(device_ref __dev, int __priority = default_priority)
       : stream_ref(__detail::__invalid_stream)
   {
     [[maybe_unused]] __ensure_current_context __ctx_setter(__dev);
@@ -54,7 +54,7 @@ struct stream : stream_ref
   //!
   //! @post `stream()` returns an invalid stream handle
   // Can't be constexpr because __invalid_stream isn't
-  explicit stream(no_init_t) noexcept
+  _CCCL_HOST_API explicit stream(no_init_t) noexcept
       : stream_ref(__detail::__invalid_stream)
   {}
@@ -63,7 +63,7 @@ struct stream : stream_ref
   //! @param __other
   //!
   //! @post `__other` is in moved-from state.
-  stream(stream&& __other) noexcept
+  _CCCL_HOST_API stream(stream&& __other) noexcept
       : stream(::cuda::std::exchange(__other.__stream, __detail::__invalid_stream))
   {}
@@ -72,7 +72,7 @@ struct stream : stream_ref
   //! Destroy the `stream` object
   //!
   //! @note If the stream fails to be destroyed, the error is silently ignored.
-  ~stream()
+  _CCCL_HOST_API ~stream()
   {
     if (__stream != __detail::__invalid_stream)
     {
@@ -87,7 +87,7 @@ struct stream : stream_ref
   //! @param __other
   //!
   //! @post `__other` is in a moved-from state.
-  stream& operator=(stream&& __other) noexcept
+  _CCCL_HOST_API stream& operator=(stream&& __other) noexcept
   {
     stream __tmp(::cuda::std::move(__other));
     ::cuda::std::swap(__stream, __tmp.__stream);
@@ -103,7 +103,7 @@ struct stream : stream_ref
   //! @return stream The constructed `stream` object
   //!
   //! @note The constructed `stream` object takes ownership of the native handle.
-  [[nodiscard]] static stream from_native_handle(::cudaStream_t __handle)
+  [[nodiscard]] static _CCCL_HOST_API stream from_native_handle(::cudaStream_t __handle)
   {
     return stream(__handle);
   }
@@ -119,7 +119,7 @@ struct stream : stream_ref
   //! @return cudaStream_t The native handle being held by the `stream` object.
   //!
   //! @post The stream object is in a moved-from state.
-  [[nodiscard]] ::cudaStream_t release()
+  [[nodiscard]] _CCCL_HOST_API ::cudaStream_t release()
   {
     return ::cuda::std::exchange(__stream, __detail::__invalid_stream);
   }
@@ -127,7 +127,7 @@ struct stream : stream_ref
 private:
   // Use `stream::from_native_handle(s)` to construct an owning `stream`
   // object from a `cudaStream_t` handle.
-  explicit stream(::cudaStream_t __handle)
+  _CCCL_HOST_API explicit stream(::cudaStream_t __handle)
       : stream_ref(__handle)
   {}
 };

cuda/cccl/headers/include/cuda/__stream/stream_ref.h CHANGED Viewed

@@ -30,6 +30,7 @@
 #  include <cuda/__runtime/ensure_current_context.h>
 #  include <cuda/__utility/no_init.h>
 #  include <cuda/std/__exception/cuda_error.h>
+#  include <cuda/std/__utility/to_underlying.h>
 #  include <cuda/std/cstddef>
 #  include <cuda/std/__cccl/prologue.h>
@@ -61,9 +62,10 @@ public:
   //!
   //! For behavior of the default stream,
   //! @see https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html
-  [[deprecated("Using the default/null stream is generally discouraged. If you need to use it, please construct a "
-               "stream_ref from cudaStream_t{nullptr}")]]
-  _CCCL_HIDE_FROM_ABI stream_ref() = default;
+  CCCL_DEPRECATED_BECAUSE("Using the default/null stream is generally discouraged. If you need to use it, please "
+                          "construct a "
+                          "stream_ref from cudaStream_t{nullptr}") _CCCL_HIDE_FROM_ABI
+  stream_ref() = default;
   //! @brief Constructs a `stream_ref` from a `cudaStream_t` handle.
   //!
@@ -124,8 +126,7 @@ public:
   //! @brief Deprecated. Use sync() instead.
   //!
   //! @deprecated Use sync() instead.
-  [[deprecated("Use sync() instead.")]]
-  void wait() const
+  CCCL_DEPRECATED_BECAUSE("Use sync() instead.") _CCCL_HOST_API void wait() const
   {
     sync();
   }
@@ -184,7 +185,7 @@ public:
   //! @throws cuda::cuda_error if the query fails.
   //!
   //! @return `true` if all operations have completed, or `false` if not.
-  [[deprecated("Use is_done() instead.")]] [[nodiscard]] bool ready() const
+  [[nodiscard]] CCCL_DEPRECATED_BECAUSE("Use is_done() instead.") _CCCL_HOST_API bool ready() const
   {
     return is_done();
   }
@@ -216,7 +217,7 @@ public:
   //! @return A new event that was recorded into this stream
   //!
   //! @throws cuda_error if event creation or record failed
-  [[nodiscard]] _CCCL_HOST_API event record_event(event::flags __flags = event::flags::none) const
+  [[nodiscard]] _CCCL_HOST_API event record_event(event_flags __flags = event_flags::none) const
   {
     return event(*this, __flags);
   }
@@ -226,7 +227,7 @@ public:
   //! @return A new timed event that was recorded into this stream
   //!
   //! @throws cuda_error if event creation or record failed
-  [[nodiscard]] _CCCL_HOST_API timed_event record_timed_event(event::flags __flags = event::flags::none) const
+  [[nodiscard]] _CCCL_HOST_API timed_event record_timed_event(event_flags __flags = event_flags::none) const
   {
     return timed_event(*this, __flags);
   }
@@ -237,7 +238,7 @@ public:
   //! returned
   //!
   //! @throws cuda_error if device check fails
-  _CCCL_HOST_API device_ref device() const
+  [[nodiscard]] _CCCL_HOST_API device_ref device() const
   {
     ::CUdevice __device{};
 #  if _CCCL_CTK_AT_LEAST(13, 0)
@@ -260,7 +261,7 @@ public:
   }
 };
-inline void event_ref::record(stream_ref __stream) const
+_CCCL_HOST_API inline void event_ref::record(stream_ref __stream) const
 {
   _CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::record no event set");
   _CCCL_ASSERT(__stream.get() != nullptr, "cuda::event_ref::record invalid stream passed");
@@ -268,26 +269,26 @@ inline void event_ref::record(stream_ref __stream) const
   ::cuda::__driver::__eventRecord(__event_, __stream.get());
 }
-inline event::event(stream_ref __stream, event::flags __flags)
-    : event(__stream, static_cast<unsigned>(__flags) | cudaEventDisableTiming)
+_CCCL_HOST_API inline event::event(stream_ref __stream, event_flags __flags)
+    : event(__stream, ::cuda::std::to_underlying(__flags) | cudaEventDisableTiming)
 {
   record(__stream);
 }
-inline event::event(stream_ref __stream, unsigned __flags)
+_CCCL_HOST_API inline event::event(stream_ref __stream, unsigned __flags)
     : event_ref(::cudaEvent_t{})
 {
   [[maybe_unused]] __ensure_current_context __ctx_setter(__stream);
   __event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
 }
-inline timed_event::timed_event(stream_ref __stream, event::flags __flags)
-    : event(__stream, static_cast<unsigned>(__flags))
+_CCCL_HOST_API inline timed_event::timed_event(stream_ref __stream, event_flags __flags)
+    : event(__stream, ::cuda::std::to_underlying(__flags))
 {
   record(__stream);
 }
-inline __ensure_current_context::__ensure_current_context(stream_ref __stream)
+_CCCL_HOST_API inline __ensure_current_context::__ensure_current_context(stream_ref __stream)
 {
   auto __ctx = __driver::__streamGetCtx(__stream.get());
   ::cuda::__driver::__ctxPush(__ctx);

cuda/cccl/headers/include/cuda/__utility/in_range.h ADDED Viewed

@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#ifndef _CUDA___UTILITY_IN_RANGE_H
+#define _CUDA___UTILITY_IN_RANGE_H
+#include <cuda/std/detail/__config>
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+#include <cuda/__type_traits/is_floating_point.h>
+#include <cuda/std/__cmath/isnan.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_extended_floating_point.h>
+#include <cuda/std/__type_traits/is_integer.h>
+#include <cuda/std/__type_traits/is_unsigned_integer.h>
+#include <cuda/std/__cccl/prologue.h>
+_CCCL_BEGIN_NAMESPACE_CUDA
+//! @brief Checks whether a value lies inside the closed interval `[__start, __end]`.
+//!
+//! @param __v     The value to test
+//! @param __start Lower bound of the interval (inclusive)
+//! @param __end   Upper bound of the interval (inclusive)
+//! @return `true` when `__v >= __start` and `__v <= __end`
+_CCCL_TEMPLATE(typename _Tp)
+_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> || ::cuda::std::is_floating_point_v<_Tp>
+               || ::cuda::std::__is_extended_floating_point_v<_Tp>)
+[[nodiscard]] _CCCL_API constexpr bool in_range(_Tp __v, _Tp __start, _Tp __end) noexcept
+{
+  // Precondition: the bounds are ordered, unless one of them is NaN.
+  _CCCL_ASSERT(::cuda::std::isnan(__start) || ::cuda::std::isnan(__end) || __end >= __start,
+               "in_range: __end must be greater than or equal to __start");
+  if constexpr (!::cuda::std::__cccl_is_unsigned_integer_v<_Tp>)
+  {
+    // Every type other than the unsigned integers: plain two-sided comparison.
+    return __v >= __start && __v <= __end;
+  }
+  else
+  {
+    // Unsigned types: with __end >= __start the width of the interval cannot wrap, so a single comparison of the
+    // offset of __v against that width is enough. A value below __start wraps around to an offset larger than the
+    // width. This form pays off when the bounds are compile-time constants or the same range is tested repeatedly.
+    using _Wide = ::cuda::std::conditional_t<(sizeof(_Tp) <= sizeof(unsigned)), unsigned, _Tp>; // >= unsigned
+    const _Wide __lo     = static_cast<_Wide>(__start);
+    const _Wide __width  = static_cast<_Wide>(__end) - __lo;
+    const _Wide __offset = static_cast<_Wide>(__v) - __lo;
+    return __offset <= __width;
+  }
+}
+_CCCL_END_NAMESPACE_CUDA
+#include <cuda/std/__cccl/epilogue.h>
+#endif // _CUDA___UTILITY_IN_RANGE_H

cuda/cccl/headers/include/cuda/cmath CHANGED Viewed

@@ -26,6 +26,7 @@
 #include <cuda/__cmath/ilog.h>
 #include <cuda/__cmath/ipow.h>
 #include <cuda/__cmath/isqrt.h>
+#include <cuda/__cmath/mul_hi.h>
 #include <cuda/__cmath/neg.h>
 #include <cuda/__cmath/pow2.h>
 #include <cuda/__cmath/round_down.h>

cuda/cccl/headers/include/cuda/devices CHANGED Viewed

@@ -22,9 +22,12 @@
 #endif // no system header
 #include <cuda/__device/all_devices.h>
+#include <cuda/__device/arch_id.h>
 #include <cuda/__device/arch_traits.h>
 #include <cuda/__device/attributes.h>
+#include <cuda/__device/compute_capability.h>
 #include <cuda/__device/device_ref.h>
 #include <cuda/__device/physical_device.h>
+#include <cuda/version>
 #endif // _CUDA_DEVICES

cuda/cccl/headers/include/cuda/memory CHANGED Viewed

@@ -28,6 +28,7 @@
 #include <cuda/__memory/discard_memory.h>
 #include <cuda/__memory/get_device_address.h>
 #include <cuda/__memory/is_aligned.h>
+#include <cuda/__memory/ptr_in_range.h>
 #include <cuda/__memory/ptr_rebind.h>
 #include <cuda/std/memory>

cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h CHANGED Viewed

@@ -52,12 +52,12 @@ __equal_range(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp
   {
     auto __half_len = ::cuda::std::__half_positive(__len);
     _Iter __mid     = _IterOps<_AlgPolicy>::next(__first, __half_len);
-    if (::cuda::std::__invoke(__comp, ::cuda::std::__invoke(__proj, *__mid), __value))
+    if (::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj, *__mid), __value))
     {
       __first = ++__mid;
       __len -= __half_len + 1;
     }
-    else if (::cuda::std::__invoke(__comp, __value, ::cuda::std::__invoke(__proj, *__mid)))
+    else if (::cuda::std::invoke(__comp, __value, ::cuda::std::invoke(__proj, *__mid)))
     {
       __end = __mid;
       __len = __half_len;