cuda-cccl 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_26_x86_64.whl → 0.3.0__cp311-cp311-manylinux_2_26_x86_64.whl
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +3 -3
- cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
- cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
- cuda/cccl/headers/include/cuda/__device/device_ref.h +3 -10
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +225 -33
- cuda/cccl/headers/include/cuda/__event/event.h +7 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +3 -4
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +4 -0
- cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
- cuda/cccl/parallel/experimental/_bindings_impl.pyx +140 -0
- cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +59 -57
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh

@@ -51,6 +51,7 @@
 #include <cub/block/radix_rank_sort_operations.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/thread/thread_load.cuh>
+#include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
 #include <cuda/std/cstdint>
@@ -119,6 +120,28 @@ struct AgentRadixSortDownsweepPolicy : ScalingType
   static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
 };
 
+#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+namespace detail
+{
+// Only define this when needed.
+// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
+// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
+// version is always defined, and that's the only one needed for regular CUB operations.
+//
+// TODO: enable this unconditionally once concepts are always available
+CUB_DETAIL_POLICY_WRAPPER_DEFINE(
+  RadixSortDownsweepAgentPolicy,
+  (GenericAgentPolicy),
+  (BLOCK_THREADS, BlockThreads, int),
+  (ITEMS_PER_THREAD, ItemsPerThread, int),
+  (RADIX_BITS, RadixBits, int),
+  (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
+  (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
+  (RANK_ALGORITHM, RankAlgorithm, cub::RadixRankAlgorithm),
+  (SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
+} // namespace detail
+#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+
 /******************************************************************************
  * Thread block abstractions
  ******************************************************************************/
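The CUB_DETAIL_POLICY_WRAPPER_DEFINE invocation above registers each compile-time tuning constant of the downsweep agent together with a getter name and its type (for example BLOCK_THREADS / BlockThreads / int), so the tuning can be inspected when runtime policies or PTX JSON output are enabled. The snippet below is not the macro's expansion; it is a minimal hand-written sketch of the same idea, mirroring a static policy's constants into runtime values. RuntimeRadixSortDownsweepPolicy, FromStaticPolicy and ExampleStaticPolicy (including its numbers) are hypothetical names chosen only for illustration.

// Hypothetical illustration only -- not what CUB_DETAIL_POLICY_WRAPPER_DEFINE generates.
struct RuntimeRadixSortDownsweepPolicy
{
  int block_threads;
  int items_per_thread;
  int radix_bits;

  // Capture the compile-time constants of any static agent policy as runtime values.
  template <typename StaticPolicy>
  static constexpr RuntimeRadixSortDownsweepPolicy FromStaticPolicy()
  {
    return {StaticPolicy::BLOCK_THREADS, StaticPolicy::ITEMS_PER_THREAD, StaticPolicy::RADIX_BITS};
  }
};

struct ExampleStaticPolicy // illustrative values, not a shipped tuning
{
  static constexpr int BLOCK_THREADS    = 256;
  static constexpr int ITEMS_PER_THREAD = 16;
  static constexpr int RADIX_BITS       = 7;
};

static_assert(RuntimeRadixSortDownsweepPolicy::FromStaticPolicy<ExampleStaticPolicy>().block_threads == 256);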
cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh

@@ -48,33 +48,41 @@
 
 CUB_NAMESPACE_BEGIN
 
-template <int
+template <int BLOCK_THREADS_ARG,
+          int WARP_THREADS_ARG,
           int ITEMS_PER_THREAD_ARG,
           cub::WarpLoadAlgorithm LOAD_ALGORITHM_ARG = cub::WARP_LOAD_DIRECT,
           cub::CacheLoadModifier LOAD_MODIFIER_ARG = cub::LOAD_LDG,
           cub::WarpStoreAlgorithm STORE_ALGORITHM_ARG = cub::WARP_STORE_DIRECT>
 struct AgentSubWarpMergeSortPolicy
 {
-  static constexpr int
-  static constexpr int
-  static constexpr int
+  static constexpr int BLOCK_THREADS = BLOCK_THREADS_ARG;
+  static constexpr int WARP_THREADS = WARP_THREADS_ARG;
+  static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_ARG;
+  static constexpr int ITEMS_PER_TILE = WARP_THREADS * ITEMS_PER_THREAD;
+  static constexpr int SEGMENTS_PER_BLOCK = BLOCK_THREADS / WARP_THREADS;
 
   static constexpr cub::WarpLoadAlgorithm LOAD_ALGORITHM = LOAD_ALGORITHM_ARG;
   static constexpr cub::CacheLoadModifier LOAD_MODIFIER = LOAD_MODIFIER_ARG;
   static constexpr cub::WarpStoreAlgorithm STORE_ALGORITHM = STORE_ALGORITHM_ARG;
 };
 
-
-
+#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+namespace detail
 {
-
-
-
-
-
-
-
-
+CUB_DETAIL_POLICY_WRAPPER_DEFINE(
+  SubWarpMergeSortAgentPolicy,
+  (GenericAgentPolicy),
+  (BLOCK_THREADS, BlockThreads, int),
+  (WARP_THREADS, WarpThreads, int),
+  (ITEMS_PER_THREAD, ItemsPerThread, int),
+  (ITEMS_PER_TILE, ItemsPerTile, int),
+  (SEGMENTS_PER_BLOCK, SegmentsPerBlock, int),
+  (LOAD_ALGORITHM, LoadAlgorithm, cub::WarpLoadAlgorithm),
+  (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
+  (STORE_ALGORITHM, StoreAlgorithm, cub::WarpStoreAlgorithm))
+} // namespace detail
+#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
 
 namespace detail
 {
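The new derived constants make the sub-warp merge sort policy self-describing: a block of BLOCK_THREADS threads is carved into BLOCK_THREADS / WARP_THREADS segments, and each segment of WARP_THREADS threads sorts WARP_THREADS * ITEMS_PER_THREAD items. A small compile-time check against the template shown above, using illustrative parameter values rather than any shipped tuning:

// Illustrative check of the derived policy constants added in this release.
#include <cub/agent/agent_sub_warp_merge_sort.cuh>

using ExampleSubWarpPolicy = cub::AgentSubWarpMergeSortPolicy<128 /*block threads*/, 32 /*warp threads*/, 4 /*items per thread*/>;

static_assert(ExampleSubWarpPolicy::ITEMS_PER_TILE == 32 * 4, "each sub-warp segment handles 128 items");
static_assert(ExampleSubWarpPolicy::SEGMENTS_PER_BLOCK == 128 / 32, "a 128-thread block holds 4 segments");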
cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh

@@ -0,0 +1,432 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//! @file
+//! The @c cub::BlockLoadToShared class provides a :ref:`collective <collective-primitives>` method for asynchronously
+//! loading data from global to shared memory.
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/util_ptx.cuh>
+#include <cub/util_type.cuh>
+
+#include <thrust/type_traits/is_trivially_relocatable.h>
+
+#include <cuda/cmath>
+#include <cuda/memory>
+#include <cuda/ptx>
+#include <cuda/std/__algorithm/max.h>
+#include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/cstdint>
+#include <cuda/std/span>
+
+#include <nv/target>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail
+{
+
+//! @rst
+//! The @c BlockLoadToShared class provides a :ref:`collective <collective-primitives>` method for asynchronously
+//! loading data from global to shared memory.
+//!
+//! Overview
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - Given one or more spans of input elements in global memory and buffers in shared memory, this primitive
+//!   asynchronously copies the elements to shared memory and takes care of synchronization.
+//! - @rowmajor
+//! - Shared memory buffers are assumed to be aligned according to `SharedBufferAlignBytes<T>()`.
+//! - Global memory spans are by default assumed to be aligned according to the value type. Higher alignment guarantees
+//!   can optionally be specified.
+//! - After one or more calls to `CopyAsync`, `Commit` needs to be called before optionally doing other work and then
+//!   calling `Wait` which guarantees the data to be available in shared memory and resets the state and allows for the
+//!   next wave of `CopyAsync`.
+//!
+//! Performance Considerations
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! - Uses special instructions/hardware acceleration when available (cp.async.bulk on Hopper+, copy.async on Ampere).
+//! - By guaranteeing 16 byte alignment and size multiple for the global span, a faster path is taken.
+template <int BlockDimX, int BlockDimY = 1, int BlockDimZ = 1>
+struct BlockLoadToShared
+{
+private:
+  /// Constants
+  static constexpr int block_threads = BlockDimX * BlockDimY * BlockDimZ;
+  // The alignment needed for cp.async.bulk and L1-skipping cp.async
+  static constexpr int minimum_align = 16;
+
+  // Helper for fallback to gmem->reg->smem
+  struct alignas(minimum_align) vec_load_t
+  {
+    char c_array[minimum_align];
+  };
+
+  struct _TempStorage
+  {
+    ::cuda::std::uint64_t mbarrier_handle;
+  };
+
+#ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+  enum struct State
+  {
+    ready_to_copy,
+    ready_to_copy_or_commit,
+    committed,
+    invalidated,
+  };
+#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+
+  /// Shared storage reference
+  _TempStorage& temp_storage;
+
+  const int linear_tid{cub::RowMajorTid(BlockDimX, BlockDimY, BlockDimZ)};
+
+  // Thread selection for uniform operations
+  const bool elected{__elect_thread()};
+  // Keep track of current mbarrier phase for waiting.
+  uint32_t phase_parity{};
+  // Keep track of the amount of bytes from multiple transactions for Commit() (only needed for TMA).
+  // Also used to check for proper ordering of member function calls in debug mode.
+  uint32_t num_bytes_bulk_total{};
+
+#ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+  State state{State::ready_to_copy};
+#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+
+  /// Internal storage allocator
+  _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& __private_storage()
+  {
+    __shared__ _TempStorage private_storage;
+    return private_storage;
+  }
+
+  _CCCL_DEVICE _CCCL_FORCEINLINE bool __elect_thread() const
+  {
+    // Otherwise elect.sync in the last warp with a full mask is UB.
+    static_assert(block_threads % cub::detail::warp_threads == 0, "The block size must be a multiple of the warp size");
+    return NV_DISPATCH_TARGET(
+      NV_PROVIDES_SM_90,
+      ( // Use last warp to try to avoid having the elected thread also working on the peeling in the first warp.
+        (linear_tid >= block_threads - cub::detail::warp_threads) && ::cuda::ptx::elect_sync(~0u)),
+      NV_IS_DEVICE,
+      (linear_tid == 0));
+  }
+
+  _CCCL_DEVICE _CCCL_FORCEINLINE void __init_mbarrier()
+  {
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (if (elected) { ::cuda::ptx::mbarrier_init(&temp_storage.mbarrier_handle, 1); }
+                    // TODO The following sync was added to avoid a racecheck posititive. Is it really needed?
+                    __syncthreads();));
+    }
+  }
+
+  _CCCL_DEVICE _CCCL_FORCEINLINE void __copy_aligned_async_bulk(char* smem_dst, const char* gmem_src, int num_bytes)
+  {
+    if (elected)
+    {
+#if __cccl_ptx_isa >= 860
+      NV_IF_TARGET(
+        NV_PROVIDES_SM_90,
+        (::cuda::ptx::cp_async_bulk(
+           ::cuda::ptx::space_shared,
+           ::cuda::ptx::space_global,
+           smem_dst,
+           gmem_src,
+           num_bytes,
+           &temp_storage.mbarrier_handle);));
+#else
+      NV_IF_TARGET(
+        NV_PROVIDES_SM_90,
+        (::cuda::ptx::cp_async_bulk(
+           ::cuda::ptx::space_cluster,
+           ::cuda::ptx::space_global,
+           smem_dst,
+           gmem_src,
+           num_bytes,
+           &temp_storage.mbarrier_handle);));
+#endif // __cccl_ptx_isa >= 800
+      // Needed for arrival on mbarrier in Commit()
+      num_bytes_bulk_total += num_bytes;
+    }
+  }
+
+  _CCCL_DEVICE _CCCL_FORCEINLINE void __copy_aligned_async(char* smem_dst, const char* gmem_src, int num_bytes)
+  {
+    for (int offset = linear_tid * minimum_align; offset < num_bytes; offset += block_threads * minimum_align)
+    {
+      [[maybe_unused]] const auto thread_src = gmem_src + offset;
+      [[maybe_unused]] const auto thread_dst = smem_dst + offset;
+      // LDGSTS borrowed from cuda::memcpy_async, assumes 16 byte alignment to avoid L1 (.cg)
+      NV_IF_TARGET(NV_PROVIDES_SM_80,
+                   (asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %2;" : : "r"(
+                      static_cast<::cuda::std::uint32_t>(::__cvta_generic_to_shared(thread_dst))),
+                    "l"(thread_src),
+                    "n"(16) : "memory");));
+    }
+  }
+
+  _CCCL_DEVICE _CCCL_FORCEINLINE void __copy_aligned_fallback(char* smem_dst, const char* gmem_src, int num_bytes)
+  {
+    for (int offset = linear_tid * minimum_align; offset < num_bytes; offset += block_threads * minimum_align)
+    {
+      const auto thread_src = gmem_src + offset;
+      const auto thread_dst = smem_dst + offset;
+      *::cuda::ptr_rebind<vec_load_t>(thread_dst) = *::cuda::ptr_rebind<vec_load_t>(thread_src);
+    }
+  }
+
+  _CCCL_DEVICE _CCCL_FORCEINLINE void __copy_aligned(char* smem_dst, const char* gmem_src, int num_bytes)
+  {
+    NV_DISPATCH_TARGET(
+      NV_PROVIDES_SM_90,
+      (__copy_aligned_async_bulk(smem_dst, gmem_src, num_bytes);),
+      NV_PROVIDES_SM_80,
+      (__copy_aligned_async(smem_dst, gmem_src, num_bytes);),
+      NV_IS_DEVICE,
+      (__copy_aligned_fallback(smem_dst, gmem_src, num_bytes);));
+  }
+
+  // Dispatch to fallback for waiting pre TMA/SM_90
+  _CCCL_DEVICE _CCCL_FORCEINLINE bool __try_wait()
+  {
+    NV_DISPATCH_TARGET(
+      NV_PROVIDES_SM_90,
+      (return ::cuda::ptx::mbarrier_try_wait_parity(&temp_storage.mbarrier_handle, phase_parity);),
+      NV_PROVIDES_SM_80,
+      (asm volatile("cp.async.wait_group 0;" :: : "memory"); //
+       __syncthreads();
+       return true;),
+      NV_IS_DEVICE,
+      (__syncthreads(); //
+       return true;));
+  }
+
+public:
+  /// @smemstorage{BlockLoadToShared}
+  using TempStorage = cub::Uninitialized<_TempStorage>;
+
+  //! @name Collective constructors
+  //! @{
+
+  //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
+  _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoadToShared()
+      : temp_storage(__private_storage())
+  {
+    __init_mbarrier();
+  }
+
+  //! @brief Collective constructor using the specified memory allocation as temporary storage.
+  //!
+  //! @param[in] temp_storage
+  //!   Reference to memory allocation having layout type TempStorage
+  _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoadToShared(TempStorage& temp_storage)
+      : temp_storage(temp_storage.Alias())
+  {
+    _CCCL_ASSERT(::cuda::device::is_object_from(temp_storage, ::cuda::device::address_space::shared),
+                 "temp_storage has to be in shared memory");
+    __init_mbarrier();
+  }
+
+  _CCCL_DEVICE BlockLoadToShared(const BlockLoadToShared<BlockDimX, BlockDimY, BlockDimZ>&) = delete;
+
+  //! @} end member group
+
+  _CCCL_DEVICE BlockLoadToShared& operator=(const BlockLoadToShared<BlockDimX, BlockDimY, BlockDimZ>&) = delete;
+
+  //! @brief Invalidates underlying @c mbarrier enabling reuse of its temporary storage.
+  //! @note
+  //!   Block-synchronization is needed after calling `Invalidate()` to reuse the shared memory from the temporary
+  //!   storage.
+  // This is not the destructor to avoid overhead when shared memory reuse is not needed.
+  _CCCL_DEVICE _CCCL_FORCEINLINE void Invalidate()
+  {
+#ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+    _CCCL_ASSERT(state == State::ready_to_copy, "Wait() must be called before Invalidate()");
+    state = State::invalidated;
+#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+    // Make sure all threads are done interacting with the mbarrier
+    __syncthreads();
+    if (elected)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_90,
+                   (
+                     // Borrowed from cuda::barrier
+                     // TODO Make this available through cuda::ptx::
+                     asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(static_cast<::cuda::std::uint32_t>(
+                       ::__cvta_generic_to_shared(&temp_storage.mbarrier_handle))) : "memory");));
+    }
+    // Make sure the elected thread is done invalidating the mbarrier
+    __syncthreads();
+  }
+
+  //! @brief Copy elements from global to shared memory
+  //! @tparam T
+  //!   **[inferred]** Value type for this transaction
+  //! @tparam GmemAlign
+  //!   Guaranteed alignment in bytes of the source range (both begin and end) in global memory
+  //! @param[in] smem_dst
+  //!   Destination buffer in shared memory that is aligned to `SharedBufferAlignBytes<T>()` and at least
+  //!   `SharedBufferSizeBytes<T, GmemAlign>(size(gmem_src))` big.
+  //! @param[in] gmem_src
+  //!   Source range in global memory, determines the size of the transaction
+  //! @return
+  //!   The range in shared memory (same size as `gmem_src`) which should be used to access the data after `Commit` and
+  //!   `Wait`.
+  // TODO Allow spans with static sizes?
+  template <typename T, int GmemAlign = alignof(T)>
+  [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::span<T>
+  CopyAsync(::cuda::std::span<char> smem_dst, ::cuda::std::span<const T> gmem_src)
+  {
+    // TODO Should this be weakened to thrust::is_trivially_relocatable?
+    static_assert(THRUST_NS_QUALIFIER::is_trivially_relocatable_v<T>);
+    static_assert(::cuda::std::has_single_bit(unsigned{GmemAlign}));
+    static_assert(GmemAlign >= int{alignof(T)});
+    constexpr bool bulk_aligned = GmemAlign >= minimum_align;
+    // Avoid 64b multiplication in span::size_bytes()
+    const int num_bytes = static_cast<int>(sizeof(T)) * static_cast<int>(size(gmem_src));
+    const auto dst_ptr = data(smem_dst);
+    const auto src_ptr = ::cuda::ptr_rebind<char>(data(gmem_src));
+    _CCCL_ASSERT(dst_ptr == nullptr || ::cuda::device::is_address_from(dst_ptr, ::cuda::device::address_space::shared),
+                 "Destination address needs to point to shared memory");
+    _CCCL_ASSERT(src_ptr == nullptr || ::cuda::device::is_address_from(src_ptr, ::cuda::device::address_space::global),
+                 "Source address needs to point to global memory");
+    _CCCL_ASSERT((src_ptr != nullptr && dst_ptr != nullptr) || num_bytes == 0,
+                 "Only when the source range is empty are nullptrs allowed");
+    _CCCL_ASSERT(::cuda::is_aligned(src_ptr, GmemAlign),
+                 "Begin of global memory range needs to be aligned according to GmemAlign.");
+    _CCCL_ASSERT(::cuda::is_aligned(src_ptr + num_bytes, GmemAlign),
+                 "End of global memory range needs to be aligned according to GmemAlign.");
+    _CCCL_ASSERT(::cuda::is_aligned(dst_ptr, SharedBufferAlignBytes<T>()),
+                 "Shared memory needs to be 16 byte aligned.");
+    _CCCL_ASSERT((static_cast<int>(size(smem_dst)) >= SharedBufferSizeBytes<T, GmemAlign>(size(gmem_src))),
+                 "Shared memory destination buffer must have enough space");
+#ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+    _CCCL_ASSERT(state == State::ready_to_copy || state == State::ready_to_copy_or_commit,
+                 "Wait() must be called before another CopyAsync()");
+    state = State::ready_to_copy_or_commit;
+#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+    if constexpr (bulk_aligned)
+    {
+      __copy_aligned(dst_ptr, src_ptr, num_bytes);
+      return {::cuda::ptr_rebind<T>(data(smem_dst)), size(gmem_src)};
+    }
+    else
+    {
+      const auto src_ptr_aligned = ::cuda::align_up(src_ptr, minimum_align);
+      const int align_diff = static_cast<int>(src_ptr_aligned - src_ptr);
+      const int head_padding_bytes = (minimum_align - align_diff) % minimum_align;
+      const auto actual_dst_ptr = dst_ptr + head_padding_bytes;
+      const int head_peeling_bytes = ::cuda::std::min(align_diff, num_bytes);
+      const int num_bytes_bulk = ::cuda::round_down(num_bytes - head_peeling_bytes, minimum_align);
+      __copy_aligned(actual_dst_ptr + head_peeling_bytes, src_ptr_aligned, num_bytes_bulk);
+
+      // Peel head and tail
+      // Make sure we have enough threads for the worst case of minimum_align bytes on each side.
+      static_assert(block_threads >= 2 * (minimum_align - 1));
+      // |-------------head--------------|--------------------------tail--------------------------|
+      // 0, 1, ... head_peeling_bytes - 1, head_peeling_bytes + num_bytes_bulk, ..., num_bytes - 1
+      const int begin_offset = linear_tid < head_peeling_bytes ? 0 : num_bytes_bulk;
+      if (const int idx = begin_offset + linear_tid; idx < num_bytes)
+      {
+        actual_dst_ptr[idx] = src_ptr[idx];
+      }
+      return {::cuda::ptr_rebind<T>(actual_dst_ptr), size(gmem_src)};
+    }
+  }
+
+  // Avoid need to explicitly specify `T` for non-const src.
+  //! @brief Convenience overload, see `CopyAsync(span<char>, span<const T>)`.
+  template <typename T, int GmemAlign = alignof(T)>
+  [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::span<T>
+  CopyAsync(::cuda::std::span<char> smem_dst, ::cuda::std::span<T> gmem_src)
+  {
+    return CopyAsync<T, GmemAlign>(smem_dst, ::cuda::std::span<const T>{gmem_src});
+  }
+
+  //! @brief Commit one or more @c CopyAsync() calls.
+  _CCCL_DEVICE _CCCL_FORCEINLINE void Commit()
+  {
+#ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+    _CCCL_ASSERT(state == State::ready_to_copy_or_commit, "CopyAsync() must be called before Commit()");
+    state = State::committed;
+#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+
+    NV_DISPATCH_TARGET(
+      NV_PROVIDES_SM_90,
+      (if (elected) {
+        ::cuda::ptx::mbarrier_arrive_expect_tx(
+          ::cuda::ptx::sem_release,
+          ::cuda::ptx::scope_cta,
+          ::cuda::ptx::space_shared,
+          &temp_storage.mbarrier_handle,
+          num_bytes_bulk_total);
+        num_bytes_bulk_total = 0u;
+      } //
+       __syncthreads();),
+      NV_PROVIDES_SM_80,
+      (asm volatile("cp.async.commit_group ;" :: : "memory");));
+  }
+
+  //! @brief Wait for previously committed copies to arrive. Prepare for next calls to @c CopyAsync() .
+  _CCCL_DEVICE _CCCL_FORCEINLINE void Wait()
+  {
+#ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+    _CCCL_ASSERT(state == State::committed, "Commit() must be called before Wait()");
+    state = State::ready_to_copy;
+#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+
+    while (!__try_wait())
+      ;
+    phase_parity ^= 1u;
+  }
+
+  // Having these as static members does require using "template" in user code which is kludgy.
+
+  //! @brief Returns the alignment needed for the shared memory destination buffer.
+  //! @tparam T
+  //!   Value type to be loaded.
+  template <typename T>
+  _CCCL_HOST_DEVICE static constexpr int SharedBufferAlignBytes()
+  {
+    return (::cuda::std::max) (int{alignof(T)}, minimum_align);
+  }
+
+  //! @brief Returns the size needed for the shared memory destination buffer.
+  //! @tparam T
+  //!   Value type to be loaded.
+  //! @tparam GmemAlign
+  //!   Guaranteed alignment in bytes of the source range (both begin and end) in global memory
+  //! @param[in] num_items
+  //!   Size of the source range in global memory
+  template <typename T, int GmemAlign = alignof(T)>
+  _CCCL_HOST_DEVICE static constexpr int SharedBufferSizeBytes(::cuda::std::size_t num_items)
+  {
+    static_assert(::cuda::std::has_single_bit(unsigned{GmemAlign}));
+    static_assert(GmemAlign >= int{alignof(T)});
+    _CCCL_ASSERT(num_items <= ::cuda::std::size_t{::cuda::std::numeric_limits<int>::max()},
+                 "num_items must fit into an int");
+    constexpr bool bulk_aligned = GmemAlign >= minimum_align;
+    const int num_bytes = static_cast<int>(num_items) * int{sizeof(T)};
+    const int extra_space = (bulk_aligned || num_bytes == 0) ? 0 : minimum_align;
+    return bulk_aligned ? num_bytes : (::cuda::round_up(num_bytes, minimum_align) + extra_space);
+  }
+};
+
+} // namespace detail
+
+CUB_NAMESPACE_END
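The new cub/block/block_load_to_shared.cuh header above defines an internal (cub::detail) block-level primitive: every thread in the block calls CopyAsync to stage one or more global-memory spans into shared-memory buffers, then Commit, optionally does independent work while the copy is in flight, and finally Wait, after which the returned span is safe to read. On Hopper+ this maps to cp.async.bulk with an mbarrier, on Ampere to cp.async, and elsewhere to a plain vectorized copy. The following is a minimal usage sketch based only on the interface shown above; the kernel name, the tile size, and the assumption that the input pointer is 16-byte aligned (GmemAlign = 16) are illustrative, not part of the library.

// Usage sketch (illustrative, not from the library's documentation).
#include <cub/block/block_load_to_shared.cuh>

#include <cuda/std/span>

constexpr int block_threads = 128;
constexpr int tile_items    = 1024; // assumed: `in` holds at least this many floats

__global__ void stage_tile_kernel(const float* in, float* out)
{
  using Loader = cub::detail::BlockLoadToShared<block_threads>;

  // Temporary storage (holds the mbarrier) plus a destination buffer sized and aligned as prescribed:
  // with GmemAlign = 16, SharedBufferSizeBytes<float, 16>(tile_items) is exactly tile_items * sizeof(float)
  // and SharedBufferAlignBytes<float>() is 16.
  __shared__ typename Loader::TempStorage temp_storage;
  __shared__ alignas(16) char smem_buffer[tile_items * sizeof(float)];

  Loader loader{temp_storage};

  // Assumed: `in` is 16-byte aligned (e.g. it comes straight from cudaMalloc).
  ::cuda::std::span<const float> gmem_src{in, tile_items};
  ::cuda::std::span<float> tile = loader.CopyAsync<float, 16>(::cuda::std::span<char>{smem_buffer}, gmem_src);

  loader.Commit();
  // ...independent work could overlap with the in-flight copy here...
  loader.Wait(); // the staged data is now resident in shared memory

  for (int i = threadIdx.x; i < tile_items; i += block_threads)
  {
    out[i] = tile[i] * 2.0f;
  }
}

A launch such as stage_tile_kernel<<<1, block_threads>>>(d_in, d_out), with device buffers of at least tile_items floats, would exercise one full CopyAsync/Commit/Wait cycle.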
cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh

@@ -1,29 +1,5 @@
-
-
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #pragma once
 
@@ -113,6 +89,22 @@ template <int Rank, typename Extents>
   return true;
 }
 
+template <typename MappingTypeLhs, typename MappingTypeRhs>
+[[nodiscard]] _CCCL_API bool have_same_strides(const MappingTypeLhs& mapping_lhs, const MappingTypeRhs& mapping_rhs)
+{
+  auto extents_lhs = mapping_lhs.extents();
+  auto extents_rhs = mapping_rhs.extents();
+  _CCCL_ASSERT(extents_lhs.rank() == extents_rhs.rank(), "extents must have the same rank");
+  for (size_t i = 0; i < extents_lhs.rank(); i++)
+  {
+    if (mapping_lhs.stride(i) != mapping_rhs.stride(i))
+    {
+      return false;
+    }
+  }
+  return true;
+}
+
 } // namespace detail
 
 CUB_NAMESPACE_END
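The new have_same_strides helper (an internal cub::detail utility) compares two mdspan layout mappings stride by stride over their common rank. A small illustration of what it evaluates, using the standard layouts from libcu++ on a 2x3 extent; the helper calls themselves are shown as comments since the function is not constexpr:

#include <cuda/std/mdspan>

// Row-major (layout_right) strides for a 2x3 extent are {3, 1};
// column-major (layout_left) strides are {1, 2}.
using extents_2x3 = cuda::std::extents<int, 2, 3>;

constexpr cuda::std::layout_right::mapping<extents_2x3> row_major{};
constexpr cuda::std::layout_left::mapping<extents_2x3> col_major{};

static_assert(row_major.stride(0) == 3 && row_major.stride(1) == 1);
static_assert(col_major.stride(0) == 1 && col_major.stride(1) == 2);

// cub::detail::have_same_strides(row_major, row_major) -> true  (identical strides)
// cub::detail::have_same_strides(row_major, col_major) -> false (strides differ at both rank indices)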