cuda-cccl 0.3.0-cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2-cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cuda-cccl might be problematic.
Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh
@@ -47,9 +47,7 @@
 
 CUB_NAMESPACE_BEGIN
 
-namespace detail
-{
-namespace three_way_partition
+namespace detail::three_way_partition
 {
 
 template <typename PolicyT, typename = void>
@@ -437,7 +435,6 @@ struct policy_hub
 
   using MaxPolicy = Policy1000;
 };
-} // namespace three_way_partition
-} // namespace detail
+} // namespace detail::three_way_partition
 
 CUB_NAMESPACE_END
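
The namespace change above is a straight C++17 modernization: a nested namespace definition opens the same namespace as the two separate blocks it replaces. A minimal self-contained illustration (the names demo and inner are hypothetical, not from CCCL):

namespace demo { namespace inner { constexpr int a = 1; } } // pre-C++17 spelling
namespace demo::inner { constexpr int b = a + 1; }          // C++17 spelling; 'a' is visible

static_assert(demo::inner::b == 2, "both blocks open the same namespace");
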
cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh
@@ -113,11 +113,11 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
   (max_items_per_thread, MaxItemsPerThread, int),
   (not_a_vectorized_policy, NotAVectorizedPolicy, int) ) // TODO: remove with C++20
 
-template <int BlockThreads, int ItemsPerThread, int LoadStoreWordSize>
-struct vectorized_policy_t : prefetch_policy_t<BlockThreads>
+template <typename Tuning>
+struct vectorized_policy_t : prefetch_policy_t<Tuning::block_threads>
 {
-  static constexpr int items_per_thread_vectorized = ItemsPerThread;
-  static constexpr int load_store_word_size = LoadStoreWordSize;
+  static constexpr int items_per_thread_vectorized = Tuning::items_per_thread;
+  static constexpr int vec_size = Tuning::vec_size;
 
   using not_a_vectorized_policy = void; // TODO: remove with C++20, shadows the variable in prefetch_policy_t
 };
@@ -130,7 +130,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
   (min_items_per_thread, MinItemsPerThread, int),
   (max_items_per_thread, MaxItemsPerThread, int),
   (items_per_thread_vectorized, ItemsPerThreadVectorized, int),
-  (load_store_word_size, LoadStoreWordSize, int) )
+  (vec_size, VecSize, int) )
 
 template <int BlockThreads, int BulkCopyAlignment>
 struct async_copy_policy_t
@@ -282,47 +282,6 @@ _CCCL_HOST_DEVICE constexpr int arch_to_min_bytes_in_flight(int sm_arch)
   return 12 * 1024; // V100 and below
 }
 
-template <typename H, typename... Ts>
-_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal(H head, Ts... values)
-{
-  size_t first = 0;
-  for (size_t v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
-  {
-    if (v == 0)
-    {
-      continue;
-    }
-    if (first == 0)
-    {
-      first = v;
-    }
-    else if (v != first)
-    {
-      return false;
-    }
-  }
-  return true;
-}
-
-_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal()
-{
-  return true;
-}
-
-template <typename H, typename... Ts>
-_CCCL_HOST_DEVICE constexpr auto first_nonzero_value(H head, Ts... values)
-{
-  for (auto v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
-  {
-    if (v != 0)
-    {
-      return v;
-    }
-  }
-  // we only reach here when all input are not contiguous and the output has a void value type
-  return H{1};
-}
-
 template <typename T>
 inline constexpr size_t size_of = sizeof(T);
 
@@ -337,6 +296,47 @@ _CCCL_HOST_DEVICE static constexpr auto make_sizes_alignments()
     {{sizeof(it_value_t<RandomAccessIteratorsIn>), alignof(it_value_t<RandomAccessIteratorsIn>)}...}};
 }
 
+template <int PtxVersion, int StoreSize, int... LoadSizes>
+struct tuning_vec
+{
+  // defaults from fill on RTX 5090, but can be changed
+  static constexpr int block_threads = 256;
+  static constexpr int vec_size = 4;
+  static constexpr int items_per_thread = 8;
+};
+
+// manually tuned fill on A100
+template <int StoreSize>
+struct tuning_vec<800, StoreSize>
+{
+  static constexpr int block_threads = 256;
+  static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
+  static constexpr int items_per_thread = 8;
+};
+
+// manually tuned fill on H200
+template <int StoreSize>
+struct tuning_vec<900, StoreSize>
+{
+  static constexpr int block_threads = StoreSize > 4 ? 128 : 256;
+  static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
+  static constexpr int items_per_thread = 16;
+};
+
+// manually tuned fill on B200, same as H200
+template <int StoreSize>
+struct tuning_vec<1000, StoreSize> : tuning_vec<900, StoreSize>
+{};
+
+// manually tuned fill on RTX 5090
+template <int StoreSize>
+struct tuning_vec<1200, StoreSize>
+{
+  static constexpr int block_threads = 256;
+  static constexpr int vec_size = 4;
+  static constexpr int items_per_thread = 8;
+};
+
 template <bool RequiresStableAddress,
           bool DenseOutput,
           typename RandomAccessIteratorTupleIn,
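
The tuning_vec hierarchy added above selects constants by PTX version and store size through partial specialization. A small host-side sketch that reproduces the primary template and the SM80 specialization so the selection can be checked with static_assert; the local cmax() stands in for ::cuda::std::max, and the checks are illustrative assumptions, not CCCL tests:

// Stand-in for ::cuda::std::max used by the real header.
constexpr int cmax(int a, int b)
{
  return a > b ? a : b;
}

// Primary template: defaults (tuned for fill on RTX 5090).
template <int PtxVersion, int StoreSize, int... LoadSizes>
struct tuning_vec
{
  static constexpr int block_threads    = 256;
  static constexpr int vec_size         = 4;
  static constexpr int items_per_thread = 8;
};

// SM80 specialization: pick a vector width that yields 64-bit stores.
template <int StoreSize>
struct tuning_vec<800, StoreSize>
{
  static constexpr int block_threads    = 256;
  static constexpr int vec_size         = cmax(8 / StoreSize, 1);
  static constexpr int items_per_thread = 8;
};

// 4-byte values vectorize by 2 (2 x 4 B = 8 B); 8-byte values by 1.
static_assert(tuning_vec<800, 4>::vec_size == 2);
static_assert(tuning_vec<800, 8>::vec_size == 1);
// PTX versions without a specialization fall back to the primary template.
static_assert(tuning_vec<750, 4>::vec_size == 4);

int main() {}
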
@@ -367,29 +367,12 @@ struct policy_hub<RequiresStableAddress,
        || THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>>)
       && ...);
 
-  // for vectorized policy:
-  static constexpr bool all_contiguous_input_values_same_size = all_nonzero_equal(
-    (sizeof(it_value_t<RandomAccessIteratorsIn>)
-     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...);
-  static constexpr int load_store_word_size = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
-  // find the value type size of the first contiguous iterator. if there are no inputs, we take the size of the output
-  // value type
-  static constexpr int contiguous_value_type_size = first_nonzero_value(
-    (int{sizeof(it_value_t<RandomAccessIteratorsIn>)}
-     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
-    int{size_of<it_value_t<RandomAccessIteratorOut>>});
-  static constexpr bool value_type_divides_load_store_size =
-    load_store_word_size % contiguous_value_type_size == 0; // implicitly checks that value_type_size <=
-                                                            // load_store_word_size
-  static constexpr int target_bytes_per_thread =
-    no_input_streams ? 16 /* by experiment on RTX 5090 */ : 32 /* guestimate by gevtushenko for loading */;
-  static constexpr int items_per_thread_vec =
-    ::cuda::round_up(target_bytes_per_thread, load_store_word_size) / contiguous_value_type_size;
-  using default_vectorized_policy_t = vectorized_policy_t<256, items_per_thread_vec, load_store_word_size>;
+  static constexpr bool all_value_types_have_power_of_two_size =
+    (::cuda::is_power_of_two(sizeof(it_value_t<RandomAccessIteratorsIn>)) && ...)
+    && ::cuda::is_power_of_two(size_of<it_value_t<RandomAccessIteratorOut>>);
 
   static constexpr bool fallback_to_prefetch =
-    RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_contiguous_input_values_same_size
-    || !value_type_divides_load_store_size || !DenseOutput;
+    RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_value_types_have_power_of_two_size || !DenseOutput;
 
   // TODO(bgruber): consider a separate kernel for just filling
 
@@ -398,12 +381,16 @@ struct policy_hub<RequiresStableAddress,
     static constexpr int min_bif = arch_to_min_bytes_in_flight(300);
     // TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
     static constexpr auto algorithm = fallback_to_prefetch ? Algorithm::prefetch : Algorithm::vectorized;
-    using algo_policy = ::cuda::std::_If<fallback_to_prefetch, prefetch_policy_t<256>, default_vectorized_policy_t>;
+    using vec_policy_t = vectorized_policy_t<
+      tuning_vec<500, size_of<it_value_t<RandomAccessIteratorOut>>, sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
+    using algo_policy = ::cuda::std::_If<fallback_to_prefetch, prefetch_policy_t<256>, vec_policy_t>;
   };
 
   struct policy800 : ChainedPolicy<800, policy800, policy300>
   {
   private:
+    using vec_policy_t = vectorized_policy_t<
+      tuning_vec<800, size_of<it_value_t<RandomAccessIteratorOut>>, sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
     static constexpr int block_threads = 256;
     using async_policy = async_copy_policy_t<block_threads, ldgsts_size_and_align>;
     // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not
@@ -427,13 +414,17 @@ struct policy_hub<RequiresStableAddress,
     using algo_policy =
       ::cuda::std::_If<fallback_to_prefetch,
                        prefetch_policy_t<block_threads>,
-                       ::cuda::std::_If<fallback_to_vectorized, default_vectorized_policy_t, async_policy>>;
+                       ::cuda::std::_If<fallback_to_vectorized, vec_policy_t, async_policy>>;
   };
 
   template <int AsyncBlockSize, int PtxVersion>
   struct bulk_copy_policy_base
   {
   private:
+    using vec_policy_t =
+      vectorized_policy_t<tuning_vec<PtxVersion,
+                                     size_of<it_value_t<RandomAccessIteratorOut>>,
+                                     sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
     static constexpr int alignment = bulk_copy_alignment(PtxVersion);
     using async_policy = async_copy_policy_t<AsyncBlockSize, alignment>;
     // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not
@@ -469,7 +460,7 @@ struct policy_hub<RequiresStableAddress,
     using algo_policy =
       ::cuda::std::_If<fallback_to_prefetch,
                        prefetch_policy_t<256>,
-                       ::cuda::std::_If<fallback_to_vectorized, default_vectorized_policy_t, async_policy>>;
+                       ::cuda::std::_If<fallback_to_vectorized, vec_policy_t, async_policy>>;
   };
 
   struct policy900
cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh
@@ -788,6 +788,16 @@ struct UniqueByKeyPolicyWrapper<StaticPolicyT,
   {
     return cub::detail::MakePolicyWrapper(typename StaticPolicyT::UniqueByKeyPolicyT());
   }
+
+#if defined(CUB_ENABLE_POLICY_PTX_JSON)
+  _CCCL_DEVICE static constexpr auto EncodedPolicy()
+  {
+    using namespace ptx_json;
+    return object<key<"UniqueByKeyPolicyT">() = UniqueByKey().EncodedPolicy(),
+                  key<"DelayConstructor">() =
+                    StaticPolicyT::UniqueByKeyPolicyT::detail::delay_constructor_t::EncodedConstructor()>();
+  }
+#endif
 };
 
 template <typename PolicyT>
cuda/cccl/headers/include/cub/thread/thread_reduce.cuh
@@ -136,6 +136,7 @@ CUB_NAMESPACE_BEGIN
 //!    {
 //!      int array[4] = {1, 2, 3, 4};
 //!      int sum = cub::ThreadReduce(array, ::cuda::std::plus<>{}); // sum = 10
+//!    }
 
 //! @endrst
 //!
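
The added //! } closes the previously unterminated code block in the ThreadReduce docs. A complete, compilable version of that snippet as a kernel; the kernel name and output pointer are illustrative, not from the library:

#include <cub/thread/thread_reduce.cuh>

#include <cuda/std/functional>

__global__ void thread_reduce_demo(int* d_out)
{
  // Each thread reduces its own private array; no cross-thread communication.
  int array[4] = {1, 2, 3, 4};
  int sum      = cub::ThreadReduce(array, ::cuda::std::plus<>{}); // sum = 10
  if (threadIdx.x == 0)
  {
    *d_out = sum;
  }
}
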
@@ -437,10 +438,13 @@ template <typename Input, typename ReductionOp, typename ValueT, typename AccumT
                 "Input must support the subscript operator[] and have a compile-time size");
   static_assert(has_binary_call_operator<ReductionOp, ValueT>::value,
                 "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
-  if constexpr (static_size_v<Input> == 1)
+
+  static constexpr auto length = static_size_v<Input>;
+  if constexpr (length == 1)
   {
     return static_cast<AccumT>(input[0]);
   }
+
   using PromT = ::cuda::std::_If<enable_min_max_promotion_v<ReductionOp, ValueT>, int, AccumT>;
   // TODO: should be part of the tuning policy
   if constexpr ((!is_simd_enabled_cuda_operator<ReductionOp, ValueT> && !is_simd_operator_v<ReductionOp>)
@@ -449,38 +453,41 @@ template <typename Input, typename ReductionOp, typename ValueT, typename AccumT
     return ThreadReduceSequential<AccumT>(input, reduction_op);
   }
 
-  constexpr auto length = static_size_v<Input>;
-  if constexpr (::cuda::std::is_same_v<Input, AccumT> && enable_sm90_simd_reduction_v<Input, ReductionOp, length>)
+  if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm90_simd_reduction_v<ValueT, ReductionOp, length>)
   {
     NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSimd(input, reduction_op);))
   }
 
-  if constexpr (::cuda::std::is_same_v<Input, AccumT> && enable_sm80_simd_reduction_v<Input, ReductionOp, length>)
+  if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm80_simd_reduction_v<ValueT, ReductionOp, length>)
   {
     NV_IF_TARGET(NV_PROVIDES_SM_80, (return ThreadReduceSimd(input, reduction_op);))
   }
 
-  if constexpr (::cuda::std::is_same_v<Input, AccumT> && enable_sm70_simd_reduction_v<Input, ReductionOp, length>)
+  if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm70_simd_reduction_v<ValueT, ReductionOp, length>)
   {
     NV_IF_TARGET(NV_PROVIDES_SM_70, (return ThreadReduceSimd(input, reduction_op);))
   }
 
-  if constexpr (enable_ternary_reduction_sm90_v<Input, ReductionOp>)
+  if constexpr (length >= 6)
   {
-    // with the current tuning policies, SM90/int32/+ uses too many registers (TODO: fix tuning policy)
-    if constexpr ((is_one_of_v<ReductionOp, ::cuda::std::plus<>, ::cuda::std::plus<PromT>>
-                   && is_one_of_v<PromT, int32_t, uint32_t>)
-                  // the compiler generates bad code for int8/uint8 and min/max for SM90
-                  || (is_cuda_minimum_maximum_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
+    // apply SM90 min/max ternary reduction only if the input is natively int32/uint32
+    if constexpr (enable_ternary_reduction_sm90_v<ValueT, ReductionOp>)
     {
-      NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+      // with the current tuning policies, SM90/int32/+ uses too many registers (TODO: fix tuning policy)
+      if constexpr ((is_one_of_v<ReductionOp, ::cuda::std::plus<>, ::cuda::std::plus<PromT>>
+                     && is_one_of_v<PromT, int32_t, uint32_t>)
+                    // the compiler generates bad code for int8/uint8 and min/max for SM90
+                    || (is_cuda_minimum_maximum_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+      }
+      NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
     }
-    NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
-  }
 
-  if constexpr (enable_ternary_reduction_sm50_v<Input, ReductionOp>)
-  {
-    NV_IF_TARGET(NV_PROVIDES_SM_50, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+    if constexpr (enable_ternary_reduction_sm50_v<ValueT, ReductionOp>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_50, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+    }
   }
 
   return ThreadReduceBinaryTree<PromT>(input, reduction_op);
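
The rewrite above hoists `length` to the top of the function and nests all ternary-reduction paths under a single `length >= 6` guard, so short inputs skip those branches entirely at compile time. A reduced sketch of the idiom (the function reduce_demo is hypothetical; a plain loop stands in for the architecture-specific fast path):

#include <nv/target>

template <int N>
__device__ int reduce_demo(const int (&in)[N])
{
  if constexpr (N == 1) // trivial case, resolved at compile time
  {
    return in[0];
  }
  if constexpr (N >= 6) // only long inputs consider the fast path
  {
    // Emitted only when compiling for SM90 or newer.
    NV_IF_TARGET(NV_PROVIDES_SM_90, (
      int acc = in[0];
      for (int i = 1; i < N; ++i)
      {
        acc += in[i];
      }
      return acc;
    ))
  }
  int acc = in[0]; // generic fallback
  for (int i = 1; i < N; ++i)
  {
    acc += in[i];
  }
  return acc;
}
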
cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -51,6 +51,7 @@
 #include <cuda/__functional/maximum.h>
 #include <cuda/__functional/minimum.h>
 #include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__bit/countr.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/integral_constant.h>
@@ -701,7 +702,7 @@ struct WarpReduceShfl
   _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
   {
     // Get the start flags for each thread in the warp.
-    int warp_flags = __ballot_sync(member_mask, flag);
+    unsigned warp_flags = __ballot_sync(member_mask, flag);
 
     // Convert to tail-segmented
     if (HEAD_SEGMENTED)
@@ -722,7 +723,7 @@ struct WarpReduceShfl
     warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
 
     // Find the next set flag
-    int last_lane = __clz(__brev(warp_flags));
+    int last_lane = ::cuda::std::countr_zero(warp_flags);
 
     T output = input;
 
     // Template-iterate reduction steps
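
The switch from __clz(__brev(warp_flags)) to ::cuda::std::countr_zero(warp_flags) is a drop-in equivalence: reversing a word turns its trailing zeros into leading zeros. A host-side C++20 check of the identity, using the std:: counterparts and a hand-rolled brev() standing in for the device __brev() intrinsic:

#include <bit>
#include <cassert>
#include <cstdint>

// Host-side stand-in for the CUDA __brev() intrinsic (32-bit reversal).
constexpr std::uint32_t brev(std::uint32_t x)
{
  std::uint32_t r = 0;
  for (int i = 0; i < 32; ++i)
  {
    r = (r << 1) | ((x >> i) & 1u);
  }
  return r;
}

int main()
{
  for (std::uint32_t flags : {0x1u, 0x8u, 0xF0u, 0x80000000u, 0x12345678u})
  {
    // Leading zeros of the reversed word == trailing zeros of the original.
    assert(std::countl_zero(brev(flags)) == std::countr_zero(flags));
  }
}
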
cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh
@@ -49,6 +49,7 @@
 #include <cub/util_type.cuh>
 
 #include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__bit/countr.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 
 CUB_NAMESPACE_BEGIN
@@ -215,7 +216,7 @@ struct WarpReduceSmem
   SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/)
   {
     // Get the start flags for each thread in the warp.
-    int warp_flags = __ballot_sync(member_mask, flag);
+    unsigned warp_flags = __ballot_sync(member_mask, flag);
 
     if (!HEAD_SEGMENTED)
     {
@@ -232,7 +233,7 @@ struct WarpReduceSmem
     }
 
     // Find next flag
-    int next_flag = __clz(__brev(warp_flags));
+    int next_flag = ::cuda::std::countr_zero(warp_flags);
 
     // Clip the next segment at the warp boundary if necessary
     if (LOGICAL_WARP_THREADS != 32)
cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh
@@ -50,8 +50,8 @@
 
 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/std/__algorithm/clamp.h>
-#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/__bit/integral.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/is_integral.h>
@@ -630,7 +630,7 @@ struct WarpScanShfl
     ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();
 
     // Find index of first set bit
-    int segment_first_lane = ::cuda::std::max(0, 31 - __clz(ballot));
+    int segment_first_lane = ::cuda::std::__bit_log2(ballot);
 
     // Iterate scan steps
     _CCCL_PRAGMA_UNROLL_FULL()
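
For nonzero x, 31 - __clz(x) is the index of the most significant set bit, i.e. floor(log2 x), which is exactly what __bit_log2 computes, so the max(0, ...) clamp becomes redundant for nonzero ballots. A host-side check of the identity using std::bit_width (the test masks are arbitrary):

#include <bit>
#include <cassert>
#include <cstdint>

int main()
{
  for (std::uint32_t x : {0x1u, 0x2u, 0x0000FF00u, 0x80000000u, 0x12345678u})
  {
    // Device: 31 - __clz(x); host equivalent via countl_zero.
    int const msb_index = 31 - std::countl_zero(x);
    assert(msb_index == std::bit_width(x) - 1); // == floor(log2(x)) for x != 0
  }
}
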
cuda/cccl/headers/include/cub/warp/warp_load.cuh
@@ -191,8 +191,8 @@ enum WarpLoadAlgorithm
 //!
 //!    // Load a segment of consecutive items that are blocked across threads
 //!    int thread_data[items_per_thread];
-//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
-//!                                          thread_data);
+//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data);
+//!    }
 
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``.
 //! The set of ``thread_data`` across the first logical warp of threads in those
@@ -484,8 +484,8 @@ public:
 //!
 //!    // Load a segment of consecutive items that are blocked across threads
 //!    int thread_data[items_per_thread];
-//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
-//!                                          thread_data);
+//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data);
+//!    }
 
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``,
 //! The set of ``thread_data`` across the first logical warp of threads in those
@@ -533,9 +533,9 @@ public:
 //!
 //!    // Load a segment of consecutive items that are blocked across threads
 //!    int thread_data[items_per_thread];
-//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
-//!                                          thread_data,
+//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data,
 //!                                          valid_items);
+//!    }
 
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...`` and ``valid_items`` is ``5``.
 //! The set of ``thread_data`` across the first logical warp of threads in those threads will be:
cuda/cccl/headers/include/cub/warp/warp_reduce.cuh
@@ -105,6 +105,7 @@ CUB_NAMESPACE_BEGIN
 //!    // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
 //!    int warp_id = threadIdx.x / 32;
 //!    int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``.
 //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will be
@@ -130,6 +131,8 @@
 //!        int thread_data = ...
 //!        // Return the warp-wide sum to lane0
 //!        int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+//!      }
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the warp of threads is ``{0, 1, 2, 3, ..., 31}``.
 //! The corresponding output ``aggregate`` in thread0 will be ``496`` (and is undefined in other threads).
@@ -218,6 +221,7 @@ public:
 //!    // Return the warp-wide sums to each lane0
 //!    int warp_id = threadIdx.x / 32;
 //!    int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``.
 //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will ``496``, ``1520``, ``2544``, and
@@ -299,8 +303,8 @@ public:
 //!        thread_data = d_data[threadIdx.x];
 //!
 //!    // Return the warp-wide sums to each lane0
-//!    int aggregate = WarpReduce(temp_storage).Sum(
-//!        thread_data, valid_items);
+//!    int aggregate = WarpReduce(temp_storage).Sum(thread_data, valid_items);
+//!    }
 
 //! Suppose the input ``d_data`` is ``{0, 1, 2, 3, 4, ...`` and ``valid_items`` is ``4``.
 //! The corresponding output ``aggregate`` in *lane*\ :sub:`0` is ``6``
@@ -363,6 +367,7 @@ public:
 //!    // Return the warp-wide sums to each lane0
 //!    int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
 //!        thread_data, head_flag);
+//!    }
 
 //! Suppose the set of input ``thread_data`` and ``head_flag`` across the block of threads
 //! is ``{0, 1, 2, 3, ..., 31`` and is ``{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0``,
cuda/cccl/headers/include/cub/warp/warp_scan.cuh
@@ -114,6 +114,7 @@ CUB_NAMESPACE_BEGIN
 //!    // Compute warp-wide prefix sums
 //!    int warp_id = threadIdx.x / 32;
 //!    WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps of
@@ -143,6 +144,8 @@
 //!
 //!        // Compute warp-wide prefix sums
 //!        WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+//!      }
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the warp of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` will be
@@ -248,6 +251,7 @@ public:
 //!    // Compute inclusive warp-wide prefix sums
 //!    int warp_id = threadIdx.x / 32;
 //!    WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
@@ -294,9 +298,8 @@ public:
 //!    // Compute inclusive warp-wide prefix sums
 //!    int warp_aggregate;
 //!    int warp_id = threadIdx.x / 32;
-//!    WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data,
-//!                                                 thread_data,
-//!                                                 warp_aggregate);
+//!    WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
@@ -352,6 +355,7 @@ public:
 //!    // Compute exclusive warp-wide prefix sums
 //!    int warp_id = threadIdx.x / 32;
 //!    WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
cuda/cccl/headers/include/cub/warp/warp_store.cuh
@@ -201,6 +201,7 @@ enum WarpStoreAlgorithm
 //!
 //!    // Store items to linear memory
 //!    WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
+//!    }
 
 //! Suppose the set of ``thread_data`` across the warp threads is
 //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.
cuda/cccl/headers/include/cuda/__algorithm/common.h
@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_COMMON
 #define __CUDA___ALGORITHM_COMMON
 
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
cuda/cccl/headers/include/cuda/__algorithm/copy.h
@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_COPY_H
 #define __CUDA___ALGORITHM_COPY_H
 
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
cuda/cccl/headers/include/cuda/__algorithm/fill.h
@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_FILL
 #define __CUDA___ALGORITHM_FILL
 
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h
@@ -26,6 +26,7 @@
 #if _CCCL_CUDA_COMPILATION()
 #  include <cuda/__ptx/instructions/get_sreg.h>
 #  include <cuda/__ptx/instructions/mbarrier_arrive.h>
+#  include <cuda/__ptx/instructions/mbarrier_wait.h>
 #  include <cuda/__ptx/ptx_dot_variants.h>
 #  include <cuda/__ptx/ptx_helper_functions.h>
 #endif // _CCCL_CUDA_COMPILATION()
@@ -381,12 +382,30 @@ private:
 public:
   _CCCL_API inline void wait(arrival_token&& __phase) const
   {
+    // no need to back off on a barrier in SMEM on SM90+, SYNCS unit is taking care of this
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+                 (if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
+                   while (!::cuda::ptx::mbarrier_try_wait(
+                     reinterpret_cast<uint64_t*>(const_cast<__barrier_base*>(&__barrier)), __phase))
+                     ;
+                   return;
+                 }))
+    // fallback implementation
     ::cuda::std::__cccl_thread_poll_with_backoff(
       ::cuda::std::__barrier_poll_tester_phase<barrier>(this, ::cuda::std::move(__phase)));
   }
 
   _CCCL_API inline void wait_parity(bool __phase_parity) const
   {
+    // no need to back off on a barrier in SMEM on SM90+, SYNCS unit is taking care of this
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+                 (if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
+                   while (!::cuda::ptx::mbarrier_try_wait_parity(
+                     reinterpret_cast<uint64_t*>(const_cast<__barrier_base*>(&__barrier)), __phase_parity))
+                     ;
+                   return;
+                 }))
+    // fallback implementation
     ::cuda::std::__cccl_thread_poll_with_backoff(
       ::cuda::std::__barrier_poll_tester_parity<barrier>(this, __phase_parity));
   }
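
The new SM90+ fast path only changes how wait() and wait_parity() poll a shared-memory barrier; the public cuda::barrier interface is unchanged. A minimal usage sketch (the kernel is illustrative, not library code):

#include <cuda/barrier>
#include <cuda/std/utility>

__global__ void barrier_demo()
{
  // A block-scoped barrier in shared memory: the case the mbarrier.try_wait
  // path above accelerates.
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x); // found by ADL; sets the expected arrival count
  }
  __syncthreads();

  auto token = bar.arrive();        // arrive at the current phase
  bar.wait(cuda::std::move(token)); // polls; no backoff needed on SM90+
}
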
cuda/cccl/headers/include/cuda/__cccl_config
@@ -23,6 +23,7 @@
 #include <cuda/std/__cccl/exceptions.h> // IWYU pragma: export
 #include <cuda/std/__cccl/execution_space.h> // IWYU pragma: export
 #include <cuda/std/__cccl/extended_data_types.h> // IWYU pragma: export
+#include <cuda/std/__cccl/host_std_lib.h> // IWYU pragma: export
 #include <cuda/std/__cccl/os.h> // IWYU pragma: export
 #include <cuda/std/__cccl/preprocessor.h> // IWYU pragma: export
 #include <cuda/std/__cccl/ptx_isa.h> // IWYU pragma: export