cuda-cccl 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.

Potentially problematic release: this version of cuda-cccl has been flagged as possibly problematic.

Files changed (185)
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -14,19 +14,17 @@
  #endif // no system header

  #include <cub/agent/agent_for.cuh>
- #include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
  #include <cub/detail/mdspan_utils.cuh> // is_sub_size_static
  #include <cub/detail/type_traits.cuh> // implicit_prom_t

- #include <cuda/std/__fwd/span.h>
  #include <cuda/std/__type_traits/enable_if.h>
  #include <cuda/std/__type_traits/integral_constant.h>
  #include <cuda/std/__type_traits/is_convertible.h>
  #include <cuda/std/__type_traits/is_reference.h>
  #include <cuda/std/__type_traits/is_trivially_constructible.h>
- #include <cuda/std/__type_traits/is_trivially_copy_constructible.h>
+ #include <cuda/std/__type_traits/is_trivially_copy_assignable.h>
  #include <cuda/std/__type_traits/is_trivially_destructible.h>
- #include <cuda/std/__type_traits/is_trivially_move_constructible.h>
+ #include <cuda/std/__type_traits/is_trivially_move_assignable.h>
  #include <cuda/std/__type_traits/make_unsigned.h>
  #include <cuda/std/__utility/integer_sequence.h>
  #include <cuda/std/cstddef> // size_t
@@ -140,16 +138,21 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
  * ForEachInExtents
  **********************************************************************************************************************/

- // Returns the extent at the given rank. If the extents is static, returns it, otherwise returns the precomputed value
- template <int Rank, typename ExtentType, typename FastDivModType>
- _CCCL_DEVICE _CCCL_FORCEINLINE auto extent_at(ExtentType extents, FastDivModType dynamic_extent)
+ // Retrieves the extent (dimension size) at a specific position in a multi-dimensional array
+ //
+ // This function efficiently returns the extent at the given position, optimizing for static extents by returning
+ // compile-time constants when possible. For dynamic extents, it returns the precomputed value to avoid runtime
+ // computation overhead.
+ template <int Position, typename ExtentType, typename FastDivModType>
+ _CCCL_DEVICE_API auto extent_at(ExtentType extents, FastDivModType dynamic_extent)
  {
- if constexpr (ExtentType::static_extent(Rank) != ::cuda::std::dynamic_extent)
+ if constexpr (ExtentType::static_extent(Position) != ::cuda::std::dynamic_extent)
  {
  using extent_index_type = typename ExtentType::index_type;
  using index_type = implicit_prom_t<extent_index_type>;
  using unsigned_index_type = ::cuda::std::make_unsigned_t<index_type>;
- return static_cast<unsigned_index_type>(extents.static_extent(Rank));
+ constexpr auto extent = extents.static_extent(Position);
+ return static_cast<unsigned_index_type>(extent);
  }
  else
  {
@@ -157,17 +160,22 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto extent_at(ExtentType extents, FastDivModType
  }
  }

- // Returns the product of all extents from position Rank. If the result is static, returns it, otherwise returns the
- // precomputed value
- template <int Rank, typename ExtentType, typename FastDivModType>
- _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtentType extents, FastDivModType extent_sub_size)
+ // Computes the product of extents in a specified range for multi-dimensional indexing.
+ // This function calculates the product of all extent dimensions from Start (inclusive) to End (exclusive).
+ //
+ // Performance characteristics:
+ // - Static extents in range: Product computed at compile-time, zero runtime cost
+ // - Dynamic extents present: Returns precomputed value, avoiding runtime multiplication
+ template <int Start, int End, typename ExtentType, typename FastDivModType>
+ _CCCL_DEVICE_API auto get_extents_sub_size(ExtentType extents, FastDivModType extent_sub_size)
  {
- if constexpr (cub::detail::is_sub_size_static<Rank + 1, ExtentType>())
+ if constexpr (cub::detail::are_extents_in_range_static<ExtentType>(Start, End))
  {
  using extent_index_type = typename ExtentType::index_type;
  using index_type = implicit_prom_t<extent_index_type>;
  using unsigned_index_type = ::cuda::std::make_unsigned_t<index_type>;
- return static_cast<unsigned_index_type>(cub::detail::sub_size<Rank + 1>(extents));
+ auto sub_size = cub::detail::size_range(extents, Start, End);
+ return static_cast<unsigned_index_type>(sub_size);
  }
  else
  {
@@ -175,49 +183,76 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtentType extents, Fas
  }
  }

- template <int Rank, typename IndexType, typename ExtentType, typename FastDivModType>
- _CCCL_DEVICE _CCCL_FORCEINLINE auto
+ // Converts a linear index to a multi-dimensional coordinate at a specific position.
+ //
+ // This function performs the mathematical conversion from a linear (flat) index to the coordinate value at a specific
+ // position in a multi-dimensional array. It supports both row-major (layout_right) and column-major (layout_left)
+ // memory layouts, which affects the indexing calculation order.
+ //
+ // The mathematical formulation depends on the layout:
+ // - Right layout (row-major): index_i = (index / product(extent[j] for j in [i+1, rank-1])) % extent[i]
+ // - Left layout (column-major): index_i = (index / product(extent[j] for j in [0, i])) % extent[i]
+ //
+ // This function leverages precomputed fast division and modulo operations to minimize runtime arithmetic overhead.
+ template <bool IsLayoutRight, int Position, typename IndexType, typename ExtentType, typename FastDivModType>
+ _CCCL_DEVICE_API auto
  coordinate_at(IndexType index, ExtentType extents, FastDivModType extent_sub_size, FastDivModType dynamic_extent)
  {
  using cub::detail::for_each::extent_at;
  using cub::detail::for_each::get_extents_sub_size;
  using extent_index_type = typename ExtentType::index_type;
- return static_cast<extent_index_type>(
- (index / get_extents_sub_size<Rank>(extents, extent_sub_size)) % extent_at<Rank>(extents, dynamic_extent));
+ constexpr auto start = IsLayoutRight ? Position + 1 : 0;
+ constexpr auto end = IsLayoutRight ? ExtentType::rank() : Position;
+ return static_cast<extent_index_type>((index / get_extents_sub_size<start, end>(extents, extent_sub_size))
+ % extent_at<Position>(extents, dynamic_extent));
  }

- template <typename OpT, typename ExtentsT, typename FastDivModArrayT>
+ // Function object wrapper for applying operations with multi-dimensional coordinate conversion.
+ //
+ // The wrapped operation will be called with signature: `op(linear_index, coord_0, coord_1, ..., coord_n)`
+ // where the number of coordinate parameters matches the rank of the extents object.
+ //
+ // This wrapper is used internally by DeviceFor::ForEachInLayout/ForEachInExtents
+ template <typename OpT, typename ExtentsType, bool IsLayoutRight, typename FastDivModArrayT>
  struct op_wrapper_extents_t
  {
- OpT op;
- ExtentsT extents;
- FastDivModArrayT sub_sizes_div_array;
- FastDivModArrayT extents_mod_array;
-
- template <typename OffsetT, size_t... Ranks>
- _CCCL_DEVICE _CCCL_FORCEINLINE void impl(OffsetT i, ::cuda::std::index_sequence<Ranks...>)
+ OpT op; ///< The user-provided operation to be called with coordinates
+ ExtentsType extents; ///< The multi-dimensional extents defining array dimensions
+ FastDivModArrayT sub_sizes_div_array; ///< Precomputed fast division values for extent sub-products
+ FastDivModArrayT extents_mod_array; ///< Precomputed fast modulo values for individual extents
+
+ // Internal implementation that converts linear index to coordinates and calls the user operation
+ template <typename IndexType, size_t... Positions>
+ _CCCL_DEVICE_API void impl(IndexType i, ::cuda::std::index_sequence<Positions...>)
  {
  using cub::detail::for_each::coordinate_at;
- op(i, coordinate_at<Ranks>(i, extents, sub_sizes_div_array[Ranks], extents_mod_array[Ranks])...);
+ op(i,
+ coordinate_at<IsLayoutRight, Positions>(
+ i, extents, sub_sizes_div_array[Positions], extents_mod_array[Positions])...);
  }

- template <typename OffsetT, size_t... Ranks>
- _CCCL_DEVICE _CCCL_FORCEINLINE void impl(OffsetT i, ::cuda::std::index_sequence<Ranks...>) const
+ // Internal implementation that converts linear index to coordinates and calls the user operation
+ template <typename IndexType, size_t... Positions>
+ _CCCL_DEVICE_API void impl(IndexType i, ::cuda::std::index_sequence<Positions...>) const
  {
  using cub::detail::for_each::coordinate_at;
- op(i, coordinate_at<Ranks>(i, extents, sub_sizes_div_array[Ranks], extents_mod_array[Ranks])...);
+ op(i,
+ coordinate_at<IsLayoutRight, Positions>(
+ i, extents, sub_sizes_div_array[Positions], extents_mod_array[Positions])...);
  }

- template <typename OffsetT>
- _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i)
+ // Function call operator that processes a linear index by converting it to multi-dimensional coordinates
+ template <typename IndexType>
+ _CCCL_DEVICE_API void operator()(IndexType i)
  {
- impl(i, ::cuda::std::make_index_sequence<ExtentsT::rank()>{});
+ impl(i, ::cuda::std::make_index_sequence<ExtentsType::rank()>{});
  }

- template <typename OffsetT>
- _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i) const
+ // Function call operator that processes a linear index by converting it to multi-dimensional coordinates
+ template <typename IndexType>
+ _CCCL_DEVICE_API void operator()(IndexType i) const
  {
- impl(i, ::cuda::std::make_index_sequence<ExtentsT::rank()>{});
+ impl(i, ::cuda::std::make_index_sequence<ExtentsType::rank()>{});
  }
  };

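The row-major rule quoted in the coordinate_at comment above can be checked with a small host-side sketch; the helper name decompose_2d and the 4 x 5 extents are illustrative only and are not part of CUB:

  #include <array>

  // layout_right (row-major): coord_i = (index / product(extent[j] for j > i)) % extent[i]
  constexpr std::array<int, 2> decompose_2d(int index, int extent0, int extent1)
  {
    const int coord0 = (index / extent1) % extent0; // stride of dimension 0 is extent1
    const int coord1 = index % extent1;             // stride of dimension 1 is 1
    return {coord0, coord1};
  }

  // Linear index 13 in a 4 x 5 extent is (2, 3), since 13 == 2 * 5 + 3.
  static_assert(decompose_2d(13, 4, 5)[0] == 2);
  static_assert(decompose_2d(13, 4, 5)[1] == 3);

The wrapper above performs the same per-rank decomposition on the device, but replaces the divisions and modulos with the precomputed fast division/modulo values.
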
@@ -217,6 +217,7 @@ _CCCL_DEVICE void transform_kernel_vectorized(
  {
  constexpr int block_dim = VectorizedPolicy::block_threads;
  constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
+ constexpr int vec_size = VectorizedPolicy::vec_size;
  _CCCL_ASSERT(!can_vectorize || (items_per_thread == num_elem_per_thread_prefetch), "");
  constexpr int tile_size = block_dim * items_per_thread;
  const Offset offset = static_cast<Offset>(blockIdx.x) * tile_size;
@@ -241,23 +242,13 @@ _CCCL_DEVICE void transform_kernel_vectorized(
  out += offset;
  }

- constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
- using load_store_t = decltype(load_store_type<load_store_size>());
- using output_t = it_value_t<RandomAccessIteratorOut>;
+ using output_t = it_value_t<RandomAccessIteratorOut>;
  using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
- // picks output type size if there are no inputs
- constexpr int element_size = int{first_nonzero_value(
- (sizeof(it_value_t<RandomAccessIteratorsIn>)
- * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
- size_of<output_t>)};
- constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;
+ constexpr int load_store_count = items_per_thread / vec_size;
+ static_assert(items_per_thread % vec_size == 0, "The items per thread must be a multiple of the vector size");

- static_assert((items_per_thread * element_size) % load_store_size == 0);
- static_assert(load_store_size % element_size == 0);
-
- constexpr bool can_vectorize_store =
- THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
- && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && size_of<output_t> == element_size;
+ constexpr bool can_vectorize_store = THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
+ && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t>;

  // if we can vectorize, we convert f's return type to the output type right away, so we can reinterpret later
  using THRUST_NS_QUALIFIER::cuda_cub::core::detail::uninitialized_array;
@@ -266,10 +257,15 @@ _CCCL_DEVICE void transform_kernel_vectorized(
  auto provide_array = [&](auto... inputs) {
  // load inputs
  [[maybe_unused]] auto load_tile = [](auto in, auto& input) {
+ using it_t = decltype(in);
+ using value_t = it_value_t<it_t>;
  if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
  {
- auto in_vec = reinterpret_cast<const load_store_t*>(in) + threadIdx.x;
- auto input_vec = reinterpret_cast<load_store_t*>(input.data());
+ // TODO(bgruber): we could add a max_load_store_size to the policy to avoid huge load types and huge alignment
+ // requirements
+ using load_t = decltype(load_store_type<sizeof(value_t) * vec_size>());
+ auto in_vec = reinterpret_cast<const load_t*>(in) + threadIdx.x;
+ auto input_vec = reinterpret_cast<load_t*>(input.data());
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < load_store_count; ++i)
  {
@@ -278,15 +274,14 @@ _CCCL_DEVICE void transform_kernel_vectorized(
  }
  else
  {
- constexpr int elems = load_store_size / element_size;
- in += threadIdx.x * elems;
+ in += threadIdx.x * vec_size;
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < load_store_count; ++i)
  {
  _CCCL_PRAGMA_UNROLL_FULL()
- for (int j = 0; j < elems; ++j)
+ for (int j = 0; j < vec_size; ++j)
  {
- input[i * elems + j] = in[i * elems * VectorizedPolicy::block_threads + j];
+ input[i * vec_size + j] = in[i * vec_size * VectorizedPolicy::block_threads + j];
  }
  }
  }
@@ -310,8 +305,9 @@ _CCCL_DEVICE void transform_kernel_vectorized(
  if constexpr (can_vectorize_store)
  {
  // vector path
- auto output_vec = reinterpret_cast<const load_store_t*>(output.data());
- auto out_vec = reinterpret_cast<load_store_t*>(out) + threadIdx.x;
+ using store_t = decltype(load_store_type<sizeof(output_t) * vec_size>());
+ auto output_vec = reinterpret_cast<const store_t*>(output.data());
+ auto out_vec = reinterpret_cast<store_t*>(out) + threadIdx.x;
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < load_store_count; ++i)
  {
@@ -321,15 +317,14 @@ _CCCL_DEVICE void transform_kernel_vectorized(
  else
  {
  // serial path
- constexpr int elems = load_store_size / element_size;
- out += threadIdx.x * elems;
+ out += threadIdx.x * vec_size;
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < load_store_count; ++i)
  {
  _CCCL_PRAGMA_UNROLL_FULL()
- for (int j = 0; j < elems; ++j)
+ for (int j = 0; j < vec_size; ++j)
  {
- out[i * elems * VectorizedPolicy::block_threads + j] = output[i * elems + j];
+ out[i * vec_size * VectorizedPolicy::block_threads + j] = output[i * vec_size + j];
  }
  }
  }
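To keep the kernel change above readable in isolation, here is a standalone sketch of the underlying idea: vec_size contiguous elements are moved through one wider, suitably aligned word. The word16 type and copy_vec4 helper are illustrative stand-ins only (the kernel instead reinterprets aligned pointers through CUB's internal load_store_type helper):

  #include <cstring>

  struct alignas(16) word16
  {
    unsigned char bytes[16];
  };

  // Move vec_size == 4 floats (16 bytes) with one wide load and one wide store.
  inline void copy_vec4(const float* in, float* out)
  {
    word16 w;
    std::memcpy(&w, in, sizeof(w));  // one 16-byte load
    std::memcpy(out, &w, sizeof(w)); // one 16-byte store
  }

With the new policy, the width of that word is sizeof(value_t) * vec_size per iterator rather than a single fixed load_store_word_size.
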
@@ -113,11 +113,11 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
  (max_items_per_thread, MaxItemsPerThread, int),
  (not_a_vectorized_policy, NotAVectorizedPolicy, int) ) // TODO: remove with C++20

- template <int BlockThreads, int ItemsPerThread, int LoadStoreWordSize>
- struct vectorized_policy_t : prefetch_policy_t<BlockThreads>
+ template <typename Tuning>
+ struct vectorized_policy_t : prefetch_policy_t<Tuning::block_threads>
  {
- static constexpr int items_per_thread_vectorized = ItemsPerThread;
- static constexpr int load_store_word_size = LoadStoreWordSize;
+ static constexpr int items_per_thread_vectorized = Tuning::items_per_thread;
+ static constexpr int vec_size = Tuning::vec_size;

  using not_a_vectorized_policy = void; // TODO: remove with C++20, shadows the variable in prefetch_policy_t
  };
@@ -130,7 +130,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
  (min_items_per_thread, MinItemsPerThread, int),
  (max_items_per_thread, MaxItemsPerThread, int),
  (items_per_thread_vectorized, ItemsPerThreadVectorized, int),
- (load_store_word_size, LoadStoreWordSize, int) )
+ (vec_size, VecSize, int) )

  template <int BlockThreads, int BulkCopyAlignment>
  struct async_copy_policy_t
@@ -282,47 +282,6 @@ _CCCL_HOST_DEVICE constexpr int arch_to_min_bytes_in_flight(int sm_arch)
  return 12 * 1024; // V100 and below
  }

- template <typename H, typename... Ts>
- _CCCL_HOST_DEVICE constexpr bool all_nonzero_equal(H head, Ts... values)
- {
- size_t first = 0;
- for (size_t v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
- {
- if (v == 0)
- {
- continue;
- }
- if (first == 0)
- {
- first = v;
- }
- else if (v != first)
- {
- return false;
- }
- }
- return true;
- }
-
- _CCCL_HOST_DEVICE constexpr bool all_nonzero_equal()
- {
- return true;
- }
-
- template <typename H, typename... Ts>
- _CCCL_HOST_DEVICE constexpr auto first_nonzero_value(H head, Ts... values)
- {
- for (auto v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
- {
- if (v != 0)
- {
- return v;
- }
- }
- // we only reach here when all input are not contiguous and the output has a void value type
- return H{1};
- }
-
  template <typename T>
  inline constexpr size_t size_of = sizeof(T);

@@ -337,6 +296,47 @@ _CCCL_HOST_DEVICE static constexpr auto make_sizes_alignments()
  {{sizeof(it_value_t<RandomAccessIteratorsIn>), alignof(it_value_t<RandomAccessIteratorsIn>)}...}};
  }

+ template <int PtxVersion, int StoreSize, int... LoadSizes>
+ struct tuning_vec
+ {
+ // defaults from fill on RTX 5090, but can be changed
+ static constexpr int block_threads = 256;
+ static constexpr int vec_size = 4;
+ static constexpr int items_per_thread = 8;
+ };
+
+ // manually tuned fill on A100
+ template <int StoreSize>
+ struct tuning_vec<800, StoreSize>
+ {
+ static constexpr int block_threads = 256;
+ static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
+ static constexpr int items_per_thread = 8;
+ };
+
+ // manually tuned fill on H200
+ template <int StoreSize>
+ struct tuning_vec<900, StoreSize>
+ {
+ static constexpr int block_threads = StoreSize > 4 ? 128 : 256;
+ static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
+ static constexpr int items_per_thread = 16;
+ };
+
+ // manually tuned fill on B200, same as H200
+ template <int StoreSize>
+ struct tuning_vec<1000, StoreSize> : tuning_vec<900, StoreSize>
+ {};
+
+ // manually tuned fill on RTX 5090
+ template <int StoreSize>
+ struct tuning_vec<1200, StoreSize>
+ {
+ static constexpr int block_threads = 256;
+ static constexpr int vec_size = 4;
+ static constexpr int items_per_thread = 8;
+ };
+
  template <bool RequiresStableAddress,
  bool DenseOutput,
  typename RandomAccessIteratorTupleIn,
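To make the tuning arithmetic concrete: the SM80/SM90 specializations above choose vec_size = max(8 / StoreSize, 1), i.e. each vectorized access targets 8 bytes (a 64-bit instruction). A hypothetical helper spelling that out, shown only for illustration:

  // Mirrors vec_size = ::cuda::std::max(8 / StoreSize, 1) from the tuning structs above.
  constexpr int sm80_style_vec_size(int store_size)
  {
    const int v = 8 / store_size;
    return v > 0 ? v : 1;
  }

  static_assert(sm80_style_vec_size(1) == 8);  // 8 one-byte elements per access
  static_assert(sm80_style_vec_size(4) == 2);  // 2 four-byte elements per access
  static_assert(sm80_style_vec_size(8) == 1);  // one 8-byte element per access
  static_assert(sm80_style_vec_size(16) == 1); // clamped to at least one element
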
@@ -367,29 +367,12 @@ struct policy_hub<RequiresStableAddress,
  || THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>>)
  && ...);

- // for vectorized policy:
- static constexpr bool all_contiguous_input_values_same_size = all_nonzero_equal(
- (sizeof(it_value_t<RandomAccessIteratorsIn>)
- * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...);
- static constexpr int load_store_word_size = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
- // find the value type size of the first contiguous iterator. if there are no inputs, we take the size of the output
- // value type
- static constexpr int contiguous_value_type_size = first_nonzero_value(
- (int{sizeof(it_value_t<RandomAccessIteratorsIn>)}
- * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
- int{size_of<it_value_t<RandomAccessIteratorOut>>});
- static constexpr bool value_type_divides_load_store_size =
- load_store_word_size % contiguous_value_type_size == 0; // implicitly checks that value_type_size <=
- // load_store_word_size
- static constexpr int target_bytes_per_thread =
- no_input_streams ? 16 /* by experiment on RTX 5090 */ : 32 /* guestimate by gevtushenko for loading */;
- static constexpr int items_per_thread_vec =
- ::cuda::round_up(target_bytes_per_thread, load_store_word_size) / contiguous_value_type_size;
- using default_vectorized_policy_t = vectorized_policy_t<256, items_per_thread_vec, load_store_word_size>;
+ static constexpr bool all_value_types_have_power_of_two_size =
+ (::cuda::is_power_of_two(sizeof(it_value_t<RandomAccessIteratorsIn>)) && ...)
+ && ::cuda::is_power_of_two(size_of<it_value_t<RandomAccessIteratorOut>>);

  static constexpr bool fallback_to_prefetch =
- RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_contiguous_input_values_same_size
- || !value_type_divides_load_store_size || !DenseOutput;
+ RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_value_types_have_power_of_two_size || !DenseOutput;

  // TODO(bgruber): consider a separate kernel for just filling

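The rewritten fallback above only asks that every input and output value type have a power-of-two size. A minimal stand-in for the ::cuda::is_power_of_two check used there, shown for illustration:

  #include <cstddef>

  // A size is a power of two iff it is non-zero and has exactly one bit set.
  constexpr bool is_pow2(std::size_t n)
  {
    return n != 0 && (n & (n - 1)) == 0;
  }

  static_assert(is_pow2(sizeof(float))); // 4-byte types remain eligible for the vectorized policy
  static_assert(!is_pow2(12));           // a 12-byte value type forces fallback_to_prefetch
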
@@ -398,12 +381,16 @@ struct policy_hub<RequiresStableAddress,
  static constexpr int min_bif = arch_to_min_bytes_in_flight(300);
  // TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
  static constexpr auto algorithm = fallback_to_prefetch ? Algorithm::prefetch : Algorithm::vectorized;
- using algo_policy = ::cuda::std::_If<fallback_to_prefetch, prefetch_policy_t<256>, default_vectorized_policy_t>;
+ using vec_policy_t = vectorized_policy_t<
+ tuning_vec<500, size_of<it_value_t<RandomAccessIteratorOut>>, sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
+ using algo_policy = ::cuda::std::_If<fallback_to_prefetch, prefetch_policy_t<256>, vec_policy_t>;
  };

  struct policy800 : ChainedPolicy<800, policy800, policy300>
  {
  private:
+ using vec_policy_t = vectorized_policy_t<
+ tuning_vec<800, size_of<it_value_t<RandomAccessIteratorOut>>, sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
  static constexpr int block_threads = 256;
  using async_policy = async_copy_policy_t<block_threads, ldgsts_size_and_align>;
  // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not
@@ -427,13 +414,17 @@ struct policy_hub<RequiresStableAddress,
  using algo_policy =
  ::cuda::std::_If<fallback_to_prefetch,
  prefetch_policy_t<block_threads>,
- ::cuda::std::_If<fallback_to_vectorized, default_vectorized_policy_t, async_policy>>;
+ ::cuda::std::_If<fallback_to_vectorized, vec_policy_t, async_policy>>;
  };

  template <int AsyncBlockSize, int PtxVersion>
  struct bulk_copy_policy_base
  {
  private:
+ using vec_policy_t =
+ vectorized_policy_t<tuning_vec<PtxVersion,
+ size_of<it_value_t<RandomAccessIteratorOut>>,
+ sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
  static constexpr int alignment = bulk_copy_alignment(PtxVersion);
  using async_policy = async_copy_policy_t<AsyncBlockSize, alignment>;
  // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not
@@ -469,7 +460,7 @@ struct policy_hub<RequiresStableAddress,
  using algo_policy =
  ::cuda::std::_If<fallback_to_prefetch,
  prefetch_policy_t<256>,
- ::cuda::std::_If<fallback_to_vectorized, default_vectorized_policy_t, async_policy>>;
+ ::cuda::std::_If<fallback_to_vectorized, vec_policy_t, async_policy>>;
  };

  struct policy900
@@ -136,6 +136,7 @@ CUB_NAMESPACE_BEGIN
  //! {
  //! int array[4] = {1, 2, 3, 4};
  //! int sum = cub::ThreadReduce(array, ::cuda::std::plus<>{}); // sum = 10
+ //! }
  //!
  //! @endrst
  //!
@@ -437,10 +438,13 @@ template <typename Input, typename ReductionOp, typename ValueT, typename AccumT
  "Input must support the subscript operator[] and have a compile-time size");
  static_assert(has_binary_call_operator<ReductionOp, ValueT>::value,
  "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
- if constexpr (static_size_v<Input> == 1)
+
+ static constexpr auto length = static_size_v<Input>;
+ if constexpr (length == 1)
  {
  return static_cast<AccumT>(input[0]);
  }
+
  using PromT = ::cuda::std::_If<enable_min_max_promotion_v<ReductionOp, ValueT>, int, AccumT>;
  // TODO: should be part of the tuning policy
  if constexpr ((!is_simd_enabled_cuda_operator<ReductionOp, ValueT> && !is_simd_operator_v<ReductionOp>)
@@ -449,38 +453,41 @@ template <typename Input, typename ReductionOp, typename ValueT, typename AccumT
  return ThreadReduceSequential<AccumT>(input, reduction_op);
  }

- constexpr auto length = static_size_v<Input>;
- if constexpr (::cuda::std::is_same_v<Input, AccumT> && enable_sm90_simd_reduction_v<Input, ReductionOp, length>)
+ if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm90_simd_reduction_v<ValueT, ReductionOp, length>)
  {
  NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSimd(input, reduction_op);))
  }

- if constexpr (::cuda::std::is_same_v<Input, AccumT> && enable_sm80_simd_reduction_v<Input, ReductionOp, length>)
+ if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm80_simd_reduction_v<ValueT, ReductionOp, length>)
  {
  NV_IF_TARGET(NV_PROVIDES_SM_80, (return ThreadReduceSimd(input, reduction_op);))
  }

- if constexpr (::cuda::std::is_same_v<Input, AccumT> && enable_sm70_simd_reduction_v<Input, ReductionOp, length>)
+ if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm70_simd_reduction_v<ValueT, ReductionOp, length>)
  {
  NV_IF_TARGET(NV_PROVIDES_SM_70, (return ThreadReduceSimd(input, reduction_op);))
  }

- if constexpr (enable_ternary_reduction_sm90_v<Input, ReductionOp>)
+ if constexpr (length >= 6)
  {
- // with the current tuning policies, SM90/int32/+ uses too many registers (TODO: fix tuning policy)
- if constexpr ((is_one_of_v<ReductionOp, ::cuda::std::plus<>, ::cuda::std::plus<PromT>>
- && is_one_of_v<PromT, int32_t, uint32_t>)
- // the compiler generates bad code for int8/uint8 and min/max for SM90
- || (is_cuda_minimum_maximum_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
+ // apply SM90 min/max ternary reduction only if the input is natively int32/uint32
+ if constexpr (enable_ternary_reduction_sm90_v<ValueT, ReductionOp>)
  {
- NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+ // with the current tuning policies, SM90/int32/+ uses too many registers (TODO: fix tuning policy)
+ if constexpr ((is_one_of_v<ReductionOp, ::cuda::std::plus<>, ::cuda::std::plus<PromT>>
+ && is_one_of_v<PromT, int32_t, uint32_t>)
+ // the compiler generates bad code for int8/uint8 and min/max for SM90
+ || (is_cuda_minimum_maximum_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
+ {
+ NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+ }
+ NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
  }
- NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
- }

- if constexpr (enable_ternary_reduction_sm50_v<Input, ReductionOp>)
- {
- NV_IF_TARGET(NV_PROVIDES_SM_50, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+ if constexpr (enable_ternary_reduction_sm50_v<ValueT, ReductionOp>)
+ {
+ NV_IF_TARGET(NV_PROVIDES_SM_50, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+ }
  }

  return ThreadReduceBinaryTree<PromT>(input, reduction_op);
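For context, the public entry point served by this dispatch is the one shown in the thread_reduce.cuh documentation hunk above; a minimal device-side usage sketch (the kernel name is illustrative):

  #include <cub/thread/thread_reduce.cuh>
  #include <cuda/std/functional>

  __global__ void thread_reduce_example(int* out)
  {
    int array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    // The dispatch above picks a sequential, SIMD, ternary-tree, or binary-tree
    // strategy based on the value type, the operator, and the array length.
    out[threadIdx.x] = cub::ThreadReduce(array, ::cuda::std::plus<>{}); // 36
  }
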
@@ -191,8 +191,8 @@ enum WarpLoadAlgorithm
  //!
  //! // Load a segment of consecutive items that are blocked across threads
  //! int thread_data[items_per_thread];
- //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
- //! thread_data);
+ //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data);
+ //! }
  //!
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``.
  //! The set of ``thread_data`` across the first logical warp of threads in those
@@ -484,8 +484,8 @@ public:
  //!
  //! // Load a segment of consecutive items that are blocked across threads
  //! int thread_data[items_per_thread];
- //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
- //! thread_data);
+ //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data);
+ //! }
  //!
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``,
  //! The set of ``thread_data`` across the first logical warp of threads in those
@@ -533,9 +533,9 @@ public:
  //!
  //! // Load a segment of consecutive items that are blocked across threads
  //! int thread_data[items_per_thread];
- //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
- //! thread_data,
+ //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data,
  //! valid_items);
+ //! }
  //!
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...`` and ``valid_items`` is ``5``.
  //! The set of ``thread_data`` across the first logical warp of threads in those threads will be:
@@ -105,6 +105,7 @@ CUB_NAMESPACE_BEGIN
  //! // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
  //! int warp_id = threadIdx.x / 32;
  //! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``.
  //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will be
@@ -130,6 +131,8 @@ CUB_NAMESPACE_BEGIN
  //! int thread_data = ...
  //! // Return the warp-wide sum to lane0
  //! int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+ //! }
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the warp of threads is ``{0, 1, 2, 3, ..., 31}``.
  //! The corresponding output ``aggregate`` in thread0 will be ``496`` (and is undefined in other threads).
@@ -218,6 +221,7 @@ public:
  //! // Return the warp-wide sums to each lane0
  //! int warp_id = threadIdx.x / 32;
  //! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``.
  //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will ``496``, ``1520``, ``2544``, and
@@ -299,8 +303,8 @@ public:
  //! thread_data = d_data[threadIdx.x];
  //!
  //! // Return the warp-wide sums to each lane0
- //! int aggregate = WarpReduce(temp_storage).Sum(
- //! thread_data, valid_items);
+ //! int aggregate = WarpReduce(temp_storage).Sum(thread_data, valid_items);
+ //! }
  //!
  //! Suppose the input ``d_data`` is ``{0, 1, 2, 3, 4, ...`` and ``valid_items`` is ``4``.
  //! The corresponding output ``aggregate`` in *lane*\ :sub:`0` is ``6``
@@ -363,6 +367,7 @@ public:
  //! // Return the warp-wide sums to each lane0
  //! int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
  //! thread_data, head_flag);
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` and ``head_flag`` across the block of threads
  //! is ``{0, 1, 2, 3, ..., 31`` and is ``{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0``,