cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cuda-cccl might be problematic.
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +3 -3
- cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
- cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
- cuda/cccl/headers/include/cuda/__device/device_ref.h +3 -10
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +225 -33
- cuda/cccl/headers/include/cuda/__event/event.h +7 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +3 -4
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +4 -0
- cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
- cuda/cccl/parallel/experimental/_bindings_impl.pyx +140 -0
- cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +59 -57
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh:

@@ -172,6 +172,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
     AccumT,
     TransformOpT>;
 
+  static_assert(sizeof(typename AgentReduceT::TempStorage) <= max_smem_per_block,
+                "cub::DeviceReduce ran out of CUDA shared memory, which we judged to be extremely unlikely. Please "
+                "file an issue at: https://github.com/NVIDIA/cccl/issues");
+
   // Shared memory storage
   __shared__ typename AgentReduceT::TempStorage temp_storage;
 
@@ -253,6 +257,10 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
     AccumT,
     TransformOpT>;
 
+  static_assert(sizeof(typename AgentReduceT::TempStorage) <= max_smem_per_block,
+                "cub::DeviceReduce ran out of CUDA shared memory, which we judged to be extremely unlikely. Please "
+                "file an issue at: https://github.com/NVIDIA/cccl/issues");
+
   // Shared memory storage
   __shared__ typename AgentReduceT::TempStorage temp_storage;
 
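Both hunks add the same compile-time guard: if a tuning policy ever makes AgentReduceT::TempStorage larger than the static shared-memory budget, the build fails with a pointed message instead of a silent launch failure. A minimal self-contained sketch of the pattern (all names below are placeholders, not CUB's; the 48 KiB budget is an assumption matching the classic per-block static limit):

#include <cstddef>

// Hypothetical per-block scratch space for some kernel.
struct TempStorage
{
  float partials[256];
};

// Assumed budget: 48 KiB is the classic static shared-memory limit per block.
constexpr std::size_t max_smem_per_block = 48 * 1024;

// The guard: a tuning change that inflates TempStorage past the budget now
// breaks compilation with a readable message instead of failing at launch.
static_assert(sizeof(TempStorage) <= max_smem_per_block,
              "TempStorage exceeds the static shared memory budget");

__global__ void kernel()
{
  __shared__ TempStorage temp_storage; // fits by construction
  (void) temp_storage;
}

int main()
{
  kernel<<<1, 256>>>();
  return cudaDeviceSynchronize() != cudaSuccess;
}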
cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh:

@@ -29,6 +29,56 @@ using local_segment_index_t = ::cuda::std::uint32_t;
 // Type used for total number of segments and to index within segments globally
 using global_segment_offset_t = ::cuda::std::int64_t;
 
+template <typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+struct LargeSegmentsSelectorT
+{
+  OffsetT value{};
+  BeginOffsetIteratorT d_offset_begin{};
+  EndOffsetIteratorT d_offset_end{};
+  global_segment_offset_t base_segment_offset{};
+
+#if !_CCCL_COMPILER(NVRTC)
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
+  LargeSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end)
+      : value(value)
+      , d_offset_begin(d_offset_begin)
+      , d_offset_end(d_offset_end)
+  {}
+#endif
+
+  _CCCL_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const
+  {
+    const OffsetT segment_size =
+      d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id];
+    return segment_size > value;
+  }
+};
+
+template <typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+struct SmallSegmentsSelectorT
+{
+  OffsetT value{};
+  BeginOffsetIteratorT d_offset_begin{};
+  EndOffsetIteratorT d_offset_end{};
+  global_segment_offset_t base_segment_offset{};
+
+#if !_CCCL_COMPILER(NVRTC)
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
+  SmallSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end)
+      : value(value)
+      , d_offset_begin(d_offset_begin)
+      , d_offset_end(d_offset_end)
+  {}
+#endif
+
+  _CCCL_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const
+  {
+    const OffsetT segment_size =
+      d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id];
+    return segment_size < value;
+  }
+};
+
 /**
  * @brief Fallback kernel, in case there's not enough segments to
  * take advantage of partitioning.
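Both selectors are plain device predicates over a segment's size; the partitioning pass uses one to peel off large segments and the other to peel off small ones, with base_segment_offset rebasing the local segment id when segments are processed in batches. A host-side sketch of the same classification (standard C++ only; the names below are illustrative, not the CUB types):

#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the shape of LargeSegmentsSelectorT: a threshold plus begin/end
// offset iterators, invoked with a local segment id.
template <typename OffsetT, typename It>
struct size_above
{
  OffsetT value;
  It d_offset_begin;
  It d_offset_end;
  std::int64_t base_segment_offset = 0;

  bool operator()(std::uint32_t segment_id) const
  {
    const OffsetT size =
      d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id];
    return size > value;
  }
};

int main()
{
  // Three segments with sizes 2, 5 and 1 (CSR-style offsets).
  std::vector<int> begins{0, 2, 7};
  std::vector<int> ends{2, 7, 8};
  size_above<int, const int*> is_large{3, begins.data(), ends.data()};

  for (std::uint32_t id = 0; id < 3; ++id)
  {
    std::cout << "segment " << id << ": " << (is_large(id) ? "large" : "not large") << '\n';
  }
}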
@@ -89,7 +139,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS)
 {
   using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
   using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy;
-  using MediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT::MediumPolicyT;
+  using MediumPolicyT = typename ActivePolicyT::MediumSegmentPolicy;
 
   const auto segment_id = static_cast<local_segment_index_t>(blockIdx.x);
   OffsetT segment_begin = d_begin_offsets[segment_id];
@@ -253,7 +303,7 @@ template <SortOrder Order,
           typename BeginOffsetIteratorT,
           typename EndOffsetIteratorT,
           typename OffsetT>
-__launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS)
+__launch_bounds__(ChainedPolicyT::ActivePolicy::SmallSegmentPolicy::BLOCK_THREADS)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall(
     local_segment_index_t small_segments,
     local_segment_index_t medium_segments,
@@ -272,10 +322,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS)
   const local_segment_index_t tid = threadIdx.x;
   const local_segment_index_t bid = blockIdx.x;
 
-  using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
-  using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT;
-  using MediumPolicyT = typename SmallAndMediumPolicyT::MediumPolicyT;
-  using SmallPolicyT = typename SmallAndMediumPolicyT::SmallPolicyT;
+  using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
+  using SmallPolicyT = typename ActivePolicyT::SmallSegmentPolicy;
+  using MediumPolicyT = typename ActivePolicyT::MediumSegmentPolicy;
 
   constexpr auto threads_per_medium_segment = static_cast<local_segment_index_t>(MediumPolicyT::WARP_THREADS);
   constexpr auto threads_per_small_segment = static_cast<local_segment_index_t>(SmallPolicyT::WARP_THREADS);
@@ -286,11 +335,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS)
   using SmallAgentWarpMergeSortT =
     sub_warp_merge_sort::AgentSubWarpSort<Order == SortOrder::Descending, SmallPolicyT, KeyT, ValueT, OffsetT>;
 
-  constexpr auto segments_per_medium_block =
-    static_cast<local_segment_index_t>(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK);
+  constexpr auto segments_per_medium_block = static_cast<local_segment_index_t>(MediumPolicyT::SEGMENTS_PER_BLOCK);
 
-  constexpr auto segments_per_small_block =
-    static_cast<local_segment_index_t>(SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK);
+  constexpr auto segments_per_small_block = static_cast<local_segment_index_t>(SmallPolicyT::SEGMENTS_PER_BLOCK);
 
   __shared__ union
   {
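The kernel-side change is purely structural: constants that used to hang off a combined small-and-medium policy (SEGMENTS_PER_SMALL_BLOCK, SEGMENTS_PER_MEDIUM_BLOCK) are now read from each sub-policy's own SEGMENTS_PER_BLOCK. In toy form (hypothetical structs, same idea, not the CUB types):

#include <iostream>

// Before: one combined policy owned the per-size constants.
struct SmallAndMediumPolicy
{
  static constexpr int SEGMENTS_PER_SMALL_BLOCK = 8;
  static constexpr int SEGMENTS_PER_MEDIUM_BLOCK = 2;
};

// After: each sub-policy carries its own SEGMENTS_PER_BLOCK, so a kernel can
// depend on just the policy it actually uses.
struct SmallSegmentPolicy
{
  static constexpr int SEGMENTS_PER_BLOCK = 8;
};
struct MediumSegmentPolicy
{
  static constexpr int SEGMENTS_PER_BLOCK = 2;
};

int main()
{
  std::cout << SmallAndMediumPolicy::SEGMENTS_PER_SMALL_BLOCK << ' '
            << SmallSegmentPolicy::SEGMENTS_PER_BLOCK << '\n'; // same value, flatter access
}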
cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh:

@@ -202,14 +202,18 @@ _CCCL_HOST_DEVICE _CCCL_CONSTEVAL auto load_store_type()
   }
 }
 
-template <typename VectorizedPolicy,
+template <typename VectorizedPolicy,
+          typename Offset,
+          typename F,
+          typename RandomAccessIteratorOut,
+          typename... RandomAccessIteratorsIn>
 _CCCL_DEVICE void transform_kernel_vectorized(
   Offset num_items,
   int num_elem_per_thread_prefetch,
   bool can_vectorize,
   F f,
   RandomAccessIteratorOut out,
-
+  RandomAccessIteratorsIn... ins)
 {
   constexpr int block_dim = VectorizedPolicy::block_threads;
   constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
@@ -240,9 +244,12 @@ _CCCL_DEVICE void transform_kernel_vectorized(
   constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
   using load_store_t = decltype(load_store_type<load_store_size>());
   using output_t = it_value_t<RandomAccessIteratorOut>;
-  using result_t
+  using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
   // picks output type size if there are no inputs
-  constexpr int element_size = int{
+  constexpr int element_size = int{first_nonzero_value(
+    (sizeof(it_value_t<RandomAccessIteratorsIn>)
+     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
+    size_of<output_t>)};
   constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;
 
   static_assert((items_per_thread * element_size) % load_store_size == 0);
@@ -258,18 +265,35 @@ _CCCL_DEVICE void transform_kernel_vectorized(
 
   auto provide_array = [&](auto... inputs) {
     // load inputs
-
-
-    auto in_vec = reinterpret_cast<const load_store_t*>(in);
-    auto input_vec = reinterpret_cast<load_store_t*>(input.data());
-    _CCCL_PRAGMA_UNROLL_FULL()
-    for (int i = 0; i < load_store_count; ++i)
-    {
-
-    }
+    [[maybe_unused]] auto load_tile = [](auto in, auto& input) {
+      if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
+      {
+        auto in_vec = reinterpret_cast<const load_store_t*>(in) + threadIdx.x;
+        auto input_vec = reinterpret_cast<load_store_t*>(input.data());
+        _CCCL_PRAGMA_UNROLL_FULL()
+        for (int i = 0; i < load_store_count; ++i)
+        {
+          input_vec[i] = in_vec[i * VectorizedPolicy::block_threads];
+        }
+      }
+      else
+      {
+        constexpr int elems = load_store_size / element_size;
+        in += threadIdx.x * elems;
+        _CCCL_PRAGMA_UNROLL_FULL()
+        for (int i = 0; i < load_store_count; ++i)
+        {
+          _CCCL_PRAGMA_UNROLL_FULL()
+          for (int j = 0; j < elems; ++j)
+          {
+            input[i * elems + j] = in[i * elems * VectorizedPolicy::block_threads + j];
+          }
+        }
+      }
+    };
     _CCCL_PDL_GRID_DEPENDENCY_SYNC();
-    (
+    (load_tile(ins, inputs), ...);
+
   // Benchmarks showed up to 38% slowdown on H200 (some improvements as well), so omitted. See #5249 for details.
   // _CCCL_PDL_TRIGGER_NEXT_LAUNCH();
 
@@ -280,7 +304,7 @@ _CCCL_DEVICE void transform_kernel_vectorized(
       output[i] = f(inputs[i]...);
     }
   };
-  provide_array(uninitialized_array<
+  provide_array(uninitialized_array<it_value_t<RandomAccessIteratorsIn>, items_per_thread>{}...);
 
   // write output
   if constexpr (can_vectorize_store)
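The new load_tile lambda picks a code path per input iterator: contiguous inputs are reinterpreted as wide words and loaded block-strided, everything else falls back to element-wise loads. A self-contained sketch of that dispatch (plain C++, with a hypothetical is_contiguous trait standing in for Thrust's is_contiguous_iterator_v):

#include <array>
#include <cstring>
#include <iostream>
#include <type_traits>
#include <vector>

// Hypothetical stand-in for THRUST_NS_QUALIFIER::is_contiguous_iterator_v:
// treat raw pointers as contiguous, everything else as generic.
template <typename It>
constexpr bool is_contiguous = std::is_pointer_v<It>;

// Load four ints into a tile: wide copy for contiguous inputs, element-wise
// dereference for arbitrary random-access iterators.
template <typename It>
void load_tile(It in, std::array<int, 4>& tile)
{
  if constexpr (is_contiguous<It>)
  {
    std::memcpy(tile.data(), in, sizeof(tile)); // one 16-byte transaction
  }
  else
  {
    for (int i = 0; i < 4; ++i)
    {
      tile[i] = in[i]; // works for counting/transform iterators too
    }
  }
}

int main()
{
  std::vector<int> v{1, 2, 3, 4};
  std::array<int, 4> tile{};
  load_tile(v.data(), tile);    // pointer: takes the vectorized branch
  load_tile(v.begin(), tile);   // iterator object: takes the generic branch
  std::cout << tile[3] << '\n'; // 4
}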
cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh:

@@ -47,6 +47,118 @@ namespace detail
 {
 namespace segmented_sort
 {
+
+template <typename PolicyT, typename = void>
+struct SegmentedSortPolicyWrapper : PolicyT
+{
+  CUB_RUNTIME_FUNCTION SegmentedSortPolicyWrapper(PolicyT base)
+      : PolicyT(base)
+  {}
+};
+
+template <typename StaticPolicyT>
+struct SegmentedSortPolicyWrapper<StaticPolicyT,
+                                  _CUDA_VSTD::void_t<typename StaticPolicyT::LargeSegmentPolicy,
+                                                     typename StaticPolicyT::SmallSegmentPolicy,
+                                                     typename StaticPolicyT::MediumSegmentPolicy>> : StaticPolicyT
+{
+  CUB_RUNTIME_FUNCTION SegmentedSortPolicyWrapper(StaticPolicyT base)
+      : StaticPolicyT(base)
+  {}
+
+  CUB_RUNTIME_FUNCTION static constexpr auto LargeSegment()
+  {
+    return cub::detail::MakePolicyWrapper(typename StaticPolicyT::LargeSegmentPolicy());
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr auto SmallSegment()
+  {
+    return cub::detail::MakePolicyWrapper(typename StaticPolicyT::SmallSegmentPolicy());
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr auto MediumSegment()
+  {
+    return cub::detail::MakePolicyWrapper(typename StaticPolicyT::MediumSegmentPolicy());
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr int PartitioningThreshold()
+  {
+    return StaticPolicyT::PARTITIONING_THRESHOLD;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr int LargeSegmentRadixBits()
+  {
+    return StaticPolicyT::LargeSegmentPolicy::RADIX_BITS;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr int SegmentsPerSmallBlock()
+  {
+    return StaticPolicyT::SmallSegmentPolicy::SEGMENTS_PER_BLOCK;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr int SegmentsPerMediumBlock()
+  {
+    return StaticPolicyT::MediumSegmentPolicy::SEGMENTS_PER_BLOCK;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr int SmallPolicyItemsPerTile()
+  {
+    return StaticPolicyT::SmallSegmentPolicy::ITEMS_PER_TILE;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr int MediumPolicyItemsPerTile()
+  {
+    return StaticPolicyT::MediumSegmentPolicy::ITEMS_PER_TILE;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr CacheLoadModifier LargeSegmentLoadModifier()
+  {
+    return StaticPolicyT::LargeSegmentPolicy::LOAD_MODIFIER;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr BlockLoadAlgorithm LargeSegmentLoadAlgorithm()
+  {
+    return StaticPolicyT::LargeSegmentPolicy::LOAD_ALGORITHM;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr WarpLoadAlgorithm MediumSegmentLoadAlgorithm()
+  {
+    return StaticPolicyT::MediumSegmentPolicy::LOAD_ALGORITHM;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr WarpLoadAlgorithm SmallSegmentLoadAlgorithm()
+  {
+    return StaticPolicyT::SmallSegmentPolicy::LOAD_ALGORITHM;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr WarpStoreAlgorithm MediumSegmentStoreAlgorithm()
+  {
+    return StaticPolicyT::MediumSegmentPolicy::STORE_ALGORITHM;
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr WarpStoreAlgorithm SmallSegmentStoreAlgorithm()
+  {
+    return StaticPolicyT::SmallSegmentPolicy::STORE_ALGORITHM;
+  }
+
+#if defined(CUB_ENABLE_POLICY_PTX_JSON)
+  _CCCL_DEVICE static constexpr auto EncodedPolicy()
+  {
+    using namespace ptx_json;
+    return object<key<"LargeSegmentPolicy">() = LargeSegment().EncodedPolicy(),
+                  key<"SmallSegmentPolicy">() = SmallSegment().EncodedPolicy(),
+                  key<"MediumSegmentPolicy">() = MediumSegment().EncodedPolicy(),
+                  key<"PartitioningThreshold">() = value<StaticPolicyT::PARTITIONING_THRESHOLD>()>();
+  }
+#endif
+};
+
+template <typename PolicyT>
+CUB_RUNTIME_FUNCTION SegmentedSortPolicyWrapper<PolicyT> MakeSegmentedSortPolicyWrapper(PolicyT policy)
+{
+  return SegmentedSortPolicyWrapper<PolicyT>{policy};
+}
+
 template <typename KeyT, typename ValueT>
 struct policy_hub
 {
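SegmentedSortPolicyWrapper follows the classic void_t detection idiom: the primary template just passes a policy through, while the partial specialization activates only when the three nested policy types exist and then exposes uniform accessors for the dispatch layer. A minimal sketch of the idiom (standard C++, hypothetical names, not the CUB types):

#include <iostream>
#include <type_traits>

// Primary template: wrap any policy unchanged.
template <typename PolicyT, typename = void>
struct PolicyWrapper : PolicyT
{
  explicit PolicyWrapper(PolicyT base)
      : PolicyT(base)
  {}
};

// Partial specialization, selected via void_t only when PolicyT exposes a
// nested SmallPolicy: adds the static accessor.
template <typename PolicyT>
struct PolicyWrapper<PolicyT, std::void_t<typename PolicyT::SmallPolicy>> : PolicyT
{
  explicit PolicyWrapper(PolicyT base)
      : PolicyT(base)
  {}

  static constexpr int SmallBlockThreads()
  {
    return PolicyT::SmallPolicy::BLOCK_THREADS;
  }
};

struct StaticPolicy
{
  struct SmallPolicy
  {
    static constexpr int BLOCK_THREADS = 256;
  };
};

int main()
{
  PolicyWrapper<StaticPolicy> w{StaticPolicy{}};
  std::cout << w.SmallBlockThreads() << '\n'; // 256: the specialization was chosen
}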
@@ -71,12 +183,19 @@ struct policy_hub
 
     static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems<DominantT>(7);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(7);
-    using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy<
-      BLOCK_THREADS,
-      // Small policy
-      cub::AgentSubWarpMergeSortPolicy<4, ITEMS_PER_SMALL_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>,
-      // Medium policy
-      cub::AgentSubWarpMergeSortPolicy<32, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>;
+
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  4 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
   };
 
   struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
@@ -97,12 +216,19 @@ struct policy_hub
 
     static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems<DominantT>(9);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(9);
-    using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy<
-      BLOCK_THREADS,
-      // Small policy
-      cub::AgentSubWarpMergeSortPolicy<4, ITEMS_PER_SMALL_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>,
-      // Medium policy
-      cub::AgentSubWarpMergeSortPolicy<32, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>;
+
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  4 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
   };
 
   struct Policy610 : ChainedPolicy<610, Policy610, Policy600>
@@ -123,12 +249,19 @@ struct policy_hub
 
     static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems<DominantT>(9);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(9);
-    using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy<
-      BLOCK_THREADS,
-      // Small policy
-      cub::AgentSubWarpMergeSortPolicy<4, ITEMS_PER_SMALL_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>,
-      // Medium policy
-      cub::AgentSubWarpMergeSortPolicy<32, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>;
+
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  4 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
   };
 
   struct Policy620 : ChainedPolicy<620, Policy620, Policy610>
@@ -149,12 +282,19 @@ struct policy_hub
 
     static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems<DominantT>(9);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(9);
-    using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy<
-      BLOCK_THREADS,
-      // Small policy
-      cub::AgentSubWarpMergeSortPolicy<4, ITEMS_PER_SMALL_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>,
-      // Medium policy
-      cub::AgentSubWarpMergeSortPolicy<32, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>;
+
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  4 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
   };
 
   struct Policy700 : ChainedPolicy<700, Policy700, Policy620>
@@ -175,15 +315,19 @@ struct policy_hub
 
     static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems<DominantT>(7);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(KEYS_ONLY ? 11 : 7);
-    using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy<
-      BLOCK_THREADS,
-      // Small policy
-      cub::AgentSubWarpMergeSortPolicy<KEYS_ONLY ? 4 : 8, // threads per segment
-                                       ITEMS_PER_SMALL_THREAD,
-                                       WARP_LOAD_DIRECT,
-                                       LOAD_DEFAULT>,
-      // Medium policy
-      cub::AgentSubWarpMergeSortPolicy<32, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>;
+
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  KEYS_ONLY ? 4 : 8 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
   };
 
   struct Policy800 : ChainedPolicy<800, Policy800, Policy700>
@@ -202,15 +346,19 @@ struct policy_hub
 
     static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems<DominantT>(9);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(KEYS_ONLY ? 7 : 11);
-    using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy<
-      BLOCK_THREADS,
-      // Small policy
-      cub::AgentSubWarpMergeSortPolicy<KEYS_ONLY ? 4 : 2, // threads per segment
-                                       ITEMS_PER_SMALL_THREAD,
-                                       WARP_LOAD_TRANSPOSE,
-                                       LOAD_DEFAULT>,
-      // Medium policy
-      cub::AgentSubWarpMergeSortPolicy<32, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_TRANSPOSE, LOAD_DEFAULT>>;
+
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  KEYS_ONLY ? 4 : 2 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_TRANSPOSE,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_TRANSPOSE,
+                                  LOAD_DEFAULT>;
   };
 
   struct Policy860 : ChainedPolicy<860, Policy860, Policy800>
@@ -230,15 +378,19 @@ struct policy_hub
     static constexpr bool LARGE_ITEMS = sizeof(DominantT) > 4;
     static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems<DominantT>(LARGE_ITEMS ? 7 : 9);
    static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(LARGE_ITEMS ? 9 : 7);
-    using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy<
-      BLOCK_THREADS,
-      // Small policy
-      cub::AgentSubWarpMergeSortPolicy<LARGE_ITEMS ? 8 : 2, // threads per segment
-                                       ITEMS_PER_SMALL_THREAD,
-                                       WARP_LOAD_TRANSPOSE,
-                                       LOAD_LDG>,
-      // Medium policy
-      cub::AgentSubWarpMergeSortPolicy<16, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_TRANSPOSE, LOAD_LDG>>;
+
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  LARGE_ITEMS ? 8 : 2 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_TRANSPOSE,
+                                  LOAD_LDG>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  16 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_TRANSPOSE,
+                                  LOAD_LDG>;
   };
 
   using MaxPolicy = Policy860;
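Every PolicyNNN above links to its predecessor through ChainedPolicy, so dispatch can walk the chain down to the newest policy whose SM version the device supports. A toy version of that selection (standard C++, hypothetical policies, not CUB's ChainedPolicy):

#include <iostream>
#include <type_traits>

// Toy chained policies: each knows the lowest SM it targets and its predecessor.
struct Policy500
{
  static constexpr int min_sm = 500;
  static constexpr int block_threads = 256;
  using Prev = Policy500; // oldest entry points at itself
};

struct Policy860
{
  static constexpr int min_sm = 860;
  static constexpr int block_threads = 384;
  using Prev = Policy500;
};

template <typename Policy>
constexpr int select_block_threads(int device_sm)
{
  if (device_sm >= Policy::min_sm)
  {
    return Policy::block_threads;
  }
  if constexpr (std::is_same_v<typename Policy::Prev, Policy>)
  {
    return Policy::block_threads; // no older policy to fall back to
  }
  else
  {
    return select_block_threads<typename Policy::Prev>(device_sm);
  }
}

int main()
{
  std::cout << select_block_threads<Policy860>(900) << '\n'; // 384 (Policy860)
  std::cout << select_block_threads<Policy860>(700) << '\n'; // 256 (falls back to Policy500)
}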
cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh:

@@ -282,21 +282,45 @@ _CCCL_HOST_DEVICE constexpr int arch_to_min_bytes_in_flight(int sm_arch)
   return 12 * 1024; // V100 and below
 }
 
-template <typename
-_CCCL_HOST_DEVICE constexpr bool
+template <typename H, typename... Ts>
+_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal(H head, Ts... values)
 {
-
+  size_t first = 0;
+  for (size_t v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
+  {
+    if (v == 0)
+    {
+      continue;
+    }
+    if (first == 0)
+    {
+      first = v;
+    }
+    else if (v != first)
+    {
+      return false;
+    }
+  }
+  return true;
 }
 
-_CCCL_HOST_DEVICE constexpr bool
+_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal()
 {
   return true;
 }
 
-template <typename
-_CCCL_HOST_DEVICE constexpr auto
+template <typename H, typename... Ts>
+_CCCL_HOST_DEVICE constexpr auto first_nonzero_value(H head, Ts... values)
 {
-
+  for (auto v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
+  {
+    if (v != 0)
+    {
+      return v;
+    }
+  }
+  // we only reach here when all input are not contiguous and the output has a void value type
+  return H{1};
 }
 
 template <typename T>
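The zero-filtering is the point of both helpers: a non-contiguous input contributes sizeof(...) * false == 0, so only contiguous inputs vote on the common element size, and first_nonzero_value falls back to its last argument (the output's size) when nothing is contiguous. Their contracts restated host-side (a sketch with std::array standing in for cuda::std::array):

#include <array>
#include <cstddef>

template <typename H, typename... Ts>
constexpr bool all_nonzero_equal(H head, Ts... values)
{
  std::size_t first = 0;
  for (std::size_t v : std::array<H, 1 + sizeof...(Ts)>{head, values...})
  {
    if (v == 0)
    {
      continue; // zero means "not contiguous": it does not participate
    }
    if (first == 0)
    {
      first = v;
    }
    else if (v != first)
    {
      return false;
    }
  }
  return true;
}

template <typename H, typename... Ts>
constexpr auto first_nonzero_value(H head, Ts... values)
{
  for (auto v : std::array<H, 1 + sizeof...(Ts)>{head, values...})
  {
    if (v != 0)
    {
      return v;
    }
  }
  return H{1}; // fallback when nothing is contiguous
}

// Mixing a non-contiguous input (0) with 4-byte contiguous inputs is fine...
static_assert(all_nonzero_equal(std::size_t{4}, std::size_t{0}, std::size_t{4}));
// ...but two contiguous inputs of different value-type sizes are not.
static_assert(!all_nonzero_equal(std::size_t{4}, std::size_t{8}));
// The element size used for vectorization is the first contiguous one.
static_assert(first_nonzero_value(std::size_t{0}, std::size_t{8}) == 8);

int main() {}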
@@ -336,25 +360,36 @@ struct policy_hub<RequiresStableAddress,
     (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn> && ...);
   static constexpr bool all_input_values_trivially_reloc =
     (THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>> && ...);
-  static constexpr bool
+  static constexpr bool can_memcpy_all_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
+  // the vectorized kernel supports mixing contiguous and non-contiguous iterators
+  static constexpr bool can_memcpy_contiguous_inputs =
+    ((!THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>
+      || THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>>)
+     && ...);
 
   // for vectorized policy:
-  static constexpr bool
-
-
-  static constexpr int
-
+  static constexpr bool all_contiguous_input_values_same_size = all_nonzero_equal(
+    (sizeof(it_value_t<RandomAccessIteratorsIn>)
+     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...);
+  static constexpr int load_store_word_size = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
+  // find the value type size of the first contiguous iterator. if there are no inputs, we take the size of the output
+  // value type
+  static constexpr int contiguous_value_type_size = first_nonzero_value(
+    (int{sizeof(it_value_t<RandomAccessIteratorsIn>)}
+     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
+    int{size_of<it_value_t<RandomAccessIteratorOut>>});
   static constexpr bool value_type_divides_load_store_size =
-    load_store_word_size %
+    load_store_word_size % contiguous_value_type_size == 0; // implicitly checks that value_type_size <=
+                                                            // load_store_word_size
   static constexpr int target_bytes_per_thread =
     no_input_streams ? 16 /* by experiment on RTX 5090 */ : 32 /* guestimate by gevtushenko for loading */;
   static constexpr int items_per_thread_vec =
-    ::cuda::round_up(target_bytes_per_thread, load_store_word_size) /
+    ::cuda::round_up(target_bytes_per_thread, load_store_word_size) / contiguous_value_type_size;
   using default_vectorized_policy_t = vectorized_policy_t<256, items_per_thread_vec, load_store_word_size>;
 
   static constexpr bool fallback_to_prefetch =
-    RequiresStableAddress || !
-    || !DenseOutput;
+    RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_contiguous_input_values_same_size
+    || !value_type_divides_load_store_size || !DenseOutput;
 
   // TODO(bgruber): consider a separate kernel for just filling
 
@@ -380,7 +415,7 @@ struct policy_hub<RequiresStableAddress,
       block_threads* async_policy::min_items_per_thread,
       ldgsts_size_and_align)
     > int{max_smem_per_block};
-  static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams;
+  static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams || !can_memcpy_all_inputs;
 
 public:
   static constexpr int min_bif = arch_to_min_bytes_in_flight(800);
@@ -421,7 +456,8 @@ struct policy_hub<RequiresStableAddress,
     (((int{sizeof(it_value_t<RandomAccessIteratorsIn>)} * AsyncBlockSize) % max_alignment == 0) && ...);
   static constexpr bool enough_threads_for_peeling = AsyncBlockSize >= alignment; // head and tail bytes
   static constexpr bool fallback_to_vectorized =
-    exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams;
+    exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams
+    || !can_memcpy_all_inputs;
 
 public:
   static constexpr int min_bif = arch_to_min_bytes_in_flight(PtxVersion);