cuda-cccl 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of cuda-cccl might be problematic.

Files changed (177)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  21. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  22. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  23. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  24. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  25. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  26. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  27. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  30. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  31. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  32. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  33. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  34. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
  35. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  39. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  49. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
  52. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  55. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  56. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  57. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  58. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  59. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  60. cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
  61. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  62. cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
  63. cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
  64. cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
  65. cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
  66. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  67. cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
  68. cuda/cccl/headers/include/cuda/__event/event.h +8 -8
  69. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  70. cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
  71. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  72. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  73. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  74. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  75. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  76. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  77. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  78. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  79. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  80. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  81. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  82. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  83. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
  84. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  85. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  86. cuda/cccl/headers/include/cuda/algorithm +1 -1
  87. cuda/cccl/headers/include/cuda/devices +10 -0
  88. cuda/cccl/headers/include/cuda/iterator +1 -0
  89. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  90. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  91. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  92. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  93. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  94. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  95. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  96. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  97. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  98. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  99. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  100. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  101. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  102. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  103. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  104. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  105. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  106. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  107. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  108. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  109. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  110. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  111. cuda/cccl/headers/include/cuda/std/version +1 -4
  112. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  113. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  114. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  115. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  116. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  117. cuda/cccl/parallel/experimental/__init__.py +21 -70
  118. cuda/compute/__init__.py +77 -0
  119. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
  120. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
  121. cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
  122. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  123. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  124. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  125. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
  126. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
  127. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  128. cuda/compute/algorithms/_three_way_partition.py +261 -0
  129. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  130. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  131. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  132. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  133. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  134. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  135. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  136. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  137. cuda/coop/__init__.py +8 -0
  138. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  139. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  140. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  141. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  142. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  143. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  144. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  145. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  146. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  147. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  148. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  149. cuda/coop/warp/__init__.py +9 -0
  150. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  151. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  152. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  153. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  154. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
  155. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  156. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  157. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  158. cuda/cccl/parallel/experimental/.gitignore +0 -4
  159. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  160. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  161. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  162. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  163. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  164. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  165. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  166. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  167. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  168. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  169. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  170. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  171. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  172. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  173. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  174. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  175. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  176. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  177. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh
@@ -84,70 +84,6 @@ CUB_NAMESPACE_BEGIN
 //! @endrst
 struct DeviceSegmentedReduce
 {
-private:
-  template <typename InputIteratorT,
-            typename OutputIteratorT,
-            typename BeginOffsetIteratorT,
-            typename EndOffsetIteratorT,
-            typename OffsetT,
-            typename ReductionOpT,
-            typename InitT,
-            typename... Ts>
-  CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
-    ::cuda::std::false_type,
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    InputIteratorT d_in,
-    OutputIteratorT d_out,
-    ::cuda::std::int64_t num_segments,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets,
-    ReductionOpT reduction_op,
-    InitT initial_value,
-    cudaStream_t stream);
-
-  template <typename InputIteratorT,
-            typename OutputIteratorT,
-            typename BeginOffsetIteratorT,
-            typename EndOffsetIteratorT,
-            typename OffsetT,
-            typename ReductionOpT,
-            typename InitT,
-            typename... Ts>
-  CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
-    ::cuda::std::true_type,
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    InputIteratorT d_in,
-    OutputIteratorT d_out,
-    ::cuda::std::int64_t num_segments,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets,
-    ReductionOpT reduction_op,
-    InitT initial_value,
-    cudaStream_t stream)
-  {
-    return DispatchSegmentedReduce<
-      InputIteratorT,
-      OutputIteratorT,
-      BeginOffsetIteratorT,
-      EndOffsetIteratorT,
-      OffsetT,
-      ReductionOpT,
-      InitT,
-      Ts...>::Dispatch(d_temp_storage,
-                       temp_storage_bytes,
-                       d_in,
-                       d_out,
-                       num_segments,
-                       d_begin_offsets,
-                       d_end_offsets,
-                       reduction_op,
-                       initial_value,
-                       stream);
-  }
-
-public:
   //! @rst
   //! Computes a device-wide segmented reduction using the specified
   //! binary ``reduction_op`` functor.
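The private tag-dispatch helpers deleted above are not replaced one for one: the hunks that follow inline each DispatchSegmentedReduce call behind a static_assert plus `if constexpr` guard. A minimal standalone sketch of that pattern (hypothetical names, plain std:: traits in place of cuda::std::):

#include <cstdio>
#include <type_traits>

// The static_assert emits one clear diagnostic when OffsetT is not
// integral, while `if constexpr` keeps the dispatch body from being
// instantiated at all, so the misuse does not also trigger a cascade
// of template errors from the dispatch machinery.
template <typename OffsetT>
int dispatch_sketch(OffsetT num_items)
{
  static_assert(std::is_integral_v<OffsetT>, "Offset type should be integral.");
  if constexpr (std::is_integral_v<OffsetT>)
  {
    return static_cast<int>(num_items); // stand-in for DispatchSegmentedReduce<...>::Dispatch(...)
  }
  return -1; // never reached; the real code marks this path _CCCL_UNREACHABLE()
}

int main()
{
  std::printf("%d\n", dispatch_sketch(42)); // prints 42
  // dispatch_sketch(3.5); // would fail the static_assert with a single message
  return 0;
}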
@@ -261,24 +197,29 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Reduce");

-    // Integer type for global offsets
-    using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, ReductionOpT>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      reduction_op,
-      initial_value, // zero-initialize
-      stream);
+    using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        InputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        ReductionOpT,
+        T>::Dispatch(d_temp_storage,
+                     temp_storage_bytes,
+                     d_in,
+                     d_out,
+                     num_segments,
+                     d_begin_offsets,
+                     d_end_offsets,
+                     reduction_op,
+                     initial_value, // zero-initialize
+                     stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
@@ -465,32 +406,31 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum");

-    // Integer type for global offsets
     using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-
-    // The output value type
-    using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<InputIteratorT,
-                            OutputIteratorT,
-                            BeginOffsetIteratorT,
-                            EndOffsetIteratorT,
-                            OffsetT,
-                            ::cuda::std::plus<>>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      ::cuda::std::plus<>{},
-      OutputT(), // zero-initialize
-      stream);
+    using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
+    using init_t  = OutputT;
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        InputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        ::cuda::std::plus<>,
+        init_t>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          ::cuda::std::plus<>{},
+                          init_t{}, // zero-initialize
+                          stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
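For orientation, the public call signature of Sum is unchanged by this refactor; only the internals moved from tag dispatch to `if constexpr`. A usage sketch of the standard two-phase call (device buffers assumed allocated and populated by the caller):

#include <cub/device/device_segmented_reduce.cuh>

// Sums each segment of d_in into d_out[segment]. d_offsets holds
// num_segments + 1 offsets, e.g. {0, 3, 3, 7} for three segments of
// sizes 3, 0, and 4; the empty segment is zero-initialized.
cudaError_t segmented_sum(
  const int* d_in, int* d_out, const int* d_offsets, int num_segments, cudaStream_t stream)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  // First call with d_temp_storage == nullptr only queries the size.
  cub::DeviceSegmentedReduce::Sum(
    d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_offsets, d_offsets + 1, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Second call runs the reduction.
  cudaError_t status = cub::DeviceSegmentedReduce::Sum(
    d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_offsets, d_offsets + 1, stream);
  cudaFree(d_temp_storage);
  return status;
}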
@@ -556,9 +496,7 @@ public:
     // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
     // integral constant or larger integral types
     using offset_t = int;
-
-    // The output value type
-    using output_t = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
+    using output_t = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;

     return detail::reduce::
       DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::std::plus<>, output_t>::Dispatch(
@@ -673,32 +611,31 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min");

-    // Integer type for global offsets
     using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-
-    // The input value type
-    using InputT = cub::detail::it_value_t<InputIteratorT>;
-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<InputIteratorT,
-                            OutputIteratorT,
-                            BeginOffsetIteratorT,
-                            EndOffsetIteratorT,
-                            OffsetT,
-                            ::cuda::minimum<>>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      ::cuda::minimum<>{},
-      ::cuda::std::numeric_limits<InputT>::max(),
-      stream);
+    using InputT = detail::it_value_t<InputIteratorT>;
+    using init_t = InputT;
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        InputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        ::cuda::minimum<>,
+        init_t>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          ::cuda::minimum<>{},
+                          ::cuda::std::numeric_limits<init_t>::max(),
+                          stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
@@ -769,9 +706,7 @@ public:
     // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
     // integral constant or larger integral types
     using offset_t = int;
-
-    // The input value type
-    using input_t = cub::detail::it_value_t<InputIteratorT>;
+    using input_t = detail::it_value_t<InputIteratorT>;

     return detail::reduce::
       DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::minimum<>, input_t>::Dispatch(
@@ -890,54 +825,45 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMin");

-    // Integer type for global offsets
     // Using common iterator value type is a breaking change, see:
     // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
     using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;

-    // The input type
-    using InputValueT = cub::detail::it_value_t<InputIteratorT>;
-
-    // The output tuple type
-    using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
-
-    // The output value type
+    using InputValueT  = detail::it_value_t<InputIteratorT>;
+    using OutputTupleT = detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
     using OutputValueT = typename OutputTupleT::Value;
-
-    using AccumT = OutputTupleT;
-
-    using InitT = detail::reduce::empty_problem_init_t<AccumT>;
+    using AccumT = OutputTupleT;
+    using InitT  = detail::reduce::empty_problem_init_t<AccumT>;

     // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
     using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
-
     ArgIndexInputIteratorT d_indexed_in(d_in);

-    // Initial value
     InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};

-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<ArgIndexInputIteratorT,
-                            OutputIteratorT,
-                            BeginOffsetIteratorT,
-                            EndOffsetIteratorT,
-                            OffsetT,
-                            cub::ArgMin,
-                            InitT,
-                            AccumT>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_indexed_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      cub::ArgMin(),
-      initial_value,
-      stream);
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        ArgIndexInputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        cub::ArgMin,
+        InitT,
+        AccumT>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_indexed_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          cub::ArgMin{},
+                          initial_value,
+                          stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
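The initial_value visible in this hunk doubles as the result for empty segments. A small sketch of the output side (hypothetical kernel; float input assumed):

#include <cub/device/device_segmented_reduce.cuh>

#include <cuda/std/limits>

// Each ArgMin result is a cub::KeyValuePair: `key` is the index of the
// minimum within its segment, `value` the minimum itself. Matching the
// initial_value above, an empty segment yields
// {1, cuda::std::numeric_limits<float>::max()}.
using arg_min_result_t = cub::KeyValuePair<int, float>;

__global__ void flag_empty_segments(const arg_min_result_t* d_results, bool* d_empty, int num_segments)
{
  const int seg = blockIdx.x * blockDim.x + threadIdx.x;
  if (seg < num_segments)
  {
    const arg_min_result_t r = d_results[seg];
    d_empty[seg] = (r.key == 1) && (r.value == ::cuda::std::numeric_limits<float>::max());
  }
}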
@@ -1144,27 +1070,32 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max");

-    // Integer type for global offsets
     using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-
-    // The input value type
-    using InputT = cub::detail::it_value_t<InputIteratorT>;
-
-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      ::cuda::maximum<>{},
-      ::cuda::std::numeric_limits<InputT>::lowest(),
-      stream);
+    using InputT = cub::detail::it_value_t<InputIteratorT>;
+    using init_t = InputT;
+
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        InputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        ::cuda::maximum<>,
+        init_t>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          ::cuda::maximum<>{},
+                          ::cuda::std::numeric_limits<init_t>::lowest(),
+                          stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
@@ -1229,9 +1160,7 @@ public:
     // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
     // integral constant or larger integral types
     using offset_t = int;
-
-    // The input value type
-    using input_t = cub::detail::it_value_t<InputIteratorT>;
+    using input_t = detail::it_value_t<InputIteratorT>;

     return detail::reduce::
       DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::maximum<>, input_t>::Dispatch(
@@ -1353,54 +1282,45 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMax");

-    // Integer type for global offsets
     // Using common iterator value type is a breaking change, see:
     // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
     using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;

-    // The input type
-    using InputValueT = cub::detail::it_value_t<InputIteratorT>;
-
-    // The output tuple type
+    using InputValueT  = cub::detail::it_value_t<InputIteratorT>;
     using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
-
-    using AccumT = OutputTupleT;
-
-    using InitT = detail::reduce::empty_problem_init_t<AccumT>;
-
-    // The output value type
+    using AccumT = OutputTupleT;
+    using InitT  = detail::reduce::empty_problem_init_t<AccumT>;
     using OutputValueT = typename OutputTupleT::Value;

     // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
     using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
-
     ArgIndexInputIteratorT d_indexed_in(d_in);

-    // Initial value
     InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};

-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<ArgIndexInputIteratorT,
-                            OutputIteratorT,
-                            BeginOffsetIteratorT,
-                            EndOffsetIteratorT,
-                            OffsetT,
-                            cub::ArgMax,
-                            InitT,
-                            AccumT>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_indexed_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      cub::ArgMax(),
-      initial_value,
-      stream);
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        ArgIndexInputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        cub::ArgMax,
+        InitT,
+        AccumT>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_indexed_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          cub::ArgMax{},
+                          initial_value,
+                          stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
@@ -1476,34 +1396,25 @@ public:
     // integral constant or larger integral types
     using input_t = int;

-    // The input type
-    using input_value_t = cub::detail::it_value_t<InputIteratorT>;
-
-    // The output tuple type
-    using output_tuple_t = cub::detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
-
-    using accum_t = output_tuple_t;
-
-    using init_t = detail::reduce::empty_problem_init_t<accum_t>;
-
-    // The output value type
+    using input_value_t  = detail::it_value_t<InputIteratorT>;
+    using output_tuple_t = detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
+    using accum_t        = output_tuple_t;
+    using init_t         = detail::reduce::empty_problem_init_t<accum_t>;
     using output_value_t = typename output_tuple_t::second_type;

     // Wrapped input iterator to produce index-value <input_t, InputT> tuples
     auto d_indexed_in = THRUST_NS_QUALIFIER::make_transform_iterator(
       THRUST_NS_QUALIFIER::counting_iterator<::cuda::std::int64_t>{0},
       detail::reduce::generate_idx_value<InputIteratorT, output_value_t>(d_in, segment_size));
-
     using arg_index_input_iterator_t = decltype(d_indexed_in);

-    // Initial value
     init_t initial_value{accum_t(1, ::cuda::std::numeric_limits<input_value_t>::lowest())};

     return detail::reduce::DispatchFixedSizeSegmentedReduce<
       arg_index_input_iterator_t,
       OutputIteratorT,
       input_t,
-      cub::detail::arg_max,
+      detail::arg_max,
       init_t,
       accum_t>::Dispatch(d_temp_storage,
                          temp_storage_bytes,
@@ -1511,7 +1422,7 @@ public:
                          d_out,
                          num_segments,
                          segment_size,
-                         cub::detail::arg_max(),
+                         detail::arg_max(),
                          initial_value,
                          stream);
   }
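The fixed-size ArgMax path above builds its index-value pairs by transforming a counting iterator rather than using ArgIndexInputIterator. A standalone sketch of that idiom with plain Thrust types (generate_idx_value is internal; the functor below is a hypothetical stand-in that indexes within a segment of known size):

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <cuda/std/utility>

// Maps a flat element index i to an (index-within-segment, value) pair,
// mirroring the <input_t, input_value_t> tuples in the hunk above.
struct idx_value_of
{
  const float* data;
  long long segment_size;

  __host__ __device__ cuda::std::pair<int, float> operator()(long long i) const
  {
    return {static_cast<int>(i % segment_size), data[i]};
  }
};

// Usage: an iterator yielding {0, data[0]}, {1, data[1]}, ... that a
// reduction with an arg-max operator can consume directly.
inline auto make_indexed(const float* d_data, long long segment_size)
{
  return thrust::make_transform_iterator(
    thrust::counting_iterator<long long>{0}, idx_value_of{d_data, segment_size});
}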
cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh (new file)
@@ -0,0 +1,79 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//! @file
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cuda/std/__type_traits/is_same.h>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/device/device_for.cuh>
+#include <cub/device/device_transform.cuh>
+#include <cub/util_debug.cuh>
+
+#include <cuda/std/functional>
+#include <cuda/std/mdspan>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::copy_mdspan
+{
+
+template <typename MdspanIn, typename MdspanOut>
+struct copy_mdspan_t
+{
+  MdspanIn mdspan_in;
+  MdspanOut mdspan_out;
+
+  _CCCL_API copy_mdspan_t(MdspanIn mdspan_in, MdspanOut mdspan_out)
+      : mdspan_in{mdspan_in}
+      , mdspan_out{mdspan_out}
+  {}
+
+  template <typename Idx, typename... Indices>
+  _CCCL_DEVICE_API _CCCL_FORCEINLINE void operator()(Idx, Indices... indices)
+  {
+    mdspan_out(indices...) = mdspan_in(indices...);
+  }
+};
+
+template <typename T_In,
+          typename E_In,
+          typename L_In,
+          typename A_In,
+          typename T_Out,
+          typename E_Out,
+          typename L_Out,
+          typename A_Out>
+[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
+copy(::cuda::std::mdspan<T_In, E_In, L_In, A_In> mdspan_in,
+     ::cuda::std::mdspan<T_Out, E_Out, L_Out, A_Out> mdspan_out,
+     ::cudaStream_t stream)
+{
+  if (mdspan_in.is_exhaustive() && mdspan_out.is_exhaustive()
+      && detail::have_same_strides(mdspan_in.mapping(), mdspan_out.mapping()))
+  {
+    return cub::DeviceTransform::Transform(
+      mdspan_in.data_handle(),
+      mdspan_out.data_handle(),
+      mdspan_in.size(),
+      ::cuda::proclaim_copyable_arguments(::cuda::std::identity{}),
+      stream);
+  }
+  // TODO (fbusato): add ForEachInLayout when mdspan_in and mdspan_out have compatible layouts
+  // Compatible layouts could use more efficient iteration patterns
+  return cub::DeviceFor::ForEachInExtents(mdspan_in.extents(), copy_mdspan_t{mdspan_in, mdspan_out}, stream);
+}
+
+} // namespace detail::copy_mdspan
+
+CUB_NAMESPACE_END
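A usage sketch of the new helper, assuming contiguous row-major device buffers. Note that detail::copy_mdspan::copy is an internal entry point, not a stable API; the public surface is presumably the mdspan support added to device_copy.cuh (+116 -27 in the file list above):

#include <cub/device/dispatch/dispatch_copy_mdspan.cuh>

#include <cuda/std/mdspan>

// Copies a rows x cols matrix. Both views are exhaustive with identical
// strides, so this takes the DeviceTransform fast path; views with
// mismatched layouts fall back to ForEachInExtents with copy_mdspan_t.
cudaError_t copy_matrix(const float* d_src, float* d_dst, int rows, int cols, cudaStream_t stream)
{
  using extents_t = cuda::std::dextents<int, 2>;
  cuda::std::mdspan<const float, extents_t> src{d_src, rows, cols};
  cuda::std::mdspan<float, extents_t> dst{d_dst, rows, cols};
  return cub::detail::copy_mdspan::copy(src, dst, stream);
}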
cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh
@@ -144,11 +144,11 @@ __launch_bounds__(
     auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
     MergeAgent{
       temp_storage.Alias(),
-      try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys1),
-      try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items1),
+      keys1,
+      items1,
       num_keys1,
-      try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys2),
-      try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items2),
+      keys2,
+      items2,
       num_keys2,
       keys_result,
       items_result,
cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -44,7 +44,6 @@
 #  pragma system_header
 #endif // no system header

-#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
 #include <cub/device/dispatch/kernels/radix_sort.cuh>
 #include <cub/device/dispatch/tuning/tuning_radix_sort.cuh>
 #include <cub/util_debug.cuh>
@@ -1379,14 +1378,6 @@ struct DispatchSegmentedRadixSort
     // Number of radix sort invocations until all segments have been processed
     const auto num_invocations = ::cuda::ceil_div(num_segments, max_num_segments_per_invocation);

-    // If d_begin_offsets and d_end_offsets do not support operator+ then we can't have more than
-    // max_num_segments_per_invocation segments per invocation
-    if (num_invocations > 1
-        && !detail::all_iterators_support_add_assign_operator(::cuda::std::int64_t{}, d_begin_offsets, d_end_offsets))
-    {
-      return cudaErrorInvalidValue;
-    }
-
     BeginOffsetIteratorT begin_offsets_current_it = d_begin_offsets;
     EndOffsetIteratorT end_offsets_current_it     = d_end_offsets;

@@ -1435,8 +1426,8 @@

     if (invocation_index + 1 < num_invocations)
     {
-      detail::advance_iterators_inplace_if_supported(begin_offsets_current_it, num_current_segments);
-      detail::advance_iterators_inplace_if_supported(end_offsets_current_it, num_current_segments);
+      begin_offsets_current_it += num_current_segments;
+      end_offsets_current_it += num_current_segments;
     }

     // Sync the stream if specified to flush runtime errors
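The two hunks above remove the conditional-advance machinery outright: the batching loop now advances the offset iterators with plain compound addition. A minimal illustration of what that requires of BeginOffsetIteratorT and EndOffsetIteratorT (hypothetical helper; raw pointers and counting iterators qualify):

#include <cstdint>

// Offset iterators passed to the segmented radix sort must now support
// operator+= on the multi-invocation path. An iterator without it fails
// to compile here, where the removed code previously returned
// cudaErrorInvalidValue at runtime when num_invocations > 1.
template <typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
void advance_offsets(BeginOffsetIteratorT& begin_it, EndOffsetIteratorT& end_it, std::int64_t num_current_segments)
{
  begin_it += num_current_segments;
  end_it += num_current_segments;
}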