cuda-cccl: cuda_cccl-0.3.0-cp310-cp310-manylinux_2_24_aarch64.whl → cuda_cccl-0.3.2-cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -18,6 +18,8 @@
18
18
  #include <cub/util_namespace.cuh>
19
19
 
20
20
  #include <cuda/__functional/address_stability.h>
21
+ #include <cuda/__stream/get_stream.h>
22
+ #include <cuda/std/__execution/env.h>
21
23
  #include <cuda/std/tuple>
22
24
 
23
25
  CUB_NAMESPACE_BEGIN
@@ -49,13 +51,20 @@ CUB_NAMESPACE_BEGIN
49
51
  struct DeviceTransform
50
52
  {
51
53
  private:
52
- template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
54
+ template <typename... RandomAccessIteratorsIn,
55
+ typename RandomAccessIteratorOut,
56
+ typename NumItemsT,
57
+ typename Predicate,
58
+ typename TransformOp,
59
+ typename StableAddress = cuda::std::false_type>
53
60
  CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal(
54
61
  ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
55
62
  RandomAccessIteratorOut output,
56
63
  NumItemsT num_items,
64
+ Predicate predicate,
57
65
  TransformOp transform_op,
58
- cudaStream_t stream = nullptr)
66
+ cudaStream_t stream,
67
+ StableAddress = {})
59
68
  {
60
69
  using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
61
70
  using offset_t = typename choose_offset_t::type;
@@ -66,18 +75,28 @@ private:
66
75
  return error;
67
76
  }
68
77
 
69
- return detail::transform::dispatch_t<
70
- detail::transform::requires_stable_address::no,
71
- offset_t,
72
- ::cuda::std::tuple<RandomAccessIteratorsIn...>,
73
- RandomAccessIteratorOut,
74
- detail::transform::always_true_predicate,
75
- TransformOp>::dispatch(::cuda::std::move(inputs),
76
- ::cuda::std::move(output),
77
- num_items,
78
- detail::transform::always_true_predicate{},
79
- ::cuda::std::move(transform_op),
80
- stream);
78
+ return detail::transform::dispatch_t < StableAddress::value
79
+ ? detail::transform::requires_stable_address::yes
80
+ : detail::transform::requires_stable_address::no,
81
+ offset_t, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, Predicate,
82
+ TransformOp > ::dispatch(
83
+ ::cuda::std::move(inputs),
84
+ ::cuda::std::move(output),
85
+ num_items,
86
+ ::cuda::std::move(predicate),
87
+ ::cuda::std::move(transform_op),
88
+ stream);
89
+ }
90
+
91
+ template <typename Env>
92
+ CUB_RUNTIME_FUNCTION static auto get_stream(Env env) -> cudaStream_t
93
+ {
94
+ return ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}).get();
95
+ }
96
+
97
+ CUB_RUNTIME_FUNCTION static auto get_stream(cudaStream_t stream) -> cudaStream_t
98
+ {
99
+ return stream;
81
100
  }
82
101
 
83
102
  public:
@@ -108,18 +127,28 @@ public:
108
127
  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
109
128
  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
110
129
  //! operator must be assignable to the dereferenced output iterator.
111
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
112
- template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
130
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
131
+ //! stream\ :sub:`0`
132
+ template <typename... RandomAccessIteratorsIn,
133
+ typename RandomAccessIteratorOut,
134
+ typename NumItemsT,
135
+ typename TransformOp,
136
+ typename Env = ::cuda::std::execution::env<>>
113
137
  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
114
138
  ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
115
139
  RandomAccessIteratorOut output,
116
140
  NumItemsT num_items,
117
141
  TransformOp transform_op,
118
- cudaStream_t stream = nullptr)
142
+ Env env = {})
119
143
  {
120
144
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
121
145
  return TransformInternal(
122
- ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
146
+ ::cuda::std::move(inputs),
147
+ ::cuda::std::move(output),
148
+ num_items,
149
+ detail::transform::always_true_predicate{},
150
+ ::cuda::std::move(transform_op),
151
+ get_stream(env));
123
152
  }
124
153
 
125
154
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -160,21 +189,26 @@ public:
160
189
  //! @param transform_op A unary function object. The input iterator's value type must be convertible to the parameter
161
190
  //! of the function object's call operator. The return type of the call operator must be assignable to the
162
191
  //! dereferenced output iterator.
163
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
164
- template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
192
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
193
+ //! stream\ :sub:`0`
194
+ template <typename RandomAccessIteratorIn,
195
+ typename RandomAccessIteratorOut,
196
+ typename NumItemsT,
197
+ typename TransformOp,
198
+ typename Env = ::cuda::std::execution::env<>>
165
199
  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
166
200
  RandomAccessIteratorIn input,
167
201
  RandomAccessIteratorOut output,
168
202
  NumItemsT num_items,
169
203
  TransformOp transform_op,
170
- cudaStream_t stream = nullptr)
204
+ Env env = {})
171
205
  {
172
206
  return Transform(
173
207
  ::cuda::std::make_tuple(::cuda::std::move(input)),
174
208
  ::cuda::std::move(output),
175
209
  num_items,
176
210
  ::cuda::std::move(transform_op),
177
- stream);
211
+ ::cuda::std::move(env));
178
212
  }
179
213
 
180
214
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -215,10 +249,14 @@ public:
215
249
  //! @param num_items The number of elements to write to the output sequence.
216
250
  //! @param generator A nullary function object. The return type of the call operator must be assignable to the
217
251
  //! dereferenced output iterator.
218
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
219
- template <typename RandomAccessIteratorOut, typename NumItemsT, typename Generator>
252
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
253
+ //! stream\ :sub:`0`
254
+ template <typename RandomAccessIteratorOut,
255
+ typename NumItemsT,
256
+ typename Generator,
257
+ typename Env = ::cuda::std::execution::env<>>
220
258
  CUB_RUNTIME_FUNCTION static cudaError_t
221
- Generate(RandomAccessIteratorOut output, NumItemsT num_items, Generator generator, cudaStream_t stream = nullptr)
259
+ Generate(RandomAccessIteratorOut output, NumItemsT num_items, Generator generator, Env env = {})
222
260
  {
223
261
  static_assert(::cuda::std::is_invocable_v<Generator>, "The passed generator must be a nullary function object");
224
262
  static_assert(
@@ -228,7 +266,12 @@ public:
228
266
 
229
267
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Generate");
230
268
  return TransformInternal(
231
- ::cuda::std::make_tuple(), ::cuda::std::move(output), num_items, ::cuda::std::move(generator), stream);
269
+ ::cuda::std::make_tuple(),
270
+ ::cuda::std::move(output),
271
+ num_items,
272
+ detail::transform::always_true_predicate{},
273
+ ::cuda::std::move(generator),
274
+ get_stream(env));
232
275
  }
233
276
 
234
277
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -262,10 +305,14 @@ public:
262
305
  //! @param output An iterator to the output sequence where num_items results are written to.
263
306
  //! @param num_items The number of elements to write to the output sequence.
264
307
  //! @param value The value to write. Must be assignable to the dereferenced output iterator.
265
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
266
- template <typename RandomAccessIteratorOut, typename NumItemsT, typename Value>
308
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
309
+ //! stream\ :sub:`0`
310
+ template <typename RandomAccessIteratorOut,
311
+ typename NumItemsT,
312
+ typename Value,
313
+ typename Env = ::cuda::std::execution::env<>>
267
314
  CUB_RUNTIME_FUNCTION static cudaError_t
268
- Fill(RandomAccessIteratorOut output, NumItemsT num_items, Value value, cudaStream_t stream = nullptr)
315
+ Fill(RandomAccessIteratorOut output, NumItemsT num_items, Value value, Env env = {})
269
316
  {
270
317
  static_assert(::cuda::std::is_assignable_v<detail::it_reference_t<RandomAccessIteratorOut>, Value>,
271
318
  "The passed value must be assignable to the dereferenced output iterator");
@@ -275,8 +322,9 @@ public:
275
322
  ::cuda::std::make_tuple(),
276
323
  ::cuda::std::move(output),
277
324
  num_items,
325
+ detail::transform::always_true_predicate{},
278
326
  detail::__return_constant<Value>{::cuda::std::move(value)},
279
- stream);
327
+ get_stream(env));
280
328
  }
281
329
 
282
330
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -296,8 +344,7 @@ public:
296
344
  return cudaSuccess;
297
345
  }
298
346
 
299
- return Generate(
300
- ::cuda::std::move(output), num_items, detail::__return_constant<Value>{::cuda::std::move(value)}, stream);
347
+ return Fill(::cuda::std::move(output), num_items, ::cuda::std::move(value), stream);
301
348
  }
302
349
  #endif // _CCCL_DOXYGEN_INVOKED
303
350
 
@@ -333,43 +380,30 @@ public:
333
380
  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
334
381
  //! operator must be assignable to the dereferenced output iterator. Will only be invoked if \p predicate returns
335
382
  //! true.
336
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
383
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
384
+ //! stream\ :sub:`0`
337
385
  template <typename... RandomAccessIteratorsIn,
338
386
  typename RandomAccessIteratorOut,
339
387
  typename NumItemsT,
340
388
  typename Predicate,
341
- typename TransformOp>
389
+ typename TransformOp,
390
+ typename Env = ::cuda::std::execution::env<>>
342
391
  CUB_RUNTIME_FUNCTION static cudaError_t TransformIf(
343
392
  ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
344
393
  RandomAccessIteratorOut output,
345
394
  NumItemsT num_items,
346
395
  Predicate predicate,
347
396
  TransformOp transform_op,
348
- cudaStream_t stream = nullptr)
397
+ Env env = {})
349
398
  {
350
399
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformIf");
351
-
352
- using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
353
- using offset_t = typename choose_offset_t::type;
354
-
355
- // Check if the number of items exceeds the range covered by the selected signed offset type
356
- if (const cudaError_t error = choose_offset_t::is_exceeding_offset_type(num_items); error != cudaSuccess)
357
- {
358
- return error;
359
- }
360
-
361
- return detail::transform::dispatch_t<
362
- detail::transform::requires_stable_address::no,
363
- offset_t,
364
- ::cuda::std::tuple<RandomAccessIteratorsIn...>,
365
- RandomAccessIteratorOut,
366
- Predicate,
367
- TransformOp>::dispatch(::cuda::std::move(inputs),
368
- ::cuda::std::move(output),
369
- num_items,
370
- ::cuda::std::move(predicate),
371
- ::cuda::std::move(transform_op),
372
- stream);
400
+ return TransformInternal(
401
+ ::cuda::std::move(inputs),
402
+ ::cuda::std::move(output),
403
+ num_items,
404
+ ::cuda::std::move(predicate),
405
+ ::cuda::std::move(transform_op),
406
+ get_stream(env));
373
407
  }
374
408
 
375
409
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -435,19 +469,21 @@ public:
435
469
  //! @param transform_op A unary function object. The input iterator's value type must be convertible to the
436
470
  //! parameter of the function object's call operator. The return type of the call operator must be assignable to the
437
471
  //! dereferenced output iterator. Will only be invoked if \p predicate returns true.
438
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
472
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
473
+ //! stream\ :sub:`0`
439
474
  template <typename RandomAccessIteratorIn,
440
475
  typename RandomAccessIteratorOut,
441
476
  typename NumItemsT,
442
477
  typename Predicate,
443
- typename TransformOp>
478
+ typename TransformOp,
479
+ typename Env = ::cuda::std::execution::env<>>
444
480
  CUB_RUNTIME_FUNCTION static cudaError_t TransformIf(
445
481
  RandomAccessIteratorIn input,
446
482
  RandomAccessIteratorOut output,
447
483
  NumItemsT num_items,
448
484
  Predicate predicate,
449
485
  TransformOp transform_op,
450
- cudaStream_t stream = nullptr)
486
+ Env env = {})
451
487
  {
452
488
  return TransformIf(
453
489
  ::cuda::std::make_tuple(::cuda::std::move(input)),
@@ -455,7 +491,7 @@ public:
455
491
  num_items,
456
492
  ::cuda::std::move(predicate),
457
493
  ::cuda::std::move(transform_op),
458
- stream);
494
+ get_stream(env));
459
495
  }
460
496
 
461
497
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -518,39 +554,29 @@ public:
518
554
  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
519
555
  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
520
556
  //! operator must be assignable to the dereferenced output iterator.
521
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
522
- template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
557
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
558
+ //! stream\ :sub:`0`
559
+ template <typename... RandomAccessIteratorsIn,
560
+ typename RandomAccessIteratorOut,
561
+ typename NumItemsT,
562
+ typename TransformOp,
563
+ typename Env = ::cuda::std::execution::env<>>
523
564
  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
524
565
  ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
525
566
  RandomAccessIteratorOut output,
526
567
  NumItemsT num_items,
527
568
  TransformOp transform_op,
528
- cudaStream_t stream = nullptr)
569
+ Env env = {})
529
570
  {
530
571
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
531
-
532
- using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
533
- using offset_t = typename choose_offset_t::type;
534
-
535
- // Check if the number of items exceeds the range covered by the selected signed offset type
536
- cudaError_t error = choose_offset_t::is_exceeding_offset_type(num_items);
537
- if (error)
538
- {
539
- return error;
540
- }
541
-
542
- return detail::transform::dispatch_t<
543
- detail::transform::requires_stable_address::yes,
544
- offset_t,
545
- ::cuda::std::tuple<RandomAccessIteratorsIn...>,
546
- RandomAccessIteratorOut,
547
- detail::transform::always_true_predicate,
548
- TransformOp>::dispatch(::cuda::std::move(inputs),
549
- ::cuda::std::move(output),
550
- num_items,
551
- detail::transform::always_true_predicate{},
552
- ::cuda::std::move(transform_op),
553
- stream);
572
+ return TransformInternal(
573
+ ::cuda::std::move(inputs),
574
+ ::cuda::std::move(output),
575
+ num_items,
576
+ detail::transform::always_true_predicate{},
577
+ ::cuda::std::move(transform_op),
578
+ get_stream(env),
579
+ ::cuda::std::true_type{});
554
580
  }
555
581
 
556
582
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -590,21 +616,26 @@ public:
590
616
  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
591
617
  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
592
618
  //! operator must be assignable to the dereferenced output iterator.
593
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
594
- template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
619
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
620
+ //! stream\ :sub:`0`
621
+ template <typename RandomAccessIteratorIn,
622
+ typename RandomAccessIteratorOut,
623
+ typename NumItemsT,
624
+ typename TransformOp,
625
+ typename Env = ::cuda::std::execution::env<>>
595
626
  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
596
627
  RandomAccessIteratorIn input,
597
628
  RandomAccessIteratorOut output,
598
629
  NumItemsT num_items,
599
630
  TransformOp transform_op,
600
- cudaStream_t stream = nullptr)
631
+ Env env = {})
601
632
  {
602
633
  return TransformStableArgumentAddresses(
603
634
  ::cuda::std::make_tuple(::cuda::std::move(input)),
604
635
  ::cuda::std::move(output),
605
636
  num_items,
606
637
  ::cuda::std::move(transform_op),
607
- stream);
638
+ get_stream(env));
608
639
  }
609
640
 
610
641
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -122,9 +122,8 @@ __launch_bounds__(
122
122
  {
123
123
  // the merge agent loads keys into a local array of KeyIt1::value_type, on which the comparisons are performed
124
124
  using key_t = it_value_t<KeyIt1>;
125
- static_assert(::cuda::std::__invocable<CompareOp, key_t, key_t>::value,
126
- "Comparison operator cannot compare two keys");
127
- static_assert(::cuda::std::is_convertible_v<typename ::cuda::std::__invoke_of<CompareOp, key_t, key_t>::type, bool>,
125
+ static_assert(::cuda::std::is_invocable_v<CompareOp, key_t, key_t>, "Comparison operator cannot compare two keys");
126
+ static_assert(::cuda::std::is_convertible_v<::cuda::std::invoke_result_t<CompareOp, key_t, key_t>, bool>,
128
127
  "Comparison operator must be convertible to bool");
129
128
 
130
129
  using MergeAgent = typename choose_merge_agent<
@@ -144,11 +143,11 @@ __launch_bounds__(
144
143
  auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
145
144
  MergeAgent{
146
145
  temp_storage.Alias(),
147
- try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys1),
148
- try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items1),
146
+ keys1,
147
+ items1,
149
148
  num_keys1,
150
- try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys2),
151
- try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items2),
149
+ keys2,
150
+ items2,
152
151
  num_keys2,
153
152
  keys_result,
154
153
  items_result,
@@ -44,7 +44,6 @@
44
44
  # pragma system_header
45
45
  #endif // no system header
46
46
 
47
- #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
48
47
  #include <cub/device/dispatch/kernels/radix_sort.cuh>
49
48
  #include <cub/device/dispatch/tuning/tuning_radix_sort.cuh>
50
49
  #include <cub/util_debug.cuh>
@@ -1379,14 +1378,6 @@ struct DispatchSegmentedRadixSort
1379
1378
  // Number of radix sort invocations until all segments have been processed
1380
1379
  const auto num_invocations = ::cuda::ceil_div(num_segments, max_num_segments_per_invocation);
1381
1380
 
1382
- // If d_begin_offsets and d_end_offsets do not support operator+ then we can't have more than
1383
- // max_num_segments_per_invocation segments per invocation
1384
- if (num_invocations > 1
1385
- && !detail::all_iterators_support_add_assign_operator(::cuda::std::int64_t{}, d_begin_offsets, d_end_offsets))
1386
- {
1387
- return cudaErrorInvalidValue;
1388
- }
1389
-
1390
1381
  BeginOffsetIteratorT begin_offsets_current_it = d_begin_offsets;
1391
1382
  EndOffsetIteratorT end_offsets_current_it = d_end_offsets;
1392
1383
 
@@ -1435,8 +1426,8 @@ struct DispatchSegmentedRadixSort
1435
1426
 
1436
1427
  if (invocation_index + 1 < num_invocations)
1437
1428
  {
1438
- detail::advance_iterators_inplace_if_supported(begin_offsets_current_it, num_current_segments);
1439
- detail::advance_iterators_inplace_if_supported(end_offsets_current_it, num_current_segments);
1429
+ begin_offsets_current_it += num_current_segments;
1430
+ end_offsets_current_it += num_current_segments;
1440
1431
  }
1441
1432
 
1442
1433
  // Sync the stream if specified to flush runtime errors
@@ -46,7 +46,6 @@
46
46
 
47
47
  #include <cub/detail/launcher/cuda_runtime.cuh>
48
48
  #include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
49
- #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
50
49
  #include <cub/device/dispatch/kernels/reduce.cuh>
51
50
  #include <cub/device/dispatch/kernels/segmented_reduce.cuh>
52
51
  #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
@@ -791,7 +790,7 @@ struct DispatchSegmentedReduce
791
790
  * Function type of cub::DeviceSegmentedReduceKernel
792
791
  *
793
792
  * @param[in] segmented_reduce_kernel
794
- * Kernel function pointer to parameterization of
793
+ * Kernel function pointer to instantiation of
795
794
  * cub::DeviceSegmentedReduceKernel
796
795
  */
797
796
  template <typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
@@ -810,7 +809,8 @@ struct DispatchSegmentedReduce
810
809
  return cudaSuccess;
811
810
  }
812
811
 
813
- // Init kernel configuration
812
+ // Init kernel configuration (computes kernel occupancy)
813
+ // maybe only used inside CUB_DEBUG_LOG code sections
814
814
  [[maybe_unused]] detail::KernelConfig segmented_reduce_config;
815
815
  error =
816
816
  CubDebug(segmented_reduce_config.Init(segmented_reduce_kernel, policy.SegmentedReduce(), launcher_factory));
@@ -823,17 +823,6 @@ struct DispatchSegmentedReduce
823
823
  static_cast<::cuda::std::int64_t>(::cuda::std::numeric_limits<::cuda::std::int32_t>::max());
824
824
  const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
825
825
 
826
- // If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
827
- // streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
828
- // indirect_arg_t as the iterator type, which does not support the + operator.
829
- // TODO (elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
830
- if (num_invocations > 1
831
- && !detail::all_iterators_support_add_assign_operator(
832
- ::cuda::std::int64_t{}, d_out, d_begin_offsets, d_end_offsets))
833
- {
834
- return cudaErrorInvalidValue;
835
- }
836
-
837
826
  for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
838
827
  {
839
828
  const auto current_seg_offset = invocation_index * num_segments_per_invocation;
@@ -851,7 +840,7 @@ struct DispatchSegmentedReduce
851
840
  segmented_reduce_config.sm_occupancy);
852
841
  #endif // CUB_DEBUG_LOG
853
842
 
854
- // Invoke DeviceReduceKernel
843
+ // Invoke DeviceSegmentedReduceKernel
855
844
  launcher_factory(
856
845
  static_cast<::cuda::std::uint32_t>(num_current_segments), policy.SegmentedReduce().BlockThreads(), 0, stream)
857
846
  .doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, reduction_op, init);
@@ -865,9 +854,9 @@ struct DispatchSegmentedReduce
865
854
 
866
855
  if (invocation_index + 1 < num_invocations)
867
856
  {
868
- detail::advance_iterators_inplace_if_supported(d_out, num_current_segments);
869
- detail::advance_iterators_inplace_if_supported(d_begin_offsets, num_current_segments);
870
- detail::advance_iterators_inplace_if_supported(d_end_offsets, num_current_segments);
857
+ d_out += num_current_segments;
858
+ d_begin_offsets += num_current_segments;
859
+ d_end_offsets += num_current_segments;
871
860
  }
872
861
 
873
862
  // Sync the stream if specified to flush runtime errors
@@ -1182,15 +1171,6 @@ struct DispatchFixedSizeSegmentedReduce
1182
1171
 
1183
1172
  const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
1184
1173
 
1185
- // If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
1186
- // streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
1187
- // indirect_arg_t as the iterator type, which does not support the + operator.
1188
- // TODO (srinivas/elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
1189
- if (num_invocations > 1 && !detail::all_iterators_support_plus_operator(::cuda::std::int64_t{}, d_in, d_out))
1190
- {
1191
- return cudaErrorInvalidValue;
1192
- }
1193
-
1194
1174
  cudaError error = cudaSuccess;
1195
1175
  for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
1196
1176
  {
@@ -1204,13 +1184,16 @@ struct DispatchFixedSizeSegmentedReduce
1204
1184
  launcher_factory(
1205
1185
  static_cast<::cuda::std::int32_t>(num_current_blocks), ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream)
1206
1186
  .doit(fixed_size_segmented_reduce_kernel,
1207
- detail::advance_iterators_if_supported(d_in, current_seg_offset * segment_size),
1208
- detail::advance_iterators_if_supported(d_out, current_seg_offset),
1187
+ d_in,
1188
+ d_out,
1209
1189
  segment_size,
1210
1190
  static_cast<::cuda::std::int32_t>(num_current_segments),
1211
1191
  reduction_op,
1212
1192
  init);
1213
1193
 
1194
+ d_in += num_segments_per_invocation * segment_size;
1195
+ d_out += num_segments_per_invocation;
1196
+
1214
1197
  error = CubDebug(cudaPeekAtLastError());
1215
1198
  if (cudaSuccess != error)
1216
1199
  {
@@ -77,7 +77,7 @@ namespace rfa
77
77
  {
78
78
 
79
79
  template <typename Invocable, typename InputT>
80
- using transformed_input_t = ::cuda::std::decay_t<typename ::cuda::std::__invoke_of<Invocable, InputT>::type>;
80
+ using transformed_input_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<Invocable, InputT>>;
81
81
 
82
82
  template <typename InitT, typename InputIteratorT, typename TransformOpT>
83
83
  using accum_t =
@@ -328,11 +328,6 @@ struct DispatchReduceDeterministic
328
328
  // Alias the allocation for the privatized per-block reductions
329
329
  deterministic_accum_t* d_block_reductions = (deterministic_accum_t*) allocations[0];
330
330
 
331
- if (num_chunks > 1 && !detail::all_iterators_support_add_assign_operator(::cuda::std::int32_t{}, d_in))
332
- {
333
- return cudaErrorInvalidValue;
334
- }
335
-
336
331
  auto d_chunk_block_reductions = d_block_reductions;
337
332
  for (int chunk_index = 0; chunk_index < num_chunks; chunk_index++)
338
333
  {
@@ -372,7 +367,7 @@ struct DispatchReduceDeterministic
372
367
 
373
368
  if (chunk_index + 1 < num_chunks)
374
369
  {
375
- detail::advance_iterators_inplace_if_supported(d_in, num_current_items);
370
+ d_in += num_current_items;
376
371
  d_chunk_block_reductions += current_grid_size;
377
372
  }
378
373
 
@@ -20,7 +20,6 @@
20
20
 
21
21
  #include <cub/detail/launcher/cuda_runtime.cuh>
22
22
  #include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
23
- #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
24
23
  #include <cub/device/dispatch/kernels/reduce.cuh>
25
24
  #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
26
25
  #include <cub/grid/grid_even_share.cuh>
@@ -40,7 +40,6 @@
40
40
  #include <cub/detail/device_double_buffer.cuh>
41
41
  #include <cub/detail/temporary_storage.cuh>
42
42
  #include <cub/device/device_partition.cuh>
43
- #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
44
43
  #include <cub/device/dispatch/kernels/segmented_sort.cuh>
45
44
  #include <cub/device/dispatch/tuning/tuning_segmented_sort.cuh>
46
45
  #include <cub/util_debug.cuh>
@@ -764,8 +763,8 @@ private:
764
763
  BeginOffsetIteratorT current_begin_offset = d_begin_offsets;
765
764
  EndOffsetIteratorT current_end_offset = d_end_offsets;
766
765
 
767
- detail::advance_iterators_inplace_if_supported(current_begin_offset, current_seg_offset);
768
- detail::advance_iterators_inplace_if_supported(current_end_offset, current_seg_offset);
766
+ current_begin_offset += current_seg_offset;
767
+ current_end_offset += current_seg_offset;
769
768
 
770
769
  auto medium_indices_iterator =
771
770
  ::cuda::std::make_reverse_iterator(large_and_medium_segments_indices.get() + current_num_segments);
@@ -18,8 +18,8 @@
18
18
 
19
19
  #include <thrust/iterator/constant_iterator.h>
20
20
  #include <thrust/iterator/iterator_adaptor.h>
21
- #include <thrust/iterator/tabulate_output_iterator.h>
22
21
 
22
+ #include <cuda/__iterator/tabulate_output_iterator.h>
23
23
  #include <cuda/std/__functional/identity.h>
24
24
  #include <cuda/std/__utility/swap.h>
25
25
  #include <cuda/std/limits>
@@ -217,8 +217,7 @@ struct dispatch_streaming_arg_reduce_t
217
217
 
218
218
  // The output iterator that implements the logic to accumulate per-partition result to a global aggregate and,
219
219
  // eventually, write to the user-provided output iterators
220
- using accumulating_transform_out_it_t =
221
- THRUST_NS_QUALIFIER::tabulate_output_iterator<accumulating_transform_output_op_t>;
220
+ using accumulating_transform_out_it_t = ::cuda::tabulate_output_iterator<accumulating_transform_output_op_t>;
222
221
 
223
222
  // Empty problem initialization type
224
223
  using empty_problem_init_t = empty_problem_init_t<per_partition_accum_t>;
@@ -270,7 +269,7 @@ struct dispatch_streaming_arg_reduce_t
270
269
  nullptr,
271
270
  allocation_sizes[0],
272
271
  d_indexed_offset_in,
273
- THRUST_NS_QUALIFIER::make_tabulate_output_iterator(accumulating_out_op),
272
+ ::cuda::make_tabulate_output_iterator(accumulating_out_op),
274
273
  static_cast<PerPartitionOffsetT>(largest_partition_size),
275
274
  reduce_op,
276
275
  initial_value,
@@ -315,7 +314,7 @@ struct dispatch_streaming_arg_reduce_t
315
314
  d_temp_storage,
316
315
  temp_storage_bytes,
317
316
  d_indexed_offset_in,
318
- THRUST_NS_QUALIFIER::make_tabulate_output_iterator(accumulating_out_op),
317
+ ::cuda::make_tabulate_output_iterator(accumulating_out_op),
319
318
  static_cast<PerPartitionOffsetT>(current_num_items),
320
319
  reduce_op,
321
320
  initial_value,
@@ -23,7 +23,6 @@
23
23
  #include <cub/util_type.cuh>
24
24
 
25
25
  #include <thrust/iterator/offset_iterator.h>
26
- #include <thrust/iterator/tabulate_output_iterator.h>
27
26
  #include <thrust/iterator/transform_iterator.h>
28
27
  #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
29
28