PyPI - cuda-cccl - Versions diffs - 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)

cuda/cccl/headers/include/cub/block/block_radix_sort.cuh CHANGED

@@ -174,10 +174,12 @@ CUB_NAMESPACE_BEGIN
 //!
 //!    .. code-block:: python
 //!
-//!        import cuda.cccl.cooperative.experimental as cudax
+//!        from cuda import coop
+//!        from pynvjitlink import patch
+//!        patch.patch_numba_linker(lto=True)
 //!
 //!        # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
-//!        block_radix_sort = cudax.block.radix_sort_keys(numba.int32, 128, 4)
+//!        block_radix_sort = coop.block.radix_sort_keys(numba.int32, 128, 4)
 //!        temp_storage_bytes = block_radix_sort.temp_storage_bytes
 //!
 //!        @cuda.jit(link=block_radix_sort.files)

cuda/cccl/headers/include/cub/block/block_reduce.cuh CHANGED

@@ -425,6 +425,7 @@ public:
   //!
   //!        // Compute the block-wide max for thread0
   //!        int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cuda::maximum<>{});
+  //!    }
   //!
   //! @endrst
   //!

cuda/cccl/headers/include/cub/block/block_scan.cuh CHANGED

@@ -190,6 +190,7 @@ enum BlockScanAlgorithm
 //!
 //!        // Collectively compute the block-wide exclusive prefix sum
 //!        BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+//!    }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
@@ -333,6 +334,7 @@ public:
   //!
   //!        // Collectively compute the block-wide exclusive prefix sum
   //!        BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+  //!    }
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
   //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
@@ -386,6 +388,7 @@ public:
   //!        // Collectively compute the block-wide exclusive prefix sum
   //!        int block_aggregate;
   //!        BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+  //!    }
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
   //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
@@ -479,6 +482,7 @@ public:
   //!            // Store scanned items to output segment
   //!            d_data[block_offset + threadIdx.x] = thread_data;
   //!        }
+  //!    }
   //!
   //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
   //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
@@ -545,6 +549,7 @@ public:
   //!
   //!        // Collectively compute the block-wide exclusive prefix sum
   //!        BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+  //!    }
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is
   //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -606,6 +611,7 @@ public:
   //!        // Collectively compute the block-wide exclusive prefix sum
   //!        int block_aggregate;
   //!        BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+  //!    }
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is
   //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -720,6 +726,7 @@ public:
   //!            BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
   //!            __syncthreads();
   //!        }
+  //!    }
   //!
   //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
   //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
@@ -788,6 +795,7 @@ public:
   //!
   //!        // Collectively compute the block-wide exclusive prefix max scan
   //!        BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
+  //!    }
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
   //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -849,8 +857,9 @@ public:
   //!
   //!        // Collectively compute the block-wide exclusive prefix max scan
   //!        int block_aggregate;
-  //!        BlockScan(temp_storage).ExclusiveScan(
-  //!            thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
+  //!        BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data,
+  //!                                              INT_MIN, cuda::maximum<>{}, block_aggregate);
+  //!    }
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
   //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -960,6 +969,7 @@ public:
   //!            // Store scanned items to output segment
   //!            d_data[block_offset + threadIdx.x] = thread_data;
   //!        }
+  //!    }
   //!
   //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
   //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.

cuda/cccl/headers/include/cub/block/block_store.cuh CHANGED

@@ -616,6 +616,7 @@ enum BlockStoreAlgorithm
 //!
 //!        // Store items to linear memory
 //!        BlockStore(temp_storage).Store(d_data, thread_data);
+//!    }
 //!
 //! Suppose the set of ``thread_data`` across the block of threads is
 //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1156,8 +1157,8 @@ public:
   //!        ...
   //!
   //!        // Store items to linear memory
-  //!        int thread_data[4];
   //!        BlockStore(temp_storage).Store(d_data, thread_data);
+  //!    }
   //!
   //! Suppose the set of ``thread_data`` across the block of threads is
   //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1208,8 +1209,8 @@ public:
   //!        ...
   //!
   //!        // Store items to linear memory
-  //!        int thread_data[4];
   //!        BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+  //!    }
   //!
   //! Suppose the set of ``thread_data`` across the block of threads is
   //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`` and ``valid_items`` is ``5``.

cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh CHANGED

@@ -14,6 +14,7 @@
 #endif // no system header
 #include <cuda/__stream/stream_ref.h>
+#include <cuda/std/__cuda/api_wrapper.h>
 #include <cuda/std/cstdint>
 CUB_NAMESPACE_BEGIN

cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh CHANGED

@@ -15,71 +15,76 @@
 #include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
+#include <cuda/std/__mdspan/extents.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/__utility/integer_sequence.h>
 #include <cuda/std/array>
 #include <cuda/std/cstddef>
-#include <cuda/std/mdspan>
 CUB_NAMESPACE_BEGIN
 namespace detail
 {
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code (even if there are no branches!)
 // Compute the submdspan size of a given rank
-template <size_t Rank, typename IndexType, size_t Extent0, size_t... Extents>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
-sub_size(const ::cuda::std::extents<IndexType, Extent0, Extents...>& ext)
+template <typename IndexType, size_t... Extents>
+[[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
+size_range(const ::cuda::std::extents<IndexType, Extents...>& ext, int start, int end)
 {
+  _CCCL_ASSERT(start >= 0 && end <= static_cast<int>(ext.rank()), "invalid start or end");
   ::cuda::std::make_unsigned_t<IndexType> s = 1;
-  for (IndexType i = Rank; i < IndexType{1 + sizeof...(Extents)}; i++) // <- pointless comparison with zero-rank extent
+  for (auto i = start; i < end; i++)
   {
     s *= ext.extent(i);
   }
   return s;
 }
-// avoid pointless comparison of unsigned integer with zero (nvcc 11.x doesn't support nv_diag warning suppression)
-template <size_t Rank, typename IndexType>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
-sub_size(const ::cuda::std::extents<IndexType>&)
+_CCCL_DIAG_POP // MSVC(4702)
+  template <typename IndexType, size_t... Extents>
+  [[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
+  size(const ::cuda::std::extents<IndexType, Extents...>& ext)
 {
-  return ::cuda::std::make_unsigned_t<IndexType>{1};
+  return cub::detail::size_range(ext, 0, static_cast<int>(ext.rank()));
 }
-// TODO: move to cuda::std
-template <typename IndexType, size_t... Extents>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
-size(const ::cuda::std::extents<IndexType, Extents...>& ext)
+template <bool IsLayoutRight, int Position, typename IndexType, size_t... E>
+[[nodiscard]] _CCCL_API auto sub_size_fast_div_mod_impl(const ::cuda::std::extents<IndexType, E...>& ext)
 {
-  return cub::detail::sub_size<0>(ext);
+  using fast_mod_div_t = fast_div_mod<IndexType>;
+  constexpr auto start = IsLayoutRight ? Position + 1 : 0;
+  constexpr auto end   = IsLayoutRight ? sizeof...(E) : Position;
+  return fast_mod_div_t(cub::detail::size_range(ext, start, end));
 }
 // precompute modulo/division for each submdspan size (by rank)
-template <typename IndexType, size_t... E, size_t... Ranks>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
-sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
+template <bool IsLayoutRight, typename IndexType, size_t... E, size_t... Positions>
+[[nodiscard]] _CCCL_API auto
+sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
 {
-  // deduction guides don't work with nvcc 11.x
   using fast_mod_div_t = fast_div_mod<IndexType>;
-  return ::cuda::std::array<fast_mod_div_t, sizeof...(Ranks)>{fast_mod_div_t(sub_size<Ranks + 1>(ext))...};
+  using array_t        = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
+  return array_t{cub::detail::sub_size_fast_div_mod_impl<IsLayoutRight, Positions>(ext)...};
 }
 // precompute modulo/division for each mdspan extent
-template <typename IndexType, size_t... E, size_t... Ranks>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
-extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
+template <typename IndexType, size_t... E, size_t... Positions>
+[[nodiscard]] _CCCL_API auto
+extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
 {
   using fast_mod_div_t = fast_div_mod<IndexType>;
-  return ::cuda::std::array<fast_mod_div_t, sizeof...(Ranks)>{fast_mod_div_t(ext.extent(Ranks))...};
+  using array_t        = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
+  return array_t{fast_mod_div_t(ext.extent(Positions))...};
 }
 // GCC <= 9 constexpr workaround: Extent must be passed as type only, even const Extent& doesn't work
-template <int Rank, typename Extents>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_sub_size_static()
+template <typename Extents>
+[[nodiscard]] _CCCL_API constexpr bool are_extents_in_range_static(int start, int end)
 {
-  using index_type = typename Extents::index_type;
-  for (index_type i = Rank; i < Extents::rank(); i++)
+  for (auto i = start; i < end; i++)
   {
     if (Extents::static_extent(i) == ::cuda::std::dynamic_extent)
     {
@@ -106,5 +111,4 @@ template <typename MappingTypeLhs, typename MappingTypeRhs>
 }
 } // namespace detail
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/detail/ptx-json-parser.h CHANGED

@@ -29,7 +29,7 @@
 #include <cub/config.cuh>
-#include <thrust/detail/algorithm_wrapper.h>
+#include <cuda/std/__cccl/algorithm_wrapper.h>
 #include <format>
 #include <string_view>

cuda/cccl/headers/include/cub/device/device_for.cuh CHANGED

@@ -1,29 +1,5 @@
-/******************************************************************************
- * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
 #pragma once
@@ -41,24 +17,23 @@
 #include <cub/util_namespace.cuh>
 #include <thrust/detail/raw_reference_cast.h>
-#include <thrust/distance.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 #include <thrust/type_traits/unwrap_contiguous_iterator.h>
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__fwd/mdspan.h>
 #include <cuda/std/__iterator/distance.h>
 #include <cuda/std/__mdspan/extents.h>
+#include <cuda/std/__mdspan/layout_left.h>
+#include <cuda/std/__mdspan/layout_right.h>
 #include <cuda/std/__memory/is_sufficiently_aligned.h>
 #include <cuda/std/__type_traits/is_integral.h>
-#include <cuda/std/__utility/integer_sequence.h>
 #include <cuda/std/array>
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace for_each
+namespace detail::for_each
 {
 /**
@@ -122,8 +97,7 @@ struct op_wrapper_vectorized_t
   }
 };
-} // namespace for_each
-} // namespace detail
+} // namespace detail::for_each
 struct DeviceFor
 {
@@ -568,6 +542,10 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::Bulk");
     static_assert(::cuda::std::is_integral_v<ShapeT>, "ShapeT must be an integral type");
+    if (shape == 0)
+    {
+      return cudaSuccess;
+    }
     using offset_t = ShapeT;
     return detail::for_each::dispatch_t<offset_t, OpT>::dispatch(static_cast<offset_t>(shape), op, stream);
   }
@@ -833,7 +811,8 @@ public:
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
   //!
-  //! Iterate through a multi-dimensional extents into
+  //! Iterate through a multi-dimensional extents into a single linear index and a list of indices for each extent
+  //! dimension.
   //!
   //! - a single linear index that represents the current iteration
   //! - indices of each extent dimension
@@ -899,8 +878,6 @@ public:
     OpType op,
     cudaStream_t stream = {})
   {
-    // TODO: check dimensions overflows
-    // TODO: check tha arity of OpType is equal to sizeof...(ExtentsType)
     if (d_temp_storage == nullptr)
     {
       temp_storage_bytes = 1;
@@ -967,19 +944,120 @@ public:
   template <typename IndexType, size_t... Extents, typename OpType>
   CUB_RUNTIME_FUNCTION static cudaError_t
   ForEachInExtents(const ::cuda::std::extents<IndexType, Extents...>& extents, OpType op, cudaStream_t stream = {})
+  {
+    using extents_type = ::cuda::std::extents<IndexType, Extents...>;
+    return cub::DeviceFor::ForEachInLayout(::cuda::std::layout_right::mapping<extents_type>{extents}, op, stream);
+  }
+  /*********************************************************************************************************************
+   * ForEachInLayout
+   ********************************************************************************************************************/
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Iterate through multi-dimensional extents using a specific mdspan layout, applying a function object for each
+  //! element, passing
+  //!
+  //! - a single linear index that represents the current iteration
+  //! - a list of indices containing the coordinates for each extent dimension
+  //!
+  //! The iteration order depends on the layout type:
+  //!
+  //! - ``layout_right``: Iterates in row-major order (rightmost index varies fastest)
+  //! - ``layout_left``: Iterates in column-major order (leftmost index varies fastest)
+  //!
+  //! - The return value of ``op``, if any, is ignored.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use ``ForEachInLayout`` to iterate through a 2D matrix in
+  //! column-major order using ``layout_left``.
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-in-layout-op
+  //!     :end-before: example-end for-each-in-layout-op
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-in-layout-example
+  //!     :end-before: example-end for-each-in-layout-example
+  //!
+  //! @endrst
+  //!
+  //! @tparam Layout
+  //!   **[inferred]** The mdspan layout type, must be either ``cuda::std::layout_left`` or ``cuda::std::layout_right``
+  //!
+  //! @tparam IndexType
+  //!   **[inferred]** An integral type that represents the extent index space
+  //!
+  //! @tparam Extents
+  //!   **[inferred]** The extent sizes for each rank index
+  //!
+  //! @tparam OpType
+  //!   **[inferred]** A function object with arity equal to the number of extents + 1 for the linear index (iteration).
+  //!   The first parameter is the linear index, followed by one parameter for each dimension coordinate.
+  //!
+  //! @param[in] layout
+  //!   Layout object that determines the iteration order (layout_left for column-major, layout_right for row-major)
+  //!
+  //! @param[in] extents
+  //!   Extents object that represents a multi-dimensional index space
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each linear index (iteration) and multi-dimensional coordinates.
+  //!   Called as ``op(linear_index, coord_0, coord_1, ..., coord_n)``
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `nullptr`
+  //!
+  //! @return cudaError_t
+  //!   error status
+  _CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
+  _CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
+  ForEachInLayout(const LayoutMapping& layout_mapping, OpType op, cudaStream_t stream = {})
   {
     using namespace cub::detail;
-    using extents_type      = ::cuda::std::extents<IndexType, Extents...>;
+    using extents_type      = typename LayoutMapping::extents_type;
     using extent_index_type = typename extents_type::index_type;
     using fast_mod_array_t  = ::cuda::std::array<fast_div_mod<extent_index_type>, extents_type::rank()>;
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachInExtents");
     static constexpr auto seq            = ::cuda::std::make_index_sequence<extents_type::rank()>{};
-    fast_mod_array_t sub_sizes_div_array = cub::detail::sub_sizes_fast_div_mod(extents, seq);
+    constexpr bool is_layout_right       = ::cuda::std::__is_any_mdspan_layout_mapping_right_v<LayoutMapping>;
+    auto extents                         = layout_mapping.extents();
+    fast_mod_array_t sub_sizes_div_array = cub::detail::sub_sizes_fast_div_mod<is_layout_right>(extents, seq);
     fast_mod_array_t extents_div_array   = cub::detail::extents_fast_div_mod(extents, seq);
-    for_each::op_wrapper_extents_t<OpType, extents_type, fast_mod_array_t> op_wrapper{
+    for_each::op_wrapper_extents_t<OpType, extents_type, is_layout_right, fast_mod_array_t> op_wrapper{
       op, extents, sub_sizes_div_array, extents_div_array};
     return Bulk(static_cast<implicit_prom_t<extent_index_type>>(cub::detail::size(extents)), op_wrapper, stream);
   }
+#ifndef _CCCL_DOXYGEN_INVOKED
+  _CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
+  _CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ForEachInLayout(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    const LayoutMapping& layout_mapping,
+    OpType op,
+    cudaStream_t stream = {})
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+    return ForEachInLayout(layout_mapping, op, stream);
+  }
+#endif // !_CCCL_DOXYGEN_INVOKED
 };
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/device_reduce.cuh CHANGED

@@ -52,15 +52,15 @@
 #include <cub/thread/thread_operators.cuh>
 #include <cub/util_type.cuh>
-#include <thrust/iterator/tabulate_output_iterator.h>
 #include <cuda/__execution/determinism.h>
 #include <cuda/__execution/require.h>
 #include <cuda/__execution/tune.h>
 #include <cuda/__functional/maximum.h>
 #include <cuda/__functional/minimum.h>
+#include <cuda/__iterator/tabulate_output_iterator.h>
 #include <cuda/__memory_resource/get_memory_resource.h>
 #include <cuda/__stream/get_stream.h>
+#include <cuda/__stream/stream_ref.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/__functional/identity.h>
 #include <cuda/std/__functional/invoke.h>
@@ -70,7 +70,6 @@
 #include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
-#include <cuda/stream_ref>
 CUB_NAMESPACE_BEGIN
@@ -1215,7 +1214,7 @@ public:
     OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
     // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+    auto out_it = ::cuda::make_tabulate_output_iterator(
       detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
     return detail::reduce::dispatch_streaming_arg_reduce_t<
@@ -1341,7 +1340,7 @@ public:
     OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
     // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+    auto out_it = ::cuda::make_tabulate_output_iterator(
       detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
     // Query the required temporary storage size
@@ -1883,7 +1882,7 @@ public:
     OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
     // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+    auto out_it = ::cuda::make_tabulate_output_iterator(
       detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
     return detail::reduce::dispatch_streaming_arg_reduce_t<
@@ -2133,7 +2132,7 @@ public:
     OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
     // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+    auto out_it = ::cuda::make_tabulate_output_iterator(
       detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
     // Query the required temporary storage size