PyPI - cuda-cccl - Versions diffs - 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (185) hide show

cuda/cccl/headers/include/cub/device/device_for.cuh CHANGED Viewed

@@ -1,29 +1,5 @@
-/******************************************************************************
- * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
 #pragma once
@@ -41,24 +17,23 @@
 #include <cub/util_namespace.cuh>
 #include <thrust/detail/raw_reference_cast.h>
-#include <thrust/distance.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 #include <thrust/type_traits/unwrap_contiguous_iterator.h>
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__fwd/mdspan.h>
 #include <cuda/std/__iterator/distance.h>
 #include <cuda/std/__mdspan/extents.h>
+#include <cuda/std/__mdspan/layout_left.h>
+#include <cuda/std/__mdspan/layout_right.h>
 #include <cuda/std/__memory/is_sufficiently_aligned.h>
 #include <cuda/std/__type_traits/is_integral.h>
-#include <cuda/std/__utility/integer_sequence.h>
 #include <cuda/std/array>
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace for_each
+namespace detail::for_each
 {
 /**
@@ -122,8 +97,7 @@ struct op_wrapper_vectorized_t
   }
 };
-} // namespace for_each
-} // namespace detail
+} // namespace detail::for_each
 struct DeviceFor
 {
@@ -568,6 +542,10 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::Bulk");
     static_assert(::cuda::std::is_integral_v<ShapeT>, "ShapeT must be an integral type");
+    if (shape == 0)
+    {
+      return cudaSuccess;
+    }
     using offset_t = ShapeT;
     return detail::for_each::dispatch_t<offset_t, OpT>::dispatch(static_cast<offset_t>(shape), op, stream);
   }
@@ -833,7 +811,8 @@ public:
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
   //!
-  //! Iterate through a multi-dimensional extents into
+  //! Iterate through a multi-dimensional extents into a single linear index and a list of indices for each extent
+  //! dimension.
   //!
   //! - a single linear index that represents the current iteration
   //! - indices of each extent dimension
@@ -899,8 +878,6 @@ public:
     OpType op,
     cudaStream_t stream = {})
   {
-    // TODO: check dimensions overflows
-    // TODO: check tha arity of OpType is equal to sizeof...(ExtentsType)
     if (d_temp_storage == nullptr)
     {
       temp_storage_bytes = 1;
@@ -967,19 +944,120 @@ public:
   template <typename IndexType, size_t... Extents, typename OpType>
   CUB_RUNTIME_FUNCTION static cudaError_t
   ForEachInExtents(const ::cuda::std::extents<IndexType, Extents...>& extents, OpType op, cudaStream_t stream = {})
+  {
+    using extents_type = ::cuda::std::extents<IndexType, Extents...>;
+    return cub::DeviceFor::ForEachInLayout(::cuda::std::layout_right::mapping<extents_type>{extents}, op, stream);
+  }
+  /*********************************************************************************************************************
+   * ForEachInLayout
+   ********************************************************************************************************************/
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Iterate through multi-dimensional extents using a specific mdspan layout, applying a function object for each
+  //! element, passing
+  //!
+  //! - a single linear index that represents the current iteration
+  //! - a list of indices containing the coordinates for each extent dimension
+  //!
+  //! The iteration order depends on the layout type:
+  //!
+  //! - ``layout_right``: Iterates in row-major order (rightmost index varies fastest)
+  //! - ``layout_left``: Iterates in column-major order (leftmost index varies fastest)
+  //!
+  //! - The return value of ``op``, if any, is ignored.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use ``ForEachInLayout`` to iterate through a 2D matrix in
+  //! column-major order using ``layout_left``.
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-in-layout-op
+  //!     :end-before: example-end for-each-in-layout-op
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-in-layout-example
+  //!     :end-before: example-end for-each-in-layout-example
+  //!
+  //! @endrst
+  //!
+  //! @tparam LayoutMapping
+  //!   **[inferred]** The mdspan layout mapping type of ``cuda::std::layout_left`` or ``cuda::std::layout_right``
+  //!
+  //! @tparam IndexType
+  //!   **[inferred]** An integral type that represents the extent index space
+  //!
+  //! @tparam Extents
+  //!   **[inferred]** The extent sizes for each rank index
+  //!
+  //! @tparam OpType
+  //!   **[inferred]** A function object with arity equal to the number of extents + 1 for the linear index (iteration).
+  //!   The first parameter is the linear index, followed by one parameter for each dimension coordinate.
+  //!
+  //! @param[in] layout_mapping
+  //!   Layout mapping object that determines the iteration order: a ``layout_left`` mapping iterates in
+  //!   column-major order, a ``layout_right`` mapping in row-major order.
+  //!   The extents of the mapping (``layout_mapping.extents()``) represent the multi-dimensional index space
+  //!   that is iterated over
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each linear index (iteration) and multi-dimensional coordinates.
+  //!   Called as ``op(linear_index, coord_0, coord_1, ..., coord_n)``
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `nullptr`
+  //!
+  //! @return cudaError_t
+  //!   error status
+  _CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
+  _CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
+  ForEachInLayout(const LayoutMapping& layout_mapping, OpType op, cudaStream_t stream = {})
   {
     using namespace cub::detail;
-    using extents_type      = ::cuda::std::extents<IndexType, Extents...>;
+    using extents_type      = typename LayoutMapping::extents_type;
     using extent_index_type = typename extents_type::index_type;
     using fast_mod_array_t  = ::cuda::std::array<fast_div_mod<extent_index_type>, extents_type::rank()>;
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachInExtents");
     static constexpr auto seq            = ::cuda::std::make_index_sequence<extents_type::rank()>{};
-    fast_mod_array_t sub_sizes_div_array = cub::detail::sub_sizes_fast_div_mod(extents, seq);
+    constexpr bool is_layout_right       = ::cuda::std::__is_any_mdspan_layout_mapping_right_v<LayoutMapping>;
+    auto extents                         = layout_mapping.extents();
+    fast_mod_array_t sub_sizes_div_array = cub::detail::sub_sizes_fast_div_mod<is_layout_right>(extents, seq);
     fast_mod_array_t extents_div_array   = cub::detail::extents_fast_div_mod(extents, seq);
-    for_each::op_wrapper_extents_t<OpType, extents_type, fast_mod_array_t> op_wrapper{
+    for_each::op_wrapper_extents_t<OpType, extents_type, is_layout_right, fast_mod_array_t> op_wrapper{
       op, extents, sub_sizes_div_array, extents_div_array};
     return Bulk(static_cast<implicit_prom_t<extent_index_type>>(cub::detail::size(extents)), op_wrapper, stream);
   }
+#ifndef _CCCL_DOXYGEN_INVOKED
+  _CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
+  _CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ForEachInLayout(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    const LayoutMapping& layout_mapping,
+    OpType op,
+    cudaStream_t stream = {})
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+    return ForEachInLayout(layout_mapping, op, stream);
+  }
+#endif // !_CCCL_DOXYGEN_INVOKED
 };
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/device_reduce.cuh CHANGED Viewed

@@ -52,15 +52,15 @@
 #include <cub/thread/thread_operators.cuh>
 #include <cub/util_type.cuh>
-#include <thrust/iterator/tabulate_output_iterator.h>
 #include <cuda/__execution/determinism.h>
 #include <cuda/__execution/require.h>
 #include <cuda/__execution/tune.h>
 #include <cuda/__functional/maximum.h>
 #include <cuda/__functional/minimum.h>
+#include <cuda/__iterator/tabulate_output_iterator.h>
 #include <cuda/__memory_resource/get_memory_resource.h>
 #include <cuda/__stream/get_stream.h>
+#include <cuda/__stream/stream_ref.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/__functional/identity.h>
 #include <cuda/std/__functional/invoke.h>
@@ -70,7 +70,6 @@
 #include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
-#include <cuda/stream_ref>
 CUB_NAMESPACE_BEGIN
@@ -1215,7 +1214,7 @@ public:
     OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
     // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+    auto out_it = ::cuda::make_tabulate_output_iterator(
       detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
     return detail::reduce::dispatch_streaming_arg_reduce_t<
@@ -1341,7 +1340,7 @@ public:
     OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
     // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+    auto out_it = ::cuda::make_tabulate_output_iterator(
       detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
     // Query the required temporary storage size
@@ -1883,7 +1882,7 @@ public:
     OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
     // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+    auto out_it = ::cuda::make_tabulate_output_iterator(
       detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
     return detail::reduce::dispatch_streaming_arg_reduce_t<
@@ -2133,7 +2132,7 @@ public:
     OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
     // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+    auto out_it = ::cuda::make_tabulate_output_iterator(
       detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
     // Query the required temporary storage size

cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh CHANGED Viewed

@@ -156,14 +156,14 @@ struct DeviceSegmentedReduce
   //!   @rst
   //!   Random-access input iterator to the sequence of beginning offsets of
   //!   length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
-  //!   element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+  //!   element of the *i*\ :sup:`th` data segment in ``d_in``
   //!   @endrst
   //!
   //! @param[in] d_end_offsets
   //!   @rst
   //!   Random-access input iterator to the sequence of ending offsets of length
   //!   ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //!   the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //!   the *i*\ :sup:`th` data segment in ``d_in``.
   //!   If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
   //!   @endrst
   //!
@@ -372,15 +372,14 @@ struct DeviceSegmentedReduce
   //!   @rst
   //!   Random-access input iterator to the sequence of beginning offsets of
   //!   length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
-  //!   element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
-  //!   ``d_values_*``
+  //!   element of the *i*\ :sup:`th` data segment in ``d_in``
   //!   @endrst
   //!
   //! @param[in] d_end_offsets
   //!   @rst
   //!   Random-access input iterator to the sequence of ending offsets of length
   //!   ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //!   the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //!   the *i*\ :sup:`th` data segment in ``d_in``.
   //!   If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
   //!   @endrst
   //!
@@ -578,14 +577,14 @@ struct DeviceSegmentedReduce
   //!   @rst
   //!   Random-access input iterator to the sequence of beginning offsets of
   //!   length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
-  //!   element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+  //!   element of the *i*\ :sup:`th` data segment in ``d_in``
   //!   @endrst
   //!
   //! @param[in] d_end_offsets
   //!   @rst
   //!   Random-access input iterator to the sequence of ending offsets of length
   //!   ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //!   the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //!   the *i*\ :sup:`th` data segment in ``d_in``.
   //!   If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
   //!   @endrst
   //!
@@ -792,14 +791,14 @@ struct DeviceSegmentedReduce
   //!   @rst
   //!   Random-access input iterator to the sequence of beginning offsets of
   //!   length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
-  //!   element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+  //!   element of the *i*\ :sup:`th` data segment in ``d_in``
   //!   @endrst
   //!
   //! @param[in] d_end_offsets
   //!   @rst
   //!   Random-access input iterator to the sequence of ending offsets of length
   //!   ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //!   the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //!   the *i*\ :sup:`th` data segment in ``d_in``.
   //!   If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
   //!   @endrst
   //!
@@ -1037,14 +1036,14 @@ struct DeviceSegmentedReduce
   //!   @rst
   //!   Random-access input iterator to the sequence of beginning offsets of
   //!   length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
-  //!   element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+  //!   element of the *i*\ :sup:`th` data segment in ``d_in``
   //!   @endrst
   //!
   //! @param[in] d_end_offsets
   //!   @rst
   //!   Random-access input iterator to the sequence of ending offsets of length
   //!   ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //!   the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //!   the *i*\ :sup:`th` data segment in ``d_in``.
   //!   If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
   //!   @endrst
   //!
@@ -1249,14 +1248,14 @@ struct DeviceSegmentedReduce
   //!   @rst
   //!   Random-access input iterator to the sequence of beginning offsets of
   //!   length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
-  //!   element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+  //!   element of the *i*\ :sup:`th` data segment in ``d_in``
   //!   @endrst
   //!
   //! @param[in] d_end_offsets
   //!   @rst
   //!   Random-access input iterator to the sequence of ending offsets of length
   //!   ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //!   the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //!   the *i*\ :sup:`th` data segment in ``d_in``.
   //!   If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
   //!   @endrst
   //!