cuda-cccl 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Advisory: this release of cuda-cccl has been flagged as potentially problematic. Consult the package registry's advisory page for details before upgrading.

Files changed (185) hide show
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -111,10 +111,9 @@ CUB_NAMESPACE_BEGIN
111
111
  //! // Collectively compute adjacent_difference
112
112
  //! int result[4];
113
113
  //!
114
- //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
115
- //! thread_data,
116
- //! result,
117
- //! CustomDifference());
114
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, result,
115
+ //! CustomDifference());
116
+ //! }
118
117
  //!
119
118
  //! Suppose the set of input `thread_data` across the block of threads is
120
119
  //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
@@ -283,10 +282,9 @@ public:
283
282
  //! ...
284
283
  //!
285
284
  //! // Collectively compute adjacent_difference
286
- //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
287
- //! thread_data,
288
- //! thread_data,
289
- //! CustomDifference());
285
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, thread_data,
286
+ //! CustomDifference());
287
+ //! }
290
288
  //!
291
289
  //! Suppose the set of input ``thread_data`` across the block of threads is
292
290
  //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
@@ -96,6 +96,7 @@ CUB_NAMESPACE_BEGIN
96
96
  //! // Collectively compute head flags for discontinuities in the segment
97
97
  //! int head_flags[4];
98
98
  //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
99
+ //! }
99
100
  //!
100
101
  //! Suppose the set of input ``thread_data`` across the block of threads is
101
102
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
@@ -387,6 +388,7 @@ public:
387
388
  //! // Collectively compute head flags for discontinuities in the segment
388
389
  //! int head_flags[4];
389
390
  //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
391
+ //! }
390
392
  //!
391
393
  //! Suppose the set of input ``thread_data`` across the block of threads is
392
394
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
@@ -463,8 +465,9 @@ public:
463
465
  //!
464
466
  //! // Collectively compute head flags for discontinuities in the segment
465
467
  //! int head_flags[4];
466
- //! BlockDiscontinuity(temp_storage).FlagHeads(
467
- //! head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
468
+ //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data,
469
+ //! cub::Inequality(), tile_predecessor_item);
470
+ //! }
468
471
  //!
469
472
  //! Suppose the set of input ``thread_data`` across the block of threads is
470
473
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``,
@@ -549,6 +552,7 @@ public:
549
552
  //! // Collectively compute tail flags for discontinuities in the segment
550
553
  //! int tail_flags[4];
551
554
  //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
555
+ //! }
552
556
  //!
553
557
  //! Suppose the set of input ``thread_data`` across the block of threads is
554
558
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
@@ -640,8 +644,9 @@ public:
640
644
  //!
641
645
  //! // Collectively compute tail flags for discontinuities in the segment
642
646
  //! int tail_flags[4];
643
- //! BlockDiscontinuity(temp_storage).FlagTails(
644
- //! tail_flags, thread_data, cub::Inequality(), tile_successor_item);
647
+ //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data,
648
+ //! cub::Inequality(), tile_successor_item);
649
+ //! }
645
650
  //!
646
651
  //! Suppose the set of input ``thread_data`` across the block of threads is
647
652
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -742,8 +747,9 @@ public:
742
747
  //! // Collectively compute head and flags for discontinuities in the segment
743
748
  //! int head_flags[4];
744
749
  //! int tail_flags[4];
745
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
746
- //! head_flags, tail_flags, thread_data, cub::Inequality());
750
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags, thread_data,
751
+ //! cub::Inequality());
752
+ //! }
747
753
  //!
748
754
  //! Suppose the set of input ``thread_data`` across the block of threads is
749
755
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -864,8 +870,10 @@ public:
864
870
  //! // Collectively compute head and flags for discontinuities in the segment
865
871
  //! int head_flags[4];
866
872
  //! int tail_flags[4];
867
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
868
- //! head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
873
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags,
874
+ //! tile_successor_item, thread_data,
875
+ //! cub::Inequality());
876
+ //! }
869
877
  //!
870
878
  //! Suppose the set of input ``thread_data`` across the block of threads is
871
879
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -997,9 +1005,10 @@ public:
997
1005
  //! // Collectively compute head and flags for discontinuities in the segment
998
1006
  //! int head_flags[4];
999
1007
  //! int tail_flags[4];
1000
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
1001
- //! head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
1002
- //! thread_data, cub::Inequality());
1008
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
1009
+ //! tail_flags, tile_successor_item,
1010
+ //! thread_data, cub::Inequality());
1011
+ //! }
1003
1012
  //!
1004
1013
  //! Suppose the set of input ``thread_data`` across the block of threads is
1005
1014
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
@@ -1126,9 +1135,10 @@ public:
1126
1135
  //! // Collectively compute head and flags for discontinuities in the segment
1127
1136
  //! int head_flags[4];
1128
1137
  //! int tail_flags[4];
1129
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
1130
- //! head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
1131
- //! thread_data, cub::Inequality());
1138
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
1139
+ //! tail_flags, tile_successor_item,
1140
+ //! thread_data, cub::Inequality());
1141
+ //! }
1132
1142
  //!
1133
1143
  //! Suppose the set of input ``thread_data`` across the block of threads is
1134
1144
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
@@ -101,6 +101,7 @@ CUB_NAMESPACE_BEGIN
101
101
  //!
102
102
  //! // Collectively exchange data into a blocked arrangement across threads
103
103
  //! BlockExchange(temp_storage).StripedToBlocked(thread_data);
104
+ //! }
104
105
  //!
105
106
  //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
106
107
  //! [1,129,257,385], ..., [127,255,383,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -883,6 +884,7 @@ public:
883
884
  //!
884
885
  //! // Collectively exchange data into a blocked arrangement across threads
885
886
  //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
887
+ //! }
886
888
  //!
887
889
  //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
888
890
  //! [1,129,257,385], ..., [127,255,383,511] }`` after loading from device-accessible memory. The corresponding output
@@ -933,6 +935,7 @@ public:
933
935
  //!
934
936
  //! // Store data striped across block threads into an ordered tile
935
937
  //! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
938
+ //! }
936
939
  //!
937
940
  //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
938
941
  //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -983,6 +986,7 @@ public:
983
986
  //!
984
987
  //! // Collectively exchange data into a blocked arrangement across threads
985
988
  //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
989
+ //! }
986
990
  //!
987
991
  //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is ``{ [0,32,64,96],
988
992
  //! [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` after loading from device-accessible memory. (The first 128
@@ -1037,6 +1041,7 @@ public:
1037
1041
  //!
1038
1042
  //! // Store data striped across warp threads into an ordered tile
1039
1043
  //! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
1044
+ //! }
1040
1045
  //!
1041
1046
  //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
1042
1047
  //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -140,6 +140,7 @@ enum BlockHistogramAlgorithm
140
140
  //!
141
141
  //! // Compute the block-wide histogram
142
142
  //! BlockHistogram(temp_storage).Histogram(data, smem_histogram);
143
+ //! }
143
144
  //!
144
145
  //! Performance and Usage Considerations
145
146
  //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -281,6 +282,7 @@ public:
281
282
  //!
282
283
  //! // Update the block-wide histogram
283
284
  //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
285
+ //! }
284
286
  //!
285
287
  //! @endrst
286
288
  //!
@@ -338,6 +340,7 @@ public:
338
340
  //!
339
341
  //! // Compute the block-wide histogram
340
342
  //! BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
343
+ //! }
341
344
  //!
342
345
  //! @endrst
343
346
  //!
@@ -399,6 +402,7 @@ public:
399
402
  //!
400
403
  //! // Update the block-wide histogram
401
404
  //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
405
+ //! }
402
406
  //!
403
407
  //! @endrst
404
408
  //!
@@ -771,6 +771,7 @@ enum BlockLoadAlgorithm
771
771
  //! // Load a segment of consecutive items that are blocked across threads
772
772
  //! int thread_data[4];
773
773
  //! BlockLoad(temp_storage).Load(d_data, thread_data);
774
+ //! }
774
775
  //!
775
776
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads in
776
777
  //! those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1123,6 +1124,7 @@ public:
1123
1124
  //! // Load a segment of consecutive items that are blocked across threads
1124
1125
  //! int thread_data[4];
1125
1126
  //! BlockLoad(temp_storage).Load(d_data, thread_data);
1127
+ //! }
1126
1128
  //!
1127
1129
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads
1128
1130
  //! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1170,6 +1172,7 @@ public:
1170
1172
  //! // Load a segment of consecutive items that are blocked across threads
1171
1173
  //! int thread_data[4];
1172
1174
  //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end);
1175
+ //! }
1173
1176
  //!
1174
1177
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of
1175
1178
  //! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``,
@@ -1222,6 +1225,7 @@ public:
1222
1225
  //! // Load a segment of consecutive items that are blocked across threads
1223
1226
  //! int thread_data[4];
1224
1227
  //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1);
1228
+ //! }
1225
1229
  //!
1226
1230
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds
1227
1231
  //! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be
@@ -169,6 +169,7 @@ struct warp_in_block_matcher_t<Bits, 0, PartialWarpId>
169
169
  //! block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor);
170
170
  //!
171
171
  //! ...
172
+ //! }
172
173
  //!
173
174
  //! Suppose the set of input ``keys`` across the block of threads is ``{ [16,10], [9,11] }``.
174
175
  //! The corresponding output ``ranks`` in those threads will be ``{ [3,1], [0,2] }``.
@@ -425,6 +425,7 @@ public:
425
425
  //!
426
426
  //! // Compute the block-wide max for thread0
427
427
  //! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cuda::maximum<>{});
428
+ //! }
428
429
  //!
429
430
  //! @endrst
430
431
  //!
@@ -190,6 +190,7 @@ enum BlockScanAlgorithm
190
190
  //!
191
191
  //! // Collectively compute the block-wide exclusive prefix sum
192
192
  //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
193
+ //! }
193
194
  //!
194
195
  //! Suppose the set of input ``thread_data`` across the block of threads is
195
196
  //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
@@ -333,6 +334,7 @@ public:
333
334
  //!
334
335
  //! // Collectively compute the block-wide exclusive prefix sum
335
336
  //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
337
+ //! }
336
338
  //!
337
339
  //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
338
340
  //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
@@ -386,6 +388,7 @@ public:
386
388
  //! // Collectively compute the block-wide exclusive prefix sum
387
389
  //! int block_aggregate;
388
390
  //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
391
+ //! }
389
392
  //!
390
393
  //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
391
394
  //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
@@ -479,6 +482,7 @@ public:
479
482
  //! // Store scanned items to output segment
480
483
  //! d_data[block_offset + threadIdx.x] = thread_data;
481
484
  //! }
485
+ //! }
482
486
  //!
483
487
  //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
484
488
  //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
@@ -545,6 +549,7 @@ public:
545
549
  //!
546
550
  //! // Collectively compute the block-wide exclusive prefix sum
547
551
  //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
552
+ //! }
548
553
  //!
549
554
  //! Suppose the set of input ``thread_data`` across the block of threads is
550
555
  //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -606,6 +611,7 @@ public:
606
611
  //! // Collectively compute the block-wide exclusive prefix sum
607
612
  //! int block_aggregate;
608
613
  //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
614
+ //! }
609
615
  //!
610
616
  //! Suppose the set of input ``thread_data`` across the block of threads is
611
617
  //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -720,6 +726,7 @@ public:
720
726
  //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
721
727
  //! __syncthreads();
722
728
  //! }
729
+ //! }
723
730
  //!
724
731
  //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
725
732
  //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
@@ -788,6 +795,7 @@ public:
788
795
  //!
789
796
  //! // Collectively compute the block-wide exclusive prefix max scan
790
797
  //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
798
+ //! }
791
799
  //!
792
800
  //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
793
801
  //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -849,8 +857,9 @@ public:
849
857
  //!
850
858
  //! // Collectively compute the block-wide exclusive prefix max scan
851
859
  //! int block_aggregate;
852
- //! BlockScan(temp_storage).ExclusiveScan(
853
- //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
860
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data,
861
+ //! INT_MIN, cuda::maximum<>{}, block_aggregate);
862
+ //! }
854
863
  //!
855
864
  //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
856
865
  //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -960,6 +969,7 @@ public:
960
969
  //! // Store scanned items to output segment
961
970
  //! d_data[block_offset + threadIdx.x] = thread_data;
962
971
  //! }
972
+ //! }
963
973
  //!
964
974
  //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
965
975
  //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -616,6 +616,7 @@ enum BlockStoreAlgorithm
616
616
  //!
617
617
  //! // Store items to linear memory
618
618
  //! BlockStore(temp_storage).Store(d_data, thread_data);
619
+ //! }
619
620
  //!
620
621
  //! Suppose the set of ``thread_data`` across the block of threads is
621
622
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1156,8 +1157,8 @@ public:
1156
1157
  //! ...
1157
1158
  //!
1158
1159
  //! // Store items to linear memory
1159
- //! int thread_data[4];
1160
1160
  //! BlockStore(temp_storage).Store(d_data, thread_data);
1161
+ //! }
1161
1162
  //!
1162
1163
  //! Suppose the set of ``thread_data`` across the block of threads is
1163
1164
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1208,8 +1209,8 @@ public:
1208
1209
  //! ...
1209
1210
  //!
1210
1211
  //! // Store items to linear memory
1211
- //! int thread_data[4];
1212
1212
  //! BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
1213
+ //! }
1213
1214
  //!
1214
1215
  //! Suppose the set of ``thread_data`` across the block of threads is
1215
1216
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`` and ``valid_items`` is ``5``.
@@ -15,71 +15,76 @@
15
15
 
16
16
  #include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
17
17
 
18
+ #include <cuda/std/__mdspan/extents.h>
18
19
  #include <cuda/std/__type_traits/make_unsigned.h>
19
20
  #include <cuda/std/__utility/integer_sequence.h>
20
21
  #include <cuda/std/array>
21
22
  #include <cuda/std/cstddef>
22
- #include <cuda/std/mdspan>
23
23
 
24
24
  CUB_NAMESPACE_BEGIN
25
-
26
25
  namespace detail
27
26
  {
28
27
 
28
+ _CCCL_DIAG_PUSH
29
+ _CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code (even if there are no branches!)
30
+
29
31
  // Compute the submdspan size of a given rank
30
- template <size_t Rank, typename IndexType, size_t Extent0, size_t... Extents>
31
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
32
- sub_size(const ::cuda::std::extents<IndexType, Extent0, Extents...>& ext)
32
+ template <typename IndexType, size_t... Extents>
33
+ [[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
34
+ size_range(const ::cuda::std::extents<IndexType, Extents...>& ext, int start, int end)
33
35
  {
36
+ _CCCL_ASSERT(start >= 0 && end <= static_cast<int>(ext.rank()), "invalid start or end");
34
37
  ::cuda::std::make_unsigned_t<IndexType> s = 1;
35
- for (IndexType i = Rank; i < IndexType{1 + sizeof...(Extents)}; i++) // <- pointless comparison with zero-rank extent
38
+ for (auto i = start; i < end; i++)
36
39
  {
37
40
  s *= ext.extent(i);
38
41
  }
39
42
  return s;
40
43
  }
41
44
 
42
- // avoid pointless comparison of unsigned integer with zero (nvcc 11.x doesn't support nv_diag warning suppression)
43
- template <size_t Rank, typename IndexType>
44
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
45
- sub_size(const ::cuda::std::extents<IndexType>&)
45
+ _CCCL_DIAG_POP // MSVC(4702)
46
+
47
+ template <typename IndexType, size_t... Extents>
48
+ [[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
49
+ size(const ::cuda::std::extents<IndexType, Extents...>& ext)
46
50
  {
47
- return ::cuda::std::make_unsigned_t<IndexType>{1};
51
+ return cub::detail::size_range(ext, 0, static_cast<int>(ext.rank()));
48
52
  }
49
53
 
50
- // TODO: move to cuda::std
51
- template <typename IndexType, size_t... Extents>
52
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
53
- size(const ::cuda::std::extents<IndexType, Extents...>& ext)
54
+ template <bool IsLayoutRight, int Position, typename IndexType, size_t... E>
55
+ [[nodiscard]] _CCCL_API auto sub_size_fast_div_mod_impl(const ::cuda::std::extents<IndexType, E...>& ext)
54
56
  {
55
- return cub::detail::sub_size<0>(ext);
57
+ using fast_mod_div_t = fast_div_mod<IndexType>;
58
+ constexpr auto start = IsLayoutRight ? Position + 1 : 0;
59
+ constexpr auto end = IsLayoutRight ? sizeof...(E) : Position;
60
+ return fast_mod_div_t(cub::detail::size_range(ext, start, end));
56
61
  }
57
62
 
58
63
  // precompute modulo/division for each submdspan size (by rank)
59
- template <typename IndexType, size_t... E, size_t... Ranks>
60
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
61
- sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
64
+ template <bool IsLayoutRight, typename IndexType, size_t... E, size_t... Positions>
65
+ [[nodiscard]] _CCCL_API auto
66
+ sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
62
67
  {
63
- // deduction guides don't work with nvcc 11.x
64
68
  using fast_mod_div_t = fast_div_mod<IndexType>;
65
- return ::cuda::std::array<fast_mod_div_t, sizeof...(Ranks)>{fast_mod_div_t(sub_size<Ranks + 1>(ext))...};
69
+ using array_t = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
70
+ return array_t{cub::detail::sub_size_fast_div_mod_impl<IsLayoutRight, Positions>(ext)...};
66
71
  }
67
72
 
68
73
  // precompute modulo/division for each mdspan extent
69
- template <typename IndexType, size_t... E, size_t... Ranks>
70
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
71
- extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
74
+ template <typename IndexType, size_t... E, size_t... Positions>
75
+ [[nodiscard]] _CCCL_API auto
76
+ extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
72
77
  {
73
78
  using fast_mod_div_t = fast_div_mod<IndexType>;
74
- return ::cuda::std::array<fast_mod_div_t, sizeof...(Ranks)>{fast_mod_div_t(ext.extent(Ranks))...};
79
+ using array_t = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
80
+ return array_t{fast_mod_div_t(ext.extent(Positions))...};
75
81
  }
76
82
 
77
83
  // GCC <= 9 constexpr workaround: Extent must be passed as type only, even const Extent& doesn't work
78
- template <int Rank, typename Extents>
79
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_sub_size_static()
84
+ template <typename Extents>
85
+ [[nodiscard]] _CCCL_API constexpr bool are_extents_in_range_static(int start, int end)
80
86
  {
81
- using index_type = typename Extents::index_type;
82
- for (index_type i = Rank; i < Extents::rank(); i++)
87
+ for (auto i = start; i < end; i++)
83
88
  {
84
89
  if (Extents::static_extent(i) == ::cuda::std::dynamic_extent)
85
90
  {
@@ -106,5 +111,4 @@ template <typename MappingTypeLhs, typename MappingTypeRhs>
106
111
  }
107
112
 
108
113
  } // namespace detail
109
-
110
114
  CUB_NAMESPACE_END
@@ -29,7 +29,7 @@
29
29
 
30
30
  #include <cub/config.cuh>
31
31
 
32
- #include <thrust/detail/algorithm_wrapper.h>
32
+ #include <cuda/std/__cccl/algorithm_wrapper.h>
33
33
 
34
34
  #include <format>
35
35
  #include <string_view>