PyPI - cuda-cccl - Versions diffs - 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)

cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh CHANGED Viewed

@@ -387,15 +387,13 @@ struct DispatchTopK
         return error;
       }
-      _CubLog("Invoking topk_kernel<<<{%d,%d,%d}, %d, 0, "
+      _CubLog("Invoking topk_kernel<<<%d, %d, 0, "
               "%lld>>>(), %d items per thread, %d SM occupancy\n",
-              topk_grid_size.x,
-              topk_grid_size.y,
-              topk_grid_size.z,
+              topk_grid_size,
               block_threads,
               (long long) stream,
               items_per_thread,
-              topk_blocks_per_sm);
+              main_kernel_blocks_per_sm);
     }
 #endif // CUB_DEBUG_LOG

cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh CHANGED Viewed

@@ -109,8 +109,9 @@ struct TransformKernelSource<Offset,
     return detail::transform::make_aligned_base_ptr_kernel_arg(it, align);
   }
+private:
   template <typename T>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static auto IsPointerAligned(T it, [[maybe_unused]] int alignment)
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static auto is_pointer_aligned(T it, [[maybe_unused]] int alignment)
   {
     if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(it)>)
     {
@@ -121,6 +122,14 @@ struct TransformKernelSource<Offset,
       return true; // fancy iterators are aligned, since the vectorized kernel chooses a different code path
     }
   }
+public:
+  CUB_RUNTIME_FUNCTION constexpr static bool
+  CanVectorize(int vec_size, const RandomAccessIteratorOut& out, const RandomAccessIteratorsIn&... in)
+  {
+    return is_pointer_aligned(out, sizeof(it_value_t<RandomAccessIteratorOut>) * vec_size)
+        && (is_pointer_aligned(in, sizeof(it_value_t<RandomAccessIteratorsIn>) * vec_size) && ...);
+  }
 };
 enum class requires_stable_address
@@ -384,7 +393,7 @@ struct dispatch_t<StableAddress,
   }
   CUB_DEFINE_SFINAE_GETTER(items_per_thread_no_input, prefetch, ItemsPerThreadNoInput)
-  CUB_DEFINE_SFINAE_GETTER(load_store_word_size, vectorized, LoadStoreWordSize)
+  CUB_DEFINE_SFINAE_GETTER(vec_size, vectorized, VecSize)
   CUB_DEFINE_SFINAE_GETTER(items_per_thread_vectorized, vectorized, ItemsPerThreadVectorized)
 #undef CUB_DEFINE_SFINAE_GETTER
@@ -441,9 +450,8 @@ struct dispatch_t<StableAddress,
     // the policy already handles the compile-time checks if we can vectorize. Do the remaining alignment check here
     if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == wrapped_policy.Algorithm())
     {
-      const int alignment = load_store_word_size(wrapped_policy.AlgorithmPolicy());
-      can_vectorize       = (kernel_source.IsPointerAligned(::cuda::std::get<Is>(in), alignment) && ...)
-                   && kernel_source.IsPointerAligned(out, alignment);
+      const int vs  = vec_size(wrapped_policy.AlgorithmPolicy());
+      can_vectorize = kernel_source.CanVectorize(vs, out, ::cuda::std::get<Is>(in)...);
     }
     int ipt        = 0;

cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh CHANGED Viewed

@@ -14,19 +14,17 @@
 #endif // no system header
 #include <cub/agent/agent_for.cuh>
-#include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
 #include <cub/detail/mdspan_utils.cuh> // is_sub_size_static
 #include <cub/detail/type_traits.cuh> // implicit_prom_t
-#include <cuda/std/__fwd/span.h>
 #include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/is_convertible.h>
 #include <cuda/std/__type_traits/is_reference.h>
 #include <cuda/std/__type_traits/is_trivially_constructible.h>
-#include <cuda/std/__type_traits/is_trivially_copy_constructible.h>
+#include <cuda/std/__type_traits/is_trivially_copy_assignable.h>
 #include <cuda/std/__type_traits/is_trivially_destructible.h>
-#include <cuda/std/__type_traits/is_trivially_move_constructible.h>
+#include <cuda/std/__type_traits/is_trivially_move_assignable.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/__utility/integer_sequence.h>
 #include <cuda/std/cstddef> // size_t
@@ -140,16 +138,21 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
  * ForEachInExtents
  **********************************************************************************************************************/
-// Returns the extent at the given rank. If the extents is static, returns it, otherwise returns the precomputed value
-template <int Rank, typename ExtentType, typename FastDivModType>
-_CCCL_DEVICE _CCCL_FORCEINLINE auto extent_at(ExtentType extents, FastDivModType dynamic_extent)
+// Retrieves the extent (dimension size) at a specific position in a multi-dimensional array
+//
+// This function efficiently returns the extent at the given position, optimizing for static extents by returning
+// compile-time constants when possible. For dynamic extents, it returns the precomputed value to avoid runtime
+// computation overhead.
+template <int Position, typename ExtentType, typename FastDivModType>
+_CCCL_DEVICE_API auto extent_at(ExtentType extents, FastDivModType dynamic_extent)
 {
-  if constexpr (ExtentType::static_extent(Rank) != ::cuda::std::dynamic_extent)
+  if constexpr (ExtentType::static_extent(Position) != ::cuda::std::dynamic_extent)
   {
     using extent_index_type   = typename ExtentType::index_type;
     using index_type          = implicit_prom_t<extent_index_type>;
     using unsigned_index_type = ::cuda::std::make_unsigned_t<index_type>;
-    return static_cast<unsigned_index_type>(extents.static_extent(Rank));
+    constexpr auto extent     = extents.static_extent(Position);
+    return static_cast<unsigned_index_type>(extent);
   }
   else
   {
@@ -157,17 +160,22 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto extent_at(ExtentType extents, FastDivModType
   }
 }
-// Returns the product of all extents from position Rank. If the result is static, returns it, otherwise returns the
-// precomputed value
-template <int Rank, typename ExtentType, typename FastDivModType>
-_CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtentType extents, FastDivModType extent_sub_size)
+// Computes the product of extents in a specified range for multi-dimensional indexing.
+// This function calculates the product of all extent dimensions from Start (inclusive) to End (exclusive).
+//
+// Performance characteristics:
+//  - Static extents in range: Product computed at compile-time, zero runtime cost
+//  - Dynamic extents present: Returns precomputed value, avoiding runtime multiplication
+template <int Start, int End, typename ExtentType, typename FastDivModType>
+_CCCL_DEVICE_API auto get_extents_sub_size(ExtentType extents, FastDivModType extent_sub_size)
 {
-  if constexpr (cub::detail::is_sub_size_static<Rank + 1, ExtentType>())
+  if constexpr (cub::detail::are_extents_in_range_static<ExtentType>(Start, End))
   {
     using extent_index_type   = typename ExtentType::index_type;
     using index_type          = implicit_prom_t<extent_index_type>;
     using unsigned_index_type = ::cuda::std::make_unsigned_t<index_type>;
-    return static_cast<unsigned_index_type>(cub::detail::sub_size<Rank + 1>(extents));
+    auto sub_size             = cub::detail::size_range(extents, Start, End);
+    return static_cast<unsigned_index_type>(sub_size);
   }
   else
   {
@@ -175,49 +183,76 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtentType extents, Fas
   }
 }
-template <int Rank, typename IndexType, typename ExtentType, typename FastDivModType>
-_CCCL_DEVICE _CCCL_FORCEINLINE auto
+// Converts a linear index to a multi-dimensional coordinate at a specific position.
+//
+// This function performs the mathematical conversion from a linear (flat) index to the coordinate value at a specific
+// position in a multi-dimensional array. It supports both row-major (layout_right) and column-major (layout_left)
+// memory layouts, which affects the indexing calculation order.
+//
+// The mathematical formulation depends on the layout:
+// - Right layout (row-major):   index_i = (index / product(extent[j] for j in [i+1, rank-1])) % extent[i]
+// - Left layout (column-major): index_i = (index / product(extent[j] for j in [0, i])) % extent[i]
+//
+// This function leverages precomputed fast division and modulo operations to minimize runtime arithmetic overhead.
+template <bool IsLayoutRight, int Position, typename IndexType, typename ExtentType, typename FastDivModType>
+_CCCL_DEVICE_API auto
 coordinate_at(IndexType index, ExtentType extents, FastDivModType extent_sub_size, FastDivModType dynamic_extent)
 {
   using cub::detail::for_each::extent_at;
   using cub::detail::for_each::get_extents_sub_size;
   using extent_index_type = typename ExtentType::index_type;
-  return static_cast<extent_index_type>(
-    (index / get_extents_sub_size<Rank>(extents, extent_sub_size)) % extent_at<Rank>(extents, dynamic_extent));
+  constexpr auto start    = IsLayoutRight ? Position + 1 : 0;
+  constexpr auto end      = IsLayoutRight ? ExtentType::rank() : Position;
+  return static_cast<extent_index_type>((index / get_extents_sub_size<start, end>(extents, extent_sub_size))
+                                        % extent_at<Position>(extents, dynamic_extent));
 }
-template <typename OpT, typename ExtentsT, typename FastDivModArrayT>
+// Function object wrapper for applying operations with multi-dimensional coordinate conversion.
+//
+// The wrapped operation will be called with signature: `op(linear_index, coord_0, coord_1, ..., coord_n)`
+// where the number of coordinate parameters matches the rank of the extents object.
+//
+// This wrapper is used internally by DeviceFor::ForEachInLayout/ForEachInExtents
+template <typename OpT, typename ExtentsType, bool IsLayoutRight, typename FastDivModArrayT>
 struct op_wrapper_extents_t
 {
-  OpT op;
-  ExtentsT extents;
-  FastDivModArrayT sub_sizes_div_array;
-  FastDivModArrayT extents_mod_array;
-  template <typename OffsetT, size_t... Ranks>
-  _CCCL_DEVICE _CCCL_FORCEINLINE void impl(OffsetT i, ::cuda::std::index_sequence<Ranks...>)
+  OpT op; ///< The user-provided operation to be called with coordinates
+  ExtentsType extents; ///< The multi-dimensional extents defining array dimensions
+  FastDivModArrayT sub_sizes_div_array; ///< Precomputed fast division values for extent sub-products
+  FastDivModArrayT extents_mod_array; ///< Precomputed fast modulo values for individual extents
+  // Internal implementation that converts linear index to coordinates and calls the user operation
+  template <typename IndexType, size_t... Positions>
+  _CCCL_DEVICE_API void impl(IndexType i, ::cuda::std::index_sequence<Positions...>)
   {
     using cub::detail::for_each::coordinate_at;
-    op(i, coordinate_at<Ranks>(i, extents, sub_sizes_div_array[Ranks], extents_mod_array[Ranks])...);
+    op(i,
+       coordinate_at<IsLayoutRight, Positions>(
+         i, extents, sub_sizes_div_array[Positions], extents_mod_array[Positions])...);
   }
-  template <typename OffsetT, size_t... Ranks>
-  _CCCL_DEVICE _CCCL_FORCEINLINE void impl(OffsetT i, ::cuda::std::index_sequence<Ranks...>) const
+  // Internal implementation that converts linear index to coordinates and calls the user operation
+  template <typename IndexType, size_t... Positions>
+  _CCCL_DEVICE_API void impl(IndexType i, ::cuda::std::index_sequence<Positions...>) const
   {
     using cub::detail::for_each::coordinate_at;
-    op(i, coordinate_at<Ranks>(i, extents, sub_sizes_div_array[Ranks], extents_mod_array[Ranks])...);
+    op(i,
+       coordinate_at<IsLayoutRight, Positions>(
+         i, extents, sub_sizes_div_array[Positions], extents_mod_array[Positions])...);
   }
-  template <typename OffsetT>
-  _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i)
+  // Function call operator that processes a linear index by converting it to multi-dimensional coordinates
+  template <typename IndexType>
+  _CCCL_DEVICE_API void operator()(IndexType i)
   {
-    impl(i, ::cuda::std::make_index_sequence<ExtentsT::rank()>{});
+    impl(i, ::cuda::std::make_index_sequence<ExtentsType::rank()>{});
   }
-  template <typename OffsetT>
-  _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i) const
+  // Function call operator that processes a linear index by converting it to multi-dimensional coordinates
+  template <typename IndexType>
+  _CCCL_DEVICE_API void operator()(IndexType i) const
   {
-    impl(i, ::cuda::std::make_index_sequence<ExtentsT::rank()>{});
+    impl(i, ::cuda::std::make_index_sequence<ExtentsType::rank()>{});
   }
 };

cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh CHANGED Viewed

@@ -47,9 +47,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace reduce
+namespace detail::reduce
 {
 /**
@@ -580,7 +578,6 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
   }
 }
-} // namespace reduce
-} // namespace detail
+} // namespace detail::reduce
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh CHANGED Viewed

@@ -42,9 +42,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace scan
+namespace detail::scan
 {
 /******************************************************************************
@@ -186,7 +184,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
   AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile);
 }
-} // namespace scan
-} // namespace detail
+} // namespace detail::scan
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace reduce
+namespace detail::reduce
 {
 /// Normalize input iterator to segment offset
@@ -318,7 +316,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   }
 }
-} // namespace reduce
-} // namespace detail
+} // namespace detail::reduce
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh CHANGED Viewed

@@ -217,6 +217,7 @@ _CCCL_DEVICE void transform_kernel_vectorized(
 {
   constexpr int block_dim        = VectorizedPolicy::block_threads;
   constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
+  constexpr int vec_size         = VectorizedPolicy::vec_size;
   _CCCL_ASSERT(!can_vectorize || (items_per_thread == num_elem_per_thread_prefetch), "");
   constexpr int tile_size = block_dim * items_per_thread;
   const Offset offset     = static_cast<Offset>(blockIdx.x) * tile_size;
@@ -241,23 +242,13 @@ _CCCL_DEVICE void transform_kernel_vectorized(
     out += offset;
   }
-  constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
-  using load_store_t            = decltype(load_store_type<load_store_size>());
-  using output_t                = it_value_t<RandomAccessIteratorOut>;
+  using output_t = it_value_t<RandomAccessIteratorOut>;
   using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
-  // picks output type size if there are no inputs
-  constexpr int element_size     = int{first_nonzero_value(
-    (sizeof(it_value_t<RandomAccessIteratorsIn>)
-     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
-    size_of<output_t>)};
-  constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;
+  constexpr int load_store_count = items_per_thread / vec_size;
+  static_assert(items_per_thread % vec_size == 0, "The items per thread must be a multiple of the vector size");
-  static_assert((items_per_thread * element_size) % load_store_size == 0);
-  static_assert(load_store_size % element_size == 0);
-  constexpr bool can_vectorize_store =
-    THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
-    && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && size_of<output_t> == element_size;
+  constexpr bool can_vectorize_store = THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
+                                    && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t>;
   // if we can vectorize, we convert f's return type to the output type right away, so we can reinterpret later
   using THRUST_NS_QUALIFIER::cuda_cub::core::detail::uninitialized_array;
@@ -266,10 +257,15 @@ _CCCL_DEVICE void transform_kernel_vectorized(
   auto provide_array = [&](auto... inputs) {
     // load inputs
     [[maybe_unused]] auto load_tile = [](auto in, auto& input) {
+      using it_t    = decltype(in);
+      using value_t = it_value_t<it_t>;
       if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
       {
-        auto in_vec    = reinterpret_cast<const load_store_t*>(in) + threadIdx.x;
-        auto input_vec = reinterpret_cast<load_store_t*>(input.data());
+        // TODO(bgruber): we could add a max_load_store_size to the policy to avoid huge load types and huge alignment
+        // requirements
+        using load_t   = decltype(load_store_type<sizeof(value_t) * vec_size>());
+        auto in_vec    = reinterpret_cast<const load_t*>(in) + threadIdx.x;
+        auto input_vec = reinterpret_cast<load_t*>(input.data());
         _CCCL_PRAGMA_UNROLL_FULL()
         for (int i = 0; i < load_store_count; ++i)
         {
@@ -278,15 +274,14 @@ _CCCL_DEVICE void transform_kernel_vectorized(
       }
       else
       {
-        constexpr int elems = load_store_size / element_size;
-        in += threadIdx.x * elems;
+        in += threadIdx.x * vec_size;
         _CCCL_PRAGMA_UNROLL_FULL()
         for (int i = 0; i < load_store_count; ++i)
         {
           _CCCL_PRAGMA_UNROLL_FULL()
-          for (int j = 0; j < elems; ++j)
+          for (int j = 0; j < vec_size; ++j)
           {
-            input[i * elems + j] = in[i * elems * VectorizedPolicy::block_threads + j];
+            input[i * vec_size + j] = in[i * vec_size * VectorizedPolicy::block_threads + j];
           }
         }
       }
@@ -310,8 +305,9 @@ _CCCL_DEVICE void transform_kernel_vectorized(
   if constexpr (can_vectorize_store)
   {
     // vector path
-    auto output_vec = reinterpret_cast<const load_store_t*>(output.data());
-    auto out_vec    = reinterpret_cast<load_store_t*>(out) + threadIdx.x;
+    using store_t   = decltype(load_store_type<sizeof(output_t) * vec_size>());
+    auto output_vec = reinterpret_cast<const store_t*>(output.data());
+    auto out_vec    = reinterpret_cast<store_t*>(out) + threadIdx.x;
     _CCCL_PRAGMA_UNROLL_FULL()
     for (int i = 0; i < load_store_count; ++i)
     {
@@ -321,15 +317,14 @@ _CCCL_DEVICE void transform_kernel_vectorized(
   else
   {
     // serial path
-    constexpr int elems = load_store_size / element_size;
-    out += threadIdx.x * elems;
+    out += threadIdx.x * vec_size;
     _CCCL_PRAGMA_UNROLL_FULL()
     for (int i = 0; i < load_store_count; ++i)
     {
       _CCCL_PRAGMA_UNROLL_FULL()
-      for (int j = 0; j < elems; ++j)
+      for (int j = 0; j < vec_size; ++j)
       {
-        out[i * elems * VectorizedPolicy::block_threads + j] = output[i * elems + j];
+        out[i * vec_size * VectorizedPolicy::block_threads + j] = output[i * vec_size + j];
       }
     }
   }

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace adjacent_difference
+namespace detail::adjacent_difference
 {
 template <typename InputIteratorT, bool MayAlias>
 struct policy_hub
@@ -64,7 +62,6 @@ struct policy_hub
   using MaxPolicy = Policy500;
 };
-} // namespace adjacent_difference
-} // namespace detail
+} // namespace detail::adjacent_difference
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace batch_memcpy
+namespace detail::batch_memcpy
 {
 /**
  * Parameterizable tuning policy type for AgentBatchMemcpy
@@ -115,7 +113,6 @@ struct policy_hub
   using MaxPolicy = Policy700;
 };
-} // namespace batch_memcpy
-} // namespace detail
+} // namespace detail::batch_memcpy
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh CHANGED Viewed

@@ -42,9 +42,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace for_each
+namespace detail::for_each
 {
 struct policy_hub_t
@@ -57,7 +55,6 @@ struct policy_hub_t
   using MaxPolicy = policy_500_t;
 };
-} // namespace for_each
-} // namespace detail
+} // namespace detail::for_each
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh CHANGED Viewed

@@ -46,9 +46,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace histogram
+namespace detail::histogram
 {
 enum class primitive_sample
 {
@@ -272,7 +270,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace histogram
-} // namespace detail
+} // namespace detail::histogram
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh CHANGED Viewed

@@ -42,9 +42,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace merge
+namespace detail::merge
 {
 template <typename KeyT, typename ValueT>
 struct policy_hub
@@ -73,7 +71,6 @@ struct policy_hub
   using max_policy = policy600;
 };
-} // namespace merge
-} // namespace detail
+} // namespace detail::merge
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh CHANGED Viewed

@@ -62,6 +62,14 @@ struct MergeSortPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<decltype(Static
   {}
   CUB_DEFINE_SUB_POLICY_GETTER(MergeSort);
+#if defined(CUB_ENABLE_POLICY_PTX_JSON)
+  _CCCL_DEVICE static constexpr auto EncodedPolicy()
+  {
+    using namespace ptx_json;
+    return object<key<"MergeSortPolicy">() = MergeSort().EncodedPolicy()>();
+  }
+#endif
 };
 template <typename PolicyT>

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh CHANGED Viewed

@@ -46,9 +46,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace radix
+namespace detail::radix
 {
 // sm90 default
 template <size_t KeySize, size_t ValueSize, size_t OffsetSize>
@@ -1062,7 +1060,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace radix
-} // namespace detail
+} // namespace detail::radix
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh CHANGED Viewed

@@ -50,9 +50,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace reduce_by_key
+namespace detail::reduce_by_key
 {
 enum class primitive_key
 {
@@ -939,7 +937,6 @@ struct policy_hub
   };
   using MaxPolicy = Policy1000;
 };
-} // namespace reduce_by_key
-} // namespace detail
+} // namespace detail::reduce_by_key
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh CHANGED Viewed

@@ -52,9 +52,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace rle
+namespace detail::rle
 {
 enum class primitive_key
 {
@@ -670,7 +668,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
 } // namespace non_trivial_runs
-} // namespace rle
-} // namespace detail
+} // namespace detail::rle
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh CHANGED Viewed

@@ -53,9 +53,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace scan
+namespace detail::scan
 {
 enum class keep_rejects
 {
@@ -615,7 +613,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace scan
-} // namespace detail
+} // namespace detail::scan
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh CHANGED Viewed

@@ -49,9 +49,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace scan_by_key
+namespace detail::scan_by_key
 {
 enum class primitive_accum
 {
@@ -1007,7 +1005,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace scan_by_key
-} // namespace detail
+} // namespace detail::scan_by_key
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace segmented_sort
+namespace detail::segmented_sort
 {
 template <typename PolicyT, typename = void>
@@ -395,7 +393,6 @@ struct policy_hub
   using MaxPolicy = Policy860;
 };
-} // namespace segmented_sort
-} // namespace detail
+} // namespace detail::segmented_sort
 CUB_NAMESPACE_END