PyPI - cuda-cccl - Versions diffs - 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (177)

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh CHANGED

@@ -43,10 +43,120 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
+namespace detail::segmented_sort
 {
-namespace segmented_sort
+template <typename PolicyT, typename = void>
+struct SegmentedSortPolicyWrapper : PolicyT
+{
+  CUB_RUNTIME_FUNCTION SegmentedSortPolicyWrapper(PolicyT base)
+      : PolicyT(base)
+  {}
+};
+template <typename StaticPolicyT>
+struct SegmentedSortPolicyWrapper<StaticPolicyT,
+                                  _CUDA_VSTD::void_t<typename StaticPolicyT::LargeSegmentPolicy,
+                                                     typename StaticPolicyT::SmallSegmentPolicy,
+                                                     typename StaticPolicyT::MediumSegmentPolicy>> : StaticPolicyT
 {
+  CUB_RUNTIME_FUNCTION SegmentedSortPolicyWrapper(StaticPolicyT base)
+      : StaticPolicyT(base)
+  {}
+  CUB_RUNTIME_FUNCTION static constexpr auto LargeSegment()
+  {
+    return cub::detail::MakePolicyWrapper(typename StaticPolicyT::LargeSegmentPolicy());
+  }
+  CUB_RUNTIME_FUNCTION static constexpr auto SmallSegment()
+  {
+    return cub::detail::MakePolicyWrapper(typename StaticPolicyT::SmallSegmentPolicy());
+  }
+  CUB_RUNTIME_FUNCTION static constexpr auto MediumSegment()
+  {
+    return cub::detail::MakePolicyWrapper(typename StaticPolicyT::MediumSegmentPolicy());
+  }
+  CUB_RUNTIME_FUNCTION static constexpr int PartitioningThreshold()
+  {
+    return StaticPolicyT::PARTITIONING_THRESHOLD;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr int LargeSegmentRadixBits()
+  {
+    return StaticPolicyT::LargeSegmentPolicy::RADIX_BITS;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr int SegmentsPerSmallBlock()
+  {
+    return StaticPolicyT::SmallSegmentPolicy::SEGMENTS_PER_BLOCK;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr int SegmentsPerMediumBlock()
+  {
+    return StaticPolicyT::MediumSegmentPolicy::SEGMENTS_PER_BLOCK;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr int SmallPolicyItemsPerTile()
+  {
+    return StaticPolicyT::SmallSegmentPolicy::ITEMS_PER_TILE;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr int MediumPolicyItemsPerTile()
+  {
+    return StaticPolicyT::MediumSegmentPolicy::ITEMS_PER_TILE;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr CacheLoadModifier LargeSegmentLoadModifier()
+  {
+    return StaticPolicyT::LargeSegmentPolicy::LOAD_MODIFIER;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr BlockLoadAlgorithm LargeSegmentLoadAlgorithm()
+  {
+    return StaticPolicyT::LargeSegmentPolicy::LOAD_ALGORITHM;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr WarpLoadAlgorithm MediumSegmentLoadAlgorithm()
+  {
+    return StaticPolicyT::MediumSegmentPolicy::LOAD_ALGORITHM;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr WarpLoadAlgorithm SmallSegmentLoadAlgorithm()
+  {
+    return StaticPolicyT::SmallSegmentPolicy::LOAD_ALGORITHM;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr WarpStoreAlgorithm MediumSegmentStoreAlgorithm()
+  {
+    return StaticPolicyT::MediumSegmentPolicy::STORE_ALGORITHM;
+  }
+  CUB_RUNTIME_FUNCTION static constexpr WarpStoreAlgorithm SmallSegmentStoreAlgorithm()
+  {
+    return StaticPolicyT::SmallSegmentPolicy::STORE_ALGORITHM;
+  }
+#if defined(CUB_ENABLE_POLICY_PTX_JSON)
+  _CCCL_DEVICE static constexpr auto EncodedPolicy()
+  {
+    using namespace ptx_json;
+    return object<key<"LargeSegmentPolicy">()    = LargeSegment().EncodedPolicy(),
+                  key<"SmallSegmentPolicy">()    = SmallSegment().EncodedPolicy(),
+                  key<"MediumSegmentPolicy">()   = MediumSegment().EncodedPolicy(),
+                  key<"PartitioningThreshold">() = value<StaticPolicyT::PARTITIONING_THRESHOLD>()>();
+  }
+#endif
+};
+template <typename PolicyT>
+CUB_RUNTIME_FUNCTION SegmentedSortPolicyWrapper<PolicyT> MakeSegmentedSortPolicyWrapper(PolicyT policy)
+{
+  return SegmentedSortPolicyWrapper<PolicyT>{policy};
+}
 template <typename KeyT, typename ValueT>
 struct policy_hub
 {
@@ -71,12 +181,19 @@ struct policy_hub
     static constexpr int ITEMS_PER_SMALL_THREAD  = Nominal4BItemsToItems<DominantT>(7);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(7);
-    using SmallAndMediumSegmentedSortPolicyT     = AgentSmallAndMediumSegmentedSortPolicy<
-          BLOCK_THREADS,
-          // Small policy
-          AgentSubWarpMergeSortPolicy<4 /* Threads per segment */, ITEMS_PER_SMALL_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>,
-          // Medium policy
-          AgentSubWarpMergeSortPolicy<32 /* Threads per segment */, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>;
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  4 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
   };
   struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
@@ -97,12 +214,19 @@ struct policy_hub
     static constexpr int ITEMS_PER_SMALL_THREAD  = Nominal4BItemsToItems<DominantT>(9);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(9);
-    using SmallAndMediumSegmentedSortPolicyT     = AgentSmallAndMediumSegmentedSortPolicy<
-          BLOCK_THREADS,
-          // Small policy
-          AgentSubWarpMergeSortPolicy<4 /* Threads per segment */, ITEMS_PER_SMALL_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>,
-          // Medium policy
-          AgentSubWarpMergeSortPolicy<32 /* Threads per segment */, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>;
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  4 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
   };
   struct Policy610 : ChainedPolicy<610, Policy610, Policy600>
@@ -123,12 +247,19 @@ struct policy_hub
     static constexpr int ITEMS_PER_SMALL_THREAD  = Nominal4BItemsToItems<DominantT>(9);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(9);
-    using SmallAndMediumSegmentedSortPolicyT     = AgentSmallAndMediumSegmentedSortPolicy<
-          BLOCK_THREADS,
-          // Small policy
-          AgentSubWarpMergeSortPolicy<4 /* Threads per segment */, ITEMS_PER_SMALL_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>,
-          // Medium policy
-          AgentSubWarpMergeSortPolicy<32 /* Threads per segment */, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>;
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  4 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
   };
   struct Policy620 : ChainedPolicy<620, Policy620, Policy610>
@@ -149,12 +280,19 @@ struct policy_hub
     static constexpr int ITEMS_PER_SMALL_THREAD  = Nominal4BItemsToItems<DominantT>(9);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(9);
-    using SmallAndMediumSegmentedSortPolicyT     = AgentSmallAndMediumSegmentedSortPolicy<
-          BLOCK_THREADS,
-          // Small policy
-          AgentSubWarpMergeSortPolicy<4 /* Threads per segment */, ITEMS_PER_SMALL_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>,
-          // Medium policy
-          AgentSubWarpMergeSortPolicy<32 /* Threads per segment */, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>;
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  4 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
   };
   struct Policy700 : ChainedPolicy<700, Policy700, Policy620>
@@ -175,15 +313,19 @@ struct policy_hub
     static constexpr int ITEMS_PER_SMALL_THREAD  = Nominal4BItemsToItems<DominantT>(7);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(KEYS_ONLY ? 11 : 7);
-    using SmallAndMediumSegmentedSortPolicyT     = AgentSmallAndMediumSegmentedSortPolicy<
-          BLOCK_THREADS,
-          // Small policy
-          AgentSubWarpMergeSortPolicy<KEYS_ONLY ? 4 : 8 /* Threads per segment */,
-                                      ITEMS_PER_SMALL_THREAD,
-                                      WARP_LOAD_DIRECT,
-                                      LOAD_DEFAULT>,
-          // Medium policy
-          AgentSubWarpMergeSortPolicy<32 /* Threads per segment */, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>;
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  KEYS_ONLY ? 4 : 8 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_DIRECT,
+                                  LOAD_DEFAULT>;
   };
   struct Policy800 : ChainedPolicy<800, Policy800, Policy700>
@@ -202,15 +344,19 @@ struct policy_hub
     static constexpr int ITEMS_PER_SMALL_THREAD  = Nominal4BItemsToItems<DominantT>(9);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(KEYS_ONLY ? 7 : 11);
-    using SmallAndMediumSegmentedSortPolicyT     = AgentSmallAndMediumSegmentedSortPolicy<
-          BLOCK_THREADS,
-          // Small policy
-          AgentSubWarpMergeSortPolicy<KEYS_ONLY ? 4 : 2 /* Threads per segment */,
-                                      ITEMS_PER_SMALL_THREAD,
-                                      WARP_LOAD_TRANSPOSE,
-                                      LOAD_DEFAULT>,
-          // Medium policy
-          AgentSubWarpMergeSortPolicy<32 /* Threads per segment */, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_TRANSPOSE, LOAD_DEFAULT>>;
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  KEYS_ONLY ? 4 : 2 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_TRANSPOSE,
+                                  LOAD_DEFAULT>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  32 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_TRANSPOSE,
+                                  LOAD_DEFAULT>;
   };
   struct Policy860 : ChainedPolicy<860, Policy860, Policy800>
@@ -230,20 +376,23 @@ struct policy_hub
     static constexpr bool LARGE_ITEMS            = sizeof(DominantT) > 4;
     static constexpr int ITEMS_PER_SMALL_THREAD  = Nominal4BItemsToItems<DominantT>(LARGE_ITEMS ? 7 : 9);
     static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems<DominantT>(LARGE_ITEMS ? 9 : 7);
-    using SmallAndMediumSegmentedSortPolicyT     = AgentSmallAndMediumSegmentedSortPolicy<
-          BLOCK_THREADS,
-          // Small policy
-          AgentSubWarpMergeSortPolicy<LARGE_ITEMS ? 8 : 2 /* Threads per segment */,
-                                      ITEMS_PER_SMALL_THREAD,
-                                      WARP_LOAD_TRANSPOSE,
-                                      LOAD_LDG>,
-          // Medium policy
-          AgentSubWarpMergeSortPolicy<16 /* Threads per segment */, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_TRANSPOSE, LOAD_LDG>>;
+    using SmallSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  LARGE_ITEMS ? 8 : 2 /* Threads per segment */,
+                                  ITEMS_PER_SMALL_THREAD,
+                                  WARP_LOAD_TRANSPOSE,
+                                  LOAD_LDG>;
+    using MediumSegmentPolicy =
+      AgentSubWarpMergeSortPolicy<BLOCK_THREADS,
+                                  16 /* Threads per segment */,
+                                  ITEMS_PER_MEDIUM_THREAD,
+                                  WARP_LOAD_TRANSPOSE,
+                                  LOAD_LDG>;
   };
   using MaxPolicy = Policy860;
 };
-} // namespace segmented_sort
-} // namespace detail
+} // namespace detail::segmented_sort
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh CHANGED

@@ -47,9 +47,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace three_way_partition
+namespace detail::three_way_partition
 {
 template <typename PolicyT, typename = void>
@@ -437,7 +435,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace three_way_partition
-} // namespace detail
+} // namespace detail::three_way_partition
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh CHANGED

@@ -282,21 +282,45 @@ _CCCL_HOST_DEVICE constexpr int arch_to_min_bytes_in_flight(int sm_arch)
   return 12 * 1024; // V100 and below
 }
-template <typename T, typename... Ts>
-_CCCL_HOST_DEVICE constexpr bool all_equal([[maybe_unused]] T head, Ts... tail)
+template <typename H, typename... Ts>
+_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal(H head, Ts... values)
 {
-  return ((head == tail) && ...);
+  size_t first = 0;
+  for (size_t v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
+  {
+    if (v == 0)
+    {
+      continue;
+    }
+    if (first == 0)
+    {
+      first = v;
+    }
+    else if (v != first)
+    {
+      return false;
+    }
+  }
+  return true;
 }
-_CCCL_HOST_DEVICE constexpr bool all_equal()
+_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal()
 {
   return true;
 }
-template <typename T, typename... Ts>
-_CCCL_HOST_DEVICE constexpr auto first_item(T head, Ts...) -> T
+template <typename H, typename... Ts>
+_CCCL_HOST_DEVICE constexpr auto first_nonzero_value(H head, Ts... values)
 {
-  return head;
+  for (auto v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
+  {
+    if (v != 0)
+    {
+      return v;
+    }
+  }
+  // we only reach here when all input are not contiguous and the output has a void value type
+  return H{1};
 }
 template <typename T>
@@ -336,25 +360,36 @@ struct policy_hub<RequiresStableAddress,
     (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn> && ...);
   static constexpr bool all_input_values_trivially_reloc =
     (THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>> && ...);
-  static constexpr bool can_memcpy_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
+  static constexpr bool can_memcpy_all_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
+  // the vectorized kernel supports mixing contiguous and non-contiguous iterators
+  static constexpr bool can_memcpy_contiguous_inputs =
+    ((!THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>
+      || THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>>)
+     && ...);
   // for vectorized policy:
-  static constexpr bool all_input_values_same_size = all_equal(sizeof(it_value_t<RandomAccessIteratorsIn>)...);
-  static constexpr int load_store_word_size        = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
-  // if there are no inputs, we take the size of the output value
-  static constexpr int value_type_size =
-    first_item(int{sizeof(it_value_t<RandomAccessIteratorsIn>)}..., int{size_of<it_value_t<RandomAccessIteratorOut>>});
+  static constexpr bool all_contiguous_input_values_same_size = all_nonzero_equal(
+    (sizeof(it_value_t<RandomAccessIteratorsIn>)
+     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...);
+  static constexpr int load_store_word_size = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
+  // find the value type size of the first contiguous iterator. if there are no inputs, we take the size of the output
+  // value type
+  static constexpr int contiguous_value_type_size = first_nonzero_value(
+    (int{sizeof(it_value_t<RandomAccessIteratorsIn>)}
+     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
+    int{size_of<it_value_t<RandomAccessIteratorOut>>});
   static constexpr bool value_type_divides_load_store_size =
-    load_store_word_size % value_type_size == 0; // implicitly checks that value_type_size <= load_store_word_size
+    load_store_word_size % contiguous_value_type_size == 0; // implicitly checks that value_type_size <=
+                                                            // load_store_word_size
   static constexpr int target_bytes_per_thread =
     no_input_streams ? 16 /* by experiment on RTX 5090 */ : 32 /* guestimate by gevtushenko for loading */;
   static constexpr int items_per_thread_vec =
-    ::cuda::round_up(target_bytes_per_thread, load_store_word_size) / value_type_size;
+    ::cuda::round_up(target_bytes_per_thread, load_store_word_size) / contiguous_value_type_size;
   using default_vectorized_policy_t = vectorized_policy_t<256, items_per_thread_vec, load_store_word_size>;
   static constexpr bool fallback_to_prefetch =
-    RequiresStableAddress || !can_memcpy_inputs || !all_input_values_same_size || !value_type_divides_load_store_size
-    || !DenseOutput;
+    RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_contiguous_input_values_same_size
+    || !value_type_divides_load_store_size || !DenseOutput;
   // TODO(bgruber): consider a separate kernel for just filling
@@ -380,7 +415,7 @@ struct policy_hub<RequiresStableAddress,
         block_threads* async_policy::min_items_per_thread,
         ldgsts_size_and_align)
       > int{max_smem_per_block};
-    static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams;
+    static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams || !can_memcpy_all_inputs;
   public:
     static constexpr int min_bif = arch_to_min_bytes_in_flight(800);
@@ -421,7 +456,8 @@ struct policy_hub<RequiresStableAddress,
       (((int{sizeof(it_value_t<RandomAccessIteratorsIn>)} * AsyncBlockSize) % max_alignment == 0) && ...);
     static constexpr bool enough_threads_for_peeling = AsyncBlockSize >= alignment; // head and tail bytes
     static constexpr bool fallback_to_vectorized =
-      exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams;
+      exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams
+      || !can_memcpy_all_inputs;
   public:
     static constexpr int min_bif = arch_to_min_bytes_in_flight(PtxVersion);

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh CHANGED

@@ -788,6 +788,16 @@ struct UniqueByKeyPolicyWrapper<StaticPolicyT,
   {
     return cub::detail::MakePolicyWrapper(typename StaticPolicyT::UniqueByKeyPolicyT());
   }
+#if defined(CUB_ENABLE_POLICY_PTX_JSON)
+  _CCCL_DEVICE static constexpr auto EncodedPolicy()
+  {
+    using namespace ptx_json;
+    return object<key<"UniqueByKeyPolicyT">() = UniqueByKey().EncodedPolicy(),
+                  key<"DelayConstructor">() =
+                    StaticPolicyT::UniqueByKeyPolicyT::detail::delay_constructor_t::EncodedConstructor()>();
+  }
+#endif
 };
 template <typename PolicyT>

cuda/cccl/headers/include/cub/util_device.cuh CHANGED

@@ -47,7 +47,6 @@
 // for backward compatibility
 #include <cub/util_temporary_storage.cuh>
-#include <cuda/std/__cuda/ensure_current_device.h> // IWYU pragma: export
 #include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__utility/forward.h>
 #include <cuda/std/array>
@@ -104,7 +103,34 @@ CUB_RUNTIME_FUNCTION inline int CurrentDevice()
 //! @brief RAII helper which saves the current device and switches to the specified device on construction and switches
 //! to the saved device on destruction.
-using SwitchDevice = ::cuda::__ensure_current_device;
+class SwitchDevice
+{
+  int target_device_;
+  int original_device_;
+public:
+  //! @brief Queries the current device and if that is different than @p target_device sets the current device to
+  //! @p target_device
+  SwitchDevice(const int target_device)
+      : target_device_(target_device)
+  {
+    CubDebug(cudaGetDevice(&original_device_));
+    if (original_device_ != target_device_)
+    {
+      CubDebug(cudaSetDevice(target_device_));
+    }
+  }
+  //! @brief If the @p original_device was not equal to @p target_device sets the current device back to
+  //! @p original_device
+  ~SwitchDevice()
+  {
+    if (original_device_ != target_device_)
+    {
+      CubDebug(cudaSetDevice(original_device_));
+    }
+  }
+};
 #  endif // _CCCL_DOXYGEN_INVOKED
@@ -684,16 +710,31 @@ struct KernelConfig
     return launcher_factory.MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
   }
 };
 } // namespace detail
 #endif // !_CCCL_COMPILER(NVRTC)
+namespace detail
+{
+template <typename T>
+struct get_active_policy
+{
+  using type = typename T::ActivePolicy;
+};
+} // namespace detail
 /// Helper for dispatching into a policy chain
 template <int PolicyPtxVersion, typename PolicyT, typename PrevPolicyT>
 struct ChainedPolicy
 {
+private:
+  static constexpr bool have_previous_policy = !::cuda::std::is_same_v<PolicyT, PrevPolicyT>;
+public:
   /// The policy for the active compiler pass
-  using ActivePolicy = ::cuda::std::_If<(CUB_PTX_ARCH < PolicyPtxVersion), typename PrevPolicyT::ActivePolicy, PolicyT>;
+  using ActivePolicy =
+    typename ::cuda::std::_If<(CUB_PTX_ARCH < PolicyPtxVersion && have_previous_policy),
+                              detail::get_active_policy<PrevPolicyT>,
+                              ::cuda::std::type_identity<PolicyT>>::type;
 #if !_CCCL_COMPILER(NVRTC)
   /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
@@ -708,9 +749,12 @@ struct ChainedPolicy
 #  elif defined(NV_TARGET_SM_INTEGER_LIST)
     return runtime_to_compiletime<10, NV_TARGET_SM_INTEGER_LIST>(device_ptx_version, op);
 #  else
-    if (device_ptx_version < PolicyPtxVersion)
+    if constexpr (have_previous_policy)
     {
-      return PrevPolicyT::Invoke(device_ptx_version, op);
+      if (device_ptx_version < PolicyPtxVersion)
+      {
+        return PrevPolicyT::Invoke(device_ptx_version, op);
+      }
     }
     return op.template Invoke<PolicyT>();
 #  endif
@@ -738,7 +782,7 @@ private:
   template <int DevicePtxVersion, typename FunctorT>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op)
   {
-    if constexpr (DevicePtxVersion < PolicyPtxVersion)
+    if constexpr (DevicePtxVersion < PolicyPtxVersion && have_previous_policy)
     {
       return PrevPolicyT::template invoke_static<DevicePtxVersion>(op);
     }
@@ -749,34 +793,6 @@ private:
   }
 #endif // !_CCCL_COMPILER(NVRTC)
 };
-/// Helper for dispatching into a policy chain (end-of-chain specialization)
-template <int PolicyPtxVersion, typename PolicyT>
-struct ChainedPolicy<PolicyPtxVersion, PolicyT, PolicyT>
-{
-  template <int, typename, typename>
-  friend struct ChainedPolicy; // befriend primary template, so it can call invoke_static
-  /// The policy for the active compiler pass
-  using ActivePolicy = PolicyT;
-#if !_CCCL_COMPILER(NVRTC)
-  /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
-  template <typename FunctorT>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op)
-  {
-    return op.template Invoke<PolicyT>();
-  }
-private:
-  template <int, typename FunctorT>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op)
-  {
-    return op.template Invoke<PolicyT>();
-  }
-#endif // !_CCCL_COMPILER(NVRTC)
-};
 CUB_NAMESPACE_END
 #if _CCCL_HAS_CUDA_COMPILER() && !_CCCL_COMPILER(NVRTC)

cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh CHANGED

@@ -51,6 +51,7 @@
 #include <cuda/__functional/maximum.h>
 #include <cuda/__functional/minimum.h>
 #include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__bit/countr.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/integral_constant.h>
@@ -701,7 +702,7 @@ struct WarpReduceShfl
   _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
   {
     // Get the start flags for each thread in the warp.
-    int warp_flags = __ballot_sync(member_mask, flag);
+    unsigned warp_flags = __ballot_sync(member_mask, flag);
     // Convert to tail-segmented
     if (HEAD_SEGMENTED)
@@ -722,7 +723,7 @@ struct WarpReduceShfl
     warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
     // Find the next set flag
-    int last_lane = __clz(__brev(warp_flags));
+    int last_lane = ::cuda::std::countr_zero(warp_flags);
     T output = input;
     // Template-iterate reduction steps

cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh CHANGED

@@ -49,6 +49,7 @@
 #include <cub/util_type.cuh>
 #include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__bit/countr.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 CUB_NAMESPACE_BEGIN
@@ -215,7 +216,7 @@ struct WarpReduceSmem
   SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/)
   {
     // Get the start flags for each thread in the warp.
-    int warp_flags = __ballot_sync(member_mask, flag);
+    unsigned warp_flags = __ballot_sync(member_mask, flag);
     if (!HEAD_SEGMENTED)
     {
@@ -232,7 +233,7 @@ struct WarpReduceSmem
     }
     // Find next flag
-    int next_flag = __clz(__brev(warp_flags));
+    int next_flag = ::cuda::std::countr_zero(warp_flags);
     // Clip the next segment at the warp boundary if necessary
     if (LOGICAL_WARP_THREADS != 32)