cuda-cccl 0.3.0-cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2-cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -53,14 +53,8 @@ struct agent_t
 using policy = Policy;

 // key and value type are taken from the first input sequence (consistent with old Thrust behavior)
-using key_type
-using item_type
-
-using keys_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt1>;
-using keys_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt2>;
-using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
-using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
-
+using key_type = it_value_t<KeysIt1>;
+using item_type = it_value_t<ItemsIt1>;
 using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
 using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;

@@ -84,11 +78,11 @@ struct agent_t

 // Per thread data
 temp_storages& storage;
-
-
+KeysIt1 keys1_in;
+ItemsIt1 items1_in;
 Offset keys1_count;
-
-
+KeysIt2 keys2_in;
+ItemsIt2 items2_in;
 Offset keys2_count;
 KeysOutputIt keys_out;
 ItemsOutputIt items_out;
@@ -128,10 +122,14 @@ struct agent_t
 }

 key_type keys_loc[items_per_thread];
-
-
-
-
+{
+auto keys1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys1_in);
+auto keys2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys2_in);
+merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
+keys_loc, keys1_in_cm + keys1_beg, keys2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
+merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
+__syncthreads();
+}

 // now find the merge path for each of thread.
 // we can use int type here, because the number of items in shared memory is limited
@@ -186,11 +184,15 @@ struct agent_t
 if constexpr (have_items)
 {
 item_type items_loc[items_per_thread];
-
-
-
-
-
+{
+auto items1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items1_in);
+auto items2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items2_in);
+merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
+items_loc, items1_in_cm + keys1_beg, items2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
+__syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
+merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
+__syncthreads();
+}

 // gather items from shared mem
 _CCCL_PRAGMA_UNROLL_FULL()
@@ -66,9 +66,28 @@ struct AgentMergeSortPolicy
 static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
 };

+#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
 namespace detail
 {
-
+// Only define this when needed.
+// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
+// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
+// version is always defined, and that's the only one needed for regular CUB operations.
+//
+// TODO: enable this unconditionally once concepts are always available
+CUB_DETAIL_POLICY_WRAPPER_DEFINE(
+MergeSortAgentPolicy,
+(GenericAgentPolicy),
+(BLOCK_THREADS, BlockThreads, int),
+(ITEMS_PER_THREAD, ItemsPerThread, int),
+(ITEMS_PER_TILE, ItemsPerTile, int),
+(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
+(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
+(STORE_ALGORITHM, StoreAlgorithm, cub::BlockStoreAlgorithm))
+} // namespace detail
+#endif // defined(CUB_DEFINE_RUNTIME_POLICIES
+
+namespace detail::merge_sort
 {

 template <typename Policy,
@@ -724,7 +743,6 @@ struct AgentMerge
 }
 };

-} // namespace merge_sort
-} // namespace detail
+} // namespace detail::merge_sort

 CUB_NAMESPACE_END
@@ -146,9 +146,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
  * Thread block abstractions
  ******************************************************************************/

-namespace detail
-{
-namespace radix_sort
+namespace detail::radix_sort
 {

 /**
@@ -783,7 +781,6 @@ struct AgentRadixSortDownsweep
 }
 };

-} // namespace radix_sort
-} // namespace detail
+} // namespace detail::radix_sort

 CUB_NAMESPACE_END
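Note: the hunk above and many of the hunks that follow apply the same mechanical change — a namespace opened as "namespace detail { namespace radix_sort {" (or rle, select, scan_by_key, and so on) is collapsed into a C++17 nested namespace definition. Both spellings open the same namespace; the sketch below is illustrative only (ExampleA and ExampleB are not CUB types), not code taken from this diff.

// Pre-C++17 spelling, as removed by these hunks:
namespace detail
{
namespace radix_sort
{
struct ExampleA
{};
} // namespace radix_sort
} // namespace detail

// C++17 nested namespace definition, as added; it reopens the very same namespace:
namespace detail::radix_sort
{
struct ExampleB
{};
} // namespace detail::radix_sort

// Both illustrative types resolve inside detail::radix_sort:
static_assert(sizeof(detail::radix_sort::ExampleA) == sizeof(detail::radix_sort::ExampleB), "same namespace");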
@@ -85,9 +85,7 @@ struct AgentRadixSortExclusiveSumPolicy
 };
 };

-namespace detail
-{
-namespace radix_sort
+namespace detail::radix_sort
 {

 template <typename AgentRadixSortHistogramPolicy,
@@ -283,7 +281,6 @@ struct AgentRadixSortHistogram
 }
 };

-} // namespace radix_sort
-} // namespace detail
+} // namespace detail::radix_sort

 CUB_NAMESPACE_END
@@ -100,9 +100,7 @@ struct AgentRadixSortOnesweepPolicy : ScalingType
 static constexpr RadixSortStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
 };

-namespace detail
-{
-namespace radix_sort
+namespace detail::radix_sort
 {

 template <typename AgentRadixSortOnesweepPolicy,
@@ -700,7 +698,6 @@ struct AgentRadixSortOnesweep
 }
 };

-} // namespace radix_sort
-} // namespace detail
+} // namespace detail::radix_sort

 CUB_NAMESPACE_END
@@ -103,9 +103,7 @@ struct AgentRadixSortUpsweepPolicy : ScalingType
 * Thread block abstractions
 ******************************************************************************/

-namespace detail
-{
-namespace radix_sort
+namespace detail::radix_sort
 {

 /**
@@ -552,7 +550,6 @@ struct AgentRadixSortUpsweep
 }
 };

-} // namespace radix_sort
-} // namespace detail
+} // namespace detail::radix_sort

 CUB_NAMESPACE_END
@@ -134,9 +134,7 @@ struct AgentRlePolicy
 * Thread block abstractions
 ******************************************************************************/

-namespace detail
-{
-namespace rle
+namespace detail::rle
 {

 /**
@@ -1121,7 +1119,6 @@ struct AgentRle
 }
 };

-} // namespace rle
-} // namespace detail
+} // namespace detail::rle

 CUB_NAMESPACE_END
@@ -51,6 +51,10 @@
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_device.cuh>

+#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+# include <cub/agent/agent_unique_by_key.cuh> // for UniqueByKeyAgentPolicy
+#endif
+
 #include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/is_pointer.h>
 #include <cuda/std/__type_traits/is_same.h>
@@ -123,7 +127,7 @@ namespace detail
 // TODO: enable this unconditionally once concepts are always available
 CUB_DETAIL_POLICY_WRAPPER_DEFINE(
 ScanAgentPolicy,
-(
+(UniqueByKeyAgentPolicy),
 (BLOCK_THREADS, BlockThreads, int),
 (ITEMS_PER_THREAD, ItemsPerThread, int),
 (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
@@ -96,9 +96,7 @@ struct AgentScanByKeyPolicy
 * Thread block abstractions
 ******************************************************************************/

-namespace detail
-{
-namespace scan_by_key
+namespace detail::scan_by_key
 {

 /**
@@ -471,7 +469,6 @@ struct AgentScanByKey
 }
 };

-} // namespace scan_by_key
-} // namespace detail
+} // namespace detail::scan_by_key

 CUB_NAMESPACE_END
@@ -45,9 +45,7 @@

 CUB_NAMESPACE_BEGIN

-namespace detail
-{
-namespace radix_sort
+namespace detail::radix_sort
 {

 /**
@@ -286,7 +284,6 @@ struct AgentSegmentedRadixSort
 }
 };

-} // namespace radix_sort
-} // namespace detail
+} // namespace detail::radix_sort

 CUB_NAMESPACE_END
@@ -126,9 +126,7 @@ struct AgentSelectIfPolicy
 * Thread block abstractions
 ******************************************************************************/

-namespace detail
-{
-namespace select
+namespace detail::select
 {

 template <typename EqualityOpT>
@@ -1114,7 +1112,6 @@ struct AgentSelectIf
 }
 };

-} // namespace select
-} // namespace detail
+} // namespace detail::select

 CUB_NAMESPACE_END
@@ -84,9 +84,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
 } // namespace detail
 #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)

-namespace detail
-{
-namespace sub_warp_merge_sort
+namespace detail::sub_warp_merge_sort
 {

 /**
@@ -343,7 +341,6 @@ private:
 }
 };

-} // namespace sub_warp_merge_sort
-} // namespace detail
+} // namespace detail::sub_warp_merge_sort

 CUB_NAMESPACE_END
@@ -91,9 +91,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
 } // namespace detail
 #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)

-namespace detail
-{
-namespace three_way_partition
+namespace detail::three_way_partition
 {

 template <class OffsetT>
@@ -603,7 +601,6 @@ struct AgentThreeWayPartition
 }
 };

-} // namespace three_way_partition
-} // namespace detail
+} // namespace detail::three_way_partition

 CUB_NAMESPACE_END
@@ -85,13 +85,31 @@ struct AgentUniqueByKeyPolicy
 };
 };

+#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+namespace detail
+{
+// Only define this when needed.
+// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
+// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
+// version is always defined, and that's the only one needed for regular CUB operations.
+//
+// TODO: enable this unconditionally once concepts are always available
+CUB_DETAIL_POLICY_WRAPPER_DEFINE(
+UniqueByKeyAgentPolicy,
+(GenericAgentPolicy),
+(BLOCK_THREADS, BlockThreads, int),
+(ITEMS_PER_THREAD, ItemsPerThread, int),
+(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
+(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
+(SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
+} // namespace detail
+#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+
 /******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

-namespace detail
-{
-namespace unique_by_key
+namespace detail::unique_by_key
 {

 /**
@@ -608,7 +626,6 @@ struct AgentUniqueByKey
 }
 };

-} // namespace unique_by_key
-} // namespace detail
+} // namespace detail::unique_by_key

 CUB_NAMESPACE_END
@@ -111,10 +111,9 @@ CUB_NAMESPACE_BEGIN
 //! // Collectively compute adjacent_difference
 //! int result[4];
 //!
-//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
-//!
-//!
-//! CustomDifference());
+//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, result,
+//! CustomDifference());
+//! }
 //!
 //! Suppose the set of input `thread_data` across the block of threads is
 //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
@@ -283,10 +282,9 @@ public:
 //! ...
 //!
 //! // Collectively compute adjacent_difference
-//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
-//!
-//!
-//! CustomDifference());
+//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, thread_data,
+//! CustomDifference());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
@@ -96,6 +96,7 @@ CUB_NAMESPACE_BEGIN
 //! // Collectively compute head flags for discontinuities in the segment
 //! int head_flags[4];
 //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
@@ -387,6 +388,7 @@ public:
 //! // Collectively compute head flags for discontinuities in the segment
 //! int head_flags[4];
 //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
@@ -463,8 +465,9 @@ public:
 //!
 //! // Collectively compute head flags for discontinuities in the segment
 //! int head_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagHeads(
-//!
+//! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data,
+//! cub::Inequality(), tile_predecessor_item);
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``,
@@ -549,6 +552,7 @@ public:
 //! // Collectively compute tail flags for discontinuities in the segment
 //! int tail_flags[4];
 //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
@@ -640,8 +644,9 @@ public:
 //!
 //! // Collectively compute tail flags for discontinuities in the segment
 //! int tail_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagTails(
-//!
+//! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data,
+//! cub::Inequality(), tile_successor_item);
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -742,8 +747,9 @@ public:
 //! // Collectively compute head and flags for discontinuities in the segment
 //! int head_flags[4];
 //! int tail_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-//!
+//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags, thread_data,
+//! cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -864,8 +870,10 @@ public:
 //! // Collectively compute head and flags for discontinuities in the segment
 //! int head_flags[4];
 //! int tail_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-//!
+//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags,
+//! tile_successor_item, thread_data,
+//! cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -997,9 +1005,10 @@ public:
 //! // Collectively compute head and flags for discontinuities in the segment
 //! int head_flags[4];
 //! int tail_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-//!
-//!
+//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
+//! tail_flags, tile_successor_item,
+//! thread_data, cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
@@ -1126,9 +1135,10 @@ public:
 //! // Collectively compute head and flags for discontinuities in the segment
 //! int head_flags[4];
 //! int tail_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-//!
-//!
+//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
+//! tail_flags, tile_successor_item,
+//! thread_data, cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
@@ -101,6 +101,7 @@ CUB_NAMESPACE_BEGIN
 //!
 //! // Collectively exchange data into a blocked arrangement across threads
 //! BlockExchange(temp_storage).StripedToBlocked(thread_data);
+//! }
 //!
 //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
 //! [1,129,257,385], ..., [127,255,383,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -883,6 +884,7 @@ public:
 //!
 //! // Collectively exchange data into a blocked arrangement across threads
 //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
+//! }
 //!
 //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
 //! [1,129,257,385], ..., [127,255,383,511] }`` after loading from device-accessible memory. The corresponding output
@@ -933,6 +935,7 @@ public:
 //!
 //! // Store data striped across block threads into an ordered tile
 //! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+//! }
 //!
 //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
 //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -983,6 +986,7 @@ public:
 //!
 //! // Collectively exchange data into a blocked arrangement across threads
 //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
+//! }
 //!
 //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is ``{ [0,32,64,96],
 //! [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` after loading from device-accessible memory. (The first 128
@@ -1037,6 +1041,7 @@ public:
 //!
 //! // Store data striped across warp threads into an ordered tile
 //! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+//! }
 //!
 //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
 //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -140,6 +140,7 @@ enum BlockHistogramAlgorithm
 //!
 //! // Compute the block-wide histogram
 //! BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+//! }
 //!
 //! Performance and Usage Considerations
 //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -281,6 +282,7 @@ public:
 //!
 //! // Update the block-wide histogram
 //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+//! }
 //!
 //! @endrst
 //!
@@ -338,6 +340,7 @@ public:
 //!
 //! // Compute the block-wide histogram
 //! BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+//! }
 //!
 //! @endrst
 //!
@@ -399,6 +402,7 @@ public:
 //!
 //! // Update the block-wide histogram
 //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+//! }
 //!
 //! @endrst
 //!
@@ -771,6 +771,7 @@ enum BlockLoadAlgorithm
 //! // Load a segment of consecutive items that are blocked across threads
 //! int thread_data[4];
 //! BlockLoad(temp_storage).Load(d_data, thread_data);
+//! }
 //!
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads in
 //! those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1123,6 +1124,7 @@ public:
 //! // Load a segment of consecutive items that are blocked across threads
 //! int thread_data[4];
 //! BlockLoad(temp_storage).Load(d_data, thread_data);
+//! }
 //!
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads
 //! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1170,6 +1172,7 @@ public:
 //! // Load a segment of consecutive items that are blocked across threads
 //! int thread_data[4];
 //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end);
+//! }
 //!
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of
 //! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``,
@@ -1222,6 +1225,7 @@ public:
 //! // Load a segment of consecutive items that are blocked across threads
 //! int thread_data[4];
 //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1);
+//! }
 //!
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds
 //! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be
@@ -50,6 +50,7 @@

 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/std/__algorithm/max.h>
+#include <cuda/std/__bit/integral.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/is_same.h>
@@ -168,6 +169,7 @@ struct warp_in_block_matcher_t<Bits, 0, PartialWarpId>
 //! block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor);
 //!
 //! ...
+//! }
 //!
 //! Suppose the set of input ``keys`` across the block of threads is ``{ [16,10], [9,11] }``.
 //! The corresponding output ``ranks`` in those threads will be ``{ [3,1], [0,2] }``.
@@ -1072,7 +1074,7 @@ struct BlockRadixRankMatchEarlyCounts
 atomicOr(p_match_mask, lane_mask);
 __syncwarp(WARP_MASK);
 int bin_mask = *p_match_mask;
-int leader = (
+int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
 int warp_offset = 0;
 int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
 if (lane == leader)
@@ -1102,7 +1104,7 @@ struct BlockRadixRankMatchEarlyCounts
 ::cuda::std::uint32_t bin = Digit(keys[u]);
 int bin_mask =
 detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
-int leader = (
+int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
 int warp_offset = 0;
 int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
 if (lane == leader)
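Note: the last two hunks replace the leader computation (truncated in this rendering) with ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask)), i.e. the leader lane is the index of the highest set bit in the warp match mask. The host-side sketch below restates that idea with the standard <bit> header; __bit_log2 is a libcu++ internal, so std::bit_width stands in for it here. This is an illustration under that assumption, not code from the diff.

#include <bit>
#include <cassert>
#include <cstdint>

// For a non-zero mask of lanes that produced the same radix digit, pick the
// highest-numbered lane in the mask as the leader: floor(log2(mask)).
int leader_lane(std::uint32_t bin_mask)
{
  assert(bin_mask != 0);
  return std::bit_width(bin_mask) - 1; // floor-log2, the value __bit_log2 is expected to compute
}

int main()
{
  // Lanes 1, 4 and 9 matched; lane 9 becomes the leader.
  assert(leader_lane(0b1000010010u) == 9);
  return 0;
}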