PyPI - cuda-cccl - Versions diffs - 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. More details are linked from the original report page.

Files changed (177)

cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh CHANGED Viewed

@@ -47,9 +47,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace reduce
+namespace detail::reduce
 {
 /**
@@ -172,6 +170,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
                 AccumT,
                 TransformOpT>;
+  static_assert(sizeof(typename AgentReduceT::TempStorage) <= max_smem_per_block,
+                "cub::DeviceReduce ran out of CUDA shared memory, which we judged to be extremely unlikely. Please "
+                "file an issue at: https://github.com/NVIDIA/cccl/issues");
   // Shared memory storage
   __shared__ typename AgentReduceT::TempStorage temp_storage;
@@ -253,6 +255,10 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
                 AccumT,
                 TransformOpT>;
+  static_assert(sizeof(typename AgentReduceT::TempStorage) <= max_smem_per_block,
+                "cub::DeviceReduce ran out of CUDA shared memory, which we judged to be extremely unlikely. Please "
+                "file an issue at: https://github.com/NVIDIA/cccl/issues");
   // Shared memory storage
   __shared__ typename AgentReduceT::TempStorage temp_storage;
@@ -572,7 +578,6 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
   }
 }
-} // namespace reduce
-} // namespace detail
+} // namespace detail::reduce
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh CHANGED Viewed

@@ -42,9 +42,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace scan
+namespace detail::scan
 {
 /******************************************************************************
@@ -186,7 +184,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
   AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile);
 }
-} // namespace scan
-} // namespace detail
+} // namespace detail::scan
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace reduce
+namespace detail::reduce
 {
 /// Normalize input iterator to segment offset
@@ -318,7 +316,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   }
 }
-} // namespace reduce
-} // namespace detail
+} // namespace detail::reduce
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh CHANGED Viewed

@@ -29,6 +29,56 @@ using local_segment_index_t = ::cuda::std::uint32_t;
 // Type used for total number of segments and to index within segments globally
 using global_segment_offset_t = ::cuda::std::int64_t;
+template <typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+struct LargeSegmentsSelectorT
+{
+  OffsetT value{};
+  BeginOffsetIteratorT d_offset_begin{};
+  EndOffsetIteratorT d_offset_end{};
+  global_segment_offset_t base_segment_offset{};
+#if !_CCCL_COMPILER(NVRTC)
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
+  LargeSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end)
+      : value(value)
+      , d_offset_begin(d_offset_begin)
+      , d_offset_end(d_offset_end)
+  {}
+#endif
+  _CCCL_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const
+  {
+    const OffsetT segment_size =
+      d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id];
+    return segment_size > value;
+  }
+};
+template <typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+struct SmallSegmentsSelectorT
+{
+  OffsetT value{};
+  BeginOffsetIteratorT d_offset_begin{};
+  EndOffsetIteratorT d_offset_end{};
+  global_segment_offset_t base_segment_offset{};
+#if !_CCCL_COMPILER(NVRTC)
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
+  SmallSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end)
+      : value(value)
+      , d_offset_begin(d_offset_begin)
+      , d_offset_end(d_offset_end)
+  {}
+#endif
+  _CCCL_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const
+  {
+    const OffsetT segment_size =
+      d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id];
+    return segment_size < value;
+  }
+};
 /**
  * @brief Fallback kernel, in case there's not enough segments to
  *        take advantage of partitioning.
@@ -89,7 +139,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD
 {
   using ActivePolicyT       = typename ChainedPolicyT::ActivePolicy;
   using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy;
-  using MediumPolicyT       = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT::MediumPolicyT;
+  using MediumPolicyT       = typename ActivePolicyT::MediumSegmentPolicy;
   const auto segment_id = static_cast<local_segment_index_t>(blockIdx.x);
   OffsetT segment_begin = d_begin_offsets[segment_id];
@@ -253,7 +303,7 @@ template <SortOrder Order,
           typename BeginOffsetIteratorT,
           typename EndOffsetIteratorT,
           typename OffsetT>
-__launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS)
+__launch_bounds__(ChainedPolicyT::ActivePolicy::SmallSegmentPolicy::BLOCK_THREADS)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall(
     local_segment_index_t small_segments,
     local_segment_index_t medium_segments,
@@ -272,10 +322,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic
   const local_segment_index_t tid = threadIdx.x;
   const local_segment_index_t bid = blockIdx.x;
-  using ActivePolicyT         = typename ChainedPolicyT::ActivePolicy;
-  using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT;
-  using MediumPolicyT         = typename SmallAndMediumPolicyT::MediumPolicyT;
-  using SmallPolicyT          = typename SmallAndMediumPolicyT::SmallPolicyT;
+  using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
+  using SmallPolicyT  = typename ActivePolicyT::SmallSegmentPolicy;
+  using MediumPolicyT = typename ActivePolicyT::MediumSegmentPolicy;
   constexpr auto threads_per_medium_segment = static_cast<local_segment_index_t>(MediumPolicyT::WARP_THREADS);
   constexpr auto threads_per_small_segment  = static_cast<local_segment_index_t>(SmallPolicyT::WARP_THREADS);
@@ -286,11 +335,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic
   using SmallAgentWarpMergeSortT =
     sub_warp_merge_sort::AgentSubWarpSort<Order == SortOrder::Descending, SmallPolicyT, KeyT, ValueT, OffsetT>;
-  constexpr auto segments_per_medium_block =
-    static_cast<local_segment_index_t>(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK);
+  constexpr auto segments_per_medium_block = static_cast<local_segment_index_t>(MediumPolicyT::SEGMENTS_PER_BLOCK);
-  constexpr auto segments_per_small_block =
-    static_cast<local_segment_index_t>(SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK);
+  constexpr auto segments_per_small_block = static_cast<local_segment_index_t>(SmallPolicyT::SEGMENTS_PER_BLOCK);
   __shared__ union
   {

cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh CHANGED Viewed

@@ -202,14 +202,18 @@ _CCCL_HOST_DEVICE _CCCL_CONSTEVAL auto load_store_type()
   }
 }
-template <typename VectorizedPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InputT>
+template <typename VectorizedPolicy,
+          typename Offset,
+          typename F,
+          typename RandomAccessIteratorOut,
+          typename... RandomAccessIteratorsIn>
 _CCCL_DEVICE void transform_kernel_vectorized(
   Offset num_items,
   int num_elem_per_thread_prefetch,
   bool can_vectorize,
   F f,
   RandomAccessIteratorOut out,
-  const InputT*... ins)
+  RandomAccessIteratorsIn... ins)
 {
   constexpr int block_dim        = VectorizedPolicy::block_threads;
   constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
@@ -240,9 +244,12 @@ _CCCL_DEVICE void transform_kernel_vectorized(
   constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
   using load_store_t            = decltype(load_store_type<load_store_size>());
   using output_t                = it_value_t<RandomAccessIteratorOut>;
-  using result_t                = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const InputT&...>>;
+  using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
   // picks output type size if there are no inputs
-  constexpr int element_size     = int{first_item(sizeof(InputT)..., size_of<output_t>)};
+  constexpr int element_size     = int{first_nonzero_value(
+    (sizeof(it_value_t<RandomAccessIteratorsIn>)
+     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
+    size_of<output_t>)};
   constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;
   static_assert((items_per_thread * element_size) % load_store_size == 0);
@@ -258,18 +265,35 @@ _CCCL_DEVICE void transform_kernel_vectorized(
   auto provide_array = [&](auto... inputs) {
     // load inputs
-    // TODO(bgruber): we could support fancy iterators for loading here as well (and only vectorize some inputs)
-    [[maybe_unused]] auto load_tile_vectorized = [&](auto* in, auto& input) {
-      auto in_vec    = reinterpret_cast<const load_store_t*>(in);
-      auto input_vec = reinterpret_cast<load_store_t*>(input.data());
-      _CCCL_PRAGMA_UNROLL_FULL()
-      for (int i = 0; i < load_store_count; ++i)
+    [[maybe_unused]] auto load_tile = [](auto in, auto& input) {
+      if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
       {
-        input_vec[i] = in_vec[i * VectorizedPolicy::block_threads + threadIdx.x];
+        auto in_vec    = reinterpret_cast<const load_store_t*>(in) + threadIdx.x;
+        auto input_vec = reinterpret_cast<load_store_t*>(input.data());
+        _CCCL_PRAGMA_UNROLL_FULL()
+        for (int i = 0; i < load_store_count; ++i)
+        {
+          input_vec[i] = in_vec[i * VectorizedPolicy::block_threads];
+        }
+      }
+      else
+      {
+        constexpr int elems = load_store_size / element_size;
+        in += threadIdx.x * elems;
+        _CCCL_PRAGMA_UNROLL_FULL()
+        for (int i = 0; i < load_store_count; ++i)
+        {
+          _CCCL_PRAGMA_UNROLL_FULL()
+          for (int j = 0; j < elems; ++j)
+          {
+            input[i * elems + j] = in[i * elems * VectorizedPolicy::block_threads + j];
+          }
+        }
       }
     };
     _CCCL_PDL_GRID_DEPENDENCY_SYNC();
-    (load_tile_vectorized(ins, inputs), ...);
+    (load_tile(ins, inputs), ...);
     // Benchmarks showed up to 38% slowdown on H200 (some improvements as well), so omitted. See #5249 for details.
     // _CCCL_PDL_TRIGGER_NEXT_LAUNCH();
@@ -280,7 +304,7 @@ _CCCL_DEVICE void transform_kernel_vectorized(
       output[i] = f(inputs[i]...);
     }
   };
-  provide_array(uninitialized_array<InputT, items_per_thread>{}...);
+  provide_array(uninitialized_array<it_value_t<RandomAccessIteratorsIn>, items_per_thread>{}...);
   // write output
   if constexpr (can_vectorize_store)

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace adjacent_difference
+namespace detail::adjacent_difference
 {
 template <typename InputIteratorT, bool MayAlias>
 struct policy_hub
@@ -64,7 +62,6 @@ struct policy_hub
   using MaxPolicy = Policy500;
 };
-} // namespace adjacent_difference
-} // namespace detail
+} // namespace detail::adjacent_difference
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace batch_memcpy
+namespace detail::batch_memcpy
 {
 /**
  * Parameterizable tuning policy type for AgentBatchMemcpy
@@ -115,7 +113,6 @@ struct policy_hub
   using MaxPolicy = Policy700;
 };
-} // namespace batch_memcpy
-} // namespace detail
+} // namespace detail::batch_memcpy
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh CHANGED Viewed

@@ -42,9 +42,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace for_each
+namespace detail::for_each
 {
 struct policy_hub_t
@@ -57,7 +55,6 @@ struct policy_hub_t
   using MaxPolicy = policy_500_t;
 };
-} // namespace for_each
-} // namespace detail
+} // namespace detail::for_each
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh CHANGED Viewed

@@ -46,9 +46,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace histogram
+namespace detail::histogram
 {
 enum class primitive_sample
 {
@@ -272,7 +270,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace histogram
-} // namespace detail
+} // namespace detail::histogram
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh CHANGED Viewed

@@ -42,9 +42,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace merge
+namespace detail::merge
 {
 template <typename KeyT, typename ValueT>
 struct policy_hub
@@ -73,7 +71,6 @@ struct policy_hub
   using max_policy = policy600;
 };
-} // namespace merge
-} // namespace detail
+} // namespace detail::merge
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh CHANGED Viewed

@@ -62,6 +62,14 @@ struct MergeSortPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<decltype(Static
   {}
   CUB_DEFINE_SUB_POLICY_GETTER(MergeSort);
+#if defined(CUB_ENABLE_POLICY_PTX_JSON)
+  _CCCL_DEVICE static constexpr auto EncodedPolicy()
+  {
+    using namespace ptx_json;
+    return object<key<"MergeSortPolicy">() = MergeSort().EncodedPolicy()>();
+  }
+#endif
 };
 template <typename PolicyT>

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh CHANGED Viewed

@@ -46,9 +46,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace radix
+namespace detail::radix
 {
 // sm90 default
 template <size_t KeySize, size_t ValueSize, size_t OffsetSize>
@@ -1062,7 +1060,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace radix
-} // namespace detail
+} // namespace detail::radix
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh CHANGED Viewed

@@ -50,9 +50,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace reduce_by_key
+namespace detail::reduce_by_key
 {
 enum class primitive_key
 {
@@ -939,7 +937,6 @@ struct policy_hub
   };
   using MaxPolicy = Policy1000;
 };
-} // namespace reduce_by_key
-} // namespace detail
+} // namespace detail::reduce_by_key
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh CHANGED Viewed

@@ -52,9 +52,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace rle
+namespace detail::rle
 {
 enum class primitive_key
 {
@@ -670,7 +668,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
 } // namespace non_trivial_runs
-} // namespace rle
-} // namespace detail
+} // namespace detail::rle
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh CHANGED Viewed

@@ -53,9 +53,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace scan
+namespace detail::scan
 {
 enum class keep_rejects
 {
@@ -615,7 +613,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace scan
-} // namespace detail
+} // namespace detail::scan
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh CHANGED Viewed

@@ -49,9 +49,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace scan_by_key
+namespace detail::scan_by_key
 {
 enum class primitive_accum
 {
@@ -1007,7 +1005,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace scan_by_key
-} // namespace detail
+} // namespace detail::scan_by_key
 CUB_NAMESPACE_END