PyPI - cuda-cccl - Versions diffs - 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (144)

cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh CHANGED Viewed

@@ -328,11 +328,6 @@ struct DispatchReduceDeterministic
     // Alias the allocation for the privatized per-block reductions
     deterministic_accum_t* d_block_reductions = (deterministic_accum_t*) allocations[0];
-    if (num_chunks > 1 && !detail::all_iterators_support_add_assign_operator(::cuda::std::int32_t{}, d_in))
-    {
-      return cudaErrorInvalidValue;
-    }
     auto d_chunk_block_reductions = d_block_reductions;
     for (int chunk_index = 0; chunk_index < num_chunks; chunk_index++)
     {
@@ -372,7 +367,7 @@ struct DispatchReduceDeterministic
       if (chunk_index + 1 < num_chunks)
       {
-        detail::advance_iterators_inplace_if_supported(d_in, num_current_items);
+        d_in += num_current_items;
         d_chunk_block_reductions += current_grid_size;
       }

cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh CHANGED Viewed

@@ -20,7 +20,6 @@
 #include <cub/detail/launcher/cuda_runtime.cuh>
 #include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
-#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
 #include <cub/device/dispatch/kernels/reduce.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
 #include <cub/grid/grid_even_share.cuh>

cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh CHANGED Viewed

@@ -40,7 +40,6 @@
 #include <cub/detail/device_double_buffer.cuh>
 #include <cub/detail/temporary_storage.cuh>
 #include <cub/device/device_partition.cuh>
-#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
 #include <cub/device/dispatch/kernels/segmented_sort.cuh>
 #include <cub/device/dispatch/tuning/tuning_segmented_sort.cuh>
 #include <cub/util_debug.cuh>
@@ -764,8 +763,8 @@ private:
       BeginOffsetIteratorT current_begin_offset = d_begin_offsets;
       EndOffsetIteratorT current_end_offset     = d_end_offsets;
-      detail::advance_iterators_inplace_if_supported(current_begin_offset, current_seg_offset);
-      detail::advance_iterators_inplace_if_supported(current_end_offset, current_seg_offset);
+      current_begin_offset += current_seg_offset;
+      current_end_offset += current_seg_offset;
       auto medium_indices_iterator =
         ::cuda::std::make_reverse_iterator(large_and_medium_segments_indices.get() + current_num_segments);

cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh CHANGED Viewed

@@ -47,9 +47,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace reduce
+namespace detail::reduce
 {
 /**
@@ -580,7 +578,6 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
   }
 }
-} // namespace reduce
-} // namespace detail
+} // namespace detail::reduce
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh CHANGED Viewed

@@ -42,9 +42,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace scan
+namespace detail::scan
 {
 /******************************************************************************
@@ -186,7 +184,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
   AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile);
 }
-} // namespace scan
-} // namespace detail
+} // namespace detail::scan
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace reduce
+namespace detail::reduce
 {
 /// Normalize input iterator to segment offset
@@ -318,7 +316,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   }
 }
-} // namespace reduce
-} // namespace detail
+} // namespace detail::reduce
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace adjacent_difference
+namespace detail::adjacent_difference
 {
 template <typename InputIteratorT, bool MayAlias>
 struct policy_hub
@@ -64,7 +62,6 @@ struct policy_hub
   using MaxPolicy = Policy500;
 };
-} // namespace adjacent_difference
-} // namespace detail
+} // namespace detail::adjacent_difference
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace batch_memcpy
+namespace detail::batch_memcpy
 {
 /**
  * Parameterizable tuning policy type for AgentBatchMemcpy
@@ -115,7 +113,6 @@ struct policy_hub
   using MaxPolicy = Policy700;
 };
-} // namespace batch_memcpy
-} // namespace detail
+} // namespace detail::batch_memcpy
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh CHANGED Viewed

@@ -42,9 +42,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace for_each
+namespace detail::for_each
 {
 struct policy_hub_t
@@ -57,7 +55,6 @@ struct policy_hub_t
   using MaxPolicy = policy_500_t;
 };
-} // namespace for_each
-} // namespace detail
+} // namespace detail::for_each
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh CHANGED Viewed

@@ -46,9 +46,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace histogram
+namespace detail::histogram
 {
 enum class primitive_sample
 {
@@ -272,7 +270,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace histogram
-} // namespace detail
+} // namespace detail::histogram
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh CHANGED Viewed

@@ -42,9 +42,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace merge
+namespace detail::merge
 {
 template <typename KeyT, typename ValueT>
 struct policy_hub
@@ -73,7 +71,6 @@ struct policy_hub
   using max_policy = policy600;
 };
-} // namespace merge
-} // namespace detail
+} // namespace detail::merge
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh CHANGED Viewed

@@ -62,6 +62,14 @@ struct MergeSortPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<decltype(Static
   {}
   CUB_DEFINE_SUB_POLICY_GETTER(MergeSort);
+#if defined(CUB_ENABLE_POLICY_PTX_JSON)
+  _CCCL_DEVICE static constexpr auto EncodedPolicy()
+  {
+    using namespace ptx_json;
+    return object<key<"MergeSortPolicy">() = MergeSort().EncodedPolicy()>();
+  }
+#endif
 };
 template <typename PolicyT>

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh CHANGED Viewed

@@ -46,9 +46,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace radix
+namespace detail::radix
 {
 // sm90 default
 template <size_t KeySize, size_t ValueSize, size_t OffsetSize>
@@ -1062,7 +1060,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace radix
-} // namespace detail
+} // namespace detail::radix
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh CHANGED Viewed

@@ -50,9 +50,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace reduce_by_key
+namespace detail::reduce_by_key
 {
 enum class primitive_key
 {
@@ -939,7 +937,6 @@ struct policy_hub
   };
   using MaxPolicy = Policy1000;
 };
-} // namespace reduce_by_key
-} // namespace detail
+} // namespace detail::reduce_by_key
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh CHANGED Viewed

@@ -52,9 +52,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace rle
+namespace detail::rle
 {
 enum class primitive_key
 {
@@ -670,7 +668,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
 } // namespace non_trivial_runs
-} // namespace rle
-} // namespace detail
+} // namespace detail::rle
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh CHANGED Viewed

@@ -53,9 +53,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace scan
+namespace detail::scan
 {
 enum class keep_rejects
 {
@@ -615,7 +613,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace scan
-} // namespace detail
+} // namespace detail::scan
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh CHANGED Viewed

@@ -49,9 +49,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace scan_by_key
+namespace detail::scan_by_key
 {
 enum class primitive_accum
 {
@@ -1007,7 +1005,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace scan_by_key
-} // namespace detail
+} // namespace detail::scan_by_key
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh CHANGED Viewed

@@ -43,9 +43,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace segmented_sort
+namespace detail::segmented_sort
 {
 template <typename PolicyT, typename = void>
@@ -395,7 +393,6 @@ struct policy_hub
   using MaxPolicy = Policy860;
 };
-} // namespace segmented_sort
-} // namespace detail
+} // namespace detail::segmented_sort
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh CHANGED Viewed

@@ -47,9 +47,7 @@
 CUB_NAMESPACE_BEGIN
-namespace detail
-{
-namespace three_way_partition
+namespace detail::three_way_partition
 {
 template <typename PolicyT, typename = void>
@@ -437,7 +435,6 @@ struct policy_hub
   using MaxPolicy = Policy1000;
 };
-} // namespace three_way_partition
-} // namespace detail
+} // namespace detail::three_way_partition
 CUB_NAMESPACE_END

cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh CHANGED Viewed

@@ -788,6 +788,16 @@ struct UniqueByKeyPolicyWrapper<StaticPolicyT,
   {
     return cub::detail::MakePolicyWrapper(typename StaticPolicyT::UniqueByKeyPolicyT());
   }
+#if defined(CUB_ENABLE_POLICY_PTX_JSON)
+  _CCCL_DEVICE static constexpr auto EncodedPolicy()
+  {
+    using namespace ptx_json;
+    return object<key<"UniqueByKeyPolicyT">() = UniqueByKey().EncodedPolicy(),
+                  key<"DelayConstructor">() =
+                    StaticPolicyT::UniqueByKeyPolicyT::detail::delay_constructor_t::EncodedConstructor()>();
+  }
+#endif
 };
 template <typename PolicyT>

cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh CHANGED Viewed

@@ -51,6 +51,7 @@
 #include <cuda/__functional/maximum.h>
 #include <cuda/__functional/minimum.h>
 #include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__bit/countr.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/integral_constant.h>
@@ -701,7 +702,7 @@ struct WarpReduceShfl
   _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
   {
     // Get the start flags for each thread in the warp.
-    int warp_flags = __ballot_sync(member_mask, flag);
+    unsigned warp_flags = __ballot_sync(member_mask, flag);
     // Convert to tail-segmented
     if (HEAD_SEGMENTED)
@@ -722,7 +723,7 @@ struct WarpReduceShfl
     warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
     // Find the next set flag
-    int last_lane = __clz(__brev(warp_flags));
+    int last_lane = ::cuda::std::countr_zero(warp_flags);
     T output = input;
     // Template-iterate reduction steps

cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh CHANGED Viewed

@@ -49,6 +49,7 @@
 #include <cub/util_type.cuh>
 #include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__bit/countr.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 CUB_NAMESPACE_BEGIN
@@ -215,7 +216,7 @@ struct WarpReduceSmem
   SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/)
   {
     // Get the start flags for each thread in the warp.
-    int warp_flags = __ballot_sync(member_mask, flag);
+    unsigned warp_flags = __ballot_sync(member_mask, flag);
     if (!HEAD_SEGMENTED)
     {
@@ -232,7 +233,7 @@ struct WarpReduceSmem
     }
     // Find next flag
-    int next_flag = __clz(__brev(warp_flags));
+    int next_flag = ::cuda::std::countr_zero(warp_flags);
     // Clip the next segment at the warp boundary if necessary
     if (LOGICAL_WARP_THREADS != 32)

cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh CHANGED Viewed

@@ -50,8 +50,8 @@
 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/std/__algorithm/clamp.h>
-#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/__bit/integral.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/is_integral.h>
@@ -630,7 +630,7 @@ struct WarpScanShfl
     ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();
     // Find index of first set bit
-    int segment_first_lane = ::cuda::std::max(0, 31 - __clz(ballot));
+    int segment_first_lane = ::cuda::std::__bit_log2(ballot);
     // Iterate scan steps
     _CCCL_PRAGMA_UNROLL_FULL()

cuda/cccl/headers/include/cuda/__algorithm/common.h CHANGED Viewed

@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_COMMON
 #define __CUDA___ALGORITHM_COMMON
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header

cuda/cccl/headers/include/cuda/__algorithm/copy.h CHANGED Viewed

@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_COPY_H
 #define __CUDA___ALGORITHM_COPY_H
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header

cuda/cccl/headers/include/cuda/__algorithm/fill.h CHANGED Viewed

@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_FILL
 #define __CUDA___ALGORITHM_FILL
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header