cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of cuda-cccl might be problematic.
Files changed (144)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  21. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  22. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  23. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  24. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  25. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  26. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  27. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  30. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  31. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  32. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  33. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  34. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  35. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  39. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  47. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  48. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  49. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  50. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  51. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  52. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  53. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  54. cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
  55. cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
  56. cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
  57. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  58. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  59. cuda/cccl/headers/include/cuda/__event/event.h +1 -0
  60. cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
  61. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  62. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  63. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  64. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  65. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  67. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
  68. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  69. cuda/cccl/headers/include/cuda/algorithm +1 -1
  70. cuda/cccl/headers/include/cuda/devices +10 -0
  71. cuda/cccl/headers/include/cuda/iterator +1 -0
  72. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  73. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  75. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  76. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  77. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  78. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  80. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  81. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  82. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  83. cuda/cccl/headers/include/cuda/std/version +1 -4
  84. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  85. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  86. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  87. cuda/cccl/parallel/experimental/__init__.py +21 -74
  88. cuda/compute/__init__.py +77 -0
  89. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
  90. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  91. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  92. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  93. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  94. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
  95. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  96. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  97. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  98. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  99. cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  100. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  101. cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  102. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  103. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  104. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  105. cuda/coop/__init__.py +8 -0
  106. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  107. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  108. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  109. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  110. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  111. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  112. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  113. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  114. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  115. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  116. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  117. cuda/coop/warp/__init__.py +9 -0
  118. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  119. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  120. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  121. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  122. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
  123. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  125. cuda/cccl/parallel/experimental/.gitignore +0 -4
  126. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  127. /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
  128. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  129. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  130. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  131. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  132. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  133. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  134. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  135. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  136. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  137. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  138. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  139. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  140. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  141. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  142. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  143. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  144. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cuda/std/__bit/countl.h

@@ -100,7 +100,14 @@ template <typename _Tp>
  template <typename _Tp>
  [[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept
  {
- return (sizeof(_Tp) == sizeof(uint32_t)) ? ::__clz(static_cast<int>(__v)) : ::__clzll(static_cast<long long>(__v));
+ if constexpr (sizeof(_Tp) == sizeof(uint32_t))
+ {
+ return static_cast<int>(::__clz(static_cast<int>(__v)));
+ }
+ else
+ {
+ return static_cast<int>(::__clzll(static_cast<long long>(__v)));
+ }
  }
  #endif // _CCCL_CUDA_COMPILATION()

cuda/cccl/headers/include/cuda/std/__bit/countr.h

@@ -114,11 +114,11 @@ template <typename _Tp>
  {
  if constexpr (sizeof(_Tp) == sizeof(uint32_t))
  {
- return ::__clz(static_cast<int>(::__brev(__v)));
+ return static_cast<int>(::__clz(static_cast<int>(::__brev(__v))));
  }
  else
  {
- return ::__clzll(static_cast<long long>(::__brevll(__v)));
+ return static_cast<int>(::__clzll(static_cast<long long>(::__brevll(__v))));
  }
  }
  #endif // _CCCL_CUDA_COMPILATION()
cuda/cccl/headers/include/cuda/std/__bit/reference.h

@@ -275,10 +275,10 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_aligned(
  // do first word
  if (__first.__ctz_ != 0)
  {
- unsigned __clz = __bits_per_word - __first.__ctz_;
- difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
+ unsigned __clz_f = __bits_per_word - __first.__ctz_;
+ difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
  __n -= __dn;
- __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
  __storage_type __b = *__first.__seg_ & __m;
  *__result.__seg_ &= ~__m;
  *__result.__seg_ |= __b;

@@ -420,8 +420,8 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_backward_aligned(
  {
  difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__last.__ctz_), __n);
  __n -= __dn;
- unsigned __clz = __bits_per_word - __last.__ctz_;
- __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
+ unsigned __clz_f = __bits_per_word - __last.__ctz_;
+ __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_f);
  __storage_type __b = *__last.__seg_ & __m;
  *__result.__seg_ &= ~__m;
  *__result.__seg_ |= __b;

@@ -635,10 +635,10 @@ _CCCL_API inline __bit_iterator<_Cr, false> __swap_ranges_aligned(
  // do first word
  if (__first.__ctz_ != 0)
  {
- unsigned __clz = __bits_per_word - __first.__ctz_;
- difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
+ unsigned __clz_f = __bits_per_word - __first.__ctz_;
+ difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
  __n -= __dn;
- __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
  __storage_type __b1 = *__first.__seg_ & __m;
  *__first.__seg_ &= ~__m;
  __storage_type __b2 = *__result.__seg_ & __m;

@@ -988,10 +988,10 @@ _CCCL_API constexpr bool __equal_aligned(
  // do first word
  if (__first1.__ctz_ != 0)
  {
- unsigned __clz = __bits_per_word - __first1.__ctz_;
- difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
+ unsigned __clz_f = __bits_per_word - __first1.__ctz_;
+ difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
  __n -= __dn;
- __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+ __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
  if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
  {
  return false;
cuda/cccl/headers/include/cuda/std/__chrono/duration.h

@@ -43,19 +43,19 @@ template <class _Rep, class _Period = ratio<1>>
  class _CCCL_TYPE_VISIBILITY_DEFAULT duration;

  template <class _Tp>
- inline const bool __is_duration_v = false;
+ inline constexpr bool __is_duration_v = false;

  template <class _Rep, class _Period>
- inline const bool __is_duration_v<duration<_Rep, _Period>> = true;
+ inline constexpr bool __is_duration_v<duration<_Rep, _Period>> = true;

  template <class _Rep, class _Period>
- inline const bool __is_duration_v<const duration<_Rep, _Period>> = true;
+ inline constexpr bool __is_duration_v<const duration<_Rep, _Period>> = true;

  template <class _Rep, class _Period>
- inline const bool __is_duration_v<volatile duration<_Rep, _Period>> = true;
+ inline constexpr bool __is_duration_v<volatile duration<_Rep, _Period>> = true;

  template <class _Rep, class _Period>
- inline const bool __is_duration_v<const volatile duration<_Rep, _Period>> = true;
+ inline constexpr bool __is_duration_v<const volatile duration<_Rep, _Period>> = true;

  } // namespace chrono

@@ -190,29 +190,29 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT duration
  struct __no_overflow
  {
  private:
- static const intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
- static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
- static const intmax_t __n1 = _R1::num / __gcd_n1_n2;
- static const intmax_t __d1 = _R1::den / __gcd_d1_d2;
- static const intmax_t __n2 = _R2::num / __gcd_n1_n2;
- static const intmax_t __d2 = _R2::den / __gcd_d1_d2;
- static const intmax_t max = -((intmax_t(1) << (sizeof(intmax_t) * CHAR_BIT - 1)) + 1);
+ static constexpr intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
+ static constexpr intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
+ static constexpr intmax_t __n1 = _R1::num / __gcd_n1_n2;
+ static constexpr intmax_t __d1 = _R1::den / __gcd_d1_d2;
+ static constexpr intmax_t __n2 = _R2::num / __gcd_n1_n2;
+ static constexpr intmax_t __d2 = _R2::den / __gcd_d1_d2;
+ static constexpr intmax_t max = -((intmax_t(1) << (sizeof(intmax_t) * CHAR_BIT - 1)) + 1);

  template <intmax_t _Xp, intmax_t _Yp, bool __overflow>
  struct __mul // __overflow == false
  {
- static const intmax_t value = _Xp * _Yp;
+ static constexpr intmax_t value = _Xp * _Yp;
  };

  template <intmax_t _Xp, intmax_t _Yp>
  struct __mul<_Xp, _Yp, true>
  {
- static const intmax_t value = 1;
+ static constexpr intmax_t value = 1;
  };

  public:
- static const bool value = (__n1 <= max / __d2) && (__n2 <= max / __d1);
- using type = ratio<__mul<__n1, __d2, !value>::value, __mul<__n2, __d1, !value>::value>;
+ static constexpr bool value = (__n1 <= max / __d2) && (__n2 <= max / __d1);
+ using type = ratio<__mul<__n1, __d2, !value>::value, __mul<__n2, __d1, !value>::value>;
  };

  public:
cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h

@@ -40,11 +40,11 @@ namespace chrono
  class _CCCL_TYPE_VISIBILITY_DEFAULT steady_clock
  {
  public:
- using duration = nanoseconds;
- using rep = duration::rep;
- using period = duration::period;
- using time_point = ::cuda::std::chrono::time_point<steady_clock, duration>;
- static constexpr const bool is_steady = true;
+ using duration = nanoseconds;
+ using rep = duration::rep;
+ using period = duration::period;
+ using time_point = ::cuda::std::chrono::time_point<steady_clock, duration>;
+ static constexpr bool is_steady = true;

  [[nodiscard]] _CCCL_API static time_point now() noexcept;
  };

cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h

@@ -39,11 +39,11 @@ namespace chrono
  class _CCCL_TYPE_VISIBILITY_DEFAULT system_clock
  {
  public:
- using duration = ::cuda::std::chrono::nanoseconds;
- using rep = duration::rep;
- using period = duration::period;
- using time_point = ::cuda::std::chrono::time_point<system_clock>;
- static constexpr const bool is_steady = false;
+ using duration = ::cuda::std::chrono::nanoseconds;
+ using rep = duration::rep;
+ using period = duration::period;
+ using time_point = ::cuda::std::chrono::time_point<system_clock>;
+ static constexpr bool is_steady = false;

  [[nodiscard]] _CCCL_API inline static time_point now() noexcept
  {
cuda/cccl/headers/include/cuda/std/__floating_point/fp.h

@@ -11,7 +11,7 @@
  #ifndef _CUDA_STD___FLOATING_POINT_FP_H
  #define _CUDA_STD___FLOATING_POINT_FP_H

- #include <cuda/__cccl_config>
+ #include <cuda/std/detail/__config>

  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
  # pragma GCC system_header
cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h

@@ -20,7 +20,9 @@
  # pragma system_header
  #endif // no system header

+ #include <cuda/__fwd/complex.h>
  #include <cuda/std/__fwd/array.h>
+ #include <cuda/std/__fwd/complex.h>
  #include <cuda/std/__fwd/tuple.h>
  #include <cuda/std/__tuple_dir/tuple_element.h>
  #include <cuda/std/__tuple_dir/tuple_indices.h>

@@ -61,7 +63,27 @@ struct __make_tuple_types_flat<array<_Vt, _Np>, __tuple_indices<_Idx...>>
  template <size_t>
  using __value_type = _Vt;
  template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
- using __apply_quals = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
+ using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
+ };
+
+ template <class _Vt, size_t... _Idx>
+ struct __make_tuple_types_flat<complex<_Vt>, __tuple_indices<_Idx...>>
+ {
+ static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
+ template <size_t>
+ using __value_type = _Vt;
+ template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
+ using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
+ };
+
+ template <class _Vt, size_t... _Idx>
+ struct __make_tuple_types_flat<::cuda::complex<_Vt>, __tuple_indices<_Idx...>>
+ {
+ static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
+ template <size_t>
+ using __value_type = _Vt;
+ template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
+ using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
  };

  template <class _Tp,
cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h

@@ -20,6 +20,7 @@
  # pragma system_header
  #endif // no system header

+ #include <cuda/__fwd/complex.h>
  #include <cuda/std/__concepts/concept_macros.h>
  #include <cuda/std/__fwd/array.h>
  #include <cuda/std/__fwd/complex.h>

@@ -58,6 +59,9 @@ inline constexpr bool __tuple_like_impl<array<_Tp, _Size>> = true;
  template <class _Tp>
  inline constexpr bool __tuple_like_impl<complex<_Tp>> = true;

+ template <class _Tp>
+ inline constexpr bool __tuple_like_impl<::cuda::complex<_Tp>> = true;
+
  template <class _Ip, class _Sp, ::cuda::std::ranges::subrange_kind _Kp>
  inline constexpr bool __tuple_like_impl<::cuda::std::ranges::subrange<_Ip, _Sp, _Kp>> = true;

cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h

@@ -20,6 +20,7 @@
  # pragma system_header
  #endif // no system header

+ #include <cuda/__fwd/complex.h>
  #include <cuda/std/__fwd/array.h>
  #include <cuda/std/__fwd/complex.h>
  #include <cuda/std/__fwd/pair.h>

@@ -54,6 +55,9 @@ inline constexpr bool __tuple_like_ext<array<_Tp, _Size>> = true;
  template <class _Tp>
  inline constexpr bool __tuple_like_ext<complex<_Tp>> = true;

+ template <class _Tp>
+ inline constexpr bool __tuple_like_ext<::cuda::complex<_Tp>> = true;
+
  template <class... _Tp>
  inline constexpr bool __tuple_like_ext<__tuple_types<_Tp...>> = true;

cuda/cccl/headers/include/cuda/std/string_view

@@ -57,7 +57,7 @@
  #include <cuda/std/version>

  #if !_CCCL_COMPILER(NVRTC)
- # include <iosfwd>
+ # include <string_view>
  #endif // !_CCCL_COMPILER(NVRTC)

  #include <cuda/std/__cccl/prologue.h>

@@ -727,14 +727,21 @@ _CCCL_HOST_DEVICE basic_string_view(_Range&&) -> basic_string_view<::cuda::std::

  // operator <<

- #if 0 // todo: we need to implement char_traits stream types & functions
+ #if !_CCCL_COMPILER(NVRTC)
+ template <class _CharT>
+ _CCCL_HOST_API ::std::basic_ostream<_CharT>&
+ operator<<(::std::basic_ostream<_CharT>& __os, basic_string_view<_CharT> __str)
+ {
+ return __os << ::std::basic_string_view<_CharT>{__str.data(), __str.size()};
+ }
+
  template <class _CharT, class _Traits>
- _CCCL_API inline ::std::basic_ostream<_CharT, _Traits>&
+ _CCCL_HOST_API ::std::basic_ostream<_CharT, _Traits>&
  operator<<(::std::basic_ostream<_CharT, _Traits>& __os, basic_string_view<_CharT, _Traits> __str)
  {
- return __os.write(__str.data(), static_cast<::std::streamsize>(__str.size()));
+ return __os << ::std::basic_string_view<_CharT, _Traits>{__str.data(), __str.size()};
  }
- #endif // 0
+ #endif // !_CCCL_COMPILER(NVRTC)

  // literals

cuda/cccl/headers/include/cuda/std/version

@@ -141,7 +141,7 @@
  // # define __cccl_lib_shared_mutex 201505L
  // # define __cccl_lib_shared_ptr_arrays 201611L
  // # define __cccl_lib_shared_ptr_weak_type 201606L
- // # define __cccl_lib_string_view 201606L
+ #define __cccl_lib_string_view 201803L
  // # define __cccl_lib_to_chars 201611L
  // # define __cccl_lib_uncaught_exceptions 201411L
  // # define __cccl_lib_unordered_map_try_emplace 201411L

@@ -171,7 +171,6 @@
  // # define __cccl_lib_constexpr_misc 201811L
  // # define __cccl_lib_constexpr_numeric 201911L
  // # define __cccl_lib_constexpr_string 201907L
- // # define __cccl_lib_constexpr_string_view 201811L
  // # define __cccl_lib_constexpr_swap_algorithms 201806L
  // # define __cccl_lib_constexpr_tuple 201811L
  // # define __cccl_lib_constexpr_utility 201811L

@@ -204,8 +203,6 @@
  // # define __cccl_lib_source_location 201907L
  // # define __cccl_lib_ssize 201902L
  // # define __cccl_lib_starts_ends_with 201711L
- // # undef __cccl_lib_string_view
- // # define __cccl_lib_string_view 201803L
  // # define __cccl_lib_syncbuf 201803L
  // # define __cccl_lib_three_way_comparison 201907L
  # define __cccl_lib_unwrap_ref 201811L
cuda/cccl/headers/include/thrust/detail/integer_math.h

@@ -27,6 +27,8 @@
  #endif // no system header
  #include <thrust/detail/type_deduction.h>

+ #include <cuda/std/__bit/countl.h>
+ #include <cuda/std/__type_traits/make_unsigned.h>
  #include <cuda/std/limits>
  #include <cuda/std/type_traits>

@@ -36,25 +38,6 @@ THRUST_NAMESPACE_BEGIN
  namespace detail
  {

- template <typename Integer>
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer clz(Integer x)
- {
- Integer result;
-
- NV_IF_TARGET(NV_IS_DEVICE,
- (result = ::__clz(x);),
- (int num_bits = 8 * sizeof(Integer); int num_bits_minus_one = num_bits - 1; result = num_bits;
- for (int i = num_bits_minus_one; i >= 0; --i) {
- if ((Integer(1) << i) & x)
- {
- result = num_bits_minus_one - i;
- break;
- }
- }));
-
- return result;
- }
-
  template <typename Integer>
  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x)
  {

@@ -85,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer log2(Integer x)
  Integer num_bits = 8 * sizeof(Integer);
  Integer num_bits_minus_one = num_bits - 1;

- return num_bits_minus_one - clz(x);
+ return num_bits_minus_one - ::cuda::std::countl_zero(::cuda::std::__to_unsigned_like(x));
  }

  template <typename Integer>
cuda/cccl/headers/include/thrust/iterator/iterator_traits.h

@@ -316,6 +316,17 @@ struct iterator_traversal<::cuda::zip_iterator<Iterators...>>
  using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
  };

+ template <class Fn, class... Iterators>
+ struct iterator_system<::cuda::zip_transform_iterator<Fn, Iterators...>>
+ {
+ using type = detail::minimum_system_t<iterator_system_t<Iterators>...>;
+ };
+ template <class Fn, class... Iterators>
+ struct iterator_traversal<::cuda::zip_transform_iterator<Fn, Iterators...>>
+ {
+ using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
+ };
+
  //! \} // end iterator_traits

  THRUST_NAMESPACE_END
cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h

@@ -48,6 +48,13 @@
  #include <thrust/system/cuda/detail/util.h>
  #include <thrust/type_traits/is_trivially_relocatable.h>

+ #if _CCCL_HAS_CUDA_COMPILER()
+ # include <cub/device/dispatch/tuning/tuning_transform.cuh>
+ #endif // _CCCL_HAS_CUDA_COMPILER()
+
+ #include <cuda/__fwd/zip_iterator.h>
+ #include <cuda/std/tuple>
+
  THRUST_NAMESPACE_BEGIN
  namespace cuda_cub
  {

@@ -61,6 +68,21 @@ template <class Derived, class InputIt, class OutputIt, class TransformOp>
  OutputIt _CCCL_API _CCCL_FORCEINLINE
  transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);

+ // Forward declare to work around a cyclic include, since "cuda/detail/transform.h" includes this header
+ // We want this to unwrap zip_transform_iterator
+ namespace __transform
+ {
+ _CCCL_EXEC_CHECK_DISABLE
+ template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp, class Predicate>
+ OutputIt _CCCL_API _CCCL_FORCEINLINE cub_transform_many(
+ execution_policy<Derived>& policy,
+ ::cuda::std::tuple<InputIts...> firsts,
+ OutputIt result,
+ Offset num_items,
+ TransformOp transform_op,
+ Predicate pred);
+ } // namespace __transform
+
  namespace __copy
  {
  template <class H, class D, class T, class Size>

@@ -190,6 +212,17 @@ device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last,

  return result + n;
  }
+ else if constexpr (::cuda::__is_zip_transform_iterator<InputIt>)
+ {
+ const auto n = ::cuda::std::distance(first, last);
+ return cuda_cub::__transform::cub_transform_many(
+ policy,
+ ::cuda::std::move(first).__base(),
+ result,
+ n,
+ ::cuda::std::move(first).__pred(),
+ cub::detail::transform::always_true_predicate{});
+ }
  else
  {
  return cuda_cub::transform(
cuda/cccl/parallel/experimental/__init__.py

@@ -1,77 +1,24 @@
- # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+ # Copyright (c) 2025, NVIDIA CORPORATION.
  #
- # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.

- from .algorithms import (
- DoubleBuffer,
- SortOrder,
- binary_transform,
- exclusive_scan,
- histogram_even,
- inclusive_scan,
- make_binary_transform,
- make_exclusive_scan,
- make_histogram_even,
- make_inclusive_scan,
- make_merge_sort,
- make_radix_sort,
- make_reduce_into,
- make_segmented_reduce,
- make_three_way_partition,
- make_unary_transform,
- make_unique_by_key,
- merge_sort,
- radix_sort,
- reduce_into,
- segmented_reduce,
- three_way_partition,
- unary_transform,
- unique_by_key,
- )
- from .iterators import (
- CacheModifiedInputIterator,
- ConstantIterator,
- CountingIterator,
- ReverseIterator,
- TransformIterator,
- TransformOutputIterator,
- ZipIterator,
- )
- from .op import OpKind
- from .struct import gpu_struct
+ # alias for backwards compatibility

- __all__ = [
- "binary_transform",
- "CacheModifiedInputIterator",
- "ConstantIterator",
- "CountingIterator",
- "DoubleBuffer",
- "exclusive_scan",
- "gpu_struct",
- "histogram_even",
- "inclusive_scan",
- "make_binary_transform",
- "make_exclusive_scan",
- "make_histogram_even",
- "make_inclusive_scan",
- "make_merge_sort",
- "make_radix_sort",
- "make_reduce_into",
- "make_segmented_reduce",
- "make_three_way_partition",
- "make_unary_transform",
- "make_unique_by_key",
- "merge_sort",
- "OpKind",
- "radix_sort",
- "reduce_into",
- "ReverseIterator",
- "segmented_reduce",
- "SortOrder",
- "TransformIterator",
- "three_way_partition",
- "TransformOutputIterator",
- "unary_transform",
- "unique_by_key",
- "ZipIterator",
- ]
+ from warnings import warn
+
+ from cuda.compute import * # noqa: F403
+
+ warn(
+ "The module cuda.cccl.parallel.experimental is deprecated. Use cuda.compute instead.",
+ FutureWarning,
+ )
cuda/compute/__init__.py (new file)

@@ -0,0 +1,77 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+ #
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+ from .algorithms import (
+ DoubleBuffer,
+ SortOrder,
+ binary_transform,
+ exclusive_scan,
+ histogram_even,
+ inclusive_scan,
+ make_binary_transform,
+ make_exclusive_scan,
+ make_histogram_even,
+ make_inclusive_scan,
+ make_merge_sort,
+ make_radix_sort,
+ make_reduce_into,
+ make_segmented_reduce,
+ make_three_way_partition,
+ make_unary_transform,
+ make_unique_by_key,
+ merge_sort,
+ radix_sort,
+ reduce_into,
+ segmented_reduce,
+ three_way_partition,
+ unary_transform,
+ unique_by_key,
+ )
+ from .iterators import (
+ CacheModifiedInputIterator,
+ ConstantIterator,
+ CountingIterator,
+ ReverseIterator,
+ TransformIterator,
+ TransformOutputIterator,
+ ZipIterator,
+ )
+ from .op import OpKind
+ from .struct import gpu_struct
+
+ __all__ = [
+ "binary_transform",
+ "CacheModifiedInputIterator",
+ "ConstantIterator",
+ "CountingIterator",
+ "DoubleBuffer",
+ "exclusive_scan",
+ "gpu_struct",
+ "histogram_even",
+ "inclusive_scan",
+ "make_binary_transform",
+ "make_exclusive_scan",
+ "make_histogram_even",
+ "make_inclusive_scan",
+ "make_merge_sort",
+ "make_radix_sort",
+ "make_reduce_into",
+ "make_segmented_reduce",
+ "make_three_way_partition",
+ "make_unary_transform",
+ "make_unique_by_key",
+ "merge_sort",
+ "OpKind",
+ "radix_sort",
+ "reduce_into",
+ "ReverseIterator",
+ "segmented_reduce",
+ "SortOrder",
+ "TransformIterator",
+ "TransformOutputIterator",
+ "three_way_partition",
+ "unary_transform",
+ "unique_by_key",
+ "ZipIterator",
+ ]
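
Since the deprecated module re-exports everything the new package lists in __all__, the equivalence of the two import paths can be checked directly. A small sketch, assuming both modules are importable in the current environment:

    # Sketch: verify that every public name of cuda.compute is also reachable
    # through the deprecated cuda.cccl.parallel.experimental shim.
    import warnings

    import cuda.compute as compute

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)  # silence the deprecation warning
        import cuda.cccl.parallel.experimental as legacy

    missing = [name for name in compute.__all__ if not hasattr(legacy, name)]
    assert not missing, f"names missing from the shim: {missing}"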
cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx

@@ -4,7 +4,7 @@

  # Python signatures are declared in the companion Python stub file _bindings.pyi
  # Make sure to update PYI with change to Python API to ensure that Python
- # static type checker tools like mypy green-lights cuda.cccl.parallel
+ # static type checker tools like mypy green-lights cuda.compute

  from libc.string cimport memset, memcpy
  from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py

@@ -148,7 +148,7 @@ def make_histogram_even(
  Example:
  Below, ``make_histogram_even`` is used to create a histogram object that can be reused.

- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_object.py
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_object.py
  :language: python
  :start-after: # example-begin

@@ -190,7 +190,7 @@ def histogram_even(
  Example:
  Below, ``histogram_even`` is used to compute a histogram with evenly-spaced bins.

- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_even_basic.py
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_even_basic.py
  :language: python
  :start-after: # example-begin
  :caption: Basic histogram example.