PyPI - cuda-cccl - Versions diffs - 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (177) hide show

cuda/cccl/headers/include/cuda/std/__floating_point/cast.h CHANGED Viewed

@@ -76,25 +76,25 @@ template <class _To, class _From>
 #if _CCCL_HAS_NVFP8_E8M0()
     else if constexpr (is_same_v<_To, __nv_fp8_e8m0>)
     {
-      return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_float_to_e8m0(__v, __NV_NOSAT, cudaRoundZero));
+      return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_float_to_e8m0(__v, __NV_NOSAT, ::cudaRoundZero));
     }
 #endif // _CCCL_HAS_NVFP8_E8M0()
 #if _CCCL_HAS_NVFP6_E2M3()
     else if constexpr (is_same_v<_To, __nv_fp6_e2m3>)
     {
-      return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(::__nv_cvt_float_to_fp6(__v, __NV_E2M3, cudaRoundNearest));
+      return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(::__nv_cvt_float_to_fp6(__v, __NV_E2M3, ::cudaRoundNearest));
     }
 #endif // _CCCL_HAS_NVFP6_E2M3()
 #if _CCCL_HAS_NVFP6_E3M2()
     else if constexpr (is_same_v<_To, __nv_fp6_e3m2>)
     {
-      return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(::__nv_cvt_float_to_fp6(__v, __NV_E3M2, cudaRoundNearest));
+      return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(::__nv_cvt_float_to_fp6(__v, __NV_E3M2, ::cudaRoundNearest));
     }
 #endif // _CCCL_HAS_NVFP6_E3M2()
 #if _CCCL_HAS_NVFP4_E2M1()
     else if constexpr (is_same_v<_To, __nv_fp4_e2m1>)
     {
-      return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(::__nv_cvt_float_to_fp4(__v, __NV_E2M1, cudaRoundNearest));
+      return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(::__nv_cvt_float_to_fp4(__v, __NV_E2M1, ::cudaRoundNearest));
     }
 #endif // _CCCL_HAS_NVFP4_E2M1()
     else
@@ -145,25 +145,28 @@ template <class _To, class _From>
 #if _CCCL_HAS_NVFP8_E8M0()
     else if constexpr (is_same_v<_To, __nv_fp8_e8m0>)
     {
-      return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_double_to_e8m0(__v, __NV_NOSAT, cudaRoundZero));
+      return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_double_to_e8m0(__v, __NV_NOSAT, ::cudaRoundZero));
     }
 #endif // _CCCL_HAS_NVFP8_E8M0()
 #if _CCCL_HAS_NVFP6_E2M3()
     else if constexpr (is_same_v<_To, __nv_fp6_e2m3>)
     {
-      return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(::__nv_cvt_double_to_fp6(__v, __NV_E2M3, cudaRoundNearest));
+      return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(
+        ::__nv_cvt_double_to_fp6(__v, __NV_E2M3, ::cudaRoundNearest));
     }
 #endif // _CCCL_HAS_NVFP6_E2M3()
 #if _CCCL_HAS_NVFP6_E3M2()
     else if constexpr (is_same_v<_To, __nv_fp6_e3m2>)
     {
-      return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(::__nv_cvt_double_to_fp6(__v, __NV_E3M2, cudaRoundNearest));
+      return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(
+        ::__nv_cvt_double_to_fp6(__v, __NV_E3M2, ::cudaRoundNearest));
     }
 #endif // _CCCL_HAS_NVFP6_E3M2()
 #if _CCCL_HAS_NVFP4_E2M1()
     else if constexpr (is_same_v<_To, __nv_fp4_e2m1>)
     {
-      return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(::__nv_cvt_double_to_fp4(__v, __NV_E2M1, cudaRoundNearest));
+      return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(
+        ::__nv_cvt_double_to_fp4(__v, __NV_E2M1, ::cudaRoundNearest));
     }
 #endif // _CCCL_HAS_NVFP4_E2M1()
     else
@@ -352,28 +355,28 @@ template <class _To, class _From>
     else if constexpr (is_same_v<_To, __nv_fp8_e8m0>)
     {
       return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(
-        ::__nv_cvt_bfloat16raw_to_e8m0(__v, __NV_NOSAT, cudaRoundZero));
+        ::__nv_cvt_bfloat16raw_to_e8m0(__v, __NV_NOSAT, ::cudaRoundZero));
     }
 #  endif // _CCCL_HAS_NVFP8_E8M0()
 #  if _CCCL_HAS_NVFP6_E2M3()
     else if constexpr (is_same_v<_To, __nv_fp6_e2m3>)
     {
       return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(
-        ::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E2M3, cudaRoundNearest));
+        ::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E2M3, ::cudaRoundNearest));
     }
 #  endif // _CCCL_HAS_NVFP6_E2M3()
 #  if _CCCL_HAS_NVFP6_E3M2()
     else if constexpr (is_same_v<_To, __nv_fp6_e3m2>)
     {
       return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(
-        ::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E3M2, cudaRoundNearest));
+        ::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E3M2, ::cudaRoundNearest));
     }
 #  endif // _CCCL_HAS_NVFP6_E3M2()
 #  if _CCCL_HAS_NVFP4_E2M1()
     else if constexpr (is_same_v<_To, __nv_fp4_e2m1>)
     {
       return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(
-        ::__nv_cvt_bfloat16raw_to_fp4(__v, __NV_E2M1, cudaRoundNearest));
+        ::__nv_cvt_bfloat16raw_to_fp4(__v, __NV_E2M1, ::cudaRoundNearest));
     }
 #  endif // _CCCL_HAS_NVFP4_E2M1()
     else

cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h CHANGED Viewed

@@ -55,6 +55,9 @@ _CCCL_DIAG_SUPPRESS_MSVC(4100) // unreferenced formal parameter
 _CCCL_DIAG_POP
 #endif // _CCCL_HAS_NVFP4()
+// crt/device_fp128_functions.h is available in CUDA 12.8+.
+// _CCCL_HAS_FLOAT128() checks the *compiler* compatibility with __float128.
+// We also need to check the toolkit version to ensure the compatibility with nvc++.
 #if _CCCL_HAS_FLOAT128() && _CCCL_DEVICE_COMPILATION() && _CCCL_CTK_AT_LEAST(12, 8)
 #  if !_CCCL_COMPILER(NVRTC)
 _CCCL_DIAG_PUSH

cuda/cccl/headers/include/cuda/std/__floating_point/fp.h CHANGED Viewed

@@ -11,7 +11,7 @@
 #ifndef _CUDA_STD___FLOATING_POINT_FP_H
 #define _CUDA_STD___FLOATING_POINT_FP_H
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header

cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h CHANGED Viewed

@@ -439,7 +439,8 @@ public:
   [[nodiscard]] _CCCL_API constexpr bool is_exhaustive() const
     noexcept(noexcept(::cuda::std::declval<const mapping_type&>().is_exhaustive()))
   {
-    return mapping().is_exhaustive();
+    auto __tmp = mapping(); // workaround for clang with nodiscard
+    return __tmp.is_exhaustive();
   }
   [[nodiscard]] _CCCL_API constexpr bool is_strided() const
     noexcept(noexcept(::cuda::std::declval<const mapping_type&>().is_strided()))

cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h CHANGED Viewed

@@ -20,7 +20,9 @@
 #  pragma system_header
 #endif // no system header
+#include <cuda/__fwd/complex.h>
 #include <cuda/std/__fwd/array.h>
+#include <cuda/std/__fwd/complex.h>
 #include <cuda/std/__fwd/tuple.h>
 #include <cuda/std/__tuple_dir/tuple_element.h>
 #include <cuda/std/__tuple_dir/tuple_indices.h>
@@ -61,7 +63,27 @@ struct __make_tuple_types_flat<array<_Vt, _Np>, __tuple_indices<_Idx...>>
   template <size_t>
   using __value_type = _Vt;
   template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
-  using __apply_quals = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
+  using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
+};
+template <class _Vt, size_t... _Idx>
+struct __make_tuple_types_flat<complex<_Vt>, __tuple_indices<_Idx...>>
+{
+  static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
+  template <size_t>
+  using __value_type = _Vt;
+  template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
+  using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
+};
+template <class _Vt, size_t... _Idx>
+struct __make_tuple_types_flat<::cuda::complex<_Vt>, __tuple_indices<_Idx...>>
+{
+  static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
+  template <size_t>
+  using __value_type = _Vt;
+  template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
+  using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
 };
 template <class _Tp,

cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h CHANGED Viewed

@@ -20,6 +20,7 @@
 #  pragma system_header
 #endif // no system header
+#include <cuda/__fwd/complex.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__fwd/array.h>
 #include <cuda/std/__fwd/complex.h>
@@ -58,6 +59,9 @@ inline constexpr bool __tuple_like_impl<array<_Tp, _Size>> = true;
 template <class _Tp>
 inline constexpr bool __tuple_like_impl<complex<_Tp>> = true;
+template <class _Tp>
+inline constexpr bool __tuple_like_impl<::cuda::complex<_Tp>> = true;
 template <class _Ip, class _Sp, ::cuda::std::ranges::subrange_kind _Kp>
 inline constexpr bool __tuple_like_impl<::cuda::std::ranges::subrange<_Ip, _Sp, _Kp>> = true;

cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h CHANGED Viewed

@@ -20,6 +20,7 @@
 #  pragma system_header
 #endif // no system header
+#include <cuda/__fwd/complex.h>
 #include <cuda/std/__fwd/array.h>
 #include <cuda/std/__fwd/complex.h>
 #include <cuda/std/__fwd/pair.h>
@@ -54,6 +55,9 @@ inline constexpr bool __tuple_like_ext<array<_Tp, _Size>> = true;
 template <class _Tp>
 inline constexpr bool __tuple_like_ext<complex<_Tp>> = true;
+template <class _Tp>
+inline constexpr bool __tuple_like_ext<::cuda::complex<_Tp>> = true;
 template <class... _Tp>
 inline constexpr bool __tuple_like_ext<__tuple_types<_Tp...>> = true;

cuda/cccl/headers/include/cuda/std/__type_traits/promote.h CHANGED Viewed

@@ -20,10 +20,8 @@
 #  pragma system_header
 #endif // no system header
-#include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/__utility/declval.h>
-#include <cuda/std/cstddef>
 #include <cuda/std/__cccl/prologue.h>
@@ -49,6 +47,9 @@ struct __numeric_type
   _CCCL_API inline static double __test(unsigned long long);
   _CCCL_API inline static double __test(double);
   _CCCL_API inline static long double __test(long double);
+#if _CCCL_HAS_FLOAT128()
+  _CCCL_API inline static __float128 __test(__float128);
+#endif // _CCCL_HAS_FLOAT128()
   using type              = decltype(__test(declval<_Tp>()));
   static const bool value = !is_same_v<type, void>;

cuda/cccl/headers/include/cuda/std/string_view CHANGED Viewed

@@ -57,7 +57,7 @@
 #include <cuda/std/version>
 #if !_CCCL_COMPILER(NVRTC)
-#  include <iosfwd>
+#  include <string_view>
 #endif // !_CCCL_COMPILER(NVRTC)
 #include <cuda/std/__cccl/prologue.h>
@@ -727,14 +727,21 @@ _CCCL_HOST_DEVICE basic_string_view(_Range&&) -> basic_string_view<::cuda::std::
 // operator <<
-#if 0 // todo: we need to implement char_traits stream types & functions
+#if !_CCCL_COMPILER(NVRTC)
+template <class _CharT>
+_CCCL_HOST_API ::std::basic_ostream<_CharT>&
+operator<<(::std::basic_ostream<_CharT>& __os, basic_string_view<_CharT> __str)
+{
+  return __os << ::std::basic_string_view<_CharT>{__str.data(), __str.size()};
+}
 template <class _CharT, class _Traits>
-_CCCL_API inline ::std::basic_ostream<_CharT, _Traits>&
+_CCCL_HOST_API ::std::basic_ostream<_CharT, _Traits>&
 operator<<(::std::basic_ostream<_CharT, _Traits>& __os, basic_string_view<_CharT, _Traits> __str)
 {
-  return __os.write(__str.data(), static_cast<::std::streamsize>(__str.size()));
+  return __os << ::std::basic_string_view<_CharT, _Traits>{__str.data(), __str.size()};
 }
-#endif // 0
+#endif // !_CCCL_COMPILER(NVRTC)
 // literals

cuda/cccl/headers/include/cuda/std/version CHANGED Viewed

@@ -141,7 +141,7 @@
 // #   define __cccl_lib_shared_mutex                       201505L
 // # define __cccl_lib_shared_ptr_arrays                    201611L
 // # define __cccl_lib_shared_ptr_weak_type                 201606L
-// # define __cccl_lib_string_view                          201606L
+#define __cccl_lib_string_view 201803L
 // # define __cccl_lib_to_chars                             201611L
 // #  define __cccl_lib_uncaught_exceptions           201411L
 // #  define __cccl_lib_unordered_map_try_emplace     201411L
@@ -171,7 +171,6 @@
 // # define __cccl_lib_constexpr_misc                       201811L
 // # define __cccl_lib_constexpr_numeric                    201911L
 // # define __cccl_lib_constexpr_string                     201907L
-// # define __cccl_lib_constexpr_string_view                201811L
 // # define __cccl_lib_constexpr_swap_algorithms            201806L
 // # define __cccl_lib_constexpr_tuple                      201811L
 // # define __cccl_lib_constexpr_utility                    201811L
@@ -204,8 +203,6 @@
 // # define __cccl_lib_source_location                      201907L
 // # define __cccl_lib_ssize                                201902L
 // # define __cccl_lib_starts_ends_with                     201711L
-// # undef  __cccl_lib_string_view
-// # define __cccl_lib_string_view                          201803L
 // # define __cccl_lib_syncbuf                              201803L
 // # define __cccl_lib_three_way_comparison                 201907L
 #  define __cccl_lib_unwrap_ref 201811L

cuda/cccl/headers/include/thrust/detail/integer_math.h CHANGED Viewed

@@ -27,6 +27,8 @@
 #endif // no system header
 #include <thrust/detail/type_deduction.h>
+#include <cuda/std/__bit/countl.h>
+#include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
@@ -36,25 +38,6 @@ THRUST_NAMESPACE_BEGIN
 namespace detail
 {
-template <typename Integer>
-_CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer clz(Integer x)
-{
-  Integer result;
-  NV_IF_TARGET(NV_IS_DEVICE,
-               (result = ::__clz(x);),
-               (int num_bits = 8 * sizeof(Integer); int num_bits_minus_one = num_bits - 1; result = num_bits;
-                for (int i = num_bits_minus_one; i >= 0; --i) {
-                  if ((Integer(1) << i) & x)
-                  {
-                    result = num_bits_minus_one - i;
-                    break;
-                  }
-                }));
-  return result;
-}
 template <typename Integer>
 _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x)
 {
@@ -85,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer log2(Integer x)
   Integer num_bits           = 8 * sizeof(Integer);
   Integer num_bits_minus_one = num_bits - 1;
-  return num_bits_minus_one - clz(x);
+  return num_bits_minus_one - ::cuda::std::countl_zero(::cuda::std::__to_unsigned_like(x));
 }
 template <typename Integer>

cuda/cccl/headers/include/thrust/iterator/iterator_traits.h CHANGED Viewed

@@ -316,6 +316,17 @@ struct iterator_traversal<::cuda::zip_iterator<Iterators...>>
   using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
 };
+template <class Fn, class... Iterators>
+struct iterator_system<::cuda::zip_transform_iterator<Fn, Iterators...>>
+{
+  using type = detail::minimum_system_t<iterator_system_t<Iterators>...>;
+};
+template <class Fn, class... Iterators>
+struct iterator_traversal<::cuda::zip_transform_iterator<Fn, Iterators...>>
+{
+  using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
+};
 //! \} // end iterator_traits
 THRUST_NAMESPACE_END

cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h CHANGED Viewed

@@ -48,6 +48,13 @@
 #include <thrust/system/cuda/detail/util.h>
 #include <thrust/type_traits/is_trivially_relocatable.h>
+#if _CCCL_HAS_CUDA_COMPILER()
+#  include <cub/device/dispatch/tuning/tuning_transform.cuh>
+#endif // _CCCL_HAS_CUDA_COMPILER()
+#include <cuda/__fwd/zip_iterator.h>
+#include <cuda/std/tuple>
 THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
@@ -61,6 +68,21 @@ template <class Derived, class InputIt, class OutputIt, class TransformOp>
 OutputIt _CCCL_API _CCCL_FORCEINLINE
 transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);
+// Forward declare to work around a cyclic include, since "cuda/detail/transform.h" includes this header
+// We want this to unwrap zip_transform_iterator
+namespace __transform
+{
+_CCCL_EXEC_CHECK_DISABLE
+template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp, class Predicate>
+OutputIt _CCCL_API _CCCL_FORCEINLINE cub_transform_many(
+  execution_policy<Derived>& policy,
+  ::cuda::std::tuple<InputIts...> firsts,
+  OutputIt result,
+  Offset num_items,
+  TransformOp transform_op,
+  Predicate pred);
+} // namespace __transform
 namespace __copy
 {
 template <class H, class D, class T, class Size>
@@ -190,6 +212,17 @@ device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last,
     return result + n;
   }
+  else if constexpr (::cuda::__is_zip_transform_iterator<InputIt>)
+  {
+    const auto n = ::cuda::std::distance(first, last);
+    return cuda_cub::__transform::cub_transform_many(
+      policy,
+      ::cuda::std::move(first).__base(),
+      result,
+      n,
+      ::cuda::std::move(first).__pred(),
+      cub::detail::transform::always_true_predicate{});
+  }
   else
   {
     return cuda_cub::transform(

cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h CHANGED Viewed

@@ -39,37 +39,23 @@
 #if _CCCL_HAS_CUDA_COMPILER()
 #  include <thrust/system/cuda/config.h>
-#  include <thrust/distance.h>
-#  include <thrust/system/cuda/detail/parallel_for.h>
+#  include <thrust/system/cuda/detail/transform.h>
 #  include <thrust/system/cuda/execution_policy.h>
+#  include <cuda/__functional/address_stability.h>
+#  include <cuda/std/iterator>
 THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
-namespace __tabulate
-{
-template <class Iterator, class TabulateOp>
-struct functor
-{
-  Iterator items;
-  TabulateOp op;
-  template <typename Size>
-  void _CCCL_DEVICE operator()(Size idx)
-  {
-    items[idx] = op(idx);
-  }
-};
-} // namespace __tabulate
 template <class Derived, class Iterator, class TabulateOp>
 void _CCCL_HOST_DEVICE tabulate(execution_policy<Derived>& policy, Iterator first, Iterator last, TabulateOp tabulate_op)
 {
-  using size_type = thrust::detail::it_difference_t<Iterator>;
-  size_type count = ::cuda::std::distance(first, last);
-  cuda_cub::parallel_for(policy, __tabulate::functor<Iterator, TabulateOp>{first, tabulate_op}, count);
+  using size_type  = ::cuda::std::iter_difference_t<Iterator>;
+  const auto count = ::cuda::std::distance(first, last);
+  cuda_cub::transform_n(
+    policy, ::cuda::counting_iterator<size_type>{}, count, first, ::cuda::proclaim_copyable_arguments(tabulate_op));
 }
 } // namespace cuda_cub
 THRUST_NAMESPACE_END
 #endif

cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h CHANGED Viewed

@@ -25,72 +25,39 @@
 THRUST_NAMESPACE_BEGIN
-namespace detail
-{
-// Type traits for contiguous iterators:
-template <typename Iterator>
-struct contiguous_iterator_traits
-{
-  static_assert(thrust::is_contiguous_iterator_v<Iterator>,
-                "contiguous_iterator_traits requires a contiguous iterator.");
-  using raw_pointer =
-    typename thrust::detail::pointer_traits<decltype(&*::cuda::std::declval<Iterator>())>::raw_pointer;
-};
-} // namespace detail
-//! Converts a contiguous iterator type to its underlying raw pointer type.
-template <typename ContiguousIterator>
-using unwrap_contiguous_iterator_t = typename detail::contiguous_iterator_traits<ContiguousIterator>::raw_pointer;
 //! Converts a contiguous iterator to its underlying raw pointer.
+_CCCL_EXEC_CHECK_DISABLE
 template <typename ContiguousIterator>
 _CCCL_HOST_DEVICE auto unwrap_contiguous_iterator(ContiguousIterator it)
-  -> unwrap_contiguous_iterator_t<ContiguousIterator>
 {
   static_assert(thrust::is_contiguous_iterator_v<ContiguousIterator>,
                 "unwrap_contiguous_iterator called with non-contiguous iterator.");
   return thrust::raw_pointer_cast(&*it);
 }
-namespace detail
-{
-// Implementation for non-contiguous iterators -- passthrough.
-template <typename Iterator, bool IsContiguous = thrust::is_contiguous_iterator_v<Iterator>>
-struct try_unwrap_contiguous_iterator_impl
-{
-  using type = Iterator;
-  static _CCCL_HOST_DEVICE type get(Iterator it)
-  {
-    return it;
-  }
-};
+//! Converts a contiguous iterator type to its underlying raw pointer type.
+template <typename ContiguousIterator>
+using unwrap_contiguous_iterator_t = decltype(unwrap_contiguous_iterator(::cuda::std::declval<ContiguousIterator>()));
-// Implementation for contiguous iterators -- unwraps to raw pointer.
+//! Takes an iterator and, if it is contiguous, unwraps it to the raw pointer it represents. Otherwise returns the
+//! iterator unmodified.
+_CCCL_EXEC_CHECK_DISABLE
 template <typename Iterator>
-struct try_unwrap_contiguous_iterator_impl<Iterator, true /*is_contiguous*/>
+_CCCL_HOST_DEVICE auto try_unwrap_contiguous_iterator(Iterator it)
 {
-  using type = unwrap_contiguous_iterator_t<Iterator>;
-  static _CCCL_HOST_DEVICE type get(Iterator it)
+  if constexpr (thrust::is_contiguous_iterator_v<Iterator>)
   {
     return unwrap_contiguous_iterator(it);
   }
-};
-} // namespace detail
+  else
+  {
+    return it;
+  }
+}
 //! Takes an iterator type and, if it is contiguous, yields the raw pointer type it represents. Otherwise returns the
 //! iterator type unmodified.
 template <typename Iterator>
-using try_unwrap_contiguous_iterator_t = typename detail::try_unwrap_contiguous_iterator_impl<Iterator>::type;
-//! Takes an iterator and, if it is contiguous, unwraps it to the raw pointer it represents. Otherwise returns the
-//! iterator unmodified.
-template <typename Iterator>
-_CCCL_HOST_DEVICE auto try_unwrap_contiguous_iterator(Iterator it) -> try_unwrap_contiguous_iterator_t<Iterator>
-{
-  return detail::try_unwrap_contiguous_iterator_impl<Iterator>::get(it);
-}
+using try_unwrap_contiguous_iterator_t = decltype(try_unwrap_contiguous_iterator(::cuda::std::declval<Iterator>()));
 THRUST_NAMESPACE_END

cuda/cccl/parallel/experimental/__init__.py CHANGED Viewed

@@ -1,73 +1,24 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION.
 #
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
-from .algorithms import (
-    DoubleBuffer,
-    SortOrder,
-    binary_transform,
-    exclusive_scan,
-    histogram_even,
-    inclusive_scan,
-    make_binary_transform,
-    make_exclusive_scan,
-    make_histogram_even,
-    make_inclusive_scan,
-    make_merge_sort,
-    make_radix_sort,
-    make_reduce_into,
-    make_segmented_reduce,
-    make_unary_transform,
-    make_unique_by_key,
-    merge_sort,
-    radix_sort,
-    reduce_into,
-    segmented_reduce,
-    unary_transform,
-    unique_by_key,
-)
-from .iterators import (
-    CacheModifiedInputIterator,
-    ConstantIterator,
-    CountingIterator,
-    ReverseIterator,
-    TransformIterator,
-    TransformOutputIterator,
-    ZipIterator,
-)
-from .op import OpKind
-from .struct import gpu_struct
+# alias for backwards compatibility
-__all__ = [
-    "binary_transform",
-    "CacheModifiedInputIterator",
-    "ConstantIterator",
-    "CountingIterator",
-    "DoubleBuffer",
-    "exclusive_scan",
-    "gpu_struct",
-    "histogram_even",
-    "inclusive_scan",
-    "make_binary_transform",
-    "make_exclusive_scan",
-    "make_histogram_even",
-    "make_inclusive_scan",
-    "make_merge_sort",
-    "make_radix_sort",
-    "make_reduce_into",
-    "make_segmented_reduce",
-    "make_unary_transform",
-    "make_unique_by_key",
-    "merge_sort",
-    "OpKind",
-    "radix_sort",
-    "reduce_into",
-    "ReverseIterator",
-    "segmented_reduce",
-    "SortOrder",
-    "TransformIterator",
-    "TransformOutputIterator",
-    "unary_transform",
-    "unique_by_key",
-    "ZipIterator",
-]
+from warnings import warn
+from cuda.compute import *  # noqa: F403
+warn(
+    "The module cuda.cccl.parallel.experimental is deprecated. Use cuda.compute instead.",
+    FutureWarning,
+)