PyPI - cuda-cccl - Versions diffs - 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)

cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h CHANGED

@@ -48,6 +48,13 @@
 #include <thrust/system/cuda/detail/util.h>
 #include <thrust/type_traits/is_trivially_relocatable.h>
+#if _CCCL_HAS_CUDA_COMPILER()
+#  include <cub/device/dispatch/tuning/tuning_transform.cuh>
+#endif // _CCCL_HAS_CUDA_COMPILER()
+#include <cuda/__fwd/zip_iterator.h>
+#include <cuda/std/tuple>
 THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
@@ -61,6 +68,21 @@ template <class Derived, class InputIt, class OutputIt, class TransformOp>
 OutputIt _CCCL_API _CCCL_FORCEINLINE
 transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);
+// Forward declare to work around a cyclic include, since "cuda/detail/transform.h" includes this header
+// We want this to unwrap zip_transform_iterator
+namespace __transform
+{
+_CCCL_EXEC_CHECK_DISABLE
+template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp, class Predicate>
+OutputIt _CCCL_API _CCCL_FORCEINLINE cub_transform_many(
+  execution_policy<Derived>& policy,
+  ::cuda::std::tuple<InputIts...> firsts,
+  OutputIt result,
+  Offset num_items,
+  TransformOp transform_op,
+  Predicate pred);
+} // namespace __transform
 namespace __copy
 {
 template <class H, class D, class T, class Size>
@@ -190,6 +212,17 @@ device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last,
     return result + n;
   }
+  else if constexpr (::cuda::__is_zip_transform_iterator<InputIt>)
+  {
+    const auto n = ::cuda::std::distance(first, last);
+    return cuda_cub::__transform::cub_transform_many(
+      policy,
+      ::cuda::std::move(first).__base(),
+      result,
+      n,
+      ::cuda::std::move(first).__pred(),
+      cub::detail::transform::always_true_predicate{});
+  }
   else
   {
     return cuda_cub::transform(

cuda/cccl/headers/include/thrust/system/cuda/detail/find.h CHANGED

@@ -39,11 +39,13 @@
 #if _CCCL_HAS_CUDA_COMPILER()
 #  include <thrust/system/cuda/config.h>
-#  include <thrust/distance.h>
-#  include <thrust/iterator/counting_iterator.h>
-#  include <thrust/iterator/transform_iterator.h>
 #  include <thrust/system/cuda/detail/execution_policy.h>
+#  include <cuda/__iterator/counting_iterator.h>
+#  include <cuda/__iterator/transform_iterator.h>
+#  include <cuda/__iterator/zip_iterator.h>
+#  include <cuda/std/__iterator/distance.h>
 THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
@@ -62,7 +64,6 @@ InputIt _CCCL_HOST_DEVICE find(execution_policy<Derived>& policy, InputIt first,
 }; // namespace cuda_cub
 THRUST_NAMESPACE_END
-#  include <thrust/iterator/zip_iterator.h>
 #  include <thrust/system/cuda/detail/reduce.h>
 THRUST_NAMESPACE_BEGIN
@@ -92,109 +93,13 @@ struct functor
     }
   }
 };
-template <class ValueType, class InputIt, class UnaryOp>
-struct transform_input_iterator_t
-{
-  using self_t            = transform_input_iterator_t;
-  using difference_type   = thrust::detail::it_difference_t<InputIt>;
-  using value_type        = ValueType;
-  using pointer           = void;
-  using reference         = value_type;
-  using iterator_category = ::cuda::std::random_access_iterator_tag;
-  InputIt input;
-  mutable UnaryOp op;
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE transform_input_iterator_t(InputIt input, UnaryOp op)
-      : input(input)
-      , op(op)
-  {}
-  transform_input_iterator_t(const self_t&) = default;
-  // UnaryOp might not be copy assignable, such as when it is a lambda.  Define
-  // an explicit copy assignment operator that doesn't try to assign it.
-  _CCCL_HOST_DEVICE self_t& operator=(const self_t& o)
-  {
-    input = o.input;
-    return *this;
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++(int)
-  {
-    self_t retval = *this;
-    ++input;
-    return retval;
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++()
-  {
-    ++input;
-    return *this;
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*() const
-  {
-    thrust::detail::it_value_t<InputIt> x = *input;
-    return op(x);
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*()
-  {
-    thrust::detail::it_value_t<InputIt> x = *input;
-    return op(x);
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator+(difference_type n) const
-  {
-    return self_t(input + n, op);
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator+=(difference_type n)
-  {
-    input += n;
-    return *this;
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator-(difference_type n) const
-  {
-    return self_t(input - n, op);
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator-=(difference_type n)
-  {
-    input -= n;
-    return *this;
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_t other) const
-  {
-    return input - other.input;
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](difference_type n) const
-  {
-    return op(input[n]);
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_t& rhs) const
-  {
-    return (input == rhs.input);
-  }
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_t& rhs) const
-  {
-    return (input != rhs.input);
-  }
-};
 } // namespace __find_if
 template <class Derived, class InputIt, class Size, class Predicate>
 InputIt _CCCL_HOST_DEVICE
 find_if_n(execution_policy<Derived>& policy, InputIt first, Size num_items, Predicate predicate)
 {
-  using result_type = typename thrust::tuple<bool, Size>;
+  using result_type = ::cuda::std::tuple<bool, Size>;
   // empty sequence
   if (num_items == 0)
@@ -212,27 +117,20 @@ find_if_n(execution_policy<Derived>& policy, InputIt first, Size num_items, Pred
   const Size interval_threshold = 1 << 20;
   const Size interval_size      = (::cuda::std::min) (interval_threshold, num_items);
-  // FIXME(bgruber): we should also be able to use transform_iterator here, but it makes nvc++ hang. See:
-  // https://github.com/NVIDIA/cccl/issues/3594. The problem does not occur with nvcc, so we could not add a test :/
-  using XfrmIterator = __find_if::transform_input_iterator_t<bool, InputIt, Predicate>;
-  // using XfrmIterator  = transform_iterator<Predicate, InputIt>;
-  using IteratorTuple = thrust::tuple<XfrmIterator, counting_iterator<Size>>;
-  using ZipIterator   = thrust::zip_iterator<IteratorTuple>;
-  IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, predicate), counting_iterator<Size>(0));
-  ZipIterator begin = thrust::make_zip_iterator(iter_tuple);
-  ZipIterator end   = begin + num_items;
+  const auto begin = ::cuda::make_zip_iterator(
+    ::cuda::make_transform_iterator(try_unwrap_contiguous_iterator(first), predicate),
+    ::cuda::counting_iterator<Size>(0));
+  const auto end = begin + num_items;
-  for (ZipIterator interval_begin = begin; interval_begin < end; interval_begin += interval_size)
+  for (auto interval_begin = begin; interval_begin < end; interval_begin += interval_size)
   {
-    ZipIterator interval_end = interval_begin + interval_size;
+    auto interval_end = interval_begin + interval_size;
     if (end < interval_end)
     {
       interval_end = end;
     } // end if
-    result_type result = reduce(
+    const result_type result = reduce(
       policy, interval_begin, interval_end, result_type(false, interval_end - begin), __find_if::functor<result_type>());
     // see if we found something

cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h CHANGED

@@ -73,12 +73,14 @@ struct transform_pair_of_input_iterators_t
   using value_type        = ValueType;
   using pointer           = void;
   using reference         = value_type;
-  using iterator_category = std::random_access_iterator_tag;
+  using iterator_category = ::cuda::std::random_access_iterator_tag;
   InputIt1 input1;
   InputIt2 input2;
   mutable BinaryOp op;
+  transform_pair_of_input_iterators_t() = default;
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
   transform_pair_of_input_iterators_t(InputIt1 input1_, InputIt2 input2_, BinaryOp op_)
       : input1(input1_)
@@ -107,7 +109,7 @@ struct transform_pair_of_input_iterators_t
   }
   /// Prefix increment
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++()
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator++()
   {
     ++input1;
     ++input2;
@@ -177,6 +179,10 @@ struct transform_pair_of_input_iterators_t
     return (input1 != rhs.input1) || (input2 != rhs.input2);
   }
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator<(const self_t& rhs) const
+  {
+    return input1 < rhs.input1;
+  }
 }; // struct transform_pair_of_input_iterators_t
 } // namespace detail

cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h CHANGED

@@ -79,7 +79,7 @@ namespace detail
 template <typename Iterator>
 inline constexpr bool is_libcxx_wrap_iter_v = false;
-#if defined(_LIBCPP_VERSION)
+#if _CCCL_HOST_STD_LIB(LIBCXX)
 template <typename Iterator>
 inline constexpr bool is_libcxx_wrap_iter_v<
 #  if _LIBCPP_VERSION < 14000
@@ -88,23 +88,23 @@ inline constexpr bool is_libcxx_wrap_iter_v<
   std::__wrap_iter<Iterator>
 #  endif
   > = true;
-#endif
+#endif // _CCCL_HOST_STD_LIB(LIBCXX)
 template <typename Iterator>
 inline constexpr bool is_libstdcxx_normal_iterator_v = false;
-#if defined(__GLIBCXX__)
+#if _CCCL_HOST_STD_LIB(LIBSTDCXX)
 template <typename Iterator, typename Container>
 inline constexpr bool is_libstdcxx_normal_iterator_v<::__gnu_cxx::__normal_iterator<Iterator, Container>> = true;
-#endif
+#endif // _CCCL_HOST_STD_LIB(LIBSTDCXX)
-#if _CCCL_COMPILER(MSVC)
+#if _CCCL_HOST_STD_LIB(STL)
 template <typename Iterator>
 inline constexpr bool is_msvc_contiguous_iterator_v = ::cuda::std::is_pointer_v<::std::_Unwrapped_t<Iterator>>;
-#else
+#else // ^^^ _CCCL_HOST_STD_LIB(STL) ^^^ / vvv !_CCCL_HOST_STD_LIB(STL) vvv
 template <typename Iterator>
 inline constexpr bool is_msvc_contiguous_iterator_v = false;
-#endif
+#endif // ^^^ !_CCCL_HOST_STD_LIB(STL) ^^^
 template <typename Iterator>
 inline constexpr bool is_contiguous_iterator_impl_v =

cuda/cccl/parallel/experimental/__init__.py CHANGED

@@ -1,77 +1,24 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2025, NVIDIA CORPORATION.
 #
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
-from .algorithms import (
-    DoubleBuffer,
-    SortOrder,
-    binary_transform,
-    exclusive_scan,
-    histogram_even,
-    inclusive_scan,
-    make_binary_transform,
-    make_exclusive_scan,
-    make_histogram_even,
-    make_inclusive_scan,
-    make_merge_sort,
-    make_radix_sort,
-    make_reduce_into,
-    make_segmented_reduce,
-    make_three_way_partition,
-    make_unary_transform,
-    make_unique_by_key,
-    merge_sort,
-    radix_sort,
-    reduce_into,
-    segmented_reduce,
-    three_way_partition,
-    unary_transform,
-    unique_by_key,
-)
-from .iterators import (
-    CacheModifiedInputIterator,
-    ConstantIterator,
-    CountingIterator,
-    ReverseIterator,
-    TransformIterator,
-    TransformOutputIterator,
-    ZipIterator,
-)
-from .op import OpKind
-from .struct import gpu_struct
+# alias for backwards compatibility
-__all__ = [
-    "binary_transform",
-    "CacheModifiedInputIterator",
-    "ConstantIterator",
-    "CountingIterator",
-    "DoubleBuffer",
-    "exclusive_scan",
-    "gpu_struct",
-    "histogram_even",
-    "inclusive_scan",
-    "make_binary_transform",
-    "make_exclusive_scan",
-    "make_histogram_even",
-    "make_inclusive_scan",
-    "make_merge_sort",
-    "make_radix_sort",
-    "make_reduce_into",
-    "make_segmented_reduce",
-    "make_three_way_partition",
-    "make_unary_transform",
-    "make_unique_by_key",
-    "merge_sort",
-    "OpKind",
-    "radix_sort",
-    "reduce_into",
-    "ReverseIterator",
-    "segmented_reduce",
-    "SortOrder",
-    "TransformIterator",
-    "three_way_partition",
-    "TransformOutputIterator",
-    "unary_transform",
-    "unique_by_key",
-    "ZipIterator",
-]
+from warnings import warn
+from cuda.compute import *  # noqa: F403
+warn(
+    "The module cuda.cccl.parallel.experimental is deprecated. Use cuda.compute instead.",
+    FutureWarning,
+)

cuda/compute/__init__.py ADDED

@@ -0,0 +1,79 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+from .algorithms import (
+    DoubleBuffer,
+    SortOrder,
+    binary_transform,
+    exclusive_scan,
+    histogram_even,
+    inclusive_scan,
+    make_binary_transform,
+    make_exclusive_scan,
+    make_histogram_even,
+    make_inclusive_scan,
+    make_merge_sort,
+    make_radix_sort,
+    make_reduce_into,
+    make_segmented_reduce,
+    make_three_way_partition,
+    make_unary_transform,
+    make_unique_by_key,
+    merge_sort,
+    radix_sort,
+    reduce_into,
+    segmented_reduce,
+    three_way_partition,
+    unary_transform,
+    unique_by_key,
+)
+from .iterators import (
+    CacheModifiedInputIterator,
+    ConstantIterator,
+    CountingIterator,
+    PermutationIterator,
+    ReverseIterator,
+    TransformIterator,
+    TransformOutputIterator,
+    ZipIterator,
+)
+from .op import OpKind
+from .struct import gpu_struct
+__all__ = [
+    "binary_transform",
+    "CacheModifiedInputIterator",
+    "ConstantIterator",
+    "CountingIterator",
+    "DoubleBuffer",
+    "exclusive_scan",
+    "gpu_struct",
+    "histogram_even",
+    "inclusive_scan",
+    "make_binary_transform",
+    "make_exclusive_scan",
+    "make_histogram_even",
+    "make_inclusive_scan",
+    "make_merge_sort",
+    "make_radix_sort",
+    "make_reduce_into",
+    "make_segmented_reduce",
+    "make_three_way_partition",
+    "make_unary_transform",
+    "make_unique_by_key",
+    "merge_sort",
+    "OpKind",
+    "PermutationIterator",
+    "radix_sort",
+    "reduce_into",
+    "ReverseIterator",
+    "segmented_reduce",
+    "SortOrder",
+    "TransformIterator",
+    "TransformOutputIterator",
+    "three_way_partition",
+    "unary_transform",
+    "unique_by_key",
+    "ZipIterator",
+]

cuda/cccl/parallel/experimental/_bindings.pyi → cuda/compute/_bindings.pyi RENAMED

@@ -57,6 +57,12 @@ class SortOrder(IntEnum):
     ASCENDING = ...
     DESCENDING = ...
+class InitKind(IntEnum):
+    _value_: int
+    NO_INIT = ...
+    FUTURE_VALUE_INIT = ...
+    VALUE_INIT = ...
 class Op:
     def __init__(
         self,
@@ -133,6 +139,8 @@ class Iterator:
     def state(self, value) -> None: ...
     @property
     def type(self) -> IteratorKind: ...
+    @property
+    def value_type(self) -> TypeInfo: ...
     def as_bytes(self) -> bytes: ...
     def is_kind_pointer(self) -> bool: ...
     def is_kind_iterator(self) -> bool: ...
@@ -197,8 +205,9 @@ class DeviceScanBuildResult:
         d_in: Iterator,
         d_out: Iterator,
         binary_op: Op,
-        h_init: Value,
+        init_type: TypeInfo,
         force_inclusive: bool,
+        init_kind: InitKind,
         info: CommonData,
     ): ...
     def compute_inclusive(
@@ -223,6 +232,39 @@ class DeviceScanBuildResult:
         h_init: Value,
         stream,
     ) -> int: ...
+    def compute_inclusive_future_value(
+        self,
+        temp_storage_ptr: int | None,
+        temp_storage_nbytes: int,
+        d_in: Iterator,
+        d_out: Iterator,
+        num_items: int,
+        binary_op: Op,
+        h_init: Iterator,
+        stream,
+    ) -> int: ...
+    def compute_exclusive_future_value(
+        self,
+        temp_storage_ptr: int | None,
+        temp_storage_nbytes: int,
+        d_in: Iterator,
+        d_out: Iterator,
+        num_items: int,
+        binary_op: Op,
+        h_init: Iterator,
+        stream,
+    ) -> int: ...
+    def compute_inclusive_no_init(
+        self,
+        temp_storage_ptr: int | None,
+        temp_storage_nbytes: int,
+        d_in: Iterator,
+        d_out: Iterator,
+        num_items: int,
+        binary_op: Op,
+        h_init: None,
+        stream,
+    ) -> int: ...
 # ---------------------
 # DeviceSegmentedReduce