PyPI - cuda-cccl - Versions diffs - 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294) hide show

cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx RENAMED Viewed

@@ -4,7 +4,7 @@
 # Python signatures are declared in the companion Python stub file _bindings.pyi
 # Make sure to update PYI with change to Python API to ensure that Python
-# static type checker tools like mypy green-lights cuda.cccl.parallel
+# static type checker tools like mypy green-lights cuda.compute
 from libc.string cimport memset, memcpy
 from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
@@ -120,6 +120,10 @@ cdef extern from "cccl/c/types.h":
         ASCENDING "CCCL_ASCENDING"
         DESCENDING "CCCL_DESCENDING"
+    cpdef enum cccl_init_kind_t:
+        VALUE_INIT "CCCL_VALUE_INIT"
+        FUTURE_VALUE_INIT "CCCL_FUTURE_VALUE_INIT"
+        NO_INIT "CCCL_NO_INIT"
 cdef void arg_type_check(
     str arg_name,
@@ -136,6 +140,7 @@ OpKind = cccl_op_kind_t
 TypeEnum = cccl_type_enum
 IteratorKind = cccl_iterator_kind_t
 SortOrder = cccl_sort_order_t
+InitKind = cccl_init_kind_t
 cdef void _validate_alignment(int alignment) except *:
     """
@@ -724,6 +729,11 @@ cdef class Iterator:
         else:
             return IteratorKind.ITERATOR
+    @property
+    def value_type(self):
+        cdef cccl_type_info type_info = self.iter_data.value_type
+        return TypeInfo(type_info.size, type_info.alignment, type_info.type)
     def is_kind_pointer(self):
         cdef cccl_iterator_kind_t it_kind = self.iter_data.type
         return (it_kind == cccl_iterator_kind_t.POINTER)
@@ -947,8 +957,9 @@ cdef extern from "cccl/c/scan.h":
         cccl_iterator_t,
         cccl_iterator_t,
         cccl_op_t,
-        cccl_value_t,
+        cccl_type_info,
         _Bool,
+        cccl_init_kind_t,
         int, int, const char*, const char*, const char*, const char*
     ) nogil
@@ -976,6 +987,41 @@ cdef extern from "cccl/c/scan.h":
         CUstream
     ) nogil
+    cdef CUresult cccl_device_exclusive_scan_future_value(
+        cccl_device_scan_build_result_t,
+        void *,
+        size_t *,
+        cccl_iterator_t,
+        cccl_iterator_t,
+        uint64_t,
+        cccl_op_t,
+        cccl_iterator_t,
+        CUstream
+    ) nogil
+    cdef CUresult cccl_device_inclusive_scan_future_value(
+        cccl_device_scan_build_result_t,
+        void *,
+        size_t *,
+        cccl_iterator_t,
+        cccl_iterator_t,
+        uint64_t,
+        cccl_op_t,
+        cccl_iterator_t,
+        CUstream
+    ) nogil
+    cdef CUresult cccl_device_inclusive_scan_no_init(
+        cccl_device_scan_build_result_t,
+        void *,
+        size_t *,
+        cccl_iterator_t,
+        cccl_iterator_t,
+        uint64_t,
+        cccl_op_t,
+        CUstream
+    ) nogil
     cdef CUresult cccl_device_scan_cleanup(
         cccl_device_scan_build_result_t*
     ) nogil
@@ -989,8 +1035,9 @@ cdef class DeviceScanBuildResult:
         Iterator d_in,
         Iterator d_out,
         Op op,
-        Value h_init,
+        TypeInfo init_type,
         bint force_inclusive,
+        cccl_init_kind_t init_kind,
         CommonData common_data
     ):
         cdef CUresult status = -1
@@ -1008,8 +1055,9 @@ cdef class DeviceScanBuildResult:
                 d_in.iter_data,
                 d_out.iter_data,
                 op.op_data,
-                h_init.value_data,
+                init_type.type_info,
                 force_inclusive,
+                init_kind,
                 cc_major,
                 cc_minor,
                 cub_path,
@@ -1035,7 +1083,7 @@ cdef class DeviceScanBuildResult:
         Iterator d_out,
         size_t num_items,
         Op op,
-        Value h_init,
+        Value init_value,
         stream
     ):
         cdef CUresult status = -1
@@ -1052,7 +1100,7 @@ cdef class DeviceScanBuildResult:
                 d_out.iter_data,
                 <uint64_t>num_items,
                 op.op_data,
-                h_init.value_data,
+                init_value.value_data,
                 c_stream
             )
         if status != 0:
@@ -1069,7 +1117,7 @@ cdef class DeviceScanBuildResult:
         Iterator d_out,
         size_t num_items,
         Op op,
-        Value h_init,
+        Value init_value,
         stream
     ):
         cdef CUresult status = -1
@@ -1086,7 +1134,7 @@ cdef class DeviceScanBuildResult:
                 d_out.iter_data,
                 <uint64_t>num_items,
                 op.op_data,
-                h_init.value_data,
+                init_value.value_data,
                 c_stream
             )
         if status != 0:
@@ -1095,6 +1143,107 @@ cdef class DeviceScanBuildResult:
             )
         return storage_sz
+    cpdef int compute_inclusive_future_value(
+        DeviceScanBuildResult self,
+        temp_storage_ptr,
+        temp_storage_bytes,
+        Iterator d_in,
+        Iterator d_out,
+        size_t num_items,
+        Op op,
+        Iterator init_value,
+        stream
+    ):
+        cdef CUresult status = -1
+        cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
+        cdef size_t storage_sz = <size_t>temp_storage_bytes
+        cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
+        with nogil:
+            status = cccl_device_inclusive_scan_future_value(
+                self.build_data,
+                storage_ptr,
+                &storage_sz,
+                d_in.iter_data,
+                d_out.iter_data,
+                <uint64_t>num_items,
+                op.op_data,
+                init_value.iter_data,
+                c_stream
+            )
+        if status != 0:
+            raise RuntimeError(
+                f"Failed executing inclusive scan, error code: {status}"
+            )
+        return storage_sz
+    cpdef int compute_exclusive_future_value(
+        DeviceScanBuildResult self,
+        temp_storage_ptr,
+        temp_storage_bytes,
+        Iterator d_in,
+        Iterator d_out,
+        size_t num_items,
+        Op op,
+        Iterator init_value,
+        stream
+    ):
+        cdef CUresult status = -1
+        cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
+        cdef size_t storage_sz = <size_t>temp_storage_bytes
+        cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
+        with nogil:
+            status = cccl_device_exclusive_scan_future_value(
+                self.build_data,
+                storage_ptr,
+                &storage_sz,
+                d_in.iter_data,
+                d_out.iter_data,
+                <uint64_t>num_items,
+                op.op_data,
+                init_value.iter_data,
+                c_stream
+            )
+        if status != 0:
+            raise RuntimeError(
+                f"Failed executing exclusive scan, error code: {status}"
+            )
+        return storage_sz
+    cpdef int compute_inclusive_no_init(
+        DeviceScanBuildResult self,
+        temp_storage_ptr,
+        temp_storage_bytes,
+        Iterator d_in,
+        Iterator d_out,
+        size_t num_items,
+        Op op,
+        object init_value,
+        stream
+    ):
+        cdef CUresult status = -1
+        cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
+        cdef size_t storage_sz = <size_t>temp_storage_bytes
+        cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
+        with nogil:
+            status = cccl_device_inclusive_scan_no_init(
+                self.build_data,
+                storage_ptr,
+                &storage_sz,
+                d_in.iter_data,
+                d_out.iter_data,
+                <uint64_t>num_items,
+                op.op_data,
+                c_stream
+            )
+        if status != 0:
+            raise RuntimeError(
+                f"Failed executing inclusive scan, error code: {status}"
+            )
+        return storage_sz
     def _get_cubin(self):
         return PyBytes_FromStringAndSize(
             <const char*>self.build_data.cubin,

cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py RENAMED Viewed

@@ -148,7 +148,7 @@ def make_histogram_even(
     Example:
         Below, ``make_histogram_even`` is used to create a histogram object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_object.py
           :language: python
           :start-after: # example-begin
@@ -190,7 +190,7 @@ def histogram_even(
     Example:
         Below, ``histogram_even`` is used to compute a histogram with evenly-spaced bins.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_even_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_even_basic.py
             :language: python
             :start-after: # example-begin
             :caption: Basic histogram example.

cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py RENAMED Viewed

@@ -166,7 +166,7 @@ def make_merge_sort(
     Example:
         Below, ``make_merge_sort`` is used to create a merge sort object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_object.py
           :language: python
           :start-after: # example-begin
@@ -201,7 +201,7 @@ def merge_sort(
     Example:
         Below, ``merge_sort`` is used to sort a sequence of keys inplace. It also rearranges the items according to the keys' order.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_basic.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py RENAMED Viewed

@@ -222,7 +222,7 @@ def make_radix_sort(
     Example:
         Below, ``make_radix_sort`` is used to create a radix sort object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_object.py
           :language: python
           :start-after: # example-begin
@@ -259,14 +259,14 @@ def radix_sort(
     Example:
         Below, ``radix_sort`` is used to sort a sequence of keys. It also rearranges the values according to the keys' order.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_basic.py
             :language: python
             :start-after: # example-begin
         In the following example, ``radix_sort`` is used to sort a sequence of keys with a ``DoubleBuffer` for reduced temporary storage.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_buffer.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_buffer.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py RENAMED Viewed

@@ -130,7 +130,7 @@ def make_reduce_into(
     Example:
         Below, ``make_reduce_into`` is used to create a reduction object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/reduce_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/reduce_object.py
             :language: python
             :start-after: # example-begin
@@ -163,7 +163,7 @@ def reduce_into(
     Example:
         Below, ``reduce_into`` is used to compute the sum of a sequence of integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/sum_reduction.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/sum_reduction.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py RENAMED Viewed

@@ -3,7 +3,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-from typing import Callable, Union
+from typing import Callable, Union, cast
 import numba
 import numpy as np
@@ -20,14 +20,27 @@ from ..op import OpKind
 from ..typing import DeviceArrayLike, GpuStruct
+def get_init_kind(
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
+) -> _bindings.InitKind:
+    match init_value:
+        case None:
+            return _bindings.InitKind.NO_INIT
+        case _ if isinstance(init_value, DeviceArrayLike):
+            return _bindings.InitKind.FUTURE_VALUE_INIT
+        case _:
+            return _bindings.InitKind.VALUE_INIT
 class _Scan:
     __slots__ = [
         "build_result",
         "d_in_cccl",
         "d_out_cccl",
-        "h_init_cccl",
+        "init_value_cccl",
         "op_wrapper",
         "device_scan_fn",
+        "init_kind",
     ]
     # TODO: constructor shouldn't require concrete `d_in`, `d_out`:
@@ -36,36 +49,74 @@ class _Scan:
         d_in: DeviceArrayLike | IteratorBase,
         d_out: DeviceArrayLike | IteratorBase,
         op: Callable | OpKind,
-        h_init: np.ndarray | GpuStruct,
+        init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
         force_inclusive: bool,
     ):
         self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
         self.d_out_cccl = cccl.to_cccl_output_iter(d_out)
-        self.h_init_cccl = cccl.to_cccl_value(h_init)
-        if isinstance(h_init, np.ndarray):
-            value_type = numba.from_dtype(h_init.dtype)
-        else:
-            value_type = numba.typeof(h_init)
+        self.init_kind = get_init_kind(init_value)
+        self.init_value_cccl: _bindings.Iterator | _bindings.Value | None
+        match self.init_kind:
+            case _bindings.InitKind.NO_INIT:
+                # TODO: we just need to extract the dtype from the input iterator
+                if not isinstance(d_in, DeviceArrayLike):
+                    raise ValueError(
+                        "No init value not supported for non-DeviceArrayLike input"
+                    )
+                self.init_value_cccl = None
+                value_type = numba.from_dtype(protocols.get_dtype(d_in))
+                init_value_type_info = self.d_in_cccl.value_type
+            case _bindings.InitKind.FUTURE_VALUE_INIT:
+                self.init_value_cccl = cccl.to_cccl_input_iter(init_value)
+                value_type = numba.from_dtype(
+                    protocols.get_dtype(cast(DeviceArrayLike, init_value))
+                )
+                init_value_type_info = self.init_value_cccl.value_type
+            case _bindings.InitKind.VALUE_INIT:
+                self.init_value_cccl = cccl.to_cccl_value(init_value)
+                value_type = (
+                    numba.from_dtype(init_value.dtype)
+                    if isinstance(init_value, np.ndarray)
+                    else numba.typeof(init_value)
+                )
+                init_value_type_info = self.init_value_cccl.type
         # For well-known operations, we don't need a signature
         if isinstance(op, OpKind):
             self.op_wrapper = cccl.to_cccl_op(op, None)
         else:
             self.op_wrapper = cccl.to_cccl_op(op, value_type(value_type, value_type))
         self.build_result = call_build(
             _bindings.DeviceScanBuildResult,
             self.d_in_cccl,
             self.d_out_cccl,
             self.op_wrapper,
-            self.h_init_cccl,
+            init_value_type_info,
             force_inclusive,
+            self.init_kind,
         )
-        self.device_scan_fn = (
-            self.build_result.compute_inclusive
-            if force_inclusive
-            else self.build_result.compute_exclusive
-        )
+        match (force_inclusive, self.init_kind):
+            case (True, _bindings.InitKind.FUTURE_VALUE_INIT):
+                self.device_scan_fn = self.build_result.compute_inclusive_future_value
+            case (True, _bindings.InitKind.VALUE_INIT):
+                self.device_scan_fn = self.build_result.compute_inclusive
+            case (True, _bindings.InitKind.NO_INIT):
+                self.device_scan_fn = self.build_result.compute_inclusive_no_init
+            case (False, _bindings.InitKind.FUTURE_VALUE_INIT):
+                self.device_scan_fn = self.build_result.compute_exclusive_future_value
+            case (False, _bindings.InitKind.VALUE_INIT):
+                self.device_scan_fn = self.build_result.compute_exclusive
+            case (False, _bindings.InitKind.NO_INIT):
+                raise ValueError("Exclusive scan with No init value is not supported")
     def __call__(
         self,
@@ -73,13 +124,25 @@ class _Scan:
         d_in,
         d_out,
         num_items: int,
-        h_init: np.ndarray | GpuStruct,
+        init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
         stream=None,
     ):
         set_cccl_iterator_state(self.d_in_cccl, d_in)
         set_cccl_iterator_state(self.d_out_cccl, d_out)
-        self.h_init_cccl.state = to_cccl_value_state(h_init)
+        match self.init_kind:
+            case _bindings.InitKind.FUTURE_VALUE_INIT:
+                # We know that the init_value_cccl is an Iterator, so this cast
+                # tells MyPy what the actual type is. cast() is a no-op at runtime,
+                # which makes it better than isinstance() since this is a hot path
+                # and we have to minimize the work we do prior to calling the
+                # kernel.
+                self.init_value_cccl = cast(_bindings.Iterator, self.init_value_cccl)
+                set_cccl_iterator_state(self.init_value_cccl, init_value)
+            case _bindings.InitKind.VALUE_INIT:
+                self.init_value_cccl = cast(_bindings.Value, self.init_value_cccl)
+                self.init_value_cccl.state = to_cccl_value_state(init_value)
         stream_handle = validate_and_get_stream(stream)
@@ -97,7 +160,7 @@ class _Scan:
             self.d_out_cccl,
             num_items,
             self.op_wrapper,
-            self.h_init_cccl,
+            self.init_value_cccl,
             stream_handle,
         )
         return temp_storage_bytes
@@ -107,7 +170,7 @@ def make_cache_key(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
     op: Callable | OpKind,
-    h_init: np.ndarray,
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
 ):
     d_in_key = (
         d_in.kind if isinstance(d_in, IteratorBase) else protocols.get_dtype(d_in)
@@ -123,8 +186,17 @@ def make_cache_key(
     else:
         op_key = CachableFunction(op)
-    h_init_key = h_init.dtype
-    return (d_in_key, d_out_key, op_key, h_init_key)
+    init_kind_key = get_init_kind(init_value)
+    match init_kind_key:
+        case _bindings.InitKind.NO_INIT:
+            init_value_key = None
+        case _bindings.InitKind.FUTURE_VALUE_INIT:
+            init_value_key = protocols.get_dtype(cast(DeviceArrayLike, init_value))
+        case _bindings.InitKind.VALUE_INIT:
+            init_value = cast(np.ndarray | GpuStruct, init_value)
+            init_value_key = init_value.dtype
+    return (d_in_key, d_out_key, op_key, init_value_key, init_kind_key)
 # TODO Figure out `sum` without operator and initial value
@@ -134,14 +206,14 @@ def make_exclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
     op: Callable | OpKind,
-    h_init: np.ndarray,
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
 ):
     """Computes a device-wide scan using the specified binary ``op`` and initial value ``init``.
     Example:
         Below, ``make_exclusive_scan`` is used to create an exclusive scan object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_object.py
           :language: python
           :start-after: # example-begin
@@ -150,19 +222,19 @@ def make_exclusive_scan(
         d_in: Device array or iterator containing the input sequence of data items
         d_out: Device array that will store the result of the scan
         op: Callable or OpKind representing the binary operator to apply
-        init: Numpy array storing initial value of the scan
+        init_value: Numpy array, device array, or GPU struct storing initial value of the scan, or None for no initial value
     Returns:
         A callable object that can be used to perform the scan
     """
-    return _Scan(d_in, d_out, op, h_init, False)
+    return _Scan(d_in, d_out, op, init_value, False)
 def exclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
     op: Callable | OpKind,
-    h_init: np.ndarray | GpuStruct,
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
     num_items: int,
     stream=None,
 ):
@@ -174,7 +246,7 @@ def exclusive_scan(
     Example:
         Below, ``exclusive_scan`` is used to compute an exclusive scan with max operation.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_max.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_max.py
             :language: python
             :start-after: # example-begin
@@ -183,14 +255,14 @@ def exclusive_scan(
         d_in: Device array or iterator containing the input sequence of data items
         d_out: Device array or iterator to store the result of the scan
         op: Binary scan operator
-        h_init: Initial value for the scan
+        init_value: Initial value for the scan
         num_items: Number of items to scan
         stream: CUDA stream for the operation (optional)
     """
-    scanner = make_exclusive_scan(d_in, d_out, op, h_init)
-    tmp_storage_bytes = scanner(None, d_in, d_out, num_items, h_init, stream)
+    scanner = make_exclusive_scan(d_in, d_out, op, init_value)
+    tmp_storage_bytes = scanner(None, d_in, d_out, num_items, init_value, stream)
     tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
-    scanner(tmp_storage, d_in, d_out, num_items, h_init, stream)
+    scanner(tmp_storage, d_in, d_out, num_items, init_value, stream)
 # TODO Figure out `sum` without operator and initial value
@@ -200,14 +272,14 @@ def make_inclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
     op: Callable | OpKind,
-    h_init: np.ndarray,
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
 ):
     """Computes a device-wide scan using the specified binary ``op`` and initial value ``init``.
     Example:
         Below, ``make_inclusive_scan`` is used to create an inclusive scan object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_object.py
           :language: python
           :start-after: # example-begin
@@ -216,19 +288,19 @@ def make_inclusive_scan(
         d_in: Device array or iterator containing the input sequence of data items
         d_out: Device array that will store the result of the scan
         op: Callable or OpKind representing the binary operator to apply
-        init: Numpy array storing initial value of the scan
+        init_value: Numpy array, device array, or GPU struct storing initial value of the scan, or None for no initial value
     Returns:
         A callable object that can be used to perform the scan
     """
-    return _Scan(d_in, d_out, op, h_init, True)
+    return _Scan(d_in, d_out, op, init_value, True)
 def inclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
     op: Callable | OpKind,
-    h_init: np.ndarray | GpuStruct,
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
     num_items: int,
     stream=None,
 ):
@@ -240,7 +312,7 @@ def inclusive_scan(
     Example:
         Below, ``inclusive_scan`` is used to compute an inclusive scan (prefix sum).
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_custom.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_custom.py
             :language: python
             :start-after: # example-begin
@@ -249,11 +321,11 @@ def inclusive_scan(
         d_in: Device array or iterator containing the input sequence of data items
         d_out: Device array or iterator to store the result of the scan
         op: Binary scan operator
-        h_init: Initial value for the scan
+        init_value: Initial value for the scan
         num_items: Number of items to scan
         stream: CUDA stream for the operation (optional)
     """
-    scanner = make_inclusive_scan(d_in, d_out, op, h_init)
-    tmp_storage_bytes = scanner(None, d_in, d_out, num_items, h_init, stream)
+    scanner = make_inclusive_scan(d_in, d_out, op, init_value)
+    tmp_storage_bytes = scanner(None, d_in, d_out, num_items, init_value, stream)
     tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
-    scanner(tmp_storage, d_in, d_out, num_items, h_init, stream)
+    scanner(tmp_storage, d_in, d_out, num_items, init_value, stream)

cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py RENAMED Viewed

@@ -179,7 +179,7 @@ def make_segmented_reduce(
     Example:
         Below, ``make_segmented_reduce`` is used to create a segmented reduction object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_object.py
             :language: python
             :start-after: # example-begin
@@ -216,7 +216,7 @@ def segmented_reduce(
     Example:
         Below, ``segmented_reduce`` is used to compute the minimum value of segments in a sequence of integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_basic.py
             :language: python
             :start-after: # example-begin