PyPI - cuda-cccl - Versions diffs - 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (177)

cuda/compute/algorithms/_three_way_partition.py ADDED Viewed

@@ -0,0 +1,261 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+from typing import Callable
+import numba
+from .. import _bindings
+from .. import _cccl_interop as cccl
+from .._caching import CachableFunction, cache_with_key
+from .._cccl_interop import call_build, set_cccl_iterator_state
+from .._utils import protocols
+from .._utils.temp_storage_buffer import TempStorageBuffer
+from ..iterators._iterators import IteratorBase
+from ..typing import DeviceArrayLike
+def make_cache_key(
+    d_in: DeviceArrayLike | IteratorBase,
+    d_first_part_out: DeviceArrayLike | IteratorBase,
+    d_second_part_out: DeviceArrayLike | IteratorBase,
+    d_unselected_out: DeviceArrayLike | IteratorBase,
+    d_num_selected_out: DeviceArrayLike | IteratorBase,
+    select_first_part_op: Callable,
+    select_second_part_op: Callable,
+):
+    d_in_key = (
+        d_in.kind if isinstance(d_in, IteratorBase) else protocols.get_dtype(d_in)
+    )
+    d_first_part_out_key = (
+        d_first_part_out.kind
+        if isinstance(d_first_part_out, IteratorBase)
+        else protocols.get_dtype(d_first_part_out)
+    )
+    d_second_part_out_key = (
+        d_second_part_out.kind
+        if isinstance(d_second_part_out, IteratorBase)
+        else protocols.get_dtype(d_second_part_out)
+    )
+    d_unselected_out_key = (
+        d_unselected_out.kind
+        if isinstance(d_unselected_out, IteratorBase)
+        else protocols.get_dtype(d_unselected_out)
+    )
+    d_num_selected_out_key = (
+        d_num_selected_out.kind
+        if isinstance(d_num_selected_out, IteratorBase)
+        else protocols.get_dtype(d_num_selected_out)
+    )
+    select_first_part_op_key = CachableFunction(select_first_part_op)
+    select_second_part_op_key = CachableFunction(select_second_part_op)
+    return (
+        d_in_key,
+        d_first_part_out_key,
+        d_second_part_out_key,
+        d_unselected_out_key,
+        d_num_selected_out_key,
+        select_first_part_op_key,
+        select_second_part_op_key,
+    )
+class _ThreeWayPartition:
+    __slots__ = [
+        "build_result",
+        "d_in_cccl",
+        "d_first_part_out_cccl",
+        "d_second_part_out_cccl",
+        "d_unselected_out_cccl",
+        "d_num_selected_out_cccl",
+        "select_first_part_op_wrapper",
+        "select_second_part_op_wrapper",
+    ]
+    def __init__(
+        self,
+        d_in: DeviceArrayLike | IteratorBase,
+        d_first_part_out: DeviceArrayLike | IteratorBase,
+        d_second_part_out: DeviceArrayLike | IteratorBase,
+        d_unselected_out: DeviceArrayLike | IteratorBase,
+        d_num_selected_out: DeviceArrayLike | IteratorBase,
+        select_first_part_op: Callable,
+        select_second_part_op: Callable,
+    ):
+        self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
+        self.d_first_part_out_cccl = cccl.to_cccl_output_iter(d_first_part_out)
+        self.d_second_part_out_cccl = cccl.to_cccl_output_iter(d_second_part_out)
+        self.d_unselected_out_cccl = cccl.to_cccl_output_iter(d_unselected_out)
+        self.d_num_selected_out_cccl = cccl.to_cccl_output_iter(d_num_selected_out)
+        value_type = cccl.get_value_type(d_in)
+        sig = numba.types.uint8(value_type)
+        # There are no well-known operations that can be used with three_way_partition
+        self.select_first_part_op_wrapper = cccl.to_cccl_op(select_first_part_op, sig)
+        self.select_second_part_op_wrapper = cccl.to_cccl_op(select_second_part_op, sig)
+        self.build_result = call_build(
+            _bindings.DeviceThreeWayPartitionBuildResult,
+            self.d_in_cccl,
+            self.d_first_part_out_cccl,
+            self.d_second_part_out_cccl,
+            self.d_unselected_out_cccl,
+            self.d_num_selected_out_cccl,
+            self.select_first_part_op_wrapper,
+            self.select_second_part_op_wrapper,
+        )
+    def __call__(
+        self,
+        temp_storage,
+        d_in,
+        d_first_part_out,
+        d_second_part_out,
+        d_unselected_out,
+        d_num_selected_out,
+        num_items: int,
+        stream=None,
+    ):
+        set_cccl_iterator_state(self.d_in_cccl, d_in)
+        set_cccl_iterator_state(self.d_first_part_out_cccl, d_first_part_out)
+        set_cccl_iterator_state(self.d_second_part_out_cccl, d_second_part_out)
+        set_cccl_iterator_state(self.d_unselected_out_cccl, d_unselected_out)
+        set_cccl_iterator_state(self.d_num_selected_out_cccl, d_num_selected_out)
+        stream_handle = protocols.validate_and_get_stream(stream)
+        if temp_storage is None:
+            temp_storage_bytes = 0
+            d_temp_storage = 0
+        else:
+            temp_storage_bytes = temp_storage.nbytes
+            d_temp_storage = protocols.get_data_pointer(temp_storage)
+        temp_storage_bytes = self.build_result.compute(
+            d_temp_storage,
+            temp_storage_bytes,
+            self.d_in_cccl,
+            self.d_first_part_out_cccl,
+            self.d_second_part_out_cccl,
+            self.d_unselected_out_cccl,
+            self.d_num_selected_out_cccl,
+            self.select_first_part_op_wrapper,
+            self.select_second_part_op_wrapper,
+            num_items,
+            stream_handle,
+        )
+        return temp_storage_bytes
+@cache_with_key(make_cache_key)
+def make_three_way_partition(
+    d_in: DeviceArrayLike | IteratorBase,
+    d_first_part_out: DeviceArrayLike | IteratorBase,
+    d_second_part_out: DeviceArrayLike | IteratorBase,
+    d_unselected_out: DeviceArrayLike | IteratorBase,
+    d_num_selected_out: DeviceArrayLike | IteratorBase,
+    select_first_part_op: Callable,
+    select_second_part_op: Callable,
+):
+    """
+    Computes a device-wide three-way partition using the specified unary ``select_first_part_op`` and ``select_second_part_op`` operators.
+    Example:
+        Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_object.py
+            :language: python
+            :start-after: # example-begin
+    Args:
+        d_in: Device array or iterator containing the input sequence of data items
+        d_first_part_out: Device array or iterator to store the first part of the output
+        d_second_part_out: Device array or iterator to store the second part of the output
+        d_unselected_out: Device array or iterator to store the unselected items
+        d_num_selected_out: Device array to store the number of items selected. The total number of items selected by ``select_first_part_op`` and ``select_second_part_op`` is stored in ``d_num_selected_out[0]`` and ``d_num_selected_out[1]``, respectively.
+        select_first_part_op: Callable representing the unary operator to select the first part
+        select_second_part_op: Callable representing the unary operator to select the second part
+    Returns:
+        A callable object that can be used to perform the three-way partition
+    """
+    return _ThreeWayPartition(
+        d_in,
+        d_first_part_out,
+        d_second_part_out,
+        d_unselected_out,
+        d_num_selected_out,
+        select_first_part_op,
+        select_second_part_op,
+    )
+def three_way_partition(
+    d_in: DeviceArrayLike | IteratorBase,
+    d_first_part_out: DeviceArrayLike | IteratorBase,
+    d_second_part_out: DeviceArrayLike | IteratorBase,
+    d_unselected_out: DeviceArrayLike | IteratorBase,
+    d_num_selected_out: DeviceArrayLike | IteratorBase,
+    select_first_part_op: Callable,
+    select_second_part_op: Callable,
+    num_items: int,
+    stream=None,
+):
+    """
+    Performs device-wide three-way partition. Given an input sequence of data items, it partitions the items into three parts:
+    - The first part is selected by the ``select_first_part_op`` operator.
+    - The second part is selected by the ``select_second_part_op`` operator.
+    - The unselected items are not selected by either operator.
+    This function automatically handles temporary storage allocation and execution.
+    Example:
+        Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_basic.py
+            :language: python
+            :start-after: # example-begin
+    Args:
+        d_in: Device array or iterator containing the input sequence of data items
+        d_first_part_out: Device array or iterator to store the first part of the output
+        d_second_part_out: Device array or iterator to store the second part of the output
+        d_unselected_out: Device array or iterator to store the unselected items
+        d_num_selected_out: Device array to store the number of items selected. The total number of items selected by ``select_first_part_op`` and ``select_second_part_op`` is stored in ``d_num_selected_out[0]`` and ``d_num_selected_out[1]``, respectively.
+        select_first_part_op: Callable representing the unary operator to select the first part
+        select_second_part_op: Callable representing the unary operator to select the second part
+        num_items: Number of items to partition
+        stream: CUDA stream for the operation (optional)
+    """
+    partitioner = make_three_way_partition(
+        d_in,
+        d_first_part_out,
+        d_second_part_out,
+        d_unselected_out,
+        d_num_selected_out,
+        select_first_part_op,
+        select_second_part_op,
+    )
+    tmp_storage_bytes = partitioner(
+        None,
+        d_in,
+        d_first_part_out,
+        d_second_part_out,
+        d_unselected_out,
+        d_num_selected_out,
+        num_items,
+        stream,
+    )
+    tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
+    partitioner(
+        tmp_storage,
+        d_in,
+        d_first_part_out,
+        d_second_part_out,
+        d_unselected_out,
+        d_num_selected_out,
+        num_items,
+        stream,
+    )

cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py RENAMED Viewed

@@ -196,7 +196,7 @@ def make_unary_transform(
     storage allocation. For simpler usage, consider using :func:`unary_transform`.
     Example:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_object.py
            :language: python
            :start-after: # example-begin
@@ -227,7 +227,7 @@ def make_binary_transform(
     storage allocation. For simpler usage, consider using :func:`binary_transform`.
     Example:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_object.py
            :language: python
            :start-after: # example-begin
@@ -259,7 +259,7 @@ def unary_transform(
     Example:
         Below, ``unary_transform`` is used to apply a transformation to each element of the input.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_basic.py
            :language: python
            :start-after: # example-begin
@@ -291,7 +291,7 @@ def binary_transform(
     Example:
         Below, ``binary_transform`` is used to apply a transformation to pairs of elements from two input sequences.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_basic.py
            :language: python
            :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py RENAMED Viewed

@@ -171,7 +171,7 @@ def make_unique_by_key(
     Example:
         Below, ``make_unique_by_key`` is used to create a unique by key object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_object.py
           :language: python
           :start-after: # example-begin
@@ -211,7 +211,7 @@ def unique_by_key(
     Example:
         Below, ``unique_by_key`` is used to populate the arrays of output keys and items with the first key and its corresponding item from each sequence of equal keys. It also outputs the number of items selected.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_basic.py
             :language: python
             :start-after: # example-begin

cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so ADDED Viewed

Binary file

cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so RENAMED Viewed

Binary file

cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so ADDED Viewed

Binary file

cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so RENAMED Viewed

Binary file

cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py RENAMED Viewed

@@ -26,7 +26,7 @@ def CacheModifiedInputIterator(device_array, modifier):
     Example:
         The code snippet below demonstrates the usage of a ``CacheModifiedInputIterator``:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/cache_modified_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/cache_modified_iterator_basic.py
             :language: python
             :start-after: # example-begin
@@ -55,7 +55,7 @@ def ConstantIterator(value):
         The code snippet below demonstrates the usage of a ``ConstantIterator``
         representing a sequence of constant values:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/constant_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/constant_iterator_basic.py
             :language: python
             :start-after: # example-begin
@@ -78,7 +78,7 @@ def CountingIterator(offset):
         The code snippet below demonstrates the usage of a ``CountingIterator``
         representing the sequence ``[10, 11, 12]``:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/counting_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/counting_iterator_basic.py
             :language: python
             :start-after: # example-begin
@@ -100,13 +100,13 @@ def ReverseIterator(sequence):
     Examples:
         The code snippet below demonstrates the usage of a ``ReverseIterator`` as an input iterator:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_input_iterator.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_input_iterator.py
             :language: python
             :start-after: # example-begin
         The code snippet below demonstrates the usage of a ``ReverseIterator`` as an output iterator:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_output_iterator.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_output_iterator.py
             :language: python
             :start-after: # example-begin
@@ -129,7 +129,7 @@ def TransformIterator(it, op):
         The code snippet below demonstrates the usage of a ``TransformIterator`` composed with a ``CountingIterator``
         to transform the input before performing a reduction.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_iterator_basic.py
             :language: python
             :start-after: # example-begin
     Args:
@@ -151,7 +151,7 @@ def TransformOutputIterator(it, op):
         The code snippet below demonstrates the usage of a ``TransformOutputIterator`` to transform the output
         of a reduction before writing to an output array.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_output_iterator.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_output_iterator.py
             :language: python
             :start-after: # example-begin
@@ -178,7 +178,7 @@ def ZipIterator(*iterators):
         The code snippet below demonstrates the usage of a ``ZipIterator``
         combining two device arrays:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/zip_iterator_elementwise.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/zip_iterator_elementwise.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/struct.py RENAMED Viewed

@@ -207,7 +207,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
     to a dataclass). The type of each field must be a subclass of
     `np.number`, like `np.int32` or `np.float64`.
-    Arrays of GPUStruct objects can be used as inputs to cuda.cccl.parallel
+    Arrays of GPUStruct objects can be used as inputs to cuda.compute
     algorithms.
     Example:
@@ -216,7 +216,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
         a reduction on an input array of floating point values to compute its
         the smallest and the largest absolute values:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/minmax_reduction.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/minmax_reduction.py
             :language: python
             :start-after: # example-begin

cuda/coop/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+from . import block, warp
+from ._types import StatefulFunction
+__all__ = ["block", "warp", "StatefulFunction"]

cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py RENAMED Viewed

@@ -5,8 +5,9 @@
 import functools
 from cuda.bindings import nvrtc
-from cuda.cccl.cooperative.experimental._caching import disk_cache
-from cuda.cccl.cooperative.experimental._common import check_in, version
+from ._caching import disk_cache
+from ._common import check_in, version
 def CHECK_NVRTC(err, prog):

cuda/{cccl/cooperative/experimental → coop}/_scan_op.py RENAMED Viewed

@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """
-cuda.cccl.cooperative.experimental._scan_op
-======================================
+cuda.coop._scan_op
+==================
 This module implements the ``ScanOp`` class and related functions.
 """
@@ -14,7 +14,7 @@ from enum import Enum
 import numpy as np
-from cuda.cccl.cooperative.experimental._typing import (
+from ._typing import (
     ScanOpType,
 )

cuda/{cccl/cooperative/experimental → coop}/_types.py RENAMED Viewed

@@ -17,8 +17,8 @@ from numba.core.typing import signature
 from numba.cuda import LTOIR
 from numba.cuda.cudadrv import driver as cuda_driver
-import cuda.cccl.cooperative.experimental._nvrtc as nvrtc
-from cuda.cccl.cooperative.experimental._common import find_unsigned
+from . import _nvrtc as nvrtc
+from ._common import find_unsigned
 NUMBA_TYPES_TO_CPP = {
     types.boolean: "bool",

cuda/{cccl/cooperative/experimental → coop}/_typing.py RENAMED Viewed

@@ -9,7 +9,7 @@ if TYPE_CHECKING:
     import numba
     import numpy as np
-    from cuda.cccl.cooperative.experimental._common import dim3
+    from ._common import dim3
 # Type alias for dimension parameters that can be passed to CUDA functions.
 DimType = Union["dim3", int, Tuple[int, int], Tuple[int, int, int]]

cuda/{cccl/cooperative/experimental → coop}/block/__init__.py RENAMED Viewed

@@ -2,18 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-from cuda.cccl.cooperative.experimental.block._block_exchange import (
+from ._block_exchange import (
     BlockExchangeType,
     exchange,
 )
-from cuda.cccl.cooperative.experimental.block._block_load_store import load, store
-from cuda.cccl.cooperative.experimental.block._block_merge_sort import merge_sort_keys
-from cuda.cccl.cooperative.experimental.block._block_radix_sort import (
+from ._block_load_store import load, store
+from ._block_merge_sort import merge_sort_keys
+from ._block_radix_sort import (
     radix_sort_keys,
     radix_sort_keys_descending,
 )
-from cuda.cccl.cooperative.experimental.block._block_reduce import reduce, sum
-from cuda.cccl.cooperative.experimental.block._block_scan import (
+from ._block_reduce import reduce, sum
+from ._block_scan import (
     exclusive_scan,
     exclusive_sum,
     inclusive_scan,

cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py RENAMED Viewed

@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """
-cuda.cccl.cooperative.block_exchange
+cuda.coop.block_exchange
 ====================================
 This module provides a set of :ref:`collective <collective-primitives>` methods
@@ -105,13 +105,13 @@ def exchange(
         perform.  Currently, only :py:attr:`StripedToBlocked` is supported.
     :param dtype: Supplies the data type of the input and output arrays.
-    :type dtype: :py:class:`cuda.cccl.cooperative.experimental._typing.DtypeType`
+    :type dtype: :py:class:`cuda.coop._typing.DtypeType`
     :param threads_per_block: Supplies the number of threads in the block,
         either as an integer for a 1D block or a tuple of two or three integers
         for a 2D or 3D block, respectively.
     :type threads_per_block:
-        :py:class:`cuda.cccl.cooperative.experimental._typing.DimType`
+        :py:class:`cuda.coop._typing.DimType`
     :param items_per_thread: Supplies the number of items partitioned onto each
         thread.
@@ -137,7 +137,7 @@ def exchange(
     :raises ValueError: If ``items_per_thread`` is greater than 1 and
         ``methods`` is not *None* (i.e. a user-defined type is being used).
-    :returns: An :py:class:`cuda.cccl.cooperative.experimental._types.Invocable`
+    :returns: An :py:class:`cuda.coop._types.Invocable`
         object representing the specialized kernel that call be called from
         a Numba JIT'd CUDA kernel.

cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py RENAMED Viewed

@@ -5,12 +5,12 @@
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Dependency,
     DependentArray,
@@ -70,13 +70,13 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
         The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
         each thread handling 4 integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin load_store
@@ -158,13 +158,13 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
         The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
         each thread handling 4 integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin load_store

cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py RENAMED Viewed

@@ -6,12 +6,12 @@ from typing import TYPE_CHECKING, Callable, Literal, Union
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Constant,
     Dependency,
@@ -41,7 +41,7 @@ def merge_sort_keys(
         are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
         where each thread owns 4 consecutive keys. We start by importing necessary modules:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -49,7 +49,7 @@ def merge_sort_keys(
         Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin merge-sort