PyPI - cuda-cccl - Versions diffs - 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.1__cp313-cp313-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.1__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (144) hide show

cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py RENAMED Viewed

@@ -166,7 +166,7 @@ def make_merge_sort(
     Example:
         Below, ``make_merge_sort`` is used to create a merge sort object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_object.py
           :language: python
           :start-after: # example-begin
@@ -201,7 +201,7 @@ def merge_sort(
     Example:
         Below, ``merge_sort`` is used to sort a sequence of keys inplace. It also rearranges the items according to the keys' order.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_basic.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py RENAMED Viewed

@@ -222,7 +222,7 @@ def make_radix_sort(
     Example:
         Below, ``make_radix_sort`` is used to create a radix sort object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_object.py
           :language: python
           :start-after: # example-begin
@@ -259,14 +259,14 @@ def radix_sort(
     Example:
         Below, ``radix_sort`` is used to sort a sequence of keys. It also rearranges the values according to the keys' order.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_basic.py
             :language: python
             :start-after: # example-begin
         In the following example, ``radix_sort`` is used to sort a sequence of keys with a ``DoubleBuffer`` for reduced temporary storage.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_buffer.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_buffer.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py RENAMED Viewed

@@ -130,7 +130,7 @@ def make_reduce_into(
     Example:
         Below, ``make_reduce_into`` is used to create a reduction object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/reduce_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/reduce_object.py
             :language: python
             :start-after: # example-begin
@@ -163,7 +163,7 @@ def reduce_into(
     Example:
         Below, ``reduce_into`` is used to compute the sum of a sequence of integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/sum_reduction.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/sum_reduction.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py RENAMED Viewed

@@ -141,7 +141,7 @@ def make_exclusive_scan(
     Example:
         Below, ``make_exclusive_scan`` is used to create an exclusive scan object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_object.py
           :language: python
           :start-after: # example-begin
@@ -174,7 +174,7 @@ def exclusive_scan(
     Example:
         Below, ``exclusive_scan`` is used to compute an exclusive scan with max operation.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_max.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_max.py
             :language: python
             :start-after: # example-begin
@@ -207,7 +207,7 @@ def make_inclusive_scan(
     Example:
         Below, ``make_inclusive_scan`` is used to create an inclusive scan object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_object.py
           :language: python
           :start-after: # example-begin
@@ -240,7 +240,7 @@ def inclusive_scan(
     Example:
         Below, ``inclusive_scan`` is used to compute an inclusive scan (prefix sum).
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_custom.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_custom.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py RENAMED Viewed

@@ -179,7 +179,7 @@ def make_segmented_reduce(
     Example:
         Below, ``make_segmented_reduce`` is used to create a segmented reduction object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_object.py
             :language: python
             :start-after: # example-begin
@@ -216,7 +216,7 @@ def segmented_reduce(
     Example:
         Below, ``segmented_reduce`` is used to compute the minimum value of segments in a sequence of integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_basic.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py RENAMED Viewed

@@ -165,7 +165,7 @@ def make_three_way_partition(
     Example:
         Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_object.py
             :language: python
             :start-after: # example-begin
@@ -214,7 +214,7 @@ def three_way_partition(
     Example:
         Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_basic.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py RENAMED Viewed

@@ -196,7 +196,7 @@ def make_unary_transform(
     storage allocation. For simpler usage, consider using :func:`unary_transform`.
     Example:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_object.py
            :language: python
            :start-after: # example-begin
@@ -227,7 +227,7 @@ def make_binary_transform(
     storage allocation. For simpler usage, consider using :func:`binary_transform`.
     Example:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_object.py
            :language: python
            :start-after: # example-begin
@@ -259,7 +259,7 @@ def unary_transform(
     Example:
         Below, ``unary_transform`` is used to apply a transformation to each element of the input.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_basic.py
            :language: python
            :start-after: # example-begin
@@ -291,7 +291,7 @@ def binary_transform(
     Example:
         Below, ``binary_transform`` is used to apply a transformation to pairs of elements from two input sequences.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_basic.py
            :language: python
            :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py RENAMED Viewed

@@ -171,7 +171,7 @@ def make_unique_by_key(
     Example:
         Below, ``make_unique_by_key`` is used to create a unique by key object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_object.py
           :language: python
           :start-after: # example-begin
@@ -211,7 +211,7 @@ def unique_by_key(
     Example:
         Below, ``unique_by_key`` is used to populate the arrays of output keys and items with the first key and its corresponding item from each sequence of equal keys. It also outputs the number of items selected.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_basic.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so RENAMED Viewed

Binary file

cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so RENAMED Viewed

Binary file

cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so RENAMED Viewed

Binary file

cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so RENAMED Viewed

Binary file

cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py RENAMED Viewed

@@ -26,7 +26,7 @@ def CacheModifiedInputIterator(device_array, modifier):
     Example:
         The code snippet below demonstrates the usage of a ``CacheModifiedInputIterator``:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/cache_modified_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/cache_modified_iterator_basic.py
             :language: python
             :start-after: # example-begin
@@ -55,7 +55,7 @@ def ConstantIterator(value):
         The code snippet below demonstrates the usage of a ``ConstantIterator``
         representing a sequence of constant values:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/constant_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/constant_iterator_basic.py
             :language: python
             :start-after: # example-begin
@@ -78,7 +78,7 @@ def CountingIterator(offset):
         The code snippet below demonstrates the usage of a ``CountingIterator``
         representing the sequence ``[10, 11, 12]``:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/counting_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/counting_iterator_basic.py
             :language: python
             :start-after: # example-begin
@@ -100,13 +100,13 @@ def ReverseIterator(sequence):
     Examples:
         The code snippet below demonstrates the usage of a ``ReverseIterator`` as an input iterator:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_input_iterator.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_input_iterator.py
             :language: python
             :start-after: # example-begin
         The code snippet below demonstrates the usage of a ``ReverseIterator`` as an output iterator:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_output_iterator.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_output_iterator.py
             :language: python
             :start-after: # example-begin
@@ -129,7 +129,7 @@ def TransformIterator(it, op):
         The code snippet below demonstrates the usage of a ``TransformIterator`` composed with a ``CountingIterator``
         to transform the input before performing a reduction.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_iterator_basic.py
             :language: python
             :start-after: # example-begin
     Args:
@@ -151,7 +151,7 @@ def TransformOutputIterator(it, op):
         The code snippet below demonstrates the usage of a ``TransformOutputIterator`` to transform the output
         of a reduction before writing to an output array.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_output_iterator.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_output_iterator.py
             :language: python
             :start-after: # example-begin
@@ -178,7 +178,7 @@ def ZipIterator(*iterators):
         The code snippet below demonstrates the usage of a ``ZipIterator``
         combining two device arrays:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/zip_iterator_elementwise.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/zip_iterator_elementwise.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/struct.py RENAMED Viewed

@@ -207,7 +207,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
     to a dataclass). The type of each field must be a subclass of
     `np.number`, like `np.int32` or `np.float64`.
-    Arrays of GPUStruct objects can be used as inputs to cuda.cccl.parallel
+    Arrays of GPUStruct objects can be used as inputs to cuda.compute
     algorithms.
     Example:
@@ -216,7 +216,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
         a reduction on an input array of floating point values to compute its
         smallest and largest absolute values:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/minmax_reduction.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/minmax_reduction.py
             :language: python
             :start-after: # example-begin

cuda/coop/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+from . import block, warp
+from ._types import StatefulFunction
+__all__ = ["block", "warp", "StatefulFunction"]

cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py RENAMED Viewed

@@ -5,8 +5,9 @@
 import functools
 from cuda.bindings import nvrtc
-from cuda.cccl.cooperative.experimental._caching import disk_cache
-from cuda.cccl.cooperative.experimental._common import check_in, version
+from ._caching import disk_cache
+from ._common import check_in, version
 def CHECK_NVRTC(err, prog):

cuda/{cccl/cooperative/experimental → coop}/_scan_op.py RENAMED Viewed

@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """
-cuda.cccl.cooperative.experimental._scan_op
-======================================
+cuda.coop._scan_op
+==================
 This module implements the ``ScanOp`` class and related functions.
 """
@@ -14,7 +14,7 @@ from enum import Enum
 import numpy as np
-from cuda.cccl.cooperative.experimental._typing import (
+from ._typing import (
     ScanOpType,
 )

cuda/{cccl/cooperative/experimental → coop}/_types.py RENAMED Viewed

@@ -17,8 +17,8 @@ from numba.core.typing import signature
 from numba.cuda import LTOIR
 from numba.cuda.cudadrv import driver as cuda_driver
-import cuda.cccl.cooperative.experimental._nvrtc as nvrtc
-from cuda.cccl.cooperative.experimental._common import find_unsigned
+from . import _nvrtc as nvrtc
+from ._common import find_unsigned
 NUMBA_TYPES_TO_CPP = {
     types.boolean: "bool",

cuda/{cccl/cooperative/experimental → coop}/_typing.py RENAMED Viewed

@@ -9,7 +9,7 @@ if TYPE_CHECKING:
     import numba
     import numpy as np
-    from cuda.cccl.cooperative.experimental._common import dim3
+    from ._common import dim3
 # Type alias for dimension parameters that can be passed to CUDA functions.
 DimType = Union["dim3", int, Tuple[int, int], Tuple[int, int, int]]

cuda/{cccl/cooperative/experimental → coop}/block/__init__.py RENAMED Viewed

@@ -2,18 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-from cuda.cccl.cooperative.experimental.block._block_exchange import (
+from ._block_exchange import (
     BlockExchangeType,
     exchange,
 )
-from cuda.cccl.cooperative.experimental.block._block_load_store import load, store
-from cuda.cccl.cooperative.experimental.block._block_merge_sort import merge_sort_keys
-from cuda.cccl.cooperative.experimental.block._block_radix_sort import (
+from ._block_load_store import load, store
+from ._block_merge_sort import merge_sort_keys
+from ._block_radix_sort import (
     radix_sort_keys,
     radix_sort_keys_descending,
 )
-from cuda.cccl.cooperative.experimental.block._block_reduce import reduce, sum
-from cuda.cccl.cooperative.experimental.block._block_scan import (
+from ._block_reduce import reduce, sum
+from ._block_scan import (
     exclusive_scan,
     exclusive_sum,
     inclusive_scan,

cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py RENAMED Viewed

@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """
-cuda.cccl.cooperative.block_exchange
+cuda.coop.block_exchange
 ====================================
 This module provides a set of :ref:`collective <collective-primitives>` methods
@@ -105,13 +105,13 @@ def exchange(
         perform.  Currently, only :py:attr:`StripedToBlocked` is supported.
     :param dtype: Supplies the data type of the input and output arrays.
-    :type dtype: :py:class:`cuda.cccl.cooperative.experimental._typing.DtypeType`
+    :type dtype: :py:class:`cuda.coop._typing.DtypeType`
     :param threads_per_block: Supplies the number of threads in the block,
         either as an integer for a 1D block or a tuple of two or three integers
         for a 2D or 3D block, respectively.
     :type threads_per_block:
-        :py:class:`cuda.cccl.cooperative.experimental._typing.DimType`
+        :py:class:`cuda.coop._typing.DimType`
     :param items_per_thread: Supplies the number of items partitioned onto each
         thread.
@@ -137,7 +137,7 @@ def exchange(
     :raises ValueError: If ``items_per_thread`` is greater than 1 and
         ``methods`` is not *None* (i.e. a user-defined type is being used).
-    :returns: An :py:class:`cuda.cccl.cooperative.experimental._types.Invocable`
+    :returns: An :py:class:`cuda.coop._types.Invocable`
         object representing the specialized kernel that can be called from
         a Numba JIT'd CUDA kernel.

cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py RENAMED Viewed

@@ -5,12 +5,12 @@
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Dependency,
     DependentArray,
@@ -70,13 +70,13 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
         The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
         each thread handling 4 integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin load_store
@@ -158,13 +158,13 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
         The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
         each thread handling 4 integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin load_store

cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py RENAMED Viewed

@@ -6,12 +6,12 @@ from typing import TYPE_CHECKING, Callable, Literal, Union
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Constant,
     Dependency,
@@ -41,7 +41,7 @@ def merge_sort_keys(
         are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
         where each thread owns 4 consecutive keys. We start by importing necessary modules:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -49,7 +49,7 @@ def merge_sort_keys(
         Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin merge-sort

cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py RENAMED Viewed

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Tuple, Union
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     CUB_BLOCK_SCAN_ALGOS,
     CudaSharedMemConfig,
     dim3,
@@ -14,7 +14,7 @@ from cuda.cccl.cooperative.experimental._common import (
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Dependency,
     DependentArray,
@@ -140,7 +140,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
         are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
         where each thread owns 4 consecutive keys. We start by importing necessary modules:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -148,7 +148,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
         Below is the code snippet that demonstrates the usage of the ``radix_sort_keys`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin radix-sort
@@ -181,7 +181,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
         are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
         where each thread owns 4 consecutive keys. We start by importing necessary modules:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -189,7 +189,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
         Below is the code snippet that demonstrates the usage of the ``radix_sort_keys_descending`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin radix-sort-descending

cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py RENAMED Viewed

@@ -6,13 +6,13 @@ from typing import TYPE_CHECKING, Callable, Literal, Tuple, Union
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     CUB_BLOCK_REDUCE_ALGOS,
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Dependency,
     DependentArray,
@@ -208,13 +208,13 @@ def reduce(
         The code snippet below illustrates a max reduction of 128 integer items that are
         partitioned across 128 threads.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin reduce
@@ -269,13 +269,13 @@ def sum(
         The code snippet below illustrates a sum of 128 integer items that are partitioned
         across 128 threads.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin sum