PyPI - cuda-cccl - Versions diffs - 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)

cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py RENAMED Viewed

@@ -165,7 +165,7 @@ def make_three_way_partition(
     Example:
         Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_object.py
             :language: python
             :start-after: # example-begin
@@ -214,7 +214,7 @@ def three_way_partition(
     Example:
         Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_basic.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py RENAMED Viewed

@@ -11,7 +11,7 @@ from .._caching import CachableFunction, cache_with_key
 from .._cccl_interop import set_cccl_iterator_state
 from .._utils import protocols
 from ..iterators._iterators import IteratorBase
-from ..numba_utils import get_inferred_return_type
+from ..numba_utils import get_inferred_return_type, signature_from_annotations
 from ..op import OpKind
 from ..typing import DeviceArrayLike
@@ -32,16 +32,20 @@ class _UnaryTransform:
     ):
         self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
         self.d_out_cccl = cccl.to_cccl_output_iter(d_out)
-        in_value_type = cccl.get_value_type(d_in)
-        out_value_type = cccl.get_value_type(d_out)
         # For well-known operations, we don't need a signature
         if isinstance(op, OpKind):
             self.op_wrapper = cccl.to_cccl_op(op, None)
         else:
-            if not out_value_type.is_internal:
-                out_value_type = get_inferred_return_type(op, (in_value_type,))
-            sig = out_value_type(in_value_type)
+            try:
+                sig = signature_from_annotations(op)
+            except ValueError:
+                in_value_type = cccl.get_value_type(d_in)
+                out_value_type = cccl.get_value_type(d_out)
+                if not out_value_type.is_internal:
+                    out_value_type = get_inferred_return_type(op, (in_value_type,))
+                sig = out_value_type(in_value_type)
             self.op_wrapper = cccl.to_cccl_op(op, sig=sig)
         self.build_result = cccl.call_build(
             _bindings.DeviceUnaryTransform,
@@ -97,11 +101,14 @@ class _BinaryTransform:
         if isinstance(op, OpKind):
             self.op_wrapper = cccl.to_cccl_op(op, None)
         else:
-            if not out_value_type.is_internal:
-                out_value_type = get_inferred_return_type(
-                    op, (in1_value_type, in2_value_type)
-                )
-            sig = out_value_type(in1_value_type, in2_value_type)
+            try:
+                sig = signature_from_annotations(op)
+            except ValueError:
+                if not out_value_type.is_internal:
+                    out_value_type = get_inferred_return_type(
+                        op, (in1_value_type, in2_value_type)
+                    )
+                sig = out_value_type(in1_value_type, in2_value_type)
             self.op_wrapper = cccl.to_cccl_op(op, sig=sig)
         self.build_result = cccl.call_build(
             _bindings.DeviceBinaryTransform,
@@ -196,7 +203,7 @@ def make_unary_transform(
     storage allocation. For simpler usage, consider using :func:`unary_transform`.
     Example:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_object.py
            :language: python
            :start-after: # example-begin
@@ -227,7 +234,7 @@ def make_binary_transform(
     storage allocation. For simpler usage, consider using :func:`binary_transform`.
     Example:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_object.py
            :language: python
            :start-after: # example-begin
@@ -259,7 +266,14 @@ def unary_transform(
     Example:
         Below, ``unary_transform`` is used to apply a transformation to each element of the input.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_basic.py
+           :language: python
+           :start-after: # example-begin
+        When working with custom struct types, you need to provide type annotations
+        to help with type inference. See the binary transform struct example for reference:
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_struct.py
            :language: python
            :start-after: # example-begin
@@ -291,7 +305,14 @@ def binary_transform(
     Example:
         Below, ``binary_transform`` is used to apply a transformation to pairs of elements from two input sequences.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_basic.py
+           :language: python
+           :start-after: # example-begin
+        When working with custom struct types, you need to provide type annotations
+        to help with type inference. See the following example:
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_struct.py
            :language: python
            :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py RENAMED Viewed

@@ -171,7 +171,7 @@ def make_unique_by_key(
     Example:
         Below, ``make_unique_by_key`` is used to create a unique by key object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_object.py
           :language: python
           :start-after: # example-begin
@@ -211,7 +211,7 @@ def unique_by_key(
     Example:
         Below, ``unique_by_key`` is used to populate the arrays of output keys and items with the first key and its corresponding item from each sequence of equal keys. It also outputs the number of items selected.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_basic.py
             :language: python
             :start-after: # example-begin

cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so ADDED Viewed

Binary file

cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so RENAMED Viewed

Binary file

cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so ADDED Viewed

Binary file

cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so RENAMED Viewed

Binary file

cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py RENAMED Viewed

@@ -2,6 +2,7 @@ from ._factories import (
     CacheModifiedInputIterator,
     ConstantIterator,
     CountingIterator,
+    PermutationIterator,
     ReverseIterator,
     TransformIterator,
     TransformOutputIterator,
@@ -12,6 +13,7 @@ __all__ = [
     "CacheModifiedInputIterator",
     "ConstantIterator",
     "CountingIterator",
+    "PermutationIterator",
     "ReverseIterator",
     "TransformIterator",
     "TransformOutputIterator",

cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py RENAMED Viewed

@@ -10,6 +10,7 @@ from ._iterators import (
     CountingIterator as _CountingIterator,
 )
 from ._iterators import (
+    make_permutation_iterator,
     make_reverse_iterator,
     make_transform_iterator,
 )
@@ -26,7 +27,7 @@ def CacheModifiedInputIterator(device_array, modifier):
     Example:
         The code snippet below demonstrates the usage of a ``CacheModifiedInputIterator``:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/cache_modified_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/cache_modified_iterator_basic.py
             :language: python
             :start-after: # example-begin
@@ -55,7 +56,7 @@ def ConstantIterator(value):
         The code snippet below demonstrates the usage of a ``ConstantIterator``
         representing a sequence of constant values:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/constant_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/constant_iterator_basic.py
             :language: python
             :start-after: # example-begin
@@ -78,7 +79,7 @@ def CountingIterator(offset):
         The code snippet below demonstrates the usage of a ``CountingIterator``
         representing the sequence ``[10, 11, 12]``:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/counting_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/counting_iterator_basic.py
             :language: python
             :start-after: # example-begin
@@ -100,13 +101,13 @@ def ReverseIterator(sequence):
     Examples:
         The code snippet below demonstrates the usage of a ``ReverseIterator`` as an input iterator:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_input_iterator.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_input_iterator.py
             :language: python
             :start-after: # example-begin
         The code snippet below demonstrates the usage of a ``ReverseIterator`` as an output iterator:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_output_iterator.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_output_iterator.py
             :language: python
             :start-after: # example-begin
@@ -129,7 +130,7 @@ def TransformIterator(it, op):
         The code snippet below demonstrates the usage of a ``TransformIterator`` composed with a ``CountingIterator``
         to transform the input before performing a reduction.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_iterator_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_iterator_basic.py
             :language: python
             :start-after: # example-begin
     Args:
@@ -151,7 +152,7 @@ def TransformOutputIterator(it, op):
         The code snippet below demonstrates the usage of a ``TransformOutputIterator`` to transform the output
         of a reduction before writing to an output array.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_output_iterator.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_output_iterator.py
             :language: python
             :start-after: # example-begin
@@ -165,6 +166,33 @@ def TransformOutputIterator(it, op):
     return make_transform_iterator(it, op, "output")
+def PermutationIterator(values, indices):
+    """Returns an Iterator that accesses values through an index mapping.
+    Similar to https://nvidia.github.io/cccl/thrust/api/classthrust_1_1permutation__iterator.html
+    The permutation iterator accesses elements from the values collection using indices
+    from the indices collection, effectively computing values[indices[i]] at position i.
+    This is useful for gather/scatter operations and indirect array access patterns.
+    Example:
+        The code snippet below demonstrates the usage of a ``PermutationIterator``
+        to access values in a permuted order:
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/permutation_iterator_basic.py
+            :language: python
+            :start-after: # example-begin
+    Args:
+        values: The values array or iterator to be permuted
+        indices: An iterator or device array providing the indices for permutation
+    Returns:
+        A ``PermutationIterator`` object that yields values[indices[i]] at position i
+    """
+    return make_permutation_iterator(values, indices)
 def ZipIterator(*iterators):
     """Returns an Iterator representing a zipped sequence of values from N iterators.
@@ -178,7 +206,7 @@ def ZipIterator(*iterators):
         The code snippet below demonstrates the usage of a ``ZipIterator``
         combining two device arrays:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/zip_iterator_elementwise.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/zip_iterator_elementwise.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py RENAMED Viewed

@@ -207,7 +207,15 @@ def pointer_add_intrinsic(context, ptr, offset):
     def codegen(context, builder, sig, args):
         ptr, index = args
         base = builder.ptrtoint(ptr, ir.IntType(_DEVICE_POINTER_BITWIDTH))
-        offset = builder.mul(index, sizeof_pointee(context, ptr))
+        sizeof = sizeof_pointee(context, ptr)
+        # Cast index to match sizeof type if needed
+        if index.type != sizeof.type:
+            index = (
+                builder.sext(index, sizeof.type)
+                if index.type.width < sizeof.type.width
+                else builder.trunc(index, sizeof.type)
+            )
+        offset = builder.mul(index, sizeof)
         result = builder.add(base, offset)
         return builder.inttoptr(result, ptr.type)
@@ -610,3 +618,200 @@ def _get_last_element_ptr(device_array) -> int:
     ptr = get_data_pointer(device_array)
     return ptr + offset_to_last_element
+class PermutationIteratorKind(IteratorKind):
+    pass
+def make_permutation_iterator(values, indices):
+    """
+    Create a PermutationIterator that accesses values through an index mapping.
+    The permutation iterator accesses elements from `values` using indices from `indices`,
+    effectively computing values[indices[i]] at position i.
+    Args:
+        values: The values array or iterator to permute
+        indices: The indices array or iterator specifying the permutation
+    Returns:
+        PermutationIterator: Iterator that yields permuted values
+    """
+    # Convert arrays to iterators if needed
+    if hasattr(values, "__cuda_array_interface__"):
+        values = pointer(values, numba.from_dtype(get_dtype(values)))
+    elif not isinstance(values, IteratorBase):
+        raise TypeError("values must be a device array or iterator")
+    if hasattr(indices, "__cuda_array_interface__"):
+        indices = pointer(indices, numba.from_dtype(get_dtype(indices)))
+    elif not isinstance(indices, IteratorBase):
+        raise TypeError("indices must be an iterator or device array")
+    # JIT compile value advance/dereference methods
+    value_dtype = values.value_type
+    values_state_type = values.state_type
+    index_type = indices.value_type
+    value_advance = cuda.jit(values.advance, device=True)
+    value_input_dereference = cuda.jit(values.input_dereference, device=True)
+    try:
+        output_deref = values.output_dereference
+        if output_deref is not None:
+            value_output_dereference = cuda.jit(output_deref, device=True)
+            values_is_output_iterator = True
+        else:
+            values_is_output_iterator = False
+    except AttributeError:
+        values_is_output_iterator = False
+    # JIT compile index advance/dereference methods
+    index_advance = cuda.jit(indices.advance, device=True)
+    index_input_dereference = cuda.jit(indices.input_dereference, device=True)
+    # The cvalue and state for PermutationIterator are
+    # structs composed of the cvalues and states of the
+    # value and index iterators.
+    from ..struct import gpu_struct_from_numba_types
+    class PermutationCValueStruct(ctypes.Structure):
+        _fields_ = [
+            ("value_state", values.cvalue.__class__),
+            ("index_state", indices.cvalue.__class__),
+        ]
+    PermutationState = gpu_struct_from_numba_types(
+        "PermutationState",
+        ("value_state", "index_state"),
+        (values_state_type, indices.state_type),
+    )
+    cvalue = PermutationCValueStruct(values.cvalue, indices.cvalue)
+    state_type = PermutationState._numba_type
+    value_type = value_dtype
+    # Define intrinsics for accessing struct fields
+    @intrinsic
+    def get_value_state_field_ptr(context, struct_ptr_type):
+        def codegen(context, builder, sig, args):
+            struct_ptr = args[0]
+            # Use GEP to get pointer to field at index 0 (value_state)
+            field_ptr = builder.gep(
+                struct_ptr,
+                [ir.Constant(ir.IntType(32), 0), ir.Constant(ir.IntType(32), 0)],
+            )
+            return field_ptr
+        from numba.core.datamodel.registry import default_manager
+        struct_model = default_manager.lookup(struct_ptr_type.dtype)
+        field_type = struct_model._members[0]
+        return types.CPointer(field_type)(struct_ptr_type), codegen
+    @intrinsic
+    def get_index_state_field_ptr(context, struct_ptr_type):
+        def codegen(context, builder, sig, args):
+            struct_ptr = args[0]
+            # Use GEP to get pointer to field at index 1 (index_state)
+            field_ptr = builder.gep(
+                struct_ptr,
+                [ir.Constant(ir.IntType(32), 0), ir.Constant(ir.IntType(32), 1)],
+            )
+            return field_ptr
+        from numba.core.datamodel.registry import default_manager
+        struct_model = default_manager.lookup(struct_ptr_type.dtype)
+        field_type = struct_model._members[1]
+        return types.CPointer(field_type)(struct_ptr_type), codegen
+    # Create intrinsic for allocating temporary storage for index
+    @intrinsic
+    def alloca_temp_for_index_type(context):
+        def codegen(context, builder, sig, args):
+            temp_value_type = context.get_value_type(index_type)
+            temp_ptr = builder.alloca(temp_value_type)
+            return temp_ptr
+        return types.CPointer(index_type)(), codegen
+    # Create intrinsic for allocating temporary storage for value state
+    @intrinsic
+    def alloca_temp_for_value_state(context):
+        def codegen(context, builder, sig, args):
+            temp_state_type = context.get_value_type(values_state_type)
+            temp_ptr = builder.alloca(temp_state_type)
+            return temp_ptr
+        return types.CPointer(values_state_type)(), codegen
+    class PermutationIterator(IteratorBase):
+        iterator_kind_type = PermutationIteratorKind
+        def __init__(self, values_it, indices_it):
+            self._values = values_it
+            self._indices = indices_it
+            super().__init__(
+                cvalue=cvalue,
+                state_type=state_type,
+                value_type=value_type,
+            )
+            self._kind = self.__class__.iterator_kind_type(
+                (value_type, values_it.kind, indices_it.kind), state_type
+            )
+        @property
+        def advance(self):
+            return PermutationIterator._advance
+        @property
+        def input_dereference(self):
+            return PermutationIterator._input_dereference
+        @property
+        def output_dereference(self):
+            if not values_is_output_iterator:
+                raise AttributeError(
+                    "PermutationIterator cannot be used as output iterator "
+                    "when values iterator does not support output"
+                )
+            return PermutationIterator._output_dereference
+        @staticmethod
+        def _advance(state, distance):
+            # advance the index iterator
+            index_state_ptr = get_index_state_field_ptr(state)
+            index_advance(index_state_ptr, distance)
+        @staticmethod
+        def _input_dereference(state, result):
+            # dereference index to get the index value
+            index_state_ptr = get_index_state_field_ptr(state)
+            temp_index = alloca_temp_for_index_type()
+            index_input_dereference(index_state_ptr, temp_index)
+            # copy the value state (which always points to position 0)
+            # and advance it by the index value
+            value_state_ptr = get_value_state_field_ptr(state)
+            temp_value_state = alloca_temp_for_value_state()
+            temp_value_state[0] = value_state_ptr[0]
+            value_advance(temp_value_state, temp_index[0])
+            value_input_dereference(temp_value_state, result)
+        @staticmethod
+        def _output_dereference(state, x):
+            # dereference index to get the index value
+            index_state_ptr = get_index_state_field_ptr(state)
+            temp_index = alloca_temp_for_index_type()
+            index_input_dereference(index_state_ptr, temp_index)
+            # copy the value state (which always points to position 0)
+            # and advance it by the index value
+            value_state_ptr = get_value_state_field_ptr(state)
+            temp_value_state = alloca_temp_for_value_state()
+            temp_value_state[0] = value_state_ptr[0]
+            value_advance(temp_value_state, temp_index[0])
+            value_output_dereference(temp_value_state, x)
+    return PermutationIterator(values, indices)

cuda/{cccl/parallel/experimental → compute}/numba_utils.py RENAMED Viewed

@@ -39,10 +39,10 @@ def signature_from_annotations(func) -> numba.core.typing.Signature:
     argspec = inspect.getfullargspec(func)
     num_args = len(argspec.args)
     try:
-        retty = to_numba_type(argspec.annotations["return"])
+        ret_ann = argspec.annotations["return"]
     except KeyError:
         raise ValueError("Function has incomplete annotations: missing return type")
+    retty = to_numba_type(ret_ann)
     if num_args != len(argspec.annotations) - 1:  # -1 for the return type
         raise ValueError("One or more arguments are missing type annotations")
     argtys = tuple(

cuda/{cccl/parallel/experimental → compute}/struct.py RENAMED Viewed

@@ -207,7 +207,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
     to a dataclass). The type of each field must be a subclass of
     `np.number`, like `np.int32` or `np.float64`.
-    Arrays of GPUStruct objects can be used as inputs to cuda.cccl.parallel
+    Arrays of GPUStruct objects can be used as inputs to cuda.compute
     algorithms.
     Example:
@@ -216,7 +216,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
         a reduction on an input array of floating point values to compute
         the smallest and the largest absolute values:
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/minmax_reduction.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/minmax_reduction.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/typing.py RENAMED Viewed

@@ -7,9 +7,11 @@ from typing import Any
 from typing_extensions import (
     Protocol,
+    runtime_checkable,
 )  # TODO: typing_extensions required for Python 3.7 docs env
+@runtime_checkable
 class DeviceArrayLike(Protocol):
     """
     Objects representing a device array, having a `.__cuda_array_interface__`

cuda/coop/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+from . import block, warp
+from ._types import StatefulFunction
+__all__ = ["block", "warp", "StatefulFunction"]

cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py RENAMED Viewed

@@ -5,8 +5,9 @@
 import functools
 from cuda.bindings import nvrtc
-from cuda.cccl.cooperative.experimental._caching import disk_cache
-from cuda.cccl.cooperative.experimental._common import check_in, version
+from ._caching import disk_cache
+from ._common import check_in, version
 def CHECK_NVRTC(err, prog):

cuda/{cccl/cooperative/experimental → coop}/_scan_op.py RENAMED Viewed

@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """
-cuda.cccl.cooperative.experimental._scan_op
-======================================
+cuda.coop._scan_op
+==================
 This module implements the ``ScanOp`` class and related functions.
 """
@@ -14,7 +14,7 @@ from enum import Enum
 import numpy as np
-from cuda.cccl.cooperative.experimental._typing import (
+from ._typing import (
     ScanOpType,
 )

cuda/{cccl/cooperative/experimental → coop}/_types.py RENAMED Viewed

@@ -17,8 +17,8 @@ from numba.core.typing import signature
 from numba.cuda import LTOIR
 from numba.cuda.cudadrv import driver as cuda_driver
-import cuda.cccl.cooperative.experimental._nvrtc as nvrtc
-from cuda.cccl.cooperative.experimental._common import find_unsigned
+from . import _nvrtc as nvrtc
+from ._common import find_unsigned
 NUMBA_TYPES_TO_CPP = {
     types.boolean: "bool",

cuda/{cccl/cooperative/experimental → coop}/_typing.py RENAMED Viewed

@@ -9,7 +9,7 @@ if TYPE_CHECKING:
     import numba
     import numpy as np
-    from cuda.cccl.cooperative.experimental._common import dim3
+    from ._common import dim3
 # Type alias for dimension parameters that can be passed to CUDA functions.
 DimType = Union["dim3", int, Tuple[int, int], Tuple[int, int, int]]

cuda/{cccl/cooperative/experimental → coop}/block/__init__.py RENAMED Viewed

@@ -2,18 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-from cuda.cccl.cooperative.experimental.block._block_exchange import (
+from ._block_exchange import (
     BlockExchangeType,
     exchange,
 )
-from cuda.cccl.cooperative.experimental.block._block_load_store import load, store
-from cuda.cccl.cooperative.experimental.block._block_merge_sort import merge_sort_keys
-from cuda.cccl.cooperative.experimental.block._block_radix_sort import (
+from ._block_load_store import load, store
+from ._block_merge_sort import merge_sort_keys
+from ._block_radix_sort import (
     radix_sort_keys,
     radix_sort_keys_descending,
 )
-from cuda.cccl.cooperative.experimental.block._block_reduce import reduce, sum
-from cuda.cccl.cooperative.experimental.block._block_scan import (
+from ._block_reduce import reduce, sum
+from ._block_scan import (
     exclusive_scan,
     exclusive_sum,
     inclusive_scan,

cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py RENAMED Viewed

@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """
-cuda.cccl.cooperative.block_exchange
+cuda.coop.block_exchange
 ====================================
 This module provides a set of :ref:`collective <collective-primitives>` methods
@@ -105,13 +105,13 @@ def exchange(
         perform.  Currently, only :py:attr:`StripedToBlocked` is supported.
     :param dtype: Supplies the data type of the input and output arrays.
-    :type dtype: :py:class:`cuda.cccl.cooperative.experimental._typing.DtypeType`
+    :type dtype: :py:class:`cuda.coop._typing.DtypeType`
     :param threads_per_block: Supplies the number of threads in the block,
         either as an integer for a 1D block or a tuple of two or three integers
         for a 2D or 3D block, respectively.
     :type threads_per_block:
-        :py:class:`cuda.cccl.cooperative.experimental._typing.DimType`
+        :py:class:`cuda.coop._typing.DimType`
     :param items_per_thread: Supplies the number of items partitioned onto each
         thread.
@@ -137,7 +137,7 @@ def exchange(
     :raises ValueError: If ``items_per_thread`` is greater than 1 and
         ``methods`` is not *None* (i.e. a user-defined type is being used).
-    :returns: An :py:class:`cuda.cccl.cooperative.experimental._types.Invocable`
+    :returns: An :py:class:`cuda.coop._types.Invocable`
         object representing the specialized kernel that can be called from
         a Numba JIT'd CUDA kernel.