PyPI - cuda-cccl - Versions diffs - 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (185)

cuda/compute/algorithms/_scan.py CHANGED Viewed

@@ -3,7 +3,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-from typing import Callable, Union
+from typing import Callable, Union, cast
 import numba
 import numpy as np
@@ -20,14 +20,27 @@ from ..op import OpKind
 from ..typing import DeviceArrayLike, GpuStruct
+def get_init_kind(
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
+) -> _bindings.InitKind:
+    match init_value:
+        case None:
+            return _bindings.InitKind.NO_INIT
+        case _ if isinstance(init_value, DeviceArrayLike):
+            return _bindings.InitKind.FUTURE_VALUE_INIT
+        case _:
+            return _bindings.InitKind.VALUE_INIT
 class _Scan:
     __slots__ = [
         "build_result",
         "d_in_cccl",
         "d_out_cccl",
-        "h_init_cccl",
+        "init_value_cccl",
         "op_wrapper",
         "device_scan_fn",
+        "init_kind",
     ]
     # TODO: constructor shouldn't require concrete `d_in`, `d_out`:
@@ -36,36 +49,74 @@ class _Scan:
         d_in: DeviceArrayLike | IteratorBase,
         d_out: DeviceArrayLike | IteratorBase,
         op: Callable | OpKind,
-        h_init: np.ndarray | GpuStruct,
+        init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
         force_inclusive: bool,
     ):
         self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
         self.d_out_cccl = cccl.to_cccl_output_iter(d_out)
-        self.h_init_cccl = cccl.to_cccl_value(h_init)
-        if isinstance(h_init, np.ndarray):
-            value_type = numba.from_dtype(h_init.dtype)
-        else:
-            value_type = numba.typeof(h_init)
+        self.init_kind = get_init_kind(init_value)
+        self.init_value_cccl: _bindings.Iterator | _bindings.Value | None
+        match self.init_kind:
+            case _bindings.InitKind.NO_INIT:
+                # TODO: we just need to extract the dtype from the input iterator
+                if not isinstance(d_in, DeviceArrayLike):
+                    raise ValueError(
+                        "No init value not supported for non-DeviceArrayLike input"
+                    )
+                self.init_value_cccl = None
+                value_type = numba.from_dtype(protocols.get_dtype(d_in))
+                init_value_type_info = self.d_in_cccl.value_type
+            case _bindings.InitKind.FUTURE_VALUE_INIT:
+                self.init_value_cccl = cccl.to_cccl_input_iter(init_value)
+                value_type = numba.from_dtype(
+                    protocols.get_dtype(cast(DeviceArrayLike, init_value))
+                )
+                init_value_type_info = self.init_value_cccl.value_type
+            case _bindings.InitKind.VALUE_INIT:
+                self.init_value_cccl = cccl.to_cccl_value(init_value)
+                value_type = (
+                    numba.from_dtype(init_value.dtype)
+                    if isinstance(init_value, np.ndarray)
+                    else numba.typeof(init_value)
+                )
+                init_value_type_info = self.init_value_cccl.type
         # For well-known operations, we don't need a signature
         if isinstance(op, OpKind):
             self.op_wrapper = cccl.to_cccl_op(op, None)
         else:
             self.op_wrapper = cccl.to_cccl_op(op, value_type(value_type, value_type))
         self.build_result = call_build(
             _bindings.DeviceScanBuildResult,
             self.d_in_cccl,
             self.d_out_cccl,
             self.op_wrapper,
-            self.h_init_cccl,
+            init_value_type_info,
             force_inclusive,
+            self.init_kind,
         )
-        self.device_scan_fn = (
-            self.build_result.compute_inclusive
-            if force_inclusive
-            else self.build_result.compute_exclusive
-        )
+        match (force_inclusive, self.init_kind):
+            case (True, _bindings.InitKind.FUTURE_VALUE_INIT):
+                self.device_scan_fn = self.build_result.compute_inclusive_future_value
+            case (True, _bindings.InitKind.VALUE_INIT):
+                self.device_scan_fn = self.build_result.compute_inclusive
+            case (True, _bindings.InitKind.NO_INIT):
+                self.device_scan_fn = self.build_result.compute_inclusive_no_init
+            case (False, _bindings.InitKind.FUTURE_VALUE_INIT):
+                self.device_scan_fn = self.build_result.compute_exclusive_future_value
+            case (False, _bindings.InitKind.VALUE_INIT):
+                self.device_scan_fn = self.build_result.compute_exclusive
+            case (False, _bindings.InitKind.NO_INIT):
+                raise ValueError("Exclusive scan with No init value is not supported")
     def __call__(
         self,
@@ -73,13 +124,25 @@ class _Scan:
         d_in,
         d_out,
         num_items: int,
-        h_init: np.ndarray | GpuStruct,
+        init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
         stream=None,
     ):
         set_cccl_iterator_state(self.d_in_cccl, d_in)
         set_cccl_iterator_state(self.d_out_cccl, d_out)
-        self.h_init_cccl.state = to_cccl_value_state(h_init)
+        match self.init_kind:
+            case _bindings.InitKind.FUTURE_VALUE_INIT:
+                # We know that the init_value_cccl is an Iterator, so this cast
+                # tells MyPy what the actual type is. cast() is a no-op at runtime,
+                # which makes it better than isinstance() since this is a hot path
+                # and we have to minimize the work we do prior to calling the
+                # kernel.
+                self.init_value_cccl = cast(_bindings.Iterator, self.init_value_cccl)
+                set_cccl_iterator_state(self.init_value_cccl, init_value)
+            case _bindings.InitKind.VALUE_INIT:
+                self.init_value_cccl = cast(_bindings.Value, self.init_value_cccl)
+                self.init_value_cccl.state = to_cccl_value_state(init_value)
         stream_handle = validate_and_get_stream(stream)
@@ -97,7 +160,7 @@ class _Scan:
             self.d_out_cccl,
             num_items,
             self.op_wrapper,
-            self.h_init_cccl,
+            self.init_value_cccl,
             stream_handle,
         )
         return temp_storage_bytes
@@ -107,7 +170,7 @@ def make_cache_key(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
     op: Callable | OpKind,
-    h_init: np.ndarray,
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
 ):
     d_in_key = (
         d_in.kind if isinstance(d_in, IteratorBase) else protocols.get_dtype(d_in)
@@ -123,8 +186,17 @@ def make_cache_key(
     else:
         op_key = CachableFunction(op)
-    h_init_key = h_init.dtype
-    return (d_in_key, d_out_key, op_key, h_init_key)
+    init_kind_key = get_init_kind(init_value)
+    match init_kind_key:
+        case _bindings.InitKind.NO_INIT:
+            init_value_key = None
+        case _bindings.InitKind.FUTURE_VALUE_INIT:
+            init_value_key = protocols.get_dtype(cast(DeviceArrayLike, init_value))
+        case _bindings.InitKind.VALUE_INIT:
+            init_value = cast(np.ndarray | GpuStruct, init_value)
+            init_value_key = init_value.dtype
+    return (d_in_key, d_out_key, op_key, init_value_key, init_kind_key)
 # TODO Figure out `sum` without operator and initial value
@@ -134,7 +206,7 @@ def make_exclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
     op: Callable | OpKind,
-    h_init: np.ndarray,
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
 ):
     """Computes a device-wide scan using the specified binary ``op`` and initial value ``init``.
@@ -150,19 +222,19 @@ def make_exclusive_scan(
         d_in: Device array or iterator containing the input sequence of data items
         d_out: Device array that will store the result of the scan
         op: Callable or OpKind representing the binary operator to apply
-        init: Numpy array storing initial value of the scan
+        init_value: Numpy array, device array, or GPU struct storing initial value of the scan, or None for no initial value
     Returns:
         A callable object that can be used to perform the scan
     """
-    return _Scan(d_in, d_out, op, h_init, False)
+    return _Scan(d_in, d_out, op, init_value, False)
 def exclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
     op: Callable | OpKind,
-    h_init: np.ndarray | GpuStruct,
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
     num_items: int,
     stream=None,
 ):
@@ -183,14 +255,14 @@ def exclusive_scan(
         d_in: Device array or iterator containing the input sequence of data items
         d_out: Device array or iterator to store the result of the scan
         op: Binary scan operator
-        h_init: Initial value for the scan
+        init_value: Initial value for the scan
         num_items: Number of items to scan
         stream: CUDA stream for the operation (optional)
     """
-    scanner = make_exclusive_scan(d_in, d_out, op, h_init)
-    tmp_storage_bytes = scanner(None, d_in, d_out, num_items, h_init, stream)
+    scanner = make_exclusive_scan(d_in, d_out, op, init_value)
+    tmp_storage_bytes = scanner(None, d_in, d_out, num_items, init_value, stream)
     tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
-    scanner(tmp_storage, d_in, d_out, num_items, h_init, stream)
+    scanner(tmp_storage, d_in, d_out, num_items, init_value, stream)
 # TODO Figure out `sum` without operator and initial value
@@ -200,7 +272,7 @@ def make_inclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
     op: Callable | OpKind,
-    h_init: np.ndarray,
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
 ):
     """Computes a device-wide scan using the specified binary ``op`` and initial value ``init``.
@@ -216,19 +288,19 @@ def make_inclusive_scan(
         d_in: Device array or iterator containing the input sequence of data items
         d_out: Device array that will store the result of the scan
         op: Callable or OpKind representing the binary operator to apply
-        init: Numpy array storing initial value of the scan
+        init_value: Numpy array, device array, or GPU struct storing initial value of the scan, or None for no initial value
     Returns:
         A callable object that can be used to perform the scan
     """
-    return _Scan(d_in, d_out, op, h_init, True)
+    return _Scan(d_in, d_out, op, init_value, True)
 def inclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
     op: Callable | OpKind,
-    h_init: np.ndarray | GpuStruct,
+    init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
     num_items: int,
     stream=None,
 ):
@@ -249,11 +321,11 @@ def inclusive_scan(
         d_in: Device array or iterator containing the input sequence of data items
         d_out: Device array or iterator to store the result of the scan
         op: Binary scan operator
-        h_init: Initial value for the scan
+        init_value: Initial value for the scan
         num_items: Number of items to scan
         stream: CUDA stream for the operation (optional)
     """
-    scanner = make_inclusive_scan(d_in, d_out, op, h_init)
-    tmp_storage_bytes = scanner(None, d_in, d_out, num_items, h_init, stream)
+    scanner = make_inclusive_scan(d_in, d_out, op, init_value)
+    tmp_storage_bytes = scanner(None, d_in, d_out, num_items, init_value, stream)
     tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
-    scanner(tmp_storage, d_in, d_out, num_items, h_init, stream)
+    scanner(tmp_storage, d_in, d_out, num_items, init_value, stream)

cuda/compute/algorithms/_transform.py CHANGED Viewed

@@ -11,7 +11,7 @@ from .._caching import CachableFunction, cache_with_key
 from .._cccl_interop import set_cccl_iterator_state
 from .._utils import protocols
 from ..iterators._iterators import IteratorBase
-from ..numba_utils import get_inferred_return_type
+from ..numba_utils import get_inferred_return_type, signature_from_annotations
 from ..op import OpKind
 from ..typing import DeviceArrayLike
@@ -32,16 +32,20 @@ class _UnaryTransform:
     ):
         self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
         self.d_out_cccl = cccl.to_cccl_output_iter(d_out)
-        in_value_type = cccl.get_value_type(d_in)
-        out_value_type = cccl.get_value_type(d_out)
         # For well-known operations, we don't need a signature
         if isinstance(op, OpKind):
             self.op_wrapper = cccl.to_cccl_op(op, None)
         else:
-            if not out_value_type.is_internal:
-                out_value_type = get_inferred_return_type(op, (in_value_type,))
-            sig = out_value_type(in_value_type)
+            try:
+                sig = signature_from_annotations(op)
+            except ValueError:
+                in_value_type = cccl.get_value_type(d_in)
+                out_value_type = cccl.get_value_type(d_out)
+                if not out_value_type.is_internal:
+                    out_value_type = get_inferred_return_type(op, (in_value_type,))
+                sig = out_value_type(in_value_type)
             self.op_wrapper = cccl.to_cccl_op(op, sig=sig)
         self.build_result = cccl.call_build(
             _bindings.DeviceUnaryTransform,
@@ -97,11 +101,14 @@ class _BinaryTransform:
         if isinstance(op, OpKind):
             self.op_wrapper = cccl.to_cccl_op(op, None)
         else:
-            if not out_value_type.is_internal:
-                out_value_type = get_inferred_return_type(
-                    op, (in1_value_type, in2_value_type)
-                )
-            sig = out_value_type(in1_value_type, in2_value_type)
+            try:
+                sig = signature_from_annotations(op)
+            except ValueError:
+                if not out_value_type.is_internal:
+                    out_value_type = get_inferred_return_type(
+                        op, (in1_value_type, in2_value_type)
+                    )
+                sig = out_value_type(in1_value_type, in2_value_type)
             self.op_wrapper = cccl.to_cccl_op(op, sig=sig)
         self.build_result = cccl.call_build(
             _bindings.DeviceBinaryTransform,
@@ -263,6 +270,13 @@ def unary_transform(
            :language: python
            :start-after: # example-begin
+        When working with custom struct types, you need to provide type annotations
+        to help with type inference. See the binary transform struct example for reference:
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_struct.py
+           :language: python
+           :start-after: # example-begin
     Args:
         d_in: Device array or iterator containing the input sequence of data items.
@@ -295,6 +309,13 @@ def binary_transform(
            :language: python
            :start-after: # example-begin
+        When working with custom struct types, you need to provide type annotations
+        to help with type inference. See the following example:
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_struct.py
+           :language: python
+           :start-after: # example-begin
     Args:
         d_in1: Device array or iterator containing the first input sequence of data items.

cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so CHANGED Viewed

Binary file

cuda/compute/cu12/cccl/libcccl.c.parallel.so CHANGED Viewed

Binary file

cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so CHANGED Viewed

Binary file

cuda/compute/cu13/cccl/libcccl.c.parallel.so CHANGED Viewed

Binary file

cuda/compute/iterators/__init__.py CHANGED Viewed

@@ -2,6 +2,7 @@ from ._factories import (
     CacheModifiedInputIterator,
     ConstantIterator,
     CountingIterator,
+    PermutationIterator,
     ReverseIterator,
     TransformIterator,
     TransformOutputIterator,
@@ -12,6 +13,7 @@ __all__ = [
     "CacheModifiedInputIterator",
     "ConstantIterator",
     "CountingIterator",
+    "PermutationIterator",
     "ReverseIterator",
     "TransformIterator",
     "TransformOutputIterator",

cuda/compute/iterators/_factories.py CHANGED Viewed

@@ -10,6 +10,7 @@ from ._iterators import (
     CountingIterator as _CountingIterator,
 )
 from ._iterators import (
+    make_permutation_iterator,
     make_reverse_iterator,
     make_transform_iterator,
 )
@@ -165,6 +166,33 @@ def TransformOutputIterator(it, op):
     return make_transform_iterator(it, op, "output")
+def PermutationIterator(values, indices):
+    """Returns an Iterator that accesses values through an index mapping.
+    Similar to https://nvidia.github.io/cccl/thrust/api/classthrust_1_1permutation__iterator.html
+    The permutation iterator accesses elements from the values collection using indices
+    from the indices collection, effectively computing values[indices[i]] at position i.
+    This is useful for gather/scatter operations and indirect array access patterns.
+    Example:
+        The code snippet below demonstrates the usage of a ``PermutationIterator``
+        to access values in a permuted order:
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/permutation_iterator_basic.py
+            :language: python
+            :start-after: # example-begin
+    Args:
+        values: The values array or iterator to be permuted
+        indices: An iterator or device array providing the indices for permutation
+    Returns:
+        A ``PermutationIterator`` object that yields values[indices[i]] at position i
+    """
+    return make_permutation_iterator(values, indices)
 def ZipIterator(*iterators):
     """Returns an Iterator representing a zipped sequence of values from N iterators.

cuda/compute/iterators/_iterators.py CHANGED Viewed

@@ -207,7 +207,15 @@ def pointer_add_intrinsic(context, ptr, offset):
     def codegen(context, builder, sig, args):
         ptr, index = args
         base = builder.ptrtoint(ptr, ir.IntType(_DEVICE_POINTER_BITWIDTH))
-        offset = builder.mul(index, sizeof_pointee(context, ptr))
+        sizeof = sizeof_pointee(context, ptr)
+        # Cast index to match sizeof type if needed
+        if index.type != sizeof.type:
+            index = (
+                builder.sext(index, sizeof.type)
+                if index.type.width < sizeof.type.width
+                else builder.trunc(index, sizeof.type)
+            )
+        offset = builder.mul(index, sizeof)
         result = builder.add(base, offset)
         return builder.inttoptr(result, ptr.type)
@@ -610,3 +618,200 @@ def _get_last_element_ptr(device_array) -> int:
     ptr = get_data_pointer(device_array)
     return ptr + offset_to_last_element
+class PermutationIteratorKind(IteratorKind):
+    pass
+def make_permutation_iterator(values, indices):
+    """
+    Create a PermutationIterator that accesses values through an index mapping.
+    The permutation iterator accesses elements from `values` using indices from `indices`,
+    effectively computing values[indices[i]] at position i.
+    Args:
+        values: The values array or iterator to permute
+        indices: The indices array or iterator specifying the permutation
+    Returns:
+        PermutationIterator: Iterator that yields permuted values
+    """
+    # Convert arrays to iterators if needed
+    if hasattr(values, "__cuda_array_interface__"):
+        values = pointer(values, numba.from_dtype(get_dtype(values)))
+    elif not isinstance(values, IteratorBase):
+        raise TypeError("values must be a device array or iterator")
+    if hasattr(indices, "__cuda_array_interface__"):
+        indices = pointer(indices, numba.from_dtype(get_dtype(indices)))
+    elif not isinstance(indices, IteratorBase):
+        raise TypeError("indices must be an iterator or device array")
+    # JIT compile value advance/dereference methods
+    value_dtype = values.value_type
+    values_state_type = values.state_type
+    index_type = indices.value_type
+    value_advance = cuda.jit(values.advance, device=True)
+    value_input_dereference = cuda.jit(values.input_dereference, device=True)
+    try:
+        output_deref = values.output_dereference
+        if output_deref is not None:
+            value_output_dereference = cuda.jit(output_deref, device=True)
+            values_is_output_iterator = True
+        else:
+            values_is_output_iterator = False
+    except AttributeError:
+        values_is_output_iterator = False
+    # JIT compile index advance/dereference methods
+    index_advance = cuda.jit(indices.advance, device=True)
+    index_input_dereference = cuda.jit(indices.input_dereference, device=True)
+    # The cvalue and state for PermutationIterator are
+    # structs composed of the cvalues and states of the
+    # value and index iterators.
+    from ..struct import gpu_struct_from_numba_types
+    class PermutationCValueStruct(ctypes.Structure):
+        _fields_ = [
+            ("value_state", values.cvalue.__class__),
+            ("index_state", indices.cvalue.__class__),
+        ]
+    PermutationState = gpu_struct_from_numba_types(
+        "PermutationState",
+        ("value_state", "index_state"),
+        (values_state_type, indices.state_type),
+    )
+    cvalue = PermutationCValueStruct(values.cvalue, indices.cvalue)
+    state_type = PermutationState._numba_type
+    value_type = value_dtype
+    # Define intrinsics for accessing struct fields
+    @intrinsic
+    def get_value_state_field_ptr(context, struct_ptr_type):
+        def codegen(context, builder, sig, args):
+            struct_ptr = args[0]
+            # Use GEP to get pointer to field at index 0 (value_state)
+            field_ptr = builder.gep(
+                struct_ptr,
+                [ir.Constant(ir.IntType(32), 0), ir.Constant(ir.IntType(32), 0)],
+            )
+            return field_ptr
+        from numba.core.datamodel.registry import default_manager
+        struct_model = default_manager.lookup(struct_ptr_type.dtype)
+        field_type = struct_model._members[0]
+        return types.CPointer(field_type)(struct_ptr_type), codegen
+    @intrinsic
+    def get_index_state_field_ptr(context, struct_ptr_type):
+        def codegen(context, builder, sig, args):
+            struct_ptr = args[0]
+            # Use GEP to get pointer to field at index 1 (index_state)
+            field_ptr = builder.gep(
+                struct_ptr,
+                [ir.Constant(ir.IntType(32), 0), ir.Constant(ir.IntType(32), 1)],
+            )
+            return field_ptr
+        from numba.core.datamodel.registry import default_manager
+        struct_model = default_manager.lookup(struct_ptr_type.dtype)
+        field_type = struct_model._members[1]
+        return types.CPointer(field_type)(struct_ptr_type), codegen
+    # Create intrinsic for allocating temporary storage for index
+    @intrinsic
+    def alloca_temp_for_index_type(context):
+        def codegen(context, builder, sig, args):
+            temp_value_type = context.get_value_type(index_type)
+            temp_ptr = builder.alloca(temp_value_type)
+            return temp_ptr
+        return types.CPointer(index_type)(), codegen
+    # Create intrinsic for allocating temporary storage for value state
+    @intrinsic
+    def alloca_temp_for_value_state(context):
+        def codegen(context, builder, sig, args):
+            temp_state_type = context.get_value_type(values_state_type)
+            temp_ptr = builder.alloca(temp_state_type)
+            return temp_ptr
+        return types.CPointer(values_state_type)(), codegen
+    class PermutationIterator(IteratorBase):
+        iterator_kind_type = PermutationIteratorKind
+        def __init__(self, values_it, indices_it):
+            self._values = values_it
+            self._indices = indices_it
+            super().__init__(
+                cvalue=cvalue,
+                state_type=state_type,
+                value_type=value_type,
+            )
+            self._kind = self.__class__.iterator_kind_type(
+                (value_type, values_it.kind, indices_it.kind), state_type
+            )
+        @property
+        def advance(self):
+            return PermutationIterator._advance
+        @property
+        def input_dereference(self):
+            return PermutationIterator._input_dereference
+        @property
+        def output_dereference(self):
+            if not values_is_output_iterator:
+                raise AttributeError(
+                    "PermutationIterator cannot be used as output iterator "
+                    "when values iterator does not support output"
+                )
+            return PermutationIterator._output_dereference
+        @staticmethod
+        def _advance(state, distance):
+            # advance the index iterator
+            index_state_ptr = get_index_state_field_ptr(state)
+            index_advance(index_state_ptr, distance)
+        @staticmethod
+        def _input_dereference(state, result):
+            # dereference index to get the index value
+            index_state_ptr = get_index_state_field_ptr(state)
+            temp_index = alloca_temp_for_index_type()
+            index_input_dereference(index_state_ptr, temp_index)
+            # copy the value state (which always points to position 0)
+            # and advance it by the index value
+            value_state_ptr = get_value_state_field_ptr(state)
+            temp_value_state = alloca_temp_for_value_state()
+            temp_value_state[0] = value_state_ptr[0]
+            value_advance(temp_value_state, temp_index[0])
+            value_input_dereference(temp_value_state, result)
+        @staticmethod
+        def _output_dereference(state, x):
+            # dereference index to get the index value
+            index_state_ptr = get_index_state_field_ptr(state)
+            temp_index = alloca_temp_for_index_type()
+            index_input_dereference(index_state_ptr, temp_index)
+            # copy the value state (which always points to position 0)
+            # and advance it by the index value
+            value_state_ptr = get_value_state_field_ptr(state)
+            temp_value_state = alloca_temp_for_value_state()
+            temp_value_state[0] = value_state_ptr[0]
+            value_advance(temp_value_state, temp_index[0])
+            value_output_dereference(temp_value_state, x)
+    return PermutationIterator(values, indices)

cuda/compute/numba_utils.py CHANGED Viewed

@@ -39,10 +39,10 @@ def signature_from_annotations(func) -> numba.core.typing.Signature:
     argspec = inspect.getfullargspec(func)
     num_args = len(argspec.args)
     try:
-        retty = to_numba_type(argspec.annotations["return"])
+        ret_ann = argspec.annotations["return"]
     except KeyError:
         raise ValueError("Function has incomplete annotations: missing return type")
+    retty = to_numba_type(ret_ann)
     if num_args != len(argspec.annotations) - 1:  # -1 for the return type
         raise ValueError("One or more arguments are missing type annotations")
     argtys = tuple(

cuda/compute/typing.py CHANGED Viewed

@@ -7,9 +7,11 @@ from typing import Any
 from typing_extensions import (
     Protocol,
+    runtime_checkable,
 )  # TODO: typing_extensions required for Python 3.7 docs env
+@runtime_checkable
 class DeviceArrayLike(Protocol):
     """
     Objects representing a device array, having a `.__cuda_array_interface__`