numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +4 -1
  3. numba_cuda/numba/cuda/_compat.py +47 -0
  4. numba_cuda/numba/cuda/api.py +4 -1
  5. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
  7. numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
  8. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
  10. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  11. numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
  12. numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
  13. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  14. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
  15. numba_cuda/numba/cuda/codegen.py +46 -12
  16. numba_cuda/numba/cuda/compiler.py +15 -9
  17. numba_cuda/numba/cuda/core/analysis.py +29 -21
  18. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
  19. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  20. numba_cuda/numba/cuda/core/base.py +12 -11
  21. numba_cuda/numba/cuda/core/bytecode.py +21 -13
  22. numba_cuda/numba/cuda/core/byteflow.py +336 -90
  23. numba_cuda/numba/cuda/core/compiler.py +3 -4
  24. numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
  25. numba_cuda/numba/cuda/core/config.py +5 -7
  26. numba_cuda/numba/cuda/core/consts.py +1 -1
  27. numba_cuda/numba/cuda/core/controlflow.py +17 -9
  28. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  29. numba_cuda/numba/cuda/core/errors.py +4 -912
  30. numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
  31. numba_cuda/numba/cuda/core/interpreter.py +334 -160
  32. numba_cuda/numba/cuda/core/ir.py +191 -119
  33. numba_cuda/numba/cuda/core/ir_utils.py +149 -128
  34. numba_cuda/numba/cuda/core/postproc.py +8 -8
  35. numba_cuda/numba/cuda/core/pythonapi.py +3 -0
  36. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  37. numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
  38. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  39. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  40. numba_cuda/numba/cuda/core/ssa.py +5 -5
  41. numba_cuda/numba/cuda/core/transforms.py +29 -16
  42. numba_cuda/numba/cuda/core/typed_passes.py +10 -10
  43. numba_cuda/numba/cuda/core/typeinfer.py +42 -27
  44. numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
  45. numba_cuda/numba/cuda/cpython/unicode.py +2 -2
  46. numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
  47. numba_cuda/numba/cuda/cudadecl.py +0 -13
  48. numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
  49. numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
  50. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  51. numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
  52. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  53. numba_cuda/numba/cuda/debuginfo.py +25 -0
  54. numba_cuda/numba/cuda/descriptor.py +1 -1
  55. numba_cuda/numba/cuda/device_init.py +4 -7
  56. numba_cuda/numba/cuda/deviceufunc.py +3 -6
  57. numba_cuda/numba/cuda/dispatcher.py +39 -49
  58. numba_cuda/numba/cuda/intrinsics.py +150 -1
  59. numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
  60. numba_cuda/numba/cuda/lowering.py +36 -29
  61. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  62. numba_cuda/numba/cuda/np/arrayobj.py +61 -9
  63. numba_cuda/numba/cuda/np/numpy_support.py +32 -9
  64. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
  65. numba_cuda/numba/cuda/printimpl.py +20 -0
  66. numba_cuda/numba/cuda/serialize.py +10 -0
  67. numba_cuda/numba/cuda/stubs.py +0 -11
  68. numba_cuda/numba/cuda/testing.py +4 -8
  69. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  70. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  71. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
  72. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  73. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  74. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  75. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
  76. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  77. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
  78. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
  79. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
  80. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  81. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
  82. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
  83. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
  85. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
  86. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  87. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  88. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
  89. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  90. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  91. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
  92. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
  93. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  94. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  95. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  96. numba_cuda/numba/cuda/tests/support.py +11 -0
  97. numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
  98. numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
  99. numba_cuda/numba/cuda/typing/context.py +3 -1
  100. numba_cuda/numba/cuda/typing/typeof.py +51 -2
  101. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
  102. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
  103. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  104. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  105. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  106. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  107. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
  108. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
  109. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
  110. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/cudadrv/dummyarray.py
@@ -279,6 +279,10 @@ class Array(object):
         if not self.dims:
             return {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True}
 
+        # All 0-size arrays are considered contiguous, even if they are multidimensional
+        if self.size == 0:
+            return {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True}
+
         # If this is a broadcast array then it is not contiguous
         if any([dim.stride == 0 for dim in self.dims]):
             return {"C_CONTIGUOUS": False, "F_CONTIGUOUS": False}
numba_cuda/numba/cuda/cudadrv/nvrtc.py
@@ -12,7 +12,7 @@ import os
 import warnings
 import functools
 
-from cuda.core.experimental import Program, ProgramOptions
+from numba.cuda._compat import Program, ProgramOptions
 from cuda.bindings import nvrtc as bindings_nvrtc
 
 NVRTC_EXTRA_SEARCH_PATHS = _readenv(
@@ -30,6 +30,44 @@ def _get_nvrtc_version():
     return (major, minor)
 
 
+def _verify_cc_tuple(cc):
+    version = _get_nvrtc_version()
+    ver_str = lambda version: ".".join(str(v) for v in version)
+
+    if len(cc) == 3:
+        cc, arch = (cc[0], cc[1]), cc[2]
+    else:
+        arch = ""
+
+    if arch not in ("", "a", "f"):
+        raise ValueError(
+            f"Invalid architecture suffix '{arch}' in compute capability "
+            f"{ver_str(cc)}{arch}. Expected '', 'a', or 'f'."
+        )
+
+    supported_ccs = get_supported_ccs()
+    try:
+        found = max(filter(lambda v: v <= cc, [v for v in supported_ccs]))
+    except ValueError:
+        raise RuntimeError(
+            f"Device compute capability {ver_str(cc)} is less than the "
+            f"minimum supported by NVRTC {ver_str(version)}. Supported "
+            "compute capabilities are "
+            f"{', '.join([ver_str(v) for v in supported_ccs])}."
+        )
+
+    if found != cc:
+        found = (found[0], found[1], arch)
+        warnings.warn(
+            f"Device compute capability {ver_str(cc)} is not supported by "
+            f"NVRTC {ver_str(version)}. Using {ver_str(found)} instead."
+        )
+    else:
+        found = (cc[0], cc[1], arch)
+
+    return found
+
+
 def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
     """
     Compile a CUDA C/C++ source to PTX or LTOIR for a given compute capability.
@@ -38,7 +76,8 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
     :type src: str
     :param name: The filename of the source (for information only)
     :type name: str
-    :param cc: A tuple ``(major, minor)`` of the compute capability
+    :param cc: A tuple ``(major, minor)`` or ``(major, minor, arch)`` of the
+        compute capability
     :type cc: tuple
     :param ltoir: Compile into LTOIR if True, otherwise into PTX
     :type ltoir: bool
@@ -49,34 +88,18 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
     :return: The compiled PTX or LTOIR and compilation log
     :rtype: tuple
     """
+    found = _verify_cc_tuple(cc)
     version = _get_nvrtc_version()
 
-    ver_str = lambda version: ".".join(str(v) for v in version)
-    supported_ccs = get_supported_ccs()
-    try:
-        found = max(filter(lambda v: v <= cc, [v for v in supported_ccs]))
-    except ValueError:
-        raise RuntimeError(
-            f"Device compute capability {ver_str(cc)} is less than the "
-            f"minimum supported by NVRTC {ver_str(version)}. Supported "
-            "compute capabilities are "
-            f"{', '.join([ver_str(v) for v in supported_ccs])}."
-        )
-
-    if found != cc:
-        warnings.warn(
-            f"Device compute capability {ver_str(cc)} is not supported by "
-            f"NVRTC {ver_str(version)}. Using {ver_str(found)} instead."
-        )
-
     # Compilation options:
     # - Compile for the current device's compute capability.
     # - The CUDA include path is added.
     # - Relocatable Device Code (rdc) is needed to prevent device functions
     #   being optimized away.
-    major, minor = found
+    major, minor = found[0], found[1]
+    cc_arch = found[2] if len(found) == 3 else ""
 
-    arch = f"sm_{major}{minor}"
+    arch = f"sm_{major}{minor}{cc_arch}"
 
     cuda_include_dir = get_cuda_paths()["include_dir"].info
     cuda_includes = [f"{cuda_include_dir}"]
@@ -109,6 +132,22 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
 
     includes = [numba_include, *cuda_includes, nrt_include, *extra_includes]
 
+    # TODO: move all this into Program/ProgramOptions
+    # logsz = config.CUDA_LOG_SIZE
+    #
+    # jitinfo = bytearray(logsz)
+    # jiterrors = bytearray(logsz)
+    #
+    # jit_option = binding.CUjit_option
+    # options = {
+    #     jit_option.CU_JIT_INFO_LOG_BUFFER: jitinfo,
+    #     jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz,
+    #     jit_option.CU_JIT_ERROR_LOG_BUFFER: jiterrors,
+    #     jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz,
+    #     jit_option.CU_JIT_LOG_VERBOSE: config.CUDA_VERBOSE_JIT_LOG,
+    # }
+    # info_log = jitinfo.decode("utf-8")
+
     options = ProgramOptions(
         arch=arch,
         include_path=includes,
@@ -140,7 +179,7 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
     return result, log
 
 
-def find_closest_arch(mycc):
+def find_closest_arch(cc):
     """
     Given a compute capability, return the closest compute capability supported
     by the CUDA toolkit.
@@ -150,17 +189,17 @@
     """
     supported_ccs = get_supported_ccs()
 
-    for i, cc in enumerate(supported_ccs):
-        if cc == mycc:
+    for i, supported_cc in enumerate(supported_ccs):
+        if supported_cc == cc:
             # Matches
-            return cc
-        elif cc > mycc:
+            return supported_cc
+        elif supported_cc > cc:
             # Exceeded
             if i == 0:
                 # CC lower than supported
                 msg = (
                     "GPU compute capability %d.%d is not supported"
-                    "(requires >=%d.%d)" % (mycc + cc)
+                    "(requires >=%d.%d)" % (cc + supported_cc)
                 )
                 raise CCSupportError(msg)
             else:
@@ -171,13 +210,29 @@
     return supported_ccs[-1]  # Choose the highest
 
 
-def get_arch_option(major, minor):
+def get_arch_option(major, minor, arch=""):
     """Matches with the closest architecture option"""
     if config.FORCE_CUDA_CC:
-        arch = config.FORCE_CUDA_CC
+        fcc = config.FORCE_CUDA_CC
+        major, minor = fcc[0], fcc[1]
+        if len(fcc) == 3:
+            arch = fcc[2]
+        else:
+            arch = ""
     else:
-        arch = find_closest_arch((major, minor))
-    return "compute_%d%d" % arch
+        new_major, new_minor = find_closest_arch((major, minor))
+        if (new_major, new_minor) != (major, minor):
+            # If we picked a different major / minor, then using an
+            # arch-specific version is invalid
+            if arch != "":
+                raise ValueError(
+                    f"Can't use arch-specific compute_{major}{minor}{arch} with "
+                    "closest found compute capability "
+                    f"compute_{new_major}{new_minor}"
+                )
+            major, minor = new_major, new_minor
+
+    return f"compute_{major}{minor}{arch}"
 
 
 def get_lowest_supported_cc():
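Illustrative usage sketch (not part of the diff): the nvrtc.py hunks above let an optional architecture suffix ("a" or "f") flow through compute-capability handling. The source string and file name below are placeholders, and the call assumes an NVRTC new enough for the arch-specific target:

    from numba.cuda.cudadrv import nvrtc

    src = 'extern "C" __device__ int add_one(int* out, int x) { *out = x + 1; return 0; }'

    # A plain (major, minor) tuple still works; (major, minor, "a") or (..., "f")
    # selects an architecture-specific target such as sm_90a.
    ptx, log = nvrtc.compile(src, "add_one.cu", (9, 0, "a"))

    # get_arch_option accepts the optional suffix as well, e.g. "compute_90a".
    print(nvrtc.get_arch_option(9, 0, "a"))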
numba_cuda/numba/cuda/cudaimpl.py
@@ -280,18 +280,6 @@ def ptx_syncwarp_mask(context, builder, sig, args):
     return context.get_dummy_value()
 
 
-@lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean)
-def ptx_vote_sync(context, builder, sig, args):
-    fname = "llvm.nvvm.vote.sync"
-    lmod = builder.module
-    fnty = ir.FunctionType(
-        ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
-        (ir.IntType(32), ir.IntType(32), ir.IntType(1)),
-    )
-    func = cgutils.get_or_insert_function(lmod, fnty, fname)
-    return builder.call(func, args)
-
-
 @lower(stubs.match_any_sync, types.i4, types.i4)
 @lower(stubs.match_any_sync, types.i4, types.i8)
 @lower(stubs.match_any_sync, types.i4, types.f4)
numba_cuda/numba/cuda/debuginfo.py
@@ -4,6 +4,7 @@
 import abc
 import os
 from contextlib import contextmanager
+from enum import IntEnum
 
 import llvmlite
 from llvmlite import ir
@@ -71,6 +72,16 @@ if not hasattr(config, "CUDA_DEBUG_POLY_USE_TYPED_CONST"):
     config.CUDA_DEBUG_POLY_USE_TYPED_CONST = DEBUG_POLY_USE_TYPED_CONST
 
 
+class DwarfAddressClass(IntEnum):
+    GENERIC = 0x00
+    GLOBAL = 0x01
+    REGISTER = 0x02
+    CONSTANT = 0x05
+    LOCAL = 0x06
+    PARAMETER = 0x07
+    SHARED = 0x08
+
+
 @contextmanager
 def suspend_emission(builder):
     """Suspends the emission of debug_metadata for the duration of the context
@@ -179,6 +190,19 @@ class DIBuilder(AbstractDIBuilder):
         # constructing subprograms
         self.dicompileunit = self._di_compile_unit()
 
+    def get_dwarf_address_class(self, addrspace):
+        # Map NVVM address space to DWARF address class.
+        from numba.cuda.cudadrv import nvvm
+
+        addrspace_to_addrclass_dict = {
+            nvvm.ADDRSPACE_GENERIC: None,
+            nvvm.ADDRSPACE_GLOBAL: DwarfAddressClass.GLOBAL,
+            nvvm.ADDRSPACE_SHARED: DwarfAddressClass.SHARED,
+            nvvm.ADDRSPACE_CONSTANT: DwarfAddressClass.CONSTANT,
+            nvvm.ADDRSPACE_LOCAL: DwarfAddressClass.LOCAL,
+        }
+        return addrspace_to_addrclass_dict.get(addrspace)
+
     def _var_type(self, lltype, size, datamodel=None):
         if self._DEBUG:
             print(
@@ -796,6 +820,7 @@ class CUDADIBuilder(DIBuilder):
                 },
                 is_distinct=True,
             )
+
         # For other cases, use upstream Numba implementation
         return super()._var_type(lltype, size, datamodel=datamodel)
 
numba_cuda/numba/cuda/descriptor.py
@@ -28,7 +28,7 @@ class CUDATarget:
     @property
     def target_context(self):
         if self._targetctx is None:
-            self._targetctx = CUDATargetContext(self._typingctx)
+            self._targetctx = CUDATargetContext(self.typing_context)
         return self._targetctx
 
 
numba_cuda/numba/cuda/device_init.py
@@ -27,7 +27,6 @@ from .stubs import (
     local,
     const,
     atomic,
-    vote_sync_intrinsic,
     match_any_sync,
     match_all_sync,
     threadfence_block,
@@ -56,6 +55,10 @@ from .intrinsics import (
     shfl_up_sync,
     shfl_down_sync,
     shfl_xor_sync,
+    all_sync,
+    any_sync,
+    eq_sync,
+    ballot_sync,
 )
 from .cudadrv.error import CudaSupportError
 from numba.cuda.cudadrv.driver import (
@@ -79,12 +82,6 @@ from .api import *
 from .api import _auto_device
 from .args import In, Out, InOut
 
-from .intrinsic_wrapper import (
-    all_sync,
-    any_sync,
-    eq_sync,
-    ballot_sync,
-)
 
 from .kernels import reduction
 from numba.cuda.cudadrv.linkable_code import (
numba_cuda/numba/cuda/deviceufunc.py
@@ -682,12 +682,9 @@ class GUFuncEngine(object):
             inner_shapes.append(inner_shape)
 
         # solve output shape
-        oshapes = []
-        for outsig in self.sout:
-            oshape = []
-            for sym in outsig:
-                oshape.append(symbolmap[sym])
-            oshapes.append(tuple(oshape))
+        oshapes = [
+            tuple(map(symbolmap.__getitem__, outsig)) for outsig in self.sout
+        ]
 
         # find the biggest outershape as looping dimension
         sizes = [reduce(operator.mul, s, 1) for s in outer_shapes]
numba_cuda/numba/cuda/dispatcher.py
@@ -15,6 +15,8 @@ import uuid
 import re
 from warnings import warn
 
+from numba.cuda._compat import launch, LaunchConfig
+
 from numba.cuda.core import errors
 from numba.cuda import serialize, utils
 from numba import cuda
@@ -39,6 +41,7 @@ from numba.cuda.compiler import (
 from numba.cuda.core import sigutils, config, entrypoints
 from numba.cuda.flags import Flags
 from numba.cuda.cudadrv import driver, nvvm
+
 from numba.cuda.locks import module_init_lock
 from numba.cuda.core.caching import Cache, CacheImpl, NullCache
 from numba.cuda.descriptor import cuda_target
@@ -475,18 +478,15 @@ class _Kernel(serialize.ReduceMixin):
         for t, v in zip(self.argument_types, args):
             self._prepare_args(t, v, stream, retr, kernelargs)
 
-        stream_handle = driver._stream_handle(stream)
-
         # Invoke kernel
-        driver.launch_kernel(
-            cufunc.handle,
-            *griddim,
-            *blockdim,
-            sharedmem,
-            stream_handle,
-            kernelargs,
-            cooperative=self.cooperative,
+        config = LaunchConfig(
+            grid=griddim,
+            block=blockdim,
+            shmem_size=sharedmem,
+            cooperative_launch=self.cooperative,
         )
+        kernel = cufunc.kernel
+        launch(stream, config, kernel, *kernelargs)
 
         if self.debug:
             driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
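For orientation, a rough standalone sketch (hypothetical helper, not part of the diff) of the cuda.core-style launch path the kernel invocation above now follows, assuming numba.cuda._compat re-exports LaunchConfig and launch from cuda.core:

    from numba.cuda._compat import launch, LaunchConfig

    def launch_like_kernel(stream, kernel, griddim, blockdim, sharedmem,
                           kernelargs, cooperative=False):
        # Grid/block dimensions, dynamic shared memory and the cooperative-launch
        # flag all travel in a single LaunchConfig object ...
        config = LaunchConfig(
            grid=griddim,
            block=blockdim,
            shmem_size=sharedmem,
            cooperative_launch=cooperative,
        )
        # ... and the flattened kernel arguments are passed positionally to launch().
        launch(stream, config, kernel, *kernelargs)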
@@ -540,30 +540,26 @@
 
         if isinstance(ty, types.Array):
             devary = wrap_arg(val).to_device(retr, stream)
-            c_intp = ctypes.c_ssize_t
 
-            meminfo = ctypes.c_void_p(0)
-            parent = ctypes.c_void_p(0)
-            nitems = c_intp(devary.size)
-            itemsize = c_intp(devary.dtype.itemsize)
-
-            ptr = driver.device_pointer(devary)
-
-            ptr = int(ptr)
-
-            data = ctypes.c_void_p(ptr)
+            meminfo = 0
+            parent = 0
 
             kernelargs.append(meminfo)
             kernelargs.append(parent)
-            kernelargs.append(nitems)
-            kernelargs.append(itemsize)
-            kernelargs.append(data)
-            kernelargs.extend(map(c_intp, devary.shape))
-            kernelargs.extend(map(c_intp, devary.strides))
+
+            # non-pointer-arguments-without-ctypes might be dicey, since we're
+            # assuming shape, strides, size, and itemsize fit into intptr_t
+            # however, this saves a noticeable amount of overhead in kernel
+            # invocation
+            kernelargs.append(devary.size)
+            kernelargs.append(devary.dtype.itemsize)
+            kernelargs.append(devary.device_ctypes_pointer.value)
+            kernelargs.extend(devary.shape)
+            kernelargs.extend(devary.strides)
 
         elif isinstance(ty, types.CPointer):
             # Pointer arguments should be a pointer-sized integer
-            kernelargs.append(ctypes.c_uint64(val))
+            kernelargs.append(val)
 
         elif isinstance(ty, types.Integer):
             cval = getattr(ctypes, "c_%s" % ty)(val)
@@ -582,8 +578,7 @@
             kernelargs.append(cval)
 
         elif ty == types.boolean:
-            cval = ctypes.c_uint8(int(val))
-            kernelargs.append(cval)
+            kernelargs.append(val)
 
         elif ty == types.complex64:
             kernelargs.append(ctypes.c_float(val.real))
@@ -598,8 +593,7 @@
 
         elif isinstance(ty, types.Record):
             devrec = wrap_arg(val).to_device(retr, stream)
-            ptr = devrec.device_ctypes_pointer
-            kernelargs.append(ptr)
+            kernelargs.append(devrec.device_ctypes_pointer.value)
 
         elif isinstance(ty, types.BaseTuple):
             assert len(ty) == len(val)
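Illustrative sketch (hypothetical helper, not the real _prepare_args) of the flattened argument list the hunks above now build for a device-array argument: plain Python integers instead of ctypes wrappers, in the same member order as before.

    def flatten_array_arg(devary):
        args = [
            0,                                   # meminfo (NULL)
            0,                                   # parent (NULL)
            devary.size,                         # nitems
            devary.dtype.itemsize,               # itemsize
            devary.device_ctypes_pointer.value,  # data pointer as an int
        ]
        args.extend(devary.shape)    # one entry per dimension
        args.extend(devary.strides)  # one entry per dimension
        return args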
@@ -671,7 +665,7 @@ class _LaunchConfiguration:
         self.dispatcher = dispatcher
         self.griddim = griddim
         self.blockdim = blockdim
-        self.stream = stream
+        self.stream = driver._to_core_stream(stream)
         self.sharedmem = sharedmem
 
         if (
@@ -700,6 +694,16 @@ class _LaunchConfiguration:
             args, self.griddim, self.blockdim, self.stream, self.sharedmem
         )
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["stream"] = int(state["stream"].handle)
+        return state
+
+    def __setstate__(self, state):
+        handle = state.pop("stream")
+        self.__dict__.update(state)
+        self.stream = driver._to_core_stream(handle)
+
 
 class CUDACacheImpl(CacheImpl):
     def reduce(self, kernel):
@@ -854,7 +858,7 @@ class _DispatcherBase(_dispatcher.Dispatcher):
             for cres in overloads.values():
                 try:
                     targetctx.remove_user_function(cres.entry_point)
-                except KeyError:
+                except KeyError:  # noqa: PERF203
                     pass
 
         return finalizer
1622
1626
  def typeof_pyval(self, val):
1623
1627
  # Based on _DispatcherBase.typeof_pyval, but differs from it to support
1624
1628
  # the CUDA Array Interface.
1625
- try:
1626
- return typeof(val, Purpose.argument)
1627
- except ValueError:
1628
- if (
1629
- interface := getattr(val, "__cuda_array_interface__")
1630
- ) is not None:
1631
- # When typing, we don't need to synchronize on the array's
1632
- # stream - this is done when the kernel is launched.
1633
-
1634
- return typeof(
1635
- cuda.from_cuda_array_interface(interface, sync=False),
1636
- Purpose.argument,
1637
- )
1638
- else:
1639
- raise
1629
+ return typeof(val, Purpose.argument)
1640
1630
 
1641
1631
  def specialize(self, *args):
1642
1632
  """
@@ -2100,7 +2090,7 @@ class CUDADispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase):
2100
2090
  if file is None:
2101
2091
  file = sys.stdout
2102
2092
 
2103
- for _, defn in self.overloads.items():
2093
+ for defn in self.overloads.values():
2104
2094
  defn.inspect_types(file=file)
2105
2095
 
2106
2096
  @classmethod
@@ -6,7 +6,11 @@ from llvmlite import ir
6
6
  from numba import cuda
7
7
  from numba.cuda import types
8
8
  from numba.cuda import cgutils
9
- from numba.cuda.core.errors import RequireLiteralValue, TypingError
9
+ from numba.cuda.core.errors import (
10
+ RequireLiteralValue,
11
+ TypingError,
12
+ NumbaTypeError,
13
+ )
10
14
  from numba.cuda.typing import signature
11
15
  from numba.cuda.extending import overload_attribute, overload_method
12
16
  from numba.cuda import nvvmutils
@@ -380,3 +384,148 @@ def shfl_sync_intrinsic(
     sig = signature(a_type, membermask_type, a_type, b_type)
 
     return sig, codegen
+
+
+# -------------------------------------------------------------------------------
+# Warp vote functions
+#
+# References:
+#
+# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-vote-functions
+# - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html?highlight=data%2520movement#vote
+#
+# Notes:
+#
+# - The NVVM IR specification requires some of the mode parameter to be
+#   constants. It's therefore essential that we pass in mode values to the
+#   vote_sync_intrinsic.
+
+
+@intrinsic
+def all_sync(typingctx, mask_type, predicate_type):
+    """
+    If for all threads in the masked warp the predicate is true, then
+    a non-zero value is returned, otherwise 0 is returned.
+    """
+    mode_value = 0
+    sig, codegen_inner = vote_sync_intrinsic(
+        typingctx, mask_type, mode_value, predicate_type
+    )
+
+    def codegen(context, builder, sig_outer, args):
+        # Call vote_sync_intrinsic and extract the boolean result (index 1)
+        result_tuple = codegen_inner(context, builder, sig, args)
+        return builder.extract_value(result_tuple, 1)
+
+    sig_outer = signature(types.b1, mask_type, predicate_type)
+    return sig_outer, codegen
+
+
+@intrinsic
+def any_sync(typingctx, mask_type, predicate_type):
+    """
+    If for any thread in the masked warp the predicate is true, then
+    a non-zero value is returned, otherwise 0 is returned.
+    """
+    mode_value = 1
+    sig, codegen_inner = vote_sync_intrinsic(
+        typingctx, mask_type, mode_value, predicate_type
+    )
+
+    def codegen(context, builder, sig_outer, args):
+        result_tuple = codegen_inner(context, builder, sig, args)
+        return builder.extract_value(result_tuple, 1)
+
+    sig_outer = signature(types.b1, mask_type, predicate_type)
+    return sig_outer, codegen
+
+
+@intrinsic
+def eq_sync(typingctx, mask_type, predicate_type):
+    """
+    If for all threads in the masked warp the boolean predicate is the same,
+    then a non-zero value is returned, otherwise 0 is returned.
+    """
+    mode_value = 2
+    sig, codegen_inner = vote_sync_intrinsic(
+        typingctx, mask_type, mode_value, predicate_type
+    )
+
+    def codegen(context, builder, sig_outer, args):
+        result_tuple = codegen_inner(context, builder, sig, args)
+        return builder.extract_value(result_tuple, 1)
+
+    sig_outer = signature(types.b1, mask_type, predicate_type)
+    return sig_outer, codegen
+
+
+@intrinsic
+def ballot_sync(typingctx, mask_type, predicate_type):
+    """
+    Returns a mask of all threads in the warp whose predicate is true,
+    and are within the given mask.
+    """
+    mode_value = 3
+    sig, codegen_inner = vote_sync_intrinsic(
+        typingctx, mask_type, mode_value, predicate_type
+    )
+
+    def codegen(context, builder, sig_outer, args):
+        result_tuple = codegen_inner(context, builder, sig, args)
+        return builder.extract_value(
+            result_tuple, 0
+        )  # Extract ballot result (index 0)
+
+    sig_outer = signature(types.i4, mask_type, predicate_type)
+    return sig_outer, codegen
+
+
+def vote_sync_intrinsic(typingctx, mask_type, mode_value, predicate_type):
+    # Validate mode value
+    if mode_value not in (0, 1, 2, 3):
+        raise ValueError("Mode must be 0 (all), 1 (any), 2 (eq), or 3 (ballot)")
+
+    if types.unliteral(mask_type) not in types.integer_domain:
+        raise NumbaTypeError(f"Mask type must be an integer. Got {mask_type}")
+    predicate_types = types.integer_domain | {types.boolean}
+
+    if types.unliteral(predicate_type) not in predicate_types:
+        raise NumbaTypeError(
+            f"Predicate must be an integer or boolean. Got {predicate_type}"
+        )
+
+    def codegen(context, builder, sig, args):
+        mask, predicate = args
+
+        # Types
+        i1 = ir.IntType(1)
+        i32 = ir.IntType(32)
+
+        # NVVM intrinsic definition
+        arg_types = (i32, i32, i1)
+        vote_return_type = ir.LiteralStructType((i32, i1))
+        fnty = ir.FunctionType(vote_return_type, arg_types)
+
+        fname = "llvm.nvvm.vote.sync"
+        lmod = builder.module
+        vote_sync = cgutils.get_or_insert_function(lmod, fnty, fname)
+
+        # Intrinsic arguments
+        mode = ir.Constant(i32, mode_value)
+        mask_i32 = builder.trunc(mask, i32)
+
+        # Convert predicate to i1
+        if predicate.type != ir.IntType(1):
+            predicate_bool = builder.icmp_signed(
+                "!=", predicate, ir.Constant(predicate.type, 0)
+            )
+        else:
+            predicate_bool = predicate
+
+        return builder.call(vote_sync, [mask_i32, mode, predicate_bool])
+
+    sig = signature(
+        types.Tuple((types.i4, types.b1)), mask_type, predicate_type
+    )
+
+    return sig, codegen
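Illustrative device-side usage (not part of the diff): the vote intrinsics remain exposed as cuda.all_sync, cuda.any_sync, cuda.eq_sync and cuda.ballot_sync, as the device_init.py hunk above shows. A minimal sketch, assuming a single full warp:

    import numpy as np
    from numba import cuda

    @cuda.jit
    def vote_demo(out):
        i = cuda.grid(1)
        full_mask = 0xFFFFFFFF
        pred = (i % 2) == 0
        out[0] = cuda.all_sync(full_mask, pred)     # False: not every lane agrees
        out[1] = cuda.any_sync(full_mask, pred)     # True: at least one lane agrees
        out[2] = cuda.ballot_sync(full_mask, pred)  # bitmask of the agreeing lanes

    out = cuda.to_device(np.zeros(3, dtype=np.uint32))
    vote_demo[1, 32](out)
    print(out.copy_to_host())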
numba_cuda/numba/cuda/libdeviceimpl.py
@@ -69,8 +69,7 @@ def libdevice_implement_multiple_returns(func, retty, prototype_args):
         tuple_args = []
         if retty != types.void:
             tuple_args.append(ret)
-        for arg in virtual_args:
-            tuple_args.append(builder.load(arg))
+        tuple_args.extend(map(builder.load, virtual_args))
 
         if isinstance(nb_retty, types.UniTuple):
             return cgutils.pack_array(builder, tuple_args)