numba-cuda 0.0.19__py3-none-any.whl → 0.0.21__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
numba_cuda/VERSION CHANGED
@@ -1 +1 @@
-0.0.19
+0.0.21
numba_cuda/numba/cuda/codegen.py CHANGED
@@ -9,7 +9,6 @@ import os
 import subprocess
 import tempfile
 
-
 CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
 
 
@@ -181,17 +180,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
 
         return ltoir
 
-    def get_cubin(self, cc=None):
-        cc = self._ensure_cc(cc)
-
-        cubin = self._cubin_cache.get(cc, None)
-        if cubin:
-            return cubin
-
-        linker = driver.Linker.new(
-            max_registers=self._max_registers, cc=cc, lto=self._lto
-        )
-
+    def _link_all(self, linker, cc, ignore_nonlto=False):
         if linker.lto:
             ltoir = self.get_ltoir(cc=cc)
             linker.add_ltoir(ltoir)
@@ -200,11 +189,44 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
             linker.add_ptx(ptx.encode())
 
         for path in self._linking_files:
-            linker.add_file_guess_ext(path)
+            linker.add_file_guess_ext(path, ignore_nonlto)
         if self.needs_cudadevrt:
-            linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))
+            linker.add_file_guess_ext(
+                get_cudalib('cudadevrt', static=True), ignore_nonlto
+            )
+
+    def get_cubin(self, cc=None):
+        cc = self._ensure_cc(cc)
 
+        cubin = self._cubin_cache.get(cc, None)
+        if cubin:
+            return cubin
+
+        if self._lto and config.DUMP_ASSEMBLY:
+            linker = driver.Linker.new(
+                max_registers=self._max_registers,
+                cc=cc,
+                additional_flags=["-ptx"],
+                lto=self._lto
+            )
+            # `-ptx` flag is meant to view the optimized PTX for LTO objects.
+            # Non-LTO objects are not passed to linker.
+            self._link_all(linker, cc, ignore_nonlto=True)
+
+            ptx = linker.get_linked_ptx().decode('utf-8')
+
+            print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, '-'))
+            print(ptx)
+            print('=' * 80)
+
+        linker = driver.Linker.new(
+            max_registers=self._max_registers,
+            cc=cc,
+            lto=self._lto
+        )
+        self._link_all(linker, cc, ignore_nonlto=False)
         cubin = linker.complete()
+
         self._cubin_cache[cc] = cubin
         self._linkerinfo_cache[cc] = linker.info_log
 
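Taken together, `_link_all` and the reworked `get_cubin` perform two links when LTO and assembly dumping are both enabled: a throwaway pass over only the LTO-able inputs with the `-ptx` flag to print the post-LTO PTX, then an ordinary pass over all inputs to produce the cubin. A hedged usage sketch of how this surfaces to users (the `.ltoir` filename is a placeholder, and `lto=True` assumes the pynvjitlink-backed linker is in use):

```python
# Sketch: observing the post-LTO PTX dump added in this version.
# NUMBA_DUMP_ASSEMBLY must be set before numba is imported.
import os
os.environ["NUMBA_DUMP_ASSEMBLY"] = "1"

from numba import cuda

add_from_numba = cuda.declare_device("add_from_numba",
                                     "uint32(uint32, uint32)")

@cuda.jit(link=["test_device_functions.ltoir"], lto=True)
def kernel(result):
    result[0] = add_from_numba(1, 2)

result = cuda.device_array(1)
kernel[1, 1](result)  # stdout includes an "ASSEMBLY (AFTER LTO) ..." section
```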
numba_cuda/numba/cuda/compiler.py CHANGED
@@ -1,6 +1,7 @@
 from llvmlite import ir
 from numba.core.typing.templates import ConcreteTemplate
-from numba.core import types, typing, funcdesc, config, compiler, sigutils
+from numba.core import (cgutils, types, typing, funcdesc, config, compiler,
+                        sigutils, utils)
 from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
                                  DefaultPassBuilder, Flags, Option,
                                  CompileResult)
@@ -11,7 +12,10 @@ from numba.core.errors import NumbaInvalidConfigWarning
 from numba.core.typed_passes import (IRLegalization, NativeLowering,
                                      AnnotateTypes)
 from warnings import warn
+from numba.cuda import nvvmutils
 from numba.cuda.api import get_current_device
+from numba.cuda.cudadrv import nvvm
+from numba.cuda.descriptor import cuda_target
 from numba.cuda.target import CUDACABICallConv
 
 
@@ -24,6 +28,15 @@ def _nvvm_options_type(x):
     return x
 
 
+def _optional_int_type(x):
+    if x is None:
+        return None
+
+    else:
+        assert isinstance(x, int)
+        return x
+
+
 class CUDAFlags(Flags):
     nvvm_options = Option(
         type=_nvvm_options_type,
@@ -35,6 +48,16 @@ class CUDAFlags(Flags):
         default=None,
         doc="Compute Capability",
     )
+    max_registers = Option(
+        type=_optional_int_type,
+        default=None,
+        doc="Max registers"
+    )
+    lto = Option(
+        type=bool,
+        default=False,
+        doc="Enable Link-time Optimization"
+    )
 
 
 # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
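The two new flags carry values given at the `@cuda.jit` call site through the compiler pipeline (see the `CreateLibrary` and `compile_cuda` hunks below). A hedged sketch of the decorator-level options they correspond to; `lto=True` assumes the pynvjitlink-backed linker:

```python
# Sketch of the decorator options that populate CUDAFlags.max_registers
# and CUDAFlags.lto.
from numba import cuda

@cuda.jit(max_registers=32, lto=True)
def saxpy(a, x, y, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = a * x[i] + y[i]
```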
@@ -109,7 +132,11 @@ class CreateLibrary(LoweringPass):
         codegen = state.targetctx.codegen()
         name = state.func_id.func_qualname
         nvvm_options = state.flags.nvvm_options
-        state.library = codegen.create_library(name, nvvm_options=nvvm_options)
+        max_registers = state.flags.max_registers
+        lto = state.flags.lto
+        state.library = codegen.create_library(name, nvvm_options=nvvm_options,
+                                               max_registers=max_registers,
+                                               lto=lto)
         # Enable object caching upfront so that the library can be serialized.
         state.library.enable_object_caching()
 
@@ -152,7 +179,7 @@ class CUDACompiler(CompilerBase):
 @global_compiler_lock
 def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
                  inline=False, fastmath=False, nvvm_options=None,
-                 cc=None):
+                 cc=None, max_registers=None, lto=False):
     if cc is None:
         raise ValueError('Compute Capability must be supplied')
 
@@ -189,6 +216,8 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
     if nvvm_options:
         flags.nvvm_options = nvvm_options
     flags.compute_capability = cc
+    flags.max_registers = max_registers
+    flags.lto = lto
 
     # Run compilation pipeline
     from numba.core.target_extension import target_override
@@ -247,11 +276,155 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
             builder, func, restype, argtypes, callargs)
         builder.ret(return_value)
 
+    if config.DUMP_LLVM:
+        utils.dump_llvm(fndesc, wrapper_module)
+
     library.add_ir_module(wrapper_module)
     library.finalize()
     return library
 
 
+def kernel_fixup(kernel, debug):
+    if debug:
+        exc_helper = add_exception_store_helper(kernel)
+
+    # Pass 1 - replace:
+    #
+    #     ret <value>
+    #
+    # with:
+    #
+    #     exc_helper(<value>)
+    #     ret void
+
+    for block in kernel.blocks:
+        for i, inst in enumerate(block.instructions):
+            if isinstance(inst, ir.Ret):
+                old_ret = block.instructions.pop()
+                block.terminator = None
+
+                # The original return's metadata will be set on the new
+                # instructions in order to preserve debug info
+                metadata = old_ret.metadata
+
+                builder = ir.IRBuilder(block)
+                if debug:
+                    status_code = old_ret.operands[0]
+                    exc_helper_call = builder.call(exc_helper, (status_code,))
+                    exc_helper_call.metadata = metadata
+
+                new_ret = builder.ret_void()
+                new_ret.metadata = old_ret.metadata
+
+                # Need to break out so we don't carry on modifying what we are
+                # iterating over. There can only be one return in a block
+                # anyway.
+                break
+
+    # Pass 2: remove stores of null pointer to return value argument pointer
+
+    return_value = kernel.args[0]
+
+    for block in kernel.blocks:
+        remove_list = []
+
+        # Find all stores first
+        for inst in block.instructions:
+            if (isinstance(inst, ir.StoreInstr)
+                    and inst.operands[1] == return_value):
+                remove_list.append(inst)
+
+        # Remove all stores
+        for to_remove in remove_list:
+            block.instructions.remove(to_remove)
+
+    # Replace non-void return type with void return type and remove return
+    # value
+
+    if isinstance(kernel.type, ir.PointerType):
+        new_type = ir.PointerType(ir.FunctionType(ir.VoidType(),
+                                                  kernel.type.pointee.args[1:]))
+    else:
+        new_type = ir.FunctionType(ir.VoidType(), kernel.type.args[1:])
+
+    kernel.type = new_type
+    kernel.return_value = ir.ReturnValue(kernel, ir.VoidType())
+    kernel.args = kernel.args[1:]
+
+    # Mark as a kernel for NVVM
+
+    nvvm.set_cuda_kernel(kernel)
+
+    if config.DUMP_LLVM:
+        print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, '-'))
+        print(kernel.module)
+        print('=' * 80)
+
+
+def add_exception_store_helper(kernel):
+
+    # Create global variables for exception state
+
+    def define_error_gv(postfix):
+        name = kernel.name + postfix
+        gv = cgutils.add_global_variable(kernel.module, ir.IntType(32),
+                                         name)
+        gv.initializer = ir.Constant(gv.type.pointee, None)
+        return gv
+
+    gv_exc = define_error_gv("__errcode__")
+    gv_tid = []
+    gv_ctaid = []
+    for i in 'xyz':
+        gv_tid.append(define_error_gv("__tid%s__" % i))
+        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))
+
+    # Create exception store helper function
+
+    helper_name = kernel.name + "__exc_helper__"
+    helper_type = ir.FunctionType(ir.VoidType(), (ir.IntType(32),))
+    helper_func = ir.Function(kernel.module, helper_type, helper_name)
+
+    block = helper_func.append_basic_block(name="entry")
+    builder = ir.IRBuilder(block)
+
+    # Implement status check / exception store logic
+
+    status_code = helper_func.args[0]
+    call_conv = cuda_target.target_context.call_conv
+    status = call_conv._get_return_status(builder, status_code)
+
+    # Check error status
+    with cgutils.if_likely(builder, status.is_ok):
+        builder.ret_void()
+
+    with builder.if_then(builder.not_(status.is_python_exc)):
+        # User exception raised
+        old = ir.Constant(gv_exc.type.pointee, None)
+
+        # Use atomic cmpxchg to prevent rewriting the error status
+        # Only the first error is recorded
+
+        xchg = builder.cmpxchg(gv_exc, old, status.code,
+                               'monotonic', 'monotonic')
+        changed = builder.extract_value(xchg, 1)
+
+        # If the xchange is successful, save the thread ID.
+        sreg = nvvmutils.SRegBuilder(builder)
+        with builder.if_then(changed):
+            for dim, ptr, in zip("xyz", gv_tid):
+                val = sreg.tid(dim)
+                builder.store(val, ptr)
+
+            for dim, ptr, in zip("xyz", gv_ctaid):
+                val = sreg.ctaid(dim)
+                builder.store(val, ptr)
+
+    builder.ret_void()
+
+    return helper_func
+
+
 @global_compiler_lock
 def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
             fastmath=False, cc=None, opt=None, abi="c", abi_info=None,
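`kernel_fixup` replaces the old wrapper-generation approach: rather than emitting a separate `cuda.kernel.wrapper` function around the device function (see the test changes below), it rewrites the compiled function in place to return void, drops the status-pointer argument, and, in debug mode, routes the status code through the exception helper. A minimal, self-contained llvmlite sketch of the terminator rewrite at the heart of pass 1 (not the package's code; the module and function names are invented for illustration):

```python
# Minimal llvmlite sketch of pass 1: pop a value-returning terminator and
# re-terminate the block with `ret void`.
from llvmlite import ir

mod = ir.Module(name="sketch")
fnty = ir.FunctionType(ir.IntType(32), ())
fn = ir.Function(mod, fnty, name="kernel_like")
block = fn.append_basic_block(name="entry")
builder = ir.IRBuilder(block)
builder.ret(ir.Constant(ir.IntType(32), 0))   # ret i32 0

old_ret = block.instructions.pop()   # remove `ret i32 0`
block.terminator = None
ir.IRBuilder(block).ret_void()       # append `ret void` instead

print(mod)  # the full fixup would also rewrite fn.type to return void
```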
@@ -347,13 +520,10 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
         lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
                                  nvvm_options)
     else:
-        code = pyfunc.__code__
-        filename = code.co_filename
-        linenum = code.co_firstlineno
-
-        lib, kernel = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
-                                              lineinfo, nvvm_options, filename,
-                                              linenum)
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, debug)
 
     if lto:
         code = lib.get_ltoir(cc=cc)
numba_cuda/numba/cuda/cuda_paths.py CHANGED
@@ -310,7 +310,9 @@ def get_conda_include_dir():
     # though usually it shouldn't.
     include_dir = os.path.join(sys.prefix, 'include')
 
-    if os.path.exists(include_dir):
+    if (os.path.exists(include_dir) and os.path.isdir(include_dir)
+            and os.path.exists(os.path.join(include_dir,
+                                            'cuda_device_runtime_api.h'))):
         return include_dir
     return
 
numba_cuda/numba/cuda/cudadrv/driver.py CHANGED
@@ -21,6 +21,9 @@ import threading
 import traceback
 import asyncio
 import pathlib
+import subprocess
+import tempfile
+import re
 from itertools import product
 from abc import ABCMeta, abstractmethod
 from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof,
@@ -36,7 +39,7 @@ from .error import CudaSupportError, CudaDriverError
 from .drvapi import API_PROTOTYPES
 from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
 from .mappings import FILE_EXTENSION_MAP
-from .linkable_code import LinkableCode
+from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
 from numba.cuda.cudadrv import enums, drvapi, nvrtc
 
 USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING
@@ -2683,12 +2686,18 @@ class Linker(metaclass=ABCMeta):
             cu = f.read()
         self.add_cu(cu, os.path.basename(path))
 
-    def add_file_guess_ext(self, path_or_code):
+    def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
         """
        Add a file or LinkableCode object to the link. If a file is
        passed, the type will be inferred from the extension. A LinkableCode
        object represents a file already in memory.
+
+        When `ignore_nonlto` is set to true, do not add code that will not
+        be LTO-ed in the linking process. This is useful in inspecting the
+        LTO-ed portion of the PTX when linker is added with objects that can be
+        both LTO-ed and not LTO-ed.
         """
+
         if isinstance(path_or_code, str):
             ext = pathlib.Path(path_or_code).suffix
             if ext == '':
@@ -2704,6 +2713,26 @@ class Linker(metaclass=ABCMeta):
                     "Don't know how to link file with extension "
                     f"{ext}"
                 )
+
+            if ignore_nonlto:
+                warn_and_return = False
+                if kind in (
+                    FILE_EXTENSION_MAP["fatbin"], FILE_EXTENSION_MAP["o"]
+                ):
+                    entry_types = inspect_obj_content(path_or_code)
+                    if "nvvm" not in entry_types:
+                        warn_and_return = True
+                elif kind != FILE_EXTENSION_MAP["ltoir"]:
+                    warn_and_return = True
+
+                if warn_and_return:
+                    warnings.warn(
+                        f"Not adding {path_or_code} as it is not "
+                        "optimizable at link time, and `ignore_nonlto == "
+                        "True`."
+                    )
+                    return
+
             self.add_file(path_or_code, kind)
             return
         else:
@@ -2716,6 +2745,25 @@ class Linker(metaclass=ABCMeta):
             if path_or_code.kind == "cu":
                 self.add_cu(path_or_code.data, path_or_code.name)
             else:
+                if ignore_nonlto:
+                    warn_and_return = False
+                    if isinstance(path_or_code, (Fatbin, Object)):
+                        with tempfile.NamedTemporaryFile("w") as fp:
+                            fp.write(path_or_code.data)
+                            entry_types = inspect_obj_content(fp.name)
+                        if "nvvm" not in entry_types:
+                            warn_and_return = True
+                    elif not isinstance(path_or_code, LTOIR):
+                        warn_and_return = True
+
+                    if warn_and_return:
+                        warnings.warn(
+                            f"Not adding {path_or_code.name} as it is not "
+                            "optimizable at link time, and `ignore_nonlto == "
+                            "True`."
+                        )
+                        return
+
                 self.add_data(
                     path_or_code.data, path_or_code.kind, path_or_code.name
                 )
@@ -3065,6 +3113,28 @@ class PyNvJitLinker(Linker):
         name = pathlib.Path(path).name
         self.add_data(data, kind, name)
 
+    def add_cu(self, cu, name):
+        """Add CUDA source in a string to the link. The name of the source
+        file should be specified in `name`."""
+        with driver.get_active_context() as ac:
+            dev = driver.get_device(ac.devnum)
+            cc = dev.compute_capability
+
+        program, log = nvrtc.compile(cu, name, cc, ltoir=self.lto)
+
+        if not self.lto and config.DUMP_ASSEMBLY:
+            print(("ASSEMBLY %s" % name).center(80, "-"))
+            print(program)
+            print("=" * 80)
+
+        suffix = ".ltoir" if self.lto else ".ptx"
+        program_name = os.path.splitext(name)[0] + suffix
+        # Link the program's PTX or LTOIR using the normal linker mechanism
+        if self.lto:
+            self.add_ltoir(program, program_name)
+        else:
+            self.add_ptx(program.encode(), program_name)
+
     def add_data(self, data, kind, name):
         if kind == FILE_EXTENSION_MAP["cubin"]:
             fn = self._linker.add_cubin
@@ -3086,6 +3156,12 @@ class PyNvJitLinker(Linker):
         except NvJitLinkError as e:
             raise LinkerError from e
 
+    def get_linked_ptx(self):
+        try:
+            return self._linker.get_linked_ptx()
+        except NvJitLinkError as e:
+            raise LinkerError from e
+
     def complete(self):
         try:
             return self._linker.get_linked_cubin()
@@ -3361,3 +3437,28 @@ def get_version():
     Return the driver version as a tuple of (major, minor)
     """
     return driver.get_version()
+
+
+def inspect_obj_content(objpath: str):
+    """
+    Given path to a fatbin or object, use `cuobjdump` to examine its content
+    Return the set of entries in the object.
+    """
+    code_types :set[str] = set()
+
+    try:
+        out = subprocess.run(["cuobjdump", objpath], check=True,
+                             capture_output=True)
+    except FileNotFoundError as e:
+        msg = ("cuobjdump has not been found. You may need "
+               "to install the CUDA toolkit and ensure that "
+               "it is available on your PATH.\n")
+        raise RuntimeError(msg) from e
+
+    objtable = out.stdout.decode('utf-8')
+    entry_pattern = r"Fatbin (.*) code"
+    for line in objtable.split("\n"):
+        if match := re.match(entry_pattern, line):
+            code_types.add(match.group(1))
+
+    return code_types
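`inspect_obj_content` shells out to `cuobjdump` and collects the kinds of code sections it reports; the linker hunks above treat an input as LTO-able only if an `nvvm` entry is present. A hedged sketch of the same parsing applied to a canned string shaped like `cuobjdump` output (the real tool's exact formatting may differ):

```python
# Parse "Fatbin <kind> code" headings the way inspect_obj_content does,
# but from a made-up sample instead of a real `cuobjdump` invocation.
import re

sample = """\
Fatbin elf code:
================
arch = sm_75

Fatbin nvvm code:
================
arch = compute_75
"""

code_types = set()
for line in sample.split("\n"):
    if match := re.match(r"Fatbin (.*) code", line):
        code_types.add(match.group(1))

print(code_types)  # {'elf', 'nvvm'}; "nvvm" marks LTO-able content
```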
numba_cuda/numba/cuda/cudadrv/enums.py CHANGED
@@ -55,7 +55,7 @@ CUDA_ERROR_INVALID_HANDLE = 400
 CUDA_ERROR_ILLEGAL_STATE = 401
 CUDA_ERROR_NOT_FOUND = 500
 CUDA_ERROR_NOT_READY = 600
-CUDA_ERROR_LAUNCH_FAILED = 700
+CUDA_ERROR_ILLEGAL_ADDRESS = 700
 CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
 CUDA_ERROR_LAUNCH_TIMEOUT = 702
 CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
numba_cuda/numba/cuda/cudadrv/nvrtc.py CHANGED
@@ -61,6 +61,14 @@ class NVRTC:
    NVVM interface. Initialization is protected by a lock and uses the standard
    (for Numba) open_cudalib function to load the NVRTC library.
    """
+
+    _CU12ONLY_PROTOTYPES = {
+        # nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet);
+        "nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
+        # nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto);
+        "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p)
+    }
+
     _PROTOTYPES = {
         # nvrtcResult nvrtcVersion(int *major, int *minor)
         'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)),
@@ -110,6 +118,10 @@ class NVRTC:
             cls.__INSTANCE = None
             raise NvrtcSupportError("NVRTC cannot be loaded") from e
 
+        from numba.cuda.cudadrv.runtime import get_version
+        if get_version() >= (12, 0):
+            inst._PROTOTYPES |= inst._CU12ONLY_PROTOTYPES
+
         # Find & populate functions
         for name, proto in inst._PROTOTYPES.items():
             func = getattr(lib, name)
@@ -208,10 +220,22 @@ class NVRTC:
 
         return ptx.value.decode()
 
+    def get_lto(self, program):
+        """
+        Get the compiled LTOIR as a Python bytes object.
+        """
+        lto_size = c_size_t()
+        self.nvrtcGetLTOIRSize(program.handle, byref(lto_size))
+
+        lto = b" " * lto_size.value
+        self.nvrtcGetLTOIR(program.handle, lto)
+
+        return lto
 
-def compile(src, name, cc):
+
+def compile(src, name, cc, ltoir=False):
     """
-    Compile a CUDA C/C++ source to PTX for a given compute capability.
+    Compile a CUDA C/C++ source to PTX or LTOIR for a given compute capability.
 
     :param src: The source code to compile
     :type src: str
@@ -219,6 +243,8 @@ def compile(src, name, cc):
     :type name: str
     :param cc: A tuple ``(major, minor)`` of the compute capability
     :type cc: tuple
+    :param ltoir: Compile into LTOIR if True, otherwise into PTX
+    :type ltoir: bool
     :return: The compiled PTX and compilation log
     :rtype: tuple
     """
@@ -242,6 +268,9 @@ def compile(src, name, cc):
     numba_include = f'-I{numba_cuda_path}'
     options = [arch, *cuda_include, numba_include, '-rdc', 'true']
 
+    if ltoir:
+        options.append("-dlto")
+
     if nvrtc.get_version() < (12, 0):
         options += ["-std=c++17"]
 
@@ -261,5 +290,9 @@ def compile(src, name, cc):
         msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}")
         warnings.warn(msg)
 
-    ptx = nvrtc.get_ptx(program)
-    return ptx, log
+    if ltoir:
+        ltoir = nvrtc.get_lto(program)
+        return ltoir, log
+    else:
+        ptx = nvrtc.get_ptx(program)
+        return ptx, log
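With the new `ltoir` flag, the same entry point yields either PTX (a `str`, from `get_ptx`) or LTOIR (`bytes`, from `get_lto`). A hedged usage sketch; the source string and compute capability are arbitrary, and the LTOIR path assumes a CUDA 12 NVRTC since `nvrtcGetLTOIR` is registered only for version 12 and later:

```python
# Sketch: compiling one source string to PTX and to LTOIR.
from numba.cuda.cudadrv import nvrtc

src = 'extern "C" __device__ int one(int *out) { *out = 1; return 0; }'

ptx, log = nvrtc.compile(src, "one.cu", (7, 5))                # PTX, str
ltoir, log = nvrtc.compile(src, "one.cu", (7, 5), ltoir=True)  # LTOIR, bytes
```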
numba_cuda/numba/cuda/dispatcher.py CHANGED
@@ -14,7 +14,7 @@ from numba.core.typing.typeof import Purpose, typeof
 
 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
-from numba.cuda.compiler import compile_cuda, CUDACompiler
+from numba.cuda.compiler import compile_cuda, CUDACompiler, kernel_fixup
 from numba.cuda.cudadrv import driver
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.descriptor import cuda_target
@@ -102,15 +102,14 @@ class _Kernel(serialize.ReduceMixin):
                                  inline=inline,
                                  fastmath=fastmath,
                                  nvvm_options=nvvm_options,
-                                 cc=cc)
+                                 cc=cc,
+                                 max_registers=max_registers,
+                                 lto=lto)
         tgt_ctx = cres.target_context
-        code = self.py_func.__code__
-        filename = code.co_filename
-        linenum = code.co_firstlineno
-        lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
-                                                  debug, lineinfo, nvvm_options,
-                                                  filename, linenum,
-                                                  max_registers, lto)
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, self.debug)
 
         if not link:
             link = []
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py CHANGED
@@ -5,6 +5,10 @@ from numba.cuda.cudadrv.driver import PyNvJitLinker
 
 import itertools
 import os
+import io
+import contextlib
+import warnings
+
 from numba.cuda import get_current_device
 from numba import cuda
 from numba import config
@@ -23,6 +27,9 @@ if TEST_BIN_DIR:
     test_device_functions_fatbin = os.path.join(
         TEST_BIN_DIR, "test_device_functions.fatbin"
     )
+    test_device_functions_fatbin_multi = os.path.join(
+        TEST_BIN_DIR, "test_device_functions_multi.fatbin"
+    )
     test_device_functions_o = os.path.join(
         TEST_BIN_DIR, "test_device_functions.o"
     )
@@ -156,32 +163,81 @@ class TestLinker(CUDATestCase):
             test_device_functions_o,
             test_device_functions_ptx,
         )
+        for lto in [True, False]:
+            for file in files:
+                with self.subTest(file=file):
+                    sig = "uint32(uint32, uint32)"
+                    add_from_numba = cuda.declare_device("add_from_numba", sig)
+
+                    @cuda.jit(link=[file], lto=lto)
+                    def kernel(result):
+                        result[0] = add_from_numba(1, 2)
+
+                    result = cuda.device_array(1)
+                    kernel[1, 1](result)
+                    assert result[0] == 3
+
+    def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self):
+        files = [
+            test_device_functions_cu,
+            test_device_functions_ltoir,
+            test_device_functions_fatbin_multi
+        ]
+
+        config.DUMP_ASSEMBLY = True
+
         for file in files:
             with self.subTest(file=file):
-                sig = "uint32(uint32, uint32)"
-                add_from_numba = cuda.declare_device("add_from_numba", sig)
+                f = io.StringIO()
+                with contextlib.redirect_stdout(f):
+                    sig = "uint32(uint32, uint32)"
+                    add_from_numba = cuda.declare_device("add_from_numba", sig)
 
-                @cuda.jit(link=[file])
-                def kernel(result):
-                    result[0] = add_from_numba(1, 2)
+                    @cuda.jit(link=[file], lto=True)
+                    def kernel(result):
+                        result[0] = add_from_numba(1, 2)
 
-                result = cuda.device_array(1)
-                kernel[1, 1](result)
-                assert result[0] == 3
+                    result = cuda.device_array(1)
+                    kernel[1, 1](result)
+                    assert result[0] == 3
 
-    def test_nvjitlink_jit_with_linkable_code_lto(self):
-        file = test_device_functions_ltoir
+                self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue())
 
-        sig = "uint32(uint32, uint32)"
-        add_from_numba = cuda.declare_device("add_from_numba", sig)
+        config.DUMP_ASSEMBLY = False
 
-        @cuda.jit(link=[file], lto=True)
-        def kernel(result):
-            result[0] = add_from_numba(1, 2)
+    def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self):
+        files = [
+            test_device_functions_a,
+            test_device_functions_cubin,
+            test_device_functions_fatbin,
+            test_device_functions_o,
+            test_device_functions_ptx,
+        ]
 
-        result = cuda.device_array(1)
-        kernel[1, 1](result)
-        assert result[0] == 3
+        config.DUMP_ASSEMBLY = True
+
+        for file in files:
+            with self.subTest(file=file):
+                with warnings.catch_warnings(record=True) as w:
+                    with contextlib.redirect_stdout(None):  # suppress other PTX
+                        sig = "uint32(uint32, uint32)"
+                        add_from_numba = cuda.declare_device(
+                            "add_from_numba", sig
+                        )
+
+                        @cuda.jit(link=[file], lto=True)
+                        def kernel(result):
+                            result[0] = add_from_numba(1, 2)
+
+                        result = cuda.device_array(1)
+                        kernel[1, 1](result)
+                        assert result[0] == 3
+
+                assert len(w) == 1
+                self.assertIn("it is not optimizable at link time, and "
+                              "`ignore_nonlto == True`", str(w[0].message))
+
+        config.DUMP_ASSEMBLY = False
 
     def test_nvjitlink_jit_with_invalid_linkable_code(self):
         with open(test_device_functions_cubin, "rb") as f:
numba_cuda/numba/cuda/tests/cudapy/test_debug.py CHANGED
@@ -48,13 +48,11 @@ class TestDebugOutput(CUDATestCase):
         self.assertRaises(AssertionError, check_meth, out)
 
     def _check_dump_bytecode(self, out):
-        if PYVERSION in ((3, 11), (3, 12)):
+        if PYVERSION > (3, 10):
             # binop with arg=0 is binary add, see CPython dis.py and opcode.py
             self.assertIn('BINARY_OP(arg=0', out)
-        elif PYVERSION in ((3, 9), (3, 10)):
-            self.assertIn('BINARY_ADD', out)
         else:
-            raise NotImplementedError(PYVERSION)
+            self.assertIn('BINARY_ADD', out)
 
     def _check_dump_cfg(self, out):
         self.assertIn('CFG dominators', out)
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py CHANGED
@@ -72,6 +72,7 @@ class TestCudaDebugInfo(CUDATestCase):
         def f(x):
             x[0] = 0
 
+    @unittest.skip("Wrappers no longer exist")
     def test_wrapper_has_debuginfo(self):
         sig = (types.int32[::1],)
 
numba_cuda/numba/cuda/tests/cudapy/test_inspect.py CHANGED
@@ -33,10 +33,7 @@ class TestInspect(CUDATestCase):
         self.assertIn("foo", llvm)
 
         # Kernel in LLVM
-        self.assertIn('cuda.kernel.wrapper', llvm)
-
-        # Wrapped device function body in LLVM
-        self.assertIn("define linkonce_odr i32", llvm)
+        self.assertIn("define void @", llvm)
 
         asm = foo.inspect_asm(sig)
 
@@ -72,12 +69,8 @@ class TestInspect(CUDATestCase):
         self.assertIn("foo", llvmirs[float64, float64])
 
         # Kernels in LLVM
-        self.assertIn('cuda.kernel.wrapper', llvmirs[intp, intp])
-        self.assertIn('cuda.kernel.wrapper', llvmirs[float64, float64])
-
-        # Wrapped device function bodies in LLVM
-        self.assertIn("define linkonce_odr i32", llvmirs[intp, intp])
-        self.assertIn("define linkonce_odr i32", llvmirs[float64, float64])
+        self.assertIn("define void @", llvmirs[intp, intp])
+        self.assertIn("define void @", llvmirs[float64, float64])
 
         asmdict = foo.inspect_asm()
 
numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py CHANGED
@@ -170,10 +170,9 @@ class TestCudaLineInfo(CUDATestCase):
                 subprograms += 1
 
         # One DISubprogram for each of:
-        # - The kernel wrapper
         # - The caller
         # - The callee
-        expected_subprograms = 3
+        expected_subprograms = 2
 
         self.assertEqual(subprograms, expected_subprograms,
                          f'"Expected {expected_subprograms} DISubprograms; '
numba_cuda/numba/cuda/tests/cudapy/test_optimization.py CHANGED
@@ -14,8 +14,11 @@ def device_func(x, y, z):
 
 
 # Fragments of code that are removed from kernel_func's PTX when optimization
-# is on
-removed_by_opt = ( '__local_depot0', 'call.uni', 'st.param.b64')
+# is on. Previously this list was longer when kernel wrappers were used - if
+# the test function were more complex it may be possible to isolate additional
+# fragments of PTX we could check for the absence / presence of, but removal of
+# the use of local memory is a good indicator that optimization was applied.
+removed_by_opt = ( '__local_depot0',)
 
 
 @skip_on_cudasim('Simulator does not optimize code')
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py CHANGED
@@ -1,6 +1,6 @@
 from numba.cuda.testing import (skip_on_cudasim, skip_unless_cudasim, unittest,
                                 CUDATestCase)
-from numba import cuda
+from numba import config, cuda
 
 # Basic tests that stream APIs execute on the hardware and in the simulator.
 #
@@ -34,7 +34,11 @@ class TestStreamAPI(CUDATestCase):
         # We don't test synchronization on the stream because it's not a real
         # stream - we used a dummy pointer for testing the API, so we just
         # ensure that the stream handle matches the external stream pointer.
-        self.assertEqual(ptr, s.handle.value)
+        if config.CUDA_USE_NVIDIA_BINDING:
+            value = int(s.handle)
+        else:
+            value = s.handle.value
+        self.assertEqual(ptr, value)
 
     @skip_unless_cudasim("External streams are usable with hardware")
     def test_external_stream_simulator_unavailable(self):
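The two driver bindings represent stream handles differently, which is what the branch above accommodates. A hedged sketch of the distinction, where `stream` stands for any Numba CUDA stream object:

```python
# With the NVIDIA binding the handle is a cuda-python object that
# supports int(); with the ctypes binding it is a c_void_p with .value.
from numba import config

def handle_as_int(stream):
    if config.CUDA_USE_NVIDIA_BINDING:
        return int(stream.handle)
    return stream.handle.value
```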
numba_cuda/numba/cuda/tests/test_binary_generation/Makefile CHANGED
@@ -14,9 +14,14 @@ endif
 # Gencode flags suitable for most tests
 GENCODE := -gencode arch=compute_$(GPU_CC),code=sm_$(GPU_CC)
 
+MULTI_GENCODE := -gencode arch=compute_$(GPU_CC),code=[sm_$(GPU_CC),lto_$(GPU_CC)]
+
 # Fatbin tests need to generate code for an additional compute capability
 FATBIN_GENCODE := $(GENCODE) -gencode arch=compute_$(ALT_CC),code=sm_$(ALT_CC)
 
+# Fatbin that contains both LTO, SASS for multiple architectures
+MULTI_FATBIN_GENCODE := $(MULTI_GENCODE) -gencode arch=compute_$(ALT_CC),code=[sm_$(ALT_CC),lto_$(ALT_CC)]
+
 # LTO-IR tests need to generate for the LTO "architecture" instead
 LTOIR_GENCODE := -gencode arch=lto_$(GPU_CC),code=lto_$(GPU_CC)
 
@@ -30,6 +35,7 @@ PTX_FLAGS := $(GENCODE) -ptx
 OBJECT_FLAGS := $(GENCODE) -dc
 LIBRARY_FLAGS := $(GENCODE) -lib
 FATBIN_FLAGS := $(FATBIN_GENCODE) --fatbin
+MULTI_FATBIN_FLAGS := $(MULTI_FATBIN_GENCODE) --fatbin
 LTOIR_FLAGS := $(LTOIR_GENCODE) -dc
 
 OUTPUT_DIR := ./
@@ -41,6 +47,7 @@ all:
 	nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/undefined_extern.cubin undefined_extern.cu
 	nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.cubin test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.fatbin test_device_functions.cu
+	nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions_multi.fatbin test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ptx test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu
numba_cuda-0.0.19.dist-info/METADATA → numba_cuda-0.0.21.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: numba-cuda
-Version: 0.0.19
+Version: 0.0.21
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause
@@ -13,17 +13,21 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numba>=0.59.1
 
+<div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
+
 # Numba CUDA Target
 
-An out-of-tree CUDA target for Numba.
+The CUDA target for Numba. Please visit the [official
+documentation](https://nvidia.github.io/numba-cuda) to get started!
+
 
-This contains an entire copy of Numba's CUDA target (the `numba.cuda` module),
-and a mechanism to ensure the code from this module (`numba_cuda.numba.cuda`) is
-used as the `numba.cuda` module instead of the code from the `numba` package.
+To report issues or file feature requests, please use the [issue
+tracker](https://github.com/NVIDIA/numba-cuda/issues).
 
-This is presently in an early state and is published for testing and feedback.
+To raise questions or initiate discussions, please use the [Numba Discourse
+forum](https://numba.discourse.group).
 
-## Building / testing
+## Building from source
 
 Install as an editable install:
 
@@ -31,7 +35,7 @@ Install as an editable install:
 pip install -e .
 ```
 
-Running tests:
+## Running tests
 
 ```
 python -m numba.runtests numba.cuda.tests
numba_cuda-0.0.19.dist-info/RECORD → numba_cuda-0.0.21.dist-info/RECORD RENAMED
@@ -1,6 +1,6 @@
 _numba_cuda_redirector.pth,sha256=cmfMMmV0JPh3yEpl4bGeM9AuXiVVMSo6Z_b7RaQL3XE,30
 _numba_cuda_redirector.py,sha256=rc56rnb40w3AtrqnhS66JSgYTSTsi3iTn8yP3NuoQV8,2401
-numba_cuda/VERSION,sha256=K2Wn4BRtrXcEkuPZYGGM_h_Orgai6flc272777m5MYQ,7
+numba_cuda/VERSION,sha256=N0wu4MReU0U_7uoeU-17rOqTT3ZYtrLE_x8SJjefmc8,7
 numba_cuda/__init__.py,sha256=atXeUvJKR3JHcAiCFbXCVOJQUHgB1TulmsqSL_9RT3Q,114
 numba_cuda/_version.py,sha256=jbdUsbR7sVllw0KxQNB0-FMd929CGg3kH2fhHdrlkuc,719
 numba_cuda/numba/cuda/__init__.py,sha256=idyVHOObC9lTYnp62v7rVprSacRM4d5F6vhXfG5ElTI,621
@@ -8,12 +8,12 @@ numba_cuda/numba/cuda/api.py,sha256=shLu7NEZHRMcaZAMEXSoyA5Gi5m0tm6ZRymxKLEKCSg,
 numba_cuda/numba/cuda/api_util.py,sha256=aQfUV2-4RM_oGVvckMjbMr5e3effOQNX04v1T0O2EfQ,861
 numba_cuda/numba/cuda/args.py,sha256=HloHkw_PQal2DT-I70Xf_XbnGObS1jiUgcRrQ85Gq28,1978
 numba_cuda/numba/cuda/cg.py,sha256=9V1uZqyGOJX1aFd9c6GAPbLSqq83lE8LoP-vxxrKENY,1490
-numba_cuda/numba/cuda/codegen.py,sha256=9LnTlei-4JK7iq3Rg-H2Y19Oh_u5ZXMC_CPfattANjw,12358
-numba_cuda/numba/cuda/compiler.py,sha256=47SjuI5p4yWCujAglIq0Cb0ARO8QxRp4fOZropkNMtQ,16001
+numba_cuda/numba/cuda/codegen.py,sha256=ghdYBKZ3Mzk2UlLE64HkrAjb60PN9fibSNkWFRQuj4M,13184
+numba_cuda/numba/cuda/compiler.py,sha256=XQHzUCuXl6WCtWWxv1X3Y9ebcVQVJEkzOuckNwKa4Gg,21249
 numba_cuda/numba/cuda/cpp_function_wrappers.cu,sha256=iv84_F6Q9kFjV_kclrQz1msh6Dud8mI3qNkswTid7Qc,953
 numba_cuda/numba/cuda/cuda_fp16.h,sha256=1IC0mdNdkvKbvAe0-f4uYVS7WFrVqOyI1nRUbBiqr6A,126844
 numba_cuda/numba/cuda/cuda_fp16.hpp,sha256=vJ7NUr2X2tKhAP7ojydAiCoOjVO6n4QGoXD6m9Srrlw,89130
-numba_cuda/numba/cuda/cuda_paths.py,sha256=wwZKOUS0FyZloRUgDVDPPCwtm3t6Js7U369_YgMpEC0,9859
+numba_cuda/numba/cuda/cuda_paths.py,sha256=C0gA72QLWUMfvXkFpw1WqqaFqfsQ7HM72hQVXG0A7RU,10023
 numba_cuda/numba/cuda/cudadecl.py,sha256=ynUidit8oPGjedc6p1miMGtS20DOji3DiQHzwmx6m0s,23192
 numba_cuda/numba/cuda/cudaimpl.py,sha256=3YMxQSCv2KClBrpuXGchrTNICV1F6NIjjL2rie5fDZ4,38628
 numba_cuda/numba/cuda/cudamath.py,sha256=EFNtdzEytAZuwijdRoFGzVKCeal76UzzaNy7wUFQx8I,3978
@@ -21,7 +21,7 @@ numba_cuda/numba/cuda/decorators.py,sha256=qSpir16-jPYSe2YuRZ6g9INeobmsMNg6ab9IZ
 numba_cuda/numba/cuda/descriptor.py,sha256=rNMaurJkjNjIBmHPozDoLC35DMURE0fn_LtnXRmaG_w,985
 numba_cuda/numba/cuda/device_init.py,sha256=lP79tCsQ0Np9xcbjv_lXcH4JOiVZvV8nwg3INdETxsc,3586
 numba_cuda/numba/cuda/deviceufunc.py,sha256=yxAH71dpgJWK8okmCJm0FUV6z2AqdThCYOTZspT7z0M,30775
-numba_cuda/numba/cuda/dispatcher.py,sha256=1ND28o_YeP_0YS2iFYwCH9Byc87qTvCVKjT7PHu2Fsg,41233
+numba_cuda/numba/cuda/dispatcher.py,sha256=JuUr0-6xQtDkyaZv7CirWaU5_sSNX4BKCTDgQG5c1xc,41116
 numba_cuda/numba/cuda/errors.py,sha256=XwWHzCllx0DXU6BQdoRH0m3pznGxnTFOBTVYXMmCfqg,1724
 numba_cuda/numba/cuda/extending.py,sha256=URsyBYls2te-mgE0yvDY6akvawYCA0blBFfD7Lf9DO4,142
 numba_cuda/numba/cuda/initialize.py,sha256=TQGHGLQoq4ch4J6CLDcJdGsZzXM-g2kDgdyO1u-Rbhg,546
@@ -47,16 +47,16 @@ numba_cuda/numba/cuda/vectorizers.py,sha256=u_0EzaD5tqVH8uOz4Gmqn3FgPC1rckwDAQuR
 numba_cuda/numba/cuda/cudadrv/__init__.py,sha256=0TL4MZcJXUoo9qA7uu0vLv7eHrXRerVmyfi7O149ITw,199
 numba_cuda/numba/cuda/cudadrv/devicearray.py,sha256=06kM7iFcx1TYiFhs1o9r1kyoA3k5yS7mFAdZDf6nrxA,31215
 numba_cuda/numba/cuda/cudadrv/devices.py,sha256=6SneNmoq83gue0txFWWx4A65vViAa8xA06FzkApoqAk,7992
-numba_cuda/numba/cuda/cudadrv/driver.py,sha256=uPjKugdtSJfIwVSAo3KgkvQhctbABkQphHAfcq6Q7ec,110892
+numba_cuda/numba/cuda/cudadrv/driver.py,sha256=bjlGcJvyjwMjRCNkNqmBIAA0HO_fzbrW2afXsp-YiCg,114794
 numba_cuda/numba/cuda/cudadrv/drvapi.py,sha256=52ms3X6hfPaQB8E1jb6g7QKqRvHzBMlDQ-V2DM1rXxQ,17178
 numba_cuda/numba/cuda/cudadrv/dummyarray.py,sha256=nXRngdr-k3h_BNGQuJUxmp89yGNWxqEDJedpwDPEZ44,14209
-numba_cuda/numba/cuda/cudadrv/enums.py,sha256=37zZmyrLvT-7R8wWtwKJkQhN8siLMxsDGiA3_NQ-yx8,23740
+numba_cuda/numba/cuda/cudadrv/enums.py,sha256=Wy5dzukTk4TnWCowg_PLceET_v2xEyiWLu9TyH8pXr8,23742
 numba_cuda/numba/cuda/cudadrv/error.py,sha256=zEIryW6aIy8GG4ypmTliB6RgY4Gy2n8ckz7I6W99LUM,524
 numba_cuda/numba/cuda/cudadrv/libs.py,sha256=Gk9zQ1CKcsZsWl-_9QneXeP9VH5q5R1I3Cx043UOytk,7240
 numba_cuda/numba/cuda/cudadrv/linkable_code.py,sha256=Q_YTv0apBo9t8pkMlKrthPPSVeLd376ZTmVDF5NtVVo,1328
 numba_cuda/numba/cuda/cudadrv/mappings.py,sha256=-dTPHvAkDjdH6vS5OjgrB71AFuqKO6CRgf7hpOk2wiw,802
 numba_cuda/numba/cuda/cudadrv/ndarray.py,sha256=HtULWWFyDlgqvrH5459yyPTvU4UbUo2DSdtcNfvbH00,473
-numba_cuda/numba/cuda/cudadrv/nvrtc.py,sha256=rv-XQo0snJj4xyEbfeBqivziIxCwMOQzIIEOnvLQaJI,9825
+numba_cuda/numba/cuda/cudadrv/nvrtc.py,sha256=RR096Ic2_Zu96C-GGh8x8WTOyxnmDkwtcwag8a_npkQ,10898
 numba_cuda/numba/cuda/cudadrv/nvvm.py,sha256=v2hJJTAQeRmoG59-hnhgMEp5BSVA73QHtEoy636VKao,24107
 numba_cuda/numba/cuda/cudadrv/rtapi.py,sha256=WdeUoWzsYNYodx8kMRLVIjnNs0QzwpCihd2Q0AaqItE,226
 numba_cuda/numba/cuda/cudadrv/runtime.py,sha256=Tj9ACrzQqNmDSO6xfpzw12EsQknSywQ-ZGuWMbDdHnQ,4255
@@ -103,7 +103,7 @@ numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py,sha256=0KPe4E9wOZsSV_0QI0Lmj
 numba_cuda/numba/cuda/tests/cudadrv/test_linker.py,sha256=_l2_EQEko2Jet5ooj4XMT0L4BjOuqLjbONGj1_MVI50,10161
 numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py,sha256=kYXYMkx_3GPAITKp4reLeM8KSzKkpxiC8nxnBvXpaTA,4979
 numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py,sha256=984jATSa01SRoSrVqxPeO6ujJ7w2jsnZa39ABInFLVI,1529
-numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py,sha256=m5zv6K6PHLnm-AqHKo5x9f_ZBrn3rmvPX_ZGjjrkPfI,6807
+numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py,sha256=VOOl5fLxQL5IKHEi8hL47hAH0BUf_D8NyIxptLxIwus,8856
 numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py,sha256=DF7KV5uh-yMztks0f47NhpalV64dvsNy-f8HY6GhAhE,7373
 numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py,sha256=u_TthSS2N-2J4eBIuF4PGg33AjD-wxly7MKpz0vRAKc,944
 numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py,sha256=MQWZx1j3lbEpWmIpQ1bV9szrGOV3VHN0QrEnJRjAhW4,508
@@ -137,8 +137,8 @@ numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py,sha256=ZQuct24GEZn
 numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py,sha256=73FCQbNaAKpuybAwMOt4eW_dL_K6ZjrRgQw09ojkSbY,15844
 numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py,sha256=y7cNQZOZJo5Sv16ql3E5QaRksw-U3RkXss9YDcNeiTk,2137
 numba_cuda/numba/cuda/tests/cudapy/test_datetime.py,sha256=2in1Cq8y9zAFoka7H72wF1D0awEd3n7bv56sUPgoNAQ,3508
-numba_cuda/numba/cuda/tests/cudapy/test_debug.py,sha256=jwYD1xdWKVOv_axf_ztvsPKL62SKYthBYLX3s9ryz7s,3555
-numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py,sha256=jDPgxSe0G0nAib3wgbfrOg6uvnwmCcuB9GhrzXEvlc0,7875
+numba_cuda/numba/cuda/tests/cudapy/test_debug.py,sha256=3MYNiMe75rgBF1T0vsJ7r-nkW5jPvov_tDms9KXo2UU,3449
+numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py,sha256=8Tm1iD2x1BRryB1QY6qp6tdjJCE6Tx9p0LzcYwiExIU,7922
 numba_cuda/numba/cuda/tests/cudapy/test_device_func.py,sha256=aTRyZSOJB3sAShw0YAEgHILrR-TCuowW9KYjtlRErKM,6892
 numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py,sha256=oX-l_L4H8rME1IolwhAyordSGJ152nnuqGAFdWjfgas,26587
 numba_cuda/numba/cuda/tests/cudapy/test_enums.py,sha256=0GWiwvZ1FTzSl1FfMxttkWaWrowASfXrSDT8XAR4ZHw,3560
@@ -154,14 +154,14 @@ numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py,sha256=0NWfQqHmx7tFh6vdS7QtxT8
 numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py,sha256=Uhe8Q0u42jySrpwAZh8vCf4GMYkiy9NOMolyzEBuri0,5382
 numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py,sha256=luDtBxFS_5ZbVemXe1Z7gfqMliaU_EAOR4SuLsU5rhw,2677
 numba_cuda/numba/cuda/tests/cudapy/test_idiv.py,sha256=HLJ_f2lX8m_NNJjUbl_8zZ0-8GsBlRdBP2CUo_yWb0Y,1056
-numba_cuda/numba/cuda/tests/cudapy/test_inspect.py,sha256=lP9-8SbWFn2Xc-qmF6UNhcY6LreKTnveaK5CGW2pu8E,5196
+numba_cuda/numba/cuda/tests/cudapy/test_inspect.py,sha256=hzK1Kk2c-aKCIL2QSodHpyxemOYaghgsMx7H1WvMHX8,4879
 numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py,sha256=M6-pad8nVM0fuL18uFxvE6tmHw0spLNhnMBLVlO0FKU,36400
 numba_cuda/numba/cuda/tests/cudapy/test_ipc.py,sha256=fggyy-kmsOkCb906_q3kXPGRziccWu7Co7ir83zBMwM,10536
 numba_cuda/numba/cuda/tests/cudapy/test_iterators.py,sha256=daQW3kSkp7icCmlTn9pCvnaauz60k_eBf4x1UQF-XVY,2344
 numba_cuda/numba/cuda/tests/cudapy/test_lang.py,sha256=U1BCVZMjU1AZ4wDSmjsRIPPcAReiq4dB77Cz7GmrdmA,1691
 numba_cuda/numba/cuda/tests/cudapy/test_laplace.py,sha256=yD--H5p_NrBHklFNCnxuQ0S8yUIBYScBkvn7hBlZ5ZM,3211
 numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py,sha256=4NsZBXweDPQpqfgo6T7eQHaWDVBof1CZDTpI1QTkV74,6545
-numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py,sha256=sKPF5l1cDTyA4UT0IO8Yeq6pYPGt9pIBQtrMAJMJHCM,6855
+numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py,sha256=cimoEJqCWepvJPIqUumpLjQimg80je-WNul1MfT6KVY,6824
 numba_cuda/numba/cuda/tests/cudapy/test_localmem.py,sha256=uv9UYuytIXQgzHpPgEoWVVVq5-a7-6Io_mWMiNsZ45I,5376
 numba_cuda/numba/cuda/tests/cudapy/test_mandel.py,sha256=crVQBw46l4iyAv8_pu7v1eBy9ZJG7OkigB5zsyi6s3A,1085
 numba_cuda/numba/cuda/tests/cudapy/test_math.py,sha256=T-KRh9qzwOL3usl_6Cly3FVlvauzGhGnedfAG1hBQy8,27615
@@ -173,7 +173,7 @@ numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py,sha256=AjYbSa9nOlv_yc
 numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py,sha256=MfCbyJZu1XsCJOCSw6vvhs4eiP4LZPcF-e9huPmW-ys,2861
 numba_cuda/numba/cuda/tests/cudapy/test_nondet.py,sha256=mYMX0R1tmBLRe5ZAwiDVFFuSyMuPav5guuqL3WHWGPY,1378
 numba_cuda/numba/cuda/tests/cudapy/test_operator.py,sha256=0nJej4D898_JU-jhlif44fR2yu42keK4GoCLP810l3U,13295
-numba_cuda/numba/cuda/tests/cudapy/test_optimization.py,sha256=SvqRsSFgcGxkFDZS-kul5B-mi8GxINTS98uUzAy4dhw,2647
+numba_cuda/numba/cuda/tests/cudapy/test_optimization.py,sha256=IRTI-b7hwMaJxtxFRzoTjpzzeqWGzNyCJPT6C4GugX4,2925
 numba_cuda/numba/cuda/tests/cudapy/test_overload.py,sha256=u4yUDVFcV9E3NWMlNjM81e3IW4KaIkcDtXig8JYevsw,8538
 numba_cuda/numba/cuda/tests/cudapy/test_powi.py,sha256=TI82rYRnkSnwv9VN6PMpBnr9JqMJ_F3HhH4cKY6O8tw,3276
 numba_cuda/numba/cuda/tests/cudapy/test_print.py,sha256=r2xmMNx80_ANi3uFB3CQt3AHAXG_JdhStY1S796hlK0,4466
@@ -187,7 +187,7 @@ numba_cuda/numba/cuda/tests/cudapy/test_serialize.py,sha256=alE5-lTwbjz3Tv6OvQPS
 numba_cuda/numba/cuda/tests/cudapy/test_slicing.py,sha256=bAh_sIk5V9_0_dOVGdzmyjwZkHMLjEbQuEI4e5zRMoU,903
 numba_cuda/numba/cuda/tests/cudapy/test_sm.py,sha256=kh1F0wwQ2_bd54Q4GUX99y2oiWHQwBpyC__ckk-jiTU,14575
 numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py,sha256=bTXDjU94ezo6Bz_lktlPyowTcJHBOWfy7-nJB9e-B_s,7231
-numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py,sha256=alwSPm2xLvuYEwzpuCE6UUkOp6xcEoVqZjyJk3VJjtY,1743
+numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py,sha256=pCU0B-yBavHLgyhlKYAs1SCG8BWim9dSvl2BjXkhgQ4,1868
 numba_cuda/numba/cuda/tests/cudapy/test_sync.py,sha256=Y851UqNkT80U9q_C05SQfvPRCY7jjRARHOMk6g0lU4Y,7837
 numba_cuda/numba/cuda/tests/cudapy/test_transpose.py,sha256=JAQX2EUHwlpKCfJDGspaldmsIRbHxnXpsNUrvRrnIEE,3134
 numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py,sha256=-ehvkxelr45aT8sUNL9Hq8cn2GU_K4GL1yWeX-rHqEM,9680
@@ -232,12 +232,12 @@ numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py,sha256=n0_-xFaw6QqiZbhe55oy7lnEe
 numba_cuda/numba/cuda/tests/nrt/__init__.py,sha256=43EXdiXXRBd6yIcVGMrU9F_EJCD9Uw3mzOP3SB53AEE,260
 numba_cuda/numba/cuda/tests/nrt/mock_numpy.py,sha256=Qtn52GoKZ_ydre3oqkLWVdImC37tuPClUy4uHSutaJo,1568
 numba_cuda/numba/cuda/tests/nrt/test_nrt.py,sha256=Ox6ei2DldvSSS-CndTXRxLnsvWdteOQNgn6GvKHB244,2789
-numba_cuda/numba/cuda/tests/test_binary_generation/Makefile,sha256=OFC_6irwscCNGAyJJKq7fTchzWosCUuiVWU02m0bcUQ,2248
+numba_cuda/numba/cuda/tests/test_binary_generation/Makefile,sha256=P2WzCc5d64JGq6pJwHEwmKVmJOJxPBtsMTbnuzqYkik,2679
 numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py,sha256=V0raLZLGSiWbE_K-JluI0CnmNkXbhlMVj-TH7P1OV8E,5014
 numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu,sha256=cUf-t6ZM9MK_x7X_aKwsrKW1LdR97XcpR-qnYr5faOE,453
 numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu,sha256=q3oxZziT8KDodeNcEBiWULH6vMrHCWucmJmtrg8C0d0,128
-numba_cuda-0.0.19.dist-info/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
-numba_cuda-0.0.19.dist-info/METADATA,sha256=GAWms3JiCaxTzo4WMk-5h31_Oqo8YFPgekLKFR_YfqA,1393
-numba_cuda-0.0.19.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-numba_cuda-0.0.19.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
-numba_cuda-0.0.19.dist-info/RECORD,,
+numba_cuda-0.0.21.dist-info/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
+numba_cuda-0.0.21.dist-info/METADATA,sha256=U_oWdBsw_mdsI2AnFJDXdxTXL2ytOeuTHwS3wCZswTI,1497
+numba_cuda-0.0.21.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+numba_cuda-0.0.21.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
+numba_cuda-0.0.21.dist-info/RECORD,,