PyPI - numba-cuda - Versions diffs - 0.0.18__tar.gz → 0.0.20__tar.gz - Mend

numba-cuda 0.0.18__tar.gz → 0.0.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (251) hide show

{numba_cuda-0.0.18 → numba_cuda-0.0.20}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: numba-cuda
-Version: 0.0.18
+Version: 0.0.20
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause
@@ -13,17 +13,21 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numba>=0.59.1
+<div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
 # Numba CUDA Target
-An out-of-tree CUDA target for Numba.
+The CUDA target for Numba. Please visit the [official
+documentation](https://nvidia.github.io/numba-cuda) to get started!
-This contains an entire copy of Numba's CUDA target (the `numba.cuda` module),
-and a mechanism to ensure the code from this module (`numba_cuda.numba.cuda`) is
-used as the `numba.cuda` module instead of the code from the `numba` package.
+To report issues or file feature requests, please use the [issue
+tracker](https://github.com/NVIDIA/numba-cuda/issues).
-This is presently in an early state and is published for testing and feedback.
+To raise questions or initiate discussions, please use the [Numba Discourse
+forum](https://numba.discourse.group).
-## Building / testing
+## Building from source
 Install as an editable install:
@@ -31,7 +35,7 @@ Install as an editable install:
 pip install -e .
 ```
-Running tests:
+## Running tests
 ```
 python -m numba.runtests numba.cuda.tests

numba_cuda-0.0.20/README.md ADDED Viewed

@@ -0,0 +1,40 @@
+<div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
+# Numba CUDA Target
+The CUDA target for Numba. Please visit the [official
+documentation](https://nvidia.github.io/numba-cuda) to get started!
+To report issues or file feature requests, please use the [issue
+tracker](https://github.com/NVIDIA/numba-cuda/issues).
+To raise questions or initiate discussions, please use the [Numba Discourse
+forum](https://numba.discourse.group).
+## Building from source
+Install as an editable install:
+```
+pip install -e .
+```
+## Running tests
+```
+python -m numba.runtests numba.cuda.tests
+```
+This should discover the `numba.cuda` module from the `numba_cuda` package. You
+can check where `numba.cuda` files are being located by running
+```
+python -c "from numba import cuda; print(cuda.__file__)"
+```
+which will show a path like:
+```
+<path to numba-cuda repo>/numba_cuda/numba/cuda/__init__.py
+```

numba_cuda-0.0.20/numba_cuda/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.0.20

{numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/compiler.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from llvmlite import ir
 from numba.core.typing.templates import ConcreteTemplate
-from numba.core import types, typing, funcdesc, config, compiler, sigutils
+from numba.core import (cgutils, types, typing, funcdesc, config, compiler,
+                        sigutils, utils)
 from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
                                  DefaultPassBuilder, Flags, Option,
                                  CompileResult)
@@ -11,7 +12,10 @@ from numba.core.errors import NumbaInvalidConfigWarning
 from numba.core.typed_passes import (IRLegalization, NativeLowering,
                                      AnnotateTypes)
 from warnings import warn
+from numba.cuda import nvvmutils
 from numba.cuda.api import get_current_device
+from numba.cuda.cudadrv import nvvm
+from numba.cuda.descriptor import cuda_target
 from numba.cuda.target import CUDACABICallConv
@@ -24,6 +28,15 @@ def _nvvm_options_type(x):
         return x
+def _optional_int_type(x):
+    if x is None:
+        return None
+    else:
+        assert isinstance(x, int)
+        return x
 class CUDAFlags(Flags):
     nvvm_options = Option(
         type=_nvvm_options_type,
@@ -35,6 +48,16 @@ class CUDAFlags(Flags):
         default=None,
         doc="Compute Capability",
     )
+    max_registers = Option(
+        type=_optional_int_type,
+        default=None,
+        doc="Max registers"
+    )
+    lto = Option(
+        type=bool,
+        default=False,
+        doc="Enable Link-time Optimization"
+    )
 # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
@@ -109,7 +132,11 @@ class CreateLibrary(LoweringPass):
         codegen = state.targetctx.codegen()
         name = state.func_id.func_qualname
         nvvm_options = state.flags.nvvm_options
-        state.library = codegen.create_library(name, nvvm_options=nvvm_options)
+        max_registers = state.flags.max_registers
+        lto = state.flags.lto
+        state.library = codegen.create_library(name, nvvm_options=nvvm_options,
+                                               max_registers=max_registers,
+                                               lto=lto)
         # Enable object caching upfront so that the library can be serialized.
         state.library.enable_object_caching()
@@ -152,7 +179,7 @@ class CUDACompiler(CompilerBase):
 @global_compiler_lock
 def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
                  inline=False, fastmath=False, nvvm_options=None,
-                 cc=None):
+                 cc=None, max_registers=None, lto=False):
     if cc is None:
         raise ValueError('Compute Capability must be supplied')
@@ -189,6 +216,8 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
     if nvvm_options:
         flags.nvvm_options = nvvm_options
     flags.compute_capability = cc
+    flags.max_registers = max_registers
+    flags.lto = lto
     # Run compilation pipeline
     from numba.core.target_extension import target_override
@@ -247,11 +276,155 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
         builder, func, restype, argtypes, callargs)
     builder.ret(return_value)
+    if config.DUMP_LLVM:
+        utils.dump_llvm(fndesc, wrapper_module)
     library.add_ir_module(wrapper_module)
     library.finalize()
     return library
+def kernel_fixup(kernel, debug):
+    if debug:
+        exc_helper = add_exception_store_helper(kernel)
+    # Pass 1 - replace:
+    #
+    #    ret <value>
+    #
+    # with:
+    #
+    #    exc_helper(<value>)
+    #    ret void
+    for block in kernel.blocks:
+        for i, inst in enumerate(block.instructions):
+            if isinstance(inst, ir.Ret):
+                old_ret = block.instructions.pop()
+                block.terminator = None
+                # The original return's metadata will be set on the new
+                # instructions in order to preserve debug info
+                metadata = old_ret.metadata
+                builder = ir.IRBuilder(block)
+                if debug:
+                    status_code = old_ret.operands[0]
+                    exc_helper_call = builder.call(exc_helper, (status_code,))
+                    exc_helper_call.metadata = metadata
+                new_ret = builder.ret_void()
+                new_ret.metadata = old_ret.metadata
+                # Need to break out so we don't carry on modifying what we are
+                # iterating over. There can only be one return in a block
+                # anyway.
+                break
+    # Pass 2: remove stores of null pointer to return value argument pointer
+    return_value = kernel.args[0]
+    for block in kernel.blocks:
+        remove_list = []
+        # Find all stores first
+        for inst in block.instructions:
+            if (isinstance(inst, ir.StoreInstr)
+                    and inst.operands[1] == return_value):
+                remove_list.append(inst)
+        # Remove all stores
+        for to_remove in remove_list:
+            block.instructions.remove(to_remove)
+    # Replace non-void return type with void return type and remove return
+    # value
+    if isinstance(kernel.type, ir.PointerType):
+        new_type = ir.PointerType(ir.FunctionType(ir.VoidType(),
+                                                  kernel.type.pointee.args[1:]))
+    else:
+        new_type = ir.FunctionType(ir.VoidType(), kernel.type.args[1:])
+    kernel.type = new_type
+    kernel.return_value = ir.ReturnValue(kernel, ir.VoidType())
+    kernel.args = kernel.args[1:]
+    # Mark as a kernel for NVVM
+    nvvm.set_cuda_kernel(kernel)
+    if config.DUMP_LLVM:
+        print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, '-'))
+        print(kernel.module)
+        print('=' * 80)
+def add_exception_store_helper(kernel):
+    # Create global variables for exception state
+    def define_error_gv(postfix):
+        name = kernel.name + postfix
+        gv = cgutils.add_global_variable(kernel.module, ir.IntType(32),
+                                         name)
+        gv.initializer = ir.Constant(gv.type.pointee, None)
+        return gv
+    gv_exc = define_error_gv("__errcode__")
+    gv_tid = []
+    gv_ctaid = []
+    for i in 'xyz':
+        gv_tid.append(define_error_gv("__tid%s__" % i))
+        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))
+    # Create exception store helper function
+    helper_name = kernel.name + "__exc_helper__"
+    helper_type = ir.FunctionType(ir.VoidType(), (ir.IntType(32),))
+    helper_func = ir.Function(kernel.module, helper_type, helper_name)
+    block = helper_func.append_basic_block(name="entry")
+    builder = ir.IRBuilder(block)
+    # Implement status check / exception store logic
+    status_code = helper_func.args[0]
+    call_conv = cuda_target.target_context.call_conv
+    status = call_conv._get_return_status(builder, status_code)
+    # Check error status
+    with cgutils.if_likely(builder, status.is_ok):
+        builder.ret_void()
+    with builder.if_then(builder.not_(status.is_python_exc)):
+        # User exception raised
+        old = ir.Constant(gv_exc.type.pointee, None)
+        # Use atomic cmpxchg to prevent rewriting the error status
+        # Only the first error is recorded
+        xchg = builder.cmpxchg(gv_exc, old, status.code,
+                               'monotonic', 'monotonic')
+        changed = builder.extract_value(xchg, 1)
+        # If the xchange is successful, save the thread ID.
+        sreg = nvvmutils.SRegBuilder(builder)
+        with builder.if_then(changed):
+            for dim, ptr, in zip("xyz", gv_tid):
+                val = sreg.tid(dim)
+                builder.store(val, ptr)
+            for dim, ptr, in zip("xyz", gv_ctaid):
+                val = sreg.ctaid(dim)
+                builder.store(val, ptr)
+    builder.ret_void()
+    return helper_func
 @global_compiler_lock
 def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
             fastmath=False, cc=None, opt=None, abi="c", abi_info=None,
@@ -347,13 +520,10 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
             lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
                                      nvvm_options)
     else:
-        code = pyfunc.__code__
-        filename = code.co_filename
-        linenum = code.co_firstlineno
-        lib, kernel = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
-                                              lineinfo, nvvm_options, filename,
-                                              linenum)
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, debug)
     if lto:
         code = lib.get_ltoir(cc=cc)

{numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cuda_paths.py RENAMED Viewed

@@ -2,9 +2,11 @@ import sys
 import re
 import os
 from collections import namedtuple
+import platform
 from numba.core.config import IS_WIN32
 from numba.misc.findlib import find_lib, find_file
+from numba import config
 _env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info'])
@@ -241,6 +243,7 @@ def get_cuda_paths():
             'libdevice': _get_libdevice_paths(),
             'cudalib_dir': _get_cudalib_dir(),
             'static_cudalib_dir': _get_static_cudalib_dir(),
+            'include_dir': _get_include_dir(),
         }
         # Cache result
         get_cuda_paths._cached_result = d
@@ -256,3 +259,70 @@ def get_debian_pkg_libdevice():
     if not os.path.exists(pkg_libdevice_location):
         return None
     return pkg_libdevice_location
+def get_current_cuda_target_name():
+    """Determine conda's CTK target folder based on system and machine arch.
+    CTK's conda package delivers headers based on its architecture type. For example,
+    `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and
+    `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the
+    nuances at cudart's conda feedstock:
+    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11  # noqa: E501
+    """
+    system = platform.system()
+    machine = platform.machine()
+    if system == "Linux":
+        arch_to_targets = {
+            'x86_64': 'x86_64-linux',
+            'aarch64': 'sbsa-linux'
+        }
+    elif system == "Windows":
+        arch_to_targets = {
+            'AMD64': 'x64',
+        }
+    else:
+        arch_to_targets = {}
+    return arch_to_targets.get(machine, None)
+def get_conda_include_dir():
+    """
+    Return the include directory in the current conda environment, if one
+    is active and it exists.
+    """
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
+    if not is_conda_env:
+        return
+    if platform.system() == "Windows":
+        include_dir = os.path.join(
+            sys.prefix, 'Library', 'include'
+        )
+    elif target_name := get_current_cuda_target_name():
+        include_dir = os.path.join(
+            sys.prefix, 'targets', target_name, 'include'
+        )
+    else:
+        # A fallback for when the target cannot be determined,
+        # though this should not usually happen.
+        include_dir = os.path.join(sys.prefix, 'include')
+    if (os.path.exists(include_dir) and os.path.isdir(include_dir)
+            and os.path.exists(os.path.join(include_dir,
+                                            'cuda_device_runtime_api.h'))):
+        return include_dir
+    return
+def _get_include_dir():
+    """Find the root include directory."""
+    options = [
+        ('Conda environment (NVIDIA package)', get_conda_include_dir()),
+        ('CUDA_INCLUDE_PATH Config Entry', config.CUDA_INCLUDE_PATH),
+        # TODO: add others
+    ]
+    by, include_dir = _find_valid_path(options)
+    return _env_path_tuple(by, include_dir)

{numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/devicearray.py RENAMED Viewed

@@ -876,7 +876,10 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False):
             sentry_contiguous(obj)
             devobj = from_array_like(obj, stream=stream)
         if copy:
-            if config.CUDA_WARN_ON_IMPLICIT_COPY:
+            if (
+                config.CUDA_WARN_ON_IMPLICIT_COPY
+                and not config.DISABLE_PERFORMANCE_WARNINGS
+            ):
                 if (
                     not user_explicit and
                     (not isinstance(obj, DeviceNDArray)

{numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/enums.py RENAMED Viewed

@@ -55,7 +55,7 @@ CUDA_ERROR_INVALID_HANDLE = 400
 CUDA_ERROR_ILLEGAL_STATE = 401
 CUDA_ERROR_NOT_FOUND = 500
 CUDA_ERROR_NOT_READY = 600
-CUDA_ERROR_LAUNCH_FAILED = 700
+CUDA_ERROR_ILLEGAL_ADDRESS = 700
 CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
 CUDA_ERROR_LAUNCH_TIMEOUT = 702
 CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703

{numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/libs.py RENAMED Viewed

@@ -18,6 +18,7 @@ from numba.misc.findlib import find_lib
 from numba.cuda.cuda_paths import get_cuda_paths
 from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
 from numba.cuda.cudadrv.error import CudaSupportError
+from numba.core import config
 if sys.platform == 'win32':
@@ -60,6 +61,24 @@ def get_cudalib(lib, static=False):
     return max(candidates) if candidates else namepattern % lib
+def get_cuda_include_dir():
+    """
+    Find the path to cuda include dir based on a list of default locations.
+    Note that this does not list the `CUDA_INCLUDE_PATH` entry in user
+    configuration.
+    """
+    return get_cuda_paths()['include_dir'].info
+def check_cuda_include_dir(path):
+    if path is None or not os.path.exists(path):
+        raise FileNotFoundError(f"{path} not found")
+    if not os.path.exists(os.path.join(path, "cuda_runtime.h")):
+        raise FileNotFoundError(f"Unable to find cuda_runtime.h from {path}")
 def open_cudalib(lib):
     path = get_cudalib(lib)
     return ctypes.CDLL(path)
@@ -75,6 +94,8 @@ def _get_source_variable(lib, static=False):
         return get_cuda_paths()['nvvm'].by
     elif lib == 'libdevice':
         return get_cuda_paths()['libdevice'].by
+    elif lib == 'include_dir':
+        return get_cuda_paths()['include_dir'].by
     else:
         dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
         return get_cuda_paths()[dir_type].by
@@ -173,4 +194,21 @@ def test():
         print('\tERROR: failed to find %s:\n%s' % (lib, e))
         failed = True
+    # Check cuda include paths
+    print("Include directory configuration variable:")
+    print(f"\tCUDA_INCLUDE_PATH={config.CUDA_INCLUDE_PATH}")
+    where = _get_source_variable('include_dir')
+    print(f'Finding include directory from {where}')
+    include = get_cuda_include_dir()
+    print('\tLocated at', include)
+    try:
+        print('\tChecking include directory', end='...')
+        check_cuda_include_dir(include)
+        print('\tok')
+    except FileNotFoundError as e:
+        print('\tERROR: failed to find cuda include directory:\n%s' % e)
+        failed = True
     return not failed

{numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/nvrtc.py RENAMED Viewed

@@ -1,9 +1,8 @@
 from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
 from enum import IntEnum
-from numba.core import config
 from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
                                       NvrtcSupportError)
+from numba.cuda.cuda_paths import get_cuda_paths
 import functools
 import os
 import threading
@@ -233,12 +232,18 @@ def compile(src, name, cc):
     #   being optimized away.
     major, minor = cc
     arch = f'--gpu-architecture=compute_{major}{minor}'
-    include = f'-I{config.CUDA_INCLUDE_PATH}'
+    cuda_include = [
+        f"-I{get_cuda_paths()['include_dir'].info}",
+    ]
     cudadrv_path = os.path.dirname(os.path.abspath(__file__))
     numba_cuda_path = os.path.dirname(cudadrv_path)
     numba_include = f'-I{numba_cuda_path}'
-    options = [arch, include, numba_include, '-rdc', 'true']
+    options = [arch, *cuda_include, numba_include, '-rdc', 'true']
+    if nvrtc.get_version() < (12, 0):
+        options += ["-std=c++17"]
     # Compile the program
     compile_error = nvrtc.compile_program(program, options)

{numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/dispatcher.py RENAMED Viewed

@@ -1,5 +1,6 @@
 import numpy as np
 import os
+import re
 import sys
 import ctypes
 import functools
@@ -13,7 +14,7 @@ from numba.core.typing.typeof import Purpose, typeof
 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
-from numba.cuda.compiler import compile_cuda, CUDACompiler
+from numba.cuda.compiler import compile_cuda, CUDACompiler, kernel_fixup
 from numba.cuda.cudadrv import driver
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.descriptor import cuda_target
@@ -43,6 +44,21 @@ class _Kernel(serialize.ReduceMixin):
     object launches the kernel on the device.
     '''
+    NRT_functions = [
+        "NRT_Allocate",
+        "NRT_MemInfo_init",
+        "NRT_MemInfo_new",
+        "NRT_Free",
+        "NRT_dealloc",
+        "NRT_MemInfo_destroy",
+        "NRT_MemInfo_call_dtor",
+        "NRT_MemInfo_data_fast",
+        "NRT_MemInfo_alloc_aligned",
+        "NRT_Allocate_External",
+        "NRT_decref",
+        "NRT_incref"
+    ]
     @global_compiler_lock
     def __init__(self, py_func, argtypes, link=None, debug=False,
                  lineinfo=False, inline=False, fastmath=False, extensions=None,
@@ -86,15 +102,14 @@ class _Kernel(serialize.ReduceMixin):
                             inline=inline,
                             fastmath=fastmath,
                             nvvm_options=nvvm_options,
-                            cc=cc)
+                            cc=cc,
+                            max_registers=max_registers,
+                            lto=lto)
         tgt_ctx = cres.target_context
-        code = self.py_func.__code__
-        filename = code.co_filename
-        linenum = code.co_firstlineno
-        lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
-                                                  debug, lineinfo, nvvm_options,
-                                                  filename, linenum,
-                                                  max_registers, lto)
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, self.debug)
         if not link:
             link = []
@@ -105,16 +120,20 @@ class _Kernel(serialize.ReduceMixin):
         if self.cooperative:
             lib.needs_cudadevrt = True
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        asm = lib.get_asm_str()
         res = [fn for fn in cuda_fp16_math_funcs
-               if (f'__numba_wrapper_{fn}' in lib.get_asm_str())]
+               if (f'__numba_wrapper_{fn}' in asm)]
         if res:
             # Path to the source containing the foreign function
-            basedir = os.path.dirname(os.path.abspath(__file__))
             functions_cu_path = os.path.join(basedir,
                                              'cpp_function_wrappers.cu')
             link.append(functions_cu_path)
+        link = self.maybe_link_nrt(link, tgt_ctx, asm)
         for filepath in link:
             lib.add_linking_file(filepath)
@@ -136,6 +155,25 @@ class _Kernel(serialize.ReduceMixin):
         self.lifted = []
         self.reload_init = []
+    def maybe_link_nrt(self, link, tgt_ctx, asm):
+        if not tgt_ctx.enable_nrt:
+            return link
+        all_nrt = "|".join(self.NRT_functions)
+        pattern = (
+            r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
+            + all_nrt + r')\s*\([^)]*\)\s*;'
+        )
+        nrt_in_asm = re.findall(pattern, asm)
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        if nrt_in_asm:
+            nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
+            link.append(nrt_path)
+        return link
     @property
     def library(self):
         return self._codelibrary
@@ -385,7 +423,6 @@ class _Kernel(serialize.ReduceMixin):
         if isinstance(ty, types.Array):
             devary = wrap_arg(val).to_device(retr, stream)
             c_intp = ctypes.c_ssize_t
             meminfo = ctypes.c_void_p(0)
@@ -519,7 +556,10 @@ class _LaunchConfiguration:
         self.stream = stream
         self.sharedmem = sharedmem
-        if config.CUDA_LOW_OCCUPANCY_WARNINGS:
+        if (
+            config.CUDA_LOW_OCCUPANCY_WARNINGS
+            and not config.DISABLE_PERFORMANCE_WARNINGS
+        ):
             # Warn when the grid has fewer than 128 blocks. This number is
             # chosen somewhat heuristically - ideally the minimum is 2 times
             # the number of SMs, but the number of SMs varies between devices -
@@ -708,8 +748,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
         *args*.
         '''
         cc = get_current_device().compute_capability
-        argtypes = tuple(
-            [self.typingctx.resolve_argument_type(a) for a in args])
+        argtypes = tuple(self.typeof_pyval(a) for a in args)
         if self.specialized:
             raise RuntimeError('Dispatcher already specialized')

numba-cuda 0.0.18__tar.gz → 0.0.20__tar.gz

numba-cuda 0.0.18__tar.gz → 0.0.20__tar.gz