PyPI - numba-cuda - Versions diffs - 0.5.0__tar.gz → 0.7.0__tar.gz - Mend

numba-cuda 0.5.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (259) hide show

{numba_cuda-0.5.0 → numba_cuda-0.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: numba-cuda
-Version: 0.5.0
+Version: 0.7.0
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause

numba_cuda-0.7.0/numba_cuda/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.7.0

{numba_cuda-0.5.0 → numba_cuda-0.7.0}/numba_cuda/numba/cuda/compiler.py RENAMED Viewed

@@ -1,14 +1,17 @@
 from llvmlite import ir
 from numba.core.typing.templates import ConcreteTemplate
+from numba.core import ir as numba_ir
 from numba.core import (cgutils, types, typing, funcdesc, config, compiler,
                         sigutils, utils)
 from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
                                  DefaultPassBuilder, Flags, Option,
                                  CompileResult)
 from numba.core.compiler_lock import global_compiler_lock
-from numba.core.compiler_machinery import (LoweringPass,
+from numba.core.compiler_machinery import (FunctionPass, LoweringPass,
                                            PassManager, register_pass)
+from numba.core.interpreter import Interpreter
 from numba.core.errors import NumbaInvalidConfigWarning
+from numba.core.untyped_passes import TranslateByteCode
 from numba.core.typed_passes import (IRLegalization, NativeLowering,
                                      AnnotateTypes)
 from warnings import warn
@@ -143,13 +146,74 @@ class CreateLibrary(LoweringPass):
         return True
+class CUDABytecodeInterpreter(Interpreter):
+    # Based on the superclass implementation, but names the resulting variable
+    # "$bool<N>" instead of "bool<N>" - see Numba PR #9888:
+    # https://github.com/numba/numba/pull/9888
+    #
+    # This can be removed once that PR is available in an upstream Numba
+    # release.
+    def _op_JUMP_IF(self, inst, pred, iftrue):
+        brs = {
+            True: inst.get_jump_target(),
+            False: inst.next,
+        }
+        truebr = brs[iftrue]
+        falsebr = brs[not iftrue]
+        name = "$bool%s" % (inst.offset)
+        gv_fn = numba_ir.Global("bool", bool, loc=self.loc)
+        self.store(value=gv_fn, name=name)
+        callres = numba_ir.Expr.call(self.get(name), (self.get(pred),), (),
+                                     loc=self.loc)
+        pname = "$%spred" % (inst.offset)
+        predicate = self.store(value=callres, name=pname)
+        bra = numba_ir.Branch(cond=predicate, truebr=truebr, falsebr=falsebr,
+                              loc=self.loc)
+        self.current_block.append(bra)
+@register_pass(mutates_CFG=True, analysis_only=False)
+class CUDATranslateBytecode(FunctionPass):
+    _name = "cuda_translate_bytecode"
+    def __init__(self):
+        FunctionPass.__init__(self)
+    def run_pass(self, state):
+        func_id = state['func_id']
+        bc = state['bc']
+        interp = CUDABytecodeInterpreter(func_id)
+        func_ir = interp.interpret(bc)
+        state['func_ir'] = func_ir
+        return True
 class CUDACompiler(CompilerBase):
     def define_pipelines(self):
         dpb = DefaultPassBuilder
         pm = PassManager('cuda')
         untyped_passes = dpb.define_untyped_pipeline(self.state)
-        pm.passes.extend(untyped_passes.passes)
+        # Rather than replicating the whole untyped passes definition in
+        # numba-cuda, it seems cleaner to take the pass list and replace the
+        # TranslateBytecode pass with our own.
+        def replace_translate_pass(implementation, description):
+            if implementation is TranslateByteCode:
+                return (CUDATranslateBytecode, description)
+            else:
+                return (implementation, description)
+        cuda_untyped_passes = [
+            replace_translate_pass(implementation, description)
+            for implementation, description in untyped_passes.passes
+        ]
+        pm.passes.extend(cuda_untyped_passes)
         typed_passes = dpb.define_typed_pipeline(self.state)
         pm.passes.extend(typed_passes.passes)
@@ -352,6 +416,20 @@ def kernel_fixup(kernel, debug):
     kernel.return_value = ir.ReturnValue(kernel, ir.VoidType())
     kernel.args = kernel.args[1:]
+    # If debug metadata is present, remove the return value from it
+    if kernel_metadata := getattr(kernel, 'metadata', None):
+        if dbg_metadata := kernel_metadata.get('dbg', None):
+            for name, value in dbg_metadata.operands:
+                if name == "type":
+                    type_metadata = value
+                    for tm_name, tm_value in type_metadata.operands:
+                        if tm_name == 'types':
+                            types = tm_value
+                            types.operands = types.operands[1:]
+                            if config.DUMP_LLVM:
+                                types._clear_string_cache()
     # Mark as a kernel for NVVM
     nvvm.set_cuda_kernel(kernel)

{numba_cuda-0.5.0 → numba_cuda-0.7.0}/numba_cuda/numba/cuda/cudadrv/nvvm.py RENAMED Viewed

@@ -199,12 +199,52 @@ class NVVM(object):
 class CompilationUnit(object):
-    def __init__(self):
+    """
+    A CompilationUnit is a set of LLVM modules that are compiled to PTX or
+    LTO-IR with NVVM.
+    Compilation options are accepted as a dict mapping option names to values,
+    with the following considerations:
+    - Underscores (`_`) in option names are converted to dashes (`-`), to match
+      NVVM's option name format.
+    - Options that take a value will be emitted in the form "-<name>=<value>".
+    - Booleans passed as option values will be converted to integers.
+    - Options which take no value (such as `-gen-lto`) should have a value of
+      `None` and will be emitted in the form "-<name>".
+    For documentation on NVVM compilation options, see the CUDA Toolkit
+    Documentation:
+    https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc
+    """
+    def __init__(self, options):
         self.driver = NVVM()
         self._handle = nvvm_program()
         err = self.driver.nvvmCreateProgram(byref(self._handle))
         self.driver.check_error(err, 'Failed to create CU')
+        def stringify_option(k, v):
+            k = k.replace('_', '-')
+            if v is None:
+                return f'-{k}'.encode('utf-8')
+            if isinstance(v, bool):
+                v = int(v)
+            return f'-{k}={v}'.encode('utf-8')
+        options = [stringify_option(k, v) for k, v in options.items()]
+        option_ptrs = (c_char_p * len(options))(*[c_char_p(x) for x in options])
+        # We keep both the options and the pointers to them so that options are
+        # not destroyed before we've used their values
+        self.options = options
+        self.option_ptrs = option_ptrs
+        self.n_options = len(options)
     def __del__(self):
         driver = NVVM()
         err = driver.nvvmDestroyProgram(byref(self._handle))
@@ -230,60 +270,35 @@ class CompilationUnit(object):
                                                      len(buffer), None)
         self.driver.check_error(err, 'Failed to add module')
-    def compile(self, **options):
-        """Perform Compilation.
-        Compilation options are accepted as keyword arguments, with the
-        following considerations:
-        - Underscores (`_`) in option names are converted to dashes (`-`), to
-          match NVVM's option name format.
-        - Options that take a value will be emitted in the form
-          "-<name>=<value>".
-        - Booleans passed as option values will be converted to integers.
-        - Options which take no value (such as `-gen-lto`) should have a value
-          of `None` passed in and will be emitted in the form "-<name>".
-        For documentation on NVVM compilation options, see the CUDA Toolkit
-        Documentation:
-        https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc
+    def verify(self):
         """
-        def stringify_option(k, v):
-            k = k.replace('_', '-')
-            if v is None:
-                return f'-{k}'
-            if isinstance(v, bool):
-                v = int(v)
-            return f'-{k}={v}'
-        options = [stringify_option(k, v) for k, v in options.items()]
-        c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8'))
-                                             for x in options])
-        # verify
-        err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts)
+        Run the NVVM verifier on all code added to the compilation unit.
+        """
+        err = self.driver.nvvmVerifyProgram(self._handle, self.n_options,
+                                            self.option_ptrs)
         self._try_error(err, 'Failed to verify\n')
-        # compile
-        err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts)
+    def compile(self):
+        """
+        Compile all modules added to the compilation unit and return the
+        resulting PTX or LTO-IR (depending on the options).
+        """
+        err = self.driver.nvvmCompileProgram(self._handle, self.n_options,
+                                             self.option_ptrs)
         self._try_error(err, 'Failed to compile\n')
-        # get result
-        reslen = c_size_t()
-        err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen))
+        # Get result
+        result_size = c_size_t()
+        err = self.driver.nvvmGetCompiledResultSize(self._handle,
+                                                    byref(result_size))
         self._try_error(err, 'Failed to get size of compiled result.')
-        output_buffer = (c_char * reslen.value)()
+        output_buffer = (c_char * result_size.value)()
         err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer)
         self._try_error(err, 'Failed to get compiled result.')
-        # get log
+        # Get log
         self.log = self.get_log()
         if self.log:
             warnings.warn(self.log, category=NvvmWarning)
@@ -615,40 +630,44 @@ def llvm_replace(llvmir):
     for decl, fn in replacements:
         llvmir = llvmir.replace(decl, fn)
-    llvmir = llvm140_to_70_ir(llvmir)
+    llvmir = llvm150_to_70_ir(llvmir)
     return llvmir
-def compile_ir(llvmir, **opts):
+def compile_ir(llvmir, **options):
     if isinstance(llvmir, str):
         llvmir = [llvmir]
-    if opts.pop('fastmath', False):
-        opts.update({
+    if options.pop('fastmath', False):
+        options.update({
             'ftz': True,
             'fma': True,
             'prec_div': False,
             'prec_sqrt': False,
         })
-    cu = CompilationUnit()
-    libdevice = LibDevice()
+    cu = CompilationUnit(options)
     for mod in llvmir:
         mod = llvm_replace(mod)
         cu.add_module(mod.encode('utf8'))
+    cu.verify()
+    # We add libdevice following verification so that it is not subject to the
+    # verifier's requirements
+    libdevice = LibDevice()
     cu.lazy_add_module(libdevice.get())
-    return cu.compile(**opts)
+    return cu.compile()
 re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }")
-def llvm140_to_70_ir(ir):
+def llvm150_to_70_ir(ir):
     """
-    Convert LLVM 14.0 IR for LLVM 7.0.
+    Convert LLVM 15.0 IR for LLVM 7.0.
     """
     buf = []
     for line in ir.splitlines():

numba_cuda-0.7.0/numba_cuda/numba/cuda/debuginfo.py ADDED Viewed

@@ -0,0 +1,44 @@
+from llvmlite import ir
+from numba.core import types
+from numba.core.debuginfo import DIBuilder
+from numba.cuda.types import GridGroup
+_BYTE_SIZE = 8
+class CUDADIBuilder(DIBuilder):
+    def _var_type(self, lltype, size, datamodel=None):
+        is_bool = False
+        is_grid_group = False
+        if isinstance(lltype, ir.IntType):
+            if datamodel is None:
+                if size == 1:
+                    name = str(lltype)
+                    is_bool = True
+            else:
+                name = str(datamodel.fe_type)
+                if isinstance(datamodel.fe_type, types.Boolean):
+                    is_bool = True
+                elif isinstance(datamodel.fe_type, GridGroup):
+                    is_grid_group = True
+        if is_bool or is_grid_group:
+            m = self.module
+            bitsize = _BYTE_SIZE * size
+            # Boolean type workaround until upstream Numba is fixed
+            if is_bool:
+                ditok = "DW_ATE_boolean"
+            # GridGroup type should use numba.cuda implementation
+            elif is_grid_group:
+                ditok = "DW_ATE_unsigned"
+            return m.add_debug_info('DIBasicType', {
+                'name': name,
+                'size': bitsize,
+                'encoding': ir.DIToken(ditok),
+            })
+        # For other cases, use upstream Numba implementation
+        return super()._var_type(lltype, size, datamodel=datamodel)

{numba_cuda-0.5.0 → numba_cuda-0.7.0}/numba_cuda/numba/cuda/dispatcher.py RENAMED Viewed

@@ -4,8 +4,9 @@ import re
 import sys
 import ctypes
 import functools
+from collections import defaultdict
-from numba.core import config, serialize, sigutils, types, typing, utils
+from numba.core import config, ir, serialize, sigutils, types, typing, utils
 from numba.core.caching import Cache, CacheImpl
 from numba.core.compiler_lock import global_compiler_lock
 from numba.core.dispatcher import Dispatcher
@@ -42,6 +43,55 @@ cuda_fp16_math_funcs = ['hsin', 'hcos',
 reshape_funcs = ['nocopy_empty_reshape', 'numba_attempt_nocopy_reshape']
+def get_cres_link_objects(cres):
+    """Given a compile result, return a set of all linkable code objects that
+    are required for it to be fully linked."""
+    link_objects = set()
+    # List of calls into declared device functions
+    device_func_calls = [
+        (name, v) for name, v in cres.fndesc.typemap.items() if (
+            isinstance(v, cuda_types.CUDADispatcher)
+        )
+    ]
+    # List of tuples with SSA name of calls and corresponding signature
+    call_signatures = [
+        (call.func.name, sig)
+        for call, sig in cres.fndesc.calltypes.items() if (
+            isinstance(call, ir.Expr) and call.op == 'call'
+        )
+    ]
+    # Map SSA names to all invoked signatures
+    call_signature_d = defaultdict(list)
+    for name, sig in call_signatures:
+        call_signature_d[name].append(sig)
+    # Add the link objects from the current function's callees
+    for name, v in device_func_calls:
+        for sig in call_signature_d.get(name, []):
+            called_cres = v.dispatcher.overloads[sig.args]
+            called_link_objects = get_cres_link_objects(called_cres)
+            link_objects.update(called_link_objects)
+    # From this point onwards, we are only interested in ExternFunction
+    # declarations - these are the calls made directly in this function to
+    # them.
+    for name, v in cres.fndesc.typemap.items():
+        if not isinstance(v, Function):
+            continue
+        if not isinstance(v.typing_key, ExternFunction):
+            continue
+        for obj in v.typing_key.link:
+            link_objects.add(obj)
+    return link_objects
 class _Kernel(serialize.ReduceMixin):
     '''
     CUDA Kernel specialized for a given set of argument types. When called, this
@@ -159,15 +209,8 @@ class _Kernel(serialize.ReduceMixin):
         self.maybe_link_nrt(link, tgt_ctx, asm)
-        for k, v in cres.fndesc.typemap.items():
-            if not isinstance(v, Function):
-                continue
-            if not isinstance(v.typing_key, ExternFunction):
-                continue
-            for obj in v.typing_key.link:
-                lib.add_linking_file(obj)
+        for obj in get_cres_link_objects(cres):
+            lib.add_linking_file(obj)
         for filepath in link:
             lib.add_linking_file(filepath)
@@ -267,7 +310,11 @@ class _Kernel(serialize.ReduceMixin):
         """
         cufunc = self._codelibrary.get_cufunc()
-        if hasattr(self, "target_context") and self.target_context.enable_nrt:
+        if (
+            hasattr(self, "target_context")
+            and self.target_context.enable_nrt
+            and config.CUDA_NRT_STATS
+        ):
             rtsys.ensure_initialized()
             rtsys.set_memsys_to_module(cufunc.module)
             # We don't know which stream the kernel will be launched on, so

{numba_cuda-0.5.0 → numba_cuda-0.7.0}/numba_cuda/numba/cuda/target.py RENAMED Viewed

@@ -3,8 +3,7 @@ from functools import cached_property
 import llvmlite.binding as ll
 from llvmlite import ir
-from numba.core import (cgutils, config, debuginfo, itanium_mangler, types,
-                        typing, utils)
+from numba.core import cgutils, config, itanium_mangler, types, typing
 from numba.core.dispatcher import Dispatcher
 from numba.core.base import BaseContext
 from numba.core.callconv import BaseCallConv, MinimalCallConv
@@ -12,7 +11,8 @@ from numba.core.typing import cmathdecl
 from numba.core import datamodel
 from .cudadrv import nvvm
-from numba.cuda import codegen, nvvmutils, ufuncs
+from numba.cuda import codegen, ufuncs
+from numba.cuda.debuginfo import CUDADIBuilder
 from numba.cuda.models import cuda_data_manager
 # -----------------------------------------------------------------------------
@@ -80,7 +80,7 @@ class CUDATargetContext(BaseContext):
     @property
     def DIBuilder(self):
-        return debuginfo.DIBuilder
+        return CUDADIBuilder
     @property
     def enable_boundscheck(self):
@@ -150,136 +150,6 @@ class CUDATargetContext(BaseContext):
         return itanium_mangler.mangle(name, argtypes, abi_tags=abi_tags,
                                       uid=uid)
-    def prepare_cuda_kernel(self, codelib, fndesc, debug, lineinfo,
-                            nvvm_options, filename, linenum,
-                            max_registers=None, lto=False):
-        """
-        Adapt a code library ``codelib`` with the numba compiled CUDA kernel
-        with name ``fname`` and arguments ``argtypes`` for NVVM.
-        A new library is created with a wrapper function that can be used as
-        the kernel entry point for the given kernel.
-        Returns the new code library and the wrapper function.
-        Parameters:
-        codelib:       The CodeLibrary containing the device function to wrap
-                       in a kernel call.
-        fndesc:        The FunctionDescriptor of the source function.
-        debug:         Whether to compile with debug.
-        lineinfo:      Whether to emit line info.
-        nvvm_options:  Dict of NVVM options used when compiling the new library.
-        filename:      The source filename that the function is contained in.
-        linenum:       The source line that the function is on.
-        max_registers: The max_registers argument for the code library.
-        """
-        kernel_name = itanium_mangler.prepend_namespace(
-            fndesc.llvm_func_name, ns='cudapy',
-        )
-        library = self.codegen().create_library(f'{codelib.name}_kernel_',
-                                                entry_name=kernel_name,
-                                                nvvm_options=nvvm_options,
-                                                max_registers=max_registers,
-                                                lto=lto
-                                                )
-        library.add_linking_library(codelib)
-        wrapper = self.generate_kernel_wrapper(library, fndesc, kernel_name,
-                                               debug, lineinfo, filename,
-                                               linenum)
-        return library, wrapper
-    def generate_kernel_wrapper(self, library, fndesc, kernel_name, debug,
-                                lineinfo, filename, linenum):
-        """
-        Generate the kernel wrapper in the given ``library``.
-        The function being wrapped is described by ``fndesc``.
-        The wrapper function is returned.
-        """
-        argtypes = fndesc.argtypes
-        arginfo = self.get_arg_packer(argtypes)
-        argtys = list(arginfo.argument_types)
-        wrapfnty = ir.FunctionType(ir.VoidType(), argtys)
-        wrapper_module = self.create_module("cuda.kernel.wrapper")
-        fnty = ir.FunctionType(ir.IntType(32),
-                               [self.call_conv.get_return_type(types.pyobject)]
-                               + argtys)
-        func = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name)
-        prefixed = itanium_mangler.prepend_namespace(func.name, ns='cudapy')
-        wrapfn = ir.Function(wrapper_module, wrapfnty, prefixed)
-        builder = ir.IRBuilder(wrapfn.append_basic_block(''))
-        if debug or lineinfo:
-            directives_only = lineinfo and not debug
-            debuginfo = self.DIBuilder(module=wrapper_module,
-                                       filepath=filename,
-                                       cgctx=self,
-                                       directives_only=directives_only)
-            debuginfo.mark_subprogram(
-                wrapfn, kernel_name, fndesc.args, argtypes, linenum,
-            )
-            debuginfo.mark_location(builder, linenum)
-        # Define error handling variable
-        def define_error_gv(postfix):
-            name = wrapfn.name + postfix
-            gv = cgutils.add_global_variable(wrapper_module, ir.IntType(32),
-                                             name)
-            gv.initializer = ir.Constant(gv.type.pointee, None)
-            return gv
-        gv_exc = define_error_gv("__errcode__")
-        gv_tid = []
-        gv_ctaid = []
-        for i in 'xyz':
-            gv_tid.append(define_error_gv("__tid%s__" % i))
-            gv_ctaid.append(define_error_gv("__ctaid%s__" % i))
-        callargs = arginfo.from_arguments(builder, wrapfn.args)
-        status, _ = self.call_conv.call_function(
-            builder, func, types.void, argtypes, callargs)
-        if debug:
-            # Check error status
-            with cgutils.if_likely(builder, status.is_ok):
-                builder.ret_void()
-            with builder.if_then(builder.not_(status.is_python_exc)):
-                # User exception raised
-                old = ir.Constant(gv_exc.type.pointee, None)
-                # Use atomic cmpxchg to prevent rewriting the error status
-                # Only the first error is recorded
-                xchg = builder.cmpxchg(gv_exc, old, status.code,
-                                       'monotonic', 'monotonic')
-                changed = builder.extract_value(xchg, 1)
-                # If the xchange is successful, save the thread ID.
-                sreg = nvvmutils.SRegBuilder(builder)
-                with builder.if_then(changed):
-                    for dim, ptr, in zip("xyz", gv_tid):
-                        val = sreg.tid(dim)
-                        builder.store(val, ptr)
-                    for dim, ptr, in zip("xyz", gv_ctaid):
-                        val = sreg.ctaid(dim)
-                        builder.store(val, ptr)
-        builder.ret_void()
-        nvvm.set_cuda_kernel(wrapfn)
-        library.add_ir_module(wrapper_module)
-        if debug or lineinfo:
-            debuginfo.finalize()
-        library.finalize()
-        if config.DUMP_LLVM:
-            utils.dump_llvm(fndesc, wrapper_module)
-        return library.get_function(wrapfn.name)
     def make_constant_array(self, builder, aryty, arr):
         """
         Unlike the parent version.  This returns a a pointer in the constant

{numba_cuda-0.5.0 → numba_cuda-0.7.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py RENAMED Viewed

@@ -261,7 +261,8 @@ class TestLinker(CUDATestCase):
 @unittest.skipIf(
-    not PYNVJITLINK_INSTALLED, reason="Pynvjitlink is not installed"
+    not PYNVJITLINK_INSTALLED or not TEST_BIN_DIR,
+    reason="pynvjitlink not enabled"
 )
 class TestLinkerUsage(CUDATestCase):
     """Test that whether pynvjitlink can be enabled by both environment variable

numba-cuda 0.5.0__tar.gz → 0.7.0__tar.gz

numba-cuda 0.5.0__tar.gz → 0.7.0__tar.gz