PyPI - numba-cuda - Versions diffs - 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl - Mend

numba-cuda 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

numba_cuda/VERSION +1 -1
numba_cuda/numba/cuda/codegen.py +69 -2
numba_cuda/numba/cuda/compiler.py +41 -17
numba_cuda/numba/cuda/cudadecl.py +15 -5
numba_cuda/numba/cuda/cudadrv/driver.py +103 -20
numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
numba_cuda/numba/cuda/cudaimpl.py +103 -11
numba_cuda/numba/cuda/decorators.py +18 -2
numba_cuda/numba/cuda/dispatcher.py +27 -66
numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
numba_cuda/numba/cuda/runtime/nrt.py +13 -1
numba_cuda/numba/cuda/stubs.py +23 -11
numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +140 -0
numba_cuda/numba/cuda/tests/cudapy/test_inline.py +98 -1
numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
numba_cuda/numba/cuda/utils.py +7 -0
{numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/METADATA +1 -1
{numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/RECORD +27 -24
{numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/WHEEL +1 -1
{numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/licenses/LICENSE +0 -0
{numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/top_level.txt +0 -0

numba_cuda/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.10.0
1	+ 0.11.0

numba_cuda/numba/cuda/codegen.py CHANGED Viewed

@@ -5,6 +5,7 @@ from numba.core.codegen import Codegen, CodeLibrary
 from .cudadrv import devices, driver, nvvm, runtime
 from numba.cuda.cudadrv.libs import get_cudalib
 from numba.cuda.cudadrv.linkable_code import LinkableCode
+from numba.cuda.runtime.nrt import NRT_LIBRARY
 import os
 import subprocess
@@ -57,6 +58,57 @@ def disassemble_cubin_for_cfg(cubin):
     return run_nvdisasm(cubin, flags)
+class ExternalCodeLibrary(CodeLibrary):
+    """Holds code produced externally, for linking with generated code."""
+    def __init__(self, codegen, name):
+        super().__init__(codegen, name)
+        # Files to link
+        self._linking_files = set()
+        # Setup and teardown functions for the module.
+        # The order is determined by the order they are added to the codelib.
+        self._setup_functions = []
+        self._teardown_functions = []
+    @property
+    def modules(self):
+        # There are no LLVM IR modules in an ExternalCodeLibrary
+        return set()
+    def add_linking_file(self, path_or_obj):
+        # Adding new files after finalization is prohibited, in case the list
+        # of libraries has already been added to another code library; the
+        # newly-added files would be omitted from their linking process.
+        self._raise_if_finalized()
+        if isinstance(path_or_obj, LinkableCode):
+            if path_or_obj.setup_callback:
+                self._setup_functions.append(path_or_obj.setup_callback)
+            if path_or_obj.teardown_callback:
+                self._teardown_functions.append(path_or_obj.teardown_callback)
+        self._linking_files.add(path_or_obj)
+    def add_ir_module(self, module):
+        raise NotImplementedError("Cannot add LLVM IR to external code")
+    def add_linking_library(self, library):
+        raise NotImplementedError("Cannot add libraries to external code")
+    def finalize(self):
+        self._raise_if_finalized()
+        self._finalized = True
+    def get_asm_str(self):
+        raise NotImplementedError("No assembly for external code")
+    def get_llvm_str(self):
+        raise NotImplementedError("No LLVM IR for external code")
+    def get_function(self, name):
+        raise NotImplementedError("Cannot get function from external code")
 class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
     """
     The CUDACodeLibrary generates PTX, SASS, cubins for multiple different
@@ -297,6 +349,9 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
         self._raise_if_finalized()
         self._linking_libraries.add(library)
+        self._linking_files.update(library._linking_files)
+        self._setup_functions.extend(library._setup_functions)
+        self._teardown_functions.extend(library._teardown_functions)
     def add_linking_file(self, path_or_obj):
         if isinstance(path_or_obj, LinkableCode):
@@ -362,9 +417,17 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
         but loaded functions are discarded. They are recreated when needed
         after deserialization.
         """
+        nrt = False
         if self._linking_files:
-            msg = "Cannot pickle CUDACodeLibrary with linking files"
-            raise RuntimeError(msg)
+            if (
+                len(self._linking_files) == 1
+                and NRT_LIBRARY in self._linking_files
+            ):
+                nrt = True
+            else:
+                msg = "Cannot pickle CUDACodeLibrary with linking files"
+                raise RuntimeError(msg)
         if not self._finalized:
             raise RuntimeError("Cannot pickle unfinalized CUDACodeLibrary")
         return dict(
@@ -378,6 +441,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
             max_registers=self._max_registers,
             nvvm_options=self._nvvm_options,
             needs_cudadevrt=self.needs_cudadevrt,
+            nrt=nrt,
         )
     @classmethod
@@ -393,6 +457,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
         max_registers,
         nvvm_options,
         needs_cudadevrt,
+        nrt,
     ):
         """
         Rebuild an instance.
@@ -409,6 +474,8 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
         instance.needs_cudadevrt = needs_cudadevrt
         instance._finalized = True
+        if nrt:
+            instance._linking_files = {NRT_LIBRARY}
         return instance

numba_cuda/numba/cuda/compiler.py CHANGED Viewed

@@ -1,5 +1,4 @@
 from llvmlite import ir
-from numba.core.typing.templates import ConcreteTemplate
 from numba.core import ir as numba_ir
 from numba.core import (
     cgutils,
@@ -37,6 +36,7 @@ from numba.core.typed_passes import (
 from warnings import warn
 from numba.cuda import nvvmutils
 from numba.cuda.api import get_current_device
+from numba.cuda.codegen import ExternalCodeLibrary
 from numba.cuda.cudadrv import nvvm
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.target import CUDACABICallConv
@@ -278,7 +278,7 @@ def compile_cuda(
     args,
     debug=False,
     lineinfo=False,
-    inline=False,
+    forceinline=False,
     fastmath=False,
     nvvm_options=None,
     cc=None,
@@ -316,7 +316,7 @@ def compile_cuda(
     else:
         flags.error_model = "numpy"
-    if inline:
+    if forceinline:
         flags.forceinline = True
     if fastmath:
         flags.fastmath = True
@@ -574,6 +574,7 @@ def compile(
     abi="c",
     abi_info=None,
     output="ptx",
+    forceinline=False,
 ):
     """Compile a Python function to PTX or LTO-IR for a given set of argument
     types.
@@ -614,6 +615,11 @@ def compile(
     :type abi_info: dict
     :param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
     :type output: str
+    :param forceinline: Enables inlining at the NVVM IR level when set to
+                        ``True``. This is accomplished by adding the
+                        ``alwaysinline`` function attribute to the function
+                        definition. This is only valid when the output is
+                        ``"ltoir"``.
     :return: (code, resty): The compiled code and inferred return type
     :rtype: tuple
     """
@@ -626,6 +632,12 @@ def compile(
     if output not in ("ptx", "ltoir"):
         raise NotImplementedError(f"Unsupported output type: {output}")
+    if forceinline and not device:
+        raise ValueError("Cannot force-inline kernels")
+    if forceinline and output != "ltoir":
+        raise ValueError("Can only designate forced inlining in LTO-IR")
     debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
     opt = (config.OPT != 0) if opt is None else opt
@@ -660,6 +672,7 @@ def compile(
         fastmath=fastmath,
         nvvm_options=nvvm_options,
         cc=cc,
+        forceinline=forceinline,
     )
     resty = cres.signature.return_type
@@ -699,6 +712,7 @@ def compile_for_current_device(
     abi="c",
     abi_info=None,
     output="ptx",
+    forceinline=False,
 ):
     """Compile a Python function to PTX or LTO-IR for a given signature for the
     current device's compute capabilility. This calls :func:`compile` with an
@@ -716,6 +730,7 @@ def compile_for_current_device(
         abi=abi,
         abi_info=abi_info,
         output=output,
+        forceinline=forceinline,
     )
@@ -730,6 +745,7 @@ def compile_ptx(
     opt=None,
     abi="numba",
     abi_info=None,
+    forceinline=False,
 ):
     """Compile a Python function to PTX for a given signature. See
     :func:`compile`. The defaults for this function are to compile a kernel
@@ -747,6 +763,7 @@ def compile_ptx(
         abi=abi,
         abi_info=abi_info,
         output="ptx",
+        forceinline=forceinline,
     )
@@ -760,6 +777,7 @@ def compile_ptx_for_current_device(
     opt=None,
     abi="numba",
     abi_info=None,
+    forceinline=False,
 ):
     """Compile a Python function to PTX for a given signature for the current
     device's compute capabilility. See :func:`compile_ptx`."""
@@ -775,36 +793,42 @@ def compile_ptx_for_current_device(
         opt=opt,
         abi=abi,
         abi_info=abi_info,
+        forceinline=forceinline,
     )
 def declare_device_function(name, restype, argtypes, link):
-    return declare_device_function_template(name, restype, argtypes, link).key
-def declare_device_function_template(name, restype, argtypes, link):
     from .descriptor import cuda_target
     typingctx = cuda_target.typing_context
     targetctx = cuda_target.target_context
     sig = typing.signature(restype, *argtypes)
-    extfn = ExternFunction(name, sig, link)
-    class device_function_template(ConcreteTemplate):
-        key = extfn
-        cases = [sig]
+    # extfn is the descriptor used to call the function from Python code, and
+    # is used as the key for typing and lowering.
+    extfn = ExternFunction(name, sig)
-    fndesc = funcdesc.ExternalFunctionDescriptor(
-        name=name, restype=restype, argtypes=argtypes
-    )
+    # Typing
+    device_function_template = typing.make_concrete_template(name, extfn, [sig])
     typingctx.insert_user_function(extfn, device_function_template)
-    targetctx.insert_user_function(extfn, fndesc)
+    # Lowering
+    lib = ExternalCodeLibrary(f"{name}_externals", targetctx.codegen())
+    for file in link:
+        lib.add_linking_file(file)
+    # ExternalFunctionDescriptor provides a lowering implementation for calling
+    # external functions
+    fndesc = funcdesc.ExternalFunctionDescriptor(name, restype, argtypes)
+    targetctx.insert_user_function(extfn, fndesc, libs=(lib,))
     return device_function_template
 class ExternFunction:
-    def __init__(self, name, sig, link):
+    """A descriptor that can be used to call the external function from within
+    a Python kernel."""
+    def __init__(self, name, sig):
         self.name = name
         self.sig = sig
-        self.link = link

numba_cuda/numba/cuda/cudadecl.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import operator
-from numba.core import types
+from numba.core import errors, types
 from numba.core.typing.npydecl import (
     parse_dtype,
     parse_shape,
@@ -21,7 +21,7 @@ from numba.core.typing.templates import (
 from numba.cuda.types import dim3
 from numba.core.typeconv import Conversion
 from numba import cuda
-from numba.cuda.compiler import declare_device_function_template
+from numba.cuda.compiler import declare_device_function
 registry = Registry()
 register = registry.register
@@ -33,7 +33,7 @@ register_number_classes(register_global)
 class Cuda_array_decl(CallableTemplate):
     def generic(self):
-        def typer(shape, dtype):
+        def typer(shape, dtype, alignment=None):
             # Only integer literals and tuples of integer literals are valid
             # shapes
             if isinstance(shape, types.Integer):
@@ -47,6 +47,16 @@ class Cuda_array_decl(CallableTemplate):
             else:
                 return None
+            if alignment is not None:
+                permitted = (types.IntegerLiteral, types.NoneType)
+                if not isinstance(alignment, permitted):
+                    msg = "alignment must be a constant integer"
+                    raise errors.RequireLiteralValue(msg)
+            # N.B. We don't use alignment for typing; it's not part of
+            #      types.Array.  The value supplied to the array declaration
+            #      is handled in the lowering.
             ndim = parse_shape(shape)
             nb_dtype = parse_dtype(dtype)
             if nb_dtype is not None and ndim is not None:
@@ -412,7 +422,7 @@ _genfp16_binary_operator(operator.itruediv)
 def _resolve_wrapped_unary(fname):
     link = tuple()
-    decl = declare_device_function_template(
+    decl = declare_device_function(
         f"__numba_wrapper_{fname}", types.float16, (types.float16,), link
     )
     return types.Function(decl)
@@ -420,7 +430,7 @@ def _resolve_wrapped_unary(fname):
 def _resolve_wrapped_binary(fname):
     link = tuple()
-    decl = declare_device_function_template(
+    decl = declare_device_function(
         f"__numba_wrapper_{fname}",
         types.float16,
         (

numba_cuda/numba/cuda/cudadrv/driver.py CHANGED Viewed

@@ -49,7 +49,7 @@ from .drvapi import API_PROTOTYPES
 from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
 from .mappings import FILE_EXTENSION_MAP
 from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
-from numba.cuda.utils import _readenv
+from numba.cuda.utils import _readenv, cached_file_read
 from numba.cuda.cudadrv import enums, drvapi, nvrtc
 try:
@@ -2797,13 +2797,16 @@ class Linker(metaclass=ABCMeta):
         ptx_name = os.path.splitext(name)[0] + ".ptx"
         self.add_ptx(ptx.encode(), ptx_name)
+    @abstractmethod
+    def add_data(self, data, kind, name):
+        """Add in-memory data to the link"""
     @abstractmethod
     def add_file(self, path, kind):
         """Add code from a file to the link"""
     def add_cu_file(self, path):
-        with open(path, "rb") as f:
-            cu = f.read()
+        cu = cached_file_read(path, how="rb")
         self.add_cu(cu, os.path.basename(path))
     def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
@@ -2948,6 +2951,10 @@ class MVCLinker(Linker):
         except CubinLinkerError as e:
             raise LinkerError from e
+    def add_data(self, data, kind, name):
+        msg = "Adding in-memory data unsupported in the MVC linker"
+        raise LinkerError(msg)
     def add_file(self, path, kind):
         try:
             from cubinlinker import CubinLinkerError
@@ -2955,8 +2962,7 @@ class MVCLinker(Linker):
             raise ImportError(_MVC_ERROR_MESSAGE) from err
         try:
-            with open(path, "rb") as f:
-                data = f.read()
+            data = cached_file_read(path, how="rb")
         except FileNotFoundError:
             raise LinkerError(f"{path} not found")
@@ -3046,17 +3052,32 @@ class CtypesLinker(Linker):
     def error_log(self):
         return self.linker_errors_buf.value.decode("utf8")
-    def add_ptx(self, ptx, name="<cudapy-ptx>"):
-        ptxbuf = c_char_p(ptx)
-        namebuf = c_char_p(name.encode("utf8"))
-        self._keep_alive += [ptxbuf, namebuf]
+    def add_cubin(self, cubin, name="<unnamed-cubin>"):
+        return self._add_data(enums.CU_JIT_INPUT_CUBIN, cubin, name)
+    def add_ptx(self, ptx, name="<unnamed-ptx>"):
+        return self._add_data(enums.CU_JIT_INPUT_PTX, ptx, name)
+    def add_object(self, object_, name="<unnamed-object>"):
+        return self._add_data(enums.CU_JIT_INPUT_OBJECT, object_, name)
+    def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
+        return self._add_data(enums.CU_JIT_INPUT_FATBINARY, fatbin, name)
+    def add_library(self, library, name="<unnamed-library>"):
+        return self._add_data(enums.CU_JIT_INPUT_LIBRARY, library, name)
+    def _add_data(self, input_type, data, name):
+        data_buffer = c_char_p(data)
+        name_buffer = c_char_p(name.encode("utf8"))
+        self._keep_alive += [data_buffer, name_buffer]
         try:
             driver.cuLinkAddData(
                 self.handle,
-                enums.CU_JIT_INPUT_PTX,
-                ptxbuf,
-                len(ptx),
-                namebuf,
+                input_type,
+                data_buffer,
+                len(data),
+                name_buffer,
                 0,
                 None,
                 None,
@@ -3064,6 +3085,28 @@ class CtypesLinker(Linker):
         except CudaAPIError as e:
             raise LinkerError("%s\n%s" % (e, self.error_log))
+    def add_data(self, data, kind, name=None):
+        # We pass the name as **kwargs to ensure the default name for the input
+        # type is used if none is supplied
+        kws = {}
+        if name is not None:
+            kws["name"] = name
+        if kind == FILE_EXTENSION_MAP["cubin"]:
+            self.add_cubin(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["fatbin"]:
+            self.add_fatbin(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["a"]:
+            self.add_library(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["ptx"]:
+            self.add_ptx(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["o"]:
+            self.add_object(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["ltoir"]:
+            raise LinkerError("Ctypes linker cannot link LTO-IR")
+        else:
+            raise LinkerError(f"Don't know how to link {kind}")
     def add_file(self, path, kind):
         pathbuf = c_char_p(path.encode("utf8"))
         self._keep_alive.append(pathbuf)
@@ -3151,17 +3194,58 @@ class CudaPythonLinker(Linker):
     def error_log(self):
         return self.linker_errors_buf.decode("utf8")
-    def add_ptx(self, ptx, name="<cudapy-ptx>"):
-        namebuf = name.encode("utf8")
-        self._keep_alive += [ptx, namebuf]
+    def add_cubin(self, cubin, name="<unnamed-cubin>"):
+        input_type = binding.CUjitInputType.CU_JIT_INPUT_CUBIN
+        return self._add_data(input_type, cubin, name)
+    def add_ptx(self, ptx, name="<unnamed-ptx>"):
+        input_type = binding.CUjitInputType.CU_JIT_INPUT_PTX
+        return self._add_data(input_type, ptx, name)
+    def add_object(self, object_, name="<unnamed-object>"):
+        input_type = binding.CUjitInputType.CU_JIT_INPUT_OBJECT
+        return self._add_data(input_type, object_, name)
+    def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
+        input_type = binding.CUjitInputType.CU_JIT_INPUT_FATBINARY
+        return self._add_data(input_type, fatbin, name)
+    def add_library(self, library, name="<unnamed-library>"):
+        input_type = binding.CUjitInputType.CU_JIT_INPUT_LIBRARY
+        return self._add_data(input_type, library, name)
+    def _add_data(self, input_type, data, name):
+        name_buffer = name.encode("utf8")
+        self._keep_alive += [data, name_buffer]
         try:
-            input_ptx = binding.CUjitInputType.CU_JIT_INPUT_PTX
             driver.cuLinkAddData(
-                self.handle, input_ptx, ptx, len(ptx), namebuf, 0, [], []
+                self.handle, input_type, data, len(data), name_buffer, 0, [], []
             )
         except CudaAPIError as e:
             raise LinkerError("%s\n%s" % (e, self.error_log))
+    def add_data(self, data, kind, name=None):
+        # We pass the name as **kwargs to ensure the default name for the input
+        # type is used if none is supplied
+        kws = {}
+        if name is not None:
+            kws["name"] = name
+        if kind == FILE_EXTENSION_MAP["cubin"]:
+            self.add_cubin(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["fatbin"]:
+            self.add_fatbin(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["a"]:
+            self.add_library(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["ptx"]:
+            self.add_ptx(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["o"]:
+            self.add_object(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["ltoir"]:
+            raise LinkerError("CudaPythonLinker cannot link LTO-IR")
+        else:
+            raise LinkerError(f"Don't know how to link {kind}")
     def add_file(self, path, kind):
         pathbuf = path.encode("utf8")
         self._keep_alive.append(pathbuf)
@@ -3252,8 +3336,7 @@ class PyNvJitLinker(Linker):
     def add_file(self, path, kind):
         try:
-            with open(path, "rb") as f:
-                data = f.read()
+            data = cached_file_read(path, "rb")
         except FileNotFoundError:
             raise LinkerError(f"{path} not found")

numba_cuda/numba/cuda/cudadrv/linkable_code.py CHANGED Viewed

@@ -16,16 +16,24 @@ class LinkableCode:
     :param teardown_callback: A function called just prior to the unloading of
                               a module that has this code object linked into
                               it.
+    :param nrt: If True, assume this object contains NRT function calls and
+                add NRT source code to the final link.
     """
     def __init__(
-        self, data, name=None, setup_callback=None, teardown_callback=None
+        self,
+        data,
+        name=None,
+        setup_callback=None,
+        teardown_callback=None,
+        nrt=False,
     ):
         if setup_callback and not callable(setup_callback):
             raise TypeError("setup_callback must be callable")
         if teardown_callback and not callable(teardown_callback):
             raise TypeError("teardown_callback must be callable")
+        self.nrt = nrt
         self._name = name
         self._data = data
         self.setup_callback = setup_callback
@@ -87,5 +95,5 @@ class Object(LinkableCode):
 class LTOIR(LinkableCode):
     """An LTOIR file in memory."""
-    kind = "ltoir"
+    kind = FILE_EXTENSION_MAP["ltoir"]
     default_name = "<unnamed-ltoir>"

numba-cuda 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

numba-cuda 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl