PyPI - numba-cuda - Versions diffs - 0.17.0__py3-none-any.whl → 0.18.1__py3-none-any.whl - Mend

numba-cuda 0.17.0py3-none-any.whl → 0.18.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of numba-cuda might be problematic. Click here for more details.

Files changed (64) hide show

numba_cuda/VERSION +1 -1
numba_cuda/numba/cuda/__init__.py +0 -8
numba_cuda/numba/cuda/_internal/cuda_fp16.py +14225 -0
numba_cuda/numba/cuda/api_util.py +6 -0
numba_cuda/numba/cuda/cgutils.py +1291 -0
numba_cuda/numba/cuda/codegen.py +32 -14
numba_cuda/numba/cuda/compiler.py +113 -10
numba_cuda/numba/cuda/core/caching.py +741 -0
numba_cuda/numba/cuda/core/callconv.py +338 -0
numba_cuda/numba/cuda/core/codegen.py +168 -0
numba_cuda/numba/cuda/core/compiler.py +205 -0
numba_cuda/numba/cuda/core/typed_passes.py +139 -0
numba_cuda/numba/cuda/cudadecl.py +0 -268
numba_cuda/numba/cuda/cudadrv/devicearray.py +3 -0
numba_cuda/numba/cuda/cudadrv/driver.py +2 -1
numba_cuda/numba/cuda/cudadrv/nvvm.py +1 -1
numba_cuda/numba/cuda/cudaimpl.py +4 -178
numba_cuda/numba/cuda/debuginfo.py +469 -3
numba_cuda/numba/cuda/device_init.py +0 -1
numba_cuda/numba/cuda/dispatcher.py +310 -11
numba_cuda/numba/cuda/extending.py +2 -1
numba_cuda/numba/cuda/fp16.py +348 -0
numba_cuda/numba/cuda/intrinsics.py +1 -1
numba_cuda/numba/cuda/libdeviceimpl.py +2 -1
numba_cuda/numba/cuda/lowering.py +1833 -8
numba_cuda/numba/cuda/mathimpl.py +2 -90
numba_cuda/numba/cuda/nvvmutils.py +2 -1
numba_cuda/numba/cuda/printimpl.py +2 -1
numba_cuda/numba/cuda/serialize.py +264 -0
numba_cuda/numba/cuda/simulator/__init__.py +2 -0
numba_cuda/numba/cuda/simulator/dispatcher.py +7 -0
numba_cuda/numba/cuda/stubs.py +0 -308
numba_cuda/numba/cuda/target.py +13 -5
numba_cuda/numba/cuda/testing.py +156 -5
numba_cuda/numba/cuda/tests/complex_usecases.py +113 -0
numba_cuda/numba/cuda/tests/core/serialize_usecases.py +110 -0
numba_cuda/numba/cuda/tests/core/test_serialize.py +359 -0
numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +10 -4
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +33 -0
numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +2 -2
numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +1 -0
numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_caching.py +5 -10
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +15 -0
numba_cuda/numba/cuda/tests/cudapy/test_complex.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +381 -0
numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +108 -24
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +37 -23
numba_cuda/numba/cuda/tests/cudapy/test_operator.py +43 -27
numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +26 -9
numba_cuda/numba/cuda/tests/cudapy/test_warning.py +27 -2
numba_cuda/numba/cuda/tests/enum_usecases.py +56 -0
numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +1 -2
numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +1 -1
numba_cuda/numba/cuda/utils.py +785 -0
numba_cuda/numba/cuda/vector_types.py +1 -1
{numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/METADATA +18 -4
{numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/RECORD +63 -50
numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -46
{numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/WHEEL +0 -0
{numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/licenses/LICENSE +0 -0
{numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/codegen.py CHANGED Viewed

@@ -1,8 +1,9 @@
 from llvmlite import ir
-from numba.core import config, serialize
-from numba.core.codegen import Codegen, CodeLibrary
-from .cudadrv import devices, driver, nvrtc, nvvm, runtime
+from numba.core import config
+from numba.cuda import serialize
+from .cudadrv import devices, driver, nvvm, runtime, nvrtc
+from numba.cuda.core.codegen import Codegen, CodeLibrary
 from numba.cuda.cudadrv.libs import get_cudalib
 from numba.cuda.cudadrv.linkable_code import LinkableCode
 from numba.cuda.memory_management.nrt import NRT_LIBRARY
@@ -233,6 +234,33 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
         return ptx
+    def get_lto_ptx(self, cc=None):
+        """
+        Get the PTX code after LTO.
+        """
+        if not self._lto:
+            raise RuntimeError("LTO is not enabled")
+        if not driver._have_nvjitlink():
+            raise RuntimeError("Link time optimization requires nvJitLink.")
+        cc = self._ensure_cc(cc)
+        linker = driver._Linker.new(
+            max_registers=self._max_registers,
+            cc=cc,
+            additional_flags=["-ptx"],
+            lto=self._lto,
+        )
+        self._link_all(linker, cc, ignore_nonlto=True)
+        ptx = linker.get_linked_ptx()
+        ptx = ptx.decode("utf-8")
+        return ptx
     def get_ltoir(self, cc=None):
         cc = self._ensure_cc(cc)
@@ -274,17 +302,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
             return cubin
         if self._lto and config.DUMP_ASSEMBLY:
-            linker = driver._Linker.new(
-                max_registers=self._max_registers,
-                cc=cc,
-                additional_flags=["-ptx"],
-                lto=self._lto,
-            )
-            # `-ptx` flag is meant to view the optimized PTX for LTO objects.
-            # Non-LTO objects are not passed to linker.
-            self._link_all(linker, cc, ignore_nonlto=True)
-            ptx = linker.get_linked_ptx()
-            ptx = ptx.decode("utf-8")
+            ptx = self.get_lto_ptx(cc=cc)
             print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, "-"))
             print(ptx)

numba_cuda/numba/cuda/compiler.py CHANGED Viewed

@@ -1,21 +1,20 @@
 from llvmlite import ir
+from collections import namedtuple
 from numba.core import ir as numba_ir
+from numba.cuda import cgutils
 from numba.core import (
-    cgutils,
     types,
     typing,
     funcdesc,
     config,
     compiler,
     sigutils,
-    utils,
 )
 from numba.core.compiler import (
     sanitize_compile_result_entries,
-    CompilerBase,
     DefaultPassBuilder,
-    CompileResult,
 )
+from numba.cuda.core.compiler import CompilerBase
 from numba.core.compiler_lock import global_compiler_lock
 from numba.core.compiler_machinery import (
     FunctionPass,
@@ -28,42 +27,146 @@ from numba.core.errors import NumbaInvalidConfigWarning
 from numba.core.untyped_passes import TranslateByteCode
 from numba.core.typed_passes import (
     IRLegalization,
-    NativeLowering,
     AnnotateTypes,
 )
 from warnings import warn
 from numba.cuda import nvvmutils
 from numba.cuda.api import get_current_device
 from numba.cuda.codegen import ExternalCodeLibrary
+from numba.cuda.core.typed_passes import BaseNativeLowering
 from numba.cuda.cudadrv import nvvm, nvrtc
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.flags import CUDAFlags
 from numba.cuda.target import CUDACABICallConv
-from numba.cuda import lowering
+from numba.cuda import lowering, utils
 # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
 # id.  This is because the entry point is used as a key into a dict of
 # overloads by the base dispatcher. The id of the CCR is the only small and
-# unique property of a CompileResult in the CUDA target (cf. the CPU target,
+# unique property of a CUDACompileResult in the CUDA target (cf. the CPU target,
 # which uses its entry_point, which is a pointer value).
 #
 # This does feel a little hackish, and there are two ways in which this could
 # be improved:
 #
-# 1. We could change the core of Numba so that each CompileResult has its own
+# 1. We could change the CUDACompileResult so that each instance has its own
 #    unique ID that can be used as a key - e.g. a count, similar to the way in
 #    which types have unique counts.
 # 2. At some future time when kernel launch uses a compiled function, the entry
 #    point will no longer need to be a synthetic value, but will instead be a
 #    pointer to the compiled function as in the CPU target.
+CR_FIELDS = [
+    "typing_context",
+    "target_context",
+    "entry_point",
+    "typing_error",
+    "type_annotation",
+    "signature",
+    "objectmode",
+    "lifted",
+    "fndesc",
+    "library",
+    "call_helper",
+    "environment",
+    "metadata",
+    # List of functions to call to initialize on unserialization
+    # (i.e cache load).
+    "reload_init",
+    "referenced_envs",
+]
+class CUDACompileResult(namedtuple("_CompileResult", CR_FIELDS)):
+    """
+    A structure holding results from the compilation of a function.
+    """
+    __slots__ = ()
-class CUDACompileResult(CompileResult):
     @property
     def entry_point(self):
         return id(self)
+    def _reduce(self):
+        """
+        Reduce a CompileResult to picklable components.
+        """
+        libdata = self.library.serialize_using_object_code()
+        # Make it (un)picklable efficiently
+        typeann = str(self.type_annotation)
+        fndesc = self.fndesc
+        # Those don't need to be pickled and may fail
+        fndesc.typemap = fndesc.calltypes = None
+        # The CUDA target does not reference environments
+        referenced_envs = tuple()
+        return (
+            libdata,
+            self.fndesc,
+            self.environment,
+            self.signature,
+            self.objectmode,
+            self.lifted,
+            typeann,
+            self.reload_init,
+            referenced_envs,
+        )
+    @classmethod
+    def _rebuild(
+        cls,
+        target_context,
+        libdata,
+        fndesc,
+        env,
+        signature,
+        objectmode,
+        lifted,
+        typeann,
+        reload_init,
+        referenced_envs,
+    ):
+        if reload_init:
+            # Re-run all
+            for fn in reload_init:
+                fn()
+        library = target_context.codegen().unserialize_library(libdata)
+        cfunc = target_context.get_executable(library, fndesc, env)
+        cr = cls(
+            target_context=target_context,
+            typing_context=target_context.typing_context,
+            library=library,
+            environment=env,
+            entry_point=cfunc,
+            fndesc=fndesc,
+            type_annotation=typeann,
+            signature=signature,
+            objectmode=objectmode,
+            lifted=lifted,
+            typing_error=None,
+            call_helper=None,
+            metadata=None,  # Do not store, arbitrary & potentially large!
+            reload_init=reload_init,
+            referenced_envs=referenced_envs,
+        )
+        # Load Environments
+        for env in referenced_envs:
+            library.codegen.set_env(env.env_name, env)
+        return cr
+    @property
+    def codegen(self):
+        return self.target_context.codegen()
+    def dump(self, tab=""):
+        print(f"{tab}DUMP {type(self).__name__} {self.entry_point}")
+        self.signature.dump(tab=tab + "  ")
+        print(f"{tab}END DUMP")
 def cuda_compile_result(**entries):
     entries = sanitize_compile_result_entries(entries)
@@ -129,7 +232,7 @@ class CreateLibrary(LoweringPass):
 @register_pass(mutates_CFG=True, analysis_only=False)
-class CUDANativeLowering(NativeLowering):
+class CUDANativeLowering(BaseNativeLowering):
     """Lowering pass for a CUDA native function IR described solely in terms of
     Numba's standard `numba.core.ir` nodes."""

numba-cuda 0.17.0__py3-none-any.whl → 0.18.1__py3-none-any.whl

Potentially problematic release.

numba-cuda 0.17.0py3-none-any.whl → 0.18.1py3-none-any.whl