PyPI - numba-cuda - Versions diffs - 0.15.2__py3-none-any.whl → 0.17.0__py3-none-any.whl - Mend

numba-cuda 0.15.2py3-none-any.whl → 0.17.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

numba_cuda/VERSION +1 -1
numba_cuda/numba/cuda/__init__.py +35 -1
numba_cuda/numba/cuda/codegen.py +11 -9
numba_cuda/numba/cuda/compiler.py +3 -39
numba_cuda/numba/cuda/cuda_paths.py +21 -23
numba_cuda/numba/cuda/cudadrv/devices.py +4 -6
numba_cuda/numba/cuda/cudadrv/driver.py +300 -335
numba_cuda/numba/cuda/cudadrv/error.py +4 -0
numba_cuda/numba/cuda/cudadrv/libs.py +1 -1
numba_cuda/numba/cuda/cudadrv/mappings.py +8 -9
numba_cuda/numba/cuda/cudadrv/nvrtc.py +153 -108
numba_cuda/numba/cuda/cudadrv/nvvm.py +1 -197
numba_cuda/numba/cuda/cudadrv/runtime.py +5 -136
numba_cuda/numba/cuda/decorators.py +18 -0
numba_cuda/numba/cuda/dispatcher.py +3 -3
numba_cuda/numba/cuda/flags.py +36 -0
numba_cuda/numba/cuda/memory_management/nrt.py +3 -3
numba_cuda/numba/cuda/simulator/cudadrv/driver.py +6 -2
numba_cuda/numba/cuda/target.py +55 -2
numba_cuda/numba/cuda/testing.py +0 -22
numba_cuda/numba/cuda/tests/__init__.py +0 -2
numba_cuda/numba/cuda/tests/cudadrv/__init__.py +0 -2
numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +31 -6
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +5 -1
numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +17 -6
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +9 -167
numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +27 -0
numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -19
numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +1 -37
numba_cuda/numba/cuda/tests/cudapy/__init__.py +0 -2
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +1 -5
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +0 -9
numba_cuda/numba/cuda/tests/cudapy/test_errors.py +14 -0
numba_cuda/numba/cuda/tests/cudapy/test_exception.py +0 -6
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -1
numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +0 -4
numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +18 -0
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +2 -5
numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +0 -7
numba_cuda/numba/cuda/tests/nocuda/__init__.py +0 -2
numba_cuda/numba/cuda/tests/nrt/__init__.py +0 -2
numba_cuda/numba/cuda/tests/nrt/test_nrt.py +10 -1
{numba_cuda-0.15.2.dist-info → numba_cuda-0.17.0.dist-info}/METADATA +5 -2
{numba_cuda-0.15.2.dist-info → numba_cuda-0.17.0.dist-info}/RECORD +49 -47
{numba_cuda-0.15.2.dist-info → numba_cuda-0.17.0.dist-info}/WHEEL +0 -0
{numba_cuda-0.15.2.dist-info → numba_cuda-0.17.0.dist-info}/licenses/LICENSE +0 -0
{numba_cuda-0.15.2.dist-info → numba_cuda-0.17.0.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/cudadrv/driver.py CHANGED Viewed

@@ -42,6 +42,7 @@ import importlib
 import numpy as np
 from collections import namedtuple, deque
 from numba import mviewbuf
 from numba.core import utils, serialize, config
 from .error import CudaSupportError, CudaDriverError
@@ -58,6 +59,22 @@ except ImportError:
     NvJitLinker, NvJitLinkError = None, None
+USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING
+if USE_NV_BINDING:
+    from cuda.bindings import driver as binding
+    from cuda.core.experimental import (
+        Linker,
+        LinkerOptions,
+        ObjectCode,
+    )
+    # There is no definition of the default stream in the Nvidia bindings (nor
+    # is there at the C/C++ level), so we define it here so we don't need to
+    # use a magic number 0 in places where we want the default stream.
+    CU_STREAM_DEFAULT = 0
 MIN_REQUIRED_CC = (3, 5)
 SUPPORTS_IPC = sys.platform.startswith("linux")
@@ -108,6 +125,25 @@ def make_logger():
     return logger
+@functools.cache
+def _have_nvjitlink():
+    if not USE_NV_BINDING:
+        return False
+    try:
+        from cuda.bindings._internal import nvjitlink as nvjitlink_internal
+        from cuda.bindings._internal.utils import NotSupportedError
+    except ImportError:
+        return False
+    try:
+        return (
+            nvjitlink_internal._inspect_function_pointer("__nvJitLinkVersion")
+            != 0
+        )
+    except NotSupportedError:
+        # no driver
+        return False
 class DeadMemoryError(RuntimeError):
     pass
@@ -454,11 +490,11 @@ class Driver(object):
         with self.get_active_context() as ac:
             if ac.devnum is not None:
                 if USE_NV_BINDING:
-                    return driver.cuCtxPopCurrent()
+                    popped = drvapi.cu_context(int(driver.cuCtxPopCurrent()))
                 else:
                     popped = drvapi.cu_context()
                     driver.cuCtxPopCurrent(byref(popped))
-                    return popped
+                return popped
     def get_active_context(self):
         """Returns an instance of ``_ActiveContext``."""
@@ -502,6 +538,8 @@ class _ActiveContext(object):
                 hctx = driver.cuCtxGetCurrent()
                 if int(hctx) == 0:
                     hctx = None
+                else:
+                    hctx = drvapi.cu_context(int(hctx))
             else:
                 hctx = drvapi.cu_context(0)
                 driver.cuCtxGetCurrent(byref(hctx))
@@ -680,6 +718,7 @@ class Device(object):
         # create primary context
         if USE_NV_BINDING:
             hctx = driver.cuDevicePrimaryCtxRetain(self.id)
+            hctx = drvapi.cu_context(int(hctx))
         else:
             hctx = drvapi.cu_context()
             driver.cuDevicePrimaryCtxRetain(byref(hctx), self.id)
@@ -1218,6 +1257,7 @@ class _PendingDeallocs(object):
                 [dtor, handle, size] = self._cons.popleft()
                 _logger.info("dealloc: %s %s bytes", dtor.__name__, size)
                 dtor(handle)
             self._size = 0
     @contextlib.contextmanager
@@ -1394,7 +1434,10 @@ class Context(object):
         """
         Pushes this context on the current CPU Thread.
         """
-        driver.cuCtxPushCurrent(self.handle)
+        if USE_NV_BINDING:
+            driver.cuCtxPushCurrent(self.handle.value)
+        else:
+            driver.cuCtxPushCurrent(self.handle)
         self.prepare_for_use()
     def pop(self):
@@ -1403,10 +1446,7 @@ class Context(object):
         must be at the top of the context stack, otherwise an error will occur.
         """
         popped = driver.pop_active_context()
-        if USE_NV_BINDING:
-            assert int(popped) == int(self.handle)
-        else:
-            assert popped.value == self.handle.value
+        assert popped.value == self.handle.value
     def memalloc(self, bytesize):
         return self.memory_manager.memalloc(bytesize)
@@ -1472,7 +1512,7 @@ class Context(object):
         if isinstance(ptx, str):
             ptx = ptx.encode("utf8")
         if USE_NV_BINDING:
-            image = ptx
+            image = ObjectCode.from_ptx(ptx)
         else:
             image = c_char_p(ptx)
         return self.create_module_image(image)
@@ -1499,21 +1539,25 @@ class Context(object):
     def get_default_stream(self):
         if USE_NV_BINDING:
-            handle = binding.CUstream(CU_STREAM_DEFAULT)
+            handle = drvapi.cu_stream(int(binding.CUstream(CU_STREAM_DEFAULT)))
         else:
             handle = drvapi.cu_stream(drvapi.CU_STREAM_DEFAULT)
         return Stream(weakref.proxy(self), handle, None)
     def get_legacy_default_stream(self):
         if USE_NV_BINDING:
-            handle = binding.CUstream(binding.CU_STREAM_LEGACY)
+            handle = drvapi.cu_stream(
+                int(binding.CUstream(binding.CU_STREAM_LEGACY))
+            )
         else:
             handle = drvapi.cu_stream(drvapi.CU_STREAM_LEGACY)
         return Stream(weakref.proxy(self), handle, None)
     def get_per_thread_default_stream(self):
         if USE_NV_BINDING:
-            handle = binding.CUstream(binding.CU_STREAM_PER_THREAD)
+            handle = drvapi.cu_stream(
+                int(binding.CUstream(binding.CU_STREAM_PER_THREAD))
+            )
         else:
             handle = drvapi.cu_stream(drvapi.CU_STREAM_PER_THREAD)
         return Stream(weakref.proxy(self), handle, None)
@@ -1525,7 +1569,7 @@ class Context(object):
             # default stream, which we define also as CU_STREAM_DEFAULT when
             # the NV binding is in use).
             flags = binding.CUstream_flags.CU_STREAM_DEFAULT.value
-            handle = driver.cuStreamCreate(flags)
+            handle = drvapi.cu_stream(int(driver.cuStreamCreate(flags)))
         else:
             handle = drvapi.cu_stream()
             driver.cuStreamCreate(byref(handle), 0)
@@ -1539,7 +1583,7 @@ class Context(object):
         if not isinstance(ptr, int):
             raise TypeError("ptr for external stream must be an int")
         if USE_NV_BINDING:
-            handle = binding.CUstream(ptr)
+            handle = drvapi.cu_stream(int(binding.CUstream(ptr)))
         else:
             handle = drvapi.cu_stream(ptr)
         return Stream(weakref.proxy(self), handle, None, external=True)
@@ -1549,7 +1593,7 @@ class Context(object):
         if not timing:
             flags |= enums.CU_EVENT_DISABLE_TIMING
         if USE_NV_BINDING:
-            handle = driver.cuEventCreate(flags)
+            handle = drvapi.cu_event(int(driver.cuEventCreate(flags)))
         else:
             handle = drvapi.cu_event()
             driver.cuEventCreate(byref(handle), flags)
@@ -1615,7 +1659,6 @@ def load_module_image_ctypes(
     option_keys = (drvapi.cu_jit_option * len(options))(*options.keys())
     option_vals = (c_void_p * len(options))(*options.values())
     handle = drvapi.cu_module()
     try:
         driver.cuModuleLoadDataEx(
@@ -1662,7 +1705,7 @@ def load_module_image_cuda_python(
     try:
         handle = driver.cuModuleLoadDataEx(
-            image, len(options), option_keys, option_vals
+            image.code, len(options), option_keys, option_vals
         )
     except CudaAPIError as e:
         err_string = jiterrors.decode("utf-8")
@@ -1741,14 +1784,14 @@ def _pin_finalizer(memory_manager, ptr, alloc_key, mapped):
 def _event_finalizer(deallocs, handle):
     def core():
-        deallocs.add_item(driver.cuEventDestroy, handle)
+        deallocs.add_item(driver.cuEventDestroy, handle.value)
     return core
 def _stream_finalizer(deallocs, handle):
     def core():
-        deallocs.add_item(driver.cuStreamDestroy, handle)
+        deallocs.add_item(driver.cuStreamDestroy, handle.value)
     return core
@@ -2019,6 +2062,9 @@ class MemoryPointer(object):
     __cuda_memory__ = True
     def __init__(self, context, pointer, size, owner=None, finalizer=None):
+        if USE_NV_BINDING and isinstance(pointer, ctypes.c_void_p):
+            pointer = binding.CUdeviceptr(pointer.value)
         self.context = context
         self.device_pointer = pointer
         self.size = size
@@ -2051,9 +2097,11 @@ class MemoryPointer(object):
     def memset(self, byte, count=None, stream=0):
         count = self.size if count is None else count
         if stream:
-            driver.cuMemsetD8Async(
-                self.device_pointer, byte, count, stream.handle
-            )
+            if USE_NV_BINDING:
+                handle = stream.handle.value
+            else:
+                handle = stream.handle
+            driver.cuMemsetD8Async(self.device_pointer, byte, count, handle)
         else:
             driver.cuMemsetD8(self.device_pointer, byte, count)
@@ -2291,27 +2339,16 @@ class Stream(object):
             weakref.finalize(self, finalizer)
     def __int__(self):
-        if USE_NV_BINDING:
-            return int(self.handle)
-        else:
-            # The default stream's handle.value is 0, which gives `None`
-            return self.handle.value or drvapi.CU_STREAM_DEFAULT
+        # The default stream's handle.value is 0, which gives `None`
+        return self.handle.value or drvapi.CU_STREAM_DEFAULT
     def __repr__(self):
-        if USE_NV_BINDING:
-            default_streams = {
-                CU_STREAM_DEFAULT: "<Default CUDA stream on %s>",
-                binding.CU_STREAM_LEGACY: "<Legacy default CUDA stream on %s>",
-                binding.CU_STREAM_PER_THREAD: "<Per-thread default CUDA stream on %s>",
-            }
-            ptr = int(self.handle) or 0
-        else:
-            default_streams = {
-                drvapi.CU_STREAM_DEFAULT: "<Default CUDA stream on %s>",
-                drvapi.CU_STREAM_LEGACY: "<Legacy default CUDA stream on %s>",
-                drvapi.CU_STREAM_PER_THREAD: "<Per-thread default CUDA stream on %s>",
-            }
-            ptr = self.handle.value or drvapi.CU_STREAM_DEFAULT
+        default_streams = {
+            drvapi.CU_STREAM_DEFAULT: "<Default CUDA stream on %s>",
+            drvapi.CU_STREAM_LEGACY: "<Legacy default CUDA stream on %s>",
+            drvapi.CU_STREAM_PER_THREAD: "<Per-thread default CUDA stream on %s>",
+        }
+        ptr = self.handle.value or drvapi.CU_STREAM_DEFAULT
         if ptr in default_streams:
             return default_streams[ptr] % self.context
@@ -2325,7 +2362,11 @@ class Stream(object):
         Wait for all commands in this stream to execute. This will commit any
         pending memory transfers.
         """
-        driver.cuStreamSynchronize(self.handle)
+        if USE_NV_BINDING:
+            handle = self.handle.value
+        else:
+            handle = self.handle
+        driver.cuStreamSynchronize(handle)
     @contextlib.contextmanager
     def auto_synchronize(self):
@@ -2350,6 +2391,16 @@ class Stream(object):
         callback will block later work in the stream and may block other
         callbacks from being executed.
+        .. warning::
+            There is a potential for deadlock due to a lock ordering issue
+            between the GIL and the CUDA driver lock when using libraries
+            that call CUDA functions without releasing the GIL. This can
+            occur when the callback function, which holds the CUDA driver lock,
+            attempts to acquire the GIL while another thread that holds the GIL
+            is waiting for the CUDA driver lock. Consider using libraries that
+            properly release the GIL around CUDA operations or restructure
+            your code to avoid this situation.
         Note: The driver function underlying this method is marked for
         eventual deprecation and may be replaced in a future CUDA release.
@@ -2363,9 +2414,11 @@ class Stream(object):
             stream_callback = binding.CUstreamCallback(ptr)
             # The callback needs to receive a pointer to the data PyObject
             data = id(data)
+            handle = self.handle.value
         else:
             stream_callback = self._stream_callback
-        driver.cuStreamAddCallback(self.handle, stream_callback, data, 0)
+            handle = self.handle
+        driver.cuStreamAddCallback(handle, stream_callback, data, 0)
     @staticmethod
     @cu_stream_callback_pyobj
@@ -2382,6 +2435,16 @@ class Stream(object):
         """
         Return an awaitable that resolves once all preceding stream operations
         are complete. The result of the awaitable is the current stream.
+        .. warning::
+            There is a potential for deadlock due to a lock ordering issue
+            between the GIL and the CUDA driver lock when using libraries
+            that call CUDA functions without releasing the GIL. This can
+            occur when the callback function (internally used by this method),
+            which holds the CUDA driver lock, attempts to acquire the GIL
+            while another thread that holds the GIL is waiting for the CUDA driver lock.
+            Consider using libraries that properly release the GIL around
+            CUDA operations or restructure your code to avoid this situation.
         """
         loop = asyncio.get_running_loop()
         future = loop.create_future()
@@ -2433,27 +2496,35 @@ class Event(object):
         completed.
         """
         if USE_NV_BINDING:
-            hstream = stream.handle if stream else binding.CUstream(0)
+            hstream = stream.handle.value if stream else binding.CUstream(0)
+            handle = self.handle.value
         else:
             hstream = stream.handle if stream else 0
-        driver.cuEventRecord(self.handle, hstream)
+            handle = self.handle
+        driver.cuEventRecord(handle, hstream)
     def synchronize(self):
         """
         Synchronize the host thread for the completion of the event.
         """
-        driver.cuEventSynchronize(self.handle)
+        if USE_NV_BINDING:
+            handle = self.handle.value
+        else:
+            handle = self.handle
+        driver.cuEventSynchronize(handle)
     def wait(self, stream=0):
         """
         All future works submitted to stream will wait util the event completes.
         """
         if USE_NV_BINDING:
-            hstream = stream.handle if stream else binding.CUstream(0)
+            hstream = stream.handle.value if stream else binding.CUstream(0)
+            handle = self.handle.value
         else:
             hstream = stream.handle if stream else 0
+            handle = self.handle
         flags = 0
-        driver.cuStreamWaitEvent(hstream, self.handle, flags)
+        driver.cuStreamWaitEvent(hstream, handle, flags)
     def elapsed_time(self, evtend):
         return event_elapsed_time(self, evtend)
@@ -2464,7 +2535,9 @@ def event_elapsed_time(evtstart, evtend):
     Compute the elapsed time between two events in milliseconds.
     """
     if USE_NV_BINDING:
-        return driver.cuEventElapsedTime(evtstart.handle, evtend.handle)
+        return driver.cuEventElapsedTime(
+            evtstart.handle.value, evtend.handle.value
+        )
     else:
         msec = c_float()
         driver.cuEventElapsedTime(byref(msec), evtstart.handle, evtend.handle)
@@ -2722,7 +2795,7 @@ def launch_kernel(
         )
-class Linker(metaclass=ABCMeta):
+class _LinkerBase(metaclass=ABCMeta):
     """Abstract base class for linkers"""
     @classmethod
@@ -2735,30 +2808,27 @@ class Linker(metaclass=ABCMeta):
         additional_flags=None,
     ):
         driver_ver = driver.get_version()
-        if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY and driver_ver >= (
-            12,
-            0,
-        ):
-            raise ValueError("Use CUDA_ENABLE_PYNVJITLINK for CUDA >= 12.0 MVC")
-        if config.CUDA_ENABLE_PYNVJITLINK and driver_ver < (12, 0):
-            raise ValueError("Enabling pynvjitlink requires CUDA 12.")
-        if config.CUDA_ENABLE_PYNVJITLINK:
-            linker = PyNvJitLinker
-        elif config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY:
-            linker = MVCLinker
+        if driver_ver < (12, 0):
+            if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY:
+                linker = MVCLinker
+            elif USE_NV_BINDING:
+                linker = _Linker
+            else:
+                linker = CtypesLinker
         else:
             if USE_NV_BINDING:
-                linker = CudaPythonLinker
+                linker = _Linker
             else:
                 linker = CtypesLinker
-        if linker is PyNvJitLinker:
-            return linker(max_registers, lineinfo, cc, lto, additional_flags)
-        elif additional_flags or lto:
-            raise ValueError("LTO and additional flags require PyNvJitLinker")
+        params = (max_registers, lineinfo, cc)
+        if linker is _Linker:
+            params = (*params, lto, additional_flags)
         else:
-            return linker(max_registers, lineinfo, cc)
+            if lto or additional_flags:
+                raise ValueError("LTO and additional flags require nvjitlink")
+        return linker(*params)
     @abstractmethod
     def __init__(self, max_registers, lineinfo, cc):
@@ -2786,7 +2856,6 @@ class Linker(metaclass=ABCMeta):
         with driver.get_active_context() as ac:
             dev = driver.get_device(ac.devnum)
             cc = dev.compute_capability
         ptx, log = nvrtc.compile(cu, name, cc)
         if config.DUMP_ASSEMBLY:
@@ -2821,7 +2890,6 @@ class Linker(metaclass=ABCMeta):
         LTO-ed portion of the PTX when linker is added with objects that can be
         both LTO-ed and not LTO-ed.
         """
         if isinstance(path_or_code, str):
             ext = pathlib.Path(path_or_code).suffix
             if ext == "":
@@ -2901,7 +2969,148 @@ class Linker(metaclass=ABCMeta):
         """
-class MVCLinker(Linker):
+class _Linker(_LinkerBase):
+    def __init__(
+        self,
+        max_registers=None,
+        lineinfo=False,
+        cc=None,
+        lto=None,
+        additional_flags=None,
+    ):
+        arch = f"sm_{cc[0]}{cc[1]}"
+        self.max_registers = max_registers if max_registers else None
+        self.lineinfo = lineinfo
+        self.cc = cc
+        self.arch = arch
+        if lto is False:
+            # WAR for apparent nvjitlink issue
+            lto = None
+        self.lto = lto
+        self.additional_flags = additional_flags
+        self.options = LinkerOptions(
+            max_register_count=self.max_registers,
+            lineinfo=lineinfo,
+            arch=arch,
+            link_time_optimization=lto,
+        )
+        self._complete = False
+        self._object_codes = []
+        self.linker = None  # need at least one program
+    @property
+    def info_log(self):
+        if not self.linker:
+            raise ValueError("Not Initialized")
+        if self._complete:
+            return self._info_log
+        raise RuntimeError("Link not yet complete.")
+    @property
+    def error_log(self):
+        if not self.linker:
+            raise ValueError("Not Initialized")
+        if self._complete:
+            return self._error_log
+        raise RuntimeError("Link not yet complete.")
+    def add_ptx(self, ptx, name="<cudapy-ptx>"):
+        obj = ObjectCode.from_ptx(ptx, name=name)
+        self._object_codes.append(obj)
+    def add_cu(self, cu, name="<cudapy-cu>"):
+        with driver.get_active_context() as ac:
+            dev = driver.get_device(ac.devnum)
+            cc = dev.compute_capability
+        obj, log = nvrtc.compile(cu, name, cc, ltoir=self.lto)
+        if not self.lto and config.DUMP_ASSEMBLY:
+            print(("ASSEMBLY %s" % name).center(80, "-"))
+            print(obj.code)
+        self._object_codes.append(obj)
+    def add_cubin(self, cubin, name="<cudapy-cubin>"):
+        obj = ObjectCode.from_cubin(cubin, name=name)
+        self._object_codes.append(obj)
+    def add_ltoir(self, ltoir, name="<cudapy-ltoir>"):
+        obj = ObjectCode.from_ltoir(ltoir, name=name)
+        self._object_codes.append(obj)
+    def add_fatbin(self, fatbin, name="<cudapy-fatbin>"):
+        obj = ObjectCode.from_fatbin(fatbin, name=name)
+        self._object_codes.append(obj)
+    def add_object(self, obj, name="<cudapy-object>"):
+        obj = ObjectCode.from_object(obj, name=name)
+        self._object_codes.append(obj)
+    def add_library(self, lib, name="<cudapy-lib>"):
+        obj = ObjectCode.from_library(lib, name=name)
+        self._object_codes.append(obj)
+    def add_file(self, path, kind):
+        try:
+            data = cached_file_read(path, how="rb")
+        except FileNotFoundError:
+            raise LinkerError(f"{path} not found")
+        name = pathlib.Path(path).name
+        self.add_data(data, kind, name)
+    def add_data(self, data, kind, name):
+        if kind == FILE_EXTENSION_MAP["ptx"]:
+            fn = self.add_ptx
+        elif kind == FILE_EXTENSION_MAP["cubin"]:
+            fn = self.add_cubin
+        elif kind == "cu":
+            fn = self.add_cu
+        elif (
+            kind == FILE_EXTENSION_MAP["lib"] or kind == FILE_EXTENSION_MAP["a"]
+        ):
+            fn = self.add_library
+        elif kind == FILE_EXTENSION_MAP["fatbin"]:
+            fn = self.add_fatbin
+        elif kind == FILE_EXTENSION_MAP["o"]:
+            fn = self.add_object
+        elif kind == FILE_EXTENSION_MAP["ltoir"]:
+            fn = self.add_ltoir
+        else:
+            raise LinkerError(f"Don't know how to link {kind}")
+        fn(data, name)
+    def get_linked_ptx(self):
+        options = LinkerOptions(
+            max_register_count=self.max_registers,
+            lineinfo=self.lineinfo,
+            arch=self.arch,
+            link_time_optimization=True,
+            ptx=True,
+        )
+        self.linker = Linker(*self._object_codes, options=options)
+        result = self.linker.link("ptx")
+        self.close()
+        self._complete = True
+        return result.code
+    def close(self):
+        self._info_log = self.linker.get_info_log()
+        self._error_log = self.linker.get_error_log()
+        self.linker.close()
+    def complete(self):
+        self.linker = Linker(*self._object_codes, options=self.options)
+        result = self.linker.link("cubin")
+        self.close()
+        self._complete = True
+        return result
+class MVCLinker(_LinkerBase):
     """
     Linker supporting Minor Version Compatibility, backed by the cubinlinker
     package.
@@ -2996,7 +3205,7 @@ class MVCLinker(Linker):
             raise LinkerError from e
-class CtypesLinker(Linker):
+class CtypesLinker(_LinkerBase):
     """
     Links for current device if no CC given
     """
@@ -3139,266 +3348,6 @@ class CtypesLinker(Linker):
         return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,)))
-class CudaPythonLinker(Linker):
-    """
-    Links for current device if no CC given
-    """
-    def __init__(self, max_registers=0, lineinfo=False, cc=None):
-        super().__init__(max_registers, lineinfo, cc)
-        logsz = config.CUDA_LOG_SIZE
-        linkerinfo = bytearray(logsz)
-        linkererrors = bytearray(logsz)
-        jit_option = binding.CUjit_option
-        options = {
-            jit_option.CU_JIT_INFO_LOG_BUFFER: linkerinfo,
-            jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz,
-            jit_option.CU_JIT_ERROR_LOG_BUFFER: linkererrors,
-            jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz,
-            jit_option.CU_JIT_LOG_VERBOSE: 1,
-        }
-        if max_registers:
-            options[jit_option.CU_JIT_MAX_REGISTERS] = max_registers
-        if lineinfo:
-            options[jit_option.CU_JIT_GENERATE_LINE_INFO] = 1
-        if cc is None:
-            # No option value is needed, but we need something as a placeholder
-            options[jit_option.CU_JIT_TARGET_FROM_CUCONTEXT] = 1
-        else:
-            cc_val = cc[0] * 10 + cc[1]
-            cc_enum = getattr(
-                binding.CUjit_target, f"CU_TARGET_COMPUTE_{cc_val}"
-            )
-            options[jit_option.CU_JIT_TARGET] = cc_enum
-        raw_keys = list(options.keys())
-        raw_values = list(options.values())
-        self.handle = driver.cuLinkCreate(len(raw_keys), raw_keys, raw_values)
-        weakref.finalize(self, driver.cuLinkDestroy, self.handle)
-        self.linker_info_buf = linkerinfo
-        self.linker_errors_buf = linkererrors
-        self._keep_alive = [linkerinfo, linkererrors, raw_keys, raw_values]
-    @property
-    def info_log(self):
-        return self.linker_info_buf.decode("utf8")
-    @property
-    def error_log(self):
-        return self.linker_errors_buf.decode("utf8")
-    def add_cubin(self, cubin, name="<unnamed-cubin>"):
-        input_type = binding.CUjitInputType.CU_JIT_INPUT_CUBIN
-        return self._add_data(input_type, cubin, name)
-    def add_ptx(self, ptx, name="<unnamed-ptx>"):
-        input_type = binding.CUjitInputType.CU_JIT_INPUT_PTX
-        return self._add_data(input_type, ptx, name)
-    def add_object(self, object_, name="<unnamed-object>"):
-        input_type = binding.CUjitInputType.CU_JIT_INPUT_OBJECT
-        return self._add_data(input_type, object_, name)
-    def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
-        input_type = binding.CUjitInputType.CU_JIT_INPUT_FATBINARY
-        return self._add_data(input_type, fatbin, name)
-    def add_library(self, library, name="<unnamed-library>"):
-        input_type = binding.CUjitInputType.CU_JIT_INPUT_LIBRARY
-        return self._add_data(input_type, library, name)
-    def _add_data(self, input_type, data, name):
-        name_buffer = name.encode("utf8")
-        self._keep_alive += [data, name_buffer]
-        try:
-            driver.cuLinkAddData(
-                self.handle, input_type, data, len(data), name_buffer, 0, [], []
-            )
-        except CudaAPIError as e:
-            raise LinkerError("%s\n%s" % (e, self.error_log))
-    def add_data(self, data, kind, name=None):
-        # We pass the name as **kwargs to ensure the default name for the input
-        # type is used if none is supplied
-        kws = {}
-        if name is not None:
-            kws["name"] = name
-        if kind == FILE_EXTENSION_MAP["cubin"]:
-            self.add_cubin(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["fatbin"]:
-            self.add_fatbin(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["a"]:
-            self.add_library(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["ptx"]:
-            self.add_ptx(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["o"]:
-            self.add_object(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["ltoir"]:
-            raise LinkerError("CudaPythonLinker cannot link LTO-IR")
-        else:
-            raise LinkerError(f"Don't know how to link {kind}")
-    def add_file(self, path, kind):
-        pathbuf = path.encode("utf8")
-        self._keep_alive.append(pathbuf)
-        try:
-            driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, [], [])
-        except CudaAPIError as e:
-            if e.code == binding.CUresult.CUDA_ERROR_FILE_NOT_FOUND:
-                msg = f"{path} not found"
-            else:
-                msg = "%s\n%s" % (e, self.error_log)
-            raise LinkerError(msg)
-    def complete(self):
-        try:
-            cubin_buf, size = driver.cuLinkComplete(self.handle)
-        except CudaAPIError as e:
-            raise LinkerError("%s\n%s" % (e, self.error_log))
-        assert size > 0, "linker returned a zero sized cubin"
-        del self._keep_alive[:]
-        # We return a copy of the cubin because it's owned by the linker
-        cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char))
-        return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,)))
-class PyNvJitLinker(Linker):
-    def __init__(
-        self,
-        max_registers=None,
-        lineinfo=False,
-        cc=None,
-        lto=False,
-        additional_flags=None,
-    ):
-        if NvJitLinker is None:
-            raise ImportError(
-                "Using pynvjitlink requires the pynvjitlink package to be "
-                "available"
-            )
-        if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY:
-            raise ValueError(
-                "Can't set CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY and "
-                "CUDA_ENABLE_PYNVJITLINK at the same time"
-            )
-        if cc is None:
-            raise RuntimeError("PyNvJitLinker requires CC to be specified")
-        if not any(isinstance(cc, t) for t in [list, tuple]):
-            raise TypeError("`cc` must be a list or tuple of length 2")
-        sm_ver = f"{cc[0] * 10 + cc[1]}"
-        arch = f"-arch=sm_{sm_ver}"
-        options = [arch]
-        if max_registers:
-            options.append(f"-maxrregcount={max_registers}")
-        if lineinfo:
-            options.append("-lineinfo")
-        if lto:
-            options.append("-lto")
-        if additional_flags is not None:
-            options.extend(additional_flags)
-        self._linker = NvJitLinker(*options)
-        self.lto = lto
-        self.options = options
-    @property
-    def info_log(self):
-        return self._linker.info_log
-    @property
-    def error_log(self):
-        return self._linker.error_log
-    def add_ptx(self, ptx, name="<cudapy-ptx>"):
-        self._linker.add_ptx(ptx, name)
-    def add_fatbin(self, fatbin, name="<external-fatbin>"):
-        self._linker.add_fatbin(fatbin, name)
-    def add_ltoir(self, ltoir, name="<external-ltoir>"):
-        self._linker.add_ltoir(ltoir, name)
-    def add_object(self, obj, name="<external-object>"):
-        self._linker.add_object(obj, name)
-    def add_file(self, path, kind):
-        try:
-            data = cached_file_read(path, "rb")
-        except FileNotFoundError:
-            raise LinkerError(f"{path} not found")
-        name = pathlib.Path(path).name
-        self.add_data(data, kind, name)
-    def add_cu(self, cu, name):
-        """Add CUDA source in a string to the link. The name of the source
-        file should be specified in `name`."""
-        with driver.get_active_context() as ac:
-            dev = driver.get_device(ac.devnum)
-            cc = dev.compute_capability
-        program, log = nvrtc.compile(cu, name, cc, ltoir=self.lto)
-        if not self.lto and config.DUMP_ASSEMBLY:
-            print(("ASSEMBLY %s" % name).center(80, "-"))
-            print(program)
-            print("=" * 80)
-        suffix = ".ltoir" if self.lto else ".ptx"
-        program_name = os.path.splitext(name)[0] + suffix
-        # Link the program's PTX or LTOIR using the normal linker mechanism
-        if self.lto:
-            self.add_ltoir(program, program_name)
-        else:
-            self.add_ptx(program.encode(), program_name)
-    def add_data(self, data, kind, name):
-        if kind == FILE_EXTENSION_MAP["cubin"]:
-            fn = self._linker.add_cubin
-        elif kind == FILE_EXTENSION_MAP["fatbin"]:
-            fn = self._linker.add_fatbin
-        elif kind == FILE_EXTENSION_MAP["a"]:
-            fn = self._linker.add_library
-        elif kind == FILE_EXTENSION_MAP["ptx"]:
-            return self.add_ptx(data, name)
-        elif kind == FILE_EXTENSION_MAP["o"]:
-            fn = self._linker.add_object
-        elif kind == FILE_EXTENSION_MAP["ltoir"]:
-            fn = self._linker.add_ltoir
-        else:
-            raise LinkerError(f"Don't know how to link {kind}")
-        try:
-            fn(data, name)
-        except NvJitLinkError as e:
-            raise LinkerError from e
-    def get_linked_ptx(self):
-        try:
-            return self._linker.get_linked_ptx()
-        except NvJitLinkError as e:
-            raise LinkerError from e
-    def complete(self):
-        try:
-            return self._linker.get_linked_cubin()
-        except NvJitLinkError as e:
-            raise LinkerError from e
 # -----------------------------------------------------------------------------
@@ -3566,7 +3515,11 @@ def host_to_device(dst, src, size, stream=0):
     if stream:
         assert isinstance(stream, Stream)
         fn = driver.cuMemcpyHtoDAsync
-        varargs.append(stream.handle)
+        if USE_NV_BINDING:
+            handle = stream.handle.value
+        else:
+            handle = stream.handle
+        varargs.append(handle)
     else:
         fn = driver.cuMemcpyHtoD
@@ -3584,7 +3537,11 @@ def device_to_host(dst, src, size, stream=0):
     if stream:
         assert isinstance(stream, Stream)
         fn = driver.cuMemcpyDtoHAsync
-        varargs.append(stream.handle)
+        if USE_NV_BINDING:
+            handle = stream.handle.value
+        else:
+            handle = stream.handle
+        varargs.append(handle)
     else:
         fn = driver.cuMemcpyDtoH
@@ -3602,7 +3559,11 @@ def device_to_device(dst, src, size, stream=0):
     if stream:
         assert isinstance(stream, Stream)
         fn = driver.cuMemcpyDtoDAsync
-        varargs.append(stream.handle)
+        if USE_NV_BINDING:
+            handle = stream.handle.value
+        else:
+            handle = stream.handle
+        varargs.append(handle)
     else:
         fn = driver.cuMemcpyDtoD
@@ -3623,7 +3584,11 @@ def device_memset(dst, val, size, stream=0):
     if stream:
         assert isinstance(stream, Stream)
         fn = driver.cuMemsetD8Async
-        varargs.append(stream.handle)
+        if USE_NV_BINDING:
+            handle = stream.handle.value
+        else:
+            handle = stream.handle
+        varargs.append(handle)
     else:
         fn = driver.cuMemsetD8

numba-cuda 0.15.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

numba-cuda 0.15.2py3-none-any.whl → 0.17.0py3-none-any.whl