numba-cuda 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/compiler.py +7 -6
- numba_cuda/numba/cuda/cudadecl.py +6 -2
- numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +1 -20
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +13 -9
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +5 -1
- numba_cuda/numba/cuda/cudadrv/nvvm.py +6 -1
- numba_cuda/numba/cuda/decorators.py +9 -2
- numba_cuda/numba/cuda/dispatcher.py +22 -3
- numba_cuda/numba/cuda/runtime/__init__.py +1 -0
- numba_cuda/numba/cuda/runtime/memsys.cu +94 -0
- numba_cuda/numba/cuda/runtime/memsys.cuh +17 -0
- numba_cuda/numba/cuda/runtime/nrt.cu +19 -22
- numba_cuda/numba/cuda/runtime/nrt.py +318 -0
- numba_cuda/numba/cuda/testing.py +11 -1
- numba_cuda/numba/cuda/tests/__init__.py +1 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +31 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +145 -11
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +10 -7
- numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +105 -1
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +162 -40
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +114 -0
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/utils.py +22 -0
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/METADATA +21 -3
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/RECORD +30 -23
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/WHEEL +1 -1
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/LICENSE +0 -0
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION
CHANGED
@@ -1 +1 @@
-0.3.0
+0.5.0
numba_cuda/numba/cuda/compiler.py
CHANGED
@@ -570,16 +570,16 @@ def compile_ptx_for_current_device(pyfunc, sig, debug=None, lineinfo=False,
                        abi=abi, abi_info=abi_info)


-def declare_device_function(name, restype, argtypes):
-    return declare_device_function_template(name, restype, argtypes).key
+def declare_device_function(name, restype, argtypes, link):
+    return declare_device_function_template(name, restype, argtypes, link).key


-def declare_device_function_template(name, restype, argtypes):
+def declare_device_function_template(name, restype, argtypes, link):
     from .descriptor import cuda_target
     typingctx = cuda_target.typing_context
     targetctx = cuda_target.target_context
     sig = typing.signature(restype, *argtypes)
-    extfn = ExternFunction(name, sig)
+    extfn = ExternFunction(name, sig, link)

     class device_function_template(ConcreteTemplate):
         key = extfn
@@ -593,7 +593,8 @@ def declare_device_function_template(name, restype, argtypes):
     return device_function_template


-class ExternFunction
-    def __init__(self, name, sig):
+class ExternFunction:
+    def __init__(self, name, sig, link):
         self.name = name
         self.sig = sig
+        self.link = link
numba_cuda/numba/cuda/cudadecl.py
CHANGED
@@ -403,16 +403,20 @@ _genfp16_binary_operator(operator.itruediv)


 def _resolve_wrapped_unary(fname):
+    link = tuple()
     decl = declare_device_function_template(f'__numba_wrapper_{fname}',
                                             types.float16,
-                                            (types.float16,))
+                                            (types.float16,),
+                                            link)
     return types.Function(decl)


 def _resolve_wrapped_binary(fname):
+    link = tuple()
     decl = declare_device_function_template(f'__numba_wrapper_{fname}',
                                             types.float16,
-                                            (types.float16, types.float16,))
+                                            (types.float16, types.float16,),
+                                            link)
     return types.Function(decl)


numba_cuda/numba/cuda/cudadrv/devicearray.py
CHANGED
@@ -570,10 +570,13 @@ class DeviceNDArray(DeviceNDArrayBase):
         '''
         return self._dummy.is_c_contig

-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """
         :return: an `numpy.ndarray`, so copies to the host.
         """
+        if copy is False:
+            msg = "`copy=False` is not supported. A copy is always created."
+            raise ValueError(msg)
         if dtype:
             return self.copy_to_host().__array__(dtype)
         else:
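For context, a minimal sketch of the behaviour this adds, assuming NumPy 2 (which passes `copy` through the `__array__` protocol): conversion to a host array always copies, and an explicit no-copy request is rejected.

```python
import numpy as np
from numba import cuda

d_arr = cuda.to_device(np.arange(4))

host = np.asarray(d_arr)           # always a device-to-host copy
assert isinstance(host, np.ndarray)

try:
    np.asarray(d_arr, copy=False)  # a no-copy view of device memory is impossible
except ValueError:
    pass
```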
numba_cuda/numba/cuda/cudadrv/driver.py
CHANGED
@@ -18,7 +18,6 @@ import functools
 import warnings
 import logging
 import threading
-import traceback
 import asyncio
 import pathlib
 import subprocess
@@ -40,6 +39,7 @@ from .drvapi import API_PROTOTYPES
 from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
 from .mappings import FILE_EXTENSION_MAP
 from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
+from numba.cuda.utils import _readenv
 from numba.cuda.cudadrv import enums, drvapi, nvrtc

 try:
@@ -66,25 +66,6 @@ _py_decref.argtypes = [ctypes.py_object]
 _py_incref.argtypes = [ctypes.py_object]


-def _readenv(name, ctor, default):
-    value = os.environ.get(name)
-    if value is None:
-        return default() if callable(default) else default
-    try:
-        if ctor is bool:
-            return value.lower() in {'1', "true"}
-        return ctor(value)
-    except Exception:
-        warnings.warn(
-            f"Environment variable '{name}' is defined but its associated "
-            f"value '{value}' could not be parsed.\n"
-            "The parse failed with exception:\n"
-            f"{traceback.format_exc()}",
-            RuntimeWarning
-        )
-        return default
-
-
 _MVC_ERROR_MESSAGE = (
     "Minor version compatibility requires ptxcompiler and cubinlinker packages "
     "to be available"
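The helper removed here is unchanged in behaviour, only relocated to `numba.cuda.utils` (see the new import above). A small sketch of how it parses values; the environment variable names below are hypothetical:

```python
import os
from numba.cuda.utils import _readenv

os.environ["HYPOTHETICAL_FLAG"] = "true"

# bool values are parsed from "1"/"true"; anything else is False
assert _readenv("HYPOTHETICAL_FLAG", bool, False) is True

# unset variables fall back to the default (which is called if callable)
assert _readenv("HYPOTHETICAL_UNSET", int, 42) == 42
```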
numba_cuda/numba/cuda/cudadrv/linkable_code.py
CHANGED
@@ -2,8 +2,12 @@ from .mappings import FILE_EXTENSION_MAP


 class LinkableCode:
-    """An object that
-
+    """An object that holds code to be linked from memory.
+
+    :param data: A buffer containing the data to link.
+    :param name: The name of the file to be referenced in any compilation or
+                 linking errors that may be produced.
+    """

     def __init__(self, data, name=None):
         self.data = data
@@ -15,49 +19,49 @@ class LinkableCode:


 class PTXSource(LinkableCode):
-    """PTX
+    """PTX source code in memory."""

     kind = FILE_EXTENSION_MAP["ptx"]
     default_name = "<unnamed-ptx>"


 class CUSource(LinkableCode):
-    """CUDA C/C++
+    """CUDA C/C++ source code in memory."""

     kind = "cu"
     default_name = "<unnamed-cu>"


 class Fatbin(LinkableCode):
-    """
+    """An ELF Fatbin in memory."""

     kind = FILE_EXTENSION_MAP["fatbin"]
     default_name = "<unnamed-fatbin>"


 class Cubin(LinkableCode):
-    """
+    """An ELF Cubin in memory."""

     kind = FILE_EXTENSION_MAP["cubin"]
     default_name = "<unnamed-cubin>"


 class Archive(LinkableCode):
-    """An archive of objects in memory"""
+    """An archive of objects in memory."""

     kind = FILE_EXTENSION_MAP["a"]
     default_name = "<unnamed-archive>"


 class Object(LinkableCode):
-    """An object file in memory"""
+    """An object file in memory."""

     kind = FILE_EXTENSION_MAP["o"]
     default_name = "<unnamed-object>"


 class LTOIR(LinkableCode):
-    """An LTOIR file in memory"""
+    """An LTOIR file in memory."""

     kind = "ltoir"
     default_name = "<unnamed-ltoir>"
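As a rough usage sketch (the CUDA source and names below are illustrative, not from the package), these wrappers let code held in memory be linked wherever a file path would otherwise be given:

```python
from numba import cuda
from numba.cuda.cudadrv.linkable_code import CUSource

# Follows the declare_device C ABI: the result is written through the first
# pointer argument and an int status is returned.
times2_src = CUSource("""
extern "C" __device__ int times2(int *out, int x) {
    *out = x * 2;
    return 0;
}
""", name="times2.cu")

times2 = cuda.declare_device('times2', 'int32(int32)')

@cuda.jit(link=[times2_src])
def double_all(arr):
    i = cuda.grid(1)
    if i < arr.size:
        arr[i] = times2(arr[i])
```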
numba_cuda/numba/cuda/cudadrv/nvrtc.py
CHANGED
@@ -266,7 +266,11 @@ def compile(src, name, cc, ltoir=False):
     cudadrv_path = os.path.dirname(os.path.abspath(__file__))
     numba_cuda_path = os.path.dirname(cudadrv_path)
     numba_include = f'-I{numba_cuda_path}'
-    options = [arch, *cuda_include, numba_include, '-rdc', 'true']
+
+    nrt_path = os.path.join(numba_cuda_path, "runtime")
+    nrt_include = f'-I{nrt_path}'
+
+    options = [arch, *cuda_include, numba_include, nrt_include, '-rdc', 'true']

     if ltoir:
         options.append("-dlto")
numba_cuda/numba/cuda/cudadrv/nvvm.py
CHANGED
@@ -314,7 +314,9 @@ COMPUTE_CAPABILITIES = (
     (6, 0), (6, 1), (6, 2),
     (7, 0), (7, 2), (7, 5),
     (8, 0), (8, 6), (8, 7), (8, 9),
-    (9, 0)
+    (9, 0),
+    (10, 0), (10, 1),
+    (12, 0),
 )

 # Maps CTK version -> (min supported cc, max supported cc) inclusive
@@ -331,6 +333,9 @@ CTK_SUPPORTED = {
     (12, 2): ((5, 0), (9, 0)),
     (12, 3): ((5, 0), (9, 0)),
     (12, 4): ((5, 0), (9, 0)),
+    (12, 5): ((5, 0), (9, 0)),
+    (12, 6): ((5, 0), (9, 0)),
+    (12, 8): ((5, 0), (12, 0)),
 }


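Illustrative only (the clamping helper below is not part of the module): the table maps a CUDA toolkit version to the inclusive range of compute capabilities it can generate code for, so a device's CC can be lowered to the newest one the detected CTK supports.

```python
# Excerpt of CTK_SUPPORTED from above
CTK_SUPPORTED = {
    (12, 4): ((5, 0), (9, 0)),
    (12, 8): ((5, 0), (12, 0)),
}

def clamp_cc(ctk_version, device_cc):
    min_cc, max_cc = CTK_SUPPORTED[ctk_version]
    return max(min_cc, min(device_cc, max_cc))

assert clamp_cc((12, 4), (12, 0)) == (9, 0)    # Blackwell device, older CTK
assert clamp_cc((12, 8), (12, 0)) == (12, 0)   # fully supported on CTK 12.8
```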
numba_cuda/numba/cuda/decorators.py
CHANGED
@@ -173,7 +173,7 @@ def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None,
     return disp


-def declare_device(name, sig):
+def declare_device(name, sig, link=None):
     """
     Declare the signature of a foreign function. Returns a descriptor that can
     be used to call the function from a Python kernel.
@@ -181,10 +181,17 @@ def declare_device(name, sig):
     :param name: The name of the foreign function.
     :type name: str
     :param sig: The Numba signature of the function.
+    :param link: External code to link when calling the function.
     """
+    if link is None:
+        link = tuple()
+    else:
+        if not isinstance(link, (list, tuple, set)):
+            link = (link,)
+
     argtypes, restype = sigutils.normalize_signature(sig)
     if restype is None:
         msg = 'Return type must be provided for device declarations'
         raise TypeError(msg)

-    return declare_device_function(name, restype, argtypes)
+    return declare_device_function(name, restype, argtypes, link)
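A minimal sketch of the extended API (the file name and external function are hypothetical): code required by a declared foreign function can now travel with the declaration instead of being repeated in every `@cuda.jit(link=...)` decorator.

```python
from numba import cuda

# 'mul.cu' is assumed to define an extern "C" __device__ function following
# the declare_device ABI; a single item is wrapped into a tuple as shown above.
mul = cuda.declare_device('mul_f32', 'float32(float32, float32)',
                          link='mul.cu')

@cuda.jit
def multiply(x, y, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = mul(x[i], y[i])
```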
numba_cuda/numba/cuda/dispatcher.py
CHANGED
@@ -11,16 +11,18 @@ from numba.core.compiler_lock import global_compiler_lock
 from numba.core.dispatcher import Dispatcher
 from numba.core.errors import NumbaPerformanceWarning
 from numba.core.typing.typeof import Purpose, typeof
-
+from numba.core.types.functions import Function
 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
-from numba.cuda.compiler import compile_cuda, CUDACompiler, kernel_fixup
+from numba.cuda.compiler import (compile_cuda, CUDACompiler, kernel_fixup,
+                                 ExternFunction)
 from numba.cuda.cudadrv import driver
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.errors import (missing_launch_config_msg,
                                normalize_kernel_dimensions)
 from numba.cuda import types as cuda_types
+from numba.cuda.runtime.nrt import rtsys

 from numba import cuda
 from numba import _dispatcher
@@ -157,6 +159,16 @@ class _Kernel(serialize.ReduceMixin):

         self.maybe_link_nrt(link, tgt_ctx, asm)

+        for k, v in cres.fndesc.typemap.items():
+            if not isinstance(v, Function):
+                continue
+
+            if not isinstance(v.typing_key, ExternFunction):
+                continue
+
+            for obj in v.typing_key.link:
+                lib.add_linking_file(obj)
+
         for filepath in link:
             lib.add_linking_file(filepath)

@@ -253,7 +265,14 @@ class _Kernel(serialize.ReduceMixin):
         """
         Force binding to current CUDA context
         """
-        self._codelibrary.get_cufunc()
+        cufunc = self._codelibrary.get_cufunc()
+
+        if hasattr(self, "target_context") and self.target_context.enable_nrt:
+            rtsys.ensure_initialized()
+            rtsys.set_memsys_to_module(cufunc.module)
+            # We don't know which stream the kernel will be launched on, so
+            # we force synchronize here.
+            cuda.synchronize()

     @property
     def regs_per_thread(self):
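The calls that `bind()` now makes can also be issued up front; a minimal sketch, assuming a target context with NRT enabled:

```python
from numba.cuda.runtime.nrt import rtsys

# Initialize the NRT memory system once; bind() then attaches it to each
# kernel's module via set_memsys_to_module(), as shown above.
rtsys.ensure_initialized()
```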
numba_cuda/numba/cuda/runtime/__init__.py
ADDED
@@ -0,0 +1 @@
+from numba.cuda.runtime.nrt import rtsys  # noqa: F401
numba_cuda/numba/cuda/runtime/memsys.cu
ADDED
@@ -0,0 +1,94 @@
+#include "memsys.cuh"
+
+__device__ size_t memsys_size = sizeof(NRT_MemSys);
+
+namespace detail
+{
+void __device__ check_memsys()
+{
+  if (TheMSys == nullptr)
+  {
+    assert(false && "TheMSys pointer is null. Please use NRT_MemSys_set to set pointer first.");
+  }
+}
+}
+
+extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
+{
+  TheMSys = memsys_ptr;
+}
+
+extern "C" __global__ void NRT_MemSys_read(uint64_t *managed_memsys)
+{
+  detail::check_memsys();
+  managed_memsys[0] = TheMSys->stats.alloc;
+  managed_memsys[1] = TheMSys->stats.free;
+  managed_memsys[2] = TheMSys->stats.mi_alloc;
+  managed_memsys[3] = TheMSys->stats.mi_free;
+}
+
+extern "C" __global__ void NRT_MemSys_read_alloc(uint64_t *managed_result)
+{
+  detail::check_memsys();
+  managed_result[0] = TheMSys->stats.alloc;
+}
+
+extern "C" __global__ void NRT_MemSys_read_free(uint64_t *managed_result)
+{
+  detail::check_memsys();
+  managed_result[0] = TheMSys->stats.free;
+}
+
+extern "C" __global__ void NRT_MemSys_read_mi_alloc(uint64_t *managed_result)
+{
+  detail::check_memsys();
+  managed_result[0] = TheMSys->stats.mi_alloc;
+}
+
+extern "C" __global__ void NRT_MemSys_read_mi_free(uint64_t *managed_result)
+{
+  detail::check_memsys();
+  managed_result[0] = TheMSys->stats.mi_free;
+}
+
+extern "C" __global__ void NRT_MemSys_init(void)
+{
+  detail::check_memsys();
+  TheMSys->stats.enabled = false;
+  TheMSys->stats.alloc = 0;
+  TheMSys->stats.free = 0;
+  TheMSys->stats.mi_alloc = 0;
+  TheMSys->stats.mi_free = 0;
+}
+
+extern "C" __global__ void NRT_MemSys_enable_stats(void)
+{
+  detail::check_memsys();
+  TheMSys->stats.enabled = true;
+}
+
+extern "C" __global__ void NRT_MemSys_disable_stats(void)
+{
+  detail::check_memsys();
+  TheMSys->stats.enabled = false;
+}
+
+extern "C" __global__ void NRT_MemSys_stats_enabled(uint8_t *enabled)
+{
+  detail::check_memsys();
+  *enabled = static_cast<uint8_t>(TheMSys->stats.enabled);
+}
+
+extern "C" __global__ void NRT_MemSys_print(void)
+{
+  if (TheMSys != nullptr)
+  {
+    printf("TheMSys->stats.enabled %d\n", TheMSys->stats.enabled);
+    printf("TheMSys->stats.alloc %lu\n", TheMSys->stats.alloc.load());
+    printf("TheMSys->stats.free %lu\n", TheMSys->stats.free.load());
+    printf("TheMSys->stats.mi_alloc %lu\n", TheMSys->stats.mi_alloc.load());
+    printf("TheMSys->stats.mi_free %lu\n", TheMSys->stats.mi_free.load());
+  } else {
+    printf("TheMsys is null.\n");
+  }
+}
numba_cuda/numba/cuda/runtime/memsys.cuh
ADDED
@@ -0,0 +1,17 @@
+#include <cuda/atomic>
+
+// Globally needed variables
+struct NRT_MemSys {
+  struct {
+    bool enabled;
+    cuda::atomic<size_t, cuda::thread_scope_device> alloc;
+    cuda::atomic<size_t, cuda::thread_scope_device> free;
+    cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
+    cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
+  } stats;
+};
+
+/* The Memory System object */
+__device__ NRT_MemSys* TheMSys;
+
+extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr);
numba_cuda/numba/cuda/runtime/nrt.cu
CHANGED
@@ -3,6 +3,8 @@

 #include <cuda/atomic>

+#include "memsys.cuh"
+
 typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
 typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);

@@ -18,29 +20,21 @@ struct MemInfo {
 };
 }

-// Globally needed variables
-struct NRT_MemSys {
-  struct {
-    bool enabled;
-    cuda::atomic<size_t, cuda::thread_scope_device> alloc;
-    cuda::atomic<size_t, cuda::thread_scope_device> free;
-    cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
-    cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
-  } stats;
-};
+extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
+{
+  TheMSys = memsys_ptr;
+}

 static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
 static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
 extern "C" __device__ void* NRT_Allocate_External(size_t size);

-/* The Memory System object */
-__device__ NRT_MemSys* TheMSys;
-
 extern "C" __device__ void* NRT_Allocate(size_t size)
 {
   void* ptr = NULL;
   ptr = malloc(size);
-
+  if (TheMSys && TheMSys->stats.enabled) {
+    TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed); }
   return ptr;
 }

@@ -49,14 +43,14 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
                                             size_t size,
                                             NRT_dtor_function dtor,
                                             void* dtor_info)
-                                            // NRT_MemSys* TheMSys)
 {
   mi->refct = 1; /* starts with 1 refct */
   mi->dtor = dtor;
   mi->dtor_info = dtor_info;
   mi->data = data;
   mi->size = size;
-
+  if (TheMSys && TheMSys->stats.enabled) {
+    TheMSys->stats.mi_alloc.fetch_add(1, cuda::memory_order_relaxed); }
 }

 extern "C"
@@ -71,7 +65,8 @@ __device__ NRT_MemInfo* NRT_MemInfo_new(
 extern "C" __device__ void NRT_Free(void* ptr)
 {
   free(ptr);
-
+  if (TheMSys && TheMSys->stats.enabled) {
+    TheMSys->stats.free.fetch_add(1, cuda::memory_order_relaxed); }
 }

 extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
@@ -82,8 +77,10 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
 extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
 {
   NRT_dealloc(mi);
-
+  if (TheMSys && TheMSys->stats.enabled) {
+    TheMSys->stats.mi_free.fetch_add(1, cuda::memory_order_relaxed); }
 }
+
 extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
 {
   if (mi->dtor) /* We have a destructor */
@@ -158,10 +155,10 @@ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
   ptr = malloc(size);
   //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));

-
-
-
-
+  if (TheMSys && TheMSys->stats.enabled)
+  {
+    TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed);
+  }
   return ptr;
 }
