numba-cuda 0.0.18__py3-none-any.whl → 0.0.19__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry; it is provided for informational purposes only.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/cuda_paths.py +68 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
- numba_cuda/numba/cuda/cudadrv/libs.py +38 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +9 -4
- numba_cuda/numba/cuda/dispatcher.py +46 -6
- numba_cuda/numba/cuda/runtime/nrt.cu +190 -0
- numba_cuda/numba/cuda/simulator/api.py +14 -0
- numba_cuda/numba/cuda/target.py +4 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +48 -0
- numba_cuda/numba/cuda/tests/nrt/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +42 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +110 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +8 -1
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.19.dist-info}/METADATA +1 -1
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.19.dist-info}/RECORD +20 -15
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.19.dist-info}/WHEEL +1 -1
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.19.dist-info}/LICENSE +0 -0
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.19.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION
CHANGED
@@ -1 +1 @@
-0.0.18
+0.0.19

numba_cuda/numba/cuda/cuda_paths.py
CHANGED
@@ -2,9 +2,11 @@ import sys
 import re
 import os
 from collections import namedtuple
+import platform
 
 from numba.core.config import IS_WIN32
 from numba.misc.findlib import find_lib, find_file
+from numba import config
 
 
 _env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info'])
@@ -241,6 +243,7 @@ def get_cuda_paths():
         'libdevice': _get_libdevice_paths(),
         'cudalib_dir': _get_cudalib_dir(),
         'static_cudalib_dir': _get_static_cudalib_dir(),
+        'include_dir': _get_include_dir(),
     }
     # Cache result
     get_cuda_paths._cached_result = d
@@ -256,3 +259,68 @@ def get_debian_pkg_libdevice():
     if not os.path.exists(pkg_libdevice_location):
         return None
     return pkg_libdevice_location
+
+
+def get_current_cuda_target_name():
+    """Determine conda's CTK target folder based on system and machine arch.
+
+    CTK's conda package delivers headers based on its architecture type. For example,
+    an `x86_64` machine places headers under `$CONDA_PREFIX/targets/x86_64-linux`, and
+    `aarch64` under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the
+    nuances at cudart's conda feedstock:
+    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11  # noqa: E501
+    """
+    system = platform.system()
+    machine = platform.machine()
+
+    if system == "Linux":
+        arch_to_targets = {
+            'x86_64': 'x86_64-linux',
+            'aarch64': 'sbsa-linux'
+        }
+    elif system == "Windows":
+        arch_to_targets = {
+            'AMD64': 'x64',
+        }
+    else:
+        arch_to_targets = {}
+
+    return arch_to_targets.get(machine, None)
+
+
+def get_conda_include_dir():
+    """
+    Return the include directory in the current conda environment, if one
+    is active and it exists.
+    """
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
+    if not is_conda_env:
+        return
+
+    if platform.system() == "Windows":
+        include_dir = os.path.join(
+            sys.prefix, 'Library', 'include'
+        )
+    elif target_name := get_current_cuda_target_name():
+        include_dir = os.path.join(
+            sys.prefix, 'targets', target_name, 'include'
+        )
+    else:
+        # A fallback when the target cannot be determined,
+        # though usually it should be.
+        include_dir = os.path.join(sys.prefix, 'include')
+
+    if os.path.exists(include_dir):
+        return include_dir
+    return
+
+
+def _get_include_dir():
+    """Find the root include directory."""
+    options = [
+        ('Conda environment (NVIDIA package)', get_conda_include_dir()),
+        ('CUDA_INCLUDE_PATH Config Entry', config.CUDA_INCLUDE_PATH),
+        # TODO: add others
+    ]
+    by, include_dir = _find_valid_path(options)
+    return _env_path_tuple(by, include_dir)

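For illustration, the new lookup can be exercised directly; this is a minimal sketch, assuming a CUDA-enabled environment (printed values vary by machine):

    from numba.cuda.cuda_paths import get_cuda_paths

    entry = get_cuda_paths()['include_dir']
    # 'by' names the probe that found the directory ('Conda environment
    # (NVIDIA package)' or 'CUDA_INCLUDE_PATH Config Entry'); 'info' is the path.
    print(entry.by, entry.info)
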
numba_cuda/numba/cuda/cudadrv/devicearray.py
CHANGED
@@ -876,7 +876,10 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False):
     sentry_contiguous(obj)
     devobj = from_array_like(obj, stream=stream)
     if copy:
-        if config.CUDA_WARN_ON_IMPLICIT_COPY:
+        if (
+            config.CUDA_WARN_ON_IMPLICIT_COPY
+            and not config.DISABLE_PERFORMANCE_WARNINGS
+        ):
             if (
                 not user_explicit and
                 (not isinstance(obj, DeviceNDArray)

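The effect of the new guard, sketched with the real numba config entries (both are controlled by environment variables, NUMBA_CUDA_WARN_ON_IMPLICIT_COPY and NUMBA_DISABLE_PERFORMANCE_WARNINGS):

    from numba.core import config

    # The implicit-copy warning now fires only when it is enabled AND
    # performance warnings have not been globally disabled.
    warn = (config.CUDA_WARN_ON_IMPLICIT_COPY
            and not config.DISABLE_PERFORMANCE_WARNINGS)
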
numba_cuda/numba/cuda/cudadrv/libs.py
CHANGED
@@ -18,6 +18,7 @@ from numba.misc.findlib import find_lib
 from numba.cuda.cuda_paths import get_cuda_paths
 from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
 from numba.cuda.cudadrv.error import CudaSupportError
+from numba.core import config
 
 
 if sys.platform == 'win32':
@@ -60,6 +61,24 @@ def get_cudalib(lib, static=False):
     return max(candidates) if candidates else namepattern % lib
 
 
+def get_cuda_include_dir():
+    """
+    Find the path to the CUDA include dir based on a list of default locations.
+    Note that this does not list the `CUDA_INCLUDE_PATH` entry in user
+    configuration.
+    """
+
+    return get_cuda_paths()['include_dir'].info
+
+
+def check_cuda_include_dir(path):
+    if path is None or not os.path.exists(path):
+        raise FileNotFoundError(f"{path} not found")
+
+    if not os.path.exists(os.path.join(path, "cuda_runtime.h")):
+        raise FileNotFoundError(f"Unable to find cuda_runtime.h from {path}")
+
+
 def open_cudalib(lib):
     path = get_cudalib(lib)
     return ctypes.CDLL(path)
@@ -75,6 +94,8 @@ def _get_source_variable(lib, static=False):
         return get_cuda_paths()['nvvm'].by
     elif lib == 'libdevice':
         return get_cuda_paths()['libdevice'].by
+    elif lib == 'include_dir':
+        return get_cuda_paths()['include_dir'].by
     else:
         dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
         return get_cuda_paths()[dir_type].by
@@ -173,4 +194,21 @@ def test():
             print('\tERROR: failed to find %s:\n%s' % (lib, e))
             failed = True
 
+    # Check CUDA include paths
+
+    print("Include directory configuration variable:")
+    print(f"\tCUDA_INCLUDE_PATH={config.CUDA_INCLUDE_PATH}")
+
+    where = _get_source_variable('include_dir')
+    print(f'Finding include directory from {where}')
+    include = get_cuda_include_dir()
+    print('\tLocated at', include)
+    try:
+        print('\tChecking include directory', end='...')
+        check_cuda_include_dir(include)
+        print('\tok')
+    except FileNotFoundError as e:
+        print('\tERROR: failed to find cuda include directory:\n%s' % e)
+        failed = True
+
     return not failed

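The extended diagnostic can be run as before; a minimal sketch (the include checks are appended to the existing library checks and flip the return value on failure):

    from numba.cuda.cudadrv.libs import test

    ok = test()   # now also reports CUDA_INCLUDE_PATH and checks cuda_runtime.h
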
numba_cuda/numba/cuda/cudadrv/nvrtc.py
CHANGED
@@ -1,9 +1,8 @@
 from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
 from enum import IntEnum
-from numba.core import config
 from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
                                       NvrtcSupportError)
-
+from numba.cuda.cuda_paths import get_cuda_paths
 import functools
 import os
 import threading
@@ -233,12 +232,18 @@ def compile(src, name, cc):
     # being optimized away.
     major, minor = cc
     arch = f'--gpu-architecture=compute_{major}{minor}'
-
+
+    cuda_include = [
+        f"-I{get_cuda_paths()['include_dir'].info}",
+    ]
 
     cudadrv_path = os.path.dirname(os.path.abspath(__file__))
     numba_cuda_path = os.path.dirname(cudadrv_path)
     numba_include = f'-I{numba_cuda_path}'
-    options = [arch, numba_include, '-rdc', 'true']
+    options = [arch, *cuda_include, numba_include, '-rdc', 'true']
+
+    if nvrtc.get_version() < (12, 0):
+        options += ["-std=c++17"]
 
     # Compile the program
     compile_error = nvrtc.compile_program(program, options)

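Assuming cc == (8, 0), an include directory of /usr/local/cuda/include, and an NVRTC older than 12.0, the options now passed to nvrtc.compile_program would be (paths illustrative):

    options = [
        '--gpu-architecture=compute_80',
        '-I/usr/local/cuda/include',     # new: CUDA include directory
        '-I/.../numba_cuda/numba/cuda',  # numba_include, as before
        '-rdc', 'true',
        '-std=c++17',                    # new: only added for NVRTC < 12.0
    ]
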
numba_cuda/numba/cuda/dispatcher.py
CHANGED
@@ -1,5 +1,6 @@
 import numpy as np
 import os
+import re
 import sys
 import ctypes
 import functools
@@ -43,6 +44,21 @@ class _Kernel(serialize.ReduceMixin):
     object launches the kernel on the device.
     '''
 
+    NRT_functions = [
+        "NRT_Allocate",
+        "NRT_MemInfo_init",
+        "NRT_MemInfo_new",
+        "NRT_Free",
+        "NRT_dealloc",
+        "NRT_MemInfo_destroy",
+        "NRT_MemInfo_call_dtor",
+        "NRT_MemInfo_data_fast",
+        "NRT_MemInfo_alloc_aligned",
+        "NRT_Allocate_External",
+        "NRT_decref",
+        "NRT_incref"
+    ]
+
     @global_compiler_lock
     def __init__(self, py_func, argtypes, link=None, debug=False,
                  lineinfo=False, inline=False, fastmath=False, extensions=None,
@@ -105,16 +121,20 @@ class _Kernel(serialize.ReduceMixin):
         if self.cooperative:
             lib.needs_cudadevrt = True
 
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        asm = lib.get_asm_str()
+
         res = [fn for fn in cuda_fp16_math_funcs
-               if (f'__numba_wrapper_{fn}' in lib.get_asm_str())]
+               if (f'__numba_wrapper_{fn}' in asm)]
 
         if res:
             # Path to the source containing the foreign function
-            basedir = os.path.dirname(os.path.abspath(__file__))
             functions_cu_path = os.path.join(basedir,
                                              'cpp_function_wrappers.cu')
             link.append(functions_cu_path)
 
+        link = self.maybe_link_nrt(link, tgt_ctx, asm)
+
         for filepath in link:
             lib.add_linking_file(filepath)
 
@@ -136,6 +156,25 @@ class _Kernel(serialize.ReduceMixin):
         self.lifted = []
         self.reload_init = []
 
+    def maybe_link_nrt(self, link, tgt_ctx, asm):
+        if not tgt_ctx.enable_nrt:
+            return link
+
+        all_nrt = "|".join(self.NRT_functions)
+        pattern = (
+            r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
+            + all_nrt + r')\s*\([^)]*\)\s*;'
+        )
+
+        nrt_in_asm = re.findall(pattern, asm)
+
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        if nrt_in_asm:
+            nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
+            link.append(nrt_path)
+
+        return link
+
     @property
     def library(self):
         return self._codelibrary
@@ -385,7 +424,6 @@ class _Kernel(serialize.ReduceMixin):
 
         if isinstance(ty, types.Array):
             devary = wrap_arg(val).to_device(retr, stream)
-
             c_intp = ctypes.c_ssize_t
 
             meminfo = ctypes.c_void_p(0)
@@ -519,7 +557,10 @@ class _LaunchConfiguration:
         self.stream = stream
         self.sharedmem = sharedmem
 
-        if config.CUDA_LOW_OCCUPANCY_WARNINGS:
+        if (
+            config.CUDA_LOW_OCCUPANCY_WARNINGS
+            and not config.DISABLE_PERFORMANCE_WARNINGS
+        ):
             # Warn when the grid has fewer than 128 blocks. This number is
             # chosen somewhat heuristically - ideally the minimum is 2 times
             # the number of SMs, but the number of SMs varies between devices -
@@ -708,8 +749,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
         *args*.
         '''
         cc = get_current_device().compute_capability
-        argtypes = tuple(
-            [self.typingctx.resolve_argument_type(a) for a in args])
+        argtypes = tuple(self.typeof_pyval(a) for a in args)
         if self.specialized:
             raise RuntimeError('Dispatcher already specialized')
 

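A self-contained check of the PTX scan in maybe_link_nrt; the PTX snippet is hand-written for the test (real NVVM output formats the declaration similarly, but not necessarily identically):

    import re

    NRT_functions = ["NRT_Allocate", "NRT_decref", "NRT_incref"]
    pattern = (r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
               + "|".join(NRT_functions) + r')\s*\([^)]*\)\s*;')

    asm = """
    .extern .func (.param .b64 func_retval0) NRT_Allocate
    (
        .param .b64 NRT_Allocate_param_0
    )
    ;
    """
    assert re.findall(pattern, asm) == ['NRT_Allocate']  # nrt.cu gets linked
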
numba_cuda/numba/cuda/runtime/nrt.cu
ADDED
@@ -0,0 +1,190 @@
+#ifndef _NRT_H
+#define _NRT_H
+
+#include <cuda/atomic>
+
+typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
+typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
+
+typedef struct MemInfo NRT_MemInfo;
+
+extern "C" {
+struct MemInfo {
+  cuda::atomic<size_t, cuda::thread_scope_device> refct;
+  NRT_dtor_function dtor;
+  void* dtor_info;
+  void* data;
+  size_t size;
+};
+}
+
+// Globally needed variables
+struct NRT_MemSys {
+  struct {
+    bool enabled;
+    cuda::atomic<size_t, cuda::thread_scope_device> alloc;
+    cuda::atomic<size_t, cuda::thread_scope_device> free;
+    cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
+    cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
+  } stats;
+};
+
+static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
+static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
+extern "C" __device__ void* NRT_Allocate_External(size_t size);
+
+/* The Memory System object */
+__device__ NRT_MemSys* TheMSys;
+
+extern "C" __device__ void* NRT_Allocate(size_t size)
+{
+  void* ptr = NULL;
+  ptr = malloc(size);
+  // if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
+  return ptr;
+}
+
+extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
+                                            void* data,
+                                            size_t size,
+                                            NRT_dtor_function dtor,
+                                            void* dtor_info)
+// NRT_MemSys* TheMSys)
+{
+  mi->refct = 1; /* starts with 1 refct */
+  mi->dtor = dtor;
+  mi->dtor_info = dtor_info;
+  mi->data = data;
+  mi->size = size;
+  // if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
+}
+
+extern "C"
+__device__ NRT_MemInfo* NRT_MemInfo_new(
+  void* data, size_t size, NRT_dtor_function dtor, void* dtor_info)
+{
+  NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo));
+  if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); }
+  return mi;
+}
+
+extern "C" __device__ void NRT_Free(void* ptr)
+{
+  free(ptr);
+  //if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
+}
+
+extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
+{
+  NRT_Free(mi);
+}
+
+extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
+{
+  NRT_dealloc(mi);
+  //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
+}
+extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
+{
+  if (mi->dtor) /* We have a destructor */
+    mi->dtor(mi->data, mi->size, NULL);
+  /* Clear and release MemInfo */
+  NRT_MemInfo_destroy(mi);
+}
+
+extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi)
+{
+  return mi->data;
+}
+
+extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) {
+  NRT_MemInfo *mi = NULL;
+  void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi);
+  if (data == NULL) {
+    return NULL; /* return early as allocation failed */
+  }
+  //NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data));
+  NRT_MemInfo_init(mi, data, size, NULL, NULL);
+  return mi;
+}
+
+static
+__device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align,
+                                                     NRT_MemInfo **mi)
+{
+  size_t offset = 0, intptr = 0, remainder = 0;
+  //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator));
+  char *base = (char *)nrt_allocate_meminfo_and_data(size + 2 * align, mi);
+  if (base == NULL) {
+    return NULL; /* return early as allocation failed */
+  }
+  intptr = (size_t) base;
+  /*
+   * See if the allocation is aligned already...
+   * Check if align is a power of 2, if so the modulo can be avoided.
+   */
+  if((align & (align - 1)) == 0)
+  {
+    remainder = intptr & (align - 1);
+  }
+  else
+  {
+    remainder = intptr % align;
+  }
+  if (remainder == 0){ /* Yes */
+    offset = 0;
+  } else { /* No, move forward `offset` bytes */
+    offset = align - remainder;
+  }
+  return (void*)((char *)base + offset);
+}
+
+static
+__device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) {
+  NRT_MemInfo *mi = NULL;
+  //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator));
+  char *base = (char *)NRT_Allocate_External(sizeof(NRT_MemInfo) + size);
+  if (base == NULL) {
+    *mi_out = NULL; /* set meminfo to NULL as allocation failed */
+    return NULL; /* return early as allocation failed */
+  }
+  mi = (NRT_MemInfo *) base;
+  *mi_out = mi;
+  return (void*)((char *)base + sizeof(NRT_MemInfo));
+}
+
+extern "C" __device__ void* NRT_Allocate_External(size_t size) {
+  void *ptr = NULL;
+  ptr = malloc(size);
+  //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));
+
+  //if (TheMSys.stats.enabled)
+  //{
+  //  TheMSys.stats.alloc++;
+  //}
+  return ptr;
+}
+
+
+/*
+  c++ version of the NRT_decref function that usually is added to
+  the final kernel link in PTX form by numba. This version may be
+  used by c++ APIs that accept ownership of live objects and must
+  manage them going forward.
+*/
+extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
+{
+  if (mi != NULL) {
+    mi->refct--;
+    if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); }
+  }
+}
+
+#endif
+
+extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)
+{
+  if (mi != NULL) {
+    mi->refct++;
+  }
+}

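The alignment helper over-allocates by 2 * align bytes and rounds the data pointer up. The arithmetic, checked in Python with a hypothetical address and a power-of-two alignment:

    align = 16
    base = 0x7f0000000008                 # pretend allocator return address
    remainder = base & (align - 1)        # 8: power-of-two branch
    offset = 0 if remainder == 0 else align - remainder
    assert (base + offset) % align == 0   # data lands on a 16-byte boundary
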
numba_cuda/numba/cuda/simulator/api.py
CHANGED
@@ -35,6 +35,20 @@ class stream(object):
         pass
 
 
+# Default stream APIs. Since execution from the perspective of the host is
+# synchronous in the simulator, these can be the same as the stream class.
+default_stream = stream
+legacy_default_stream = stream
+per_thread_default_stream = stream
+
+
+# There is no way to use external streams with the simulator. Since the
+# implementation is not really using streams, we can't meaningfully interact
+# with external ones.
+def external_stream(ptr):
+    raise RuntimeError("External streams are unsupported in the simulator")
+
+
 def synchronize():
     pass
 

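With these aliases, stream-based code runs unchanged under the simulator (NUMBA_ENABLE_CUDASIM=1); a minimal sketch:

    from numba import cuda

    s = cuda.default_stream()   # the simulator's no-op stream class
    s.synchronize()             # host execution is already synchronous
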
numba_cuda/numba/cuda/tests/cudapy/test_print.py
CHANGED
@@ -126,8 +126,8 @@ class TestPrint(CUDATestCase):
 
     def test_bool(self):
         output, _ = self.run_code(printbool_usecase)
-        expected = "True\nFalse\nTrue\nTrue\nFalse\nFalse"
-        self.assertEqual(output.strip(), expected)
+        expected = "True\r?\nFalse\r?\nTrue\r?\nTrue\r?\nFalse\r?\nFalse"
+        self.assertRegex(output.strip(), expected)
 
     def test_printempty(self):
         output, _ = self.run_code(printempty_usecase)

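The looser pattern exists because output captured on Windows contains CRLF line endings; a quick standalone check of the regex:

    import re

    pattern = "True\r?\nFalse"
    assert re.search(pattern, "True\nFalse")     # Linux-style output
    assert re.search(pattern, "True\r\nFalse")   # Windows-style output
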
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py
ADDED
@@ -0,0 +1,48 @@
+from numba.cuda.testing import (skip_on_cudasim, skip_unless_cudasim, unittest,
+                                CUDATestCase)
+from numba import cuda
+
+# Basic tests that stream APIs execute on the hardware and in the simulator.
+#
+# Correctness of semantics is exercised elsewhere in the test suite (though we
+# could improve the comprehensiveness of testing by adding more correctness
+# tests here in future).
+
+
+class TestStreamAPI(CUDATestCase):
+    def test_stream_create_and_sync(self):
+        s = cuda.stream()
+        s.synchronize()
+
+    def test_default_stream_create_and_sync(self):
+        s = cuda.default_stream()
+        s.synchronize()
+
+    def test_legacy_default_stream_create_and_sync(self):
+        s = cuda.legacy_default_stream()
+        s.synchronize()
+
+    def test_ptd_stream_create_and_sync(self):
+        s = cuda.per_thread_default_stream()
+        s.synchronize()
+
+    @skip_on_cudasim("External streams are unsupported on the simulator")
+    def test_external_stream_create(self):
+        # A dummy pointer value
+        ptr = 0x12345678
+        s = cuda.external_stream(ptr)
+        # We don't test synchronization on the stream because it's not a real
+        # stream - we used a dummy pointer for testing the API, so we just
+        # ensure that the stream handle matches the external stream pointer.
+        self.assertEqual(ptr, s.handle.value)
+
+    @skip_unless_cudasim("External streams are usable with hardware")
+    def test_external_stream_simulator_unavailable(self):
+        ptr = 0x12345678
+        msg = "External streams are unsupported in the simulator"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            cuda.external_stream(ptr)
+
+
+if __name__ == '__main__':
+    unittest.main()

numba_cuda/numba/cuda/tests/nrt/mock_numpy.py
ADDED
@@ -0,0 +1,42 @@
+
+from numba.core import errors, types
+from numba.core.extending import overload
+from numba.np.arrayobj import (_check_const_str_dtype, is_nonelike,
+                               ty_parse_dtype, ty_parse_shape, numpy_empty_nd)
+
+
+# Typical tests for allocation use array construction (e.g. np.zeros, np.empty,
+# etc.) to induce allocations. These don't work in the CUDA target because they
+# need keyword arguments, which are presently not supported properly in the
+# CUDA target.
+#
+# To work around this, we can define our own function, that works like
+# the desired one, except that it uses only positional arguments.
+#
+# Once the CUDA target supports keyword arguments, this workaround will no
+# longer be necessary and the tests in this module should be switched to use
+# the relevant NumPy functions instead.
+def cuda_empty(shape, dtype):
+    pass
+
+
+@overload(cuda_empty)
+def ol_cuda_empty(shape, dtype):
+    _check_const_str_dtype("empty", dtype)
+    if (dtype is float or
+            (isinstance(dtype, types.Function) and dtype.typing_key is float) or
+            is_nonelike(dtype)):  # default
+        nb_dtype = types.double
+    else:
+        nb_dtype = ty_parse_dtype(dtype)
+
+    ndim = ty_parse_shape(shape)
+    if nb_dtype is not None and ndim is not None:
+        retty = types.Array(dtype=nb_dtype, ndim=ndim, layout='C')
+
+        def impl(shape, dtype):
+            return numpy_empty_nd(shape, dtype, retty)
+        return impl
+    else:
+        msg = f"Cannot parse input types to function np.empty({shape}, {dtype})"
+        raise errors.TypingError(msg)

numba_cuda/numba/cuda/tests/nrt/test_nrt.py
ADDED
@@ -0,0 +1,110 @@
+import re
+import gc
+import numpy as np
+import unittest
+from unittest.mock import patch
+from numba.core.runtime import rtsys
+from numba.tests.support import EnableNRTStatsMixin
+from numba.cuda.testing import CUDATestCase
+
+from .mock_numpy import cuda_empty
+
+from numba import cuda
+
+
+class TestNrtRefCt(EnableNRTStatsMixin, CUDATestCase):
+
+    def setUp(self):
+        # Clean up any NRT-backed objects hanging in a dead reference cycle
+        gc.collect()
+        super(TestNrtRefCt, self).setUp()
+
+    @unittest.expectedFailure
+    def test_no_return(self):
+        """
+        Test issue #1291
+        """
+        n = 10
+
+        @cuda.jit
+        def kernel():
+            for i in range(n):
+                temp = cuda_empty(2, np.float64)  # noqa: F841
+            return None
+
+        init_stats = rtsys.get_allocation_stats()
+
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            kernel[1,1]()
+        cur_stats = rtsys.get_allocation_stats()
+        self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
+        self.assertEqual(cur_stats.free - init_stats.free, n)
+
+
+class TestNrtBasic(CUDATestCase):
+    def test_nrt_launches(self):
+        @cuda.jit
+        def f(x):
+            return x[:5]
+
+        @cuda.jit
+        def g():
+            x = cuda_empty(10, np.int64)
+            f(x)
+
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            g[1,1]()
+        cuda.synchronize()
+
+    def test_nrt_ptx_contains_refcount(self):
+        @cuda.jit
+        def f(x):
+            return x[:5]
+
+        @cuda.jit
+        def g():
+            x = cuda_empty(10, np.int64)
+            f(x)
+
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            g[1,1]()
+
+        ptx = next(iter(g.inspect_asm().values()))
+
+        # The following checks that a `call` PTX instruction is
+        # emitted for NRT_MemInfo_alloc_aligned, NRT_incref and
+        # NRT_decref
+        p1 = r"call\.uni(.|\n)*NRT_MemInfo_alloc_aligned"
+        match = re.search(p1, ptx)
+        assert match is not None
+
+        p2 = r"call\.uni.*\n.*NRT_incref"
+        match = re.search(p2, ptx)
+        assert match is not None
+
+        p3 = r"call\.uni.*\n.*NRT_decref"
+        match = re.search(p3, ptx)
+        assert match is not None
+
+    def test_nrt_returns_correct(self):
+        @cuda.jit
+        def f(x):
+            return x[5:]
+
+        @cuda.jit
+        def g(out_ary):
+            x = cuda_empty(10, np.int64)
+            x[5] = 1
+            y = f(x)
+            out_ary[0] = y[0]
+
+        out_ary = np.zeros(1, dtype=np.int64)
+
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            g[1,1](out_ary)
+
+        self.assertEqual(out_ary[0], 1)
+
+
+if __name__ == '__main__':
+    unittest.main()

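The tests patch numba.config with create=True because CUDA_ENABLE_NRT is not a config attribute that exists by default; the pattern in isolation:

    from unittest.mock import patch
    import numba

    with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
        assert numba.config.CUDA_ENABLE_NRT   # attribute exists only here
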
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py
CHANGED
@@ -2,6 +2,7 @@
 
 import argparse
 import pathlib
+import platform
 import subprocess
 import sys
 
@@ -56,7 +57,13 @@ def determine_include_flags():
        print(f"Unexpected return code ({rc}) from `nvcc -v`. Expected 1.")
        return None
 
-    output = cp.stderr.decode()
+    # NVCC writes to stdout on Windows and stderr on Linux
+    if platform.system() == 'Windows':
+        stream = cp.stdout
+    else:
+        stream = cp.stderr
+
+    output = stream.decode()
     lines = output.splitlines()
 
     includes_lines = [line for line in lines if line.startswith("#$ INCLUDES=")]

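For reference, the nvcc dry-run line being parsed looks like the following (path illustrative; the exact formatting is an assumption based on the "#$ INCLUDES=" prefix used in the filter):

    #$ INCLUDES="-I/usr/local/cuda/targets/x86_64-linux/include"

and the selection logic reduces to (with cp from subprocess.run(['nvcc', '-v', ...], capture_output=True)):

    import platform
    stream = cp.stdout if platform.system() == 'Windows' else cp.stderr
    includes_lines = [line for line in stream.decode().splitlines()
                      if line.startswith("#$ INCLUDES=")]
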
{numba_cuda-0.0.18.dist-info → numba_cuda-0.0.19.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 _numba_cuda_redirector.pth,sha256=cmfMMmV0JPh3yEpl4bGeM9AuXiVVMSo6Z_b7RaQL3XE,30
 _numba_cuda_redirector.py,sha256=rc56rnb40w3AtrqnhS66JSgYTSTsi3iTn8yP3NuoQV8,2401
-numba_cuda/VERSION,sha256=
+numba_cuda/VERSION,sha256=K2Wn4BRtrXcEkuPZYGGM_h_Orgai6flc272777m5MYQ,7
 numba_cuda/__init__.py,sha256=atXeUvJKR3JHcAiCFbXCVOJQUHgB1TulmsqSL_9RT3Q,114
 numba_cuda/_version.py,sha256=jbdUsbR7sVllw0KxQNB0-FMd929CGg3kH2fhHdrlkuc,719
 numba_cuda/numba/cuda/__init__.py,sha256=idyVHOObC9lTYnp62v7rVprSacRM4d5F6vhXfG5ElTI,621
@@ -13,7 +13,7 @@ numba_cuda/numba/cuda/compiler.py,sha256=47SjuI5p4yWCujAglIq0Cb0ARO8QxRp4fOZropk
 numba_cuda/numba/cuda/cpp_function_wrappers.cu,sha256=iv84_F6Q9kFjV_kclrQz1msh6Dud8mI3qNkswTid7Qc,953
 numba_cuda/numba/cuda/cuda_fp16.h,sha256=1IC0mdNdkvKbvAe0-f4uYVS7WFrVqOyI1nRUbBiqr6A,126844
 numba_cuda/numba/cuda/cuda_fp16.hpp,sha256=vJ7NUr2X2tKhAP7ojydAiCoOjVO6n4QGoXD6m9Srrlw,89130
-numba_cuda/numba/cuda/cuda_paths.py,sha256=
+numba_cuda/numba/cuda/cuda_paths.py,sha256=wwZKOUS0FyZloRUgDVDPPCwtm3t6Js7U369_YgMpEC0,9859
 numba_cuda/numba/cuda/cudadecl.py,sha256=ynUidit8oPGjedc6p1miMGtS20DOji3DiQHzwmx6m0s,23192
 numba_cuda/numba/cuda/cudaimpl.py,sha256=3YMxQSCv2KClBrpuXGchrTNICV1F6NIjjL2rie5fDZ4,38628
 numba_cuda/numba/cuda/cudamath.py,sha256=EFNtdzEytAZuwijdRoFGzVKCeal76UzzaNy7wUFQx8I,3978
@@ -21,7 +21,7 @@ numba_cuda/numba/cuda/decorators.py,sha256=qSpir16-jPYSe2YuRZ6g9INeobmsMNg6ab9IZ
 numba_cuda/numba/cuda/descriptor.py,sha256=rNMaurJkjNjIBmHPozDoLC35DMURE0fn_LtnXRmaG_w,985
 numba_cuda/numba/cuda/device_init.py,sha256=lP79tCsQ0Np9xcbjv_lXcH4JOiVZvV8nwg3INdETxsc,3586
 numba_cuda/numba/cuda/deviceufunc.py,sha256=yxAH71dpgJWK8okmCJm0FUV6z2AqdThCYOTZspT7z0M,30775
-numba_cuda/numba/cuda/dispatcher.py,sha256=
+numba_cuda/numba/cuda/dispatcher.py,sha256=1ND28o_YeP_0YS2iFYwCH9Byc87qTvCVKjT7PHu2Fsg,41233
 numba_cuda/numba/cuda/errors.py,sha256=XwWHzCllx0DXU6BQdoRH0m3pznGxnTFOBTVYXMmCfqg,1724
 numba_cuda/numba/cuda/extending.py,sha256=URsyBYls2te-mgE0yvDY6akvawYCA0blBFfD7Lf9DO4,142
 numba_cuda/numba/cuda/initialize.py,sha256=TQGHGLQoq4ch4J6CLDcJdGsZzXM-g2kDgdyO1u-Rbhg,546
@@ -38,33 +38,34 @@ numba_cuda/numba/cuda/printimpl.py,sha256=Y1BCQ7EgO2wQ7O6LibNVYBG3tmjVTvmURATW40
 numba_cuda/numba/cuda/random.py,sha256=khX8iDdde_RTUPWhAqrxZacHRQAorFr7BokPuxRWzrg,10456
 numba_cuda/numba/cuda/simulator_init.py,sha256=W_bPRtmPGOQVuiprbgt7ENnnnELv_LPCeLDIsfsvFZ8,460
 numba_cuda/numba/cuda/stubs.py,sha256=W3tozv4ganMnfbdFqyPjgQXYeX8GQhwx_xXgv8jk6iM,22270
-numba_cuda/numba/cuda/target.py,sha256=
+numba_cuda/numba/cuda/target.py,sha256=hBflzmxCGlmTugWT1sYhZj9f4HkQAMK2RQ9lO85pMW4,17052
 numba_cuda/numba/cuda/testing.py,sha256=E0wP2vfno1yWsl0v1zg31kpbU8FrKxTF-5y9Iv4WjA4,6412
 numba_cuda/numba/cuda/types.py,sha256=WVfjcly_VUpG9FfKueiEPzZm2NV8Hg0XAFg3bNzPdVc,1314
 numba_cuda/numba/cuda/ufuncs.py,sha256=txw27IxG80W1Yo7e-XwL2AMcQo0fMnxMjBIMy-n5pCo,23317
 numba_cuda/numba/cuda/vector_types.py,sha256=s18dY0IUpT-RcaBvQsa_zEbYuuL2IT0Vh6afCeccwmQ,6750
 numba_cuda/numba/cuda/vectorizers.py,sha256=u_0EzaD5tqVH8uOz4Gmqn3FgPC1rckwDAQuROm0BXm8,8915
 numba_cuda/numba/cuda/cudadrv/__init__.py,sha256=0TL4MZcJXUoo9qA7uu0vLv7eHrXRerVmyfi7O149ITw,199
-numba_cuda/numba/cuda/cudadrv/devicearray.py,sha256=
+numba_cuda/numba/cuda/cudadrv/devicearray.py,sha256=06kM7iFcx1TYiFhs1o9r1kyoA3k5yS7mFAdZDf6nrxA,31215
 numba_cuda/numba/cuda/cudadrv/devices.py,sha256=6SneNmoq83gue0txFWWx4A65vViAa8xA06FzkApoqAk,7992
 numba_cuda/numba/cuda/cudadrv/driver.py,sha256=uPjKugdtSJfIwVSAo3KgkvQhctbABkQphHAfcq6Q7ec,110892
 numba_cuda/numba/cuda/cudadrv/drvapi.py,sha256=52ms3X6hfPaQB8E1jb6g7QKqRvHzBMlDQ-V2DM1rXxQ,17178
 numba_cuda/numba/cuda/cudadrv/dummyarray.py,sha256=nXRngdr-k3h_BNGQuJUxmp89yGNWxqEDJedpwDPEZ44,14209
 numba_cuda/numba/cuda/cudadrv/enums.py,sha256=37zZmyrLvT-7R8wWtwKJkQhN8siLMxsDGiA3_NQ-yx8,23740
 numba_cuda/numba/cuda/cudadrv/error.py,sha256=zEIryW6aIy8GG4ypmTliB6RgY4Gy2n8ckz7I6W99LUM,524
-numba_cuda/numba/cuda/cudadrv/libs.py,sha256=
+numba_cuda/numba/cuda/cudadrv/libs.py,sha256=Gk9zQ1CKcsZsWl-_9QneXeP9VH5q5R1I3Cx043UOytk,7240
 numba_cuda/numba/cuda/cudadrv/linkable_code.py,sha256=Q_YTv0apBo9t8pkMlKrthPPSVeLd376ZTmVDF5NtVVo,1328
 numba_cuda/numba/cuda/cudadrv/mappings.py,sha256=-dTPHvAkDjdH6vS5OjgrB71AFuqKO6CRgf7hpOk2wiw,802
 numba_cuda/numba/cuda/cudadrv/ndarray.py,sha256=HtULWWFyDlgqvrH5459yyPTvU4UbUo2DSdtcNfvbH00,473
-numba_cuda/numba/cuda/cudadrv/nvrtc.py,sha256=
+numba_cuda/numba/cuda/cudadrv/nvrtc.py,sha256=rv-XQo0snJj4xyEbfeBqivziIxCwMOQzIIEOnvLQaJI,9825
 numba_cuda/numba/cuda/cudadrv/nvvm.py,sha256=v2hJJTAQeRmoG59-hnhgMEp5BSVA73QHtEoy636VKao,24107
 numba_cuda/numba/cuda/cudadrv/rtapi.py,sha256=WdeUoWzsYNYodx8kMRLVIjnNs0QzwpCihd2Q0AaqItE,226
 numba_cuda/numba/cuda/cudadrv/runtime.py,sha256=Tj9ACrzQqNmDSO6xfpzw12EsQknSywQ-ZGuWMbDdHnQ,4255
 numba_cuda/numba/cuda/kernels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 numba_cuda/numba/cuda/kernels/reduction.py,sha256=fQnaWtoNB2yp143MNbE1DujqFIYy0KV_2moQVvbaROU,9362
 numba_cuda/numba/cuda/kernels/transpose.py,sha256=5FSu-nbTfhintxwfU-bjT2px2otQF5QkKH-JPDDWq_k,2061
+numba_cuda/numba/cuda/runtime/nrt.cu,sha256=i8Xcf-x84n3uNPzs_xak4c_sLHOH91ast2aE6DKKf9Q,5497
 numba_cuda/numba/cuda/simulator/__init__.py,sha256=crW0VQ_8e7DMRSHKoAIziZ37ea5mpbh_49tR9M3d5YY,1610
-numba_cuda/numba/cuda/simulator/api.py,sha256=
+numba_cuda/numba/cuda/simulator/api.py,sha256=K_fX-w9X4grGx2IAp0XlBW9rth5l7wibMwinQvkE7Jc,3237
 numba_cuda/numba/cuda/simulator/compiler.py,sha256=eXnvmzSKzIZZzBz6ZFJ-vMNyRAgqbCiB-AO5IJXuUyM,232
 numba_cuda/numba/cuda/simulator/kernel.py,sha256=GO4HuXBlEstJtgiuMRB_6hjNizBSINR9_hganvMjHH4,10593
 numba_cuda/numba/cuda/simulator/kernelapi.py,sha256=ZYC_XQqnA51TJCPlAjVHHkOjXeww0yUP6JZeibXw3T8,12397
@@ -175,7 +176,7 @@ numba_cuda/numba/cuda/tests/cudapy/test_operator.py,sha256=0nJej4D898_JU-jhlif44
 numba_cuda/numba/cuda/tests/cudapy/test_optimization.py,sha256=SvqRsSFgcGxkFDZS-kul5B-mi8GxINTS98uUzAy4dhw,2647
 numba_cuda/numba/cuda/tests/cudapy/test_overload.py,sha256=u4yUDVFcV9E3NWMlNjM81e3IW4KaIkcDtXig8JYevsw,8538
 numba_cuda/numba/cuda/tests/cudapy/test_powi.py,sha256=TI82rYRnkSnwv9VN6PMpBnr9JqMJ_F3HhH4cKY6O8tw,3276
-numba_cuda/numba/cuda/tests/cudapy/test_print.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_print.py,sha256=r2xmMNx80_ANi3uFB3CQt3AHAXG_JdhStY1S796hlK0,4466
 numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py,sha256=R88Vfgg3mSAZ0Jy6WT6dJNmkFTsxnVnEmO7XqpqyxuU,986
 numba_cuda/numba/cuda/tests/cudapy/test_random.py,sha256=rLw7_8a7BBhD_8GNqMal0l_AbWXzLs_Q0hC6_X8gdjA,3467
 numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py,sha256=grR64kdRlsLcR0K3IxSfI2VKsTrrqxsXuROOpvj-6nw,18769
@@ -186,6 +187,7 @@ numba_cuda/numba/cuda/tests/cudapy/test_serialize.py,sha256=alE5-lTwbjz3Tv6OvQPS
 numba_cuda/numba/cuda/tests/cudapy/test_slicing.py,sha256=bAh_sIk5V9_0_dOVGdzmyjwZkHMLjEbQuEI4e5zRMoU,903
 numba_cuda/numba/cuda/tests/cudapy/test_sm.py,sha256=kh1F0wwQ2_bd54Q4GUX99y2oiWHQwBpyC__ckk-jiTU,14575
 numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py,sha256=bTXDjU94ezo6Bz_lktlPyowTcJHBOWfy7-nJB9e-B_s,7231
+numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py,sha256=alwSPm2xLvuYEwzpuCE6UUkOp6xcEoVqZjyJk3VJjtY,1743
 numba_cuda/numba/cuda/tests/cudapy/test_sync.py,sha256=Y851UqNkT80U9q_C05SQfvPRCY7jjRARHOMk6g0lU4Y,7837
 numba_cuda/numba/cuda/tests/cudapy/test_transpose.py,sha256=JAQX2EUHwlpKCfJDGspaldmsIRbHxnXpsNUrvRrnIEE,3134
 numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py,sha256=-ehvkxelr45aT8sUNL9Hq8cn2GU_K4GL1yWeX-rHqEM,9680
@@ -227,12 +229,15 @@ numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py,sha256=o4DYocyHK7
 numba_cuda/numba/cuda/tests/nocuda/test_import.py,sha256=teiL8rpFGQOh41kyBSSNHHFYAJYgpdStXkTcpK4_fxo,1641
 numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py,sha256=7kJOPHEcrjy_kTA9Ym-iT_B972bgFRu3UkRtwIgWtuI,7948
 numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py,sha256=n0_-xFaw6QqiZbhe55oy7lnEeOwqTvA55p5EUFiTpNw,2006
+numba_cuda/numba/cuda/tests/nrt/__init__.py,sha256=43EXdiXXRBd6yIcVGMrU9F_EJCD9Uw3mzOP3SB53AEE,260
+numba_cuda/numba/cuda/tests/nrt/mock_numpy.py,sha256=Qtn52GoKZ_ydre3oqkLWVdImC37tuPClUy4uHSutaJo,1568
+numba_cuda/numba/cuda/tests/nrt/test_nrt.py,sha256=Ox6ei2DldvSSS-CndTXRxLnsvWdteOQNgn6GvKHB244,2789
 numba_cuda/numba/cuda/tests/test_binary_generation/Makefile,sha256=OFC_6irwscCNGAyJJKq7fTchzWosCUuiVWU02m0bcUQ,2248
-numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py,sha256=
+numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py,sha256=V0raLZLGSiWbE_K-JluI0CnmNkXbhlMVj-TH7P1OV8E,5014
 numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu,sha256=cUf-t6ZM9MK_x7X_aKwsrKW1LdR97XcpR-qnYr5faOE,453
 numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu,sha256=q3oxZziT8KDodeNcEBiWULH6vMrHCWucmJmtrg8C0d0,128
-numba_cuda-0.0.
-numba_cuda-0.0.
-numba_cuda-0.0.
-numba_cuda-0.0.
-numba_cuda-0.0.
+numba_cuda-0.0.19.dist-info/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
+numba_cuda-0.0.19.dist-info/METADATA,sha256=GAWms3JiCaxTzo4WMk-5h31_Oqo8YFPgekLKFR_YfqA,1393
+numba_cuda-0.0.19.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+numba_cuda-0.0.19.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
+numba_cuda-0.0.19.dist-info/RECORD,,

{numba_cuda-0.0.18.dist-info → numba_cuda-0.0.19.dist-info}/LICENSE
File without changes

{numba_cuda-0.0.18.dist-info → numba_cuda-0.0.19.dist-info}/top_level.txt
File without changes