PyPI - numba-cuda - Versions diffs - 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl - Mend

numba-cuda 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

numba_cuda/VERSION +1 -1
numba_cuda/numba/cuda/compiler.py +180 -10
numba_cuda/numba/cuda/cuda_paths.py +70 -0
numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
numba_cuda/numba/cuda/cudadrv/libs.py +38 -0
numba_cuda/numba/cuda/cudadrv/nvrtc.py +9 -4
numba_cuda/numba/cuda/dispatcher.py +54 -15
numba_cuda/numba/cuda/runtime/nrt.cu +190 -0
numba_cuda/numba/cuda/simulator/api.py +14 -0
numba_cuda/numba/cuda/target.py +4 -0
numba_cuda/numba/cuda/tests/cudapy/test_debug.py +2 -4
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +1 -0
numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +3 -10
numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +1 -2
numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -2
numba_cuda/numba/cuda/tests/cudapy/test_print.py +2 -2
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +52 -0
numba_cuda/numba/cuda/tests/nrt/__init__.py +8 -0
numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +42 -0
numba_cuda/numba/cuda/tests/nrt/test_nrt.py +110 -0
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +8 -1
{numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/METADATA +12 -8
{numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/RECORD +27 -22
{numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/WHEEL +1 -1
{numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/LICENSE +0 -0
{numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/runtime/nrt.cu ADDED Viewed

@@ -0,0 +1,190 @@
+#ifndef _NRT_H
+#define _NRT_H
+#include <cuda/atomic>
+typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
+typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
+typedef struct MemInfo NRT_MemInfo;
+extern "C" {
+/* Reference-counted header describing one runtime allocation. */
+struct MemInfo {
+  /* Reference count; a device-scope atomic, set to 1 by NRT_MemInfo_init. */
+  cuda::atomic<size_t, cuda::thread_scope_device> refct;
+  /* Optional destructor run by NRT_MemInfo_call_dtor; NULL means "none". */
+  NRT_dtor_function dtor;
+  /* Opaque pointer stored alongside dtor by NRT_MemInfo_init. */
+  void* dtor_info;
+  /* Payload pointer; this is what NRT_MemInfo_data_fast returns. */
+  void* data;
+  /* Size recorded for the payload; handed to dtor as its `size` argument. */
+  size_t size;
+};
+}
+// Globally needed variables
+/*
+ * Allocation statistics container. Every use of these counters in this file
+ * is currently commented out, so nothing here updates them.
+ */
+struct NRT_MemSys {
+  struct {
+    /* Gate checked by the (commented-out) accounting code before counting. */
+    bool enabled;
+    /* Counter intended for NRT_Allocate / NRT_Allocate_External calls. */
+    cuda::atomic<size_t, cuda::thread_scope_device> alloc;
+    /* Counter intended for NRT_Free calls. */
+    cuda::atomic<size_t, cuda::thread_scope_device> free;
+    /* Counter intended for NRT_MemInfo_init calls. */
+    cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
+    /* Counter intended for NRT_MemInfo_destroy calls. */
+    cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
+  } stats;
+};
+/*
+ * Forward declarations: these helpers are defined further down but are
+ * called earlier in the file (e.g. from NRT_MemInfo_alloc_aligned).
+ */
+static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
+static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
+extern "C" __device__ void* NRT_Allocate_External(size_t size);
+/* The Memory System object */
+/* NOTE(review): nothing in this file assigns TheMSys, and every use of it is
+   commented out -- confirm where it is initialised before enabling stats. */
+__device__ NRT_MemSys* TheMSys;
+/*
+ * Allocates `size` bytes with the device-side malloc and returns the result
+ * unchanged; callers in this file treat a NULL result as allocation failure.
+ */
+extern "C" __device__ void* NRT_Allocate(size_t size)
+{
+  // Allocation statistics are currently disabled:
+  //  if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
+  return malloc(size);
+}
+/*
+ * Fills in an already-allocated MemInfo header.
+ *
+ *   mi        - header to initialise (must not be NULL; it is dereferenced)
+ *   data      - payload pointer to record
+ *   size      - payload size to record
+ *   dtor      - destructor to record, or NULL for none
+ *   dtor_info - opaque pointer recorded next to dtor
+ */
+extern "C" __device__ void NRT_MemInfo_init(
+  NRT_MemInfo* mi, void* data, size_t size, NRT_dtor_function dtor, void* dtor_info)
+//  (disabled extra parameter) NRT_MemSys* TheMSys)
+{
+  /* Record the payload description and the destructor pair... */
+  mi->data      = data;
+  mi->size      = size;
+  mi->dtor      = dtor;
+  mi->dtor_info = dtor_info;
+  /* ...and hand the first reference to the creator. */
+  mi->refct     = 1;
+  // Allocation statistics are currently disabled:
+  //  if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
+}
+/*
+ * Allocates a standalone MemInfo header via NRT_Allocate and initialises it
+ * to describe `data`/`size` with the given destructor pair. Returns NULL when
+ * the header allocation fails.
+ */
+extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new(void* data,
+                                                   size_t size,
+                                                   NRT_dtor_function dtor,
+                                                   void* dtor_info)
+{
+  void* raw = NRT_Allocate(sizeof(NRT_MemInfo));
+  if (raw == NULL) { return NULL; }
+  NRT_MemInfo* header = (NRT_MemInfo*)raw;
+  NRT_MemInfo_init(header, data, size, dtor, dtor_info);
+  return header;
+}
+/* Releases `ptr` with the device-side free(); the pointer is passed through
+   unchanged. */
+extern "C" __device__ void NRT_Free(void* ptr)
+{
+  free(ptr);
+  // Free statistics are currently disabled:
+  //if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
+}
+/* Frees the memory block that starts at the MemInfo header `mi` by passing it
+   to NRT_Free. */
+extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
+{
+  NRT_Free(mi);
+}
+/* Destroys a MemInfo by delegating to NRT_dealloc. The destructor stored in
+   `mi` is NOT run here; NRT_MemInfo_call_dtor does that before calling this. */
+extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
+{
+  NRT_dealloc(mi);
+  // MemInfo-free statistics are currently disabled:
+  //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
+}
+/*
+ * Runs the destructor recorded in `mi` (if any) and then releases the
+ * MemInfo itself.
+ *
+ * The destructor receives the payload pointer, the recorded size and the
+ * `dtor_info` pointer that was registered together with it in
+ * NRT_MemInfo_init. `mi` must not be NULL and must not be used afterwards.
+ */
+extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
+{
+  if (mi->dtor) {
+    /* Forward the stored context pointer; passing NULL here would hide the
+       state the destructor was registered with. */
+    mi->dtor(mi->data, mi->size, mi->dtor_info);
+  }
+  /* Clear and release MemInfo */
+  NRT_MemInfo_destroy(mi);
+}
+/* Returns the payload pointer stored in `mi`. No NULL check is performed on
+   `mi`. */
+extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi)
+{
+  return mi->data;
+}
+/*
+ * Allocates a MemInfo header and an aligned payload of `size` bytes in one
+ * block, initialises the header with no destructor, and returns it.
+ * Returns NULL when the underlying allocation fails.
+ */
+extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) {
+    NRT_MemInfo *header = NULL;
+    void *payload = nrt_allocate_meminfo_and_data_align(size, align, &header);
+    //NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data));
+    if (payload != NULL) {
+        NRT_MemInfo_init(header, payload, size, NULL, NULL);
+        return header;
+    }
+    /* allocation failed */
+    return NULL;
+}
+/*
+ * Allocates one block holding a MemInfo header followed by a payload of at
+ * least `size` bytes, and returns a pointer into the payload area whose
+ * address is a multiple of `align`.
+ *
+ *   size  - number of payload bytes required
+ *   align - requested alignment in bytes; 0 is treated as 1 (no constraint)
+ *   mi    - out-parameter receiving the header pointer (NULL on failure)
+ *
+ * Returns NULL if the request would overflow size_t or the allocation fails.
+ */
+static
+__device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align,
+                                          NRT_MemInfo **mi)
+{
+    /* align == 0 would make `align - 1` wrap around and yield an offset that
+       points outside the allocation, so normalise it first. */
+    const size_t alignment = (align == 0) ? 1 : (size_t) align;
+    /* Padding is computed in size_t so that large alignments cannot overflow
+       32-bit unsigned arithmetic. */
+    const size_t padding = 2 * alignment;
+    const size_t overhead = padding + sizeof(NRT_MemInfo);
+    if (size > (size_t) -1 - overhead) {
+        *mi = NULL;  /* request too large to represent */
+        return NULL;
+    }
+    //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator));
+    char *base = (char *)nrt_allocate_meminfo_and_data(size + padding, mi);
+    if (base == NULL) {
+        return NULL; /* return early as allocation failed */
+    }
+    const size_t intptr = (size_t) base;
+    /*
+     * See if the allocation is aligned already...
+     * When the alignment is a power of 2 the modulo can be replaced by a mask.
+     */
+    size_t remainder;
+    if ((alignment & (alignment - 1)) == 0) {
+        remainder = intptr & (alignment - 1);
+    } else {
+        remainder = intptr % alignment;
+    }
+    /* Already aligned: no shift. Otherwise move forward to the next multiple. */
+    const size_t offset = (remainder == 0) ? 0 : (alignment - remainder);
+    return (void*)(base + offset);
+}
+/*
+ * Allocates a single block of sizeof(NRT_MemInfo) + `size` bytes.
+ * The header pointer (start of the block) is written to `*mi_out`; the
+ * return value points just past the header. On allocation failure both
+ * `*mi_out` and the return value are NULL.
+ */
+static
+__device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) {
+    //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator));
+    char *block = (char *)NRT_Allocate_External(sizeof(NRT_MemInfo) + size);
+    /* The header lives at the start of the block; a failed allocation
+       therefore publishes NULL through the out-parameter as well. */
+    *mi_out = (NRT_MemInfo *) block;
+    if (block == NULL) {
+        return NULL;
+    }
+    return (void*)(block + sizeof(NRT_MemInfo));
+}
+/*
+ * Allocates `size` bytes with the device-side malloc and returns the result
+ * unchanged (NULL is treated as failure by the callers in this file).
+ */
+extern "C" __device__ void* NRT_Allocate_External(size_t size) {
+    void *block = malloc(size);
+    //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));
+    // Allocation statistics are currently disabled:
+    //if (TheMSys.stats.enabled)
+    //{
+    //    TheMSys.stats.alloc++;
+    //}
+    return block;
+}
+/*
+  c++ version of the NRT_decref function that usually is added to
+  the final kernel link in PTX form by numba. This version may be
+  used by c++ APIs that accept ownership of live objects and must
+  manage them going forward.
+
+  Drops one reference to `mi`; a NULL `mi` is ignored. The decrement and
+  the zero test are performed as ONE atomic read-modify-write, so when
+  several threads release the same MemInfo concurrently exactly one of
+  them observes the transition to zero and runs the destructor, and no
+  thread reads `mi` after another thread may already have freed it.
+*/
+extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
+{
+  if (mi == NULL) { return; }
+  /* fetch_sub returns the value held before the decrement: 1 means this
+     call released the last reference. */
+  if (mi->refct.fetch_sub(1) == 1) { NRT_MemInfo_call_dtor(mi); }
+}
+/*
+  Adds one reference to `mi`; a NULL `mi` is ignored. The increment is a
+  single atomic operation on the device-scope reference counter.
+*/
+extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)
+{
+  if (mi != NULL) {
+    mi->refct++;
+  }
+}
+/* Close the include guard only after the last definition so that a second
+   inclusion cannot redefine NRT_incref. */
+#endif /* _NRT_H */

numba_cuda/numba/cuda/simulator/api.py CHANGED Viewed

@@ -35,6 +35,20 @@ class stream(object):
         pass
+# Default stream APIs. Since execution from the perspective of the host is
+# synchronous in the simulator, these can be the same as the stream class.
+default_stream = stream
+legacy_default_stream = stream
+per_thread_default_stream = stream
+# There is no way to use external streams with the simulator. Since the
+# implementation is not really using streams, we can't meaningfully interact
+# with external ones.
+def external_stream(ptr):
+    """Simulator placeholder for wrapping an external stream; always raises.
+
+    :param ptr: pointer value of the external stream (unused here).
+    :raises RuntimeError: unconditionally, because the simulator does not
+        really use streams and so cannot interact with external ones.
+    """
+    raise RuntimeError("External streams are unsupported in the simulator")
 def synchronize():
     pass

numba_cuda/numba/cuda/target.py CHANGED Viewed

@@ -74,6 +74,10 @@ class CUDATargetContext(BaseContext):
             datamodel.default_manager
         )
+    @property
+    def enable_nrt(self):
+        """Whether NRT is enabled for this target context.
+
+        Reads ``config.CUDA_ENABLE_NRT`` on every access and falls back to
+        ``False`` when that attribute is not defined on ``config``.
+        """
+        return getattr(config, 'CUDA_ENABLE_NRT', False)
     @property
     def DIBuilder(self):
         return debuginfo.DIBuilder

numba_cuda/numba/cuda/tests/cudapy/test_debug.py CHANGED Viewed

@@ -48,13 +48,11 @@ class TestDebugOutput(CUDATestCase):
                 self.assertRaises(AssertionError, check_meth, out)
     def _check_dump_bytecode(self, out):
-        if PYVERSION in ((3, 11), (3, 12)):
+        if PYVERSION > (3, 10):
             # binop with arg=0 is binary add, see CPython dis.py and opcode.py
             self.assertIn('BINARY_OP(arg=0', out)
-        elif PYVERSION in ((3, 9), (3, 10)):
-            self.assertIn('BINARY_ADD', out)
         else:
-            raise NotImplementedError(PYVERSION)
+            self.assertIn('BINARY_ADD', out)
     def _check_dump_cfg(self, out):
         self.assertIn('CFG dominators', out)

numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py CHANGED Viewed

@@ -72,6 +72,7 @@ class TestCudaDebugInfo(CUDATestCase):
         def f(x):
             x[0] = 0
+    @unittest.skip("Wrappers no longer exist")
     def test_wrapper_has_debuginfo(self):
         sig = (types.int32[::1],)

numba_cuda/numba/cuda/tests/cudapy/test_inspect.py CHANGED Viewed

@@ -33,10 +33,7 @@ class TestInspect(CUDATestCase):
         self.assertIn("foo", llvm)
         # Kernel in LLVM
-        self.assertIn('cuda.kernel.wrapper', llvm)
-        # Wrapped device function body in LLVM
-        self.assertIn("define linkonce_odr i32", llvm)
+        self.assertIn("define void @", llvm)
         asm = foo.inspect_asm(sig)
@@ -72,12 +69,8 @@ class TestInspect(CUDATestCase):
         self.assertIn("foo", llvmirs[float64, float64])
         # Kernels in LLVM
-        self.assertIn('cuda.kernel.wrapper', llvmirs[intp, intp])
-        self.assertIn('cuda.kernel.wrapper', llvmirs[float64, float64])
-        # Wrapped device function bodies in LLVM
-        self.assertIn("define linkonce_odr i32", llvmirs[intp, intp])
-        self.assertIn("define linkonce_odr i32", llvmirs[float64, float64])
+        self.assertIn("define void @", llvmirs[intp, intp])
+        self.assertIn("define void @", llvmirs[float64, float64])
         asmdict = foo.inspect_asm()

numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py CHANGED Viewed

@@ -170,10 +170,9 @@ class TestCudaLineInfo(CUDATestCase):
                 subprograms += 1
         # One DISubprogram for each of:
-        # - The kernel wrapper
         # - The caller
         # - The callee
-        expected_subprograms = 3
+        expected_subprograms = 2
         self.assertEqual(subprograms, expected_subprograms,
                          f'"Expected {expected_subprograms} DISubprograms; '

numba_cuda/numba/cuda/tests/cudapy/test_optimization.py CHANGED Viewed

@@ -14,8 +14,11 @@ def device_func(x, y, z):
 # Fragments of code that are removed from kernel_func's PTX when optimization
-# is on
-removed_by_opt = ( '__local_depot0', 'call.uni', 'st.param.b64')
+# is on. Previously this list was longer when kernel wrappers were used - if
+# the test function were more complex it may be possible to isolate additional
+# fragments of PTX we could check for the absence / presence of, but removal of
+# the use of local memory is a good indicator that optimization was applied.
+removed_by_opt = ( '__local_depot0',)
 @skip_on_cudasim('Simulator does not optimize code')

numba_cuda/numba/cuda/tests/cudapy/test_print.py CHANGED Viewed

@@ -126,8 +126,8 @@ class TestPrint(CUDATestCase):
     def test_bool(self):
         output, _ = self.run_code(printbool_usecase)
-        expected = "True\nFalse\nTrue\nTrue\nFalse\nFalse"
-        self.assertEqual(output.strip(), expected)
+        expected = "True\r?\nFalse\r?\nTrue\r?\nTrue\r?\nFalse\r?\nFalse"
+        self.assertRegex(output.strip(), expected)
     def test_printempty(self):
         output, _ = self.run_code(printempty_usecase)

numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py ADDED Viewed

@@ -0,0 +1,52 @@
+from numba.cuda.testing import (skip_on_cudasim, skip_unless_cudasim, unittest,
+                                CUDATestCase)
+from numba import config, cuda
+# Basic tests that stream APIs execute on the hardware and in the simulator.
+#
+# Correctness of semantics is exercised elsewhere in the test suite (though we
+# could improve the comprehensiveness of testing by adding more correctness
+# tests here in future).
+class TestStreamAPI(CUDATestCase):
+    def test_stream_create_and_sync(self):
+        s = cuda.stream()
+        s.synchronize()
+    def test_default_stream_create_and_sync(self):
+        s = cuda.default_stream()
+        s.synchronize()
+    def test_legacy_default_stream_create_and_sync(self):
+        s = cuda.legacy_default_stream()
+        s.synchronize()
+    def test_ptd_stream_create_and_sync(self):
+        s = cuda.per_thread_default_stream()
+        s.synchronize()
+    @skip_on_cudasim("External streams are unsupported on the simulator")
+    def test_external_stream_create(self):
+        #  A dummy pointer value
+        ptr = 0x12345678
+        s = cuda.external_stream(ptr)
+        # We don't test synchronization on the stream because it's not a real
+        # stream - we used a dummy pointer for testing the API, so we just
+        # ensure that the stream handle matches the external stream pointer.
+        if config.CUDA_USE_NVIDIA_BINDING:
+            value = int(s.handle)
+        else:
+            value = s.handle.value
+        self.assertEqual(ptr, value)
+    @skip_unless_cudasim("External streams are usable with hardware")
+    def test_external_stream_simulator_unavailable(self):
+        ptr = 0x12345678
+        msg = "External streams are unsupported in the simulator"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            cuda.external_stream(ptr)
+if __name__ == '__main__':
+    unittest.main()

numba_cuda/numba/cuda/tests/nrt/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+from numba.cuda.testing import ensure_supported_ccs_initialized
+from numba.cuda.tests import load_testsuite
+import os
+def load_tests(loader, tests, pattern):
+    ensure_supported_ccs_initialized()
+    return load_testsuite(loader, os.path.dirname(__file__))

numba_cuda/numba/cuda/tests/nrt/mock_numpy.py ADDED Viewed

@@ -0,0 +1,42 @@
+from numba.core import errors, types
+from numba.core.extending import overload
+from numba.np.arrayobj import (_check_const_str_dtype, is_nonelike,
+                               ty_parse_dtype, ty_parse_shape, numpy_empty_nd)
+# Typical tests for allocation use array construction (e.g. np.zeros, np.empty,
+# etc.) to induce allocations. These don't work in the CUDA target because they
+# need keyword arguments, which are presently not supported properly in the
+# CUDA target.
+#
+# To work around this, we can define our own function, that works like
+# the desired one, except that it uses only positional arguments.
+#
+# Once the CUDA target supports keyword arguments, this workaround will no
+# longer be necessary and the tests in this module should be switched to use
+# the relevant NumPy functions instead.
+def cuda_empty(shape, dtype):
+    """Positional-argument-only stand-in for an array constructor.
+
+    The Python body intentionally does nothing; the behaviour used inside
+    jitted code is supplied by the ``@overload`` registered for this function.
+    """
+    pass
+@overload(cuda_empty)
+def ol_cuda_empty(shape, dtype):
+    _check_const_str_dtype("empty", dtype)
+    if (dtype is float or
+        (isinstance(dtype, types.Function) and dtype.typing_key is float) or
+            is_nonelike(dtype)): #default
+        nb_dtype = types.double
+    else:
+        nb_dtype = ty_parse_dtype(dtype)
+    ndim = ty_parse_shape(shape)
+    if nb_dtype is not None and ndim is not None:
+        retty = types.Array(dtype=nb_dtype, ndim=ndim, layout='C')
+        def impl(shape, dtype):
+            return numpy_empty_nd(shape, dtype, retty)
+        return impl
+    else:
+        msg = f"Cannot parse input types to function np.empty({shape}, {dtype})"
+        raise errors.TypingError(msg)

numba_cuda/numba/cuda/tests/nrt/test_nrt.py ADDED Viewed

@@ -0,0 +1,110 @@
+import re
+import gc
+import numpy as np
+import unittest
+from unittest.mock import patch
+from numba.core.runtime import rtsys
+from numba.tests.support import EnableNRTStatsMixin
+from numba.cuda.testing import CUDATestCase
+from .mock_numpy import cuda_empty
+from numba import cuda
+class TestNrtRefCt(EnableNRTStatsMixin, CUDATestCase):
+    def setUp(self):
+        # Clean up any NRT-backed objects hanging in a dead reference cycle
+        gc.collect()
+        super(TestNrtRefCt, self).setUp()
+    @unittest.expectedFailure
+    def test_no_return(self):
+        """
+        Test issue #1291
+        """
+        n = 10
+        @cuda.jit
+        def kernel():
+            for i in range(n):
+                temp = cuda_empty(2, np.float64) # noqa: F841
+            return None
+        init_stats = rtsys.get_allocation_stats()
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            kernel[1,1]()
+        cur_stats = rtsys.get_allocation_stats()
+        self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
+        self.assertEqual(cur_stats.free - init_stats.free, n)
+class TestNrtBasic(CUDATestCase):
+    def test_nrt_launches(self):
+        @cuda.jit
+        def f(x):
+            return x[:5]
+        @cuda.jit
+        def g():
+            x = cuda_empty(10, np.int64)
+            f(x)
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            g[1,1]()
+        cuda.synchronize()
+    def test_nrt_ptx_contains_refcount(self):
+        @cuda.jit
+        def f(x):
+            return x[:5]
+        @cuda.jit
+        def g():
+            x = cuda_empty(10, np.int64)
+            f(x)
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            g[1,1]()
+        ptx = next(iter(g.inspect_asm().values()))
+        # The following checks that a `call` PTX instruction is
+        # emitted for NRT_MemInfo_alloc_aligned, NRT_incref and
+        # NRT_decref
+        p1 = r"call\.uni(.|\n)*NRT_MemInfo_alloc_aligned"
+        match = re.search(p1, ptx)
+        assert match is not None
+        p2 = r"call\.uni.*\n.*NRT_incref"
+        match = re.search(p2, ptx)
+        assert match is not None
+        p3 = r"call\.uni.*\n.*NRT_decref"
+        match = re.search(p3, ptx)
+        assert match is not None
+    def test_nrt_returns_correct(self):
+        @cuda.jit
+        def f(x):
+            return x[5:]
+        @cuda.jit
+        def g(out_ary):
+            x = cuda_empty(10, np.int64)
+            x[5] = 1
+            y = f(x)
+            out_ary[0] = y[0]
+        out_ary = np.zeros(1, dtype=np.int64)
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            g[1,1](out_ary)
+        self.assertEqual(out_ary[0], 1)
+if __name__ == '__main__':
+    unittest.main()

numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py CHANGED Viewed

@@ -2,6 +2,7 @@
 import argparse
 import pathlib
+import platform
 import subprocess
 import sys
@@ -56,7 +57,13 @@ def determine_include_flags():
         print(f"Unexpected return code ({rc}) from `nvcc -v`. Expected 1.")
         return None
-    output = cp.stderr.decode()
+    # NVCC writes to stdout on Windows and stderr on Linux
+    if platform.system() == 'Windows':
+        stream = cp.stdout
+    else:
+        stream = cp.stderr
+    output = stream.decode()
     lines = output.splitlines()
     includes_lines = [line for line in lines if line.startswith("#$ INCLUDES=")]

{numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: numba-cuda
-Version: 0.0.18
+Version: 0.0.20
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause
@@ -13,17 +13,21 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numba>=0.59.1
+<div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
 # Numba CUDA Target
-An out-of-tree CUDA target for Numba.
+The CUDA target for Numba. Please visit the [official
+documentation](https://nvidia.github.io/numba-cuda) to get started!
-This contains an entire copy of Numba's CUDA target (the `numba.cuda` module),
-and a mechanism to ensure the code from this module (`numba_cuda.numba.cuda`) is
-used as the `numba.cuda` module instead of the code from the `numba` package.
+To report issues or file feature requests, please use the [issue
+tracker](https://github.com/NVIDIA/numba-cuda/issues).
-This is presently in an early state and is published for testing and feedback.
+To raise questions or initiate discussions, please use the [Numba Discourse
+forum](https://numba.discourse.group).
-## Building / testing
+## Building from source
 Install as an editable install:
@@ -31,7 +35,7 @@ Install as an editable install:
 pip install -e .
 ```
-Running tests:
+## Running tests
 ```
 python -m numba.runtests numba.cuda.tests

numba-cuda 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl

numba-cuda 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl