PyPI - numba-cuda - Versions diffs - 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

numba-cuda 0.2.0py3-none-any.whl → 0.4.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

numba_cuda/VERSION +1 -1
numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
numba_cuda/numba/cuda/cudadrv/driver.py +1 -20
numba_cuda/numba/cuda/cudadrv/nvrtc.py +5 -1
numba_cuda/numba/cuda/dispatcher.py +41 -15
numba_cuda/numba/cuda/reshape_funcs.cu +151 -0
numba_cuda/numba/cuda/runtime/__init__.py +1 -0
numba_cuda/numba/cuda/runtime/memsys.cu +94 -0
numba_cuda/numba/cuda/runtime/memsys.cuh +17 -0
numba_cuda/numba/cuda/runtime/nrt.cu +19 -22
numba_cuda/numba/cuda/runtime/nrt.py +318 -0
numba_cuda/numba/cuda/tests/__init__.py +1 -0
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +31 -0
numba_cuda/numba/cuda/tests/cudapy/test_array.py +73 -0
numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +105 -1
numba_cuda/numba/cuda/tests/nrt/test_nrt.py +162 -40
numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +114 -0
numba_cuda/numba/cuda/tests/support.py +11 -0
numba_cuda/numba/cuda/utils.py +22 -0
{numba_cuda-0.2.0.dist-info → numba_cuda-0.4.0.dist-info}/METADATA +2 -2
{numba_cuda-0.2.0.dist-info → numba_cuda-0.4.0.dist-info}/RECORD +24 -16
{numba_cuda-0.2.0.dist-info → numba_cuda-0.4.0.dist-info}/WHEEL +1 -1
{numba_cuda-0.2.0.dist-info → numba_cuda-0.4.0.dist-info}/LICENSE +0 -0
{numba_cuda-0.2.0.dist-info → numba_cuda-0.4.0.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/runtime/nrt.py ADDED Viewed

@@ -0,0 +1,318 @@
+import ctypes
+import os
+from functools import wraps
+import numpy as np
+from numba import cuda, config
+from numba.core.runtime.nrt import _nrt_mstats
+from numba.cuda.cudadrv.driver import Linker, driver, launch_kernel
+from numba.cuda.cudadrv import devices
+from numba.cuda.api import get_current_device
+from numba.cuda.utils import _readenv
+# Check environment variable or config for NRT statistics enablement
+NRT_STATS = (
+    _readenv("NUMBA_CUDA_NRT_STATS", bool, False) or
+    getattr(config, "NUMBA_CUDA_NRT_STATS", False)
+)
+if not hasattr(config, "NUMBA_CUDA_NRT_STATS"):
+    config.CUDA_NRT_STATS = NRT_STATS
+# Check environment variable or config for NRT enablement
+ENABLE_NRT = (
+    _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or
+    getattr(config, "NUMBA_CUDA_ENABLE_NRT", False)
+)
+if not hasattr(config, "NUMBA_CUDA_ENABLE_NRT"):
+    config.CUDA_ENABLE_NRT = ENABLE_NRT
+# Protect method to ensure NRT memory allocation and initialization
+def _alloc_init_guard(method):
+    """
+    Ensure NRT memory allocation and initialization before running the method
+    """
+    @wraps(method)
+    def wrapper(self, *args, **kwargs):
+        self.ensure_allocated()
+        self.ensure_initialized()
+        return method(self, *args, **kwargs)
+    return wrapper
+class _Runtime:
+    """Singleton class for Numba CUDA runtime"""
+    _instance = None
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super(_Runtime, cls).__new__(cls, *args, **kwargs)
+        return cls._instance
+    def __init__(self):
+        """Initialize memsys module and variable"""
+        self._memsys_module = None
+        self._memsys = None
+        self._initialized = False
+    def _compile_memsys_module(self):
+        """
+        Compile memsys.cu and create a module from it in the current context
+        """
+        # Define the path for memsys.cu
+        memsys_mod = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            "memsys.cu"
+        )
+        cc = get_current_device().compute_capability
+        # Create a new linker instance and add the cu file
+        linker = Linker.new(cc=cc)
+        linker.add_cu_file(memsys_mod)
+        # Complete the linker and create a module from it
+        cubin = linker.complete()
+        ctx = devices.get_context()
+        module = ctx.create_module_image(cubin)
+        # Set the memsys module
+        self._memsys_module = module
+    def ensure_allocated(self, stream=None):
+        """
+        If memsys is not allocated, allocate it; otherwise, perform a no-op
+        """
+        if self._memsys is not None:
+            return
+        # Allocate the memsys
+        self.allocate(stream)
+    def allocate(self, stream=None):
+        """
+        Allocate memsys on global memory
+        """
+        from numba.cuda import device_array
+        # Check if memsys module is defined
+        if self._memsys_module is None:
+            # Compile the memsys module if not defined
+            self._compile_memsys_module()
+        # Allocate space for NRT_MemSys
+        ptr, nbytes = self._memsys_module.get_global_symbol("memsys_size")
+        memsys_size = ctypes.c_uint64()
+        driver.cuMemcpyDtoH(ctypes.addressof(memsys_size),
+                            ptr.device_ctypes_pointer, nbytes)
+        self._memsys = device_array(
+            (memsys_size.value,), dtype="i1", stream=stream)
+        self.set_memsys_to_module(self._memsys_module, stream=stream)
+    def _single_thread_launch(self, module, stream, name, params=()):
+        """
+        Launch the specified kernel with only 1 thread
+        """
+        if stream is None:
+            stream = cuda.default_stream()
+        func = module.get_function(name)
+        launch_kernel(
+            func.handle,
+            1, 1, 1,
+            1, 1, 1,
+            0,
+            stream.handle,
+            params,
+            cooperative=False
+        )
+    def ensure_initialized(self, stream=None):
+        """
+        If memsys is not initialized, initialize memsys
+        """
+        if self._initialized:
+            return
+        # Initialize the memsys
+        self.initialize(stream)
+    def initialize(self, stream=None):
+        """
+        Launch memsys initialization kernel
+        """
+        self.ensure_allocated()
+        self._single_thread_launch(
+            self._memsys_module, stream, "NRT_MemSys_init")
+        self._initialized = True
+        if config.CUDA_NRT_STATS:
+            self.memsys_enable_stats()
+    @_alloc_init_guard
+    def memsys_enable_stats(self, stream=None):
+        """
+        Enable memsys statistics
+        """
+        self._single_thread_launch(
+            self._memsys_module, stream, "NRT_MemSys_enable_stats")
+    @_alloc_init_guard
+    def memsys_disable_stats(self, stream=None):
+        """
+        Disable memsys statistics
+        """
+        self._single_thread_launch(
+            self._memsys_module, stream, "NRT_MemSys_disable_stats")
+    @_alloc_init_guard
+    def memsys_stats_enabled(self, stream=None):
+        """
+        Return a boolean indicating whether memsys is enabled. Synchronizes
+        context
+        """
+        enabled_ar = cuda.managed_array(1, np.uint8)
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            "NRT_MemSys_stats_enabled",
+            (enabled_ar.device_ctypes_pointer,)
+        )
+        cuda.synchronize()
+        return bool(enabled_ar[0])
+    @_alloc_init_guard
+    def _copy_memsys_to_host(self, stream):
+        """
+        Copy all statistics of memsys to the host
+        """
+        dt = np.dtype([
+            ('alloc', np.uint64),
+            ('free', np.uint64),
+            ('mi_alloc', np.uint64),
+            ('mi_free', np.uint64)
+        ])
+        stats_for_read = cuda.managed_array(1, dt)
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            "NRT_MemSys_read",
+            [stats_for_read.device_ctypes_pointer]
+        )
+        cuda.synchronize()
+        return stats_for_read[0]
+    @_alloc_init_guard
+    def get_allocation_stats(self, stream=None):
+        """
+        Get the allocation statistics
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+        memsys = self._copy_memsys_to_host(stream)
+        return _nrt_mstats(
+            alloc=memsys["alloc"],
+            free=memsys["free"],
+            mi_alloc=memsys["mi_alloc"],
+            mi_free=memsys["mi_free"]
+        )
+    @_alloc_init_guard
+    def _get_single_stat(self, stat, stream=None):
+        """
+        Get a single stat from the memsys
+        """
+        got = cuda.managed_array(1, np.uint64)
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            f"NRT_MemSys_read_{stat}",
+            [got.device_ctypes_pointer]
+        )
+        cuda.synchronize()
+        return got[0]
+    @_alloc_init_guard
+    def memsys_get_stats_alloc(self, stream=None):
+        """
+        Get the allocation statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+        return self._get_single_stat("alloc")
+    @_alloc_init_guard
+    def memsys_get_stats_free(self, stream=None):
+        """
+        Get the free statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+        return self._get_single_stat("free")
+    @_alloc_init_guard
+    def memsys_get_stats_mi_alloc(self, stream=None):
+        """
+        Get the mi alloc statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+        return self._get_single_stat("mi_alloc")
+    @_alloc_init_guard
+    def memsys_get_stats_mi_free(self, stream=None):
+        """
+        Get the mi free statistic
+        """
+        enabled = self.memsys_stats_enabled(stream)
+        if not enabled:
+            raise RuntimeError("NRT stats are disabled.")
+        return self._get_single_stat("mi_free")
+    def set_memsys_to_module(self, module, stream=None):
+        """
+        Set the memsys module. The module must contain `NRT_MemSys_set` kernel,
+        and declare a pointer to NRT_MemSys structure.
+        """
+        if self._memsys is None:
+            raise RuntimeError(
+                "Please allocate NRT Memsys first before setting to module.")
+        self._single_thread_launch(
+            module,
+            stream,
+            "NRT_MemSys_set",
+            [self._memsys.device_ctypes_pointer,]
+        )
+    @_alloc_init_guard
+    def print_memsys(self, stream=None):
+        """
+        Print the current statistics of memsys, for debugging purposes
+        """
+        cuda.synchronize()
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            "NRT_MemSys_print"
+        )
+# Create an instance of the runtime. _Runtime.__new__ caches its instance, so
+# any later `_Runtime()` call returns this same object.
+rtsys = _Runtime()

numba_cuda/numba/cuda/tests/__init__.py CHANGED Viewed

@@ -49,6 +49,7 @@ def load_tests(loader, tests, pattern):
         if gpus and gpus[0].compute_capability >= (2, 0):
             suite.addTests(load_testsuite(loader, join(this_dir, 'cudadrv')))
             suite.addTests(load_testsuite(loader, join(this_dir, 'cudapy')))
+            suite.addTests(load_testsuite(loader, join(this_dir, 'nrt')))
             suite.addTests(load_testsuite(loader, join(this_dir,
                                                        'doc_examples')))
         else:

numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py CHANGED Viewed

@@ -4,6 +4,7 @@ from numba.cuda.cudadrv import devicearray
 from numba import cuda
 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda.testing import skip_on_cudasim
+from numba.tests.support import IS_NUMPY_2
 class TestCudaNDArray(CUDATestCase):
@@ -456,6 +457,36 @@ class TestCudaNDArray(CUDATestCase):
         dev_array_from_host.copy_to_device(dev_array)
+class TestArrayMethod(CUDATestCase):
+    """Tests of the __array__() method via np.array"""
+    def test_np_array(self):
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        host_array = np.array(dev_array)
+        np.testing.assert_equal(dev_array.copy_to_host(), host_array)
+    def test_np_array_dtype(self):
+        dtype = np.int32
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        host_array = np.array(dev_array, dtype=dtype)
+        np.testing.assert_equal(
+            host_array,
+            dev_array.copy_to_host().astype(dtype)
+        )
+    @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
+    def test_np_array_copy_false(self):
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        with self.assertRaisesRegex(ValueError, "`copy=False` is not"):
+            np.array(dev_array, copy=False)
+    @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
+    def test_np_array_copy_true(self):
+        dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
+        host_array = np.array(dev_array)
+        np.testing.assert_equal(dev_array.copy_to_host(), host_array)
 class TestRecarray(CUDATestCase):
     def test_recarray(self):
         # From issue #4111

numba_cuda/numba/cuda/tests/cudapy/test_array.py CHANGED Viewed

@@ -12,6 +12,31 @@ else:
                             cuda.pinned_array_like)
+def array_reshape1d(arr, newshape, got):
+    y = arr.reshape(newshape)
+    for i in range(y.shape[0]):
+        got[i] = y[i]
+def array_reshape2d(arr, newshape, got):
+    y = arr.reshape(newshape)
+    for i in range(y.shape[0]):
+        for j in range(y.shape[1]):
+            got[i, j] = y[i, j]
+def array_reshape3d(arr, newshape, got):
+    y = arr.reshape(newshape)
+    for i in range(y.shape[0]):
+        for j in range(y.shape[1]):
+            for k in range(y.shape[2]):
+                got[i, j, k] = y[i, j, k]
+def array_reshape(arr, newshape):
+    return arr.reshape(newshape)
 class TestCudaArray(CUDATestCase):
     def test_gpu_array_zero_length(self):
         x = np.arange(0)
@@ -255,6 +280,54 @@ class TestCudaArray(CUDATestCase):
         self.assertEqual(1, len(func.overloads))
+    def test_array_reshape(self):
+        def check(pyfunc, kernelfunc, arr, shape):
+            kernel = cuda.jit(kernelfunc)
+            expected = pyfunc(arr, shape)
+            got = np.zeros(expected.shape, dtype=arr.dtype)
+            kernel[1, 1](arr, shape, got)
+            self.assertPreciseEqual(got, expected)
+        def check_only_shape(kernelfunc, arr, shape, expected_shape):
+            kernel = cuda.jit(kernelfunc)
+            got = np.zeros(expected_shape, dtype=arr.dtype)
+            kernel[1, 1](arr, shape, got)
+            self.assertEqual(got.shape, expected_shape)
+            self.assertEqual(got.size, arr.size)
+        # 0-sized arrays
+        def check_empty(arr):
+            check(array_reshape, array_reshape1d, arr, 0)
+            check(array_reshape, array_reshape1d, arr, (0,))
+            check(array_reshape, array_reshape3d, arr, (1, 0, 2))
+            check_only_shape(array_reshape2d, arr, (0, -1), (0, 0))
+            check_only_shape(array_reshape2d, arr, (4, -1), (4, 0))
+            check_only_shape(array_reshape3d, arr, (-1, 0, 4), (0, 0, 4))
+        # C-contiguous
+        arr = np.arange(24)
+        check(array_reshape, array_reshape1d, arr, (24,))
+        check(array_reshape, array_reshape2d, arr, (4, 6))
+        check(array_reshape, array_reshape2d, arr, (8, 3))
+        check(array_reshape, array_reshape3d, arr, (8, 1, 3))
+        arr = np.arange(24).reshape((1, 8, 1, 1, 3, 1))
+        check(array_reshape, array_reshape1d, arr, (24,))
+        check(array_reshape, array_reshape2d, arr, (4, 6))
+        check(array_reshape, array_reshape2d, arr, (8, 3))
+        check(array_reshape, array_reshape3d, arr, (8, 1, 3))
+        # Test negative shape value
+        arr = np.arange(25).reshape(5,5)
+        check(array_reshape, array_reshape1d, arr, -1)
+        check(array_reshape, array_reshape1d, arr, (-1,))
+        check(array_reshape, array_reshape2d, arr, (-1, 5))
+        check(array_reshape, array_reshape3d, arr, (5, -1, 5))
+        check(array_reshape, array_reshape3d, arr, (5, 5, -1))
+        arr = np.array([])
+        check_empty(arr)
 if __name__ == '__main__':
     unittest.main()

numba_cuda/numba/cuda/tests/nrt/mock_numpy.py CHANGED Viewed

@@ -1,8 +1,12 @@
+import math
+import numpy as np
 from numba.core import errors, types
 from numba.core.extending import overload
 from numba.np.arrayobj import (_check_const_str_dtype, is_nonelike,
-                               ty_parse_dtype, ty_parse_shape, numpy_empty_nd)
+                               ty_parse_dtype, ty_parse_shape, numpy_empty_nd,
+                               numpy_empty_like_nd)
 # Typical tests for allocation use array construction (e.g. np.zeros, np.empty,
@@ -20,6 +24,18 @@ def cuda_empty(shape, dtype):
     pass
+def cuda_empty_like(arr):
+    pass
+def cuda_arange(start):
+    pass
+def cuda_ones(shape):
+    pass
 @overload(cuda_empty)
 def ol_cuda_empty(shape, dtype):
     _check_const_str_dtype("empty", dtype)
@@ -40,3 +56,91 @@ def ol_cuda_empty(shape, dtype):
     else:
         msg = f"Cannot parse input types to function np.empty({shape}, {dtype})"
         raise errors.TypingError(msg)
+@overload(cuda_empty_like)
+def ol_cuda_empty_like(arr):
+    if isinstance(arr, types.Array):
+        nb_dtype = arr.dtype
+    else:
+        nb_dtype = arr
+    if isinstance(arr, types.Array):
+        layout = arr.layout if arr.layout != 'A' else 'C'
+        retty = arr.copy(dtype=nb_dtype, layout=layout, readonly=False)
+    else:
+        retty = types.Array(nb_dtype, 0, 'C')
+    def impl(arr):
+        dtype = None
+        return numpy_empty_like_nd(arr, dtype, retty)
+    return impl
+def _arange_dtype(*args):
+    bounds = [a for a in args if not isinstance(a, types.NoneType)]
+    if any(isinstance(a, types.Complex) for a in bounds):
+        dtype = types.complex128
+    elif any(isinstance(a, types.Float) for a in bounds):
+        dtype = types.float64
+    else:
+        # `np.arange(10).dtype` is always `np.dtype(int)`, aka `np.int_`, which
+        # in all released versions of numpy corresponds to the C `long` type.
+        # Windows 64 is broken by default here because Numba (as of 0.47) does
+        # not differentiate between Python and NumPy integers, so a `typeof(1)`
+        # on w64 is `int64`, i.e. `intp`. This means an arange(<some int>) will
+        # be typed as arange(int64) and the following will yield int64 opposed
+        # to int32. Example: without a load of analysis to work out of the args
+        # were wrapped in NumPy int*() calls it's not possible to detect the
+        # difference between `np.arange(10)` and `np.arange(np.int64(10)`.
+        NPY_TY = getattr(types, "int%s" % (8 * np.dtype(int).itemsize))
+        # unliteral these types such that `max` works.
+        unliteral_bounds = [types.unliteral(x) for x in bounds]
+        dtype = max(unliteral_bounds + [NPY_TY,])
+    return dtype
+@overload(cuda_arange)
+def ol_cuda_arange(start):
+    """Simplified arange with just 1 argument.
+
+    Returns no implementation unless ``start`` is typed as a number. The
+    implementation fills an int64 array with 0, 1, ... up to (but excluding)
+    ``start``.
+    """
+    if (not isinstance(start, types.Number)):
+        return
+    # Literal types carry their value; None here means "not a literal".
+    start_value = getattr(start, "literal_value", None)
+    def impl(start):
+        # Allow for improved performance if given literal arguments.
+        lit_start = start_value if start_value is not None else start
+        # Fixed step of 1 and origin of 0; the single argument is the stop.
+        _step = 1
+        _start, _stop = 0, lit_start
+        nitems_c = (_stop - _start) / _step
+        nitems_r = int(math.ceil(nitems_c.real))
+        # Binary operator needed for compiler branch pruning.
+        # A negative stop is clamped to an empty array.
+        nitems = max(nitems_r, 0)
+        # The element type is always int64, whatever the type of `start`.
+        arr = cuda_empty(nitems, np.int64)
+        val = _start
+        for i in range(nitems):
+            arr[i] = val + (i * _step)
+        return arr
+    return impl
+@overload(cuda_ones)
+def ol_cuda_ones(shape):
+    """Overload for ``cuda_ones``: build a float64 array of ``shape`` via
+    ``cuda_empty`` and set every element to 1."""
+    def impl(shape):
+        arr = cuda_empty(shape, np.float64)
+        # Write through the flat view so every element is set for any shape.
+        arr_flat = arr.flat
+        for idx in range(len(arr_flat)):
+            arr_flat[idx] = 1
+        return arr
+    return impl

numba-cuda 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

numba-cuda 0.2.0py3-none-any.whl → 0.4.0py3-none-any.whl