numba-cuda 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (30)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/compiler.py +7 -6
  3. numba_cuda/numba/cuda/cudadecl.py +6 -2
  4. numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
  5. numba_cuda/numba/cuda/cudadrv/driver.py +1 -20
  6. numba_cuda/numba/cuda/cudadrv/linkable_code.py +13 -9
  7. numba_cuda/numba/cuda/cudadrv/nvrtc.py +5 -1
  8. numba_cuda/numba/cuda/cudadrv/nvvm.py +6 -1
  9. numba_cuda/numba/cuda/decorators.py +9 -2
  10. numba_cuda/numba/cuda/dispatcher.py +22 -3
  11. numba_cuda/numba/cuda/runtime/__init__.py +1 -0
  12. numba_cuda/numba/cuda/runtime/memsys.cu +94 -0
  13. numba_cuda/numba/cuda/runtime/memsys.cuh +17 -0
  14. numba_cuda/numba/cuda/runtime/nrt.cu +19 -22
  15. numba_cuda/numba/cuda/runtime/nrt.py +318 -0
  16. numba_cuda/numba/cuda/testing.py +11 -1
  17. numba_cuda/numba/cuda/tests/__init__.py +1 -0
  18. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +31 -0
  19. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +145 -11
  20. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +10 -7
  21. numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +105 -1
  22. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +162 -40
  23. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +114 -0
  24. numba_cuda/numba/cuda/tests/support.py +11 -0
  25. numba_cuda/numba/cuda/utils.py +22 -0
  26. {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/METADATA +21 -3
  27. {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/RECORD +30 -23
  28. {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/WHEEL +1 -1
  29. {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/LICENSE +0 -0
  30. {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/nrt/mock_numpy.py:

@@ -1,8 +1,12 @@
+import math
+
+import numpy as np
 
 from numba.core import errors, types
 from numba.core.extending import overload
 from numba.np.arrayobj import (_check_const_str_dtype, is_nonelike,
-                               ty_parse_dtype, ty_parse_shape, numpy_empty_nd)
+                               ty_parse_dtype, ty_parse_shape, numpy_empty_nd,
+                               numpy_empty_like_nd)
 
 
 # Typical tests for allocation use array construction (e.g. np.zeros, np.empty,
@@ -20,6 +24,18 @@ def cuda_empty(shape, dtype):
     pass
 
 
+def cuda_empty_like(arr):
+    pass
+
+
+def cuda_arange(start):
+    pass
+
+
+def cuda_ones(shape):
+    pass
+
+
 @overload(cuda_empty)
 def ol_cuda_empty(shape, dtype):
     _check_const_str_dtype("empty", dtype)
@@ -40,3 +56,91 @@ def ol_cuda_empty(shape, dtype):
     else:
         msg = f"Cannot parse input types to function np.empty({shape}, {dtype})"
         raise errors.TypingError(msg)
+
+
+@overload(cuda_empty_like)
+def ol_cuda_empty_like(arr):
+
+    if isinstance(arr, types.Array):
+        nb_dtype = arr.dtype
+    else:
+        nb_dtype = arr
+
+    if isinstance(arr, types.Array):
+        layout = arr.layout if arr.layout != 'A' else 'C'
+        retty = arr.copy(dtype=nb_dtype, layout=layout, readonly=False)
+    else:
+        retty = types.Array(nb_dtype, 0, 'C')
+
+    def impl(arr):
+        dtype = None
+        return numpy_empty_like_nd(arr, dtype, retty)
+    return impl
+
+
+def _arange_dtype(*args):
+    bounds = [a for a in args if not isinstance(a, types.NoneType)]
+
+    if any(isinstance(a, types.Complex) for a in bounds):
+        dtype = types.complex128
+    elif any(isinstance(a, types.Float) for a in bounds):
+        dtype = types.float64
+    else:
+        # `np.arange(10).dtype` is always `np.dtype(int)`, aka `np.int_`, which
+        # in all released versions of numpy corresponds to the C `long` type.
+        # Windows 64 is broken by default here because Numba (as of 0.47) does
+        # not differentiate between Python and NumPy integers, so a `typeof(1)`
+        # on w64 is `int64`, i.e. `intp`. This means an arange(<some int>) will
+        # be typed as arange(int64) and the following will yield int64 opposed
+        # to int32. Example: without a load of analysis to work out if the args
+        # were wrapped in NumPy int*() calls it's not possible to detect the
+        # difference between `np.arange(10)` and `np.arange(np.int64(10))`.
+        NPY_TY = getattr(types, "int%s" % (8 * np.dtype(int).itemsize))
+
+        # unliteral these types such that `max` works.
+        unliteral_bounds = [types.unliteral(x) for x in bounds]
+        dtype = max(unliteral_bounds + [NPY_TY,])
+
+    return dtype
+
+
+@overload(cuda_arange)
+def ol_cuda_arange(start):
+    """Simplified arange with just 1 argument."""
+    if (not isinstance(start, types.Number)):
+        return
+
+    start_value = getattr(start, "literal_value", None)
+
+    def impl(start):
+        # Allow for improved performance if given literal arguments.
+        lit_start = start_value if start_value is not None else start
+
+        _step = 1
+        _start, _stop = 0, lit_start
+
+        nitems_c = (_stop - _start) / _step
+        nitems_r = int(math.ceil(nitems_c.real))
+
+        # Binary operator needed for compiler branch pruning.
+        nitems = max(nitems_r, 0)
+
+        arr = cuda_empty(nitems, np.int64)
+        val = _start
+        for i in range(nitems):
+            arr[i] = val + (i * _step)
+        return arr
+
+    return impl
+
+
+@overload(cuda_ones)
+def ol_cuda_ones(shape):
+
+    def impl(shape):
+        arr = cuda_empty(shape, np.float64)
+        arr_flat = arr.flat
+        for idx in range(len(arr_flat)):
+            arr_flat[idx] = 1
+        return arr
+    return impl
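Taken together, these overloads give test kernels NumPy-like allocators that route through the CUDA NRT. A minimal sketch of how a kernel might exercise them (hypothetical usage, assuming a CUDA device and NRT enabled, e.g. via `NUMBA_CUDA_ENABLE_NRT=1`):

```python
import numpy as np
from numba import cuda
from numba.cuda.tests.nrt.mock_numpy import cuda_arange, cuda_ones

@cuda.jit
def demo(out):
    ones = cuda_ones(4)       # device-side float64 allocation
    ramp = cuda_arange(8)     # device-side int64 allocation
    # Both arrays are reference-counted by the NRT and freed when dead.
    out[0] = ones[0] + ramp[3]

out = np.zeros(1, dtype=np.float64)
demo[1, 1](out)
cuda.synchronize()
```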
numba_cuda/numba/cuda/tests/nrt/test_nrt.py:

@@ -1,47 +1,22 @@
 import re
-import gc
+import os
+
 import numpy as np
 import unittest
-from unittest.mock import patch
-from numba.core.runtime import rtsys
-from numba.tests.support import EnableNRTStatsMixin
 from numba.cuda.testing import CUDATestCase
 
-from .mock_numpy import cuda_empty
+from numba.cuda.tests.nrt.mock_numpy import cuda_empty, cuda_ones, cuda_arange
+from numba.tests.support import run_in_subprocess, override_config
 
 from numba import cuda
-
-
-class TestNrtRefCt(EnableNRTStatsMixin, CUDATestCase):
-
-    def setUp(self):
-        # Clean up any NRT-backed objects hanging in a dead reference cycle
-        gc.collect()
-        super(TestNrtRefCt, self).setUp()
-
-    @unittest.expectedFailure
-    def test_no_return(self):
-        """
-        Test issue #1291
-        """
-        n = 10
-
-        @cuda.jit
-        def kernel():
-            for i in range(n):
-                temp = cuda_empty(2, np.float64)  # noqa: F841
-            return None
-
-        init_stats = rtsys.get_allocation_stats()
-
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            kernel[1,1]()
-        cur_stats = rtsys.get_allocation_stats()
-        self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
-        self.assertEqual(cur_stats.free - init_stats.free, n)
+from numba.cuda.runtime.nrt import rtsys
 
 
 class TestNrtBasic(CUDATestCase):
+    def run(self, result=None):
+        with override_config("CUDA_ENABLE_NRT", True):
+            super(TestNrtBasic, self).run(result)
+
     def test_nrt_launches(self):
         @cuda.jit
         def f(x):
@@ -52,8 +27,7 @@ class TestNrtBasic(CUDATestCase):
             x = cuda_empty(10, np.int64)
             f(x)
 
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1,1]()
+        g[1,1]()
         cuda.synchronize()
 
     def test_nrt_ptx_contains_refcount(self):
@@ -66,8 +40,7 @@ class TestNrtBasic(CUDATestCase):
             x = cuda_empty(10, np.int64)
             f(x)
 
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1,1]()
+        g[1,1]()
 
         ptx = next(iter(g.inspect_asm().values()))
 
@@ -100,11 +73,160 @@
 
         out_ary = np.zeros(1, dtype=np.int64)
 
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1,1](out_ary)
+        g[1,1](out_ary)
 
         self.assertEqual(out_ary[0], 1)
 
 
+class TestNrtStatistics(CUDATestCase):
+
+    def setUp(self):
+        self._stream = cuda.default_stream()
+        # Store the current stats state
+        self.__stats_state = rtsys.memsys_stats_enabled(self._stream)
+
+    def tearDown(self):
+        # Set stats state back to whatever it was before the test ran
+        if self.__stats_state:
+            rtsys.memsys_enable_stats(self._stream)
+        else:
+            rtsys.memsys_disable_stats(self._stream)
+
+    def test_stats_env_var_explicit_on(self):
+        # Checks that explicitly turning the stats on via the env var works.
+        src = """if 1:
+        from numba import cuda
+        from numba.cuda.runtime import rtsys
+        from numba.cuda.tests.nrt.mock_numpy import cuda_arange
+
+        @cuda.jit
+        def foo():
+            x = cuda_arange(10)[0]
+
+        # initialize the NRT before use
+        rtsys.initialize()
+        assert rtsys.memsys_stats_enabled(), "Stats not enabled"
+        orig_stats = rtsys.get_allocation_stats()
+        foo[1, 1]()
+        new_stats = rtsys.get_allocation_stats()
+        total_alloc = new_stats.alloc - orig_stats.alloc
+        total_free = new_stats.free - orig_stats.free
+        total_mi_alloc = new_stats.mi_alloc - orig_stats.mi_alloc
+        total_mi_free = new_stats.mi_free - orig_stats.mi_free
+
+        expected = 1
+        assert total_alloc == expected, \\
+            f"total_alloc != expected, {total_alloc} != {expected}"
+        assert total_free == expected, \\
+            f"total_free != expected, {total_free} != {expected}"
+        assert total_mi_alloc == expected, \\
+            f"total_mi_alloc != expected, {total_mi_alloc} != {expected}"
+        assert total_mi_free == expected, \\
+            f"total_mi_free != expected, {total_mi_free} != {expected}"
+        """
+
+        # Check env var explicitly being set works
+        env = os.environ.copy()
+        env['NUMBA_CUDA_NRT_STATS'] = "1"
+        env['NUMBA_CUDA_ENABLE_NRT'] = "1"
+        run_in_subprocess(src, env=env)
+
+    def check_env_var_off(self, env):
+
+        src = """if 1:
+        from numba import cuda
+        import numpy as np
+        from numba.cuda.runtime import rtsys
+
+        @cuda.jit
+        def foo():
+            arr = np.arange(10)[0]
+
+        assert rtsys.memsys_stats_enabled() == False
+        try:
+            rtsys.get_allocation_stats()
+        except RuntimeError as e:
+            assert "NRT stats are disabled." in str(e)
+        """
+        run_in_subprocess(src, env=env)
+
+    def test_stats_env_var_explicit_off(self):
+        # Checks that explicitly turning the stats off via the env var works.
+        env = os.environ.copy()
+        env['NUMBA_CUDA_NRT_STATS'] = "0"
+        self.check_env_var_off(env)
+
+    def test_stats_env_var_default_off(self):
+        # Checks that the env var not being set is the same as "off", i.e.
+        # default for Numba is off.
+        env = os.environ.copy()
+        env.pop('NUMBA_CUDA_NRT_STATS', None)
+        self.check_env_var_off(env)
+
+    def test_stats_status_toggle(self):
+
+        @cuda.jit
+        def foo():
+            tmp = cuda_ones(3)
+            arr = cuda_arange(5 * tmp[0])  # noqa: F841
+            return None
+
+        with override_config('CUDA_ENABLE_NRT', True):
+            # Switch on stats
+            rtsys.memsys_enable_stats()
+            # check the stats are on
+            self.assertTrue(rtsys.memsys_stats_enabled())
+
+            for i in range(2):
+                # capture the stats state
+                stats_1 = rtsys.get_allocation_stats()
+                # Switch off stats
+                rtsys.memsys_disable_stats()
+                # check the stats are off
+                self.assertFalse(rtsys.memsys_stats_enabled())
+                # run something that would move the counters were they enabled
+                foo[1, 1]()
+                # Switch on stats
+                rtsys.memsys_enable_stats()
+                # check the stats are on
+                self.assertTrue(rtsys.memsys_stats_enabled())
+                # capture the stats state (should not have changed)
+                stats_2 = rtsys.get_allocation_stats()
+                # run something that will move the counters
+                foo[1, 1]()
+                # capture the stats state (should have changed)
+                stats_3 = rtsys.get_allocation_stats()
+                # check stats_1 == stats_2
+                self.assertEqual(stats_1, stats_2)
+                # check stats_2 < stats_3
+                self.assertLess(stats_2, stats_3)
+
+    def test_rtsys_stats_query_raises_exception_when_disabled(self):
+        # Checks that the standard rtsys.get_allocation_stats() query raises
+        # when stats counters are turned off.
+
+        rtsys.memsys_disable_stats()
+        self.assertFalse(rtsys.memsys_stats_enabled())
+
+        with self.assertRaises(RuntimeError) as raises:
+            rtsys.get_allocation_stats()
+
+        self.assertIn("NRT stats are disabled.", str(raises.exception))
+
+    def test_nrt_explicit_stats_query_raises_exception_when_disabled(self):
+        # Checks the various memsys_get_stats functions raise if queried when
+        # the stats counters are disabled.
+        method_variations = ('alloc', 'free', 'mi_alloc', 'mi_free')
+        for meth in method_variations:
+            stats_func = getattr(rtsys, f'memsys_get_stats_{meth}')
+            with self.subTest(stats_func=stats_func):
+                # Turn stats off
+                rtsys.memsys_disable_stats()
+                self.assertFalse(rtsys.memsys_stats_enabled())
+                with self.assertRaises(RuntimeError) as raises:
+                    stats_func()
+                self.assertIn("NRT stats are disabled.", str(raises.exception))
+
+
 if __name__ == '__main__':
     unittest.main()
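The stats-oriented tests above all follow the same pattern: turn NRT on for the process, enable the memsys counters, and compare snapshots of `rtsys.get_allocation_stats()` around a kernel launch. A condensed sketch of that workflow (assuming a CUDA device with NRT support):

```python
import numpy as np
from numba import cuda
from numba.cuda.runtime.nrt import rtsys
from numba.cuda.tests.nrt.mock_numpy import cuda_empty
from numba.tests.support import override_config

@cuda.jit
def alloc_once():
    tmp = cuda_empty(2, np.float64)  # noqa: F841

with override_config("CUDA_ENABLE_NRT", True):
    rtsys.memsys_enable_stats()            # counters are off by default
    before = rtsys.get_allocation_stats()
    alloc_once[1, 1]()
    after = rtsys.get_allocation_stats()
    # Every allocation made in the kernel should have been freed again.
    assert after.alloc - before.alloc == after.free - before.free
```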
numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py (new file):

@@ -0,0 +1,114 @@
+import numpy as np
+import unittest
+from numba.tests.support import override_config
+from numba.cuda.runtime import rtsys
+from numba.cuda.tests.support import EnableNRTStatsMixin
+from numba.cuda.testing import CUDATestCase
+from numba.cuda.tests.nrt.mock_numpy import cuda_empty, cuda_empty_like
+
+from numba import cuda
+
+
+class TestNrtRefCt(EnableNRTStatsMixin, CUDATestCase):
+
+    def setUp(self):
+        super(TestNrtRefCt, self).setUp()
+
+    def tearDown(self):
+        super(TestNrtRefCt, self).tearDown()
+
+    def run(self, result=None):
+        with override_config("CUDA_ENABLE_NRT", True):
+            super(TestNrtRefCt, self).run(result)
+
+    def test_no_return(self):
+        """
+        Test issue #1291
+        """
+
+        n = 10
+
+        @cuda.jit
+        def kernel():
+            for i in range(n):
+                temp = cuda_empty(2, np.float64)  # noqa: F841
+            return None
+
+        init_stats = rtsys.get_allocation_stats()
+        kernel[1, 1]()
+        cur_stats = rtsys.get_allocation_stats()
+        self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
+        self.assertEqual(cur_stats.free - init_stats.free, n)
+
+    def test_escaping_var_init_in_loop(self):
+        """
+        Test issue #1297
+        """
+
+        @cuda.jit
+        def g(n):
+
+            x = cuda_empty((n, 2), np.float64)
+
+            for i in range(n):
+                y = x[i]
+
+            for i in range(n):
+                y = x[i]  # noqa: F841
+
+            return None
+
+        init_stats = rtsys.get_allocation_stats()
+        g[1, 1](10)
+        cur_stats = rtsys.get_allocation_stats()
+        self.assertEqual(cur_stats.alloc - init_stats.alloc, 1)
+        self.assertEqual(cur_stats.free - init_stats.free, 1)
+
+    def test_invalid_computation_of_lifetime(self):
+        """
+        Test issue #1573
+        """
+        @cuda.jit
+        def if_with_allocation_and_initialization(arr1, test1):
+            tmp_arr = cuda_empty_like(arr1)
+
+            for i in range(tmp_arr.shape[0]):
+                pass
+
+            if test1:
+                cuda_empty_like(arr1)
+
+        arr = np.random.random((5, 5))  # the values are not consumed
+
+        init_stats = rtsys.get_allocation_stats()
+        if_with_allocation_and_initialization[1, 1](arr, False)
+        cur_stats = rtsys.get_allocation_stats()
+        self.assertEqual(cur_stats.alloc - init_stats.alloc,
+                         cur_stats.free - init_stats.free)
+
+    def test_del_at_beginning_of_loop(self):
+        """
+        Test issue #1734
+        """
+        @cuda.jit
+        def f(arr):
+            res = 0
+
+            for i in (0, 1):
+                # `del t` is issued here before defining t. It must be
+                # correctly handled by the lowering phase.
+                t = arr[i]
+                if t[i] > 1:
+                    res += t[i]
+
+        arr = np.ones((2, 2))
+
+        init_stats = rtsys.get_allocation_stats()
+        f[1, 1](arr)
+        cur_stats = rtsys.get_allocation_stats()
+        self.assertEqual(cur_stats.alloc - init_stats.alloc,
+                         cur_stats.free - init_stats.free)
+
+
+if __name__ == '__main__':
+    unittest.main()
numba_cuda/numba/cuda/tests/support.py (new file):

@@ -0,0 +1,11 @@
+from numba.cuda.runtime.nrt import rtsys
+
+
+class EnableNRTStatsMixin(object):
+    """Mixin to enable the NRT statistics counters."""
+
+    def setUp(self):
+        rtsys.memsys_enable_stats()
+
+    def tearDown(self):
+        rtsys.memsys_disable_stats()
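As `TestNrtRefCt` above shows, a test case opts in by listing the mixin before `CUDATestCase`, so that `super().setUp()` resolves to the mixin via the MRO. A minimal sketch:

```python
from numba.cuda.testing import CUDATestCase
from numba.cuda.tests.support import EnableNRTStatsMixin

class MyStatsTest(EnableNRTStatsMixin, CUDATestCase):
    def setUp(self):
        # Resolves to EnableNRTStatsMixin.setUp(), enabling the counters.
        super().setUp()
```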
numba_cuda/numba/cuda/utils.py (new file):

@@ -0,0 +1,22 @@
+import os
+import warnings
+import traceback
+
+
+def _readenv(name, ctor, default):
+    value = os.environ.get(name)
+    if value is None:
+        return default() if callable(default) else default
+    try:
+        if ctor is bool:
+            return value.lower() in {'1', "true"}
+        return ctor(value)
+    except Exception:
+        warnings.warn(
+            f"Environment variable '{name}' is defined but its associated "
+            f"value '{value}' could not be parsed.\n"
+            "The parse failed with exception:\n"
+            f"{traceback.format_exc()}",
+            RuntimeWarning
+        )
+        return default
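`_readenv` is a small helper for parsing typed environment variables, falling back to a default (and warning) when the value cannot be parsed. A hypothetical use, mirroring the NRT variables exercised in the tests above (the actual call sites and flag names inside numba-cuda may differ):

```python
from numba.cuda.utils import _readenv

# Hypothetical module-level flags; the names mirror the env vars used in
# the tests above, not necessarily the real call sites in numba-cuda.
ENABLE_NRT = _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False)
NRT_STATS = _readenv("NUMBA_CUDA_NRT_STATS", bool, False)
```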
{numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/METADATA:

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: numba-cuda
-Version: 0.3.0
+Version: 0.5.0
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause
@@ -27,7 +27,19 @@ tracker](https://github.com/NVIDIA/numba-cuda/issues).
 To raise questions or initiate discussions, please use the [Numba Discourse
 forum](https://numba.discourse.group).
 
-## Building from source
+## Installation with pip
+
+```shell
+pip install numba-cuda
+```
+
+## Installation with Conda
+
+```shell
+conda install -c conda-forge numba-cuda
+```
+
+## Installation from source
 
 Install as an editable install:
 
@@ -53,3 +65,9 @@ which will show a path like:
 ```
 <path to numba-cuda repo>/numba_cuda/numba/cuda/__init__.py
 ```
+
+## Contributing Guide
+
+Review the
+[CONTRIBUTING.md](https://github.com/NVIDIA/numba-cuda/blob/main/CONTRIBUTING.md)
+file for information on how to contribute code and issues to the project.