PyPI - numba-cuda - Versions diffs - 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl - Mend

numba-cuda 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

numba_cuda/VERSION +1 -1
numba_cuda/numba/cuda/{cuda_bf16.py → _internal/cuda_bf16.py} +1 -1
numba_cuda/numba/cuda/api.py +13 -0
numba_cuda/numba/cuda/bf16.py +112 -0
numba_cuda/numba/cuda/cg.py +2 -0
numba_cuda/numba/cuda/codegen.py +77 -2
numba_cuda/numba/cuda/compiler.py +22 -16
numba_cuda/numba/cuda/cudadecl.py +21 -6
numba_cuda/numba/cuda/cudadrv/driver.py +107 -20
numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
numba_cuda/numba/cuda/cudadrv/nvrtc.py +23 -1
numba_cuda/numba/cuda/cudaimpl.py +103 -11
numba_cuda/numba/cuda/debuginfo.py +27 -0
numba_cuda/numba/cuda/decorators.py +7 -2
numba_cuda/numba/cuda/dispatcher.py +25 -65
numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
numba_cuda/numba/cuda/runtime/nrt.py +13 -1
numba_cuda/numba/cuda/stubs.py +23 -11
numba_cuda/numba/cuda/target.py +10 -1
numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +55 -0
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +49 -23
numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +34 -0
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +140 -0
numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
numba_cuda/numba/cuda/utils.py +7 -0
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/METADATA +1 -1
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/RECORD +45 -35
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/WHEEL +1 -1
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/licenses/LICENSE +0 -0
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/tests/cudapy/test_extending.py CHANGED Viewed

@@ -1,7 +1,10 @@
 from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
+from llvmlite import ir
 import numpy as np
+import os
 from numba import config, cuda, njit, types
+from numba.extending import overload
 class Interval:
@@ -160,5 +163,142 @@ class TestExtending(CUDATestCase):
         np.testing.assert_allclose(r, expected)
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+if TEST_BIN_DIR:
+    test_device_functions_a = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.a"
+    )
+    test_device_functions_cubin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cubin"
+    )
+    test_device_functions_cu = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cu"
+    )
+    test_device_functions_fatbin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.fatbin"
+    )
+    test_device_functions_fatbin_multi = os.path.join(
+        TEST_BIN_DIR, "test_device_functions_multi.fatbin"
+    )
+    test_device_functions_o = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.o"
+    )
+    test_device_functions_ptx = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ptx"
+    )
+    test_device_functions_ltoir = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ltoir"
+    )
+class TestExtendingLinkage(CUDATestCase):
+    def test_extension_adds_linkable_code(self):
+        cuda_major_version = cuda.runtime.get_version()[0]
+        if cuda_major_version < 12:
+            self.skipTest("CUDA 12 required for linking in-memory data")
+        files = (
+            (test_device_functions_a, cuda.Archive),
+            (test_device_functions_cubin, cuda.Cubin),
+            (test_device_functions_cu, cuda.CUSource),
+            (test_device_functions_fatbin, cuda.Fatbin),
+            (test_device_functions_o, cuda.Object),
+            (test_device_functions_ptx, cuda.PTXSource),
+            (test_device_functions_ltoir, cuda.LTOIR),
+        )
+        lto = config.CUDA_ENABLE_PYNVJITLINK
+        for path, ctor in files:
+            if ctor == cuda.LTOIR and not lto:
+                # Don't try to test with LTOIR if LTO is not enabled
+                continue
+            with open(path, "rb") as f:
+                code_object = ctor(f.read())
+            def external_add(x, y):
+                return x + y
+            @type_callable(external_add)
+            def type_external_add(context):
+                def typer(x, y):
+                    if x == types.uint32 and y == types.uint32:
+                        return types.uint32
+                return typer
+            @lower_builtin(external_add, types.uint32, types.uint32)
+            def lower_external_add(context, builder, sig, args):
+                context.active_code_library.add_linking_file(code_object)
+                i32 = ir.IntType(32)
+                fnty = ir.FunctionType(i32, [i32, i32])
+                fn = cgutils.get_or_insert_function(
+                    builder.module, fnty, "add_cabi"
+                )
+                return builder.call(fn, args)
+            @cuda.jit(lto=lto)
+            def use_external_add(r, x, y):
+                r[0] = external_add(x[0], y[0])
+            r = np.zeros(1, dtype=np.uint32)
+            x = np.ones(1, dtype=np.uint32)
+            y = np.ones(1, dtype=np.uint32) * 2
+            use_external_add[1, 1](r, x, y)
+            np.testing.assert_equal(r[0], 3)
+            @cuda.jit(lto=lto)
+            def use_external_add_device(x, y):
+                return external_add(x, y)
+            @cuda.jit(lto=lto)
+            def use_external_add_kernel(r, x, y):
+                r[0] = use_external_add_device(x[0], y[0])
+            r = np.zeros(1, dtype=np.uint32)
+            x = np.ones(1, dtype=np.uint32)
+            y = np.ones(1, dtype=np.uint32) * 2
+            use_external_add_kernel[1, 1](r, x, y)
+            np.testing.assert_equal(r[0], 3)
+    def test_linked_called_through_overload(self):
+        cu_code = cuda.CUSource("""
+            extern "C" __device__
+            int bar(int *out, int a)
+            {
+              *out = a * 2;
+              return 0;
+            }
+        """)
+        bar = cuda.declare_device("bar", "int32(int32)", link=cu_code)
+        def bar_call(val):
+            pass
+        @overload(bar_call, target="cuda")
+        def ol_bar_call(a):
+            return lambda a: bar(a)
+        @cuda.jit("void(int32[::1], int32[::1])")
+        def foo(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = bar_call(x[i])
+        x = np.arange(10, dtype=np.int32)
+        r = np.empty_like(x)
+        foo[1, 32](r, x)
+        np.testing.assert_equal(r, x * 2)
 if __name__ == "__main__":
     unittest.main()

numba_cuda/numba/cuda/tests/data/cta_barrier.cu ADDED Viewed

@@ -0,0 +1,23 @@
+#include <cooperative_groups.h>
+#include <cuda/barrier>
+namespace cg = cooperative_groups;
+__device__ void _wait_on_tile(cuda::barrier<cuda::thread_scope_block> &tile)
+{
+    auto token = tile.arrive();
+    tile.wait(std::move(token));
+}
+extern "C"
+__device__ int cta_barrier(int *ret) {
+    auto cta = cg::this_thread_block();
+    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta);
+    __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
+    if (threadIdx.x == 0) {
+        init(&barrier, blockDim.x);
+    }
+    _wait_on_tile(barrier);
+    return 0;
+}

numba_cuda/numba/cuda/tests/data/include/add.cuh ADDED Viewed

@@ -0,0 +1,3 @@
+// Templated addition function: myadd
+template <typename T>
+__device__ T myadd(T a, T b) { return a + b; }

numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh ADDED Viewed

@@ -0,0 +1,3 @@
+// Templated multiplication function: mymul
+template <typename T>
+__device__ T mymul(T a, T b) { return a * b; }

numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu ADDED Viewed

@@ -0,0 +1,9 @@
+#include <add.cuh> // In numba/cuda/tests/data/include
+#include <mul.cuh> // In numba/cuda/tests/doc_examples/ffi/include
+extern "C"
+__device__ int saxpy(float *ret, float a, float x, float y)
+{
+    *ret = myadd(mymul(a, x), y);
+    return 0;
+}

numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py CHANGED Viewed

@@ -3,7 +3,7 @@
 import unittest
 from numba.cuda.testing import CUDATestCase, skip_on_cudasim
-from numba.tests.support import skip_unless_cffi
+from numba.tests.support import skip_unless_cffi, override_config
 @skip_unless_cffi
@@ -85,6 +85,53 @@ class TestFFI(CUDATestCase):
         actual = r[()]
         np.testing.assert_allclose(expected, actual)
+    def test_ex_extra_includes(self):
+        import numpy as np
+        from numba import cuda, config
+        import os
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        mul_dir = os.path.join(basedir, "ffi", "include")
+        saxpy_cu = os.path.join(basedir, "ffi", "saxpy.cu")
+        testdir = os.path.dirname(basedir)
+        add_dir = os.path.join(testdir, "data", "include")
+        includedir = ":".join([mul_dir, add_dir])
+        with override_config("CUDA_NVRTC_EXTRA_SEARCH_PATHS", includedir):
+            # magictoken.ex_extra_search_paths.begin
+            from numba import config
+            includedir = ":".join([mul_dir, add_dir])
+            config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = includedir
+            # magictoken.ex_extra_search_paths.end
+            # magictoken.ex_extra_search_paths_kernel.begin
+            sig = "float32(float32, float32, float32)"
+            saxpy = cuda.declare_device("saxpy", sig=sig, link=saxpy_cu)
+            @cuda.jit
+            def vector_saxpy(a, x, y, res):
+                i = cuda.grid(1)
+                if i < len(res):
+                    res[i] = saxpy(a, x[i], y[i])
+            # magictoken.ex_extra_search_paths_kernel.end
+            size = 10_000
+            a = 3.0
+            X = np.ones((size,), dtype="float32")
+            Y = np.ones((size,), dtype="float32")
+            R = np.zeros((size,), dtype="float32")
+            block_size = 32
+            num_blocks = (size // block_size) + 1
+            vector_saxpy[num_blocks, block_size](a, X, Y, R)
+            expected = a * X + Y
+            np.testing.assert_equal(R, expected)
 if __name__ == "__main__":
     unittest.main()

numba_cuda/numba/cuda/tests/nrt/test_nrt.py CHANGED Viewed

@@ -4,11 +4,86 @@ import os
 import numpy as np
 import unittest
 from numba.cuda.testing import CUDATestCase
 from numba.tests.support import run_in_subprocess, override_config
+from numba.cuda import get_current_device
+from numba.cuda.cudadrv.nvrtc import compile
+from numba import types
+from numba.cuda.cudadecl import registry as cuda_decl_registry
+from numba.core.typing import signature
+from numba.cuda.cudaimpl import lower as cuda_lower
 from numba import cuda
-from numba.cuda.runtime.nrt import rtsys
+from numba.cuda.runtime.nrt import rtsys, get_include
+from numba.core.typing.templates import AbstractTemplate
+from numba.cuda.cudadrv.linkable_code import (
+    CUSource,
+    PTXSource,
+    Fatbin,
+    Cubin,
+    Archive,
+    Object,
+)
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+if TEST_BIN_DIR:
+    def make_linkable_code(name, kind, mode):
+        path = os.path.join(TEST_BIN_DIR, name)
+        with open(path, mode) as f:
+            contents = f.read()
+        return kind(contents, nrt=True)
+    nrt_extern_a = make_linkable_code("nrt_extern.a", Archive, "rb")
+    nrt_extern_cubin = make_linkable_code("nrt_extern.cubin", Cubin, "rb")
+    nrt_extern_cu = make_linkable_code(
+        "nrt_extern.cu",
+        CUSource,
+        "rb",
+    )
+    nrt_extern_fatbin = make_linkable_code("nrt_extern.fatbin", Fatbin, "rb")
+    nrt_extern_fatbin_multi = make_linkable_code(
+        "nrt_extern_multi.fatbin", Fatbin, "rb"
+    )
+    nrt_extern_o = make_linkable_code("nrt_extern.o", Object, "rb")
+    nrt_extern_ptx = make_linkable_code("nrt_extern.ptx", PTXSource, "rb")
+def allocate_deallocate_handle():
+    """
+    Handle to call NRT_Allocate and NRT_Free
+    """
+    pass
+@cuda_decl_registry.register_global(allocate_deallocate_handle)
+class AllocateShimImpl(AbstractTemplate):
+    def generic(self, args, kws):
+        return signature(types.void)
+device_fun_shim = cuda.declare_device(
+    "device_allocate_deallocate", types.int32()
+)
+# wrapper to turn the above into a python callable
+def call_device_fun_shim():
+    return device_fun_shim()
+@cuda_lower(allocate_deallocate_handle)
+def allocate_deallocate_impl(context, builder, sig, args):
+    sig_ = types.int32()
+    # call the external function, passing the pointer
+    result = context.compile_internal(
+        builder,
+        call_device_fun_shim,
+        sig_,
+        (),
+    )
+    return result
 class TestNrtBasic(CUDATestCase):
@@ -77,6 +152,50 @@ class TestNrtBasic(CUDATestCase):
         self.assertEqual(out_ary[0], 1)
+class TestNrtLinking(CUDATestCase):
+    def run(self, result=None):
+        with override_config("CUDA_ENABLE_NRT", True):
+            super(TestNrtLinking, self).run(result)
+    def test_nrt_detect_linked_ptx_file(self):
+        src = f"#include <{get_include()}/nrt.cuh>"
+        src += """
+                 extern "C" __device__ int device_allocate_deallocate(int* nb_retval){
+                     auto ptr = NRT_Allocate(1);
+                     NRT_Free(ptr);
+                     return 0;
+                 }
+        """
+        cc = get_current_device().compute_capability
+        ptx, _ = compile(src, "external_nrt.cu", cc)
+        @cuda.jit(link=[PTXSource(ptx.encode(), nrt=True)])
+        def kernel():
+            allocate_deallocate_handle()
+        kernel[1, 1]()
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_nrt_detect_linkable_code(self):
+        codes = (
+            nrt_extern_a,
+            nrt_extern_cubin,
+            nrt_extern_cu,
+            nrt_extern_fatbin,
+            nrt_extern_fatbin_multi,
+            nrt_extern_o,
+            nrt_extern_ptx,
+        )
+        for code in codes:
+            with self.subTest(code=code):
+                @cuda.jit(link=[code])
+                def kernel():
+                    allocate_deallocate_handle()
+                kernel[1, 1]()
 class TestNrtStatistics(CUDATestCase):
     def setUp(self):
         self._stream = cuda.default_stream()

numba_cuda/numba/cuda/tests/test_binary_generation/Makefile CHANGED Viewed

@@ -40,6 +40,8 @@ LTOIR_FLAGS := $(LTOIR_GENCODE) -dc
 OUTPUT_DIR := ./
+NRT_INCLUDE_DIR := $(shell python -c "from numba.cuda.runtime.nrt import get_include; print(get_include())")
 all:
 	@echo "GPU CC: $(GPU_CC)"
 	@echo "Alternative CC: $(ALT_CC)"
@@ -52,7 +54,16 @@ all:
 	nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu
+	nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.cubin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+	nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.fatbin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+	nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern_multi.fatbin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+	nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.ptx nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+	nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.o nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+	nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.a nrt_extern.cu -I$(NRT_INCLUDE_DIR)
 	# Generate LTO-IR wrapped in a fatbin
 	nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ltoir.o test_device_functions.cu
+	nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.ltoir.o nrt_extern.cu -I$(NRT_INCLUDE_DIR)
 	# Generate LTO-IR in a "raw" LTO-IR container
 	python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/test_device_functions.ltoir test_device_functions.cu
+	python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/nrt_extern.ltoir nrt_extern.cu --nrt

numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py CHANGED Viewed

@@ -7,6 +7,7 @@ import subprocess
 import sys
 from cuda import nvrtc
+from numba.cuda.runtime.nrt import get_include
 # Magic number found at the start of an LTO-IR file
 LTOIR_MAGIC = 0x7F4E43ED
@@ -88,7 +89,9 @@ def get_ltoir(source, name, arch):
         nvrtc.nvrtcCreateProgram(source.encode(), name.encode(), 0, [], [])
     )
-    cuda_include_flags = determine_include_flags()
+    cuda_include_flags = determine_include_flags() + (
+        [f"-I{get_include()}"] if args.nrt else []
+    )
     if cuda_include_flags is None:
         print("Error determining CUDA include flags. Exiting.", file=sys.stderr)
         sys.exit(1)
@@ -160,7 +163,7 @@ if __name__ == "__main__":
         help="compute arch to target (e.g. sm_87). Defaults to sm_50.",
         default="sm_50",
     )
+    parser.add_argument("--nrt", action="store_true")
     args = parser.parse_args()
     outputpath = args.output

numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu ADDED Viewed

@@ -0,0 +1,7 @@
+#include <nrt.cuh>
+extern "C" __device__ int device_allocate_deallocate(int* nb_retval){
+    auto ptr = NRT_Allocate(1);
+    NRT_Free(ptr);
+    return 0;
+}

numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu CHANGED Viewed

@@ -17,3 +17,7 @@ extern "C" __device__ int add_from_numba(uint32_t *result, uint32_t a,
   *result = a + b;
   return 0;
 }
+extern "C" __device__ uint32_t add_cabi(uint32_t a, uint32_t b) {
+  return a + b;
+}

numba_cuda/numba/cuda/utils.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import warnings
 import traceback
+import functools
 def _readenv(name, ctor, default):
@@ -20,3 +21,9 @@ def _readenv(name, ctor, default):
             RuntimeWarning,
         )
         return default
+@functools.lru_cache(maxsize=None)
+def cached_file_read(filepath, how="r"):
+    with open(filepath, how) as f:
+        return f.read()

{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: numba-cuda
-Version: 0.10.1
+Version: 0.12.1
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause

numba-cuda 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl

numba-cuda 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl