PyPI - numba-cuda - Versions diffs - 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl - Mend

numba-cuda 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

numba_cuda/VERSION +1 -1
numba_cuda/numba/cuda/{cuda_bf16.py → _internal/cuda_bf16.py} +1 -1
numba_cuda/numba/cuda/api.py +13 -0
numba_cuda/numba/cuda/bf16.py +112 -0
numba_cuda/numba/cuda/cg.py +2 -0
numba_cuda/numba/cuda/codegen.py +77 -2
numba_cuda/numba/cuda/compiler.py +22 -16
numba_cuda/numba/cuda/cudadecl.py +21 -6
numba_cuda/numba/cuda/cudadrv/driver.py +107 -20
numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
numba_cuda/numba/cuda/cudadrv/nvrtc.py +23 -1
numba_cuda/numba/cuda/cudaimpl.py +103 -11
numba_cuda/numba/cuda/debuginfo.py +27 -0
numba_cuda/numba/cuda/decorators.py +7 -2
numba_cuda/numba/cuda/dispatcher.py +25 -65
numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
numba_cuda/numba/cuda/runtime/nrt.py +13 -1
numba_cuda/numba/cuda/stubs.py +23 -11
numba_cuda/numba/cuda/target.py +10 -1
numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +55 -0
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +49 -23
numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +34 -0
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +140 -0
numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
numba_cuda/numba/cuda/utils.py +7 -0
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/METADATA +1 -1
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/RECORD +45 -35
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/WHEEL +1 -1
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/licenses/LICENSE +0 -0
{numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/cudadrv/nvrtc.py CHANGED Viewed

@@ -6,13 +6,21 @@ from numba.cuda.cudadrv.error import (
     NvrtcCompilationError,
     NvrtcSupportError,
 )
+from numba import config
 from numba.cuda.cuda_paths import get_cuda_paths
+from numba.cuda.utils import _readenv
 import functools
 import os
 import threading
 import warnings
+NVRTC_EXTRA_SEARCH_PATHS = _readenv(
+    "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", str, ""
+) or getattr(config, "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", "")
+if not hasattr(config, "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS"):
+    config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = NVRTC_EXTRA_SEARCH_PATHS
 # Opaque handle for compilation unit
 nvrtc_program = c_void_p
@@ -383,10 +391,24 @@ def compile(src, name, cc, ltoir=False):
     else:
         numba_include = f"-I{os.path.join(numba_cuda_path, 'include', '12')}"
+    if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS:
+        extra_search_paths = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":")
+        extra_includes = [f"-I{p}" for p in extra_search_paths]
+    else:
+        extra_includes = []
     nrt_path = os.path.join(numba_cuda_path, "runtime")
     nrt_include = f"-I{nrt_path}"
-    options = [arch, numba_include, *cuda_include, nrt_include, "-rdc", "true"]
+    options = [
+        arch,
+        numba_include,
+        *cuda_include,
+        nrt_include,
+        *extra_includes,
+        "-rdc",
+        "true",
+    ]
     if ltoir:
         options.append("-dlto")

numba_cuda/numba/cuda/cudaimpl.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from functools import reduce
 import operator
 import math
+import struct
 from llvmlite import ir
 import llvmlite.binding as ll
@@ -92,10 +93,61 @@ def _get_unique_smem_id(name):
     return "{0}_{1}".format(name, _unique_smem_id)
+def _validate_alignment(alignment: int):
+    """
+    Ensures that *alignment*, if not None, is a) greater than zero, b) a power
+    of two, and c) a multiple of the size of a pointer.  If any of these
+    conditions are not met, a ValueError is raised.  Otherwise, this
+    function returns None, indicating that the alignment is valid.
+    """
+    if alignment is None:
+        return
+    if not isinstance(alignment, int):
+        raise ValueError("Alignment must be an integer")
+    if alignment <= 0:
+        raise ValueError("Alignment must be positive")
+    if (alignment & (alignment - 1)) != 0:
+        raise ValueError("Alignment must be a power of 2")
+    pointer_size = struct.calcsize("P")
+    if (alignment % pointer_size) != 0:
+        msg = f"Alignment must be a multiple of {pointer_size}"
+        raise ValueError(msg)
+def _try_extract_and_validate_alignment(sig: types.Tuple):
+    """
+    Extracts and validates the alignment from the supplied signature.
+    Returns the alignment if it is present and is an integer literal;
+    otherwise, returns None.
+    N.B. Currently, this routine assumes the signature has exactly
+         three arguments, with the alignment (if present) as the third
+         argument, as is the case with the shared and local array
+         helper routines below.
+         If this routine is called from new places, you may need to
+         review this implicit assumption.
+    """
+    if len(sig.args) != 3:
+        return None
+    alignment_arg = sig.args[2]
+    if not isinstance(alignment_arg, types.IntegerLiteral):
+        return None
+    alignment_arg = alignment_arg.literal_value
+    _validate_alignment(alignment_arg)
+    return alignment_arg
 @lower(cuda.shared.array, types.IntegerLiteral, types.Any)
+@lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
+@lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.NoneType)
 def cuda_shared_array_integer(context, builder, sig, args):
     length = sig.args[0].literal_value
     dtype = parse_dtype(sig.args[1])
+    alignment = _try_extract_and_validate_alignment(sig)
     return _generic_array(
         context,
         builder,
@@ -104,14 +156,17 @@ def cuda_shared_array_integer(context, builder, sig, args):
         symbol_name=_get_unique_smem_id("_cudapy_smem"),
         addrspace=nvvm.ADDRSPACE_SHARED,
         can_dynsized=True,
+        alignment=alignment,
     )
-@lower(cuda.shared.array, types.Tuple, types.Any)
-@lower(cuda.shared.array, types.UniTuple, types.Any)
+@lower(cuda.shared.array, types.BaseTuple, types.Any)
+@lower(cuda.shared.array, types.BaseTuple, types.Any, types.IntegerLiteral)
+@lower(cuda.shared.array, types.BaseTuple, types.Any, types.NoneType)
 def cuda_shared_array_tuple(context, builder, sig, args):
     shape = [s.literal_value for s in sig.args[0]]
     dtype = parse_dtype(sig.args[1])
+    alignment = _try_extract_and_validate_alignment(sig)
     return _generic_array(
         context,
         builder,
@@ -120,13 +175,17 @@ def cuda_shared_array_tuple(context, builder, sig, args):
         symbol_name=_get_unique_smem_id("_cudapy_smem"),
         addrspace=nvvm.ADDRSPACE_SHARED,
         can_dynsized=True,
+        alignment=alignment,
     )
 @lower(cuda.local.array, types.IntegerLiteral, types.Any)
+@lower(cuda.local.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
+@lower(cuda.local.array, types.IntegerLiteral, types.Any, types.NoneType)
 def cuda_local_array_integer(context, builder, sig, args):
     length = sig.args[0].literal_value
     dtype = parse_dtype(sig.args[1])
+    alignment = _try_extract_and_validate_alignment(sig)
     return _generic_array(
         context,
         builder,
@@ -135,14 +194,17 @@ def cuda_local_array_integer(context, builder, sig, args):
         symbol_name="_cudapy_lmem",
         addrspace=nvvm.ADDRSPACE_LOCAL,
         can_dynsized=False,
+        alignment=alignment,
     )
-@lower(cuda.local.array, types.Tuple, types.Any)
-@lower(cuda.local.array, types.UniTuple, types.Any)
-def ptx_lmem_alloc_array(context, builder, sig, args):
+@lower(cuda.local.array, types.BaseTuple, types.Any)
+@lower(cuda.local.array, types.BaseTuple, types.Any, types.IntegerLiteral)
+@lower(cuda.local.array, types.BaseTuple, types.Any, types.NoneType)
+def cuda_local_array_tuple(context, builder, sig, args):
     shape = [s.literal_value for s in sig.args[0]]
     dtype = parse_dtype(sig.args[1])
+    alignment = _try_extract_and_validate_alignment(sig)
     return _generic_array(
         context,
         builder,
@@ -151,6 +213,7 @@ def ptx_lmem_alloc_array(context, builder, sig, args):
         symbol_name="_cudapy_lmem",
         addrspace=nvvm.ADDRSPACE_LOCAL,
         can_dynsized=False,
+        alignment=alignment,
     )
@@ -966,7 +1029,14 @@ def ptx_nanosleep(context, builder, sig, args):
 def _generic_array(
-    context, builder, shape, dtype, symbol_name, addrspace, can_dynsized=False
+    context,
+    builder,
+    shape,
+    dtype,
+    symbol_name,
+    addrspace,
+    can_dynsized=False,
+    alignment=None,
 ):
     elemcount = reduce(operator.mul, shape, 1)
@@ -994,6 +1064,14 @@ def _generic_array(
         # NVVM is smart enough to only use local memory if no register is
         # available
         dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name)
+        # If the caller has specified a custom alignment, just set the align
+        # attribute on the alloca IR directly.  We don't do any additional
+        # hand-holding here like checking the underlying data type's alignment
+        # or rounding up to the next power of 2--those checks will have already
+        # been done by the time we see the alignment value.
+        if alignment is not None:
+            dataptr.align = alignment
     else:
         lmod = builder.module
@@ -1001,11 +1079,25 @@ def _generic_array(
         gvmem = cgutils.add_global_variable(
             lmod, laryty, symbol_name, addrspace
         )
-        # Specify alignment to avoid misalignment bug
-        align = context.get_abi_sizeof(lldtype)
-        # Alignment is required to be a power of 2 for shared memory. If it is
-        # not a power of 2 (e.g. for a Record array) then round up accordingly.
-        gvmem.align = 1 << (align - 1).bit_length()
+        # If the caller hasn't specified a custom alignment, obtain the
+        # underlying dtype alignment from the ABI and then round it up to
+        # a power of two.  Otherwise, just use the caller's alignment.
+        #
+        # N.B. The caller *could* provide a valid-but-smaller-than-natural
+        #      alignment here; we'll assume the caller knows what they're
+        #      doing and let that through without error.
+        if alignment is None:
+            abi_alignment = context.get_abi_alignment(lldtype)
+            # Alignment is required to be a power of 2 for shared memory.
+            # If it is not a power of 2 (e.g. for a Record array) then round
+            # up accordingly.
+            actual_alignment = 1 << (abi_alignment - 1).bit_length()
+        else:
+            actual_alignment = alignment
+        gvmem.align = actual_alignment
         if dynamic_smem:
             gvmem.linkage = "external"

numba_cuda/numba/cuda/debuginfo.py CHANGED Viewed

@@ -59,6 +59,33 @@ class CUDADIBuilder(DIBuilder):
         # For other cases, use upstream Numba implementation
         return super()._var_type(lltype, size, datamodel=datamodel)
+    def _di_subroutine_type(self, line, function, argmap):
+        # The function call conv needs encoding.
+        llfunc = function
+        md = []
+        # Create metadata type for return value
+        if len(llfunc.args) > 0:
+            lltype = llfunc.args[0].type
+            size = self.cgctx.get_abi_sizeof(lltype)
+            mdtype = self._var_type(lltype, size, datamodel=None)
+            md.append(mdtype)
+        # Create metadata type for arguments
+        for idx, (name, nbtype) in enumerate(argmap.items()):
+            datamodel = self.cgctx.data_model_manager[nbtype]
+            lltype = self.cgctx.get_value_type(nbtype)
+            size = self.cgctx.get_abi_sizeof(lltype)
+            mdtype = self._var_type(lltype, size, datamodel=datamodel)
+            md.append(mdtype)
+        return self.module.add_debug_info(
+            "DISubroutineType",
+            {
+                "types": self.module.add_metadata(md),
+            },
+        )
     def mark_variable(
         self,
         builder,

numba_cuda/numba/cuda/decorators.py CHANGED Viewed

@@ -229,7 +229,7 @@ def jit(
                 return disp
-def declare_device(name, sig, link=None):
+def declare_device(name, sig, link=None, use_cooperative=False):
     """
     Declare the signature of a foreign function. Returns a descriptor that can
     be used to call the function from a Python kernel.
@@ -238,6 +238,7 @@ def declare_device(name, sig, link=None):
     :type name: str
     :param sig: The Numba signature of the function.
     :param link: External code to link when calling the function.
+    :param use_cooperative: External code requires cooperative launch.
     """
     if link is None:
         link = tuple()
@@ -250,4 +251,8 @@ def declare_device(name, sig, link=None):
         msg = "Return type must be provided for device declarations"
         raise TypeError(msg)
-    return declare_device_function(name, restype, argtypes, link)
+    template = declare_device_function(
+        name, restype, argtypes, link, use_cooperative
+    )
+    return template.key

numba_cuda/numba/cuda/dispatcher.py CHANGED Viewed

@@ -1,27 +1,25 @@
 import numpy as np
 import os
-import re
 import sys
 import ctypes
 import functools
-from collections import defaultdict
-from numba.core import config, ir, serialize, sigutils, types, typing, utils
+from numba.core import config, serialize, sigutils, types, typing, utils
 from numba.core.caching import Cache, CacheImpl
 from numba.core.compiler_lock import global_compiler_lock
 from numba.core.dispatcher import Dispatcher
 from numba.core.errors import NumbaPerformanceWarning
 from numba.core.typing.typeof import Purpose, typeof
-from numba.core.types.functions import Function
 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
 from numba.cuda.compiler import (
     compile_cuda,
     CUDACompiler,
     kernel_fixup,
-    ExternFunction,
 )
+import re
 from numba.cuda.cudadrv import driver
+from numba.cuda.cudadrv.linkable_code import LinkableCode
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.errors import (
@@ -29,7 +27,7 @@ from numba.cuda.errors import (
     normalize_kernel_dimensions,
 )
 from numba.cuda import types as cuda_types
-from numba.cuda.runtime.nrt import rtsys
+from numba.cuda.runtime.nrt import rtsys, NRT_LIBRARY
 from numba.cuda.locks import module_init_lock
 from numba import cuda
@@ -59,54 +57,6 @@ cuda_fp16_math_funcs = [
 reshape_funcs = ["nocopy_empty_reshape", "numba_attempt_nocopy_reshape"]
-def get_cres_link_objects(cres):
-    """Given a compile result, return a set of all linkable code objects that
-    are required for it to be fully linked."""
-    link_objects = set()
-    # List of calls into declared device functions
-    device_func_calls = [
-        (name, v)
-        for name, v in cres.fndesc.typemap.items()
-        if (isinstance(v, cuda_types.CUDADispatcher))
-    ]
-    # List of tuples with SSA name of calls and corresponding signature
-    call_signatures = [
-        (call.func.name, sig)
-        for call, sig in cres.fndesc.calltypes.items()
-        if (isinstance(call, ir.Expr) and call.op == "call")
-    ]
-    # Map SSA names to all invoked signatures
-    call_signature_d = defaultdict(list)
-    for name, sig in call_signatures:
-        call_signature_d[name].append(sig)
-    # Add the link objects from the current function's callees
-    for name, v in device_func_calls:
-        for sig in call_signature_d.get(name, []):
-            called_cres = v.dispatcher.overloads[sig.args]
-            called_link_objects = get_cres_link_objects(called_cres)
-            link_objects.update(called_link_objects)
-    # From this point onwards, we are only interested in ExternFunction
-    # declarations - these are the calls made directly in this function to
-    # them.
-    for name, v in cres.fndesc.typemap.items():
-        if not isinstance(v, Function):
-            continue
-        if not isinstance(v.typing_key, ExternFunction):
-            continue
-        for obj in v.typing_key.link:
-            link_objects.add(obj)
-    return link_objects
 class _Kernel(serialize.ReduceMixin):
     """
     CUDA Kernel specialized for a given set of argument types. When called, this
@@ -201,8 +151,8 @@ class _Kernel(serialize.ReduceMixin):
         asm = lib.get_asm_str()
-        # A kernel needs cooperative launch if grid_sync is being used.
-        self.cooperative = "cudaCGGetIntrinsicHandle" in asm
+        # The code library contains functions that require cooperative launch.
+        self.cooperative = lib.use_cooperative
         # We need to link against cudadevrt if grid sync is being used.
         if self.cooperative:
             lib.needs_cudadevrt = True
@@ -238,9 +188,6 @@ class _Kernel(serialize.ReduceMixin):
         self.maybe_link_nrt(link, tgt_ctx, asm)
-        for obj in get_cres_link_objects(cres):
-            lib.add_linking_file(obj)
         for filepath in link:
             lib.add_linking_file(filepath)
@@ -263,6 +210,13 @@ class _Kernel(serialize.ReduceMixin):
         self.reload_init = []
     def maybe_link_nrt(self, link, tgt_ctx, asm):
+        """
+        Add the NRT source code to the link if the neccesary conditions are met.
+        NRT must be enabled for the CUDATargetContext, and either NRT functions
+        must be detected in the kernel asm or an NRT enabled LinkableCode object
+        must be passed.
+        """
         if not tgt_ctx.enable_nrt:
             return
@@ -272,13 +226,19 @@ class _Kernel(serialize.ReduceMixin):
             + all_nrt
             + r")\s*\([^)]*\)\s*;"
         )
+        link_nrt = False
         nrt_in_asm = re.findall(pattern, asm)
-        basedir = os.path.dirname(os.path.abspath(__file__))
-        if nrt_in_asm:
-            nrt_path = os.path.join(basedir, "runtime", "nrt.cu")
-            link.append(nrt_path)
+        if len(nrt_in_asm) > 0:
+            link_nrt = True
+        if not link_nrt:
+            for file in link:
+                if isinstance(file, LinkableCode):
+                    if file.nrt:
+                        link_nrt = True
+                        break
+        if link_nrt:
+            link.append(NRT_LIBRARY)
     @property
     def library(self):

numba_cuda/numba/cuda/runtime/nrt.cu CHANGED Viewed

@@ -4,30 +4,14 @@
 #include <cuda/atomic>
 #include "memsys.cuh"
+#include "nrt.cuh"
-typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
-typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
-typedef struct MemInfo NRT_MemInfo;
-extern "C" {
-struct MemInfo {
-  cuda::atomic<size_t, cuda::thread_scope_device> refct;
-  NRT_dtor_function dtor;
-  void* dtor_info;
-  void* data;
-  size_t size;
-};
-}
 extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
 {
   TheMSys = memsys_ptr;
 }
-static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
-static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
-extern "C" __device__ void* NRT_Allocate_External(size_t size);
 extern "C" __device__ void* NRT_Allocate(size_t size)
 {
@@ -177,6 +161,7 @@ extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
   }
 }
 #endif
 extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)

numba_cuda/numba/cuda/runtime/nrt.cuh ADDED Viewed

@@ -0,0 +1,41 @@
+#include <cuda/atomic>
+typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
+typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
+extern "C"
+struct MemInfo {
+  cuda::atomic<size_t, cuda::thread_scope_device> refct;
+  NRT_dtor_function dtor;
+  void* dtor_info;
+  void* data;
+  size_t size;
+};
+typedef struct MemInfo NRT_MemInfo;
+extern "C" __device__ void* NRT_Allocate(size_t size);
+extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
+                                            void* data,
+                                            size_t size,
+                                            NRT_dtor_function dtor,
+                                            void* dtor_info);
+static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
+static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
+extern "C" __device__ void* NRT_Allocate_External(size_t size);
+extern "C" __device__ void NRT_decref(NRT_MemInfo* mi);
+extern "C" __device__ void NRT_incref(NRT_MemInfo* mi);
+extern "C" __device__ void* NRT_Allocate_External(size_t size);
+static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
+static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
+extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align);
+extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi);
+extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi);
+extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi);
+extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi);
+extern "C" __device__ void NRT_Free(void* ptr);
+extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new(void* data, size_t size, NRT_dtor_function dtor, void* dtor_info);
+extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
+                                            void* data,
+                                            size_t size,
+                                            NRT_dtor_function dtor,
+                                            void* dtor_info);

numba_cuda/numba/cuda/runtime/nrt.py CHANGED Viewed

@@ -13,7 +13,8 @@ from numba.cuda.cudadrv.driver import (
 )
 from numba.cuda.cudadrv import devices
 from numba.cuda.api import get_current_device
-from numba.cuda.utils import _readenv
+from numba.cuda.utils import _readenv, cached_file_read
+from numba.cuda.cudadrv.linkable_code import CUSource
 # Check environment variable or config for NRT statistics enablement
@@ -32,6 +33,11 @@ if not hasattr(config, "NUMBA_CUDA_ENABLE_NRT"):
     config.CUDA_ENABLE_NRT = ENABLE_NRT
+def get_include():
+    """Return the include path for the NRT header"""
+    return os.path.dirname(os.path.abspath(__file__))
 # Protect method to ensure NRT memory allocation and initialization
 def _alloc_init_guard(method):
     """
@@ -340,3 +346,9 @@ class _Runtime:
 # Create an instance of the runtime
 rtsys = _Runtime()
+basedir = os.path.dirname(os.path.abspath(__file__))
+nrt_path = os.path.join(basedir, "nrt.cu")
+nrt_src = cached_file_read(nrt_path)
+NRT_LIBRARY = CUSource(nrt_src, name="nrt.cu", nrt=True)

numba_cuda/numba/cuda/stubs.py CHANGED Viewed

@@ -129,12 +129,16 @@ class shared(Stub):
     _description_ = "<shared>"
     @stub_function
-    def array(shape, dtype):
+    def array(shape, dtype, alignment=None):
         """
-        Allocate a shared array of the given *shape* and *type*. *shape* is
-        either an integer or a tuple of integers representing the array's
-        dimensions.  *type* is a :ref:`Numba type <numba-types>` of the
-        elements needing to be stored in the array.
+        Allocate a shared array of the given *shape*, *type*, and, optionally,
+        *alignment*.  *shape* is either an integer or a tuple of integers
+        representing the array's dimensions.  *type* is a :ref:`Numba type
+        <numba-types>` of the elements needing to be stored in the array.
+        *alignment* is an optional integer specifying the byte alignment of
+        the array.  When specified, it must be a power of two, and a multiple
+        of the size of a pointer (8 bytes).  When not specified, the array is
+        allocated with an alignment appropriate for the supplied *dtype*.
         The returned array-like object can be read and written to like any
         normal device array (e.g. through indexing).
@@ -149,12 +153,20 @@ class local(Stub):
     _description_ = "<local>"
     @stub_function
-    def array(shape, dtype):
-        """
-        Allocate a local array of the given *shape* and *type*. The array is
-        private to the current thread, and resides in global memory. An
-        array-like object is returned which can be read and written to like any
-        standard array (e.g.  through indexing).
+    def array(shape, dtype, alignment=None):
+        """
+        Allocate a local array of the given *shape*, *type*, and, optionally,
+        *alignment*.  *shape* is either an integer or a tuple of integers
+        representing the array's dimensions.  *type* is a :ref:`Numba type
+        <numba-types>` of the elements needing to be stored in the array.
+        *alignment* is an optional integer specifying the byte alignment of
+        the array.  When specified, it must be a power of two, and a multiple
+        of the size of a pointer (8 bytes).  When not specified, the array is
+        allocated with an alignment appropriate for the supplied *dtype*.
+        The array is private to the current thread, and resides in global
+        memory.  An array-like object is returned which can be read and
+        written to like any standard array (e.g. through indexing).
         """

numba_cuda/numba/cuda/target.py CHANGED Viewed

@@ -290,7 +290,16 @@ class CUDATargetContext(BaseContext):
 class CUDACallConv(MinimalCallConv):
-    pass
+    def decorate_function(self, fn, args, fe_argtypes, noalias=False):
+        """
+        Set names and attributes of function arguments.
+        """
+        assert not noalias
+        arginfo = self._get_arg_packer(fe_argtypes)
+        # Do not prefix "arg." on argument name, so that nvvm compiler
+        # can track debug info of argument more accurately
+        arginfo.assign_names(self.get_arguments(fn), args)
+        fn.args[0].name = ".ret"
 class CUDACABICallConv(BaseCallConv):

numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py CHANGED Viewed

@@ -203,18 +203,6 @@ def simple_usecase_kernel(r, x):
 simple_usecase_caller = CUDAUseCase(simple_usecase_kernel)
-# Usecase with cooperative groups
-@cuda.jit(cache=True)
-def cg_usecase_kernel(r, x):
-    grid = cuda.cg.this_grid()
-    grid.sync()
-cg_usecase = CUDAUseCase(cg_usecase_kernel)
 class _TestModule(CUDATestCase):
     """
     Tests for functionality of this module's functions.

numba-cuda 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl

numba-cuda 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl