PyPI - numba-cuda - Versions diffs - 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl - Mend

numba-cuda 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

numba_cuda/VERSION +1 -1
numba_cuda/numba/cuda/compiler.py +35 -3
numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
numba_cuda/numba/cuda/cuda_paths.py +2 -0
numba_cuda/numba/cuda/cudadecl.py +0 -42
numba_cuda/numba/cuda/cudadrv/linkable_code.py +11 -2
numba_cuda/numba/cuda/cudadrv/nvrtc.py +10 -3
numba_cuda/numba/cuda/cudaimpl.py +0 -63
numba_cuda/numba/cuda/debuginfo.py +92 -2
numba_cuda/numba/cuda/decorators.py +27 -1
numba_cuda/numba/cuda/device_init.py +4 -5
numba_cuda/numba/cuda/dispatcher.py +4 -3
numba_cuda/numba/cuda/extending.py +54 -0
numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +550 -387
numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +465 -316
numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
numba_cuda/numba/cuda/intrinsics.py +172 -1
numba_cuda/numba/cuda/lowering.py +43 -0
numba_cuda/numba/cuda/stubs.py +0 -11
numba_cuda/numba/cuda/target.py +28 -0
numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +4 -2
numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +46 -0
numba_cuda/numba/cuda/tests/cudapy/test_enums.py +18 -0
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +4 -2
numba_cuda/numba/cuda/tests/cudapy/test_inline.py +156 -0
numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +50 -5
numba_cuda/numba/cuda/vector_types.py +3 -1
numba_cuda/numba/cuda/vectorizers.py +1 -1
{numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/METADATA +1 -1
{numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/RECORD +43 -33
{numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/WHEEL +1 -1
{numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/licenses/LICENSE +0 -0
{numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/top_level.txt +0 -0

numba_cuda/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.9.0
1	+ 0.10.1

numba_cuda/numba/cuda/compiler.py CHANGED Viewed

@@ -40,6 +40,7 @@ from numba.cuda.api import get_current_device
 from numba.cuda.cudadrv import nvvm
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.target import CUDACABICallConv
+from numba.cuda import lowering
 def _nvvm_options_type(x):
@@ -163,6 +164,18 @@ class CreateLibrary(LoweringPass):
         return True
+@register_pass(mutates_CFG=True, analysis_only=False)
+class CUDANativeLowering(NativeLowering):
+    """Lowering pass for a CUDA native function IR described solely in terms of
+    Numba's standard `numba.core.ir` nodes."""
+    _name = "cuda_native_lowering"
+    @property
+    def lowering_class(self):
+        return lowering.CUDALower
 class CUDABytecodeInterpreter(Interpreter):
     # Based on the superclass implementation, but names the resulting variable
     # "$bool<N>" instead of "bool<N>" - see Numba PR #9888:
@@ -251,7 +264,7 @@ class CUDACompiler(CompilerBase):
         # lower
         pm.add_pass(CreateLibrary, "create library")
-        pm.add_pass(NativeLowering, "native lowering")
+        pm.add_pass(CUDANativeLowering, "cuda native lowering")
         pm.add_pass(CUDABackend, "cuda backend")
         pm.finalize()
@@ -265,7 +278,7 @@ def compile_cuda(
     args,
     debug=False,
     lineinfo=False,
-    inline=False,
+    forceinline=False,
     fastmath=False,
     nvvm_options=None,
     cc=None,
@@ -303,7 +316,7 @@ def compile_cuda(
     else:
         flags.error_model = "numpy"
-    if inline:
+    if forceinline:
         flags.forceinline = True
     if fastmath:
         flags.fastmath = True
@@ -561,6 +574,7 @@ def compile(
     abi="c",
     abi_info=None,
     output="ptx",
+    forceinline=False,
 ):
     """Compile a Python function to PTX or LTO-IR for a given set of argument
     types.
@@ -601,6 +615,11 @@ def compile(
     :type abi_info: dict
     :param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
     :type output: str
+    :param forceinline: Enables inlining at the NVVM IR level when set to
+                        ``True``. This is accomplished by adding the
+                        ``alwaysinline`` function attribute to the function
+                        definition. This is only valid when the output is
+                        ``"ltoir"``.
     :return: (code, resty): The compiled code and inferred return type
     :rtype: tuple
     """
@@ -613,6 +632,12 @@ def compile(
     if output not in ("ptx", "ltoir"):
         raise NotImplementedError(f"Unsupported output type: {output}")
+    if forceinline and not device:
+        raise ValueError("Cannot force-inline kernels")
+    if forceinline and output != "ltoir":
+        raise ValueError("Can only designate forced inlining in LTO-IR")
     debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
     opt = (config.OPT != 0) if opt is None else opt
@@ -647,6 +672,7 @@ def compile(
         fastmath=fastmath,
         nvvm_options=nvvm_options,
         cc=cc,
+        forceinline=forceinline,
     )
     resty = cres.signature.return_type
@@ -686,6 +712,7 @@ def compile_for_current_device(
     abi="c",
     abi_info=None,
     output="ptx",
+    forceinline=False,
 ):
     """Compile a Python function to PTX or LTO-IR for a given signature for the
     current device's compute capability. This calls :func:`compile` with an
@@ -703,6 +730,7 @@ def compile_for_current_device(
         abi=abi,
         abi_info=abi_info,
         output=output,
+        forceinline=forceinline,
     )
@@ -717,6 +745,7 @@ def compile_ptx(
     opt=None,
     abi="numba",
     abi_info=None,
+    forceinline=False,
 ):
     """Compile a Python function to PTX for a given signature. See
     :func:`compile`. The defaults for this function are to compile a kernel
@@ -734,6 +763,7 @@ def compile_ptx(
         abi=abi,
         abi_info=abi_info,
         output="ptx",
+        forceinline=forceinline,
     )
@@ -747,6 +777,7 @@ def compile_ptx_for_current_device(
     opt=None,
     abi="numba",
     abi_info=None,
+    forceinline=False,
 ):
     """Compile a Python function to PTX for a given signature for the current
     device's compute capability. See :func:`compile_ptx`."""
@@ -762,6 +793,7 @@ def compile_ptx_for_current_device(
         opt=opt,
         abi=abi,
         abi_info=abi_info,
+        forceinline=forceinline,
     )

numba-cuda 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

numba-cuda 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl