numba-cuda 0.17.0__py3-none-any.whl → 0.18.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of numba-cuda might be problematic.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +0 -8
- numba_cuda/numba/cuda/_internal/cuda_fp16.py +14225 -0
- numba_cuda/numba/cuda/api_util.py +6 -0
- numba_cuda/numba/cuda/cgutils.py +1291 -0
- numba_cuda/numba/cuda/codegen.py +32 -14
- numba_cuda/numba/cuda/compiler.py +113 -10
- numba_cuda/numba/cuda/core/caching.py +741 -0
- numba_cuda/numba/cuda/core/callconv.py +338 -0
- numba_cuda/numba/cuda/core/codegen.py +168 -0
- numba_cuda/numba/cuda/core/compiler.py +205 -0
- numba_cuda/numba/cuda/core/typed_passes.py +139 -0
- numba_cuda/numba/cuda/cudadecl.py +0 -268
- numba_cuda/numba/cuda/cudadrv/devicearray.py +3 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +2 -1
- numba_cuda/numba/cuda/cudadrv/nvvm.py +1 -1
- numba_cuda/numba/cuda/cudaimpl.py +4 -178
- numba_cuda/numba/cuda/debuginfo.py +469 -3
- numba_cuda/numba/cuda/device_init.py +0 -1
- numba_cuda/numba/cuda/dispatcher.py +310 -11
- numba_cuda/numba/cuda/extending.py +2 -1
- numba_cuda/numba/cuda/fp16.py +348 -0
- numba_cuda/numba/cuda/intrinsics.py +1 -1
- numba_cuda/numba/cuda/libdeviceimpl.py +2 -1
- numba_cuda/numba/cuda/lowering.py +1833 -8
- numba_cuda/numba/cuda/mathimpl.py +2 -90
- numba_cuda/numba/cuda/nvvmutils.py +2 -1
- numba_cuda/numba/cuda/printimpl.py +2 -1
- numba_cuda/numba/cuda/serialize.py +264 -0
- numba_cuda/numba/cuda/simulator/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/dispatcher.py +7 -0
- numba_cuda/numba/cuda/stubs.py +0 -308
- numba_cuda/numba/cuda/target.py +13 -5
- numba_cuda/numba/cuda/testing.py +156 -5
- numba_cuda/numba/cuda/tests/complex_usecases.py +113 -0
- numba_cuda/numba/cuda/tests/core/serialize_usecases.py +110 -0
- numba_cuda/numba/cuda/tests/core/test_serialize.py +359 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +10 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +33 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +5 -10
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +15 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +381 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +108 -24
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +37 -23
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +43 -27
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +26 -9
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +27 -2
- numba_cuda/numba/cuda/tests/enum_usecases.py +56 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +1 -2
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +1 -1
- numba_cuda/numba/cuda/utils.py +785 -0
- numba_cuda/numba/cuda/vector_types.py +1 -1
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/METADATA +18 -4
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/RECORD +63 -50
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -46
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/WHEEL +0 -0
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/core/typed_passes.py (new file)

@@ -0,0 +1,139 @@
+import abc
+import warnings
+from contextlib import contextmanager
+from numba.core import errors, types, funcdesc
+from numba.core.compiler_machinery import LoweringPass
+from llvmlite import binding as llvm
+
+
+@contextmanager
+def fallback_context(state, msg):
+    """
+    Wraps code that would signal a fallback to object mode
+    """
+    try:
+        yield
+    except Exception as e:
+        if not state.status.can_fallback:
+            raise
+        else:
+            # Clear all references attached to the traceback
+            e = e.with_traceback(None)
+            # this emits a warning containing the error message body in the
+            # case of fallback from npm to objmode
+            loop_lift = "" if state.flags.enable_looplift else "OUT"
+            msg_rewrite = (
+                "\nCompilation is falling back to object mode "
+                "WITH%s looplifting enabled because %s" % (loop_lift, msg)
+            )
+            warnings.warn_explicit(
+                "%s due to: %s" % (msg_rewrite, e),
+                errors.NumbaWarning,
+                state.func_id.filename,
+                state.func_id.firstlineno,
+            )
+            raise
+
+
+class BaseNativeLowering(abc.ABC, LoweringPass):
+    """The base class for a lowering pass. The lowering functionality must be
+    specified in inheriting classes by providing an appropriate lowering class
+    implementation in the overridden `lowering_class` property."""
+
+    _name = None
+
+    def __init__(self):
+        LoweringPass.__init__(self)
+
+    @property
+    @abc.abstractmethod
+    def lowering_class(self):
+        """Returns the class that performs the lowering of the IR describing the
+        function that is the target of the current compilation."""
+        pass
+
+    def run_pass(self, state):
+        if state.library is None:
+            codegen = state.targetctx.codegen()
+            state.library = codegen.create_library(state.func_id.func_qualname)
+            # Enable object caching upfront, so that the library can
+            # be later serialized.
+            state.library.enable_object_caching()
+
+        library = state.library
+        targetctx = state.targetctx
+        interp = state.func_ir  # why is it called this?!
+        typemap = state.typemap
+        restype = state.return_type
+        calltypes = state.calltypes
+        flags = state.flags
+        metadata = state.metadata
+        pre_stats = llvm.passmanagers.dump_refprune_stats()
+
+        msg = "Function %s failed at nopython mode lowering" % (
+            state.func_id.func_name,
+        )
+        with fallback_context(state, msg):
+            # Lowering
+            fndesc = (
+                funcdesc.PythonFunctionDescriptor.from_specialized_function(
+                    interp,
+                    typemap,
+                    restype,
+                    calltypes,
+                    mangler=targetctx.mangler,
+                    inline=flags.forceinline,
+                    noalias=flags.noalias,
+                    abi_tags=[flags.get_mangle_string()],
+                )
+            )
+
+            with targetctx.push_code_library(library):
+                lower = self.lowering_class(
+                    targetctx, library, fndesc, interp, metadata=metadata
+                )
+                lower.lower()
+                if not flags.no_cpython_wrapper:
+                    lower.create_cpython_wrapper(flags.release_gil)
+
+                if not flags.no_cfunc_wrapper:
+                    # skip cfunc wrapper generation if unsupported
+                    # argument or return types are used
+                    for t in state.args:
+                        if isinstance(t, (types.Omitted, types.Generator)):
+                            break
+                    else:
+                        if isinstance(
+                            restype, (types.Optional, types.Generator)
+                        ):
+                            pass
+                        else:
+                            lower.create_cfunc_wrapper()
+
+                env = lower.env
+                call_helper = lower.call_helper
+                del lower
+
+            from numba.core.compiler import _LowerResult  # TODO: move this
+
+            if flags.no_compile:
+                state["cr"] = _LowerResult(
+                    fndesc, call_helper, cfunc=None, env=env
+                )
+            else:
+                # Prepare for execution
+                # Insert native function for use by other jitted-functions.
+                # We also register its library to allow for inlining.
+                cfunc = targetctx.get_executable(library, fndesc, env)
+                targetctx.insert_user_function(cfunc, fndesc, [library])
+                state["cr"] = _LowerResult(
+                    fndesc, call_helper, cfunc=cfunc, env=env
+                )
+
+        # capture pruning stats
+        post_stats = llvm.passmanagers.dump_refprune_stats()
+        metadata["prune_stats"] = post_stats - pre_stats
+
+        # Save the LLVM pass timings
+        metadata["llvm_pass_timings"] = library.recorded_timings
+        return True
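The new `BaseNativeLowering` pass is abstract: `run_pass` above does all of the generic work, and a concrete pass only has to supply a `_name` and the `lowering_class` property. A minimal sketch of what such a subclass could look like; the subclass name and the `CUDALower` import location are illustrative assumptions, not taken from this diff:

```python
from numba.core.compiler_machinery import register_pass

from numba.cuda.core.typed_passes import BaseNativeLowering


@register_pass(mutates_CFG=True, analysis_only=False)
class SketchNativeLowering(BaseNativeLowering):
    """Hypothetical concrete pass: run_pass() in the base class instantiates
    lowering_class(targetctx, library, fndesc, func_ir, metadata=...) and
    drives lowering and wrapper creation."""

    _name = "sketch_native_lowering"

    @property
    def lowering_class(self):
        # Assumed location of a Lower implementation; any class exposing the
        # same constructor and lower()/create_*_wrapper() API would do.
        from numba.cuda.lowering import CUDALower

        return CUDALower
```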
numba_cuda/numba/cuda/cudadecl.py

@@ -1,4 +1,3 @@
-import operator
 from numba.core import errors, types
 from numba.core.typing.npydecl import (
     parse_dtype,
@@ -19,9 +18,7 @@ from numba.core.typing.templates import (
     Registry,
 )
 from numba.cuda.types import dim3
-from numba.core.typeconv import Conversion
 from numba import cuda
-from numba.cuda.compiler import declare_device_function
 
 registry = Registry()
 register = registry.register
@@ -188,14 +185,6 @@ class Cuda_fma(ConcreteTemplate):
     ]
 
 
-@register
-class Cuda_hfma(ConcreteTemplate):
-    key = cuda.fp16.hfma
-    cases = [
-        signature(types.float16, types.float16, types.float16, types.float16)
-    ]
-
-
 @register
 class Cuda_cbrt(ConcreteTemplate):
     key = cuda.cbrt
@@ -281,37 +270,6 @@ class Cuda_selp(AbstractTemplate):
         return signature(a, test, a, a)
 
 
-def _genfp16_unary(l_key):
-    @register
-    class Cuda_fp16_unary(ConcreteTemplate):
-        key = l_key
-        cases = [signature(types.float16, types.float16)]
-
-    return Cuda_fp16_unary
-
-
-def _genfp16_unary_operator(l_key):
-    @register_global(l_key)
-    class Cuda_fp16_unary(AbstractTemplate):
-        key = l_key
-
-        def generic(self, args, kws):
-            assert not kws
-            if len(args) == 1 and args[0] == types.float16:
-                return signature(types.float16, types.float16)
-
-    return Cuda_fp16_unary
-
-
-def _genfp16_binary(l_key):
-    @register
-    class Cuda_fp16_binary(ConcreteTemplate):
-        key = l_key
-        cases = [signature(types.float16, types.float16, types.float16)]
-
-    return Cuda_fp16_binary
-
-
 @register_global(float)
 class Float(AbstractTemplate):
     def generic(self, args, kws):
@@ -323,16 +281,6 @@ class Float(AbstractTemplate):
         return signature(arg, arg)
 
 
-def _genfp16_binary_comparison(l_key):
-    @register
-    class Cuda_fp16_cmp(ConcreteTemplate):
-        key = l_key
-
-        cases = [signature(types.b1, types.float16, types.float16)]
-
-    return Cuda_fp16_cmp
-
-
 # If multiple ConcreteTemplates provide typing for a single function, then
 # function resolution will pick the first compatible typing it finds even if it
 # involves inserting a cast that would be considered undesirable (in this
@@ -347,124 +295,6 @@ def _genfp16_binary_comparison(l_key):
 # with a ConcreteTemplate to simplify the logic.
 
 
-def _fp16_binary_operator(l_key, retty):
-    @register_global(l_key)
-    class Cuda_fp16_operator(AbstractTemplate):
-        key = l_key
-
-        def generic(self, args, kws):
-            assert not kws
-
-            if len(args) == 2 and (
-                args[0] == types.float16 or args[1] == types.float16
-            ):
-                if args[0] == types.float16:
-                    convertible = self.context.can_convert(args[1], args[0])
-                else:
-                    convertible = self.context.can_convert(args[0], args[1])
-
-                # We allow three cases here:
-                #
-                # 1. fp16 to fp16 - Conversion.exact
-                # 2. fp16 to other types fp16 can be promoted to
-                #    - Conversion.promote
-                # 3. fp16 to int8 (safe conversion) -
-                #    - Conversion.safe
-
-                if (
-                    (convertible == Conversion.exact)
-                    or (convertible == Conversion.promote)
-                    or (convertible == Conversion.safe)
-                ):
-                    return signature(retty, types.float16, types.float16)
-
-    return Cuda_fp16_operator
-
-
-def _genfp16_comparison_operator(op):
-    return _fp16_binary_operator(op, types.b1)
-
-
-def _genfp16_binary_operator(op):
-    return _fp16_binary_operator(op, types.float16)
-
-
-Cuda_hadd = _genfp16_binary(cuda.fp16.hadd)
-Cuda_add = _genfp16_binary_operator(operator.add)
-Cuda_iadd = _genfp16_binary_operator(operator.iadd)
-Cuda_hsub = _genfp16_binary(cuda.fp16.hsub)
-Cuda_sub = _genfp16_binary_operator(operator.sub)
-Cuda_isub = _genfp16_binary_operator(operator.isub)
-Cuda_hmul = _genfp16_binary(cuda.fp16.hmul)
-Cuda_mul = _genfp16_binary_operator(operator.mul)
-Cuda_imul = _genfp16_binary_operator(operator.imul)
-Cuda_hmax = _genfp16_binary(cuda.fp16.hmax)
-Cuda_hmin = _genfp16_binary(cuda.fp16.hmin)
-Cuda_hneg = _genfp16_unary(cuda.fp16.hneg)
-Cuda_neg = _genfp16_unary_operator(operator.neg)
-Cuda_habs = _genfp16_unary(cuda.fp16.habs)
-Cuda_abs = _genfp16_unary_operator(abs)
-Cuda_heq = _genfp16_binary_comparison(cuda.fp16.heq)
-_genfp16_comparison_operator(operator.eq)
-Cuda_hne = _genfp16_binary_comparison(cuda.fp16.hne)
-_genfp16_comparison_operator(operator.ne)
-Cuda_hge = _genfp16_binary_comparison(cuda.fp16.hge)
-_genfp16_comparison_operator(operator.ge)
-Cuda_hgt = _genfp16_binary_comparison(cuda.fp16.hgt)
-_genfp16_comparison_operator(operator.gt)
-Cuda_hle = _genfp16_binary_comparison(cuda.fp16.hle)
-_genfp16_comparison_operator(operator.le)
-Cuda_hlt = _genfp16_binary_comparison(cuda.fp16.hlt)
-_genfp16_comparison_operator(operator.lt)
-_genfp16_binary_operator(operator.truediv)
-_genfp16_binary_operator(operator.itruediv)
-
-
-def _resolve_wrapped_unary(fname):
-    link = tuple()
-    decl = declare_device_function(
-        f"__numba_wrapper_{fname}",
-        types.float16,
-        (types.float16,),
-        link,
-        use_cooperative=False,
-    )
-    return types.Function(decl)
-
-
-def _resolve_wrapped_binary(fname):
-    link = tuple()
-    decl = declare_device_function(
-        f"__numba_wrapper_{fname}",
-        types.float16,
-        (
-            types.float16,
-            types.float16,
-        ),
-        link,
-        use_cooperative=False,
-    )
-    return types.Function(decl)
-
-
-hsin_device = _resolve_wrapped_unary("hsin")
-hcos_device = _resolve_wrapped_unary("hcos")
-hlog_device = _resolve_wrapped_unary("hlog")
-hlog10_device = _resolve_wrapped_unary("hlog10")
-hlog2_device = _resolve_wrapped_unary("hlog2")
-hexp_device = _resolve_wrapped_unary("hexp")
-hexp10_device = _resolve_wrapped_unary("hexp10")
-hexp2_device = _resolve_wrapped_unary("hexp2")
-hsqrt_device = _resolve_wrapped_unary("hsqrt")
-hrsqrt_device = _resolve_wrapped_unary("hrsqrt")
-hfloor_device = _resolve_wrapped_unary("hfloor")
-hceil_device = _resolve_wrapped_unary("hceil")
-hrcp_device = _resolve_wrapped_unary("hrcp")
-hrint_device = _resolve_wrapped_unary("hrint")
-htrunc_device = _resolve_wrapped_unary("htrunc")
-hdiv_device = _resolve_wrapped_binary("hdiv")
-
-
 # generate atomic operations
 def _gen(l_key, supported_types):
     @register
@@ -641,101 +471,6 @@ class CudaAtomicTemplate(AttributeTemplate):
         return types.Function(Cuda_atomic_cas)
 
 
-@register_attr
-class CudaFp16Template(AttributeTemplate):
-    key = types.Module(cuda.fp16)
-
-    def resolve_hadd(self, mod):
-        return types.Function(Cuda_hadd)
-
-    def resolve_hsub(self, mod):
-        return types.Function(Cuda_hsub)
-
-    def resolve_hmul(self, mod):
-        return types.Function(Cuda_hmul)
-
-    def resolve_hdiv(self, mod):
-        return hdiv_device
-
-    def resolve_hneg(self, mod):
-        return types.Function(Cuda_hneg)
-
-    def resolve_habs(self, mod):
-        return types.Function(Cuda_habs)
-
-    def resolve_hfma(self, mod):
-        return types.Function(Cuda_hfma)
-
-    def resolve_hsin(self, mod):
-        return hsin_device
-
-    def resolve_hcos(self, mod):
-        return hcos_device
-
-    def resolve_hlog(self, mod):
-        return hlog_device
-
-    def resolve_hlog10(self, mod):
-        return hlog10_device
-
-    def resolve_hlog2(self, mod):
-        return hlog2_device
-
-    def resolve_hexp(self, mod):
-        return hexp_device
-
-    def resolve_hexp10(self, mod):
-        return hexp10_device
-
-    def resolve_hexp2(self, mod):
-        return hexp2_device
-
-    def resolve_hfloor(self, mod):
-        return hfloor_device
-
-    def resolve_hceil(self, mod):
-        return hceil_device
-
-    def resolve_hsqrt(self, mod):
-        return hsqrt_device
-
-    def resolve_hrsqrt(self, mod):
-        return hrsqrt_device
-
-    def resolve_hrcp(self, mod):
-        return hrcp_device
-
-    def resolve_hrint(self, mod):
-        return hrint_device
-
-    def resolve_htrunc(self, mod):
-        return htrunc_device
-
-    def resolve_heq(self, mod):
-        return types.Function(Cuda_heq)
-
-    def resolve_hne(self, mod):
-        return types.Function(Cuda_hne)
-
-    def resolve_hge(self, mod):
-        return types.Function(Cuda_hge)
-
-    def resolve_hgt(self, mod):
-        return types.Function(Cuda_hgt)
-
-    def resolve_hle(self, mod):
-        return types.Function(Cuda_hle)
-
-    def resolve_hlt(self, mod):
-        return types.Function(Cuda_hlt)
-
-    def resolve_hmax(self, mod):
-        return types.Function(Cuda_hmax)
-
-    def resolve_hmin(self, mod):
-        return types.Function(Cuda_hmin)
-
-
 @register_attr
 class CudaModuleTemplate(AttributeTemplate):
     key = types.Module(cuda)
@@ -815,9 +550,6 @@ class CudaModuleTemplate(AttributeTemplate):
     def resolve_atomic(self, mod):
        return types.Module(cuda.atomic)
 
-    def resolve_fp16(self, mod):
-        return types.Module(cuda.fp16)
-
     def resolve_const(self, mod):
         return types.Module(cuda.const)
 
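All of the float16 typing removed above (the operator templates, the `cuda.fp16.*` intrinsic templates, and the `CudaFp16Template` attribute template) leaves cudadecl.py; per the file listing, the new `numba_cuda/numba/cuda/fp16.py` module (+348 lines) takes over fp16 support. User-level kernels are expected to keep compiling unchanged. A small sketch, assuming a CUDA GPU with float16 arithmetic support (compute capability 5.3 or later):

```python
import numpy as np
from numba import cuda


@cuda.jit
def scale_and_add(x, y, out):
    i = cuda.grid(1)
    if i < out.size:
        # float16 operator typing, previously provided by the removed
        # templates in cudadecl.py
        out[i] = x[i] * y[i] + x[i]


x = np.arange(32, dtype=np.float16)
y = np.full(32, 2.0, dtype=np.float16)
out = np.zeros_like(x)
scale_and_add[1, 32](x, y, out)
print(out[:4])
```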
numba_cuda/numba/cuda/cudadrv/devicearray.py

@@ -92,6 +92,9 @@ class DeviceNDArrayBase(_devicearray.DeviceArray):
         self._dummy = dummyarray.Array.from_desc(
             0, shape, strides, dtype.itemsize
         )
+        # confirm that all elements of shape are ints
+        if not all(isinstance(dim, (int, np.integer)) for dim in shape):
+            raise TypeError("all elements of shape must be ints")
         self.shape = tuple(shape)
         self.strides = tuple(strides)
         self.dtype = dtype
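The added check rejects non-integer shape elements as soon as a device array is constructed. A quick sketch of the behaviour this is expected to produce through the usual `cuda.device_array` entry point (requires a CUDA device):

```python
from numba import cuda

# Integer shapes are fine.
arr = cuda.device_array((4, 8), dtype="float32")

# A float that sneaks into the shape now fails early with a clear
# TypeError ("all elements of shape must be ints") instead of surfacing
# as a confusing error later on.
try:
    cuda.device_array((4, 8.0), dtype="float32")
except TypeError as exc:
    print(exc)
```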
numba_cuda/numba/cuda/cudadrv/driver.py

@@ -44,7 +44,8 @@ from collections import namedtuple, deque
 
 
 from numba import mviewbuf
-from numba.core import
+from numba.core import config
+from numba.cuda import utils, serialize
 from .error import CudaSupportError, CudaDriverError
 from .drvapi import API_PROTOTYPES
 from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
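`utils` and `serialize` are now imported from the vendored copies under `numba.cuda` (the new `utils.py` and `serialize.py` in the file listing), while `config` still comes from `numba.core`. For reference, the new import locations as a stand-alone snippet:

```python
# Matches the added lines in the hunk above: helper modules are vendored
# under numba.cuda in 0.18.x, configuration is still read from numba.core.
from numba.core import config
from numba.cuda import utils, serialize
```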
numba_cuda/numba/cuda/cudadrv/nvvm.py

@@ -14,7 +14,7 @@ from llvmlite import ir
 
 from .error import NvvmError, NvvmSupportError, NvvmWarning
 from .libs import get_libdevice, open_libdevice, open_cudalib
-from numba.
+from numba.cuda import cgutils
 
 
 logger = logging.getLogger(__name__)

numba_cuda/numba/cuda/cudaimpl.py

@@ -6,15 +6,16 @@ import struct
 from llvmlite import ir
 import llvmlite.binding as ll
 
-from numba.core.imputils import Registry
+from numba.core.imputils import Registry
 from numba.core.typing.npydecl import parse_dtype
 from numba.core.datamodel import models
-from numba.core import types
+from numba.core import types
+from numba.cuda import cgutils
 from numba.np import ufunc_db
 from numba.np.npyimpl import register_ufuncs
 from .cudadrv import nvvm
 from numba import cuda
-from numba.cuda import nvvmutils, stubs
+from numba.cuda import nvvmutils, stubs
 from numba.cuda.types import dim3, CUDADispatcher
 
 registry = Registry()
@@ -346,181 +347,6 @@ def ptx_fma(context, builder, sig, args):
     return builder.fma(*args)
 
 
-def float16_float_ty_constraint(bitwidth):
-    typemap = {32: ("f32", "f"), 64: ("f64", "d")}
-
-    try:
-        return typemap[bitwidth]
-    except KeyError:
-        msg = f"Conversion between float16 and float{bitwidth} unsupported"
-        raise errors.CudaLoweringError(msg)
-
-
-@lower_cast(types.float16, types.Float)
-def float16_to_float_cast(context, builder, fromty, toty, val):
-    if fromty.bitwidth == toty.bitwidth:
-        return val
-
-    ty, constraint = float16_float_ty_constraint(toty.bitwidth)
-
-    fnty = ir.FunctionType(context.get_value_type(toty), [ir.IntType(16)])
-    asm = ir.InlineAsm(fnty, f"cvt.{ty}.f16 $0, $1;", f"={constraint},h")
-    return builder.call(asm, [val])
-
-
-@lower_cast(types.Float, types.float16)
-def float_to_float16_cast(context, builder, fromty, toty, val):
-    if fromty.bitwidth == toty.bitwidth:
-        return val
-
-    ty, constraint = float16_float_ty_constraint(fromty.bitwidth)
-
-    fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)])
-    asm = ir.InlineAsm(fnty, f"cvt.rn.f16.{ty} $0, $1;", f"=h,{constraint}")
-    return builder.call(asm, [val])
-
-
-def float16_int_constraint(bitwidth):
-    typemap = {8: "c", 16: "h", 32: "r", 64: "l"}
-
-    try:
-        return typemap[bitwidth]
-    except KeyError:
-        msg = f"Conversion between float16 and int{bitwidth} unsupported"
-        raise errors.CudaLoweringError(msg)
-
-
-@lower_cast(types.float16, types.Integer)
-def float16_to_integer_cast(context, builder, fromty, toty, val):
-    bitwidth = toty.bitwidth
-    constraint = float16_int_constraint(bitwidth)
-    signedness = "s" if toty.signed else "u"
-
-    fnty = ir.FunctionType(context.get_value_type(toty), [ir.IntType(16)])
-    asm = ir.InlineAsm(
-        fnty, f"cvt.rni.{signedness}{bitwidth}.f16 $0, $1;", f"={constraint},h"
-    )
-    return builder.call(asm, [val])
-
-
-@lower_cast(types.Integer, types.float16)
-@lower_cast(types.IntegerLiteral, types.float16)
-def integer_to_float16_cast(context, builder, fromty, toty, val):
-    bitwidth = fromty.bitwidth
-    constraint = float16_int_constraint(bitwidth)
-    signedness = "s" if fromty.signed else "u"
-
-    fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)])
-    asm = ir.InlineAsm(
-        fnty, f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;", f"=h,{constraint}"
-    )
-    return builder.call(asm, [val])
-
-
-def lower_fp16_binary(fn, op):
-    @lower(fn, types.float16, types.float16)
-    def ptx_fp16_binary(context, builder, sig, args):
-        fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)])
-        asm = ir.InlineAsm(fnty, f"{op}.f16 $0,$1,$2;", "=h,h,h")
-        return builder.call(asm, args)
-
-
-lower_fp16_binary(stubs.fp16.hadd, "add")
-lower_fp16_binary(operator.add, "add")
-lower_fp16_binary(operator.iadd, "add")
-lower_fp16_binary(stubs.fp16.hsub, "sub")
-lower_fp16_binary(operator.sub, "sub")
-lower_fp16_binary(operator.isub, "sub")
-lower_fp16_binary(stubs.fp16.hmul, "mul")
-lower_fp16_binary(operator.mul, "mul")
-lower_fp16_binary(operator.imul, "mul")
-
-
-@lower(stubs.fp16.hneg, types.float16)
-def ptx_fp16_hneg(context, builder, sig, args):
-    fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)])
-    asm = ir.InlineAsm(fnty, "neg.f16 $0, $1;", "=h,h")
-    return builder.call(asm, args)
-
-
-@lower(operator.neg, types.float16)
-def operator_hneg(context, builder, sig, args):
-    return ptx_fp16_hneg(context, builder, sig, args)
-
-
-@lower(stubs.fp16.habs, types.float16)
-def ptx_fp16_habs(context, builder, sig, args):
-    fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)])
-    asm = ir.InlineAsm(fnty, "abs.f16 $0, $1;", "=h,h")
-    return builder.call(asm, args)
-
-
-@lower(abs, types.float16)
-def operator_habs(context, builder, sig, args):
-    return ptx_fp16_habs(context, builder, sig, args)
-
-
-@lower(stubs.fp16.hfma, types.float16, types.float16, types.float16)
-def ptx_hfma(context, builder, sig, args):
-    argtys = [ir.IntType(16), ir.IntType(16), ir.IntType(16)]
-    fnty = ir.FunctionType(ir.IntType(16), argtys)
-    asm = ir.InlineAsm(fnty, "fma.rn.f16 $0,$1,$2,$3;", "=h,h,h,h")
-    return builder.call(asm, args)
-
-
-@lower(operator.truediv, types.float16, types.float16)
-@lower(operator.itruediv, types.float16, types.float16)
-def fp16_div_impl(context, builder, sig, args):
-    def fp16_div(x, y):
-        return cuda.fp16.hdiv(x, y)
-
-    return context.compile_internal(builder, fp16_div, sig, args)
-
-
-_fp16_cmp = """{{
-    .reg .pred __$$f16_cmp_tmp;
-    setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
-    selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
-}}"""
-
-
-def _gen_fp16_cmp(op):
-    def ptx_fp16_comparison(context, builder, sig, args):
-        fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)])
-        asm = ir.InlineAsm(fnty, _fp16_cmp.format(op=op), "=h,h,h")
-        result = builder.call(asm, args)
-
-        zero = context.get_constant(types.int16, 0)
-        int_result = builder.bitcast(result, ir.IntType(16))
-        return builder.icmp_unsigned("!=", int_result, zero)
-
-    return ptx_fp16_comparison
-
-
-lower(stubs.fp16.heq, types.float16, types.float16)(_gen_fp16_cmp("eq"))
-lower(operator.eq, types.float16, types.float16)(_gen_fp16_cmp("eq"))
-lower(stubs.fp16.hne, types.float16, types.float16)(_gen_fp16_cmp("ne"))
-lower(operator.ne, types.float16, types.float16)(_gen_fp16_cmp("ne"))
-lower(stubs.fp16.hge, types.float16, types.float16)(_gen_fp16_cmp("ge"))
-lower(operator.ge, types.float16, types.float16)(_gen_fp16_cmp("ge"))
-lower(stubs.fp16.hgt, types.float16, types.float16)(_gen_fp16_cmp("gt"))
-lower(operator.gt, types.float16, types.float16)(_gen_fp16_cmp("gt"))
-lower(stubs.fp16.hle, types.float16, types.float16)(_gen_fp16_cmp("le"))
-lower(operator.le, types.float16, types.float16)(_gen_fp16_cmp("le"))
-lower(stubs.fp16.hlt, types.float16, types.float16)(_gen_fp16_cmp("lt"))
-lower(operator.lt, types.float16, types.float16)(_gen_fp16_cmp("lt"))
-
-
-def lower_fp16_minmax(fn, fname, op):
-    @lower(fn, types.float16, types.float16)
-    def ptx_fp16_minmax(context, builder, sig, args):
-        choice = _gen_fp16_cmp(op)(context, builder, sig, args)
-        return builder.select(choice, args[0], args[1])
-
-
-lower_fp16_minmax(stubs.fp16.hmax, "max", "gt")
-lower_fp16_minmax(stubs.fp16.hmin, "min", "lt")
-
 # See:
 # https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_cbrt.html#__nv_cbrt
 # https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_cbrtf.html#__nv_cbrtf