PyPI - numba-cuda - Versions diffs - 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl - Mend

numba-cuda 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70)

numba_cuda/VERSION +1 -1
numba_cuda/numba/cuda/__init__.py +0 -8
numba_cuda/numba/cuda/_internal/cuda_fp16.py +14225 -0
numba_cuda/numba/cuda/api_util.py +6 -0
numba_cuda/numba/cuda/cgutils.py +1291 -0
numba_cuda/numba/cuda/codegen.py +32 -14
numba_cuda/numba/cuda/compiler.py +113 -10
numba_cuda/numba/cuda/core/caching.py +741 -0
numba_cuda/numba/cuda/core/callconv.py +338 -0
numba_cuda/numba/cuda/core/codegen.py +168 -0
numba_cuda/numba/cuda/core/compiler.py +205 -0
numba_cuda/numba/cuda/core/typed_passes.py +139 -0
numba_cuda/numba/cuda/cuda_paths.py +1 -1
numba_cuda/numba/cuda/cudadecl.py +0 -268
numba_cuda/numba/cuda/cudadrv/devicearray.py +3 -0
numba_cuda/numba/cuda/cudadrv/devices.py +4 -6
numba_cuda/numba/cuda/cudadrv/driver.py +105 -50
numba_cuda/numba/cuda/cudadrv/nvvm.py +1 -1
numba_cuda/numba/cuda/cudaimpl.py +4 -178
numba_cuda/numba/cuda/debuginfo.py +469 -3
numba_cuda/numba/cuda/device_init.py +0 -1
numba_cuda/numba/cuda/dispatcher.py +311 -14
numba_cuda/numba/cuda/extending.py +2 -1
numba_cuda/numba/cuda/fp16.py +348 -0
numba_cuda/numba/cuda/intrinsics.py +1 -1
numba_cuda/numba/cuda/libdeviceimpl.py +2 -1
numba_cuda/numba/cuda/lowering.py +1833 -8
numba_cuda/numba/cuda/mathimpl.py +2 -90
numba_cuda/numba/cuda/memory_management/nrt.py +1 -1
numba_cuda/numba/cuda/nvvmutils.py +2 -1
numba_cuda/numba/cuda/printimpl.py +2 -1
numba_cuda/numba/cuda/serialize.py +264 -0
numba_cuda/numba/cuda/simulator/__init__.py +2 -0
numba_cuda/numba/cuda/simulator/dispatcher.py +7 -0
numba_cuda/numba/cuda/stubs.py +0 -308
numba_cuda/numba/cuda/target.py +13 -5
numba_cuda/numba/cuda/testing.py +156 -5
numba_cuda/numba/cuda/tests/complex_usecases.py +113 -0
numba_cuda/numba/cuda/tests/core/serialize_usecases.py +110 -0
numba_cuda/numba/cuda/tests/core/test_serialize.py +359 -0
numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +16 -5
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +5 -1
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +33 -0
numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +2 -2
numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +1 -0
numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_caching.py +5 -10
numba_cuda/numba/cuda/tests/cudapy/test_complex.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +1 -5
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +381 -0
numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +94 -24
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +37 -23
numba_cuda/numba/cuda/tests/cudapy/test_operator.py +43 -27
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +2 -5
numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +26 -9
numba_cuda/numba/cuda/tests/cudapy/test_warning.py +27 -2
numba_cuda/numba/cuda/tests/enum_usecases.py +56 -0
numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +1 -2
numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +1 -1
numba_cuda/numba/cuda/utils.py +785 -0
numba_cuda/numba/cuda/vector_types.py +1 -1
{numba_cuda-0.16.0.dist-info → numba_cuda-0.18.0.dist-info}/METADATA +18 -4
{numba_cuda-0.16.0.dist-info → numba_cuda-0.18.0.dist-info}/RECORD +69 -56
numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -46
{numba_cuda-0.16.0.dist-info → numba_cuda-0.18.0.dist-info}/WHEEL +0 -0
{numba_cuda-0.16.0.dist-info → numba_cuda-0.18.0.dist-info}/licenses/LICENSE +0 -0
{numba_cuda-0.16.0.dist-info → numba_cuda-0.18.0.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/cudadrv/driver.py CHANGED Viewed

@@ -44,7 +44,8 @@ from collections import namedtuple, deque
 from numba import mviewbuf
-from numba.core import utils, serialize, config
+from numba.core import config
+from numba.cuda import utils, serialize
 from .error import CudaSupportError, CudaDriverError
 from .drvapi import API_PROTOTYPES
 from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
@@ -490,11 +491,11 @@ class Driver(object):
         with self.get_active_context() as ac:
             if ac.devnum is not None:
                 if USE_NV_BINDING:
-                    return driver.cuCtxPopCurrent()
+                    popped = drvapi.cu_context(int(driver.cuCtxPopCurrent()))
                 else:
                     popped = drvapi.cu_context()
                     driver.cuCtxPopCurrent(byref(popped))
-                    return popped
+                return popped
     def get_active_context(self):
         """Returns an instance of ``_ActiveContext``."""
@@ -538,6 +539,8 @@ class _ActiveContext(object):
                 hctx = driver.cuCtxGetCurrent()
                 if int(hctx) == 0:
                     hctx = None
+                else:
+                    hctx = drvapi.cu_context(int(hctx))
             else:
                 hctx = drvapi.cu_context(0)
                 driver.cuCtxGetCurrent(byref(hctx))
@@ -716,6 +719,7 @@ class Device(object):
         # create primary context
         if USE_NV_BINDING:
             hctx = driver.cuDevicePrimaryCtxRetain(self.id)
+            hctx = drvapi.cu_context(int(hctx))
         else:
             hctx = drvapi.cu_context()
             driver.cuDevicePrimaryCtxRetain(byref(hctx), self.id)
@@ -1254,6 +1258,7 @@ class _PendingDeallocs(object):
                 [dtor, handle, size] = self._cons.popleft()
                 _logger.info("dealloc: %s %s bytes", dtor.__name__, size)
                 dtor(handle)
             self._size = 0
     @contextlib.contextmanager
@@ -1430,7 +1435,10 @@ class Context(object):
         """
         Pushes this context on the current CPU Thread.
         """
-        driver.cuCtxPushCurrent(self.handle)
+        if USE_NV_BINDING:
+            driver.cuCtxPushCurrent(self.handle.value)
+        else:
+            driver.cuCtxPushCurrent(self.handle)
         self.prepare_for_use()
     def pop(self):
@@ -1439,10 +1447,7 @@ class Context(object):
         must be at the top of the context stack, otherwise an error will occur.
         """
         popped = driver.pop_active_context()
-        if USE_NV_BINDING:
-            assert int(popped) == int(self.handle)
-        else:
-            assert popped.value == self.handle.value
+        assert popped.value == self.handle.value
     def memalloc(self, bytesize):
         return self.memory_manager.memalloc(bytesize)
@@ -1535,21 +1540,25 @@ class Context(object):
     def get_default_stream(self):
         if USE_NV_BINDING:
-            handle = binding.CUstream(CU_STREAM_DEFAULT)
+            handle = drvapi.cu_stream(int(binding.CUstream(CU_STREAM_DEFAULT)))
         else:
             handle = drvapi.cu_stream(drvapi.CU_STREAM_DEFAULT)
         return Stream(weakref.proxy(self), handle, None)
     def get_legacy_default_stream(self):
         if USE_NV_BINDING:
-            handle = binding.CUstream(binding.CU_STREAM_LEGACY)
+            handle = drvapi.cu_stream(
+                int(binding.CUstream(binding.CU_STREAM_LEGACY))
+            )
         else:
             handle = drvapi.cu_stream(drvapi.CU_STREAM_LEGACY)
         return Stream(weakref.proxy(self), handle, None)
     def get_per_thread_default_stream(self):
         if USE_NV_BINDING:
-            handle = binding.CUstream(binding.CU_STREAM_PER_THREAD)
+            handle = drvapi.cu_stream(
+                int(binding.CUstream(binding.CU_STREAM_PER_THREAD))
+            )
         else:
             handle = drvapi.cu_stream(drvapi.CU_STREAM_PER_THREAD)
         return Stream(weakref.proxy(self), handle, None)
@@ -1561,7 +1570,7 @@ class Context(object):
             # default stream, which we define also as CU_STREAM_DEFAULT when
             # the NV binding is in use).
             flags = binding.CUstream_flags.CU_STREAM_DEFAULT.value
-            handle = driver.cuStreamCreate(flags)
+            handle = drvapi.cu_stream(int(driver.cuStreamCreate(flags)))
         else:
             handle = drvapi.cu_stream()
             driver.cuStreamCreate(byref(handle), 0)
@@ -1575,7 +1584,7 @@ class Context(object):
         if not isinstance(ptr, int):
             raise TypeError("ptr for external stream must be an int")
         if USE_NV_BINDING:
-            handle = binding.CUstream(ptr)
+            handle = drvapi.cu_stream(int(binding.CUstream(ptr)))
         else:
             handle = drvapi.cu_stream(ptr)
         return Stream(weakref.proxy(self), handle, None, external=True)
@@ -1585,7 +1594,7 @@ class Context(object):
         if not timing:
             flags |= enums.CU_EVENT_DISABLE_TIMING
         if USE_NV_BINDING:
-            handle = driver.cuEventCreate(flags)
+            handle = drvapi.cu_event(int(driver.cuEventCreate(flags)))
         else:
             handle = drvapi.cu_event()
             driver.cuEventCreate(byref(handle), flags)
@@ -1776,14 +1785,14 @@ def _pin_finalizer(memory_manager, ptr, alloc_key, mapped):
 def _event_finalizer(deallocs, handle):
     def core():
-        deallocs.add_item(driver.cuEventDestroy, handle)
+        deallocs.add_item(driver.cuEventDestroy, handle.value)
     return core
 def _stream_finalizer(deallocs, handle):
     def core():
-        deallocs.add_item(driver.cuStreamDestroy, handle)
+        deallocs.add_item(driver.cuStreamDestroy, handle.value)
     return core
@@ -2054,6 +2063,9 @@ class MemoryPointer(object):
     __cuda_memory__ = True
     def __init__(self, context, pointer, size, owner=None, finalizer=None):
+        if USE_NV_BINDING and isinstance(pointer, ctypes.c_void_p):
+            pointer = binding.CUdeviceptr(pointer.value)
         self.context = context
         self.device_pointer = pointer
         self.size = size
@@ -2086,9 +2098,11 @@ class MemoryPointer(object):
     def memset(self, byte, count=None, stream=0):
         count = self.size if count is None else count
         if stream:
-            driver.cuMemsetD8Async(
-                self.device_pointer, byte, count, stream.handle
-            )
+            if USE_NV_BINDING:
+                handle = stream.handle.value
+            else:
+                handle = stream.handle
+            driver.cuMemsetD8Async(self.device_pointer, byte, count, handle)
         else:
             driver.cuMemsetD8(self.device_pointer, byte, count)
@@ -2326,27 +2340,16 @@ class Stream(object):
             weakref.finalize(self, finalizer)
     def __int__(self):
-        if USE_NV_BINDING:
-            return int(self.handle)
-        else:
-            # The default stream's handle.value is 0, which gives `None`
-            return self.handle.value or drvapi.CU_STREAM_DEFAULT
+        # The default stream's handle.value is 0, which gives `None`
+        return self.handle.value or drvapi.CU_STREAM_DEFAULT
     def __repr__(self):
-        if USE_NV_BINDING:
-            default_streams = {
-                CU_STREAM_DEFAULT: "<Default CUDA stream on %s>",
-                binding.CU_STREAM_LEGACY: "<Legacy default CUDA stream on %s>",
-                binding.CU_STREAM_PER_THREAD: "<Per-thread default CUDA stream on %s>",
-            }
-            ptr = int(self.handle) or 0
-        else:
-            default_streams = {
-                drvapi.CU_STREAM_DEFAULT: "<Default CUDA stream on %s>",
-                drvapi.CU_STREAM_LEGACY: "<Legacy default CUDA stream on %s>",
-                drvapi.CU_STREAM_PER_THREAD: "<Per-thread default CUDA stream on %s>",
-            }
-            ptr = self.handle.value or drvapi.CU_STREAM_DEFAULT
+        default_streams = {
+            drvapi.CU_STREAM_DEFAULT: "<Default CUDA stream on %s>",
+            drvapi.CU_STREAM_LEGACY: "<Legacy default CUDA stream on %s>",
+            drvapi.CU_STREAM_PER_THREAD: "<Per-thread default CUDA stream on %s>",
+        }
+        ptr = self.handle.value or drvapi.CU_STREAM_DEFAULT
         if ptr in default_streams:
             return default_streams[ptr] % self.context
@@ -2360,7 +2363,11 @@ class Stream(object):
         Wait for all commands in this stream to execute. This will commit any
         pending memory transfers.
         """
-        driver.cuStreamSynchronize(self.handle)
+        if USE_NV_BINDING:
+            handle = self.handle.value
+        else:
+            handle = self.handle
+        driver.cuStreamSynchronize(handle)
     @contextlib.contextmanager
     def auto_synchronize(self):
@@ -2385,6 +2392,16 @@ class Stream(object):
         callback will block later work in the stream and may block other
         callbacks from being executed.
+        .. warning::
+            There is a potential for deadlock due to a lock ordering issue
+            between the GIL and the CUDA driver lock when using libraries
+            that call CUDA functions without releasing the GIL. This can
+            occur when the callback function, which holds the CUDA driver lock,
+            attempts to acquire the GIL while another thread that holds the GIL
+            is waiting for the CUDA driver lock. Consider using libraries that
+            properly release the GIL around CUDA operations or restructure
+            your code to avoid this situation.
         Note: The driver function underlying this method is marked for
         eventual deprecation and may be replaced in a future CUDA release.
@@ -2398,9 +2415,11 @@ class Stream(object):
             stream_callback = binding.CUstreamCallback(ptr)
             # The callback needs to receive a pointer to the data PyObject
             data = id(data)
+            handle = self.handle.value
         else:
             stream_callback = self._stream_callback
-        driver.cuStreamAddCallback(self.handle, stream_callback, data, 0)
+            handle = self.handle
+        driver.cuStreamAddCallback(handle, stream_callback, data, 0)
     @staticmethod
     @cu_stream_callback_pyobj
@@ -2417,6 +2436,16 @@ class Stream(object):
         """
         Return an awaitable that resolves once all preceding stream operations
         are complete. The result of the awaitable is the current stream.
+        .. warning::
+            There is a potential for deadlock due to a lock ordering issue
+            between the GIL and the CUDA driver lock when using libraries
+            that call CUDA functions without releasing the GIL. This can
+            occur when the callback function (internally used by this method),
+            which holds the CUDA driver lock, attempts to acquire the GIL
+            while another thread that holds the GIL is waiting for the CUDA driver lock.
+            Consider using libraries that properly release the GIL around
+            CUDA operations or restructure your code to avoid this situation.
         """
         loop = asyncio.get_running_loop()
         future = loop.create_future()
@@ -2468,27 +2497,35 @@ class Event(object):
         completed.
         """
         if USE_NV_BINDING:
-            hstream = stream.handle if stream else binding.CUstream(0)
+            hstream = stream.handle.value if stream else binding.CUstream(0)
+            handle = self.handle.value
         else:
             hstream = stream.handle if stream else 0
-        driver.cuEventRecord(self.handle, hstream)
+            handle = self.handle
+        driver.cuEventRecord(handle, hstream)
     def synchronize(self):
         """
         Synchronize the host thread for the completion of the event.
         """
-        driver.cuEventSynchronize(self.handle)
+        if USE_NV_BINDING:
+            handle = self.handle.value
+        else:
+            handle = self.handle
+        driver.cuEventSynchronize(handle)
     def wait(self, stream=0):
         """
         All future works submitted to stream will wait util the event completes.
         """
         if USE_NV_BINDING:
-            hstream = stream.handle if stream else binding.CUstream(0)
+            hstream = stream.handle.value if stream else binding.CUstream(0)
+            handle = self.handle.value
         else:
             hstream = stream.handle if stream else 0
+            handle = self.handle
         flags = 0
-        driver.cuStreamWaitEvent(hstream, self.handle, flags)
+        driver.cuStreamWaitEvent(hstream, handle, flags)
     def elapsed_time(self, evtend):
         return event_elapsed_time(self, evtend)
@@ -2499,7 +2536,9 @@ def event_elapsed_time(evtstart, evtend):
     Compute the elapsed time between two events in milliseconds.
     """
     if USE_NV_BINDING:
-        return driver.cuEventElapsedTime(evtstart.handle, evtend.handle)
+        return driver.cuEventElapsedTime(
+            evtstart.handle.value, evtend.handle.value
+        )
     else:
         msec = c_float()
         driver.cuEventElapsedTime(byref(msec), evtstart.handle, evtend.handle)
@@ -3477,7 +3516,11 @@ def host_to_device(dst, src, size, stream=0):
     if stream:
         assert isinstance(stream, Stream)
         fn = driver.cuMemcpyHtoDAsync
-        varargs.append(stream.handle)
+        if USE_NV_BINDING:
+            handle = stream.handle.value
+        else:
+            handle = stream.handle
+        varargs.append(handle)
     else:
         fn = driver.cuMemcpyHtoD
@@ -3495,7 +3538,11 @@ def device_to_host(dst, src, size, stream=0):
     if stream:
         assert isinstance(stream, Stream)
         fn = driver.cuMemcpyDtoHAsync
-        varargs.append(stream.handle)
+        if USE_NV_BINDING:
+            handle = stream.handle.value
+        else:
+            handle = stream.handle
+        varargs.append(handle)
     else:
         fn = driver.cuMemcpyDtoH
@@ -3513,7 +3560,11 @@ def device_to_device(dst, src, size, stream=0):
     if stream:
         assert isinstance(stream, Stream)
         fn = driver.cuMemcpyDtoDAsync
-        varargs.append(stream.handle)
+        if USE_NV_BINDING:
+            handle = stream.handle.value
+        else:
+            handle = stream.handle
+        varargs.append(handle)
     else:
         fn = driver.cuMemcpyDtoD
@@ -3534,7 +3585,11 @@ def device_memset(dst, val, size, stream=0):
     if stream:
         assert isinstance(stream, Stream)
         fn = driver.cuMemsetD8Async
-        varargs.append(stream.handle)
+        if USE_NV_BINDING:
+            handle = stream.handle.value
+        else:
+            handle = stream.handle
+        varargs.append(handle)
     else:
         fn = driver.cuMemsetD8

numba_cuda/numba/cuda/cudadrv/nvvm.py CHANGED Viewed

@@ -14,7 +14,7 @@ from llvmlite import ir
 from .error import NvvmError, NvvmSupportError, NvvmWarning
 from .libs import get_libdevice, open_libdevice, open_cudalib
-from numba.core import cgutils
+from numba.cuda import cgutils
 logger = logging.getLogger(__name__)

numba_cuda/numba/cuda/cudaimpl.py CHANGED Viewed

@@ -6,15 +6,16 @@ import struct
 from llvmlite import ir
 import llvmlite.binding as ll
-from numba.core.imputils import Registry, lower_cast
+from numba.core.imputils import Registry
 from numba.core.typing.npydecl import parse_dtype
 from numba.core.datamodel import models
-from numba.core import types, cgutils
+from numba.core import types
+from numba.cuda import cgutils
 from numba.np import ufunc_db
 from numba.np.npyimpl import register_ufuncs
 from .cudadrv import nvvm
 from numba import cuda
-from numba.cuda import nvvmutils, stubs, errors
+from numba.cuda import nvvmutils, stubs
 from numba.cuda.types import dim3, CUDADispatcher
 registry = Registry()
@@ -346,181 +347,6 @@ def ptx_fma(context, builder, sig, args):
     return builder.fma(*args)
-def float16_float_ty_constraint(bitwidth):
-    typemap = {32: ("f32", "f"), 64: ("f64", "d")}
-    try:
-        return typemap[bitwidth]
-    except KeyError:
-        msg = f"Conversion between float16 and float{bitwidth} unsupported"
-        raise errors.CudaLoweringError(msg)
-@lower_cast(types.float16, types.Float)
-def float16_to_float_cast(context, builder, fromty, toty, val):
-    if fromty.bitwidth == toty.bitwidth:
-        return val
-    ty, constraint = float16_float_ty_constraint(toty.bitwidth)
-    fnty = ir.FunctionType(context.get_value_type(toty), [ir.IntType(16)])
-    asm = ir.InlineAsm(fnty, f"cvt.{ty}.f16 $0, $1;", f"={constraint},h")
-    return builder.call(asm, [val])
-@lower_cast(types.Float, types.float16)
-def float_to_float16_cast(context, builder, fromty, toty, val):
-    if fromty.bitwidth == toty.bitwidth:
-        return val
-    ty, constraint = float16_float_ty_constraint(fromty.bitwidth)
-    fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)])
-    asm = ir.InlineAsm(fnty, f"cvt.rn.f16.{ty} $0, $1;", f"=h,{constraint}")
-    return builder.call(asm, [val])
-def float16_int_constraint(bitwidth):
-    typemap = {8: "c", 16: "h", 32: "r", 64: "l"}
-    try:
-        return typemap[bitwidth]
-    except KeyError:
-        msg = f"Conversion between float16 and int{bitwidth} unsupported"
-        raise errors.CudaLoweringError(msg)
-@lower_cast(types.float16, types.Integer)
-def float16_to_integer_cast(context, builder, fromty, toty, val):
-    bitwidth = toty.bitwidth
-    constraint = float16_int_constraint(bitwidth)
-    signedness = "s" if toty.signed else "u"
-    fnty = ir.FunctionType(context.get_value_type(toty), [ir.IntType(16)])
-    asm = ir.InlineAsm(
-        fnty, f"cvt.rni.{signedness}{bitwidth}.f16 $0, $1;", f"={constraint},h"
-    )
-    return builder.call(asm, [val])
-@lower_cast(types.Integer, types.float16)
-@lower_cast(types.IntegerLiteral, types.float16)
-def integer_to_float16_cast(context, builder, fromty, toty, val):
-    bitwidth = fromty.bitwidth
-    constraint = float16_int_constraint(bitwidth)
-    signedness = "s" if fromty.signed else "u"
-    fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)])
-    asm = ir.InlineAsm(
-        fnty, f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;", f"=h,{constraint}"
-    )
-    return builder.call(asm, [val])
-def lower_fp16_binary(fn, op):
-    @lower(fn, types.float16, types.float16)
-    def ptx_fp16_binary(context, builder, sig, args):
-        fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)])
-        asm = ir.InlineAsm(fnty, f"{op}.f16 $0,$1,$2;", "=h,h,h")
-        return builder.call(asm, args)
-lower_fp16_binary(stubs.fp16.hadd, "add")
-lower_fp16_binary(operator.add, "add")
-lower_fp16_binary(operator.iadd, "add")
-lower_fp16_binary(stubs.fp16.hsub, "sub")
-lower_fp16_binary(operator.sub, "sub")
-lower_fp16_binary(operator.isub, "sub")
-lower_fp16_binary(stubs.fp16.hmul, "mul")
-lower_fp16_binary(operator.mul, "mul")
-lower_fp16_binary(operator.imul, "mul")
-@lower(stubs.fp16.hneg, types.float16)
-def ptx_fp16_hneg(context, builder, sig, args):
-    fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)])
-    asm = ir.InlineAsm(fnty, "neg.f16 $0, $1;", "=h,h")
-    return builder.call(asm, args)
-@lower(operator.neg, types.float16)
-def operator_hneg(context, builder, sig, args):
-    return ptx_fp16_hneg(context, builder, sig, args)
-@lower(stubs.fp16.habs, types.float16)
-def ptx_fp16_habs(context, builder, sig, args):
-    fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)])
-    asm = ir.InlineAsm(fnty, "abs.f16 $0, $1;", "=h,h")
-    return builder.call(asm, args)
-@lower(abs, types.float16)
-def operator_habs(context, builder, sig, args):
-    return ptx_fp16_habs(context, builder, sig, args)
-@lower(stubs.fp16.hfma, types.float16, types.float16, types.float16)
-def ptx_hfma(context, builder, sig, args):
-    argtys = [ir.IntType(16), ir.IntType(16), ir.IntType(16)]
-    fnty = ir.FunctionType(ir.IntType(16), argtys)
-    asm = ir.InlineAsm(fnty, "fma.rn.f16 $0,$1,$2,$3;", "=h,h,h,h")
-    return builder.call(asm, args)
-@lower(operator.truediv, types.float16, types.float16)
-@lower(operator.itruediv, types.float16, types.float16)
-def fp16_div_impl(context, builder, sig, args):
-    def fp16_div(x, y):
-        return cuda.fp16.hdiv(x, y)
-    return context.compile_internal(builder, fp16_div, sig, args)
-_fp16_cmp = """{{
-          .reg .pred __$$f16_cmp_tmp;
-          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
-          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
-        }}"""
-def _gen_fp16_cmp(op):
-    def ptx_fp16_comparison(context, builder, sig, args):
-        fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)])
-        asm = ir.InlineAsm(fnty, _fp16_cmp.format(op=op), "=h,h,h")
-        result = builder.call(asm, args)
-        zero = context.get_constant(types.int16, 0)
-        int_result = builder.bitcast(result, ir.IntType(16))
-        return builder.icmp_unsigned("!=", int_result, zero)
-    return ptx_fp16_comparison
-lower(stubs.fp16.heq, types.float16, types.float16)(_gen_fp16_cmp("eq"))
-lower(operator.eq, types.float16, types.float16)(_gen_fp16_cmp("eq"))
-lower(stubs.fp16.hne, types.float16, types.float16)(_gen_fp16_cmp("ne"))
-lower(operator.ne, types.float16, types.float16)(_gen_fp16_cmp("ne"))
-lower(stubs.fp16.hge, types.float16, types.float16)(_gen_fp16_cmp("ge"))
-lower(operator.ge, types.float16, types.float16)(_gen_fp16_cmp("ge"))
-lower(stubs.fp16.hgt, types.float16, types.float16)(_gen_fp16_cmp("gt"))
-lower(operator.gt, types.float16, types.float16)(_gen_fp16_cmp("gt"))
-lower(stubs.fp16.hle, types.float16, types.float16)(_gen_fp16_cmp("le"))
-lower(operator.le, types.float16, types.float16)(_gen_fp16_cmp("le"))
-lower(stubs.fp16.hlt, types.float16, types.float16)(_gen_fp16_cmp("lt"))
-lower(operator.lt, types.float16, types.float16)(_gen_fp16_cmp("lt"))
-def lower_fp16_minmax(fn, fname, op):
-    @lower(fn, types.float16, types.float16)
-    def ptx_fp16_minmax(context, builder, sig, args):
-        choice = _gen_fp16_cmp(op)(context, builder, sig, args)
-        return builder.select(choice, args[0], args[1])
-lower_fp16_minmax(stubs.fp16.hmax, "max", "gt")
-lower_fp16_minmax(stubs.fp16.hmin, "min", "lt")
 # See:
 # https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_cbrt.html#__nv_cbrt
 # https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_cbrtf.html#__nv_cbrtf

numba-cuda 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

numba-cuda 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl