numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff compares the contents of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in the public registry.
Files changed (233)
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.0.dist-info/METADATA +0 -6
  232. numba_cuda-0.0.0.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudadrv/test_init.py
@@ -0,0 +1,139 @@
+ import multiprocessing as mp
+ import os
+
+ from numba import cuda
+ from numba.cuda.cudadrv.driver import CudaAPIError, driver
+ from numba.cuda.cudadrv.error import CudaSupportError
+ from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
+
+
+ # A mock of cuInit that always raises a CudaAPIError
+ def cuInit_raising(arg):
+     raise CudaAPIError(999, 'CUDA_ERROR_UNKNOWN')
+
+
+ # Test code to run in a child that patches driver.cuInit to a variant that
+ # always raises. We can't use mock.patch.object here because driver.cuInit is
+ # not assigned until we attempt to initialize - mock.patch.object cannot locate
+ # the non-existent original method, and so fails. Instead we patch
+ # driver.cuInit with our raising version prior to any attempt to initialize.
+ def cuInit_raising_test(result_queue):
+     driver.cuInit = cuInit_raising
+
+     success = False
+     msg = None
+
+     try:
+         # A CUDA operation that forces initialization of the device
+         cuda.device_array(1)
+     except CudaSupportError as e:
+         success = True
+         msg = e.msg
+
+     result_queue.put((success, msg))
+
+
+ # Similar to cuInit_raising_test above, but for testing that the string
+ # returned by cuda_error() is as expected.
+ def initialization_error_test(result_queue):
+     driver.cuInit = cuInit_raising
+
+     success = False
+     msg = None
+
+     try:
+         # A CUDA operation that forces initialization of the device
+         cuda.device_array(1)
+     except CudaSupportError:
+         success = True
+
+     msg = cuda.cuda_error()
+     result_queue.put((success, msg))
+
+
+ # For testing the path where Driver.__init__() catches a CudaSupportError
+ def cuda_disabled_test(result_queue):
+     success = False
+     msg = None
+
+     try:
+         # A CUDA operation that forces initialization of the device
+         cuda.device_array(1)
+     except CudaSupportError as e:
+         success = True
+         msg = e.msg
+
+     result_queue.put((success, msg))
+
+
+ # Similar to cuda_disabled_test, but checks cuda.cuda_error() instead of the
+ # exception raised on initialization
+ def cuda_disabled_error_test(result_queue):
+     success = False
+     msg = None
+
+     try:
+         # A CUDA operation that forces initialization of the device
+         cuda.device_array(1)
+     except CudaSupportError:
+         success = True
+
+     msg = cuda.cuda_error()
+     result_queue.put((success, msg))
+
+
+ @skip_on_cudasim('CUDA Simulator does not initialize driver')
+ class TestInit(CUDATestCase):
+     def _test_init_failure(self, target, expected):
+         # Run the initialization failure test in a separate subprocess
+         ctx = mp.get_context('spawn')
+         result_queue = ctx.Queue()
+         proc = ctx.Process(target=target, args=(result_queue,))
+         proc.start()
+         proc.join(30)  # should complete within 30s
+         success, msg = result_queue.get()
+
+         # Ensure the child process raised an exception during initialization
+         # before checking the message
+         if not success:
+             self.fail('CudaSupportError not raised')
+
+         self.assertIn(expected, msg)
+
+     def test_init_failure_raising(self):
+         expected = 'Error at driver init: CUDA_ERROR_UNKNOWN (999)'
+         self._test_init_failure(cuInit_raising_test, expected)
+
+     def test_init_failure_error(self):
+         expected = 'CUDA_ERROR_UNKNOWN (999)'
+         self._test_init_failure(initialization_error_test, expected)
+
+     def _test_cuda_disabled(self, target):
+         # Uses _test_init_failure to launch the test in a separate subprocess
+         # with CUDA disabled.
+         cuda_disabled = os.environ.get('NUMBA_DISABLE_CUDA')
+         os.environ['NUMBA_DISABLE_CUDA'] = "1"
+         try:
+             expected = 'CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1'
+             self._test_init_failure(cuda_disabled_test, expected)
+         finally:
+             if cuda_disabled is not None:
+                 os.environ['NUMBA_DISABLE_CUDA'] = cuda_disabled
+             else:
+                 os.environ.pop('NUMBA_DISABLE_CUDA')
+
+     def test_cuda_disabled_raising(self):
+         self._test_cuda_disabled(cuda_disabled_test)
+
+     def test_cuda_disabled_error(self):
+         self._test_cuda_disabled(cuda_disabled_error_test)
+
+     def test_init_success(self):
+         # Here we assume that initialization is successful (because many bad
+         # things will happen with the test suite if it is not) and check that
+         # there is no error recorded.
+         self.assertIsNone(cuda.cuda_error())
+
+
+ if __name__ == '__main__':
+     unittest.main()
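The tests above exercise the driver-initialization error reporting that numba-cuda surfaces through cuda.cuda_error(). As a rough orientation for readers of this diff, the following minimal sketch (not part of the package) shows how that API is typically consulted before doing any GPU work: cuda.is_available() forces driver initialization, and cuda.cuda_error() returns None on success or the recorded failure message otherwise.

# Minimal sketch, not from the diff: checking driver initialization state.
from numba import cuda

# Querying availability triggers driver initialization.
if cuda.is_available():
    # Initialization succeeded; cuda.cuda_error() returns None here.
    print("CUDA is available; device:", cuda.get_current_device().name)
else:
    # Initialization failed or CUDA is disabled; the recorded message is the
    # kind of string the tests above assert on.
    print("CUDA is not available:", cuda.cuda_error())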
numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py
@@ -0,0 +1,37 @@
+ from llvmlite import ir
+
+ from numba.cuda.cudadrv import nvvm
+ from numba.cuda.testing import unittest, ContextResettingTestCase
+ from numba.cuda.testing import skip_on_cudasim
+
+
+ @skip_on_cudasim('Inline PTX cannot be used in the simulator')
+ class TestCudaInlineAsm(ContextResettingTestCase):
+     def test_inline_rsqrt(self):
+         mod = ir.Module(__name__)
+         mod.triple = 'nvptx64-nvidia-cuda'
+         nvvm.add_ir_version(mod)
+         fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.FloatType())])
+         fn = ir.Function(mod, fnty, 'cu_rsqrt')
+         bldr = ir.IRBuilder(fn.append_basic_block('entry'))
+
+         rsqrt_approx_fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()])
+         inlineasm = ir.InlineAsm(rsqrt_approx_fnty,
+                                  'rsqrt.approx.f32 $0, $1;',
+                                  '=f,f', side_effect=True)
+         val = bldr.load(fn.args[0])
+         res = bldr.call(inlineasm, [val])
+
+         bldr.store(res, fn.args[0])
+         bldr.ret_void()
+
+         # generate ptx
+         mod.data_layout = nvvm.NVVM().data_layout
+         nvvm.set_cuda_kernel(fn)
+         nvvmir = str(mod)
+         ptx = nvvm.compile_ir(nvvmir)
+         self.assertTrue('rsqrt.approx.f32' in str(ptx))
+
+
+ if __name__ == '__main__':
+     unittest.main()
numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py
@@ -0,0 +1,12 @@
+ from numba import cuda
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim, skip_unless_cc_53
+
+
+ class TestIsFP16Supported(CUDATestCase):
+     def test_is_fp16_supported(self):
+         self.assertTrue(cuda.is_float16_supported())
+
+     @skip_on_cudasim
+     @skip_unless_cc_53
+     def test_device_supports_float16(self):
+         self.assertTrue(cuda.get_current_device().supports_float16)
numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
@@ -0,0 +1,317 @@
+ import numpy as np
+ import warnings
+ from numba.cuda.testing import unittest
+ from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing)
+ from numba.cuda.testing import CUDATestCase, test_data_dir
+ from numba.cuda.cudadrv.driver import (CudaAPIError, Linker,
+                                        LinkerError)
+ from numba.cuda.cudadrv.error import NvrtcError
+ from numba.cuda import require_context
+ from numba.tests.support import ignore_internal_warnings
+ from numba import cuda, void, float64, int64, int32, typeof, float32
+
+
+ CONST1D = np.arange(10, dtype=np.float64)
+
+
+ def simple_const_mem(A):
+     C = cuda.const.array_like(CONST1D)
+     i = cuda.grid(1)
+
+     A[i] = C[i] + 1.0
+
+
+ def func_with_lots_of_registers(x, a, b, c, d, e, f):
+     a1 = 1.0
+     a2 = 1.0
+     a3 = 1.0
+     a4 = 1.0
+     a5 = 1.0
+     b1 = 1.0
+     b2 = 1.0
+     b3 = 1.0
+     b4 = 1.0
+     b5 = 1.0
+     c1 = 1.0
+     c2 = 1.0
+     c3 = 1.0
+     c4 = 1.0
+     c5 = 1.0
+     d1 = 10
+     d2 = 10
+     d3 = 10
+     d4 = 10
+     d5 = 10
+     for i in range(a):
+         a1 += b
+         a2 += c
+         a3 += d
+         a4 += e
+         a5 += f
+         b1 *= b
+         b2 *= c
+         b3 *= d
+         b4 *= e
+         b5 *= f
+         c1 /= b
+         c2 /= c
+         c3 /= d
+         c4 /= e
+         c5 /= f
+         d1 <<= b
+         d2 <<= c
+         d3 <<= d
+         d4 <<= e
+         d5 <<= f
+     x[cuda.grid(1)] = a1 + a2 + a3 + a4 + a5
+     x[cuda.grid(1)] += b1 + b2 + b3 + b4 + b5
+     x[cuda.grid(1)] += c1 + c2 + c3 + c4 + c5
+     x[cuda.grid(1)] += d1 + d2 + d3 + d4 + d5
+
+
+ def simple_smem(ary, dty):
+     sm = cuda.shared.array(100, dty)
+     i = cuda.grid(1)
+     if i == 0:
+         for j in range(100):
+             sm[j] = j
+     cuda.syncthreads()
+     ary[i] = sm[i]
+
+
+ def coop_smem2d(ary):
+     i, j = cuda.grid(2)
+     sm = cuda.shared.array((10, 20), float32)
+     sm[i, j] = (i + 1) / (j + 1)
+     cuda.syncthreads()
+     ary[i, j] = sm[i, j]
+
+
+ def simple_maxthreads(ary):
+     i = cuda.grid(1)
+     ary[i] = i
+
+
+ LMEM_SIZE = 1000
+
+
+ def simple_lmem(A, B, dty):
+     C = cuda.local.array(LMEM_SIZE, dty)
+     for i in range(C.shape[0]):
+         C[i] = A[i]
+     for i in range(C.shape[0]):
+         B[i] = C[i]
+
+
+ @skip_on_cudasim('Linking unsupported in the simulator')
+ class TestLinker(CUDATestCase):
+     _NUMBA_NVIDIA_BINDING_0_ENV = {'NUMBA_CUDA_USE_NVIDIA_BINDING': '0'}
+
+     @require_context
+     def test_linker_basic(self):
+         '''Simply go through the constructor and destructor
+         '''
+         linker = Linker.new(cc=(5, 3))
+         del linker
+
+     def _test_linking(self, eager):
+         global bar  # must be a global; other it is recognized as a freevar
+         bar = cuda.declare_device('bar', 'int32(int32)')
+
+         link = str(test_data_dir / 'jitlink.ptx')
+
+         if eager:
+             args = ['void(int32[:], int32[:])']
+         else:
+             args = []
+
+         @cuda.jit(*args, link=[link])
+         def foo(x, y):
+             i = cuda.grid(1)
+             x[i] += bar(y[i])
+
+         A = np.array([123], dtype=np.int32)
+         B = np.array([321], dtype=np.int32)
+
+         foo[1, 1](A, B)
+
+         self.assertTrue(A[0] == 123 + 2 * 321)
+
+     def test_linking_lazy_compile(self):
+         self._test_linking(eager=False)
+
+     def test_linking_eager_compile(self):
+         self._test_linking(eager=True)
+
+     def test_linking_cu(self):
+         bar = cuda.declare_device('bar', 'int32(int32)')
+
+         link = str(test_data_dir / 'jitlink.cu')
+
+         @cuda.jit(link=[link])
+         def kernel(r, x):
+             i = cuda.grid(1)
+
+             if i < len(r):
+                 r[i] = bar(x[i])
+
+         x = np.arange(10, dtype=np.int32)
+         r = np.zeros_like(x)
+
+         kernel[1, 32](r, x)
+
+         # Matches the operation of bar() in jitlink.cu
+         expected = x * 2
+         np.testing.assert_array_equal(r, expected)
+
+     def test_linking_cu_log_warning(self):
+         bar = cuda.declare_device('bar', 'int32(int32)')
+
+         link = str(test_data_dir / 'warn.cu')
+
+         with warnings.catch_warnings(record=True) as w:
+             ignore_internal_warnings()
+
+             @cuda.jit('void(int32)', link=[link])
+             def kernel(x):
+                 bar(x)
+
+         self.assertEqual(len(w), 1, 'Expected warnings from NVRTC')
+         # Check the warning refers to the log messages
+         self.assertIn('NVRTC log messages', str(w[0].message))
+         # Check the message pertaining to the unused variable is provided
+         self.assertIn('declared but never referenced', str(w[0].message))
+
+     def test_linking_cu_error(self):
+         bar = cuda.declare_device('bar', 'int32(int32)')
+
+         link = str(test_data_dir / 'error.cu')
+
+         with self.assertRaises(NvrtcError) as e:
+             @cuda.jit('void(int32)', link=[link])
+             def kernel(x):
+                 bar(x)
+
+         msg = e.exception.args[0]
+         # Check the error message refers to the NVRTC compile
+         self.assertIn('NVRTC Compilation failure', msg)
+         # Check the expected error in the CUDA source is reported
+         self.assertIn('identifier "SYNTAX" is undefined', msg)
+         # Check the filename is reported correctly
+         self.assertIn('in the compilation of "error.cu"', msg)
+
+     def test_linking_unknown_filetype_error(self):
+         expected_err = "Don't know how to link file with extension .cuh"
+         with self.assertRaisesRegex(RuntimeError, expected_err):
+             @cuda.jit('void()', link=['header.cuh'])
+             def kernel():
+                 pass
+
+     def test_linking_file_with_no_extension_error(self):
+         expected_err = "Don't know how to link file with no extension"
+         with self.assertRaisesRegex(RuntimeError, expected_err):
+             @cuda.jit('void()', link=['data'])
+             def kernel():
+                 pass
+
+     @skip_if_cuda_includes_missing
+     def test_linking_cu_cuda_include(self):
+         link = str(test_data_dir / 'cuda_include.cu')
+
+         # An exception will be raised when linking this kernel due to the
+         # compile failure if CUDA includes cannot be found by Nvrtc.
+         @cuda.jit('void()', link=[link])
+         def kernel():
+             pass
+
+     def test_try_to_link_nonexistent(self):
+         with self.assertRaises(LinkerError) as e:
+             @cuda.jit('void(int32[::1])', link=['nonexistent.a'])
+             def f(x):
+                 x[0] = 0
+         self.assertIn('nonexistent.a not found', e.exception.args)
+
+     def test_set_registers_no_max(self):
+         """Ensure that the jitted kernel used in the test_set_registers_* tests
+         uses more than 57 registers - this ensures that test_set_registers_*
+         are really checking that they reduced the number of registers used from
+         something greater than the maximum."""
+         compiled = cuda.jit(func_with_lots_of_registers)
+         compiled = compiled.specialize(np.empty(32), *range(6))
+         self.assertGreater(compiled.get_regs_per_thread(), 57)
+
+     def test_set_registers_57(self):
+         compiled = cuda.jit(max_registers=57)(func_with_lots_of_registers)
+         compiled = compiled.specialize(np.empty(32), *range(6))
+         self.assertLessEqual(compiled.get_regs_per_thread(), 57)
+
+     def test_set_registers_38(self):
+         compiled = cuda.jit(max_registers=38)(func_with_lots_of_registers)
+         compiled = compiled.specialize(np.empty(32), *range(6))
+         self.assertLessEqual(compiled.get_regs_per_thread(), 38)
+
+     def test_set_registers_eager(self):
+         sig = void(float64[::1], int64, int64, int64, int64, int64, int64)
+         compiled = cuda.jit(sig, max_registers=38)(func_with_lots_of_registers)
+         self.assertLessEqual(compiled.get_regs_per_thread(), 38)
+
+     def test_get_const_mem_size(self):
+         sig = void(float64[::1])
+         compiled = cuda.jit(sig)(simple_const_mem)
+         const_mem_size = compiled.get_const_mem_size()
+         self.assertGreaterEqual(const_mem_size, CONST1D.nbytes)
+
+     def test_get_no_shared_memory(self):
+         compiled = cuda.jit(func_with_lots_of_registers)
+         compiled = compiled.specialize(np.empty(32), *range(6))
+         shared_mem_size = compiled.get_shared_mem_per_block()
+         self.assertEqual(shared_mem_size, 0)
+
+     def test_get_shared_mem_per_block(self):
+         sig = void(int32[::1], typeof(np.int32))
+         compiled = cuda.jit(sig)(simple_smem)
+         shared_mem_size = compiled.get_shared_mem_per_block()
+         self.assertEqual(shared_mem_size, 400)
+
+     def test_get_shared_mem_per_specialized(self):
+         compiled = cuda.jit(simple_smem)
+         compiled_specialized = compiled.specialize(
+             np.zeros(100, dtype=np.int32), np.float64)
+         shared_mem_size = compiled_specialized.get_shared_mem_per_block()
+         self.assertEqual(shared_mem_size, 800)
+
+     def test_get_max_threads_per_block(self):
+         compiled = cuda.jit("void(float32[:,::1])")(coop_smem2d)
+         max_threads = compiled.get_max_threads_per_block()
+         self.assertGreater(max_threads, 0)
+
+     def test_max_threads_exceeded(self):
+         compiled = cuda.jit("void(int32[::1])")(simple_maxthreads)
+         max_threads = compiled.get_max_threads_per_block()
+         nelem = max_threads + 1
+         ary = np.empty(nelem, dtype=np.int32)
+         try:
+             compiled[1, nelem](ary)
+         except CudaAPIError as e:
+             self.assertIn("cuLaunchKernel", e.msg)
+
+     def test_get_local_mem_per_thread(self):
+         sig = void(int32[::1], int32[::1], typeof(np.int32))
+         compiled = cuda.jit(sig)(simple_lmem)
+         local_mem_size = compiled.get_local_mem_per_thread()
+         calc_size = np.dtype(np.int32).itemsize * LMEM_SIZE
+         self.assertGreaterEqual(local_mem_size, calc_size)
+
+     def test_get_local_mem_per_specialized(self):
+         compiled = cuda.jit(simple_lmem)
+         compiled_specialized = compiled.specialize(
+             np.zeros(LMEM_SIZE, dtype=np.int32),
+             np.zeros(LMEM_SIZE, dtype=np.int32),
+             np.float64)
+         local_mem_size = compiled_specialized.get_local_mem_per_thread()
+         calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE
+         self.assertGreaterEqual(local_mem_size, calc_size)
+
+
+ if __name__ == '__main__':
+     unittest.main()
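For context, the linking tests above revolve around the cuda.declare_device / link= pattern for calling foreign device functions compiled from PTX or CUDA C. A minimal sketch of that pattern follows; the file name mylib.cu and the function name mul2 are hypothetical placeholders, not files shipped in this package.

# Minimal sketch, not from the diff: calling an externally linked device function.
import numpy as np
from numba import cuda

# Declare a device function whose implementation lives in an external file.
# The signature string must match the C declaration of the function.
mul2 = cuda.declare_device('mul2', 'int32(int32)')

# Passing the source file via link= causes it to be compiled (by NVRTC for
# .cu files) and linked into the kernel, as exercised by test_linking_cu.
@cuda.jit(link=['mylib.cu'])
def call_external(r, x):
    i = cuda.grid(1)
    if i < r.size:
        r[i] = mul2(x[i])

x = np.arange(16, dtype=np.int32)
r = np.zeros_like(x)
call_external[1, 32](r, x)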
numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py
@@ -0,0 +1,127 @@
+ import numpy as np
+ from ctypes import byref, c_size_t
+ from numba.cuda.cudadrv.driver import device_memset, driver, USE_NV_BINDING
+ from numba import cuda
+ from numba.cuda.testing import unittest, ContextResettingTestCase
+ from numba.cuda.testing import skip_on_cudasim, skip_on_arm
+ from numba.tests.support import linux_only
+
+
+ @skip_on_cudasim('CUDA Driver API unsupported in the simulator')
+ @linux_only
+ @skip_on_arm('Managed Alloc support is experimental/untested on ARM')
+ class TestManagedAlloc(ContextResettingTestCase):
+
+     def get_total_gpu_memory(self):
+         # We use a driver function to directly get the total GPU memory because
+         # an EMM plugin may report something different (or not implement
+         # get_memory_info at all).
+         if USE_NV_BINDING:
+             free, total = driver.cuMemGetInfo()
+             return total
+         else:
+             free = c_size_t()
+             total = c_size_t()
+             driver.cuMemGetInfo(byref(free), byref(total))
+             return total.value
+
+     def skip_if_cc_major_lt(self, min_required, reason):
+         """
+         Skip the current test if the compute capability of the device is
+         less than `min_required`.
+         """
+         ctx = cuda.current_context()
+         cc_major = ctx.device.compute_capability[0]
+         if cc_major < min_required:
+             self.skipTest(reason)
+
+     # CUDA Unified Memory comes in two flavors. For GPUs in the Kepler and
+     # Maxwell generations, managed memory allocations work as opaque,
+     # contiguous segments that can either be on the device or the host. For
+     # GPUs in the Pascal or later generations, managed memory operates on a
+     # per-page basis, so we can have arrays larger than GPU memory, where only
+     # part of them is resident on the device at one time. To ensure that this
+     # test works correctly on all supported GPUs, we'll select the size of our
+     # memory such that we only oversubscribe the GPU memory if we're on a
+     # Pascal or newer GPU (compute capability at least 6.0).
+
+     def test_managed_alloc_driver_undersubscribe(self):
+         msg = "Managed memory unsupported prior to CC 3.0"
+         self.skip_if_cc_major_lt(3, msg)
+         self._test_managed_alloc_driver(0.5)
+
+     # This test is skipped by default because it is easy to hang the machine
+     # for a very long time or get OOM killed if the GPU memory size is >50% of
+     # the system memory size. Even if the system does have more than 2x the RAM
+     # of the GPU, this test runs for a very long time (in comparison to the
+     # rest of the tests in the suite).
+     #
+     # However, it is left in here for manual testing as required.
+
+     @unittest.skip
+     def test_managed_alloc_driver_oversubscribe(self):
+         msg = "Oversubscription of managed memory unsupported prior to CC 6.0"
+         self.skip_if_cc_major_lt(6, msg)
+         self._test_managed_alloc_driver(2.0)
+
+     def test_managed_alloc_driver_host_attach(self):
+         msg = "Host attached managed memory is not accessible prior to CC 6.0"
+         self.skip_if_cc_major_lt(6, msg)
+         # Only test with a small array (0.01 * memory size) to keep the test
+         # quick.
+         self._test_managed_alloc_driver(0.01, attach_global=False)
+
+     def _test_managed_alloc_driver(self, memory_factor, attach_global=True):
+         # Verify that we can allocate and operate on managed
+         # memory through the CUDA driver interface.
+
+         total_mem_size = self.get_total_gpu_memory()
+         n_bytes = int(memory_factor * total_mem_size)
+
+         ctx = cuda.current_context()
+         mem = ctx.memallocmanaged(n_bytes, attach_global=attach_global)
+
+         dtype = np.dtype(np.uint8)
+         n_elems = n_bytes // dtype.itemsize
+         ary = np.ndarray(shape=n_elems, dtype=dtype, buffer=mem)
+
+         magic = 0xab
+         device_memset(mem, magic, n_bytes)
+         ctx.synchronize()
+
+         # Note that this assertion operates on the CPU, so this
+         # test effectively drives both the CPU and the GPU on
+         # managed memory.
+
+         self.assertTrue(np.all(ary == magic))
+
+     def _test_managed_array(self, attach_global=True):
+         # Check the managed_array interface on both host and device.
+
+         ary = cuda.managed_array(100, dtype=np.double)
+         ary.fill(123.456)
+         self.assertTrue(all(ary == 123.456))
+
+         @cuda.jit('void(double[:])')
+         def kernel(x):
+             i = cuda.grid(1)
+             if i < x.shape[0]:
+                 x[i] = 1.0
+
+         kernel[10, 10](ary)
+         cuda.current_context().synchronize()
+
+         self.assertTrue(all(ary == 1.0))
+
+     def test_managed_array_attach_global(self):
+         self._test_managed_array()
+
+     def test_managed_array_attach_host(self):
+         self._test_managed_array()
+         msg = "Host attached managed memory is not accessible prior to CC 6.0"
+         self.skip_if_cc_major_lt(6, msg)
+         self._test_managed_array(attach_global=False)
+
+
+ if __name__ == '__main__':
+     unittest.main()
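The managed-memory tests above cover both the low-level ctx.memallocmanaged() path and the higher-level cuda.managed_array() helper. A minimal sketch of the latter, assuming a device with unified memory support, is shown below; values are written on the host and on the device without explicit copies.

# Minimal sketch, not from the diff: unified memory via cuda.managed_array.
import numpy as np
from numba import cuda

# Allocate managed (unified) memory addressable from both host and device.
ary = cuda.managed_array(1024, dtype=np.float64)
ary[:] = 0.0  # host write

@cuda.jit
def fill(x):
    i = cuda.grid(1)
    if i < x.size:
        x[i] = i  # device write

fill[(ary.size + 127) // 128, 128](ary)
cuda.synchronize()        # make device writes visible to the host
assert ary[5] == 5.0      # host read, no explicit copy needed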
numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py
@@ -0,0 +1,54 @@
+ import multiprocessing as mp
+ import traceback
+ from numba.cuda.testing import unittest, CUDATestCase
+ from numba.cuda.testing import (skip_on_cudasim, skip_under_cuda_memcheck,
+                                 skip_if_mvc_libraries_unavailable)
+ from numba.tests.support import linux_only
+
+
+ def child_test():
+     from numba import config, cuda
+
+     # Change the MVC config after importing numba.cuda
+     config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1
+
+     @cuda.jit
+     def f():
+         pass
+
+     f[1, 1]()
+
+
+ def child_test_wrapper(result_queue):
+     try:
+         output = child_test()
+         success = True
+     # Catch anything raised so it can be propagated
+     except:  # noqa: E722
+         output = traceback.format_exc()
+         success = False
+
+     result_queue.put((success, output))
+
+
+ @linux_only
+ @skip_under_cuda_memcheck('May hang CUDA memcheck')
+ @skip_on_cudasim('Simulator does not require or implement MVC')
+ @skip_if_mvc_libraries_unavailable
+ class TestMinorVersionCompatibility(CUDATestCase):
+     def test_mvc(self):
+         # Run test with Minor Version Compatibility enabled in a child process
+         ctx = mp.get_context('spawn')
+         result_queue = ctx.Queue()
+         proc = ctx.Process(target=child_test_wrapper, args=(result_queue,))
+         proc.start()
+         proc.join()
+         success, output = result_queue.get()
+
+         # Ensure the child process ran to completion before checking its output
+         if not success:
+             self.fail(output)
+
+
+ if __name__ == '__main__':
+     unittest.main()