numba-cuda 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +232 -113
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_fp16.h +661 -661
- numba_cuda/numba/cuda/cuda_fp16.hpp +3 -3
- numba_cuda/numba/cuda/cuda_paths.py +291 -99
- numba_cuda/numba/cuda/cudadecl.py +125 -69
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +463 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +16 -1
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +138 -29
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +317 -233
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +8 -6
- numba_cuda/numba/cuda/decorators.py +75 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +69 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +1 -1
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +3 -3
- numba_cuda/numba/cuda/intrinsics.py +31 -27
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +139 -102
- numba_cuda/numba/cuda/target.py +64 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +7 -6
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +57 -21
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +31 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +19 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +31 -25
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +2 -2
- numba_cuda/numba/cuda/vectorizers.py +37 -32
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/METADATA +1 -1
- numba_cuda-0.9.0.dist-info/RECORD +253 -0
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.0.dist-info/RECORD +0 -251
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/top_level.txt +0 -0
@@ -7,12 +7,13 @@ from numba import cuda
|
|
7
7
|
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
|
8
8
|
import unittest
|
9
9
|
|
10
|
-
has_mp_get_context = hasattr(mp,
|
11
|
-
is_unix = os.name ==
|
10
|
+
has_mp_get_context = hasattr(mp, "get_context")
|
11
|
+
is_unix = os.name == "posix"
|
12
12
|
|
13
13
|
|
14
14
|
def fork_test(q):
|
15
15
|
from numba.cuda.cudadrv.error import CudaDriverError
|
16
|
+
|
16
17
|
try:
|
17
18
|
cuda.to_device(np.arange(1))
|
18
19
|
except CudaDriverError as e:
|
@@ -21,17 +22,17 @@ def fork_test(q):
|
|
21
22
|
q.put(None)
|
22
23
|
|
23
24
|
|
24
|
-
@skip_on_cudasim(
|
25
|
+
@skip_on_cudasim("disabled for cudasim")
|
25
26
|
class TestMultiprocessing(CUDATestCase):
|
26
|
-
@unittest.skipUnless(has_mp_get_context,
|
27
|
-
@unittest.skipUnless(is_unix,
|
27
|
+
@unittest.skipUnless(has_mp_get_context, "requires mp.get_context")
|
28
|
+
@unittest.skipUnless(is_unix, "requires Unix")
|
28
29
|
def test_fork(self):
|
29
30
|
"""
|
30
31
|
Test fork detection.
|
31
32
|
"""
|
32
33
|
cuda.current_context() # force cuda initialize
|
33
34
|
# fork in process that also uses CUDA
|
34
|
-
ctx = mp.get_context(
|
35
|
+
ctx = mp.get_context("fork")
|
35
36
|
q = ctx.Queue()
|
36
37
|
proc = ctx.Process(target=fork_test, args=[q])
|
37
38
|
proc.start()
|
@@ -39,8 +40,8 @@ class TestMultiprocessing(CUDATestCase):
|
|
39
40
|
proc.join()
|
40
41
|
# there should be an exception raised in the child process
|
41
42
|
self.assertIsNotNone(exc)
|
42
|
-
self.assertIn(
|
43
|
+
self.assertIn("CUDA initialized before forking", str(exc))
|
43
44
|
|
44
45
|
|
45
|
-
if __name__ ==
|
46
|
+
if __name__ == "__main__":
|
46
47
|
unittest.main()
|
@@ -3,8 +3,11 @@ import threading
|
|
3
3
|
import multiprocessing
|
4
4
|
import numpy as np
|
5
5
|
from numba import cuda
|
6
|
-
from numba.cuda.testing import (
|
7
|
-
|
6
|
+
from numba.cuda.testing import (
|
7
|
+
skip_on_cudasim,
|
8
|
+
skip_under_cuda_memcheck,
|
9
|
+
CUDATestCase,
|
10
|
+
)
|
8
11
|
import unittest
|
9
12
|
|
10
13
|
try:
|
@@ -15,7 +18,7 @@ else:
|
|
15
18
|
has_concurrent_futures = True
|
16
19
|
|
17
20
|
|
18
|
-
has_mp_get_context = hasattr(multiprocessing,
|
21
|
+
has_mp_get_context = hasattr(multiprocessing, "get_context")
|
19
22
|
|
20
23
|
|
21
24
|
def check_concurrent_compiling():
|
@@ -41,15 +44,14 @@ def spawn_process_entry(q):
|
|
41
44
|
# Catch anything that goes wrong in the threads
|
42
45
|
except: # noqa: E722
|
43
46
|
msg = traceback.format_exc()
|
44
|
-
q.put(
|
47
|
+
q.put("\n".join(["", "=" * 80, msg]))
|
45
48
|
else:
|
46
49
|
q.put(None)
|
47
50
|
|
48
51
|
|
49
|
-
@skip_under_cuda_memcheck(
|
50
|
-
@skip_on_cudasim(
|
52
|
+
@skip_under_cuda_memcheck("Hangs cuda-memcheck")
|
53
|
+
@skip_on_cudasim("disabled for cudasim")
|
51
54
|
class TestMultiThreadCompiling(CUDATestCase):
|
52
|
-
|
53
55
|
@unittest.skipIf(not has_concurrent_futures, "no concurrent.futures")
|
54
56
|
def test_concurrent_compiling(self):
|
55
57
|
check_concurrent_compiling()
|
@@ -59,7 +61,7 @@ class TestMultiThreadCompiling(CUDATestCase):
|
|
59
61
|
# force CUDA context init
|
60
62
|
cuda.get_current_device()
|
61
63
|
# use "spawn" to avoid inheriting the CUDA context
|
62
|
-
ctx = multiprocessing.get_context(
|
64
|
+
ctx = multiprocessing.get_context("spawn")
|
63
65
|
|
64
66
|
q = ctx.Queue()
|
65
67
|
p = ctx.Process(target=spawn_process_entry, args=(q,))
|
@@ -70,7 +72,7 @@ class TestMultiThreadCompiling(CUDATestCase):
|
|
70
72
|
p.join()
|
71
73
|
if err is not None:
|
72
74
|
raise AssertionError(err)
|
73
|
-
self.assertEqual(p.exitcode, 0,
|
75
|
+
self.assertEqual(p.exitcode, 0, "test failed in child process")
|
74
76
|
|
75
77
|
def test_invalid_context_error_with_d2h(self):
|
76
78
|
def d2h(arr, out):
|
@@ -97,5 +99,5 @@ class TestMultiThreadCompiling(CUDATestCase):
|
|
97
99
|
np.testing.assert_equal(darr.copy_to_host(), arr)
|
98
100
|
|
99
101
|
|
100
|
-
if __name__ ==
|
102
|
+
if __name__ == "__main__":
|
101
103
|
unittest.main()
|
@@ -1,6 +1,10 @@
|
|
1
1
|
import numpy as np
|
2
|
-
from numba.cuda.testing import (
|
3
|
-
|
2
|
+
from numba.cuda.testing import (
|
3
|
+
unittest,
|
4
|
+
CUDATestCase,
|
5
|
+
skip_unless_cc_53,
|
6
|
+
skip_on_cudasim,
|
7
|
+
)
|
4
8
|
from numba import cuda
|
5
9
|
from numba.core.types import f2, b1
|
6
10
|
from numba.cuda import compile_ptx
|
@@ -73,12 +77,12 @@ def simple_fp16_ne(ary, a, b):
|
|
73
77
|
ary[0] = a != b
|
74
78
|
|
75
79
|
|
76
|
-
@cuda.jit(
|
80
|
+
@cuda.jit("b1(f2, f2)", device=True)
|
77
81
|
def hlt_func_1(x, y):
|
78
82
|
return x < y
|
79
83
|
|
80
84
|
|
81
|
-
@cuda.jit(
|
85
|
+
@cuda.jit("b1(f2, f2)", device=True)
|
82
86
|
def hlt_func_2(x, y):
|
83
87
|
return x < y
|
84
88
|
|
@@ -116,6 +120,7 @@ class TestOperatorModule(CUDATestCase):
|
|
116
120
|
"""
|
117
121
|
Test if operator module is supported by the CUDA target.
|
118
122
|
"""
|
123
|
+
|
119
124
|
def operator_template(self, op):
|
120
125
|
@cuda.jit
|
121
126
|
def foo(a, b):
|
@@ -146,8 +151,12 @@ class TestOperatorModule(CUDATestCase):
|
|
146
151
|
|
147
152
|
@skip_unless_cc_53
|
148
153
|
def test_fp16_binary(self):
|
149
|
-
functions = (
|
150
|
-
|
154
|
+
functions = (
|
155
|
+
simple_fp16add,
|
156
|
+
simple_fp16sub,
|
157
|
+
simple_fp16mul,
|
158
|
+
simple_fp16_div_scalar,
|
159
|
+
)
|
151
160
|
ops = (operator.add, operator.sub, operator.mul, operator.truediv)
|
152
161
|
|
153
162
|
for fn, op in zip(functions, ops):
|
@@ -162,10 +171,10 @@ class TestOperatorModule(CUDATestCase):
|
|
162
171
|
expected = op(arg1, arg2)
|
163
172
|
np.testing.assert_allclose(got, expected)
|
164
173
|
|
165
|
-
@skip_on_cudasim(
|
174
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
166
175
|
def test_fp16_binary_ptx(self):
|
167
176
|
functions = (simple_fp16add, simple_fp16sub, simple_fp16mul)
|
168
|
-
instrs = (
|
177
|
+
instrs = ("add.f16", "sub.f16", "mul.f16")
|
169
178
|
args = (f2[:], f2, f2)
|
170
179
|
for fn, instr in zip(functions, instrs):
|
171
180
|
with self.subTest(instr=instr):
|
@@ -174,11 +183,14 @@ class TestOperatorModule(CUDATestCase):
|
|
174
183
|
|
175
184
|
@skip_unless_cc_53
|
176
185
|
def test_mixed_fp16_binary_arithmetic(self):
|
177
|
-
functions = (
|
178
|
-
|
186
|
+
functions = (
|
187
|
+
simple_fp16add,
|
188
|
+
simple_fp16sub,
|
189
|
+
simple_fp16mul,
|
190
|
+
simple_fp16_div_scalar,
|
191
|
+
)
|
179
192
|
ops = (operator.add, operator.sub, operator.mul, operator.truediv)
|
180
|
-
types = (np.int8, np.int16, np.int32, np.int64,
|
181
|
-
np.float32, np.float64)
|
193
|
+
types = (np.int8, np.int16, np.int32, np.int64, np.float32, np.float64)
|
182
194
|
for (fn, op), ty in itertools.product(zip(functions, ops), types):
|
183
195
|
with self.subTest(op=op, ty=ty):
|
184
196
|
kernel = cuda.jit(fn)
|
@@ -192,10 +204,10 @@ class TestOperatorModule(CUDATestCase):
|
|
192
204
|
expected = op(arg1, arg2)
|
193
205
|
np.testing.assert_allclose(got, expected)
|
194
206
|
|
195
|
-
@skip_on_cudasim(
|
207
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
196
208
|
def test_fp16_inplace_binary_ptx(self):
|
197
209
|
functions = (simple_fp16_iadd, simple_fp16_isub, simple_fp16_imul)
|
198
|
-
instrs = (
|
210
|
+
instrs = ("add.f16", "sub.f16", "mul.f16")
|
199
211
|
args = (f2[:], f2)
|
200
212
|
|
201
213
|
for fn, instr in zip(functions, instrs):
|
@@ -205,8 +217,12 @@ class TestOperatorModule(CUDATestCase):
|
|
205
217
|
|
206
218
|
@skip_unless_cc_53
|
207
219
|
def test_fp16_inplace_binary(self):
|
208
|
-
functions = (
|
209
|
-
|
220
|
+
functions = (
|
221
|
+
simple_fp16_iadd,
|
222
|
+
simple_fp16_isub,
|
223
|
+
simple_fp16_imul,
|
224
|
+
simple_fp16_idiv,
|
225
|
+
)
|
210
226
|
ops = (operator.iadd, operator.isub, operator.imul, operator.itruediv)
|
211
227
|
|
212
228
|
for fn, op in zip(functions, ops):
|
@@ -236,26 +252,37 @@ class TestOperatorModule(CUDATestCase):
|
|
236
252
|
expected = op(arg1)
|
237
253
|
np.testing.assert_allclose(got, expected)
|
238
254
|
|
239
|
-
@skip_on_cudasim(
|
255
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
240
256
|
def test_fp16_neg_ptx(self):
|
241
257
|
args = (f2[:], f2)
|
242
258
|
ptx, _ = compile_ptx(simple_fp16neg, args, cc=(5, 3))
|
243
|
-
self.assertIn(
|
259
|
+
self.assertIn("neg.f16", ptx)
|
244
260
|
|
245
|
-
@skip_on_cudasim(
|
261
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
246
262
|
def test_fp16_abs_ptx(self):
|
247
263
|
args = (f2[:], f2)
|
248
264
|
ptx, _ = compile_ptx(simple_fp16abs, args, cc=(5, 3))
|
249
265
|
|
250
|
-
self.assertIn(
|
266
|
+
self.assertIn("abs.f16", ptx)
|
251
267
|
|
252
268
|
@skip_unless_cc_53
|
253
269
|
def test_fp16_comparison(self):
|
254
|
-
functions = (
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
270
|
+
functions = (
|
271
|
+
simple_fp16_gt,
|
272
|
+
simple_fp16_ge,
|
273
|
+
simple_fp16_lt,
|
274
|
+
simple_fp16_le,
|
275
|
+
simple_fp16_eq,
|
276
|
+
simple_fp16_ne,
|
277
|
+
)
|
278
|
+
ops = (
|
279
|
+
operator.gt,
|
280
|
+
operator.ge,
|
281
|
+
operator.lt,
|
282
|
+
operator.le,
|
283
|
+
operator.eq,
|
284
|
+
operator.ne,
|
285
|
+
)
|
259
286
|
|
260
287
|
for fn, op in zip(functions, ops):
|
261
288
|
with self.subTest(op=op):
|
@@ -271,16 +298,25 @@ class TestOperatorModule(CUDATestCase):
|
|
271
298
|
|
272
299
|
@skip_unless_cc_53
|
273
300
|
def test_mixed_fp16_comparison(self):
|
274
|
-
functions = (
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
301
|
+
functions = (
|
302
|
+
simple_fp16_gt,
|
303
|
+
simple_fp16_ge,
|
304
|
+
simple_fp16_lt,
|
305
|
+
simple_fp16_le,
|
306
|
+
simple_fp16_eq,
|
307
|
+
simple_fp16_ne,
|
308
|
+
)
|
309
|
+
ops = (
|
310
|
+
operator.gt,
|
311
|
+
operator.ge,
|
312
|
+
operator.lt,
|
313
|
+
operator.le,
|
314
|
+
operator.eq,
|
315
|
+
operator.ne,
|
316
|
+
)
|
317
|
+
types = (np.int8, np.int16, np.int32, np.int64, np.float32, np.float64)
|
318
|
+
|
319
|
+
for (fn, op), ty in itertools.product(zip(functions, ops), types):
|
284
320
|
with self.subTest(op=op, ty=ty):
|
285
321
|
kernel = cuda.jit(fn)
|
286
322
|
|
@@ -294,48 +330,68 @@ class TestOperatorModule(CUDATestCase):
|
|
294
330
|
|
295
331
|
@skip_unless_cc_53
|
296
332
|
def test_multiple_float16_comparisons(self):
|
297
|
-
functions = (
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
333
|
+
functions = (
|
334
|
+
test_multiple_hcmp_1,
|
335
|
+
test_multiple_hcmp_2,
|
336
|
+
test_multiple_hcmp_3,
|
337
|
+
test_multiple_hcmp_4,
|
338
|
+
test_multiple_hcmp_5,
|
339
|
+
)
|
302
340
|
for fn in functions:
|
303
341
|
with self.subTest(fn=fn):
|
304
342
|
compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn)
|
305
343
|
ary = np.zeros(1, dtype=np.bool_)
|
306
|
-
arg1 = np.float16(2.)
|
307
|
-
arg2 = np.float16(3.)
|
308
|
-
arg3 = np.float16(4.)
|
344
|
+
arg1 = np.float16(2.0)
|
345
|
+
arg2 = np.float16(3.0)
|
346
|
+
arg3 = np.float16(4.0)
|
309
347
|
compiled[1, 1](ary, arg1, arg2, arg3)
|
310
348
|
self.assertTrue(ary[0])
|
311
349
|
|
312
350
|
@skip_unless_cc_53
|
313
351
|
def test_multiple_float16_comparisons_false(self):
|
314
|
-
functions = (
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
352
|
+
functions = (
|
353
|
+
test_multiple_hcmp_1,
|
354
|
+
test_multiple_hcmp_2,
|
355
|
+
test_multiple_hcmp_3,
|
356
|
+
test_multiple_hcmp_4,
|
357
|
+
test_multiple_hcmp_5,
|
358
|
+
)
|
319
359
|
for fn in functions:
|
320
360
|
with self.subTest(fn=fn):
|
321
361
|
compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn)
|
322
362
|
ary = np.zeros(1, dtype=np.bool_)
|
323
|
-
arg1 = np.float16(2.)
|
324
|
-
arg2 = np.float16(3.)
|
325
|
-
arg3 = np.float16(1.)
|
363
|
+
arg1 = np.float16(2.0)
|
364
|
+
arg2 = np.float16(3.0)
|
365
|
+
arg3 = np.float16(1.0)
|
326
366
|
compiled[1, 1](ary, arg1, arg2, arg3)
|
327
367
|
self.assertFalse(ary[0])
|
328
368
|
|
329
|
-
@skip_on_cudasim(
|
369
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
330
370
|
def test_fp16_comparison_ptx(self):
|
331
|
-
functions = (
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
371
|
+
functions = (
|
372
|
+
simple_fp16_gt,
|
373
|
+
simple_fp16_ge,
|
374
|
+
simple_fp16_lt,
|
375
|
+
simple_fp16_le,
|
376
|
+
simple_fp16_eq,
|
377
|
+
simple_fp16_ne,
|
378
|
+
)
|
379
|
+
ops = (
|
380
|
+
operator.gt,
|
381
|
+
operator.ge,
|
382
|
+
operator.lt,
|
383
|
+
operator.le,
|
384
|
+
operator.eq,
|
385
|
+
operator.ne,
|
386
|
+
)
|
387
|
+
opstring = (
|
388
|
+
"setp.gt.f16",
|
389
|
+
"setp.ge.f16",
|
390
|
+
"setp.lt.f16",
|
391
|
+
"setp.le.f16",
|
392
|
+
"setp.eq.f16",
|
393
|
+
"setp.ne.f16",
|
394
|
+
)
|
339
395
|
args = (b1[:], f2, f2)
|
340
396
|
|
341
397
|
for fn, op, s in zip(functions, ops, opstring):
|
@@ -343,51 +399,79 @@ class TestOperatorModule(CUDATestCase):
|
|
343
399
|
ptx, _ = compile_ptx(fn, args, cc=(5, 3))
|
344
400
|
self.assertIn(s, ptx)
|
345
401
|
|
346
|
-
@skip_on_cudasim(
|
402
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
347
403
|
def test_fp16_int8_comparison_ptx(self):
|
348
404
|
# Test that int8 can be safely converted to fp16
|
349
405
|
# in a comparison
|
350
|
-
functions = (
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
406
|
+
functions = (
|
407
|
+
simple_fp16_gt,
|
408
|
+
simple_fp16_ge,
|
409
|
+
simple_fp16_lt,
|
410
|
+
simple_fp16_le,
|
411
|
+
simple_fp16_eq,
|
412
|
+
simple_fp16_ne,
|
413
|
+
)
|
414
|
+
ops = (
|
415
|
+
operator.gt,
|
416
|
+
operator.ge,
|
417
|
+
operator.lt,
|
418
|
+
operator.le,
|
419
|
+
operator.eq,
|
420
|
+
operator.ne,
|
421
|
+
)
|
422
|
+
|
423
|
+
opstring = {
|
424
|
+
operator.gt: "setp.gt.f16",
|
425
|
+
operator.ge: "setp.ge.f16",
|
426
|
+
operator.lt: "setp.lt.f16",
|
427
|
+
operator.le: "setp.le.f16",
|
428
|
+
operator.eq: "setp.eq.f16",
|
429
|
+
operator.ne: "setp.ne.f16",
|
430
|
+
}
|
362
431
|
for fn, op in zip(functions, ops):
|
363
432
|
with self.subTest(op=op):
|
364
433
|
args = (b1[:], f2, from_dtype(np.int8))
|
365
434
|
ptx, _ = compile_ptx(fn, args, cc=(5, 3))
|
366
435
|
self.assertIn(opstring[op], ptx)
|
367
436
|
|
368
|
-
@skip_on_cudasim(
|
437
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
369
438
|
def test_mixed_fp16_comparison_promotion_ptx(self):
|
370
|
-
functions = (
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
439
|
+
functions = (
|
440
|
+
simple_fp16_gt,
|
441
|
+
simple_fp16_ge,
|
442
|
+
simple_fp16_lt,
|
443
|
+
simple_fp16_le,
|
444
|
+
simple_fp16_eq,
|
445
|
+
simple_fp16_ne,
|
446
|
+
)
|
447
|
+
ops = (
|
448
|
+
operator.gt,
|
449
|
+
operator.ge,
|
450
|
+
operator.lt,
|
451
|
+
operator.le,
|
452
|
+
operator.eq,
|
453
|
+
operator.ne,
|
454
|
+
)
|
455
|
+
|
456
|
+
types_promote = (np.int16, np.int32, np.int64, np.float32, np.float64)
|
457
|
+
opstring = {
|
458
|
+
operator.gt: "setp.gt.",
|
459
|
+
operator.ge: "setp.ge.",
|
460
|
+
operator.lt: "setp.lt.",
|
461
|
+
operator.le: "setp.le.",
|
462
|
+
operator.eq: "setp.eq.",
|
463
|
+
operator.ne: "setp.neu.",
|
464
|
+
}
|
465
|
+
opsuffix = {
|
466
|
+
np.dtype("int32"): "f64",
|
467
|
+
np.dtype("int64"): "f64",
|
468
|
+
np.dtype("float32"): "f32",
|
469
|
+
np.dtype("float64"): "f64",
|
470
|
+
}
|
471
|
+
|
472
|
+
for (fn, op), ty in itertools.product(
|
473
|
+
zip(functions, ops), types_promote
|
474
|
+
):
|
391
475
|
with self.subTest(op=op, ty=ty):
|
392
476
|
arg2_ty = np.result_type(np.float16, ty)
|
393
477
|
args = (b1[:], f2, from_dtype(arg2_ty))
|
@@ -397,5 +481,5 @@ class TestOperatorModule(CUDATestCase):
|
|
397
481
|
self.assertIn(ops, ptx)
|
398
482
|
|
399
483
|
|
400
|
-
if __name__ ==
|
484
|
+
if __name__ == "__main__":
|
401
485
|
unittest.main()
|
@@ -18,10 +18,10 @@ def device_func(x, y, z):
|
|
18
18
|
# the test function were more complex it may be possible to isolate additional
|
19
19
|
# fragments of PTX we could check for the absence / presence of, but removal of
|
20
20
|
# the use of local memory is a good indicator that optimization was applied.
|
21
|
-
removed_by_opt = (
|
21
|
+
removed_by_opt = ("__local_depot0",)
|
22
22
|
|
23
23
|
|
24
|
-
@skip_on_cudasim(
|
24
|
+
@skip_on_cudasim("Simulator does not optimize code")
|
25
25
|
class TestOptimization(CUDATestCase):
|
26
26
|
def test_eager_opt(self):
|
27
27
|
# Optimization should occur by default
|
@@ -74,7 +74,7 @@ class TestOptimization(CUDATestCase):
|
|
74
74
|
sig = (float64, float64, float64)
|
75
75
|
device = cuda.jit(sig, device=True)(device_func)
|
76
76
|
ptx = device.inspect_asm(sig)
|
77
|
-
self.assertIn(
|
77
|
+
self.assertIn("fma.rn.f64", ptx)
|
78
78
|
|
79
79
|
def test_device_noopt(self):
|
80
80
|
# Optimization disabled
|
@@ -82,8 +82,8 @@ class TestOptimization(CUDATestCase):
|
|
82
82
|
device = cuda.jit(sig, device=True, opt=False)(device_func)
|
83
83
|
ptx = device.inspect_asm(sig)
|
84
84
|
# Fused-multiply adds should be disabled when not optimizing
|
85
|
-
self.assertNotIn(
|
85
|
+
self.assertNotIn("fma.rn.f64", ptx)
|
86
86
|
|
87
87
|
|
88
|
-
if __name__ ==
|
88
|
+
if __name__ == "__main__":
|
89
89
|
unittest.main()
|