numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,401 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from numba.cuda.testing import (unittest, CUDATestCase, skip_unless_cc_53,
|
3
|
+
skip_on_cudasim)
|
4
|
+
from numba import cuda
|
5
|
+
from numba.core.types import f2, b1
|
6
|
+
from numba.cuda import compile_ptx
|
7
|
+
import operator
|
8
|
+
import itertools
|
9
|
+
from numba.np.numpy_support import from_dtype
|
10
|
+
|
11
|
+
|
12
|
+
def simple_fp16_div_scalar(ary, a, b):
    """Write the quotient ``a / b`` into the first element of ``ary``."""
    quotient = a / b
    ary[0] = quotient
|
14
|
+
|
15
|
+
|
16
|
+
def simple_fp16add(ary, a, b):
    """Write the sum ``a + b`` into the first element of ``ary``."""
    total = a + b
    ary[0] = total
|
18
|
+
|
19
|
+
|
20
|
+
def simple_fp16_iadd(ary, a):
    """Add ``a`` to the first element of ``ary`` using ``+=``."""
    # The augmented assignment is deliberate: this body exists to exercise
    # the in-place operator.
    ary[0] += a
|
22
|
+
|
23
|
+
|
24
|
+
def simple_fp16_isub(ary, a):
    """Subtract ``a`` from the first element of ``ary`` using ``-=``."""
    # The augmented assignment is deliberate: this body exists to exercise
    # the in-place operator.
    ary[0] -= a
|
26
|
+
|
27
|
+
|
28
|
+
def simple_fp16_imul(ary, a):
    """Multiply the first element of ``ary`` by ``a`` using ``*=``."""
    # The augmented assignment is deliberate: this body exists to exercise
    # the in-place operator.
    ary[0] *= a
|
30
|
+
|
31
|
+
|
32
|
+
def simple_fp16_idiv(ary, a):
    """Divide the first element of ``ary`` by ``a`` using ``/=``."""
    # The augmented assignment is deliberate: this body exists to exercise
    # the in-place operator.
    ary[0] /= a
|
34
|
+
|
35
|
+
|
36
|
+
def simple_fp16sub(ary, a, b):
    """Write the difference ``a - b`` into the first element of ``ary``."""
    difference = a - b
    ary[0] = difference
|
38
|
+
|
39
|
+
|
40
|
+
def simple_fp16mul(ary, a, b):
    """Write the product ``a * b`` into the first element of ``ary``."""
    product = a * b
    ary[0] = product
|
42
|
+
|
43
|
+
|
44
|
+
def simple_fp16neg(ary, a):
    """Write the negation ``-a`` into the first element of ``ary``."""
    negated = -a
    ary[0] = negated
|
46
|
+
|
47
|
+
|
48
|
+
def simple_fp16abs(ary, a):
    """Write the absolute value ``abs(a)`` into the first element of ``ary``."""
    magnitude = abs(a)
    ary[0] = magnitude
|
50
|
+
|
51
|
+
|
52
|
+
def simple_fp16_gt(ary, a, b):
    """Write the result of ``a > b`` into the first element of ``ary``."""
    is_greater = a > b
    ary[0] = is_greater
|
54
|
+
|
55
|
+
|
56
|
+
def simple_fp16_ge(ary, a, b):
    """Write the result of ``a >= b`` into the first element of ``ary``."""
    is_greater_or_equal = a >= b
    ary[0] = is_greater_or_equal
|
58
|
+
|
59
|
+
|
60
|
+
def simple_fp16_lt(ary, a, b):
    """Write the result of ``a < b`` into the first element of ``ary``."""
    is_less = a < b
    ary[0] = is_less
|
62
|
+
|
63
|
+
|
64
|
+
def simple_fp16_le(ary, a, b):
    """Write the result of ``a <= b`` into the first element of ``ary``."""
    is_less_or_equal = a <= b
    ary[0] = is_less_or_equal
|
66
|
+
|
67
|
+
|
68
|
+
def simple_fp16_eq(ary, a, b):
    """Write the result of ``a == b`` into the first element of ``ary``."""
    is_equal = a == b
    ary[0] = is_equal
|
70
|
+
|
71
|
+
|
72
|
+
def simple_fp16_ne(ary, a, b):
    """Write the result of ``a != b`` into the first element of ``ary``."""
    is_different = a != b
    ary[0] = is_different
|
74
|
+
|
75
|
+
|
76
|
+
@cuda.jit('b1(f2, f2)', device=True)
def hlt_func_1(x, y):
    """Device function returning whether ``x`` is strictly less than ``y``."""
    is_less = x < y
    return is_less
|
79
|
+
|
80
|
+
|
81
|
+
@cuda.jit('b1(f2, f2)', device=True)
def hlt_func_2(x, y):
    """Second device function returning whether ``x < y``.

    The body matches ``hlt_func_1``; it is a separate function so the same
    predicate can be called from two distinct callees.
    """
    is_less = x < y
    return is_less
|
84
|
+
|
85
|
+
|
86
|
+
def test_multiple_hcmp_1(r, a, b, c):
    """Store ``hlt_func_1(a, b) and hlt_func_2(b, c)`` in ``r[0]``.

    This is a kernel body that gets compiled by the test case, not a test in
    its own right.
    """
    # float16 predicates used in two separate functions
    r[0] = hlt_func_1(a, b) and hlt_func_2(b, c)


# The ``test_`` prefix would make name-based collectors treat this helper as a
# test needing fixtures ``r, a, b, c``; opt out explicitly.
test_multiple_hcmp_1.__test__ = False
|
89
|
+
|
90
|
+
|
91
|
+
def test_multiple_hcmp_2(r, a, b, c):
    """Store ``hlt_func_1(a, b) and b < c`` in ``r[0]``.

    This is a kernel body that gets compiled by the test case, not a test in
    its own right.
    """
    # The same float16 predicate used in the caller and callee
    r[0] = hlt_func_1(a, b) and b < c


# The ``test_`` prefix would make name-based collectors treat this helper as a
# test needing fixtures ``r, a, b, c``; opt out explicitly.
test_multiple_hcmp_2.__test__ = False
|
94
|
+
|
95
|
+
|
96
|
+
def test_multiple_hcmp_3(r, a, b, c):
    """Store ``hlt_func_1(a, b) and c >= b`` in ``r[0]``.

    This is a kernel body that gets compiled by the test case, not a test in
    its own right.
    """
    # Different float16 predicates used in the caller and callee
    r[0] = hlt_func_1(a, b) and c >= b


# The ``test_`` prefix would make name-based collectors treat this helper as a
# test needing fixtures ``r, a, b, c``; opt out explicitly.
test_multiple_hcmp_3.__test__ = False
|
99
|
+
|
100
|
+
|
101
|
+
def test_multiple_hcmp_4(r, a, b, c):
    """Store ``a < b and b < c`` in ``r[0]``.

    This is a kernel body that gets compiled by the test case, not a test in
    its own right.
    """
    # The same float16 predicates used twice in a function
    r[0] = a < b and b < c


# The ``test_`` prefix would make name-based collectors treat this helper as a
# test needing fixtures ``r, a, b, c``; opt out explicitly.
test_multiple_hcmp_4.__test__ = False
|
104
|
+
|
105
|
+
|
106
|
+
def test_multiple_hcmp_5(r, a, b, c):
    """Store ``a < b and c >= b`` in ``r[0]``.

    This is a kernel body that gets compiled by the test case, not a test in
    its own right.
    """
    # Different float16 predicates used in a function
    r[0] = a < b and c >= b


# The ``test_`` prefix would make name-based collectors treat this helper as a
# test needing fixtures ``r, a, b, c``; opt out explicitly.
test_multiple_hcmp_5.__test__ = False
|
109
|
+
|
110
|
+
|
111
|
+
class TestOperatorModule(CUDATestCase):
    """
    Test if operator module is supported by the CUDA target.
    """

    def setUp(self):
        super().setUp()
        # Fixed seed so the randomly drawn operands are the same on every run
        np.random.seed(0)

    def operator_template(self, op):
        """Check ``op`` inside a kernel against ``op`` applied on the host.

        A one-thread kernel applies ``op`` to the first elements of two
        single-element arrays of ones; the result must equal ``op`` applied
        to the NumPy arrays directly.
        """
        @cuda.jit
        def foo(a, b):
            i = 0
            a[i] = op(a[i], b[i])

        a = np.ones(1)
        b = np.ones(1)
        res = a.copy()
        foo[1, 1](res, b)

        np.testing.assert_equal(res, op(a, b))

    def test_add(self):
        self.operator_template(operator.add)

    def test_sub(self):
        self.operator_template(operator.sub)

    def test_mul(self):
        self.operator_template(operator.mul)

    def test_truediv(self):
        self.operator_template(operator.truediv)

    def test_floordiv(self):
        self.operator_template(operator.floordiv)

    @skip_unless_cc_53
    def test_fp16_binary(self):
        # Binary arithmetic on two float16 scalars, checked against NumPy
        functions = (simple_fp16add, simple_fp16sub, simple_fp16mul,
                     simple_fp16_div_scalar)
        ops = (operator.add, operator.sub, operator.mul, operator.truediv)

        for fn, op in zip(functions, ops):
            with self.subTest(op=op):
                kernel = cuda.jit("void(f2[:], f2, f2)")(fn)

                got = np.zeros(1, dtype=np.float16)
                arg1 = np.random.random(1).astype(np.float16)
                arg2 = np.random.random(1).astype(np.float16)

                kernel[1, 1](got, arg1[0], arg2[0])
                expected = op(arg1, arg2)
                np.testing.assert_allclose(got, expected)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_binary_ptx(self):
        # The generated PTX must contain the matching f16 instruction
        functions = (simple_fp16add, simple_fp16sub, simple_fp16mul)
        instrs = ('add.f16', 'sub.f16', 'mul.f16')
        args = (f2[:], f2, f2)
        for fn, instr in zip(functions, instrs):
            with self.subTest(instr=instr):
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
                self.assertIn(instr, ptx)

    @skip_unless_cc_53
    def test_mixed_fp16_binary_arithmetic(self):
        # float16 combined with every other operand type; the output array
        # uses NumPy's promoted dtype for (float16, ty)
        functions = (simple_fp16add, simple_fp16sub, simple_fp16mul,
                     simple_fp16_div_scalar)
        ops = (operator.add, operator.sub, operator.mul, operator.truediv)
        types = (np.int8, np.int16, np.int32, np.int64,
                 np.float32, np.float64)
        for (fn, op), ty in itertools.product(zip(functions, ops), types):
            with self.subTest(op=op, ty=ty):
                kernel = cuda.jit(fn)

                arg1 = np.random.random(1).astype(np.float16)
                arg2 = (np.random.random(1) * 100).astype(ty)
                res_ty = np.result_type(np.float16, ty)

                got = np.zeros(1, dtype=res_ty)
                kernel[1, 1](got, arg1[0], arg2[0])
                expected = op(arg1, arg2)
                np.testing.assert_allclose(got, expected)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_inplace_binary_ptx(self):
        # In-place operators must also lower to the f16 instructions
        functions = (simple_fp16_iadd, simple_fp16_isub, simple_fp16_imul)
        instrs = ('add.f16', 'sub.f16', 'mul.f16')
        args = (f2[:], f2)

        for fn, instr in zip(functions, instrs):
            with self.subTest(instr=instr):
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
                self.assertIn(instr, ptx)

    @skip_unless_cc_53
    def test_fp16_inplace_binary(self):
        functions = (simple_fp16_iadd, simple_fp16_isub, simple_fp16_imul,
                     simple_fp16_idiv)
        ops = (operator.iadd, operator.isub, operator.imul, operator.itruediv)

        for fn, op in zip(functions, ops):
            with self.subTest(op=op):
                kernel = cuda.jit("void(f2[:], f2)")(fn)

                got = np.random.random(1).astype(np.float16)
                expected = got.copy()
                arg = np.random.random(1).astype(np.float16)[0]
                kernel[1, 1](got, arg)
                # The in-place operator updates ``expected`` itself, so the
                # return value is not needed
                op(expected, arg)
                np.testing.assert_allclose(got, expected)

    @skip_unless_cc_53
    def test_fp16_unary(self):
        functions = (simple_fp16neg, simple_fp16abs)
        ops = (operator.neg, operator.abs)

        for fn, op in zip(functions, ops):
            with self.subTest(op=op):
                kernel = cuda.jit("void(f2[:], f2)")(fn)

                got = np.zeros(1, dtype=np.float16)
                arg1 = np.random.random(1).astype(np.float16)

                kernel[1, 1](got, arg1[0])
                expected = op(arg1)
                np.testing.assert_allclose(got, expected)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_neg_ptx(self):
        args = (f2[:], f2)
        ptx, _ = compile_ptx(simple_fp16neg, args, cc=(5, 3))
        self.assertIn('neg.f16', ptx)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_abs_ptx(self):
        args = (f2[:], f2)
        ptx, _ = compile_ptx(simple_fp16abs, args, cc=(5, 3))

        self.assertIn('abs.f16', ptx)

    @skip_unless_cc_53
    def test_fp16_comparison(self):
        functions = (simple_fp16_gt, simple_fp16_ge,
                     simple_fp16_lt, simple_fp16_le,
                     simple_fp16_eq, simple_fp16_ne)
        ops = (operator.gt, operator.ge, operator.lt, operator.le,
               operator.eq, operator.ne)

        for fn, op in zip(functions, ops):
            with self.subTest(op=op):
                kernel = cuda.jit("void(b1[:], f2, f2)")(fn)

                got = np.zeros(1, dtype=np.bool_)
                arg1 = np.random.random(1).astype(np.float16)
                arg2 = np.random.random(1).astype(np.float16)

                kernel[1, 1](got, arg1[0], arg2[0])
                expected = op(arg1, arg2)
                # Compare scalar with scalar rather than relying on the
                # truthiness of a one-element array
                self.assertEqual(got[0], expected[0])

    @skip_unless_cc_53
    def test_mixed_fp16_comparison(self):
        functions = (simple_fp16_gt, simple_fp16_ge,
                     simple_fp16_lt, simple_fp16_le,
                     simple_fp16_eq, simple_fp16_ne)
        ops = (operator.gt, operator.ge, operator.lt, operator.le,
               operator.eq, operator.ne)
        types = (np.int8, np.int16, np.int32, np.int64,
                 np.float32, np.float64)

        for (fn, op), ty in itertools.product(zip(functions, ops),
                                              types):
            with self.subTest(op=op, ty=ty):
                kernel = cuda.jit(fn)

                got = np.zeros(1, dtype=np.bool_)
                arg1 = np.random.random(1).astype(np.float16)
                arg2 = (np.random.random(1) * 100).astype(ty)

                kernel[1, 1](got, arg1[0], arg2[0])
                expected = op(arg1, arg2)
                # Compare scalar with scalar rather than relying on the
                # truthiness of a one-element array
                self.assertEqual(got[0], expected[0])

    @skip_unless_cc_53
    def test_multiple_float16_comparisons(self):
        # With 2 < 3 < 4 every combined predicate must evaluate to True
        functions = (test_multiple_hcmp_1,
                     test_multiple_hcmp_2,
                     test_multiple_hcmp_3,
                     test_multiple_hcmp_4,
                     test_multiple_hcmp_5)
        for fn in functions:
            with self.subTest(fn=fn):
                compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn)
                ary = np.zeros(1, dtype=np.bool_)
                arg1 = np.float16(2.)
                arg2 = np.float16(3.)
                arg3 = np.float16(4.)
                compiled[1, 1](ary, arg1, arg2, arg3)
                self.assertTrue(ary[0])

    @skip_unless_cc_53
    def test_multiple_float16_comparisons_false(self):
        # The third operand (1) breaks the second predicate in every helper
        functions = (test_multiple_hcmp_1,
                     test_multiple_hcmp_2,
                     test_multiple_hcmp_3,
                     test_multiple_hcmp_4,
                     test_multiple_hcmp_5)
        for fn in functions:
            with self.subTest(fn=fn):
                compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn)
                ary = np.zeros(1, dtype=np.bool_)
                arg1 = np.float16(2.)
                arg2 = np.float16(3.)
                arg3 = np.float16(1.)
                compiled[1, 1](ary, arg1, arg2, arg3)
                self.assertFalse(ary[0])

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_comparison_ptx(self):
        functions = (simple_fp16_gt, simple_fp16_ge,
                     simple_fp16_lt, simple_fp16_le,
                     simple_fp16_eq, simple_fp16_ne)
        ops = (operator.gt, operator.ge, operator.lt, operator.le,
               operator.eq, operator.ne)
        opstring = ('setp.gt.f16', 'setp.ge.f16',
                    'setp.lt.f16', 'setp.le.f16',
                    'setp.eq.f16', 'setp.ne.f16')
        args = (b1[:], f2, f2)

        for fn, op, s in zip(functions, ops, opstring):
            with self.subTest(op=op):
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
                self.assertIn(s, ptx)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_int8_comparison_ptx(self):
        # Test that int8 can be safely converted to fp16
        # in a comparison
        functions = (simple_fp16_gt, simple_fp16_ge,
                     simple_fp16_lt, simple_fp16_le,
                     simple_fp16_eq, simple_fp16_ne)
        ops = (operator.gt, operator.ge, operator.lt, operator.le,
               operator.eq, operator.ne)

        opstring = {operator.gt: 'setp.gt.f16',
                    operator.ge: 'setp.ge.f16',
                    operator.lt: 'setp.lt.f16',
                    operator.le: 'setp.le.f16',
                    operator.eq: 'setp.eq.f16',
                    operator.ne: 'setp.ne.f16'}
        for fn, op in zip(functions, ops):
            with self.subTest(op=op):
                args = (b1[:], f2, from_dtype(np.int8))
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
                self.assertIn(opstring[op], ptx)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_mixed_fp16_comparison_promotion_ptx(self):
        functions = (simple_fp16_gt, simple_fp16_ge,
                     simple_fp16_lt, simple_fp16_le,
                     simple_fp16_eq, simple_fp16_ne)
        ops = (operator.gt, operator.ge, operator.lt, operator.le,
               operator.eq, operator.ne)

        types_promote = (np.int16, np.int32, np.int64,
                         np.float32, np.float64)
        opstring = {operator.gt: 'setp.gt.',
                    operator.ge: 'setp.ge.',
                    operator.lt: 'setp.lt.',
                    operator.le: 'setp.le.',
                    operator.eq: 'setp.eq.',
                    operator.ne: 'setp.neu.'}
        # NOTE(review): the lookup key below is the promoted dtype, so
        # presumably only the float entries are ever used - confirm before
        # pruning the integer keys.
        opsuffix = {np.dtype('int32'): 'f64',
                    np.dtype('int64'): 'f64',
                    np.dtype('float32'): 'f32',
                    np.dtype('float64'): 'f64'}

        for (fn, op), ty in itertools.product(zip(functions, ops),
                                              types_promote):
            with self.subTest(op=op, ty=ty):
                arg2_ty = np.result_type(np.float16, ty)
                args = (b1[:], f2, from_dtype(arg2_ty))
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))

                # Use a distinct name so the ``ops`` tuple being iterated is
                # not rebound inside its own loop
                expected_instr = opstring[op] + opsuffix[arg2_ty]
                self.assertIn(expected_instr, ptx)
|
398
|
+
|
399
|
+
|
400
|
+
if __name__ == '__main__':
    # Run the tests in this module when it is executed as a script.
    unittest.main()
|
@@ -0,0 +1,86 @@
|
|
1
|
+
import numpy as np
|
2
|
+
|
3
|
+
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
|
4
|
+
from numba import cuda, float64
|
5
|
+
import unittest
|
6
|
+
|
7
|
+
|
8
|
+
def kernel_func(x):
    """Set the first element of ``x`` to 1."""
    # Left as a single store: the tests below inspect the PTX generated for
    # exactly this body.
    x[0] = 1
|
10
|
+
|
11
|
+
|
12
|
+
def device_func(x, y, z):
    """Return ``x * y + z``."""
    # Left as one multiply-add expression: the device tests look for (or for
    # the absence of) a fused multiply-add in the PTX generated from it.
    return x * y + z
|
14
|
+
|
15
|
+
|
16
|
+
# PTX fragments that show up for kernel_func only when optimization is off;
# with optimization on they are removed.
removed_by_opt = (
    '__local_depot0',
    'call.uni',
    'st.param.b64',
)
|
19
|
+
|
20
|
+
|
21
|
+
@skip_on_cudasim('Simulator does not optimize code')
class TestOptimization(CUDATestCase):
    # Each test compiles kernel_func or device_func with or without
    # ``opt=False`` and inspects the resulting PTX text.

    def test_eager_opt(self):
        # Signature given up front; optimization is on by default
        signature = (float64[::1],)
        jitted = cuda.jit(signature)(kernel_func)
        asm_by_sig = jitted.inspect_asm()

        for snippet in removed_by_opt:
            with self.subTest(fragment=snippet):
                self.assertNotIn(snippet, asm_by_sig[signature])

    def test_eager_noopt(self):
        # Signature given up front; optimization explicitly switched off
        signature = (float64[::1],)
        jitted = cuda.jit(signature, opt=False)(kernel_func)
        asm_by_sig = jitted.inspect_asm()

        for snippet in removed_by_opt:
            with self.subTest(fragment=snippet):
                self.assertIn(snippet, asm_by_sig[signature])

    def test_lazy_opt(self):
        # Compiled on first launch; optimization is on by default
        jitted = cuda.jit(kernel_func)
        data = np.zeros(1, dtype=np.float64)
        jitted[1, 1](data)

        # Only one definition has been jitted, so take its PTX
        asm = next(iter(jitted.inspect_asm().values()))

        for snippet in removed_by_opt:
            with self.subTest(fragment=snippet):
                self.assertNotIn(snippet, asm)

    def test_lazy_noopt(self):
        # Compiled on first launch; optimization explicitly switched off
        jitted = cuda.jit(opt=False)(kernel_func)
        data = np.zeros(1, dtype=np.float64)
        jitted[1, 1](data)

        # Only one definition has been jitted, so take its PTX
        asm = next(iter(jitted.inspect_asm().values()))

        for snippet in removed_by_opt:
            with self.subTest(fragment=snippet):
                self.assertIn(snippet, asm)

    def test_device_opt(self):
        # Optimized device function: expect a fused multiply-add
        signature = (float64, float64, float64)
        jitted = cuda.jit(signature, device=True)(device_func)
        asm = jitted.inspect_asm(signature)
        self.assertIn('fma.rn.f64', asm)

    def test_device_noopt(self):
        # Unoptimized device function: no fused multiply-add should appear
        signature = (float64, float64, float64)
        jitted = cuda.jit(signature, device=True, opt=False)(device_func)
        asm = jitted.inspect_asm(signature)
        self.assertNotIn('fma.rn.f64', asm)
|
83
|
+
|
84
|
+
|
85
|
+
if __name__ == '__main__':
    # Run the tests in this module when it is executed as a script.
    unittest.main()
|