numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +246 -114
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
- numba_cuda/numba/cuda/cuda_paths.py +293 -99
- numba_cuda/numba/cuda/cudadecl.py +93 -79
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +296 -275
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +99 -7
- numba_cuda/numba/cuda/decorators.py +87 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +68 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +55 -1
- numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
- numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
- numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
- numba_cuda/numba/cuda/intrinsics.py +203 -28
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/lowering.py +43 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +134 -108
- numba_cuda/numba/cuda/target.py +92 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +5 -3
- numba_cuda/numba/cuda/vectorizers.py +38 -33
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
- numba_cuda-0.10.0.dist-info/RECORD +263 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.1.dist-info/RECORD +0 -251
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -4,8 +4,7 @@ from numba.cuda import compile_ptx
|
|
4
4
|
from numba.core.types import f2, i1, i2, i4, i8, u1, u2, u4, u8
|
5
5
|
from numba import cuda
|
6
6
|
from numba.core import types
|
7
|
-
from numba.cuda.testing import
|
8
|
-
skip_unless_cc_53)
|
7
|
+
from numba.cuda.testing import CUDATestCase, skip_on_cudasim, skip_unless_cc_53
|
9
8
|
from numba.types import float16, float32
|
10
9
|
import itertools
|
11
10
|
import unittest
|
@@ -50,7 +49,7 @@ def to_uint64(x):
|
|
50
49
|
def to_float16(x):
|
51
50
|
# When division and operators on float16 types are supported, this should
|
52
51
|
# be changed to match the implementation in to_float32.
|
53
|
-
return
|
52
|
+
return np.float16(x) * np.float16(0.5)
|
54
53
|
|
55
54
|
|
56
55
|
def to_float32(x):
|
@@ -76,6 +75,7 @@ def to_complex128(x):
|
|
76
75
|
# - The device version uses cuda.fp16.hmul
|
77
76
|
# - The host version uses the * operator
|
78
77
|
|
78
|
+
|
79
79
|
def cuda_int_literal_to_float16(x):
|
80
80
|
# Note that we need to use `2` and not `np.float16(2)` to ensure that this
|
81
81
|
# types as a literal int and not a const float16.
|
@@ -128,7 +128,7 @@ class TestCasting(CUDATestCase):
|
|
128
128
|
self.assertEqual(cfunc(-12.3), pyfunc(-12.3))
|
129
129
|
self.assertEqual(cfunc(-12.3), int(-12.3))
|
130
130
|
|
131
|
-
@skip_on_cudasim(
|
131
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
132
132
|
def test_float16_to_int_ptx(self):
|
133
133
|
pyfuncs = (to_int8, to_int16, to_int32, to_int64)
|
134
134
|
sizes = (8, 16, 32, 64)
|
@@ -150,7 +150,7 @@ class TestCasting(CUDATestCase):
|
|
150
150
|
self.assertEqual(cfunc(12.3), pyfunc(12.3))
|
151
151
|
self.assertEqual(cfunc(12.3), int(12.3))
|
152
152
|
|
153
|
-
@skip_on_cudasim(
|
153
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
154
154
|
def test_float16_to_uint_ptx(self):
|
155
155
|
pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64)
|
156
156
|
sizes = (8, 16, 32, 64)
|
@@ -171,17 +171,18 @@ class TestCasting(CUDATestCase):
|
|
171
171
|
|
172
172
|
@skip_unless_cc_53
|
173
173
|
def test_literal_to_float16(self):
|
174
|
-
cudafuncs = (cuda_int_literal_to_float16,
|
175
|
-
|
176
|
-
|
177
|
-
|
174
|
+
cudafuncs = (cuda_int_literal_to_float16, cuda_float_literal_to_float16)
|
175
|
+
hostfuncs = (
|
176
|
+
reference_int_literal_to_float16,
|
177
|
+
reference_float_literal_to_float16,
|
178
|
+
)
|
178
179
|
|
179
180
|
for cudafunc, hostfunc in zip(cudafuncs, hostfuncs):
|
180
181
|
with self.subTest(func=cudafunc):
|
181
182
|
cfunc = self._create_wrapped(cudafunc, np.float16, np.float16)
|
182
183
|
self.assertEqual(cfunc(321), hostfunc(321))
|
183
184
|
|
184
|
-
@skip_on_cudasim(
|
185
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
185
186
|
def test_int_to_float16_ptx(self):
|
186
187
|
fromtys = (i1, i2, i4, i8)
|
187
188
|
sizes = (8, 16, 32, 64)
|
@@ -190,7 +191,7 @@ class TestCasting(CUDATestCase):
|
|
190
191
|
ptx, _ = compile_ptx(to_float16, (ty,), device=True)
|
191
192
|
self.assertIn(f"cvt.rn.f16.s{size}", ptx)
|
192
193
|
|
193
|
-
@skip_on_cudasim(
|
194
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
194
195
|
def test_uint_to_float16_ptx(self):
|
195
196
|
fromtys = (u1, u2, u4, u8)
|
196
197
|
sizes = (8, 16, 32, 64)
|
@@ -211,12 +212,14 @@ class TestCasting(CUDATestCase):
|
|
211
212
|
# the CUDA target doesn't yet implement division (or operators)
|
212
213
|
# for float16 values, so we test by comparing with the computed
|
213
214
|
# expression instead.
|
214
|
-
np.testing.assert_allclose(
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
215
|
+
np.testing.assert_allclose(
|
216
|
+
cfunc(12.3), toty(12.3) / toty(2), rtol=0.0003
|
217
|
+
)
|
218
|
+
np.testing.assert_allclose(
|
219
|
+
cfunc(-12.3), toty(-12.3) / toty(2), rtol=0.0003
|
220
|
+
)
|
221
|
+
|
222
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
220
223
|
def test_float16_to_float_ptx(self):
|
221
224
|
pyfuncs = (to_float32, to_float64)
|
222
225
|
postfixes = ("f32", "f64")
|
@@ -239,12 +242,14 @@ class TestCasting(CUDATestCase):
|
|
239
242
|
# to match the casting that is automatically applied when
|
240
243
|
# passing the input to the cfunc as part of wrapping it in
|
241
244
|
# an array of type fromtype.
|
242
|
-
np.testing.assert_allclose(
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
245
|
+
np.testing.assert_allclose(
|
246
|
+
cfunc(3.21), pyfunc(fromty(3.21))
|
247
|
+
)
|
248
|
+
np.testing.assert_allclose(
|
249
|
+
cfunc(-3.21), pyfunc(fromty(-3.21)) + 0j
|
250
|
+
)
|
251
|
+
|
252
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
248
253
|
def test_native_cast(self):
|
249
254
|
float32_ptx, _ = cuda.compile_ptx(native_cast, (float32,), device=True)
|
250
255
|
self.assertIn("st.f32", float32_ptx)
|
@@ -253,5 +258,5 @@ class TestCasting(CUDATestCase):
|
|
253
258
|
self.assertIn("st.u16", float16_ptx)
|
254
259
|
|
255
260
|
|
256
|
-
if __name__ ==
|
261
|
+
if __name__ == "__main__":
|
257
262
|
unittest.main()
|
@@ -1,21 +1,26 @@
|
|
1
1
|
import numpy as np
|
2
2
|
|
3
3
|
from numba import cuda, types
|
4
|
-
from numba.cuda.testing import (
|
5
|
-
|
4
|
+
from numba.cuda.testing import (
|
5
|
+
skip_on_cudasim,
|
6
|
+
test_data_dir,
|
7
|
+
unittest,
|
8
|
+
CUDATestCase,
|
9
|
+
)
|
6
10
|
from numba.tests.support import skip_unless_cffi
|
7
11
|
|
8
12
|
|
9
13
|
@skip_unless_cffi
|
10
|
-
@skip_on_cudasim(
|
14
|
+
@skip_on_cudasim("Simulator does not support linking")
|
11
15
|
class TestCFFI(CUDATestCase):
|
12
16
|
def test_from_buffer(self):
|
13
17
|
import cffi
|
18
|
+
|
14
19
|
ffi = cffi.FFI()
|
15
20
|
|
16
|
-
link = str(test_data_dir /
|
21
|
+
link = str(test_data_dir / "jitlink.ptx")
|
17
22
|
sig = types.void(types.CPointer(types.int32))
|
18
|
-
array_mutator = cuda.declare_device(
|
23
|
+
array_mutator = cuda.declare_device("array_mutator", sig)
|
19
24
|
|
20
25
|
@cuda.jit(link=[link])
|
21
26
|
def mutate_array(x):
|
@@ -29,5 +34,5 @@ class TestCFFI(CUDATestCase):
|
|
29
34
|
self.assertEqual(x[0], x[1])
|
30
35
|
|
31
36
|
|
32
|
-
if __name__ ==
|
37
|
+
if __name__ == "__main__":
|
33
38
|
unittest.main()
|
@@ -1,7 +1,11 @@
|
|
1
1
|
from math import sqrt
|
2
2
|
from numba import cuda, float32, int16, int32, int64, uint32, void
|
3
|
-
from numba.cuda import (
|
4
|
-
|
3
|
+
from numba.cuda import (
|
4
|
+
compile,
|
5
|
+
compile_for_current_device,
|
6
|
+
compile_ptx,
|
7
|
+
compile_ptx_for_current_device,
|
8
|
+
)
|
5
9
|
from numba.cuda.cudadrv import runtime
|
6
10
|
from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
|
7
11
|
|
@@ -12,7 +16,7 @@ def f_module(x, y):
|
|
12
16
|
return x + y
|
13
17
|
|
14
18
|
|
15
|
-
@skip_on_cudasim(
|
19
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
16
20
|
class TestCompile(unittest.TestCase):
|
17
21
|
def test_global_kernel(self):
|
18
22
|
def f(r, x, y):
|
@@ -24,11 +28,11 @@ class TestCompile(unittest.TestCase):
|
|
24
28
|
ptx, resty = compile_ptx(f, args)
|
25
29
|
|
26
30
|
# Kernels should not have a func_retval parameter
|
27
|
-
self.assertNotIn(
|
31
|
+
self.assertNotIn("func_retval", ptx)
|
28
32
|
# .visible .func is used to denote a device function
|
29
|
-
self.assertNotIn(
|
33
|
+
self.assertNotIn(".visible .func", ptx)
|
30
34
|
# .visible .entry would denote the presence of a global function
|
31
|
-
self.assertIn(
|
35
|
+
self.assertIn(".visible .entry", ptx)
|
32
36
|
# Return type for kernels should always be void
|
33
37
|
self.assertEqual(resty, void)
|
34
38
|
|
@@ -41,11 +45,11 @@ class TestCompile(unittest.TestCase):
|
|
41
45
|
|
42
46
|
# Device functions take a func_retval parameter for storing the
|
43
47
|
# returned value in by reference
|
44
|
-
self.assertIn(
|
48
|
+
self.assertIn("func_retval", ptx)
|
45
49
|
# .visible .func is used to denote a device function
|
46
|
-
self.assertIn(
|
50
|
+
self.assertIn(".visible .func", ptx)
|
47
51
|
# .visible .entry would denote the presence of a global function
|
48
|
-
self.assertNotIn(
|
52
|
+
self.assertNotIn(".visible .entry", ptx)
|
49
53
|
# Inferred return type as expected?
|
50
54
|
self.assertEqual(resty, float32)
|
51
55
|
|
@@ -71,21 +75,21 @@ class TestCompile(unittest.TestCase):
|
|
71
75
|
|
72
76
|
# Without fastmath, fma contraction is enabled by default, but ftz and
|
73
77
|
# approximate div / sqrt is not.
|
74
|
-
self.assertIn(
|
75
|
-
self.assertIn(
|
76
|
-
self.assertIn(
|
78
|
+
self.assertIn("fma.rn.f32", ptx)
|
79
|
+
self.assertIn("div.rn.f32", ptx)
|
80
|
+
self.assertIn("sqrt.rn.f32", ptx)
|
77
81
|
|
78
82
|
ptx, resty = compile_ptx(f, args, device=True, fastmath=True)
|
79
83
|
|
80
84
|
# With fastmath, ftz and approximate div / sqrt are enabled
|
81
|
-
self.assertIn(
|
82
|
-
self.assertIn(
|
83
|
-
self.assertIn(
|
85
|
+
self.assertIn("fma.rn.ftz.f32", ptx)
|
86
|
+
self.assertIn("div.approx.ftz.f32", ptx)
|
87
|
+
self.assertIn("sqrt.approx.ftz.f32", ptx)
|
84
88
|
|
85
89
|
def check_debug_info(self, ptx):
|
86
90
|
# A debug_info section should exist in the PTX. Whitespace varies
|
87
91
|
# between CUDA toolkit versions.
|
88
|
-
self.assertRegex(ptx,
|
92
|
+
self.assertRegex(ptx, "\\.section\\s+\\.debug_info")
|
89
93
|
# A .file directive should be produced and include the name of the
|
90
94
|
# source. The path and whitespace may vary, so we accept anything
|
91
95
|
# ending in the filename of this module.
|
@@ -136,23 +140,25 @@ class TestCompile(unittest.TestCase):
|
|
136
140
|
def f(x, y):
|
137
141
|
return x[0] + y[0]
|
138
142
|
|
139
|
-
with self.assertRaisesRegex(TypeError,
|
143
|
+
with self.assertRaisesRegex(TypeError, "must have void return type"):
|
140
144
|
compile_ptx(f, (uint32[::1], uint32[::1]))
|
141
145
|
|
142
146
|
def test_c_abi_disallowed_for_kernel(self):
|
143
147
|
def f(x, y):
|
144
148
|
return x + y
|
145
149
|
|
146
|
-
with self.assertRaisesRegex(
|
147
|
-
|
150
|
+
with self.assertRaisesRegex(
|
151
|
+
NotImplementedError, "The C ABI is not supported for kernels"
|
152
|
+
):
|
148
153
|
compile_ptx(f, (int32, int32), abi="c")
|
149
154
|
|
150
155
|
def test_unsupported_abi(self):
|
151
156
|
def f(x, y):
|
152
157
|
return x + y
|
153
158
|
|
154
|
-
with self.assertRaisesRegex(
|
155
|
-
|
159
|
+
with self.assertRaisesRegex(
|
160
|
+
NotImplementedError, "Unsupported ABI: fastcall"
|
161
|
+
):
|
156
162
|
compile_ptx(f, (int32, int32), abi="fastcall")
|
157
163
|
|
158
164
|
def test_c_abi_device_function(self):
|
@@ -166,8 +172,11 @@ class TestCompile(unittest.TestCase):
|
|
166
172
|
# The function name should match the Python function name (not the
|
167
173
|
# qualname, which includes additional info), and its return value
|
168
174
|
# should be 32 bits
|
169
|
-
self.assertRegex(
|
170
|
-
|
175
|
+
self.assertRegex(
|
176
|
+
ptx,
|
177
|
+
r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
|
178
|
+
r"func_retval0\)\s+f\(",
|
179
|
+
)
|
171
180
|
|
172
181
|
# If we compile for 64-bit integers, the return type should be 64 bits
|
173
182
|
# wide
|
@@ -175,44 +184,60 @@ class TestCompile(unittest.TestCase):
|
|
175
184
|
self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b64")
|
176
185
|
|
177
186
|
def test_c_abi_device_function_module_scope(self):
|
178
|
-
ptx, resty = compile_ptx(
|
179
|
-
|
187
|
+
ptx, resty = compile_ptx(
|
188
|
+
f_module, int32(int32, int32), device=True, abi="c"
|
189
|
+
)
|
180
190
|
|
181
191
|
# The function name should match the Python function name, and its
|
182
192
|
# return value should be 32 bits
|
183
|
-
self.assertRegex(
|
184
|
-
|
193
|
+
self.assertRegex(
|
194
|
+
ptx,
|
195
|
+
r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
|
196
|
+
r"func_retval0\)\s+f_module\(",
|
197
|
+
)
|
185
198
|
|
186
199
|
def test_c_abi_with_abi_name(self):
|
187
|
-
abi_info = {
|
188
|
-
ptx, resty = compile_ptx(
|
189
|
-
|
200
|
+
abi_info = {"abi_name": "_Z4funcii"}
|
201
|
+
ptx, resty = compile_ptx(
|
202
|
+
f_module,
|
203
|
+
int32(int32, int32),
|
204
|
+
device=True,
|
205
|
+
abi="c",
|
206
|
+
abi_info=abi_info,
|
207
|
+
)
|
190
208
|
|
191
209
|
# The function name should match the one given in the ABI info, and its
|
192
210
|
# return value should be 32 bits
|
193
|
-
self.assertRegex(
|
194
|
-
|
211
|
+
self.assertRegex(
|
212
|
+
ptx,
|
213
|
+
r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
|
214
|
+
r"func_retval0\)\s+_Z4funcii\(",
|
215
|
+
)
|
195
216
|
|
196
217
|
def test_compile_defaults_to_c_abi(self):
|
197
218
|
ptx, resty = compile(f_module, int32(int32, int32), device=True)
|
198
219
|
|
199
220
|
# The function name should match the Python function name, and its
|
200
221
|
# return value should be 32 bits
|
201
|
-
self.assertRegex(
|
202
|
-
|
222
|
+
self.assertRegex(
|
223
|
+
ptx,
|
224
|
+
r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
|
225
|
+
r"func_retval0\)\s+f_module\(",
|
226
|
+
)
|
203
227
|
|
204
228
|
def test_compile_to_ltoir(self):
|
205
229
|
if runtime.get_version() < (11, 5):
|
206
230
|
self.skipTest("-gen-lto unavailable in this toolkit version")
|
207
231
|
|
208
|
-
ltoir, resty = compile(
|
209
|
-
|
232
|
+
ltoir, resty = compile(
|
233
|
+
f_module, int32(int32, int32), device=True, output="ltoir"
|
234
|
+
)
|
210
235
|
|
211
236
|
# There are no tools to interpret the LTOIR output, but we can check
|
212
237
|
# that we appear to have obtained an LTOIR file. This magic number is
|
213
238
|
# not documented, but is expected to remain consistent.
|
214
239
|
LTOIR_MAGIC = 0x7F4E43ED
|
215
|
-
header = int.from_bytes(ltoir[:4], byteorder=
|
240
|
+
header = int.from_bytes(ltoir[:4], byteorder="little")
|
216
241
|
self.assertEqual(header, LTOIR_MAGIC)
|
217
242
|
self.assertEqual(resty, int32)
|
218
243
|
|
@@ -220,11 +245,15 @@ class TestCompile(unittest.TestCase):
|
|
220
245
|
illegal_output = "illegal"
|
221
246
|
msg = f"Unsupported output type: {illegal_output}"
|
222
247
|
with self.assertRaisesRegex(NotImplementedError, msg):
|
223
|
-
compile(
|
224
|
-
|
248
|
+
compile(
|
249
|
+
f_module,
|
250
|
+
int32(int32, int32),
|
251
|
+
device=True,
|
252
|
+
output=illegal_output,
|
253
|
+
)
|
225
254
|
|
226
255
|
|
227
|
-
@skip_on_cudasim(
|
256
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
228
257
|
class TestCompileForCurrentDevice(CUDATestCase):
|
229
258
|
def _check_ptx_for_current_device(self, compile_function):
|
230
259
|
def add(x, y):
|
@@ -237,7 +266,7 @@ class TestCompileForCurrentDevice(CUDATestCase):
|
|
237
266
|
# closest compute capability supported by the current toolkit.
|
238
267
|
device_cc = cuda.get_current_device().compute_capability
|
239
268
|
cc = cuda.cudadrv.nvvm.find_closest_arch(device_cc)
|
240
|
-
target = f
|
269
|
+
target = f".target sm_{cc[0]}{cc[1]}"
|
241
270
|
self.assertIn(target, ptx)
|
242
271
|
|
243
272
|
def test_compile_ptx_for_current_device(self):
|
@@ -247,10 +276,10 @@ class TestCompileForCurrentDevice(CUDATestCase):
|
|
247
276
|
self._check_ptx_for_current_device(compile_for_current_device)
|
248
277
|
|
249
278
|
|
250
|
-
@skip_on_cudasim(
|
279
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
251
280
|
class TestCompileOnlyTests(unittest.TestCase):
|
252
|
-
|
253
|
-
output rather than observing the effects of execution.
|
281
|
+
"""For tests where we can only check correctness by examining the compiler
|
282
|
+
output rather than observing the effects of execution."""
|
254
283
|
|
255
284
|
def test_nanosleep(self):
|
256
285
|
def use_nanosleep(x):
|
@@ -262,15 +291,20 @@ class TestCompileOnlyTests(unittest.TestCase):
|
|
262
291
|
ptx, resty = compile_ptx(use_nanosleep, (uint32,), cc=(7, 0))
|
263
292
|
|
264
293
|
nanosleep_count = 0
|
265
|
-
for line in ptx.split(
|
266
|
-
if
|
294
|
+
for line in ptx.split("\n"):
|
295
|
+
if "nanosleep.u32" in line:
|
267
296
|
nanosleep_count += 1
|
268
297
|
|
269
298
|
expected = 2
|
270
|
-
self.assertEqual(
|
271
|
-
|
272
|
-
|
299
|
+
self.assertEqual(
|
300
|
+
expected,
|
301
|
+
nanosleep_count,
|
302
|
+
(
|
303
|
+
f"Got {nanosleep_count} nanosleep instructions, "
|
304
|
+
f"expected {expected}"
|
305
|
+
),
|
306
|
+
)
|
273
307
|
|
274
308
|
|
275
|
-
if __name__ ==
|
309
|
+
if __name__ == "__main__":
|
276
310
|
unittest.main()
|