numba-cuda 0.0.1__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.13.dist-info/LICENSE +25 -0
- numba_cuda-0.0.13.dist-info/METADATA +69 -0
- numba_cuda-0.0.13.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
from numba import vectorize
|
2
|
+
from numba import cuda, float32
|
3
|
+
import numpy as np
|
4
|
+
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
|
5
|
+
import unittest
|
6
|
+
|
7
|
+
|
8
|
+
@skip_on_cudasim('ufunc API unsupported in the simulator')
class TestCudaVectorizeDeviceCall(CUDATestCase):
    """Check a CUDA ufunc whose body forwards to a CUDA device function."""

    def test_cuda_vectorize_device_call(self):
        # The device function and the ufunc are compiled for the same
        # (float32, float32, float32) -> float32 signature.
        signature = float32(float32, float32, float32)

        @cuda.jit(signature, device=True)
        def cu_device_fn(x, y, z):
            return x ** y / z

        def cu_ufunc(x, y, z):
            return cu_device_fn(x, y, z)

        build_ufunc = vectorize([signature], target='cuda')
        ufunc = build_ufunc(cu_ufunc)

        N = 100

        # Draw the three operands in the same order as before; Z is shifted
        # by 0.1 so the divisor is never zero.
        X = np.array(np.random.sample(N), dtype=np.float32)
        Y = np.array(np.random.sample(N), dtype=np.float32)
        Z = np.array(np.random.sample(N), dtype=np.float32) + 0.1

        # Reference result computed on the host with NumPy.
        gold = (X ** Y) / Z
        out = ufunc(X, Y, Z)

        matches = np.allclose(out, gold)
        self.assertTrue(matches)
|
33
|
+
|
34
|
+
|
35
|
+
if __name__ == "__main__":
    # Run this module's tests when it is executed as a script.
    unittest.main()
|
@@ -0,0 +1,37 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from numba import vectorize
|
3
|
+
from numba import cuda, float64
|
4
|
+
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
|
5
|
+
import unittest
|
6
|
+
|
7
|
+
# Signature list handed to `vectorize` by the tests below:
# a float64 result computed from two float64 operands.
sig = [float64(float64, float64)]
|
8
|
+
|
9
|
+
|
10
|
+
@skip_on_cudasim('ufunc API unsupported in the simulator')
class TestCUDAVectorizeScalarArg(CUDATestCase):
    """Call a CUDA ufunc with scalar operands."""

    def test_vectorize_scalar_arg(self):
        # One scalar operand combined with a device array operand.
        @vectorize(sig, target='cuda')
        def vector_add(a, b):
            return a + b

        host_values = np.arange(10, dtype=np.float64)
        device_values = cuda.to_device(host_values)
        result = vector_add(1.0, device_values)

        expected = np.arange(1, 11, dtype=np.float64)
        np.testing.assert_array_almost_equal(result.copy_to_host(), expected)

    def test_vectorize_all_scalars(self):
        # Both operands are scalars.
        @vectorize(sig, target='cuda')
        def vector_add(a, b):
            return a + b

        result = vector_add(1.0, 1.0)

        np.testing.assert_almost_equal(2.0, result)
|
34
|
+
|
35
|
+
|
36
|
+
if __name__ == "__main__":
    # Run this module's tests when it is executed as a script.
    unittest.main()
|
@@ -0,0 +1,139 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from numba import cuda
|
3
|
+
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
|
4
|
+
from numba.tests.support import linux_only, override_config
|
5
|
+
from numba.core.errors import NumbaPerformanceWarning
|
6
|
+
import warnings
|
7
|
+
|
8
|
+
|
9
|
+
@skip_on_cudasim('cudasim does not raise performance warnings')
class TestWarnings(CUDATestCase):
    """Checks on the warnings issued, or withheld, when using CUDA kernels."""

    def test_inefficient_launch_configuration(self):
        # A [1, 1] launch with occupancy warnings switched on must warn.
        @cuda.jit
        def kernel():
            pass

        with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1), \
                warnings.catch_warnings(record=True) as caught:
            kernel[1, 1]()

        first = caught[0]
        self.assertEqual(first.category, NumbaPerformanceWarning)
        text = str(first.message)
        self.assertIn('Grid size', text)
        self.assertIn('low occupancy', text)

    def test_efficient_launch_configuration(self):
        # A [256, 256] launch must not record any warning.
        @cuda.jit
        def kernel():
            pass

        with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1), \
                warnings.catch_warnings(record=True) as caught:
            kernel[256, 256]()

        self.assertEqual(len(caught), 0)

    def test_warn_on_host_array(self):
        # Passing a plain NumPy array with implicit-copy warnings enabled
        # must produce a performance warning.
        @cuda.jit
        def foo(r, x):
            r[0] = x + 1

        count = 10
        host_ary = np.zeros(count, dtype=np.float32)

        with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1), \
                warnings.catch_warnings(record=True) as caught:
            foo[1, count](host_ary, count)

        first = caught[0]
        self.assertEqual(first.category, NumbaPerformanceWarning)
        text = str(first.message)
        self.assertIn('Host array used in CUDA kernel will incur', text)
        self.assertIn('copy overhead', text)

    def test_pinned_warn_on_host_array(self):
        # Same expectation as for a plain host array, using a pinned array.
        @cuda.jit
        def foo(r, x):
            r[0] = x + 1

        count = 10
        pinned = cuda.pinned_array(count, dtype=np.float32)

        with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1), \
                warnings.catch_warnings(record=True) as caught:
            foo[1, count](pinned, count)

        first = caught[0]
        self.assertEqual(first.category, NumbaPerformanceWarning)
        text = str(first.message)
        self.assertIn('Host array used in CUDA kernel will incur', text)
        self.assertIn('copy overhead', text)

    def test_nowarn_on_mapped_array(self):
        # A mapped array must not record any warning.
        @cuda.jit
        def foo(r, x):
            r[0] = x + 1

        count = 10
        mapped = cuda.mapped_array(count, dtype=np.float32)

        with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1), \
                warnings.catch_warnings(record=True) as caught:
            foo[1, count](mapped, count)

        self.assertEqual(len(caught), 0)

    @linux_only
    def test_nowarn_on_managed_array(self):
        # A managed array must not record any warning.
        @cuda.jit
        def foo(r, x):
            r[0] = x + 1

        count = 10
        managed = cuda.managed_array(count, dtype=np.float32)

        with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1), \
                warnings.catch_warnings(record=True) as caught:
            foo[1, count](managed, count)

        self.assertEqual(len(caught), 0)

    def test_nowarn_on_device_array(self):
        # A device array must not record any warning.
        @cuda.jit
        def foo(r, x):
            r[0] = x + 1

        count = 10
        device = cuda.device_array(count, dtype=np.float32)

        with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1), \
                warnings.catch_warnings(record=True) as caught:
            foo[1, count](device, count)

        self.assertEqual(len(caught), 0)

    def test_warn_on_debug_and_opt(self):
        # Requesting debug together with opt must record exactly one warning.
        with warnings.catch_warnings(record=True) as caught:
            cuda.jit(debug=True, opt=True)

        self.assertEqual(len(caught), 1)
        self.assertIn('not supported by CUDA', str(caught[0].message))

    def test_warn_on_debug_and_opt_default(self):
        # Requesting debug alone must also record exactly one warning.
        with warnings.catch_warnings(record=True) as caught:
            cuda.jit(debug=True)

        self.assertEqual(len(caught), 1)
        self.assertIn('not supported by CUDA', str(caught[0].message))

    def test_no_warn_on_debug_and_no_opt(self):
        # Debug with opt explicitly disabled must stay silent.
        with warnings.catch_warnings(record=True) as caught:
            cuda.jit(debug=True, opt=False)

        self.assertEqual(len(caught), 0)

    def test_no_warn_with_no_debug_and_opt_kwargs(self):
        # Passing neither keyword must stay silent.
        with warnings.catch_warnings(record=True) as caught:
            cuda.jit()

        self.assertEqual(len(caught), 0)
|
136
|
+
|
137
|
+
|
138
|
+
if __name__ == "__main__":
    # Run this module's tests when it is executed as a script.
    unittest.main()
|
@@ -0,0 +1,276 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from numba import cuda, int32, int64, float32, float64
|
3
|
+
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
|
4
|
+
from numba.core import config
|
5
|
+
|
6
|
+
|
7
|
+
def useful_syncwarp(ary):
    """Have thread 0 write 42 to ``ary[0]``, then copy it to every slot.

    A full-mask ``cuda.syncwarp`` sits between the write by thread 0 and
    the read of ``ary[0]`` performed by every thread.
    """
    tid = cuda.grid(1)
    if tid == 0:
        ary[0] = 42
    cuda.syncwarp(0xffffffff)
    ary[tid] = ary[0]
|
13
|
+
|
14
|
+
|
15
|
+
def use_shfl_sync_idx(ary, idx):
    """Store ``cuda.shfl_sync`` of the thread's grid index in ``ary``.

    The shuffle uses the full mask ``0xffffffff`` and ``idx`` as its third
    argument.
    """
    tid = cuda.grid(1)
    ary[tid] = cuda.shfl_sync(0xffffffff, tid, idx)
|
19
|
+
|
20
|
+
|
21
|
+
def use_shfl_sync_up(ary, delta):
    """Store ``cuda.shfl_up_sync`` of the thread's grid index in ``ary``.

    The shuffle uses the full mask ``0xffffffff`` and ``delta`` as its
    third argument.
    """
    tid = cuda.grid(1)
    ary[tid] = cuda.shfl_up_sync(0xffffffff, tid, delta)
|
25
|
+
|
26
|
+
|
27
|
+
def use_shfl_sync_down(ary, delta):
    """Store ``cuda.shfl_down_sync`` of the thread's grid index in ``ary``.

    The shuffle uses the full mask ``0xffffffff`` and ``delta`` as its
    third argument.
    """
    tid = cuda.grid(1)
    ary[tid] = cuda.shfl_down_sync(0xffffffff, tid, delta)
|
31
|
+
|
32
|
+
|
33
|
+
def use_shfl_sync_xor(ary, xor):
    """Store ``cuda.shfl_xor_sync`` of the thread's grid index in ``ary``.

    The shuffle uses the full mask ``0xffffffff`` and ``xor`` as its third
    argument.
    """
    tid = cuda.grid(1)
    ary[tid] = cuda.shfl_xor_sync(0xffffffff, tid, xor)
|
37
|
+
|
38
|
+
|
39
|
+
def use_shfl_sync_with_val(ary, into):
    """Store ``cuda.shfl_sync(0xffffffff, into, 0)`` in this thread's slot.

    Unlike the index-based kernels, the shuffled value is the ``into``
    argument rather than the thread's grid index.
    """
    tid = cuda.grid(1)
    ary[tid] = cuda.shfl_sync(0xffffffff, into, 0)
|
43
|
+
|
44
|
+
|
45
|
+
def use_vote_sync_all(ary_in, ary_out):
    """Write ``cuda.all_sync`` of this thread's input to ``ary_out``.

    The vote is taken with the full mask ``0xffffffff`` over ``ary_in[i]``.
    """
    tid = cuda.grid(1)
    ary_out[tid] = cuda.all_sync(0xffffffff, ary_in[tid])
|
49
|
+
|
50
|
+
|
51
|
+
def use_vote_sync_any(ary_in, ary_out):
    """Write ``cuda.any_sync`` of this thread's input to ``ary_out``.

    The vote is taken with the full mask ``0xffffffff`` over ``ary_in[i]``.
    """
    tid = cuda.grid(1)
    ary_out[tid] = cuda.any_sync(0xffffffff, ary_in[tid])
|
55
|
+
|
56
|
+
|
57
|
+
def use_vote_sync_eq(ary_in, ary_out):
    """Write ``cuda.eq_sync`` of this thread's input to ``ary_out``.

    The vote is taken with the full mask ``0xffffffff`` over ``ary_in[i]``.
    """
    tid = cuda.grid(1)
    ary_out[tid] = cuda.eq_sync(0xffffffff, ary_in[tid])
|
61
|
+
|
62
|
+
|
63
|
+
def use_vote_sync_ballot(ary):
    """Store ``cuda.ballot_sync(0xffffffff, True)`` at index ``threadIdx.x``."""
    lane = cuda.threadIdx.x
    ary[lane] = cuda.ballot_sync(0xffffffff, True)
|
67
|
+
|
68
|
+
|
69
|
+
def use_match_any_sync(ary_in, ary_out):
    """Write ``cuda.match_any_sync`` of this thread's input to ``ary_out``.

    The match is taken with the full mask ``0xffffffff`` over ``ary_in[i]``.
    """
    tid = cuda.grid(1)
    ary_out[tid] = cuda.match_any_sync(0xffffffff, ary_in[tid])
|
73
|
+
|
74
|
+
|
75
|
+
def use_match_all_sync(ary_in, ary_out):
    """Write the ``cuda.match_all_sync`` ballot, or 0, to ``ary_out``.

    ``match_all_sync`` returns a ``(ballot, pred)`` pair; the ballot is
    stored only when ``pred`` is truthy, otherwise 0 is stored.
    """
    tid = cuda.grid(1)
    mask, agreed = cuda.match_all_sync(0xffffffff, ary_in[tid])
    if agreed:
        ary_out[tid] = mask
    else:
        ary_out[tid] = 0
|
79
|
+
|
80
|
+
|
81
|
+
def use_independent_scheduling(arr):
    """Ballot within four interleaved groups of threads.

    Threads are split by ``threadIdx.x % 4``; each group calls
    ``cuda.ballot_sync`` from its own branch with its own mask, and the
    result is stored at index ``threadIdx.x``.
    """
    lane = cuda.threadIdx.x
    group = lane % 4
    if group == 0:
        ballot = cuda.ballot_sync(0x11111111, True)
    elif group == 1:
        ballot = cuda.ballot_sync(0x22222222, True)
    elif group == 2:
        ballot = cuda.ballot_sync(0x44444444, True)
    elif group == 3:
        ballot = cuda.ballot_sync(0x88888888, True)
    arr[lane] = ballot
|
92
|
+
|
93
|
+
|
94
|
+
def _safe_cc_check(cc):
    """Return whether the current device's compute capability is >= ``cc``.

    When ``config.ENABLE_CUDASIM`` is set the device is not queried and
    ``True`` is returned.
    """
    if config.ENABLE_CUDASIM:
        return True
    device = cuda.get_current_device()
    return device.compute_capability >= cc
|
99
|
+
|
100
|
+
|
101
|
+
@skip_on_cudasim("Warp Operations are not yet implemented on cudasim")
class TestCudaWarpOperations(CUDATestCase):
    """Launch each warp-level kernel on a single block and check its output."""

    def test_useful_syncwarp(self):
        kernel = cuda.jit("void(int32[:])")(useful_syncwarp)
        nelem = 32
        result = np.empty(nelem, dtype=np.int32)
        kernel[1, nelem](result)
        self.assertTrue(np.all(result == 42))

    def test_shfl_sync_idx(self):
        kernel = cuda.jit("void(int32[:], int32)")(use_shfl_sync_idx)
        nelem = 32
        idx = 4
        result = np.empty(nelem, dtype=np.int32)
        kernel[1, nelem](result, idx)
        self.assertTrue(np.all(result == idx))

    def test_shfl_sync_up(self):
        kernel = cuda.jit("void(int32[:], int32)")(use_shfl_sync_up)
        nelem = 32
        delta = 4
        # Expected: the first `delta` entries keep their own index, the
        # rest hold their index minus `delta`.
        exp = np.arange(nelem, dtype=np.int32)
        exp[delta:] -= delta
        result = np.empty(nelem, dtype=np.int32)
        kernel[1, nelem](result, delta)
        self.assertTrue(np.all(result == exp))

    def test_shfl_sync_down(self):
        kernel = cuda.jit("void(int32[:], int32)")(use_shfl_sync_down)
        nelem = 32
        delta = 4
        # Expected: the last `delta` entries keep their own index, the
        # rest hold their index plus `delta`.
        exp = np.arange(nelem, dtype=np.int32)
        exp[:-delta] += delta
        result = np.empty(nelem, dtype=np.int32)
        kernel[1, nelem](result, delta)
        self.assertTrue(np.all(result == exp))

    def test_shfl_sync_xor(self):
        kernel = cuda.jit("void(int32[:], int32)")(use_shfl_sync_xor)
        nelem = 32
        xor = 16
        exp = np.arange(nelem, dtype=np.int32) ^ xor
        result = np.empty(nelem, dtype=np.int32)
        kernel[1, nelem](result, xor)
        self.assertTrue(np.all(result == exp))

    def test_shfl_sync_types(self):
        # Each Numba type is paired with one NumPy scalar of that type.
        cases = (
            (int32, np.int32(-1)),
            (int64, np.int64(1 << 42)),
            (float32, np.float32(np.pi)),
            (float64, np.float64(np.pi)),
        )
        for typ, val in cases:
            kernel = cuda.jit((typ[:], typ))(use_shfl_sync_with_val)
            nelem = 32
            result = np.empty(nelem, dtype=val.dtype)
            kernel[1, nelem](result, val)
            self.assertTrue(np.all(result == val))

    def test_vote_sync_all(self):
        kernel = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_all)
        nelem = 32
        ary_in = np.ones(nelem, dtype=np.int32)
        ary_out = np.empty(nelem, dtype=np.int32)

        # Every input is 1.
        kernel[1, nelem](ary_in, ary_out)
        self.assertTrue(np.all(ary_out == 1))

        # A single input is 0.
        ary_in[-1] = 0
        kernel[1, nelem](ary_in, ary_out)
        self.assertTrue(np.all(ary_out == 0))

    def test_vote_sync_any(self):
        kernel = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_any)
        nelem = 32
        ary_in = np.zeros(nelem, dtype=np.int32)
        ary_out = np.empty(nelem, dtype=np.int32)

        # Every input is 0.
        kernel[1, nelem](ary_in, ary_out)
        self.assertTrue(np.all(ary_out == 0))

        # Two inputs are 1.
        ary_in[2] = 1
        ary_in[5] = 1
        kernel[1, nelem](ary_in, ary_out)
        self.assertTrue(np.all(ary_out == 1))

    def test_vote_sync_eq(self):
        kernel = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_eq)
        nelem = 32
        ary_in = np.zeros(nelem, dtype=np.int32)
        ary_out = np.empty(nelem, dtype=np.int32)

        # Every input is 0.
        kernel[1, nelem](ary_in, ary_out)
        self.assertTrue(np.all(ary_out == 1))

        # One input differs from the others.
        ary_in[1] = 1
        kernel[1, nelem](ary_in, ary_out)
        self.assertTrue(np.all(ary_out == 0))

        # Every input is 1.
        ary_in[:] = 1
        kernel[1, nelem](ary_in, ary_out)
        self.assertTrue(np.all(ary_out == 1))

    def test_vote_sync_ballot(self):
        kernel = cuda.jit("void(uint32[:])")(use_vote_sync_ballot)
        nelem = 32
        result = np.empty(nelem, dtype=np.uint32)
        kernel[1, nelem](result)
        self.assertTrue(np.all(result == np.uint32(0xffffffff)))

    @unittest.skipUnless(_safe_cc_check((7, 0)),
                         "Matching requires at least Volta Architecture")
    def test_match_any_sync(self):
        kernel = cuda.jit("void(int32[:], int32[:])")(use_match_any_sync)
        nelem = 10
        # Inputs alternate 0, 1, 0, 1, ...; the expected masks alternate
        # between the even-position bits and the odd-position bits.
        ary_in = np.arange(nelem, dtype=np.int32) % 2
        ary_out = np.empty(nelem, dtype=np.int32)
        exp = np.tile((0b0101010101, 0b1010101010), 5)
        kernel[1, nelem](ary_in, ary_out)
        self.assertTrue(np.all(ary_out == exp))

    @unittest.skipUnless(_safe_cc_check((7, 0)),
                         "Matching requires at least Volta Architecture")
    def test_match_all_sync(self):
        kernel = cuda.jit("void(int32[:], int32[:])")(use_match_all_sync)
        nelem = 10
        ary_in = np.zeros(nelem, dtype=np.int32)
        ary_out = np.empty(nelem, dtype=np.int32)

        # Every input is 0: all ten low bits are expected to be set.
        kernel[1, nelem](ary_in, ary_out)
        self.assertTrue(np.all(ary_out == 0b1111111111))

        # One input differs: the kernel is expected to store 0 everywhere.
        ary_in[1] = 4
        kernel[1, nelem](ary_in, ary_out)
        self.assertTrue(np.all(ary_out == 0))

    @unittest.skipUnless(
        _safe_cc_check((7, 0)),
        "Independent scheduling requires at least Volta Architecture")
    def test_independent_scheduling(self):
        kernel = cuda.jit("void(uint32[:])")(use_independent_scheduling)
        result = np.empty(32, dtype=np.uint32)
        exp = np.tile((0x11111111, 0x22222222, 0x44444444, 0x88888888), 8)
        kernel[1, 32](result)
        self.assertTrue(np.all(result == exp))

    def test_activemask(self):
        @cuda.jit
        def use_activemask(x):
            tid = cuda.grid(1)
            # The identical call is kept in each branch on purpose: the
            # expected values below differ with the parity of the thread.
            if (tid % 2) == 0:
                # Even numbered threads fill in even numbered array
                # entries with binary "...01010101"
                x[tid] = cuda.activemask()
            else:
                # Odd numbered threads fill in odd numbered array entries
                # with binary "...10101010"
                x[tid] = cuda.activemask()

        out = np.zeros(32, dtype=np.uint32)
        use_activemask[1, 32](out)

        # 0x5 = 0101: The pattern from even-numbered threads
        # 0xA = 1010: The pattern from odd-numbered threads
        expected = np.tile((0x55555555, 0xAAAAAAAA), 16)
        np.testing.assert_equal(expected, out)

    def test_lanemask_lt(self):
        @cuda.jit
        def use_lanemask_lt(x):
            tid = cuda.grid(1)
            x[tid] = cuda.lanemask_lt()

        out = np.zeros(32, dtype=np.uint32)
        use_lanemask_lt[1, 32](out)

        # A string of 1s that grows from the LSB for each entry:
        # 0, 1, 3, 7, F, 1F, 3F, 7F, FF, 1FF, etc.
        # or in binary:
        # ...0000, ...0001, ...0011, ...0111, etc.
        expected = np.asarray([(1 << lane) - 1 for lane in range(32)],
                              dtype=np.uint32)
        np.testing.assert_equal(expected, out)
|
273
|
+
|
274
|
+
|
275
|
+
if __name__ == "__main__":
    # Run this module's tests when it is executed as a script.
    unittest.main()
|
@@ -0,0 +1,102 @@
|
|
1
|
+
import threading
|
2
|
+
|
3
|
+
import numpy as np
|
4
|
+
|
5
|
+
from numba import cuda
|
6
|
+
from numba.cuda.testing import CUDATestCase, skip_unless_cudasim
|
7
|
+
import numba.cuda.simulator as simulator
|
8
|
+
import unittest
|
9
|
+
|
10
|
+
|
11
|
+
class TestCudaSimIssues(CUDATestCase):
|
12
|
+
def test_record_access(self):
    # Write to array fields of a record, including one inside a nested
    # record, from within a kernel and check the values on the host.
    backyard_fields = [
        ('statue', np.float64),
        ('newspaper', np.float64, (6,)),
    ]
    goose_fields = [
        ('garden', np.float64, (12,)),
        ('town', np.float64, (42,)),
        ('backyard', backyard_fields),
    ]
    goose_dtype = np.dtype(goose_fields, align=True)

    @cuda.jit
    def simple_kernel(f):
        f.garden[0] = 45.0
        f.backyard.newspaper[3] = 2.0
        # Read the element back and add to it: 2.0 + 3.0 is checked below.
        f.backyard.newspaper[3] = f.backyard.newspaper[3] + 3.0

    records = np.recarray(1, dtype=goose_dtype)
    simple_kernel[1, 1](records[0])

    np.testing.assert_equal(records[0]['garden'][0], 45)
    np.testing.assert_equal(records[0]['backyard']['newspaper'][3], 5)
|
32
|
+
|
33
|
+
def test_recarray_setting(self):
|
34
|
+
recordwith2darray = np.dtype([('i', np.int32),
|
35
|
+
('j', np.float32, (3, 2))])
|
36
|
+
rec = np.recarray(2, dtype=recordwith2darray)
|
37
|
+
rec[0]['i'] = 45
|
38
|
+
|
39
|
+
@cuda.jit
|
40
|
+
def simple_kernel(f):
|
41
|
+
f[1] = f[0]
|
42
|
+
simple_kernel[1, 1](rec)
|
43
|
+
np.testing.assert_equal(rec[0]['i'], rec[1]['i'])
|
44
|
+
|
45
|
+
def test_cuda_module_in_device_function(self):
|
46
|
+
"""
|
47
|
+
Discovered in https://github.com/numba/numba/issues/1837.
|
48
|
+
When the `cuda` module is referenced in a device function,
|
49
|
+
it does not have the kernel API (e.g. cuda.threadIdx, cuda.shared)
|
50
|
+
"""
|
51
|
+
from numba.cuda.tests.cudasim import support
|
52
|
+
|
53
|
+
inner = support.cuda_module_in_device_function
|
54
|
+
|
55
|
+
@cuda.jit
|
56
|
+
def outer(out):
|
57
|
+
tid = inner()
|
58
|
+
if tid < out.size:
|
59
|
+
out[tid] = tid
|
60
|
+
|
61
|
+
arr = np.zeros(10, dtype=np.int32)
|
62
|
+
outer[1, 11](arr)
|
63
|
+
expected = np.arange(arr.size, dtype=np.int32)
|
64
|
+
np.testing.assert_equal(expected, arr)
|
65
|
+
|
66
|
+
@skip_unless_cudasim('Only works on CUDASIM')
|
67
|
+
def test_deadlock_on_exception(self):
|
68
|
+
def assert_no_blockthreads():
|
69
|
+
blockthreads = []
|
70
|
+
for t in threading.enumerate():
|
71
|
+
if not isinstance(t, simulator.kernel.BlockThread):
|
72
|
+
continue
|
73
|
+
|
74
|
+
# join blockthreads with a short timeout to allow aborted
|
75
|
+
# threads to exit
|
76
|
+
t.join(1)
|
77
|
+
if t.is_alive():
|
78
|
+
self.fail("Blocked kernel thread: %s" % t)
|
79
|
+
|
80
|
+
self.assertListEqual(blockthreads, [])
|
81
|
+
|
82
|
+
@simulator.jit
|
83
|
+
def assign_with_sync(x, y):
|
84
|
+
i = cuda.grid(1)
|
85
|
+
y[i] = x[i]
|
86
|
+
|
87
|
+
cuda.syncthreads()
|
88
|
+
cuda.syncthreads()
|
89
|
+
|
90
|
+
x = np.arange(3)
|
91
|
+
y = np.empty(3)
|
92
|
+
assign_with_sync[1, 3](x, y)
|
93
|
+
np.testing.assert_array_equal(x, y)
|
94
|
+
assert_no_blockthreads()
|
95
|
+
|
96
|
+
with self.assertRaises(IndexError):
|
97
|
+
assign_with_sync[1, 6](x, y)
|
98
|
+
assert_no_blockthreads()
|
99
|
+
|
100
|
+
|
101
|
+
# Run the tests in this module when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
File without changes
|
@@ -0,0 +1,5 @@
|
|
1
|
+
// Not all CUDA includes are safe to include in device code compiled by NVRTC,
|
2
|
+
// because it does not have paths to all system include directories. Headers
|
3
|
+
// such as cuda_device_runtime_api.h are safe to use in NVRTC without adding
|
4
|
+
// additional includes.
|
5
|
+
#include <cuda_device_runtime_api.h>
|
@@ -0,0 +1,23 @@
|
|
1
|
+
// Compile with:
|
2
|
+
//
|
3
|
+
// nvcc -gencode arch=compute_50,code=compute_50 -rdc true -ptx jitlink.cu
|
4
|
+
//
|
5
|
+
// using the oldest supported toolkit version (10.2 at the time of writing).
|
6
|
+
|
7
|
+
// Writes twice the value of a through out and returns 0.
extern "C" __device__
int bar(int *out, int a)
{
    const int doubled = a * 2;
    out[0] = doubled;
    return 0;
}
|
13
|
+
|
14
|
+
|
15
|
+
// Numba's CUDA calling convention always reserves the first parameter for a
// pointer to the returned value, even when nothing is returned, so the out
// argument has to be declared here although it is never used.
//
// Copies the second element of a over its first element and returns 0.
extern "C" __device__
int array_mutator(void *out, int *a)
{
    const int *second = a + 1;
    *a = *second;
    return 0;
}
|
@@ -0,0 +1,51 @@
|
|
1
|
+
//
|
2
|
+
// Generated by NVIDIA NVVM Compiler
|
3
|
+
//
|
4
|
+
// Compiler Build ID: CL-27506705
|
5
|
+
// Cuda compilation tools, release 10.2, V10.2.89
|
6
|
+
// Based on LLVM 3.4svn
|
7
|
+
//
|
8
|
+
|
9
|
+
.version 6.5
|
10
|
+
.target sm_50
|
11
|
+
.address_size 64
|
12
|
+
|
13
|
+
// .globl bar
|
14
|
+
|
15
|
+
// bar: stores (bar_param_1 << 1) at the address in bar_param_0, returns 0.
.visible .func (.param .b32 func_retval0) bar(
|
16
|
+
.param .b64 bar_param_0, // 64-bit address that receives the result
|
17
|
+
.param .b32 bar_param_1 // 32-bit input value
|
18
|
+
)
|
19
|
+
{
|
20
|
+
.reg .b32 %r<4>;
|
21
|
+
.reg .b64 %rd<2>;
|
22
|
+
|
23
|
+
|
24
|
+
ld.param.u64 %rd1, [bar_param_0]; // %rd1 = destination address
|
25
|
+
ld.param.u32 %r1, [bar_param_1]; // %r1 = input value
|
26
|
+
shl.b32 %r2, %r1, 1; // %r2 = %r1 shifted left by one bit (input * 2)
|
27
|
+
st.u32 [%rd1], %r2; // write the doubled value to the destination
|
28
|
+
mov.u32 %r3, 0;
|
29
|
+
st.param.b32 [func_retval0+0], %r3; // return value is 0
|
30
|
+
ret;
|
31
|
+
}
|
32
|
+
|
33
|
+
// .globl array_mutator
|
34
|
+
// array_mutator: copies the 32-bit word at [array_mutator_param_1 + 4] to
// [array_mutator_param_1] and returns 0; array_mutator_param_0 is not read.
.visible .func (.param .b32 func_retval0) array_mutator(
|
35
|
+
.param .b64 array_mutator_param_0, // never loaded in the body below
|
36
|
+
.param .b64 array_mutator_param_1 // 64-bit address of the data to modify
|
37
|
+
)
|
38
|
+
{
|
39
|
+
.reg .b32 %r<3>;
|
40
|
+
.reg .b64 %rd<2>;
|
41
|
+
|
42
|
+
|
43
|
+
ld.param.u64 %rd1, [array_mutator_param_1]; // %rd1 = base address
|
44
|
+
ld.u32 %r1, [%rd1+4]; // %r1 = 32-bit word at byte offset 4
|
45
|
+
st.u32 [%rd1], %r1; // overwrite the word at byte offset 0 with it
|
46
|
+
mov.u32 %r2, 0;
|
47
|
+
st.param.b32 [func_retval0+0], %r2; // return value is 0
|
48
|
+
ret;
|
49
|
+
}
|
50
|
+
|
51
|
+
|