numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff shows the content changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudapy/test_matmul.py
@@ -0,0 +1,74 @@
+import numpy as np
+
+from numba import cuda, float32, void
+from numba.cuda.testing import unittest, CUDATestCase
+from numba.core import config
+
+# Ensure the test takes a reasonable amount of time in the simulator
+if config.ENABLE_CUDASIM:
+    bpg, tpb = 2, 8
+else:
+    bpg, tpb = 50, 32
+
+n = bpg * tpb
+SM_SIZE = (tpb, tpb)
+
+
+class TestCudaMatMul(CUDATestCase):
+
+    def test_func(self):
+
+        @cuda.jit(void(float32[:, ::1], float32[:, ::1], float32[:, ::1]))
+        def cu_square_matrix_mul(A, B, C):
+            sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
+            sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)
+
+            tx = cuda.threadIdx.x
+            ty = cuda.threadIdx.y
+            bx = cuda.blockIdx.x
+            by = cuda.blockIdx.y
+            bw = cuda.blockDim.x
+            bh = cuda.blockDim.y
+
+            x = tx + bx * bw
+            y = ty + by * bh
+
+            acc = float32(0)  # forces all the math to be f32
+            for i in range(bpg):
+                if x < n and y < n:
+                    sA[ty, tx] = A[y, tx + i * tpb]
+                    sB[ty, tx] = B[ty + i * tpb, x]
+
+                cuda.syncthreads()
+
+                if x < n and y < n:
+                    for j in range(tpb):
+                        acc += sA[ty, j] * sB[j, tx]
+
+                cuda.syncthreads()
+
+            if x < n and y < n:
+                C[y, x] = acc
+
+        np.random.seed(42)
+        A = np.array(np.random.random((n, n)), dtype=np.float32)
+        B = np.array(np.random.random((n, n)), dtype=np.float32)
+        C = np.empty_like(A)
+
+        stream = cuda.stream()
+        with stream.auto_synchronize():
+            dA = cuda.to_device(A, stream)
+            dB = cuda.to_device(B, stream)
+            dC = cuda.to_device(C, stream)
+            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
+            dC.copy_to_host(C, stream)
+
+        # Host compute
+        Cans = np.dot(A, B)
+
+        # Check result
+        np.testing.assert_allclose(C, Cans, rtol=1e-5)
+
+
+if __name__ == '__main__':
+    unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_minmax.py
@@ -0,0 +1,113 @@
+import numpy as np
+
+from numba import cuda, float64
+from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
+
+
+def builtin_max(A, B, C):
+    i = cuda.grid(1)
+
+    if i >= len(C):
+        return
+
+    C[i] = float64(max(A[i], B[i]))
+
+
+def builtin_min(A, B, C):
+    i = cuda.grid(1)
+
+    if i >= len(C):
+        return
+
+    C[i] = float64(min(A[i], B[i]))
+
+
+@skip_on_cudasim('Tests PTX emission')
+class TestCudaMinMax(CUDATestCase):
+    def _run(
+            self,
+            kernel,
+            numpy_equivalent,
+            ptx_instruction,
+            dtype_left,
+            dtype_right,
+            n=5):
+        kernel = cuda.jit(kernel)
+
+        c = np.zeros(n, dtype=np.float64)
+        a = np.arange(n, dtype=dtype_left) + .5
+        b = np.full(n, fill_value=2, dtype=dtype_right)
+
+        kernel[1, c.shape](a, b, c)
+        np.testing.assert_allclose(c, numpy_equivalent(a, b))
+
+        ptx = next(p for p in kernel.inspect_asm().values())
+        self.assertIn(ptx_instruction, ptx)
+
+    def test_max_f8f8(self):
+        self._run(
+            builtin_max,
+            np.maximum,
+            'max.f64',
+            np.float64,
+            np.float64)
+
+    def test_max_f4f8(self):
+        self._run(
+            builtin_max,
+            np.maximum,
+            'max.f64',
+            np.float32,
+            np.float64)
+
+    def test_max_f8f4(self):
+        self._run(
+            builtin_max,
+            np.maximum,
+            'max.f64',
+            np.float64,
+            np.float32)
+
+    def test_max_f4f4(self):
+        self._run(
+            builtin_max,
+            np.maximum,
+            'max.f32',
+            np.float32,
+            np.float32)
+
+    def test_min_f8f8(self):
+        self._run(
+            builtin_min,
+            np.minimum,
+            'min.f64',
+            np.float64,
+            np.float64)
+
+    def test_min_f4f8(self):
+        self._run(
+            builtin_min,
+            np.minimum,
+            'min.f64',
+            np.float32,
+            np.float64)
+
+    def test_min_f8f4(self):
+        self._run(
+            builtin_min,
+            np.minimum,
+            'min.f64',
+            np.float64,
+            np.float32)
+
+    def test_min_f4f4(self):
+        self._run(
+            builtin_min,
+            np.minimum,
+            'min.f32',
+            np.float32,
+            np.float32)
+
+
+if __name__ == '__main__':
+    unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py
@@ -0,0 +1,22 @@
+import math
+from numba import cuda
+from numba.cuda.testing import unittest, CUDATestCase
+
+
+class TestCudaMonteCarlo(CUDATestCase):
+    def test_montecarlo(self):
+        """Just make sure we can compile this
+        """
+
+        @cuda.jit(
+            'void(double[:], double[:], double, double, double, double[:])')
+        def step(last, paths, dt, c0, c1, normdist):
+            i = cuda.grid(1)
+            if i >= paths.shape[0]:
+                return
+            noise = normdist[i]
+            paths[i] = last[i] * math.exp(c0 * dt + c1 * noise)
+
+
+if __name__ == '__main__':
+    unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py
@@ -0,0 +1,140 @@
+from numba import cuda
+import numpy as np
+from numba.cuda.testing import skip_on_cudasim, CUDATestCase
+import threading
+import unittest
+
+
+class TestMultiGPUContext(CUDATestCase):
+    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
+    def test_multigpu_context(self):
+        @cuda.jit("void(float64[:], float64[:])")
+        def copy_plus_1(inp, out):
+            i = cuda.grid(1)
+            if i < out.size:
+                out[i] = inp[i] + 1
+
+        def check(inp, out):
+            np.testing.assert_equal(inp + 1, out)
+
+        N = 32
+        A = np.arange(N, dtype=np.float64)
+        B = np.arange(N, dtype=np.float64)
+
+        with cuda.gpus[0]:
+            copy_plus_1[1, N](A, B)
+
+        check(A, B)
+
+        copy_plus_1[1, N](A, B)
+        check(A, B)
+
+        with cuda.gpus[0]:
+            A0 = np.arange(N, dtype=np.float64)
+            B0 = np.arange(N, dtype=np.float64)
+            copy_plus_1[1, N](A0, B0)
+
+            with cuda.gpus[1]:
+                A1 = np.arange(N, dtype=np.float64)
+                B1 = np.arange(N, dtype=np.float64)
+                copy_plus_1[1, N](A1, B1)
+
+            check(A0, B0)
+            check(A1, B1)
+
+            A = np.arange(N, dtype=np.float64)
+            B = np.arange(N, dtype=np.float64)
+            copy_plus_1[1, N](A, B)
+            check(A, B)
+
+    @skip_on_cudasim('Simulator does not support multiple threads')
+    def test_multithreaded(self):
+        def work(gpu, dA, results, ridx):
+            try:
+                with gpu:
+                    arr = dA.copy_to_host()
+
+            except Exception as e:
+                results[ridx] = e
+
+            else:
+                results[ridx] = np.all(arr == np.arange(10))
+
+        dA = cuda.to_device(np.arange(10))
+
+        nthreads = 10
+        results = [None] * nthreads
+        threads = [threading.Thread(target=work, args=(cuda.gpus.current,
+                                                       dA, results, i))
+                   for i in range(nthreads)]
+        for th in threads:
+            th.start()
+
+        for th in threads:
+            th.join()
+
+        for r in results:
+            if isinstance(r, BaseException):
+                raise r
+            else:
+                self.assertTrue(r)
+
+    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
+    def test_with_context(self):
+
+        @cuda.jit
+        def vector_add_scalar(arr, val):
+            i = cuda.grid(1)
+            if i < arr.size:
+                arr[i] += val
+
+        hostarr = np.arange(10, dtype=np.float32)
+        with cuda.gpus[0]:
+            arr1 = cuda.to_device(hostarr)
+
+        with cuda.gpus[1]:
+            arr2 = cuda.to_device(hostarr)
+
+        with cuda.gpus[0]:
+            vector_add_scalar[1, 10](arr1, 1)
+
+        with cuda.gpus[1]:
+            vector_add_scalar[1, 10](arr2, 2)
+
+        with cuda.gpus[0]:
+            np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))
+
+        with cuda.gpus[1]:
+            np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))
+
+    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
+    def test_with_context_peer_copy(self):
+        # Peer access is not always possible - for example, with one GPU in TCC
+        # mode and one in WDDM - if that is the case, this test would fail so
+        # we need to skip it.
+        with cuda.gpus[0]:
+            ctx = cuda.current_context()
+            if not ctx.can_access_peer(1):
+                self.skipTest('Peer access between GPUs disabled')
+
+        # 1. Create a range in an array
+        hostarr = np.arange(10, dtype=np.float32)
+
+        # 2. Copy range array from host -> GPU 0
+        with cuda.gpus[0]:
+            arr1 = cuda.to_device(hostarr)
+
+        # 3. Initialize a zero-filled array on GPU 1
+        with cuda.gpus[1]:
+            arr2 = cuda.to_device(np.zeros_like(hostarr))
+
+        with cuda.gpus[0]:
+            # 4. Copy range from GPU 0 -> GPU 1
+            arr2.copy_to_device(arr1)
+
+        # 5. Copy range from GPU 1 -> host and check contents
+        np.testing.assert_equal(arr2.copy_to_host(), hostarr)
+
+
+if __name__ == '__main__':
+    unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py
@@ -0,0 +1,46 @@
+import os
+import multiprocessing as mp
+
+import numpy as np
+
+from numba import cuda
+from numba.cuda.testing import skip_on_cudasim, CUDATestCase
+import unittest
+
+has_mp_get_context = hasattr(mp, 'get_context')
+is_unix = os.name == 'posix'
+
+
+def fork_test(q):
+    from numba.cuda.cudadrv.error import CudaDriverError
+    try:
+        cuda.to_device(np.arange(1))
+    except CudaDriverError as e:
+        q.put(e)
+    else:
+        q.put(None)
+
+
+@skip_on_cudasim('disabled for cudasim')
+class TestMultiprocessing(CUDATestCase):
+    @unittest.skipUnless(has_mp_get_context, 'requires mp.get_context')
+    @unittest.skipUnless(is_unix, 'requires Unix')
+    def test_fork(self):
+        """
+        Test fork detection.
+        """
+        cuda.current_context()  # force cuda initialize
+        # fork in process that also uses CUDA
+        ctx = mp.get_context('fork')
+        q = ctx.Queue()
+        proc = ctx.Process(target=fork_test, args=[q])
+        proc.start()
+        exc = q.get()
+        proc.join()
+        # there should be an exception raised in the child process
+        self.assertIsNotNone(exc)
+        self.assertIn('CUDA initialized before forking', str(exc))
+
+
+if __name__ == '__main__':
+    unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py
@@ -0,0 +1,101 @@
+import traceback
+import threading
+import multiprocessing
+import numpy as np
+from numba import cuda
+from numba.cuda.testing import (skip_on_cudasim, skip_under_cuda_memcheck,
+                                CUDATestCase)
+import unittest
+
+try:
+    from concurrent.futures import ThreadPoolExecutor
+except ImportError:
+    has_concurrent_futures = False
+else:
+    has_concurrent_futures = True
+
+
+has_mp_get_context = hasattr(multiprocessing, 'get_context')
+
+
+def check_concurrent_compiling():
+    @cuda.jit
+    def foo(x):
+        x[0] += 1
+
+    def use_foo(x):
+        foo[1, 1](x)
+        return x
+
+    arrays = [cuda.to_device(np.arange(10)) for i in range(10)]
+    expected = np.arange(10)
+    expected[0] += 1
+    with ThreadPoolExecutor(max_workers=4) as e:
+        for ary in e.map(use_foo, arrays):
+            np.testing.assert_equal(ary, expected)
+
+
+def spawn_process_entry(q):
+    try:
+        check_concurrent_compiling()
+    # Catch anything that goes wrong in the threads
+    except:  # noqa: E722
+        msg = traceback.format_exc()
+        q.put('\n'.join(['', '=' * 80, msg]))
+    else:
+        q.put(None)
+
+
+@skip_under_cuda_memcheck('Hangs cuda-memcheck')
+@skip_on_cudasim('disabled for cudasim')
+class TestMultiThreadCompiling(CUDATestCase):
+
+    @unittest.skipIf(not has_concurrent_futures, "no concurrent.futures")
+    def test_concurrent_compiling(self):
+        check_concurrent_compiling()
+
+    @unittest.skipIf(not has_mp_get_context, "no multiprocessing.get_context")
+    def test_spawn_concurrent_compilation(self):
+        # force CUDA context init
+        cuda.get_current_device()
+        # use "spawn" to avoid inheriting the CUDA context
+        ctx = multiprocessing.get_context('spawn')
+
+        q = ctx.Queue()
+        p = ctx.Process(target=spawn_process_entry, args=(q,))
+        p.start()
+        try:
+            err = q.get()
+        finally:
+            p.join()
+        if err is not None:
+            raise AssertionError(err)
+        self.assertEqual(p.exitcode, 0, 'test failed in child process')
+
+    def test_invalid_context_error_with_d2h(self):
+        def d2h(arr, out):
+            out[:] = arr.copy_to_host()
+
+        arr = np.arange(1, 4)
+        out = np.zeros_like(arr)
+        darr = cuda.to_device(arr)
+        th = threading.Thread(target=d2h, args=[darr, out])
+        th.start()
+        th.join()
+        np.testing.assert_equal(arr, out)
+
+    def test_invalid_context_error_with_d2d(self):
+        def d2d(dst, src):
+            dst.copy_to_device(src)
+
+        arr = np.arange(100)
+        common = cuda.to_device(arr)
+        darr = cuda.to_device(np.zeros(common.shape, dtype=common.dtype))
+        th = threading.Thread(target=d2d, args=[darr, common])
+        th.start()
+        th.join()
+        np.testing.assert_equal(darr.copy_to_host(), arr)
+
+
+if __name__ == '__main__':
+    unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_nondet.py
@@ -0,0 +1,49 @@
+import numpy as np
+from numba import cuda, float32, void
+from numba.cuda.testing import unittest, CUDATestCase
+
+
+def generate_input(n):
+    A = np.array(np.arange(n * n).reshape(n, n), dtype=np.float32)
+    B = np.array(np.arange(n) + 0, dtype=A.dtype)
+    return A, B
+
+
+class TestCudaNonDet(CUDATestCase):
+    def test_for_pre(self):
+        """Test issue with loop not running due to bad sign-extension at the for
+        loop precondition.
+        """
+
+        @cuda.jit(void(float32[:, :], float32[:, :], float32[:]))
+        def diagproduct(c, a, b):
+            startX, startY = cuda.grid(2)
+            gridX = cuda.gridDim.x * cuda.blockDim.x
+            gridY = cuda.gridDim.y * cuda.blockDim.y
+            height = c.shape[0]
+            width = c.shape[1]
+
+            for x in range(startX, width, (gridX)):
+                for y in range(startY, height, (gridY)):
+                    c[y, x] = a[y, x] * b[x]
+
+        N = 8
+
+        A, B = generate_input(N)
+
+        F = np.empty(A.shape, dtype=A.dtype)
+
+        blockdim = (32, 8)
+        griddim = (1, 1)
+
+        dA = cuda.to_device(A)
+        dB = cuda.to_device(B)
+        dF = cuda.to_device(F, copy=False)
+        diagproduct[griddim, blockdim](dF, dA, dB)
+
+        E = np.dot(A, np.diag(B))
+        np.testing.assert_array_almost_equal(dF.copy_to_host(), E)
+
+
+if __name__ == '__main__':
+    unittest.main()