numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +246 -114
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
- numba_cuda/numba/cuda/cuda_paths.py +293 -99
- numba_cuda/numba/cuda/cudadecl.py +93 -79
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +296 -275
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +99 -7
- numba_cuda/numba/cuda/decorators.py +87 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +68 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +55 -1
- numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
- numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
- numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
- numba_cuda/numba/cuda/intrinsics.py +203 -28
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/lowering.py +43 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +134 -108
- numba_cuda/numba/cuda/target.py +92 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +5 -3
- numba_cuda/numba/cuda/vectorizers.py +38 -33
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
- numba_cuda-0.10.0.dist-info/RECORD +263 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.1.dist-info/RECORD +0 -251
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,9 @@
|
|
1
|
+
import re
|
2
|
+
|
1
3
|
import numpy as np
|
2
4
|
from numba import cuda, int32, int64, float32, float64
|
3
5
|
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
|
6
|
+
from numba.cuda.compiler import compile_ptx
|
4
7
|
from numba.core import config
|
5
8
|
|
6
9
|
|
@@ -8,73 +11,73 @@ def useful_syncwarp(ary):
|
|
8
11
|
i = cuda.grid(1)
|
9
12
|
if i == 0:
|
10
13
|
ary[0] = 42
|
11
|
-
cuda.syncwarp(
|
14
|
+
cuda.syncwarp(0xFFFFFFFF)
|
12
15
|
ary[i] = ary[0]
|
13
16
|
|
14
17
|
|
15
18
|
def use_shfl_sync_idx(ary, idx):
|
16
19
|
i = cuda.grid(1)
|
17
|
-
val = cuda.shfl_sync(
|
20
|
+
val = cuda.shfl_sync(0xFFFFFFFF, i, idx)
|
18
21
|
ary[i] = val
|
19
22
|
|
20
23
|
|
21
24
|
def use_shfl_sync_up(ary, delta):
|
22
25
|
i = cuda.grid(1)
|
23
|
-
val = cuda.shfl_up_sync(
|
26
|
+
val = cuda.shfl_up_sync(0xFFFFFFFF, i, delta)
|
24
27
|
ary[i] = val
|
25
28
|
|
26
29
|
|
27
30
|
def use_shfl_sync_down(ary, delta):
|
28
31
|
i = cuda.grid(1)
|
29
|
-
val = cuda.shfl_down_sync(
|
32
|
+
val = cuda.shfl_down_sync(0xFFFFFFFF, i, delta)
|
30
33
|
ary[i] = val
|
31
34
|
|
32
35
|
|
33
36
|
def use_shfl_sync_xor(ary, xor):
|
34
37
|
i = cuda.grid(1)
|
35
|
-
val = cuda.shfl_xor_sync(
|
38
|
+
val = cuda.shfl_xor_sync(0xFFFFFFFF, i, xor)
|
36
39
|
ary[i] = val
|
37
40
|
|
38
41
|
|
39
42
|
def use_shfl_sync_with_val(ary, into):
|
40
43
|
i = cuda.grid(1)
|
41
|
-
val = cuda.shfl_sync(
|
44
|
+
val = cuda.shfl_sync(0xFFFFFFFF, into, 0)
|
42
45
|
ary[i] = val
|
43
46
|
|
44
47
|
|
45
48
|
def use_vote_sync_all(ary_in, ary_out):
|
46
49
|
i = cuda.grid(1)
|
47
|
-
pred = cuda.all_sync(
|
50
|
+
pred = cuda.all_sync(0xFFFFFFFF, ary_in[i])
|
48
51
|
ary_out[i] = pred
|
49
52
|
|
50
53
|
|
51
54
|
def use_vote_sync_any(ary_in, ary_out):
|
52
55
|
i = cuda.grid(1)
|
53
|
-
pred = cuda.any_sync(
|
56
|
+
pred = cuda.any_sync(0xFFFFFFFF, ary_in[i])
|
54
57
|
ary_out[i] = pred
|
55
58
|
|
56
59
|
|
57
60
|
def use_vote_sync_eq(ary_in, ary_out):
|
58
61
|
i = cuda.grid(1)
|
59
|
-
pred = cuda.eq_sync(
|
62
|
+
pred = cuda.eq_sync(0xFFFFFFFF, ary_in[i])
|
60
63
|
ary_out[i] = pred
|
61
64
|
|
62
65
|
|
63
66
|
def use_vote_sync_ballot(ary):
|
64
67
|
i = cuda.threadIdx.x
|
65
|
-
ballot = cuda.ballot_sync(
|
68
|
+
ballot = cuda.ballot_sync(0xFFFFFFFF, True)
|
66
69
|
ary[i] = ballot
|
67
70
|
|
68
71
|
|
69
72
|
def use_match_any_sync(ary_in, ary_out):
|
70
73
|
i = cuda.grid(1)
|
71
|
-
ballot = cuda.match_any_sync(
|
74
|
+
ballot = cuda.match_any_sync(0xFFFFFFFF, ary_in[i])
|
72
75
|
ary_out[i] = ballot
|
73
76
|
|
74
77
|
|
75
78
|
def use_match_all_sync(ary_in, ary_out):
|
76
79
|
i = cuda.grid(1)
|
77
|
-
ballot, pred = cuda.match_all_sync(
|
80
|
+
ballot, pred = cuda.match_all_sync(0xFFFFFFFF, ary_in[i])
|
78
81
|
ary_out[i] = ballot if pred else 0
|
79
82
|
|
80
83
|
|
@@ -144,16 +147,62 @@ class TestCudaWarpOperations(CUDATestCase):
|
|
144
147
|
compiled[1, nelem](ary, xor)
|
145
148
|
self.assertTrue(np.all(ary == exp))
|
146
149
|
|
150
|
+
def test_shfl_sync_const_mode_val(self):
|
151
|
+
# Test `mode` argument is constant in shfl_sync calls.
|
152
|
+
# Related to https://github.com/NVIDIA/numba-cuda/pull/231
|
153
|
+
subtest = [
|
154
|
+
(use_shfl_sync_idx, 4),
|
155
|
+
(use_shfl_sync_up, 4),
|
156
|
+
(use_shfl_sync_down, 4),
|
157
|
+
(use_shfl_sync_xor, 16),
|
158
|
+
]
|
159
|
+
|
160
|
+
args_re = r"\((.*)\)"
|
161
|
+
m = re.compile(args_re)
|
162
|
+
|
163
|
+
for func, value in subtest:
|
164
|
+
with self.subTest(func=func.__name__):
|
165
|
+
compiled = cuda.jit("void(int32[:], int32)")(func)
|
166
|
+
nelem = 32
|
167
|
+
ary = np.empty(nelem, dtype=np.int32)
|
168
|
+
compiled[1, nelem](ary, value)
|
169
|
+
irs = next(iter(compiled.inspect_llvm().values()))
|
170
|
+
|
171
|
+
for ir in irs.split("\n"):
|
172
|
+
if "call" in ir and "llvm.nvvm.shfl.sync.i32" in ir:
|
173
|
+
args = m.search(ir).group(0)
|
174
|
+
arglist = args.split(",")
|
175
|
+
mode_arg = arglist[1]
|
176
|
+
self.assertNotIn("%", mode_arg)
|
177
|
+
|
178
|
+
def test_shfl_sync_const_mode_val_sm100(self):
|
179
|
+
# Test shfl_sync compiles with cc=(10, 0)
|
180
|
+
subtest = [
|
181
|
+
use_shfl_sync_idx,
|
182
|
+
use_shfl_sync_up,
|
183
|
+
use_shfl_sync_down,
|
184
|
+
use_shfl_sync_xor,
|
185
|
+
]
|
186
|
+
|
187
|
+
for func in subtest:
|
188
|
+
with self.subTest(func=func.__name__):
|
189
|
+
compile_ptx(func, (int32[:], int32), cc=(10, 0))
|
190
|
+
|
147
191
|
def test_shfl_sync_types(self):
|
148
192
|
types = int32, int64, float32, float64
|
149
|
-
values = (
|
150
|
-
|
193
|
+
values = (
|
194
|
+
np.int32(-1),
|
195
|
+
np.int64(1 << 42),
|
196
|
+
np.float32(np.pi),
|
197
|
+
np.float64(np.pi),
|
198
|
+
)
|
151
199
|
for typ, val in zip(types, values):
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
200
|
+
with self.subTest(typ=typ):
|
201
|
+
compiled = cuda.jit((typ[:], typ))(use_shfl_sync_with_val)
|
202
|
+
nelem = 32
|
203
|
+
ary = np.empty(nelem, dtype=val.dtype)
|
204
|
+
compiled[1, nelem](ary, val)
|
205
|
+
self.assertTrue(np.all(ary == val))
|
157
206
|
|
158
207
|
def test_vote_sync_all(self):
|
159
208
|
compiled = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_all)
|
@@ -197,10 +246,11 @@ class TestCudaWarpOperations(CUDATestCase):
|
|
197
246
|
nelem = 32
|
198
247
|
ary = np.empty(nelem, dtype=np.uint32)
|
199
248
|
compiled[1, nelem](ary)
|
200
|
-
self.assertTrue(np.all(ary == np.uint32(
|
249
|
+
self.assertTrue(np.all(ary == np.uint32(0xFFFFFFFF)))
|
201
250
|
|
202
|
-
@unittest.skipUnless(
|
203
|
-
|
251
|
+
@unittest.skipUnless(
|
252
|
+
_safe_cc_check((7, 0)), "Matching requires at least Volta Architecture"
|
253
|
+
)
|
204
254
|
def test_match_any_sync(self):
|
205
255
|
compiled = cuda.jit("void(int32[:], int32[:])")(use_match_any_sync)
|
206
256
|
nelem = 10
|
@@ -210,8 +260,9 @@ class TestCudaWarpOperations(CUDATestCase):
|
|
210
260
|
compiled[1, nelem](ary_in, ary_out)
|
211
261
|
self.assertTrue(np.all(ary_out == exp))
|
212
262
|
|
213
|
-
@unittest.skipUnless(
|
214
|
-
|
263
|
+
@unittest.skipUnless(
|
264
|
+
_safe_cc_check((7, 0)), "Matching requires at least Volta Architecture"
|
265
|
+
)
|
215
266
|
def test_match_all_sync(self):
|
216
267
|
compiled = cuda.jit("void(int32[:], int32[:])")(use_match_all_sync)
|
217
268
|
nelem = 10
|
@@ -223,9 +274,10 @@ class TestCudaWarpOperations(CUDATestCase):
|
|
223
274
|
compiled[1, nelem](ary_in, ary_out)
|
224
275
|
self.assertTrue(np.all(ary_out == 0))
|
225
276
|
|
226
|
-
@unittest.skipUnless(
|
227
|
-
|
228
|
-
|
277
|
+
@unittest.skipUnless(
|
278
|
+
_safe_cc_check((7, 0)),
|
279
|
+
"Independent scheduling requires at least Volta Architecture",
|
280
|
+
)
|
229
281
|
def test_independent_scheduling(self):
|
230
282
|
compiled = cuda.jit("void(uint32[:])")(use_independent_scheduling)
|
231
283
|
arr = np.empty(32, dtype=np.uint32)
|
@@ -267,10 +319,9 @@ class TestCudaWarpOperations(CUDATestCase):
|
|
267
319
|
# 0, 1, 3, 7, F, 1F, 3F, 7F, FF, 1FF, etc.
|
268
320
|
# or in binary:
|
269
321
|
# ...0001, ....0011, ...0111, etc.
|
270
|
-
expected = np.asarray([(2
|
271
|
-
dtype=np.uint32)
|
322
|
+
expected = np.asarray([(2**i) - 1 for i in range(32)], dtype=np.uint32)
|
272
323
|
np.testing.assert_equal(expected, out)
|
273
324
|
|
274
325
|
|
275
|
-
if __name__ ==
|
326
|
+
if __name__ == "__main__":
|
276
327
|
unittest.main()
|
@@ -10,12 +10,16 @@ import unittest
|
|
10
10
|
|
11
11
|
class TestCudaSimIssues(CUDATestCase):
|
12
12
|
def test_record_access(self):
|
13
|
-
backyard_type = [
|
14
|
-
|
13
|
+
backyard_type = [
|
14
|
+
("statue", np.float64),
|
15
|
+
("newspaper", np.float64, (6,)),
|
16
|
+
]
|
15
17
|
|
16
|
-
goose_type = [
|
17
|
-
|
18
|
-
|
18
|
+
goose_type = [
|
19
|
+
("garden", np.float64, (12,)),
|
20
|
+
("town", np.float64, (42,)),
|
21
|
+
("backyard", backyard_type),
|
22
|
+
]
|
19
23
|
|
20
24
|
goose_np_type = np.dtype(goose_type, align=True)
|
21
25
|
|
@@ -27,20 +31,22 @@ class TestCudaSimIssues(CUDATestCase):
|
|
27
31
|
|
28
32
|
item = np.recarray(1, dtype=goose_np_type)
|
29
33
|
simple_kernel[1, 1](item[0])
|
30
|
-
np.testing.assert_equal(item[0][
|
31
|
-
np.testing.assert_equal(item[0][
|
34
|
+
np.testing.assert_equal(item[0]["garden"][0], 45)
|
35
|
+
np.testing.assert_equal(item[0]["backyard"]["newspaper"][3], 5)
|
32
36
|
|
33
37
|
def test_recarray_setting(self):
|
34
|
-
recordwith2darray = np.dtype(
|
35
|
-
|
38
|
+
recordwith2darray = np.dtype(
|
39
|
+
[("i", np.int32), ("j", np.float32, (3, 2))]
|
40
|
+
)
|
36
41
|
rec = np.recarray(2, dtype=recordwith2darray)
|
37
|
-
rec[0][
|
42
|
+
rec[0]["i"] = 45
|
38
43
|
|
39
44
|
@cuda.jit
|
40
45
|
def simple_kernel(f):
|
41
46
|
f[1] = f[0]
|
47
|
+
|
42
48
|
simple_kernel[1, 1](rec)
|
43
|
-
np.testing.assert_equal(rec[0][
|
49
|
+
np.testing.assert_equal(rec[0]["i"], rec[1]["i"])
|
44
50
|
|
45
51
|
def test_cuda_module_in_device_function(self):
|
46
52
|
"""
|
@@ -63,7 +69,7 @@ class TestCudaSimIssues(CUDATestCase):
|
|
63
69
|
expected = np.arange(arr.size, dtype=np.int32)
|
64
70
|
np.testing.assert_equal(expected, arr)
|
65
71
|
|
66
|
-
@skip_unless_cudasim(
|
72
|
+
@skip_unless_cudasim("Only works on CUDASIM")
|
67
73
|
def test_deadlock_on_exception(self):
|
68
74
|
def assert_no_blockthreads():
|
69
75
|
blockthreads = []
|
@@ -98,5 +104,5 @@ class TestCudaSimIssues(CUDATestCase):
|
|
98
104
|
assert_no_blockthreads()
|
99
105
|
|
100
106
|
|
101
|
-
if __name__ ==
|
107
|
+
if __name__ == "__main__":
|
102
108
|
unittest.main()
|
@@ -2,14 +2,18 @@
|
|
2
2
|
# "magictoken" is used for markers as beginning and ending of example text.
|
3
3
|
|
4
4
|
import unittest
|
5
|
-
from numba.cuda.testing import (
|
6
|
-
|
7
|
-
|
5
|
+
from numba.cuda.testing import (
|
6
|
+
CUDATestCase,
|
7
|
+
skip_on_cudasim,
|
8
|
+
skip_if_cudadevrt_missing,
|
9
|
+
skip_unless_cc_60,
|
10
|
+
skip_if_mvc_enabled,
|
11
|
+
)
|
8
12
|
|
9
13
|
|
10
14
|
@skip_if_cudadevrt_missing
|
11
15
|
@skip_unless_cc_60
|
12
|
-
@skip_if_mvc_enabled(
|
16
|
+
@skip_if_mvc_enabled("CG not supported with MVC")
|
13
17
|
@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
|
14
18
|
class TestCooperativeGroups(CUDATestCase):
|
15
19
|
def test_ex_grid_sync(self):
|
@@ -17,7 +21,7 @@ class TestCooperativeGroups(CUDATestCase):
|
|
17
21
|
from numba import cuda, int32
|
18
22
|
import numpy as np
|
19
23
|
|
20
|
-
sig = (int32[
|
24
|
+
sig = (int32[:, ::1],)
|
21
25
|
|
22
26
|
@cuda.jit(sig)
|
23
27
|
def sequential_rows(M):
|
@@ -34,6 +38,7 @@ class TestCooperativeGroups(CUDATestCase):
|
|
34
38
|
# Wait until all threads have written their column element,
|
35
39
|
# and that the write is visible to all other threads
|
36
40
|
g.sync()
|
41
|
+
|
37
42
|
# magictoken.ex_grid_sync_kernel.end
|
38
43
|
|
39
44
|
# magictoken.ex_grid_sync_data.begin
|
@@ -48,9 +53,11 @@ class TestCooperativeGroups(CUDATestCase):
|
|
48
53
|
|
49
54
|
# Skip this test if the grid size used in the example is too large for
|
50
55
|
# a cooperative launch on the current GPU
|
51
|
-
mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks(
|
56
|
+
mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks(
|
57
|
+
blockdim
|
58
|
+
)
|
52
59
|
if mb < griddim:
|
53
|
-
self.skipTest(
|
60
|
+
self.skipTest("Device does not support a large enough coop grid")
|
54
61
|
|
55
62
|
# magictoken.ex_grid_sync_launch.begin
|
56
63
|
# Kernel launch - this is implicitly a cooperative launch
|
@@ -73,5 +80,5 @@ class TestCooperativeGroups(CUDATestCase):
|
|
73
80
|
np.testing.assert_equal(A, reference)
|
74
81
|
|
75
82
|
|
76
|
-
if __name__ ==
|
83
|
+
if __name__ == "__main__":
|
77
84
|
unittest.main()
|
@@ -41,6 +41,7 @@ class TestCpuGpuCompat(CUDATestCase):
|
|
41
41
|
@numba.jit
|
42
42
|
def business_logic(x, y, z):
|
43
43
|
return 4 * z * (2 * x - (4 * y) / 2 * pi)
|
44
|
+
|
44
45
|
# ex_cpu_gpu_compat.define.end
|
45
46
|
|
46
47
|
# ex_cpu_gpu_compat.cpurun.begin
|
@@ -54,6 +55,7 @@ class TestCpuGpuCompat(CUDATestCase):
|
|
54
55
|
if tid < len(xarr):
|
55
56
|
# The function decorated with numba.jit may be directly reused
|
56
57
|
res[tid] = business_logic(xarr[tid], yarr[tid], zarr[tid])
|
58
|
+
|
57
59
|
# ex_cpu_gpu_compat.usegpu.end
|
58
60
|
|
59
61
|
# ex_cpu_gpu_compat.launch.begin
|
@@ -62,14 +64,9 @@ class TestCpuGpuCompat(CUDATestCase):
|
|
62
64
|
# [-126.79644737231007, 416.28324559588634, -218912930.2987788]
|
63
65
|
# ex_cpu_gpu_compat.launch.end
|
64
66
|
|
65
|
-
expect = [
|
66
|
-
business_logic(x, y, z) for x, y, z in zip(X, Y, Z)
|
67
|
-
]
|
67
|
+
expect = [business_logic(x, y, z) for x, y, z in zip(X, Y, Z)]
|
68
68
|
|
69
|
-
np.testing.assert_equal(
|
70
|
-
expect,
|
71
|
-
results.copy_to_host()
|
72
|
-
)
|
69
|
+
np.testing.assert_equal(expect, results.copy_to_host())
|
73
70
|
|
74
71
|
|
75
72
|
if __name__ == "__main__":
|
@@ -2,7 +2,7 @@
|
|
2
2
|
# "magictoken" is used for markers as beginning and ending of example text.
|
3
3
|
|
4
4
|
import unittest
|
5
|
-
from numba.cuda.testing import
|
5
|
+
from numba.cuda.testing import CUDATestCase, skip_on_cudasim
|
6
6
|
from numba.tests.support import skip_unless_cffi
|
7
7
|
|
8
8
|
|
@@ -18,11 +18,12 @@ class TestFFI(CUDATestCase):
|
|
18
18
|
# Path to the source containing the foreign function
|
19
19
|
# (here assumed to be in a subdirectory called "ffi")
|
20
20
|
basedir = os.path.dirname(os.path.abspath(__file__))
|
21
|
-
functions_cu = os.path.join(basedir,
|
21
|
+
functions_cu = os.path.join(basedir, "ffi", "functions.cu")
|
22
22
|
|
23
23
|
# Declaration of the foreign function
|
24
|
-
mul = cuda.declare_device(
|
25
|
-
|
24
|
+
mul = cuda.declare_device(
|
25
|
+
"mul_f32_f32", "float32(float32, float32)", link=functions_cu
|
26
|
+
)
|
26
27
|
|
27
28
|
# A kernel that calls mul; functions.cu is linked automatically due to
|
28
29
|
# the call to mul.
|
@@ -52,25 +53,29 @@ class TestFFI(CUDATestCase):
|
|
52
53
|
import os
|
53
54
|
|
54
55
|
basedir = os.path.dirname(os.path.abspath(__file__))
|
55
|
-
functions_cu = os.path.join(basedir,
|
56
|
+
functions_cu = os.path.join(basedir, "ffi", "functions.cu")
|
56
57
|
|
57
58
|
# magictoken.ex_from_buffer_decl.begin
|
58
|
-
signature =
|
59
|
-
sum_reduce = cuda.declare_device(
|
60
|
-
|
59
|
+
signature = "float32(CPointer(float32), int32)"
|
60
|
+
sum_reduce = cuda.declare_device(
|
61
|
+
"sum_reduce", signature, link=functions_cu
|
62
|
+
)
|
61
63
|
# magictoken.ex_from_buffer_decl.end
|
62
64
|
|
63
65
|
# magictoken.ex_from_buffer_kernel.begin
|
64
66
|
import cffi
|
67
|
+
|
65
68
|
ffi = cffi.FFI()
|
66
69
|
|
67
70
|
@cuda.jit
|
68
71
|
def reduction_caller(result, array):
|
69
72
|
array_ptr = ffi.from_buffer(array)
|
70
73
|
result[()] = sum_reduce(array_ptr, len(array))
|
74
|
+
|
71
75
|
# magictoken.ex_from_buffer_kernel.end
|
72
76
|
|
73
77
|
import numpy as np
|
78
|
+
|
74
79
|
x = np.arange(10).astype(np.float32)
|
75
80
|
r = np.ndarray((), dtype=np.float32)
|
76
81
|
|
@@ -81,5 +86,5 @@ class TestFFI(CUDATestCase):
|
|
81
86
|
np.testing.assert_allclose(expected, actual)
|
82
87
|
|
83
88
|
|
84
|
-
if __name__ ==
|
89
|
+
if __name__ == "__main__":
|
85
90
|
unittest.main()
|
@@ -1,14 +1,18 @@
|
|
1
1
|
import unittest
|
2
2
|
|
3
|
-
from numba.cuda.testing import (
|
4
|
-
|
5
|
-
|
3
|
+
from numba.cuda.testing import (
|
4
|
+
CUDATestCase,
|
5
|
+
skip_if_cudadevrt_missing,
|
6
|
+
skip_on_cudasim,
|
7
|
+
skip_unless_cc_60,
|
8
|
+
skip_if_mvc_enabled,
|
9
|
+
)
|
6
10
|
from numba.tests.support import captured_stdout
|
7
11
|
|
8
12
|
|
9
13
|
@skip_if_cudadevrt_missing
|
10
14
|
@skip_unless_cc_60
|
11
|
-
@skip_if_mvc_enabled(
|
15
|
+
@skip_if_mvc_enabled("CG not supported with MVC")
|
12
16
|
@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
|
13
17
|
class TestLaplace(CUDATestCase):
|
14
18
|
"""
|
@@ -27,7 +31,6 @@ class TestLaplace(CUDATestCase):
|
|
27
31
|
super().tearDown()
|
28
32
|
|
29
33
|
def test_ex_laplace(self):
|
30
|
-
|
31
34
|
# set True to regenerate the figures that
|
32
35
|
# accompany this example
|
33
36
|
plot = False
|
@@ -55,24 +58,25 @@ class TestLaplace(CUDATestCase):
|
|
55
58
|
|
56
59
|
if plot:
|
57
60
|
import matplotlib.pyplot as plt
|
61
|
+
|
58
62
|
fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
|
59
63
|
plt.plot(
|
60
64
|
np.arange(len(buf_0)),
|
61
65
|
buf_0.copy_to_host(),
|
62
66
|
lw=3,
|
63
67
|
marker="*",
|
64
|
-
color=
|
68
|
+
color="black",
|
65
69
|
)
|
66
70
|
|
67
|
-
plt.title(
|
68
|
-
plt.xlabel(
|
69
|
-
plt.ylabel(
|
71
|
+
plt.title("Initial State", fontsize=24)
|
72
|
+
plt.xlabel("Position", fontsize=24)
|
73
|
+
plt.ylabel("Temperature", fontsize=24)
|
70
74
|
|
71
75
|
ax.set_xticks(ax.get_xticks(), fontsize=16)
|
72
76
|
ax.set_yticks(ax.get_yticks(), fontsize=16)
|
73
77
|
plt.xlim(0, len(data))
|
74
78
|
plt.ylim(0, 10001)
|
75
|
-
plt.savefig(
|
79
|
+
plt.savefig("laplace_initial.svg")
|
76
80
|
|
77
81
|
# ex_laplace.kernel.begin
|
78
82
|
@cuda.jit
|
@@ -116,12 +120,11 @@ class TestLaplace(CUDATestCase):
|
|
116
120
|
|
117
121
|
# Wait for every thread to write before moving on
|
118
122
|
grid.sync()
|
123
|
+
|
119
124
|
# ex_laplace.kernel.end
|
120
125
|
|
121
126
|
# ex_laplace.launch.begin
|
122
|
-
solve_heat_equation.forall(len(data))(
|
123
|
-
buf_0, buf_1, niter, 0.25
|
124
|
-
)
|
127
|
+
solve_heat_equation.forall(len(data))(buf_0, buf_1, niter, 0.25)
|
125
128
|
# ex_laplace.launch.end
|
126
129
|
|
127
130
|
results = buf_1.copy_to_host()
|
@@ -129,20 +132,21 @@ class TestLaplace(CUDATestCase):
|
|
129
132
|
fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
|
130
133
|
plt.plot(
|
131
134
|
np.arange(len(results)),
|
132
|
-
results,
|
135
|
+
results,
|
136
|
+
lw=3,
|
133
137
|
marker="*",
|
134
|
-
color=
|
138
|
+
color="black",
|
135
139
|
)
|
136
140
|
plt.title(f"T = {niter}", fontsize=24)
|
137
|
-
plt.xlabel(
|
138
|
-
plt.ylabel(
|
141
|
+
plt.xlabel("Position", fontsize=24)
|
142
|
+
plt.ylabel("Temperature", fontsize=24)
|
139
143
|
|
140
144
|
ax.set_xticks(ax.get_xticks(), fontsize=16)
|
141
145
|
ax.set_yticks(ax.get_yticks(), fontsize=16)
|
142
146
|
|
143
147
|
plt.ylim(0, max(results))
|
144
148
|
plt.xlim(0, len(results))
|
145
|
-
plt.savefig(
|
149
|
+
plt.savefig("laplace_final.svg")
|
146
150
|
|
147
151
|
# Integral over the domain should be equal to its initial value.
|
148
152
|
# Note that this should match the initial value of data[500] above, but
|
@@ -6,6 +6,7 @@ Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
|
|
6
6
|
Contents in this file are referenced from the sphinx-generated docs.
|
7
7
|
"magictoken" is used for markers as beginning and ending of example text.
|
8
8
|
"""
|
9
|
+
|
9
10
|
import unittest
|
10
11
|
from numba.cuda.testing import CUDATestCase, skip_on_cudasim
|
11
12
|
from numba.tests.support import captured_stdout
|
@@ -43,10 +44,11 @@ class TestMatMul(CUDATestCase):
|
|
43
44
|
"""Perform square matrix multiplication of C = A * B."""
|
44
45
|
i, j = cuda.grid(2)
|
45
46
|
if i < C.shape[0] and j < C.shape[1]:
|
46
|
-
tmp = 0.
|
47
|
+
tmp = 0.0
|
47
48
|
for k in range(A.shape[1]):
|
48
49
|
tmp += A[i, k] * B[k, j]
|
49
50
|
C[i, j] = tmp
|
51
|
+
|
50
52
|
# magictoken.ex_matmul.end
|
51
53
|
|
52
54
|
# magictoken.ex_run_matmul.begin
|
@@ -91,11 +93,11 @@ class TestMatMul(CUDATestCase):
|
|
91
93
|
|
92
94
|
tx = cuda.threadIdx.x
|
93
95
|
ty = cuda.threadIdx.y
|
94
|
-
bpg = cuda.gridDim.x
|
96
|
+
bpg = cuda.gridDim.x # blocks per grid
|
95
97
|
|
96
98
|
# Each thread computes one element in the result matrix.
|
97
99
|
# The dot product is chunked into dot products of TPB-long vectors.
|
98
|
-
tmp = float32(0.)
|
100
|
+
tmp = float32(0.0)
|
99
101
|
for i in range(bpg):
|
100
102
|
# Preload data into shared memory
|
101
103
|
sA[ty, tx] = 0
|
@@ -116,6 +118,7 @@ class TestMatMul(CUDATestCase):
|
|
116
118
|
cuda.syncthreads()
|
117
119
|
if y < C.shape[0] and x < C.shape[1]:
|
118
120
|
C[y, x] = tmp
|
121
|
+
|
119
122
|
# magictoken.ex_fast_matmul.end
|
120
123
|
|
121
124
|
# magictoken.ex_run_fast_matmul.begin
|
@@ -169,5 +172,5 @@ class TestMatMul(CUDATestCase):
|
|
169
172
|
self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)
|
170
173
|
|
171
174
|
|
172
|
-
if __name__ ==
|
175
|
+
if __name__ == "__main__":
|
173
176
|
unittest.main()
|
@@ -59,6 +59,7 @@ class TestMonteCarlo(CUDATestCase):
|
|
59
59
|
# value of the sample
|
60
60
|
y = func(samp)
|
61
61
|
out[gid] = y
|
62
|
+
|
62
63
|
# ex_montecarlo.kernel.end
|
63
64
|
|
64
65
|
# ex_montecarlo.callfunc.begin
|
@@ -84,6 +85,7 @@ class TestMonteCarlo(CUDATestCase):
|
|
84
85
|
factor = (upper_lim - lower_lim) / (nsamps - 1)
|
85
86
|
|
86
87
|
return sum_reduce(out) * factor
|
88
|
+
|
87
89
|
# ex_montecarlo.callfunc.end
|
88
90
|
|
89
91
|
# ex_montecarlo.launch.begin
|
@@ -10,8 +10,10 @@ class TestRandom(CUDATestCase):
|
|
10
10
|
def test_ex_3d_grid(self):
|
11
11
|
# magictoken.ex_3d_grid.begin
|
12
12
|
from numba import cuda
|
13
|
-
from numba.cuda.random import (
|
14
|
-
|
13
|
+
from numba.cuda.random import (
|
14
|
+
create_xoroshiro128p_states,
|
15
|
+
xoroshiro128p_uniform_float32,
|
16
|
+
)
|
15
17
|
import numpy as np
|
16
18
|
|
17
19
|
@cuda.jit
|
@@ -27,7 +29,9 @@ class TestRandom(CUDATestCase):
|
|
27
29
|
for i in range(startz, arr.shape[0], stridez):
|
28
30
|
for j in range(starty, arr.shape[1], stridey):
|
29
31
|
for k in range(startx, arr.shape[2], stridex):
|
30
|
-
arr[i, j, k] = xoroshiro128p_uniform_float32(
|
32
|
+
arr[i, j, k] = xoroshiro128p_uniform_float32(
|
33
|
+
rng_states, tid
|
34
|
+
)
|
31
35
|
|
32
36
|
# Array dimensions
|
33
37
|
X, Y, Z = 701, 900, 719
|
@@ -55,5 +59,5 @@ class TestRandom(CUDATestCase):
|
|
55
59
|
self.assertTrue(np.all(host_arr >= 0.0))
|
56
60
|
|
57
61
|
|
58
|
-
if __name__ ==
|
62
|
+
if __name__ == "__main__":
|
59
63
|
unittest.main()
|
@@ -61,11 +61,12 @@ class TestReduction(CUDATestCase):
|
|
61
61
|
# After the loop, the zeroth element contains the sum
|
62
62
|
if tid == 0:
|
63
63
|
data[tid] = shr[tid]
|
64
|
+
|
64
65
|
# ex_reduction.kernel.end
|
65
66
|
|
66
67
|
# ex_reduction.launch.begin
|
67
68
|
array_sum[1, nelem](a)
|
68
|
-
print(a[0])
|
69
|
+
print(a[0]) # 523776
|
69
70
|
print(sum(np.arange(1024))) # 523776
|
70
71
|
# ex_reduction.launch.end
|
71
72
|
|