numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +246 -114
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
- numba_cuda/numba/cuda/cuda_paths.py +293 -99
- numba_cuda/numba/cuda/cudadecl.py +93 -79
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +296 -275
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +99 -7
- numba_cuda/numba/cuda/decorators.py +87 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +68 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +55 -1
- numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
- numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
- numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
- numba_cuda/numba/cuda/intrinsics.py +203 -28
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/lowering.py +43 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +134 -108
- numba_cuda/numba/cuda/target.py +92 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +5 -3
- numba_cuda/numba/cuda/vectorizers.py +38 -33
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
- numba_cuda-0.10.0.dist-info/RECORD +263 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.1.dist-info/RECORD +0 -251
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -22,9 +22,17 @@ def atomic_cast_none(num):
|
|
22
22
|
|
23
23
|
|
24
24
|
@cuda.jit(device=True)
|
25
|
-
def atomic_binary_1dim_shared(
|
26
|
-
|
27
|
-
|
25
|
+
def atomic_binary_1dim_shared(
|
26
|
+
ary,
|
27
|
+
idx,
|
28
|
+
op2,
|
29
|
+
ary_dtype,
|
30
|
+
ary_nelements,
|
31
|
+
binop_func,
|
32
|
+
cast_func,
|
33
|
+
initializer,
|
34
|
+
neg_idx,
|
35
|
+
):
|
28
36
|
tid = cuda.threadIdx.x
|
29
37
|
sm = cuda.shared.array(ary_nelements, ary_dtype)
|
30
38
|
sm[tid] = initializer
|
@@ -38,8 +46,9 @@ def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements,
|
|
38
46
|
|
39
47
|
|
40
48
|
@cuda.jit(device=True)
|
41
|
-
def atomic_binary_1dim_shared2(
|
42
|
-
|
49
|
+
def atomic_binary_1dim_shared2(
|
50
|
+
ary, idx, op2, ary_dtype, ary_nelements, binop_func, cast_func
|
51
|
+
):
|
43
52
|
tid = cuda.threadIdx.x
|
44
53
|
sm = cuda.shared.array(ary_nelements, ary_dtype)
|
45
54
|
sm[tid] = ary[tid]
|
@@ -51,8 +60,9 @@ def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements,
|
|
51
60
|
|
52
61
|
|
53
62
|
@cuda.jit(device=True)
|
54
|
-
def atomic_binary_2dim_shared(
|
55
|
-
|
63
|
+
def atomic_binary_2dim_shared(
|
64
|
+
ary, op2, ary_dtype, ary_shape, binop_func, y_cast_func, neg_idx
|
65
|
+
):
|
56
66
|
tx = cuda.threadIdx.x
|
57
67
|
ty = cuda.threadIdx.y
|
58
68
|
sm = cuda.shared.array(ary_shape, ary_dtype)
|
@@ -77,8 +87,9 @@ def atomic_binary_2dim_global(ary, op2, binop_func, y_cast_func, neg_idx):
|
|
77
87
|
|
78
88
|
|
79
89
|
@cuda.jit(device=True)
|
80
|
-
def atomic_binary_1dim_global(
|
81
|
-
|
90
|
+
def atomic_binary_1dim_global(
|
91
|
+
ary, idx, ary_nelements, op2, binop_func, neg_idx
|
92
|
+
):
|
82
93
|
tid = cuda.threadIdx.x
|
83
94
|
bin = int(idx[tid] % ary_nelements)
|
84
95
|
if neg_idx:
|
@@ -87,53 +98,79 @@ def atomic_binary_1dim_global(ary, idx, ary_nelements, op2,
|
|
87
98
|
|
88
99
|
|
89
100
|
def atomic_add(ary):
|
90
|
-
atomic_binary_1dim_shared(
|
91
|
-
|
101
|
+
atomic_binary_1dim_shared(
|
102
|
+
ary, ary, 1, uint32, 32, cuda.atomic.add, atomic_cast_none, 0, False
|
103
|
+
)
|
92
104
|
|
93
105
|
|
94
106
|
def atomic_add_wrap(ary):
|
95
|
-
atomic_binary_1dim_shared(
|
96
|
-
|
107
|
+
atomic_binary_1dim_shared(
|
108
|
+
ary, ary, 1, uint32, 32, cuda.atomic.add, atomic_cast_none, 0, True
|
109
|
+
)
|
97
110
|
|
98
111
|
|
99
112
|
def atomic_add2(ary):
|
100
|
-
atomic_binary_2dim_shared(
|
101
|
-
|
113
|
+
atomic_binary_2dim_shared(
|
114
|
+
ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_none, False
|
115
|
+
)
|
102
116
|
|
103
117
|
|
104
118
|
def atomic_add2_wrap(ary):
|
105
|
-
atomic_binary_2dim_shared(
|
106
|
-
|
119
|
+
atomic_binary_2dim_shared(
|
120
|
+
ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_none, True
|
121
|
+
)
|
107
122
|
|
108
123
|
|
109
124
|
def atomic_add3(ary):
|
110
|
-
atomic_binary_2dim_shared(
|
111
|
-
|
125
|
+
atomic_binary_2dim_shared(
|
126
|
+
ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False
|
127
|
+
)
|
112
128
|
|
113
129
|
|
114
130
|
def atomic_add_float(ary):
|
115
|
-
atomic_binary_1dim_shared(
|
116
|
-
|
131
|
+
atomic_binary_1dim_shared(
|
132
|
+
ary,
|
133
|
+
ary,
|
134
|
+
1.0,
|
135
|
+
float32,
|
136
|
+
32,
|
137
|
+
cuda.atomic.add,
|
138
|
+
atomic_cast_to_int,
|
139
|
+
0.0,
|
140
|
+
False,
|
141
|
+
)
|
117
142
|
|
118
143
|
|
119
144
|
def atomic_add_float_wrap(ary):
|
120
|
-
atomic_binary_1dim_shared(
|
121
|
-
|
145
|
+
atomic_binary_1dim_shared(
|
146
|
+
ary,
|
147
|
+
ary,
|
148
|
+
1.0,
|
149
|
+
float32,
|
150
|
+
32,
|
151
|
+
cuda.atomic.add,
|
152
|
+
atomic_cast_to_int,
|
153
|
+
0.0,
|
154
|
+
True,
|
155
|
+
)
|
122
156
|
|
123
157
|
|
124
158
|
def atomic_add_float_2(ary):
|
125
|
-
atomic_binary_2dim_shared(
|
126
|
-
|
159
|
+
atomic_binary_2dim_shared(
|
160
|
+
ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_none, False
|
161
|
+
)
|
127
162
|
|
128
163
|
|
129
164
|
def atomic_add_float_2_wrap(ary):
|
130
|
-
atomic_binary_2dim_shared(
|
131
|
-
|
165
|
+
atomic_binary_2dim_shared(
|
166
|
+
ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_none, True
|
167
|
+
)
|
132
168
|
|
133
169
|
|
134
170
|
def atomic_add_float_3(ary):
|
135
|
-
atomic_binary_2dim_shared(
|
136
|
-
|
171
|
+
atomic_binary_2dim_shared(
|
172
|
+
ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False
|
173
|
+
)
|
137
174
|
|
138
175
|
|
139
176
|
def atomic_add_double_global(idx, ary):
|
@@ -153,78 +190,117 @@ def atomic_add_double_global_2_wrap(ary):
|
|
153
190
|
|
154
191
|
|
155
192
|
def atomic_add_double_global_3(ary):
|
156
|
-
atomic_binary_2dim_global(
|
157
|
-
|
193
|
+
atomic_binary_2dim_global(
|
194
|
+
ary, 1, cuda.atomic.add, atomic_cast_to_uint64, False
|
195
|
+
)
|
158
196
|
|
159
197
|
|
160
198
|
def atomic_add_double(idx, ary):
|
161
|
-
atomic_binary_1dim_shared(
|
162
|
-
|
199
|
+
atomic_binary_1dim_shared(
|
200
|
+
ary,
|
201
|
+
idx,
|
202
|
+
1.0,
|
203
|
+
float64,
|
204
|
+
32,
|
205
|
+
cuda.atomic.add,
|
206
|
+
atomic_cast_none,
|
207
|
+
0.0,
|
208
|
+
False,
|
209
|
+
)
|
163
210
|
|
164
211
|
|
165
212
|
def atomic_add_double_wrap(idx, ary):
|
166
|
-
atomic_binary_1dim_shared(
|
167
|
-
|
213
|
+
atomic_binary_1dim_shared(
|
214
|
+
ary, idx, 1.0, float64, 32, cuda.atomic.add, atomic_cast_none, 0.0, True
|
215
|
+
)
|
168
216
|
|
169
217
|
|
170
218
|
def atomic_add_double_2(ary):
|
171
|
-
atomic_binary_2dim_shared(
|
172
|
-
|
219
|
+
atomic_binary_2dim_shared(
|
220
|
+
ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_none, False
|
221
|
+
)
|
173
222
|
|
174
223
|
|
175
224
|
def atomic_add_double_2_wrap(ary):
|
176
|
-
atomic_binary_2dim_shared(
|
177
|
-
|
225
|
+
atomic_binary_2dim_shared(
|
226
|
+
ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_none, True
|
227
|
+
)
|
178
228
|
|
179
229
|
|
180
230
|
def atomic_add_double_3(ary):
|
181
|
-
atomic_binary_2dim_shared(
|
182
|
-
|
231
|
+
atomic_binary_2dim_shared(
|
232
|
+
ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False
|
233
|
+
)
|
183
234
|
|
184
235
|
|
185
236
|
def atomic_sub(ary):
|
186
|
-
atomic_binary_1dim_shared(
|
187
|
-
|
237
|
+
atomic_binary_1dim_shared(
|
238
|
+
ary, ary, 1, uint32, 32, cuda.atomic.sub, atomic_cast_none, 0, False
|
239
|
+
)
|
188
240
|
|
189
241
|
|
190
242
|
def atomic_sub2(ary):
|
191
|
-
atomic_binary_2dim_shared(
|
192
|
-
|
243
|
+
atomic_binary_2dim_shared(
|
244
|
+
ary, 1, uint32, (4, 8), cuda.atomic.sub, atomic_cast_none, False
|
245
|
+
)
|
193
246
|
|
194
247
|
|
195
248
|
def atomic_sub3(ary):
|
196
|
-
atomic_binary_2dim_shared(
|
197
|
-
|
249
|
+
atomic_binary_2dim_shared(
|
250
|
+
ary, 1, uint32, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False
|
251
|
+
)
|
198
252
|
|
199
253
|
|
200
254
|
def atomic_sub_float(ary):
|
201
|
-
atomic_binary_1dim_shared(
|
202
|
-
|
255
|
+
atomic_binary_1dim_shared(
|
256
|
+
ary,
|
257
|
+
ary,
|
258
|
+
1.0,
|
259
|
+
float32,
|
260
|
+
32,
|
261
|
+
cuda.atomic.sub,
|
262
|
+
atomic_cast_to_int,
|
263
|
+
0.0,
|
264
|
+
False,
|
265
|
+
)
|
203
266
|
|
204
267
|
|
205
268
|
def atomic_sub_float_2(ary):
|
206
|
-
atomic_binary_2dim_shared(
|
207
|
-
|
269
|
+
atomic_binary_2dim_shared(
|
270
|
+
ary, 1.0, float32, (4, 8), cuda.atomic.sub, atomic_cast_none, False
|
271
|
+
)
|
208
272
|
|
209
273
|
|
210
274
|
def atomic_sub_float_3(ary):
|
211
|
-
atomic_binary_2dim_shared(
|
212
|
-
|
275
|
+
atomic_binary_2dim_shared(
|
276
|
+
ary, 1.0, float32, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False
|
277
|
+
)
|
213
278
|
|
214
279
|
|
215
280
|
def atomic_sub_double(idx, ary):
|
216
|
-
atomic_binary_1dim_shared(
|
217
|
-
|
281
|
+
atomic_binary_1dim_shared(
|
282
|
+
ary,
|
283
|
+
idx,
|
284
|
+
1.0,
|
285
|
+
float64,
|
286
|
+
32,
|
287
|
+
cuda.atomic.sub,
|
288
|
+
atomic_cast_none,
|
289
|
+
0.0,
|
290
|
+
False,
|
291
|
+
)
|
218
292
|
|
219
293
|
|
220
294
|
def atomic_sub_double_2(ary):
|
221
|
-
atomic_binary_2dim_shared(
|
222
|
-
|
295
|
+
atomic_binary_2dim_shared(
|
296
|
+
ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_none, False
|
297
|
+
)
|
223
298
|
|
224
299
|
|
225
300
|
def atomic_sub_double_3(ary):
|
226
|
-
atomic_binary_2dim_shared(
|
227
|
-
|
301
|
+
atomic_binary_2dim_shared(
|
302
|
+
ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False
|
303
|
+
)
|
228
304
|
|
229
305
|
|
230
306
|
def atomic_sub_double_global(idx, ary):
|
@@ -232,28 +308,33 @@ def atomic_sub_double_global(idx, ary):
|
|
232
308
|
|
233
309
|
|
234
310
|
def atomic_sub_double_global_2(ary):
|
235
|
-
atomic_binary_2dim_global(
|
236
|
-
|
311
|
+
atomic_binary_2dim_global(
|
312
|
+
ary, 1.0, cuda.atomic.sub, atomic_cast_none, False
|
313
|
+
)
|
237
314
|
|
238
315
|
|
239
316
|
def atomic_sub_double_global_3(ary):
|
240
|
-
atomic_binary_2dim_shared(
|
241
|
-
|
317
|
+
atomic_binary_2dim_shared(
|
318
|
+
ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False
|
319
|
+
)
|
242
320
|
|
243
321
|
|
244
322
|
def atomic_and(ary, op2):
|
245
|
-
atomic_binary_1dim_shared(
|
246
|
-
|
323
|
+
atomic_binary_1dim_shared(
|
324
|
+
ary, ary, op2, uint32, 32, cuda.atomic.and_, atomic_cast_none, 1, False
|
325
|
+
)
|
247
326
|
|
248
327
|
|
249
328
|
def atomic_and2(ary, op2):
|
250
|
-
atomic_binary_2dim_shared(
|
251
|
-
|
329
|
+
atomic_binary_2dim_shared(
|
330
|
+
ary, op2, uint32, (4, 8), cuda.atomic.and_, atomic_cast_none, False
|
331
|
+
)
|
252
332
|
|
253
333
|
|
254
334
|
def atomic_and3(ary, op2):
|
255
|
-
atomic_binary_2dim_shared(
|
256
|
-
|
335
|
+
atomic_binary_2dim_shared(
|
336
|
+
ary, op2, uint32, (4, 8), cuda.atomic.and_, atomic_cast_to_uint64, False
|
337
|
+
)
|
257
338
|
|
258
339
|
|
259
340
|
def atomic_and_global(idx, ary, op2):
|
@@ -261,23 +342,27 @@ def atomic_and_global(idx, ary, op2):
|
|
261
342
|
|
262
343
|
|
263
344
|
def atomic_and_global_2(ary, op2):
|
264
|
-
atomic_binary_2dim_global(
|
265
|
-
|
345
|
+
atomic_binary_2dim_global(
|
346
|
+
ary, op2, cuda.atomic.and_, atomic_cast_none, False
|
347
|
+
)
|
266
348
|
|
267
349
|
|
268
350
|
def atomic_or(ary, op2):
|
269
|
-
atomic_binary_1dim_shared(
|
270
|
-
|
351
|
+
atomic_binary_1dim_shared(
|
352
|
+
ary, ary, op2, uint32, 32, cuda.atomic.or_, atomic_cast_none, 0, False
|
353
|
+
)
|
271
354
|
|
272
355
|
|
273
356
|
def atomic_or2(ary, op2):
|
274
|
-
atomic_binary_2dim_shared(
|
275
|
-
|
357
|
+
atomic_binary_2dim_shared(
|
358
|
+
ary, op2, uint32, (4, 8), cuda.atomic.or_, atomic_cast_none, False
|
359
|
+
)
|
276
360
|
|
277
361
|
|
278
362
|
def atomic_or3(ary, op2):
|
279
|
-
atomic_binary_2dim_shared(
|
280
|
-
|
363
|
+
atomic_binary_2dim_shared(
|
364
|
+
ary, op2, uint32, (4, 8), cuda.atomic.or_, atomic_cast_to_uint64, False
|
365
|
+
)
|
281
366
|
|
282
367
|
|
283
368
|
def atomic_or_global(idx, ary, op2):
|
@@ -285,23 +370,27 @@ def atomic_or_global(idx, ary, op2):
|
|
285
370
|
|
286
371
|
|
287
372
|
def atomic_or_global_2(ary, op2):
|
288
|
-
atomic_binary_2dim_global(
|
289
|
-
|
373
|
+
atomic_binary_2dim_global(
|
374
|
+
ary, op2, cuda.atomic.or_, atomic_cast_none, False
|
375
|
+
)
|
290
376
|
|
291
377
|
|
292
378
|
def atomic_xor(ary, op2):
|
293
|
-
atomic_binary_1dim_shared(
|
294
|
-
|
379
|
+
atomic_binary_1dim_shared(
|
380
|
+
ary, ary, op2, uint32, 32, cuda.atomic.xor, atomic_cast_none, 0, False
|
381
|
+
)
|
295
382
|
|
296
383
|
|
297
384
|
def atomic_xor2(ary, op2):
|
298
|
-
atomic_binary_2dim_shared(
|
299
|
-
|
385
|
+
atomic_binary_2dim_shared(
|
386
|
+
ary, op2, uint32, (4, 8), cuda.atomic.xor, atomic_cast_none, False
|
387
|
+
)
|
300
388
|
|
301
389
|
|
302
390
|
def atomic_xor3(ary, op2):
|
303
|
-
atomic_binary_2dim_shared(
|
304
|
-
|
391
|
+
atomic_binary_2dim_shared(
|
392
|
+
ary, op2, uint32, (4, 8), cuda.atomic.xor, atomic_cast_to_uint64, False
|
393
|
+
)
|
305
394
|
|
306
395
|
|
307
396
|
def atomic_xor_global(idx, ary, op2):
|
@@ -309,33 +398,39 @@ def atomic_xor_global(idx, ary, op2):
|
|
309
398
|
|
310
399
|
|
311
400
|
def atomic_xor_global_2(ary, op2):
|
312
|
-
atomic_binary_2dim_global(
|
313
|
-
|
401
|
+
atomic_binary_2dim_global(
|
402
|
+
ary, op2, cuda.atomic.xor, atomic_cast_none, False
|
403
|
+
)
|
314
404
|
|
315
405
|
|
316
406
|
def atomic_inc32(ary, idx, op2):
|
317
|
-
atomic_binary_1dim_shared2(
|
318
|
-
|
407
|
+
atomic_binary_1dim_shared2(
|
408
|
+
ary, idx, op2, uint32, 32, cuda.atomic.inc, atomic_cast_none
|
409
|
+
)
|
319
410
|
|
320
411
|
|
321
412
|
def atomic_inc64(ary, idx, op2):
|
322
|
-
atomic_binary_1dim_shared2(
|
323
|
-
|
413
|
+
atomic_binary_1dim_shared2(
|
414
|
+
ary, idx, op2, uint64, 32, cuda.atomic.inc, atomic_cast_to_int
|
415
|
+
)
|
324
416
|
|
325
417
|
|
326
418
|
def atomic_inc2_32(ary, op2):
|
327
|
-
atomic_binary_2dim_shared(
|
328
|
-
|
419
|
+
atomic_binary_2dim_shared(
|
420
|
+
ary, op2, uint32, (4, 8), cuda.atomic.inc, atomic_cast_none, False
|
421
|
+
)
|
329
422
|
|
330
423
|
|
331
424
|
def atomic_inc2_64(ary, op2):
|
332
|
-
atomic_binary_2dim_shared(
|
333
|
-
|
425
|
+
atomic_binary_2dim_shared(
|
426
|
+
ary, op2, uint64, (4, 8), cuda.atomic.inc, atomic_cast_none, False
|
427
|
+
)
|
334
428
|
|
335
429
|
|
336
430
|
def atomic_inc3(ary, op2):
|
337
|
-
atomic_binary_2dim_shared(
|
338
|
-
|
431
|
+
atomic_binary_2dim_shared(
|
432
|
+
ary, op2, uint32, (4, 8), cuda.atomic.inc, atomic_cast_to_uint64, False
|
433
|
+
)
|
339
434
|
|
340
435
|
|
341
436
|
def atomic_inc_global(idx, ary, op2):
|
@@ -343,33 +438,39 @@ def atomic_inc_global(idx, ary, op2):
|
|
343
438
|
|
344
439
|
|
345
440
|
def atomic_inc_global_2(ary, op2):
|
346
|
-
atomic_binary_2dim_global(
|
347
|
-
|
441
|
+
atomic_binary_2dim_global(
|
442
|
+
ary, op2, cuda.atomic.inc, atomic_cast_none, False
|
443
|
+
)
|
348
444
|
|
349
445
|
|
350
446
|
def atomic_dec32(ary, idx, op2):
|
351
|
-
atomic_binary_1dim_shared2(
|
352
|
-
|
447
|
+
atomic_binary_1dim_shared2(
|
448
|
+
ary, idx, op2, uint32, 32, cuda.atomic.dec, atomic_cast_none
|
449
|
+
)
|
353
450
|
|
354
451
|
|
355
452
|
def atomic_dec64(ary, idx, op2):
|
356
|
-
atomic_binary_1dim_shared2(
|
357
|
-
|
453
|
+
atomic_binary_1dim_shared2(
|
454
|
+
ary, idx, op2, uint64, 32, cuda.atomic.dec, atomic_cast_to_int
|
455
|
+
)
|
358
456
|
|
359
457
|
|
360
458
|
def atomic_dec2_32(ary, op2):
|
361
|
-
atomic_binary_2dim_shared(
|
362
|
-
|
459
|
+
atomic_binary_2dim_shared(
|
460
|
+
ary, op2, uint32, (4, 8), cuda.atomic.dec, atomic_cast_none, False
|
461
|
+
)
|
363
462
|
|
364
463
|
|
365
464
|
def atomic_dec2_64(ary, op2):
|
366
|
-
atomic_binary_2dim_shared(
|
367
|
-
|
465
|
+
atomic_binary_2dim_shared(
|
466
|
+
ary, op2, uint64, (4, 8), cuda.atomic.dec, atomic_cast_none, False
|
467
|
+
)
|
368
468
|
|
369
469
|
|
370
470
|
def atomic_dec3(ary, op2):
|
371
|
-
atomic_binary_2dim_shared(
|
372
|
-
|
471
|
+
atomic_binary_2dim_shared(
|
472
|
+
ary, op2, uint32, (4, 8), cuda.atomic.dec, atomic_cast_to_uint64, False
|
473
|
+
)
|
373
474
|
|
374
475
|
|
375
476
|
def atomic_dec_global(idx, ary, op2):
|
@@ -377,23 +478,27 @@ def atomic_dec_global(idx, ary, op2):
|
|
377
478
|
|
378
479
|
|
379
480
|
def atomic_dec_global_2(ary, op2):
|
380
|
-
atomic_binary_2dim_global(
|
381
|
-
|
481
|
+
atomic_binary_2dim_global(
|
482
|
+
ary, op2, cuda.atomic.dec, atomic_cast_none, False
|
483
|
+
)
|
382
484
|
|
383
485
|
|
384
486
|
def atomic_exch(ary, idx, op2):
|
385
|
-
atomic_binary_1dim_shared2(
|
386
|
-
|
487
|
+
atomic_binary_1dim_shared2(
|
488
|
+
ary, idx, op2, uint32, 32, cuda.atomic.exch, atomic_cast_none
|
489
|
+
)
|
387
490
|
|
388
491
|
|
389
492
|
def atomic_exch2(ary, op2):
|
390
|
-
atomic_binary_2dim_shared(
|
391
|
-
|
493
|
+
atomic_binary_2dim_shared(
|
494
|
+
ary, op2, uint32, (4, 8), cuda.atomic.exch, atomic_cast_none, False
|
495
|
+
)
|
392
496
|
|
393
497
|
|
394
498
|
def atomic_exch3(ary, op2):
|
395
|
-
atomic_binary_2dim_shared(
|
396
|
-
|
499
|
+
atomic_binary_2dim_shared(
|
500
|
+
ary, op2, uint64, (4, 8), cuda.atomic.exch, atomic_cast_none, False
|
501
|
+
)
|
397
502
|
|
398
503
|
|
399
504
|
def atomic_exch_global(idx, ary, op2):
|
@@ -401,7 +506,6 @@ def atomic_exch_global(idx, ary, op2):
|
|
401
506
|
|
402
507
|
|
403
508
|
def gen_atomic_extreme_funcs(func):
|
404
|
-
|
405
509
|
fns = dedent("""
|
406
510
|
def atomic(res, ary):
|
407
511
|
tx = cuda.threadIdx.x
|
@@ -431,21 +535,39 @@ def gen_atomic_extreme_funcs(func):
|
|
431
535
|
res[0] = smres[0]
|
432
536
|
""").format(func=func)
|
433
537
|
ld = {}
|
434
|
-
exec(fns, {
|
435
|
-
return (
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
(
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
538
|
+
exec(fns, {"cuda": cuda, "float64": float64, "uint64": uint64}, ld)
|
539
|
+
return (
|
540
|
+
ld["atomic"],
|
541
|
+
ld["atomic_double_normalizedindex"],
|
542
|
+
ld["atomic_double_oneindex"],
|
543
|
+
ld["atomic_double_shared"],
|
544
|
+
)
|
545
|
+
|
546
|
+
|
547
|
+
(
|
548
|
+
atomic_max,
|
549
|
+
atomic_max_double_normalizedindex,
|
550
|
+
atomic_max_double_oneindex,
|
551
|
+
atomic_max_double_shared,
|
552
|
+
) = gen_atomic_extreme_funcs("cuda.atomic.max")
|
553
|
+
(
|
554
|
+
atomic_min,
|
555
|
+
atomic_min_double_normalizedindex,
|
556
|
+
atomic_min_double_oneindex,
|
557
|
+
atomic_min_double_shared,
|
558
|
+
) = gen_atomic_extreme_funcs("cuda.atomic.min")
|
559
|
+
(
|
560
|
+
atomic_nanmax,
|
561
|
+
atomic_nanmax_double_normalizedindex,
|
562
|
+
atomic_nanmax_double_oneindex,
|
563
|
+
atomic_nanmax_double_shared,
|
564
|
+
) = gen_atomic_extreme_funcs("cuda.atomic.nanmax")
|
565
|
+
(
|
566
|
+
atomic_nanmin,
|
567
|
+
atomic_nanmin_double_normalizedindex,
|
568
|
+
atomic_nanmin_double_oneindex,
|
569
|
+
atomic_nanmin_double_shared,
|
570
|
+
) = gen_atomic_extreme_funcs("cuda.atomic.nanmin")
|
449
571
|
|
450
572
|
|
451
573
|
def atomic_compare_and_swap(res, old, ary, fill_val):
|
@@ -476,10 +598,10 @@ class TestCudaAtomics(CUDATestCase):
|
|
476
598
|
ary_wrap = ary.copy()
|
477
599
|
orig = ary.copy()
|
478
600
|
|
479
|
-
cuda_atomic_add = cuda.jit(
|
601
|
+
cuda_atomic_add = cuda.jit("void(uint32[:])")(atomic_add)
|
480
602
|
cuda_atomic_add[1, 32](ary)
|
481
603
|
|
482
|
-
cuda_atomic_add_wrap = cuda.jit(
|
604
|
+
cuda_atomic_add_wrap = cuda.jit("void(uint32[:])")(atomic_add_wrap)
|
483
605
|
cuda_atomic_add_wrap[1, 32](ary_wrap)
|
484
606
|
|
485
607
|
gold = np.zeros(32, dtype=np.uint32)
|
@@ -494,10 +616,10 @@ class TestCudaAtomics(CUDATestCase):
|
|
494
616
|
ary_wrap = ary.copy()
|
495
617
|
orig = ary.copy()
|
496
618
|
|
497
|
-
cuda_atomic_add2 = cuda.jit(
|
619
|
+
cuda_atomic_add2 = cuda.jit("void(uint32[:,:])")(atomic_add2)
|
498
620
|
cuda_atomic_add2[1, (4, 8)](ary)
|
499
621
|
|
500
|
-
cuda_atomic_add2_wrap = cuda.jit(
|
622
|
+
cuda_atomic_add2_wrap = cuda.jit("void(uint32[:,:])")(atomic_add2_wrap)
|
501
623
|
cuda_atomic_add2_wrap[1, (4, 8)](ary_wrap)
|
502
624
|
|
503
625
|
self.assertTrue(np.all(ary == orig + 1))
|
@@ -506,7 +628,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
506
628
|
def test_atomic_add3(self):
|
507
629
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
508
630
|
orig = ary.copy()
|
509
|
-
cuda_atomic_add3 = cuda.jit(
|
631
|
+
cuda_atomic_add3 = cuda.jit("void(uint32[:,:])")(atomic_add3)
|
510
632
|
cuda_atomic_add3[1, (4, 8)](ary)
|
511
633
|
|
512
634
|
self.assertTrue(np.all(ary == orig + 1))
|
@@ -516,10 +638,10 @@ class TestCudaAtomics(CUDATestCase):
|
|
516
638
|
ary_wrap = ary.copy()
|
517
639
|
orig = ary.copy().astype(np.intp)
|
518
640
|
|
519
|
-
cuda_atomic_add_float = cuda.jit(
|
641
|
+
cuda_atomic_add_float = cuda.jit("void(float32[:])")(atomic_add_float)
|
520
642
|
cuda_atomic_add_float[1, 32](ary)
|
521
643
|
|
522
|
-
add_float_wrap = cuda.jit(
|
644
|
+
add_float_wrap = cuda.jit("void(float32[:])")(atomic_add_float_wrap)
|
523
645
|
add_float_wrap[1, 32](ary_wrap)
|
524
646
|
|
525
647
|
gold = np.zeros(32, dtype=np.uint32)
|
@@ -534,10 +656,10 @@ class TestCudaAtomics(CUDATestCase):
|
|
534
656
|
ary_wrap = ary.copy()
|
535
657
|
orig = ary.copy()
|
536
658
|
|
537
|
-
cuda_atomic_add2 = cuda.jit(
|
659
|
+
cuda_atomic_add2 = cuda.jit("void(float32[:,:])")(atomic_add_float_2)
|
538
660
|
cuda_atomic_add2[1, (4, 8)](ary)
|
539
661
|
|
540
|
-
cuda_func_wrap = cuda.jit(
|
662
|
+
cuda_func_wrap = cuda.jit("void(float32[:,:])")(atomic_add_float_2_wrap)
|
541
663
|
cuda_func_wrap[1, (4, 8)](ary_wrap)
|
542
664
|
|
543
665
|
self.assertTrue(np.all(ary == orig + 1))
|
@@ -546,7 +668,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
546
668
|
def test_atomic_add_float_3(self):
|
547
669
|
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
|
548
670
|
orig = ary.copy()
|
549
|
-
cuda_atomic_add3 = cuda.jit(
|
671
|
+
cuda_atomic_add3 = cuda.jit("void(float32[:,:])")(atomic_add_float_3)
|
550
672
|
cuda_atomic_add3[1, (4, 8)](ary)
|
551
673
|
|
552
674
|
self.assertTrue(np.all(ary == orig + 1))
|
@@ -561,24 +683,24 @@ class TestCudaAtomics(CUDATestCase):
|
|
561
683
|
inst = "(red|atom)"
|
562
684
|
|
563
685
|
if shared:
|
564
|
-
inst = f
|
686
|
+
inst = f"{inst}\\.shared"
|
565
687
|
|
566
|
-
self.assertRegex(asm, f
|
688
|
+
self.assertRegex(asm, f"{inst}.add.f64", asm)
|
567
689
|
else:
|
568
690
|
if shared:
|
569
|
-
self.assertIn(
|
691
|
+
self.assertIn("atom.shared.cas.b64", asm)
|
570
692
|
else:
|
571
|
-
self.assertIn(
|
693
|
+
self.assertIn("atom.cas.b64", asm)
|
572
694
|
|
573
695
|
def test_atomic_add_double(self):
|
574
696
|
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
|
575
697
|
ary = np.zeros(32, np.float64)
|
576
698
|
ary_wrap = ary.copy()
|
577
699
|
|
578
|
-
cuda_fn = cuda.jit(
|
700
|
+
cuda_fn = cuda.jit("void(int64[:], float64[:])")(atomic_add_double)
|
579
701
|
cuda_fn[1, 32](idx, ary)
|
580
702
|
|
581
|
-
wrap_fn = cuda.jit(
|
703
|
+
wrap_fn = cuda.jit("void(int64[:], float64[:])")(atomic_add_double_wrap)
|
582
704
|
wrap_fn[1, 32](idx, ary_wrap)
|
583
705
|
|
584
706
|
gold = np.zeros(32, dtype=np.uint32)
|
@@ -595,10 +717,10 @@ class TestCudaAtomics(CUDATestCase):
|
|
595
717
|
ary_wrap = ary.copy()
|
596
718
|
orig = ary.copy()
|
597
719
|
|
598
|
-
cuda_fn = cuda.jit(
|
720
|
+
cuda_fn = cuda.jit("void(float64[:,:])")(atomic_add_double_2)
|
599
721
|
cuda_fn[1, (4, 8)](ary)
|
600
722
|
|
601
|
-
cuda_fn_wrap = cuda.jit(
|
723
|
+
cuda_fn_wrap = cuda.jit("void(float64[:,:])")(atomic_add_double_2_wrap)
|
602
724
|
cuda_fn_wrap[1, (4, 8)](ary_wrap)
|
603
725
|
|
604
726
|
np.testing.assert_equal(ary, orig + 1)
|
@@ -609,7 +731,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
609
731
|
def test_atomic_add_double_3(self):
|
610
732
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
611
733
|
orig = ary.copy()
|
612
|
-
cuda_func = cuda.jit(
|
734
|
+
cuda_func = cuda.jit("void(float64[:,:])")(atomic_add_double_3)
|
613
735
|
cuda_func[1, (4, 8)](ary)
|
614
736
|
|
615
737
|
np.testing.assert_equal(ary, orig + 1)
|
@@ -620,7 +742,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
620
742
|
ary = np.zeros(32, np.float64)
|
621
743
|
ary_wrap = ary.copy()
|
622
744
|
|
623
|
-
sig =
|
745
|
+
sig = "void(int64[:], float64[:])"
|
624
746
|
cuda_func = cuda.jit(sig)(atomic_add_double_global)
|
625
747
|
wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_wrap)
|
626
748
|
|
@@ -641,7 +763,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
641
763
|
ary_wrap = ary.copy()
|
642
764
|
orig = ary.copy()
|
643
765
|
|
644
|
-
sig =
|
766
|
+
sig = "void(float64[:,:])"
|
645
767
|
cuda_func = cuda.jit(sig)(atomic_add_double_global_2)
|
646
768
|
wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_2_wrap)
|
647
769
|
|
@@ -656,7 +778,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
656
778
|
def test_atomic_add_double_global_3(self):
|
657
779
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
658
780
|
orig = ary.copy()
|
659
|
-
cuda_func = cuda.jit(
|
781
|
+
cuda_func = cuda.jit("void(float64[:,:])")(atomic_add_double_global_3)
|
660
782
|
cuda_func[1, (4, 8)](ary)
|
661
783
|
|
662
784
|
np.testing.assert_equal(ary, orig + 1)
|
@@ -665,7 +787,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
665
787
|
def test_atomic_sub(self):
|
666
788
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
667
789
|
orig = ary.copy()
|
668
|
-
cuda_atomic_sub = cuda.jit(
|
790
|
+
cuda_atomic_sub = cuda.jit("void(uint32[:])")(atomic_sub)
|
669
791
|
cuda_atomic_sub[1, 32](ary)
|
670
792
|
|
671
793
|
gold = np.zeros(32, dtype=np.uint32)
|
@@ -677,21 +799,21 @@ class TestCudaAtomics(CUDATestCase):
|
|
677
799
|
def test_atomic_sub2(self):
|
678
800
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
679
801
|
orig = ary.copy()
|
680
|
-
cuda_atomic_sub2 = cuda.jit(
|
802
|
+
cuda_atomic_sub2 = cuda.jit("void(uint32[:,:])")(atomic_sub2)
|
681
803
|
cuda_atomic_sub2[1, (4, 8)](ary)
|
682
804
|
self.assertTrue(np.all(ary == orig - 1))
|
683
805
|
|
684
806
|
def test_atomic_sub3(self):
|
685
807
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
686
808
|
orig = ary.copy()
|
687
|
-
cuda_atomic_sub3 = cuda.jit(
|
809
|
+
cuda_atomic_sub3 = cuda.jit("void(uint32[:,:])")(atomic_sub3)
|
688
810
|
cuda_atomic_sub3[1, (4, 8)](ary)
|
689
811
|
self.assertTrue(np.all(ary == orig - 1))
|
690
812
|
|
691
813
|
def test_atomic_sub_float(self):
|
692
814
|
ary = np.random.randint(0, 32, size=32).astype(np.float32)
|
693
815
|
orig = ary.copy().astype(np.intp)
|
694
|
-
cuda_atomic_sub_float = cuda.jit(
|
816
|
+
cuda_atomic_sub_float = cuda.jit("void(float32[:])")(atomic_sub_float)
|
695
817
|
cuda_atomic_sub_float[1, 32](ary)
|
696
818
|
|
697
819
|
gold = np.zeros(32, dtype=np.float32)
|
@@ -703,21 +825,21 @@ class TestCudaAtomics(CUDATestCase):
|
|
703
825
|
def test_atomic_sub_float_2(self):
|
704
826
|
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
|
705
827
|
orig = ary.copy()
|
706
|
-
cuda_atomic_sub2 = cuda.jit(
|
828
|
+
cuda_atomic_sub2 = cuda.jit("void(float32[:,:])")(atomic_sub_float_2)
|
707
829
|
cuda_atomic_sub2[1, (4, 8)](ary)
|
708
830
|
self.assertTrue(np.all(ary == orig - 1))
|
709
831
|
|
710
832
|
def test_atomic_sub_float_3(self):
|
711
833
|
ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
|
712
834
|
orig = ary.copy()
|
713
|
-
cuda_atomic_sub3 = cuda.jit(
|
835
|
+
cuda_atomic_sub3 = cuda.jit("void(float32[:,:])")(atomic_sub_float_3)
|
714
836
|
cuda_atomic_sub3[1, (4, 8)](ary)
|
715
837
|
self.assertTrue(np.all(ary == orig - 1))
|
716
838
|
|
717
839
|
def test_atomic_sub_double(self):
|
718
840
|
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
|
719
841
|
ary = np.zeros(32, np.float64)
|
720
|
-
cuda_func = cuda.jit(
|
842
|
+
cuda_func = cuda.jit("void(int64[:], float64[:])")(atomic_sub_double)
|
721
843
|
cuda_func[1, 32](idx, ary)
|
722
844
|
|
723
845
|
gold = np.zeros(32, dtype=np.float64)
|
@@ -729,21 +851,21 @@ class TestCudaAtomics(CUDATestCase):
|
|
729
851
|
def test_atomic_sub_double_2(self):
|
730
852
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
731
853
|
orig = ary.copy()
|
732
|
-
cuda_func = cuda.jit(
|
854
|
+
cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_2)
|
733
855
|
cuda_func[1, (4, 8)](ary)
|
734
856
|
np.testing.assert_equal(ary, orig - 1)
|
735
857
|
|
736
858
|
def test_atomic_sub_double_3(self):
|
737
859
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
738
860
|
orig = ary.copy()
|
739
|
-
cuda_func = cuda.jit(
|
861
|
+
cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_3)
|
740
862
|
cuda_func[1, (4, 8)](ary)
|
741
863
|
np.testing.assert_equal(ary, orig - 1)
|
742
864
|
|
743
865
|
def test_atomic_sub_double_global(self):
|
744
866
|
idx = np.random.randint(0, 32, size=32, dtype=np.int64)
|
745
867
|
ary = np.zeros(32, np.float64)
|
746
|
-
sig =
|
868
|
+
sig = "void(int64[:], float64[:])"
|
747
869
|
cuda_func = cuda.jit(sig)(atomic_sub_double_global)
|
748
870
|
cuda_func[1, 32](idx, ary)
|
749
871
|
|
@@ -756,14 +878,14 @@ class TestCudaAtomics(CUDATestCase):
|
|
756
878
|
def test_atomic_sub_double_global_2(self):
|
757
879
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
758
880
|
orig = ary.copy()
|
759
|
-
cuda_func = cuda.jit(
|
881
|
+
cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_global_2)
|
760
882
|
cuda_func[1, (4, 8)](ary)
|
761
883
|
np.testing.assert_equal(ary, orig - 1)
|
762
884
|
|
763
885
|
def test_atomic_sub_double_global_3(self):
|
764
886
|
ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
|
765
887
|
orig = ary.copy()
|
766
|
-
cuda_func = cuda.jit(
|
888
|
+
cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_global_3)
|
767
889
|
cuda_func[1, (4, 8)](ary)
|
768
890
|
np.testing.assert_equal(ary, orig - 1)
|
769
891
|
|
@@ -771,7 +893,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
771
893
|
rand_const = np.random.randint(500)
|
772
894
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
773
895
|
orig = ary.copy()
|
774
|
-
cuda_func = cuda.jit(
|
896
|
+
cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_and)
|
775
897
|
cuda_func[1, 32](ary, rand_const)
|
776
898
|
|
777
899
|
gold = ary.copy()
|
@@ -784,7 +906,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
784
906
|
rand_const = np.random.randint(500)
|
785
907
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
786
908
|
orig = ary.copy()
|
787
|
-
cuda_atomic_and2 = cuda.jit(
|
909
|
+
cuda_atomic_and2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_and2)
|
788
910
|
cuda_atomic_and2[1, (4, 8)](ary, rand_const)
|
789
911
|
self.assertTrue(np.all(ary == orig & rand_const))
|
790
912
|
|
@@ -792,7 +914,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
792
914
|
rand_const = np.random.randint(500)
|
793
915
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
794
916
|
orig = ary.copy()
|
795
|
-
cuda_atomic_and3 = cuda.jit(
|
917
|
+
cuda_atomic_and3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_and3)
|
796
918
|
cuda_atomic_and3[1, (4, 8)](ary, rand_const)
|
797
919
|
self.assertTrue(np.all(ary == orig & rand_const))
|
798
920
|
|
@@ -800,7 +922,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
800
922
|
rand_const = np.random.randint(500)
|
801
923
|
idx = np.random.randint(0, 32, size=32, dtype=np.int32)
|
802
924
|
ary = np.random.randint(0, 32, size=32, dtype=np.int32)
|
803
|
-
sig =
|
925
|
+
sig = "void(int32[:], int32[:], int32)"
|
804
926
|
cuda_func = cuda.jit(sig)(atomic_and_global)
|
805
927
|
cuda_func[1, 32](idx, ary, rand_const)
|
806
928
|
|
@@ -814,7 +936,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
814
936
|
rand_const = np.random.randint(500)
|
815
937
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
816
938
|
orig = ary.copy()
|
817
|
-
cuda_func = cuda.jit(
|
939
|
+
cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_and_global_2)
|
818
940
|
cuda_func[1, (4, 8)](ary, rand_const)
|
819
941
|
np.testing.assert_equal(ary, orig & rand_const)
|
820
942
|
|
@@ -822,7 +944,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
822
944
|
rand_const = np.random.randint(500)
|
823
945
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
824
946
|
orig = ary.copy()
|
825
|
-
cuda_func = cuda.jit(
|
947
|
+
cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_or)
|
826
948
|
cuda_func[1, 32](ary, rand_const)
|
827
949
|
|
828
950
|
gold = np.zeros(32, dtype=np.uint32)
|
@@ -835,7 +957,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
835
957
|
rand_const = np.random.randint(500)
|
836
958
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
837
959
|
orig = ary.copy()
|
838
|
-
cuda_atomic_and2 = cuda.jit(
|
960
|
+
cuda_atomic_and2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_or2)
|
839
961
|
cuda_atomic_and2[1, (4, 8)](ary, rand_const)
|
840
962
|
self.assertTrue(np.all(ary == orig | rand_const))
|
841
963
|
|
@@ -843,7 +965,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
843
965
|
rand_const = np.random.randint(500)
|
844
966
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
845
967
|
orig = ary.copy()
|
846
|
-
cuda_atomic_and3 = cuda.jit(
|
968
|
+
cuda_atomic_and3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_or3)
|
847
969
|
cuda_atomic_and3[1, (4, 8)](ary, rand_const)
|
848
970
|
self.assertTrue(np.all(ary == orig | rand_const))
|
849
971
|
|
@@ -851,7 +973,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
851
973
|
rand_const = np.random.randint(500)
|
852
974
|
idx = np.random.randint(0, 32, size=32, dtype=np.int32)
|
853
975
|
ary = np.random.randint(0, 32, size=32, dtype=np.int32)
|
854
|
-
sig =
|
976
|
+
sig = "void(int32[:], int32[:], int32)"
|
855
977
|
cuda_func = cuda.jit(sig)(atomic_or_global)
|
856
978
|
cuda_func[1, 32](idx, ary, rand_const)
|
857
979
|
|
@@ -865,7 +987,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
865
987
|
rand_const = np.random.randint(500)
|
866
988
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
867
989
|
orig = ary.copy()
|
868
|
-
cuda_func = cuda.jit(
|
990
|
+
cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_or_global_2)
|
869
991
|
cuda_func[1, (4, 8)](ary, rand_const)
|
870
992
|
np.testing.assert_equal(ary, orig | rand_const)
|
871
993
|
|
@@ -873,7 +995,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
873
995
|
rand_const = np.random.randint(500)
|
874
996
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
875
997
|
orig = ary.copy()
|
876
|
-
cuda_func = cuda.jit(
|
998
|
+
cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_xor)
|
877
999
|
cuda_func[1, 32](ary, rand_const)
|
878
1000
|
|
879
1001
|
gold = np.zeros(32, dtype=np.uint32)
|
@@ -886,7 +1008,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
886
1008
|
rand_const = np.random.randint(500)
|
887
1009
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
888
1010
|
orig = ary.copy()
|
889
|
-
cuda_atomic_xor2 = cuda.jit(
|
1011
|
+
cuda_atomic_xor2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor2)
|
890
1012
|
cuda_atomic_xor2[1, (4, 8)](ary, rand_const)
|
891
1013
|
self.assertTrue(np.all(ary == orig ^ rand_const))
|
892
1014
|
|
@@ -894,7 +1016,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
894
1016
|
rand_const = np.random.randint(500)
|
895
1017
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
896
1018
|
orig = ary.copy()
|
897
|
-
cuda_atomic_xor3 = cuda.jit(
|
1019
|
+
cuda_atomic_xor3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor3)
|
898
1020
|
cuda_atomic_xor3[1, (4, 8)](ary, rand_const)
|
899
1021
|
self.assertTrue(np.all(ary == orig ^ rand_const))
|
900
1022
|
|
@@ -903,7 +1025,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
903
1025
|
idx = np.random.randint(0, 32, size=32, dtype=np.int32)
|
904
1026
|
ary = np.random.randint(0, 32, size=32, dtype=np.int32)
|
905
1027
|
gold = ary.copy()
|
906
|
-
sig =
|
1028
|
+
sig = "void(int32[:], int32[:], int32)"
|
907
1029
|
cuda_func = cuda.jit(sig)(atomic_xor_global)
|
908
1030
|
cuda_func[1, 32](idx, ary, rand_const)
|
909
1031
|
|
@@ -916,12 +1038,12 @@ class TestCudaAtomics(CUDATestCase):
|
|
916
1038
|
rand_const = np.random.randint(500)
|
917
1039
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
918
1040
|
orig = ary.copy()
|
919
|
-
cuda_func = cuda.jit(
|
1041
|
+
cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor_global_2)
|
920
1042
|
cuda_func[1, (4, 8)](ary, rand_const)
|
921
1043
|
np.testing.assert_equal(ary, orig ^ rand_const)
|
922
1044
|
|
923
1045
|
def inc_dec_1dim_setup(self, dtype):
|
924
|
-
rconst = np.random.randint(32,
|
1046
|
+
rconst = np.random.randint(32, dtype=dtype)
|
925
1047
|
rary = np.random.randint(0, 32, size=32).astype(dtype)
|
926
1048
|
ary_idx = np.arange(32, dtype=dtype)
|
927
1049
|
return rconst, rary, ary_idx
|
@@ -951,131 +1073,141 @@ class TestCudaAtomics(CUDATestCase):
|
|
951
1073
|
|
952
1074
|
def test_atomic_inc_32(self):
|
953
1075
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
|
954
|
-
sig =
|
1076
|
+
sig = "void(uint32[:], uint32[:], uint32)"
|
955
1077
|
self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc32)
|
956
1078
|
|
957
1079
|
def test_atomic_inc_64(self):
|
958
1080
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
|
959
|
-
sig =
|
1081
|
+
sig = "void(uint64[:], uint64[:], uint64)"
|
960
1082
|
self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc64)
|
961
1083
|
|
962
1084
|
def test_atomic_inc2_32(self):
|
963
1085
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
964
|
-
sig =
|
965
|
-
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_32)
|
1086
|
+
sig = "void(uint32[:,:], uint32)"
|
1087
|
+
self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc2_32)
|
966
1088
|
|
967
1089
|
def test_atomic_inc2_64(self):
|
968
1090
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
|
969
|
-
sig =
|
970
|
-
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_64)
|
1091
|
+
sig = "void(uint64[:,:], uint64)"
|
1092
|
+
self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc2_64)
|
971
1093
|
|
972
1094
|
def test_atomic_inc3(self):
|
973
1095
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
974
|
-
sig =
|
975
|
-
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc3)
|
1096
|
+
sig = "void(uint32[:,:], uint32)"
|
1097
|
+
self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc3)
|
976
1098
|
|
977
1099
|
def test_atomic_inc_global_32(self):
|
978
1100
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
|
979
|
-
sig =
|
980
|
-
self.check_inc_index2(
|
981
|
-
|
1101
|
+
sig = "void(uint32[:], uint32[:], uint32)"
|
1102
|
+
self.check_inc_index2(
|
1103
|
+
ary, idx, rand_const, sig, 1, 32, atomic_inc_global
|
1104
|
+
)
|
982
1105
|
|
983
1106
|
def test_atomic_inc_global_64(self):
|
984
1107
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
|
985
|
-
sig =
|
986
|
-
self.check_inc_index2(
|
987
|
-
|
1108
|
+
sig = "void(uint64[:], uint64[:], uint64)"
|
1109
|
+
self.check_inc_index2(
|
1110
|
+
ary, idx, rand_const, sig, 1, 32, atomic_inc_global
|
1111
|
+
)
|
988
1112
|
|
989
1113
|
def test_atomic_inc_global_2_32(self):
|
990
1114
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
991
|
-
sig =
|
992
|
-
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2)
|
1115
|
+
sig = "void(uint32[:,:], uint32)"
|
1116
|
+
self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc_global_2)
|
993
1117
|
|
994
1118
|
def test_atomic_inc_global_2_64(self):
|
995
1119
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
|
996
|
-
sig =
|
997
|
-
self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2)
|
1120
|
+
sig = "void(uint64[:,:], uint64)"
|
1121
|
+
self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc_global_2)
|
998
1122
|
|
999
1123
|
def check_dec_index(self, ary, idx, rconst, sig, nblocks, blksize, func):
|
1000
1124
|
orig = ary.copy()
|
1001
1125
|
cuda_func = cuda.jit(sig)(func)
|
1002
1126
|
cuda_func[nblocks, blksize](ary, idx, rconst)
|
1003
|
-
np.testing.assert_equal(
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1127
|
+
np.testing.assert_equal(
|
1128
|
+
ary,
|
1129
|
+
np.where(
|
1130
|
+
orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1)
|
1131
|
+
),
|
1132
|
+
)
|
1007
1133
|
|
1008
1134
|
def check_dec_index2(self, ary, idx, rconst, sig, nblocks, blksize, func):
|
1009
1135
|
orig = ary.copy()
|
1010
1136
|
cuda_func = cuda.jit(sig)(func)
|
1011
1137
|
cuda_func[nblocks, blksize](idx, ary, rconst)
|
1012
|
-
np.testing.assert_equal(
|
1013
|
-
|
1014
|
-
|
1015
|
-
|
1138
|
+
np.testing.assert_equal(
|
1139
|
+
ary,
|
1140
|
+
np.where(
|
1141
|
+
orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1)
|
1142
|
+
),
|
1143
|
+
)
|
1016
1144
|
|
1017
1145
|
def check_dec(self, ary, rconst, sig, nblocks, blksize, func):
|
1018
1146
|
orig = ary.copy()
|
1019
1147
|
cuda_func = cuda.jit(sig)(func)
|
1020
1148
|
cuda_func[nblocks, blksize](ary, rconst)
|
1021
|
-
np.testing.assert_equal(
|
1022
|
-
|
1023
|
-
|
1024
|
-
|
1149
|
+
np.testing.assert_equal(
|
1150
|
+
ary,
|
1151
|
+
np.where(
|
1152
|
+
orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1)
|
1153
|
+
),
|
1154
|
+
)
|
1025
1155
|
|
1026
1156
|
def test_atomic_dec_32(self):
|
1027
1157
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
|
1028
|
-
sig =
|
1158
|
+
sig = "void(uint32[:], uint32[:], uint32)"
|
1029
1159
|
self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec32)
|
1030
1160
|
|
1031
1161
|
def test_atomic_dec_64(self):
|
1032
1162
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
|
1033
|
-
sig =
|
1163
|
+
sig = "void(uint64[:], uint64[:], uint64)"
|
1034
1164
|
self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec64)
|
1035
1165
|
|
1036
1166
|
def test_atomic_dec2_32(self):
|
1037
1167
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
1038
|
-
sig =
|
1039
|
-
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_32)
|
1168
|
+
sig = "void(uint32[:,:], uint32)"
|
1169
|
+
self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec2_32)
|
1040
1170
|
|
1041
1171
|
def test_atomic_dec2_64(self):
|
1042
1172
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
|
1043
|
-
sig =
|
1044
|
-
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_64)
|
1173
|
+
sig = "void(uint64[:,:], uint64)"
|
1174
|
+
self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec2_64)
|
1045
1175
|
|
1046
1176
|
def test_atomic_dec3_new(self):
|
1047
1177
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
1048
|
-
sig =
|
1049
|
-
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec3)
|
1178
|
+
sig = "void(uint32[:,:], uint32)"
|
1179
|
+
self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec3)
|
1050
1180
|
|
1051
1181
|
def test_atomic_dec_global_32(self):
|
1052
1182
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
|
1053
|
-
sig =
|
1054
|
-
self.check_dec_index2(
|
1055
|
-
|
1183
|
+
sig = "void(uint32[:], uint32[:], uint32)"
|
1184
|
+
self.check_dec_index2(
|
1185
|
+
ary, idx, rand_const, sig, 1, 32, atomic_dec_global
|
1186
|
+
)
|
1056
1187
|
|
1057
1188
|
def test_atomic_dec_global_64(self):
|
1058
1189
|
rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
|
1059
|
-
sig =
|
1060
|
-
self.check_dec_index2(
|
1061
|
-
|
1190
|
+
sig = "void(uint64[:], uint64[:], uint64)"
|
1191
|
+
self.check_dec_index2(
|
1192
|
+
ary, idx, rand_const, sig, 1, 32, atomic_dec_global
|
1193
|
+
)
|
1062
1194
|
|
1063
1195
|
def test_atomic_dec_global2_32(self):
|
1064
1196
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
|
1065
|
-
sig =
|
1066
|
-
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2)
|
1197
|
+
sig = "void(uint32[:,:], uint32)"
|
1198
|
+
self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec_global_2)
|
1067
1199
|
|
1068
1200
|
def test_atomic_dec_global2_64(self):
|
1069
1201
|
rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
|
1070
|
-
sig =
|
1071
|
-
self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2)
|
1202
|
+
sig = "void(uint64[:,:], uint64)"
|
1203
|
+
self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec_global_2)
|
1072
1204
|
|
1073
1205
|
def test_atomic_exch(self):
|
1074
1206
|
rand_const = np.random.randint(50, 100, dtype=np.uint32)
|
1075
1207
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32)
|
1076
1208
|
idx = np.arange(32, dtype=np.uint32)
|
1077
1209
|
|
1078
|
-
cuda_func = cuda.jit(
|
1210
|
+
cuda_func = cuda.jit("void(uint32[:], uint32[:], uint32)")(atomic_exch)
|
1079
1211
|
cuda_func[1, 32](ary, idx, rand_const)
|
1080
1212
|
|
1081
1213
|
np.testing.assert_equal(ary, rand_const)
|
@@ -1084,7 +1216,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
1084
1216
|
rand_const = np.random.randint(50, 100, dtype=np.uint32)
|
1085
1217
|
ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
|
1086
1218
|
|
1087
|
-
cuda_func = cuda.jit(
|
1219
|
+
cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_exch2)
|
1088
1220
|
cuda_func[1, (4, 8)](ary, rand_const)
|
1089
1221
|
np.testing.assert_equal(ary, rand_const)
|
1090
1222
|
|
@@ -1092,7 +1224,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
1092
1224
|
rand_const = np.random.randint(50, 100, dtype=np.uint64)
|
1093
1225
|
ary = np.random.randint(0, 32, size=32).astype(np.uint64).reshape(4, 8)
|
1094
1226
|
|
1095
|
-
cuda_func = cuda.jit(
|
1227
|
+
cuda_func = cuda.jit("void(uint64[:,:], uint64)")(atomic_exch3)
|
1096
1228
|
cuda_func[1, (4, 8)](ary, rand_const)
|
1097
1229
|
np.testing.assert_equal(ary, rand_const)
|
1098
1230
|
|
@@ -1101,7 +1233,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
1101
1233
|
idx = np.arange(32, dtype=np.uint32)
|
1102
1234
|
ary = np.random.randint(0, 32, size=32, dtype=np.uint32)
|
1103
1235
|
|
1104
|
-
sig =
|
1236
|
+
sig = "void(uint32[:], uint32[:], uint32)"
|
1105
1237
|
cuda_func = cuda.jit(sig)(atomic_exch_global)
|
1106
1238
|
cuda_func[1, 32](idx, ary, rand_const)
|
1107
1239
|
np.testing.assert_equal(ary, rand_const)
|
@@ -1135,8 +1267,9 @@ class TestCudaAtomics(CUDATestCase):
|
|
1135
1267
|
def test_atomic_max_double_normalizedindex(self):
|
1136
1268
|
vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
|
1137
1269
|
res = np.zeros(1, np.float64)
|
1138
|
-
cuda_func = cuda.jit(
|
1139
|
-
atomic_max_double_normalizedindex
|
1270
|
+
cuda_func = cuda.jit("void(float64[:], float64[:,:])")(
|
1271
|
+
atomic_max_double_normalizedindex
|
1272
|
+
)
|
1140
1273
|
cuda_func[32, 32](res, vals)
|
1141
1274
|
|
1142
1275
|
gold = np.max(vals)
|
@@ -1145,8 +1278,9 @@ class TestCudaAtomics(CUDATestCase):
|
|
1145
1278
|
def test_atomic_max_double_oneindex(self):
|
1146
1279
|
vals = np.random.randint(0, 128, size=32).astype(np.float64)
|
1147
1280
|
res = np.zeros(1, np.float64)
|
1148
|
-
cuda_func = cuda.jit(
|
1149
|
-
atomic_max_double_oneindex
|
1281
|
+
cuda_func = cuda.jit("void(float64[:], float64[:])")(
|
1282
|
+
atomic_max_double_oneindex
|
1283
|
+
)
|
1150
1284
|
cuda_func[1, 32](res, vals)
|
1151
1285
|
|
1152
1286
|
gold = np.max(vals)
|
@@ -1182,8 +1316,9 @@ class TestCudaAtomics(CUDATestCase):
|
|
1182
1316
|
def test_atomic_min_double_normalizedindex(self):
|
1183
1317
|
vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
|
1184
1318
|
res = np.ones(1, np.float64) * 65535
|
1185
|
-
cuda_func = cuda.jit(
|
1186
|
-
atomic_min_double_normalizedindex
|
1319
|
+
cuda_func = cuda.jit("void(float64[:], float64[:,:])")(
|
1320
|
+
atomic_min_double_normalizedindex
|
1321
|
+
)
|
1187
1322
|
cuda_func[32, 32](res, vals)
|
1188
1323
|
|
1189
1324
|
gold = np.min(vals)
|
@@ -1192,8 +1327,9 @@ class TestCudaAtomics(CUDATestCase):
|
|
1192
1327
|
def test_atomic_min_double_oneindex(self):
|
1193
1328
|
vals = np.random.randint(0, 128, size=32).astype(np.float64)
|
1194
1329
|
res = np.ones(1, np.float64) * 128
|
1195
|
-
cuda_func = cuda.jit(
|
1196
|
-
atomic_min_double_oneindex
|
1330
|
+
cuda_func = cuda.jit("void(float64[:], float64[:])")(
|
1331
|
+
atomic_min_double_oneindex
|
1332
|
+
)
|
1197
1333
|
cuda_func[1, 32](res, vals)
|
1198
1334
|
|
1199
1335
|
gold = np.min(vals)
|
@@ -1211,16 +1347,15 @@ class TestCudaAtomics(CUDATestCase):
|
|
1211
1347
|
# the result will be ary[idx] for either of ary[idx] or val being NaN.
|
1212
1348
|
|
1213
1349
|
def _test_atomic_minmax_nan_location(self, func):
|
1350
|
+
cuda_func = cuda.jit("void(float64[:], float64[:,:])")(func)
|
1214
1351
|
|
1215
|
-
|
1216
|
-
|
1217
|
-
vals = np.random.randint(0, 128, size=(1,1)).astype(np.float64)
|
1352
|
+
vals = np.random.randint(0, 128, size=(1, 1)).astype(np.float64)
|
1218
1353
|
res = np.zeros(1, np.float64) + np.nan
|
1219
1354
|
cuda_func[1, 1](res, vals)
|
1220
1355
|
np.testing.assert_equal(res, [np.nan])
|
1221
1356
|
|
1222
1357
|
def _test_atomic_minmax_nan_val(self, func):
|
1223
|
-
cuda_func = cuda.jit(
|
1358
|
+
cuda_func = cuda.jit("void(float64[:], float64[:,:])")(func)
|
1224
1359
|
|
1225
1360
|
res = np.random.randint(0, 128, size=1).astype(np.float64)
|
1226
1361
|
gold = res.copy()
|
@@ -1244,7 +1379,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
1244
1379
|
def test_atomic_max_double_shared(self):
|
1245
1380
|
vals = np.random.randint(0, 32, size=32).astype(np.float64)
|
1246
1381
|
res = np.zeros(1, np.float64)
|
1247
|
-
sig =
|
1382
|
+
sig = "void(float64[:], float64[:])"
|
1248
1383
|
cuda_func = cuda.jit(sig)(atomic_max_double_shared)
|
1249
1384
|
cuda_func[1, 32](res, vals)
|
1250
1385
|
|
@@ -1254,7 +1389,7 @@ class TestCudaAtomics(CUDATestCase):
|
|
1254
1389
|
def test_atomic_min_double_shared(self):
|
1255
1390
|
vals = np.random.randint(0, 32, size=32).astype(np.float64)
|
1256
1391
|
res = np.ones(1, np.float64) * 32
|
1257
|
-
sig =
|
1392
|
+
sig = "void(float64[:], float64[:])"
|
1258
1393
|
cuda_func = cuda.jit(sig)(atomic_min_double_shared)
|
1259
1394
|
cuda_func[1, 32](res, vals)
|
1260
1395
|
|
@@ -1289,64 +1424,120 @@ class TestCudaAtomics(CUDATestCase):
|
|
1289
1424
|
np.testing.assert_array_equal(expect_out, out)
|
1290
1425
|
|
1291
1426
|
def test_atomic_compare_and_swap(self):
|
1292
|
-
self.check_cas(
|
1293
|
-
|
1427
|
+
self.check_cas(
|
1428
|
+
n=100,
|
1429
|
+
fill=-99,
|
1430
|
+
unfill=-1,
|
1431
|
+
dtype=np.int32,
|
1432
|
+
cas_func=atomic_compare_and_swap,
|
1433
|
+
)
|
1294
1434
|
|
1295
1435
|
def test_atomic_compare_and_swap2(self):
|
1296
|
-
self.check_cas(
|
1297
|
-
|
1436
|
+
self.check_cas(
|
1437
|
+
n=100,
|
1438
|
+
fill=-45,
|
1439
|
+
unfill=-1,
|
1440
|
+
dtype=np.int64,
|
1441
|
+
cas_func=atomic_compare_and_swap,
|
1442
|
+
)
|
1298
1443
|
|
1299
1444
|
def test_atomic_compare_and_swap3(self):
|
1300
1445
|
rfill = np.random.randint(50, 500, dtype=np.uint32)
|
1301
1446
|
runfill = np.random.randint(1, 25, dtype=np.uint32)
|
1302
|
-
self.check_cas(
|
1303
|
-
|
1447
|
+
self.check_cas(
|
1448
|
+
n=100,
|
1449
|
+
fill=rfill,
|
1450
|
+
unfill=runfill,
|
1451
|
+
dtype=np.uint32,
|
1452
|
+
cas_func=atomic_compare_and_swap,
|
1453
|
+
)
|
1304
1454
|
|
1305
1455
|
def test_atomic_compare_and_swap4(self):
|
1306
1456
|
rfill = np.random.randint(50, 500, dtype=np.uint64)
|
1307
1457
|
runfill = np.random.randint(1, 25, dtype=np.uint64)
|
1308
|
-
self.check_cas(
|
1309
|
-
|
1458
|
+
self.check_cas(
|
1459
|
+
n=100,
|
1460
|
+
fill=rfill,
|
1461
|
+
unfill=runfill,
|
1462
|
+
dtype=np.uint64,
|
1463
|
+
cas_func=atomic_compare_and_swap,
|
1464
|
+
)
|
1310
1465
|
|
1311
1466
|
def test_atomic_cas_1dim(self):
|
1312
|
-
self.check_cas(
|
1313
|
-
|
1467
|
+
self.check_cas(
|
1468
|
+
n=100, fill=-99, unfill=-1, dtype=np.int32, cas_func=atomic_cas_1dim
|
1469
|
+
)
|
1314
1470
|
|
1315
1471
|
def test_atomic_cas_2dim(self):
|
1316
|
-
self.check_cas(
|
1317
|
-
|
1472
|
+
self.check_cas(
|
1473
|
+
n=100,
|
1474
|
+
fill=-99,
|
1475
|
+
unfill=-1,
|
1476
|
+
dtype=np.int32,
|
1477
|
+
cas_func=atomic_cas_2dim,
|
1478
|
+
ndim=2,
|
1479
|
+
)
|
1318
1480
|
|
1319
1481
|
def test_atomic_cas2_1dim(self):
|
1320
|
-
self.check_cas(
|
1321
|
-
|
1482
|
+
self.check_cas(
|
1483
|
+
n=100, fill=-45, unfill=-1, dtype=np.int64, cas_func=atomic_cas_1dim
|
1484
|
+
)
|
1322
1485
|
|
1323
1486
|
def test_atomic_cas2_2dim(self):
|
1324
|
-
self.check_cas(
|
1325
|
-
|
1487
|
+
self.check_cas(
|
1488
|
+
n=100,
|
1489
|
+
fill=-45,
|
1490
|
+
unfill=-1,
|
1491
|
+
dtype=np.int64,
|
1492
|
+
cas_func=atomic_cas_2dim,
|
1493
|
+
ndim=2,
|
1494
|
+
)
|
1326
1495
|
|
1327
1496
|
def test_atomic_cas3_1dim(self):
|
1328
1497
|
rfill = np.random.randint(50, 500, dtype=np.uint32)
|
1329
1498
|
runfill = np.random.randint(1, 25, dtype=np.uint32)
|
1330
|
-
self.check_cas(
|
1331
|
-
|
1499
|
+
self.check_cas(
|
1500
|
+
n=100,
|
1501
|
+
fill=rfill,
|
1502
|
+
unfill=runfill,
|
1503
|
+
dtype=np.uint32,
|
1504
|
+
cas_func=atomic_cas_1dim,
|
1505
|
+
)
|
1332
1506
|
|
1333
1507
|
def test_atomic_cas3_2dim(self):
|
1334
1508
|
rfill = np.random.randint(50, 500, dtype=np.uint32)
|
1335
1509
|
runfill = np.random.randint(1, 25, dtype=np.uint32)
|
1336
|
-
self.check_cas(
|
1337
|
-
|
1510
|
+
self.check_cas(
|
1511
|
+
n=100,
|
1512
|
+
fill=rfill,
|
1513
|
+
unfill=runfill,
|
1514
|
+
dtype=np.uint32,
|
1515
|
+
cas_func=atomic_cas_2dim,
|
1516
|
+
ndim=2,
|
1517
|
+
)
|
1338
1518
|
|
1339
1519
|
def test_atomic_cas4_1dim(self):
|
1340
1520
|
rfill = np.random.randint(50, 500, dtype=np.uint64)
|
1341
1521
|
runfill = np.random.randint(1, 25, dtype=np.uint64)
|
1342
|
-
self.check_cas(
|
1343
|
-
|
1522
|
+
self.check_cas(
|
1523
|
+
n=100,
|
1524
|
+
fill=rfill,
|
1525
|
+
unfill=runfill,
|
1526
|
+
dtype=np.uint64,
|
1527
|
+
cas_func=atomic_cas_1dim,
|
1528
|
+
)
|
1344
1529
|
|
1345
1530
|
def test_atomic_cas4_2dim(self):
|
1346
1531
|
rfill = np.random.randint(50, 500, dtype=np.uint64)
|
1347
1532
|
runfill = np.random.randint(1, 25, dtype=np.uint64)
|
1348
|
-
self.check_cas(
|
1349
|
-
|
1533
|
+
self.check_cas(
|
1534
|
+
n=100,
|
1535
|
+
fill=rfill,
|
1536
|
+
unfill=runfill,
|
1537
|
+
dtype=np.uint64,
|
1538
|
+
cas_func=atomic_cas_2dim,
|
1539
|
+
ndim=2,
|
1540
|
+
)
|
1350
1541
|
|
1351
1542
|
# Tests that the atomic add, min, and max operations return the old value -
|
1352
1543
|
# in the simulator, they did not (see Issue #5458). The max and min have
|
@@ -1438,34 +1629,36 @@ class TestCudaAtomics(CUDATestCase):
|
|
1438
1629
|
np.testing.assert_equal(res, gold)
|
1439
1630
|
|
1440
1631
|
def test_atomic_nanmax_int32(self):
|
1441
|
-
self.check_atomic_nanmax(
|
1442
|
-
|
1632
|
+
self.check_atomic_nanmax(
|
1633
|
+
dtype=np.int32, lo=-65535, hi=65535, init_val=0
|
1634
|
+
)
|
1443
1635
|
|
1444
1636
|
def test_atomic_nanmax_uint32(self):
|
1445
|
-
self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535,
|
1446
|
-
init_val=0)
|
1637
|
+
self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535, init_val=0)
|
1447
1638
|
|
1448
1639
|
def test_atomic_nanmax_int64(self):
|
1449
|
-
self.check_atomic_nanmax(
|
1450
|
-
|
1640
|
+
self.check_atomic_nanmax(
|
1641
|
+
dtype=np.int64, lo=-65535, hi=65535, init_val=0
|
1642
|
+
)
|
1451
1643
|
|
1452
1644
|
def test_atomic_nanmax_uint64(self):
|
1453
|
-
self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535,
|
1454
|
-
init_val=0)
|
1645
|
+
self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535, init_val=0)
|
1455
1646
|
|
1456
1647
|
def test_atomic_nanmax_float32(self):
|
1457
|
-
self.check_atomic_nanmax(
|
1458
|
-
|
1648
|
+
self.check_atomic_nanmax(
|
1649
|
+
dtype=np.float32, lo=-65535, hi=65535, init_val=np.nan
|
1650
|
+
)
|
1459
1651
|
|
1460
1652
|
def test_atomic_nanmax_double(self):
|
1461
|
-
self.check_atomic_nanmax(
|
1462
|
-
|
1653
|
+
self.check_atomic_nanmax(
|
1654
|
+
dtype=np.float64, lo=-65535, hi=65535, init_val=np.nan
|
1655
|
+
)
|
1463
1656
|
|
1464
1657
|
def test_atomic_nanmax_double_shared(self):
|
1465
1658
|
vals = np.random.randint(0, 32, size=32).astype(np.float64)
|
1466
1659
|
vals[1::2] = np.nan
|
1467
1660
|
res = np.array([0], dtype=vals.dtype)
|
1468
|
-
sig =
|
1661
|
+
sig = "void(float64[:], float64[:])"
|
1469
1662
|
cuda_func = cuda.jit(sig)(atomic_nanmax_double_shared)
|
1470
1663
|
cuda_func[1, 32](res, vals)
|
1471
1664
|
|
@@ -1476,8 +1669,9 @@ class TestCudaAtomics(CUDATestCase):
|
|
1476
1669
|
vals = np.random.randint(0, 128, size=32).astype(np.float64)
|
1477
1670
|
vals[1::2] = np.nan
|
1478
1671
|
res = np.zeros(1, np.float64)
|
1479
|
-
cuda_func = cuda.jit(
|
1480
|
-
atomic_max_double_oneindex
|
1672
|
+
cuda_func = cuda.jit("void(float64[:], float64[:])")(
|
1673
|
+
atomic_max_double_oneindex
|
1674
|
+
)
|
1481
1675
|
cuda_func[1, 32](res, vals)
|
1482
1676
|
|
1483
1677
|
gold = np.nanmax(vals)
|
@@ -1495,34 +1689,36 @@ class TestCudaAtomics(CUDATestCase):
|
|
1495
1689
|
np.testing.assert_equal(res, gold)
|
1496
1690
|
|
1497
1691
|
def test_atomic_nanmin_int32(self):
|
1498
|
-
self.check_atomic_nanmin(
|
1499
|
-
|
1692
|
+
self.check_atomic_nanmin(
|
1693
|
+
dtype=np.int32, lo=-65535, hi=65535, init_val=0
|
1694
|
+
)
|
1500
1695
|
|
1501
1696
|
def test_atomic_nanmin_uint32(self):
|
1502
|
-
self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535,
|
1503
|
-
init_val=0)
|
1697
|
+
self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535, init_val=0)
|
1504
1698
|
|
1505
1699
|
def test_atomic_nanmin_int64(self):
|
1506
|
-
self.check_atomic_nanmin(
|
1507
|
-
|
1700
|
+
self.check_atomic_nanmin(
|
1701
|
+
dtype=np.int64, lo=-65535, hi=65535, init_val=0
|
1702
|
+
)
|
1508
1703
|
|
1509
1704
|
def test_atomic_nanmin_uint64(self):
|
1510
|
-
self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535,
|
1511
|
-
init_val=0)
|
1705
|
+
self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535, init_val=0)
|
1512
1706
|
|
1513
1707
|
def test_atomic_nanmin_float(self):
|
1514
|
-
self.check_atomic_nanmin(
|
1515
|
-
|
1708
|
+
self.check_atomic_nanmin(
|
1709
|
+
dtype=np.float32, lo=-65535, hi=65535, init_val=np.nan
|
1710
|
+
)
|
1516
1711
|
|
1517
1712
|
def test_atomic_nanmin_double(self):
|
1518
|
-
self.check_atomic_nanmin(
|
1519
|
-
|
1713
|
+
self.check_atomic_nanmin(
|
1714
|
+
dtype=np.float64, lo=-65535, hi=65535, init_val=np.nan
|
1715
|
+
)
|
1520
1716
|
|
1521
1717
|
def test_atomic_nanmin_double_shared(self):
|
1522
1718
|
vals = np.random.randint(0, 32, size=32).astype(np.float64)
|
1523
1719
|
vals[1::2] = np.nan
|
1524
1720
|
res = np.array([32], dtype=vals.dtype)
|
1525
|
-
sig =
|
1721
|
+
sig = "void(float64[:], float64[:])"
|
1526
1722
|
cuda_func = cuda.jit(sig)(atomic_nanmin_double_shared)
|
1527
1723
|
cuda_func[1, 32](res, vals)
|
1528
1724
|
|
@@ -1533,8 +1729,9 @@ class TestCudaAtomics(CUDATestCase):
|
|
1533
1729
|
vals = np.random.randint(0, 128, size=32).astype(np.float64)
|
1534
1730
|
vals[1::2] = np.nan
|
1535
1731
|
res = np.array([128], np.float64)
|
1536
|
-
cuda_func = cuda.jit(
|
1537
|
-
atomic_min_double_oneindex
|
1732
|
+
cuda_func = cuda.jit("void(float64[:], float64[:])")(
|
1733
|
+
atomic_min_double_oneindex
|
1734
|
+
)
|
1538
1735
|
cuda_func[1, 32](res, vals)
|
1539
1736
|
|
1540
1737
|
gold = np.nanmin(vals)
|
@@ -1610,5 +1807,5 @@ class TestCudaAtomics(CUDATestCase):
|
|
1610
1807
|
self._test_atomic_nan_returns_old(kernel, 11)
|
1611
1808
|
|
1612
1809
|
|
1613
|
-
if __name__ ==
|
1810
|
+
if __name__ == "__main__":
|
1614
1811
|
unittest.main()
|