numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.0.dist-info/METADATA +0 -6
- numba_cuda-0.0.0.dist-info/RECORD +0 -5
- {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py (new file)
@@ -0,0 +1,147 @@
+from __future__ import print_function
+
+import numpy as np
+
+from numba import config, cuda, int32
+from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim,
+                                skip_unless_cc_60, skip_if_cudadevrt_missing,
+                                skip_if_mvc_enabled)
+
+
+@cuda.jit
+def this_grid(A):
+    cuda.cg.this_grid()
+    A[0] = 1.0
+
+
+@cuda.jit
+def sync_group(A):
+    g = cuda.cg.this_grid()
+    g.sync()
+    A[0] = 1.0
+
+
+@cuda.jit
+def no_sync(A):
+    A[0] = cuda.grid(1)
+
+
+def sequential_rows(M):
+    # The grid writes rows one at a time. Each thread reads an element from
+    # the previous row written by its "opposite" thread.
+    #
+    # A failure to sync the grid at each row would result in an incorrect
+    # result as some threads could run ahead of threads in other blocks, or
+    # fail to see the update to the previous row from their opposite thread.
+
+    col = cuda.grid(1)
+    g = cuda.cg.this_grid()
+
+    rows = M.shape[0]
+    cols = M.shape[1]
+
+    for row in range(1, rows):
+        opposite = cols - col - 1
+        M[row, col] = M[row - 1, opposite] + 1
+        g.sync()
+
+
+@skip_if_cudadevrt_missing
+@skip_if_mvc_enabled('CG not supported with MVC')
+class TestCudaCooperativeGroups(CUDATestCase):
+    @skip_unless_cc_60
+    def test_this_grid(self):
+        A = np.full(1, fill_value=np.nan)
+        this_grid[1, 1](A)
+
+        # Ensure the kernel executed beyond the call to cuda.this_grid()
+        self.assertFalse(np.isnan(A[0]), 'Value was not set')
+
+    @skip_unless_cc_60
+    @skip_on_cudasim("Simulator doesn't differentiate between normal and "
+                     "cooperative kernels")
+    def test_this_grid_is_cooperative(self):
+        A = np.full(1, fill_value=np.nan)
+        this_grid[1, 1](A)
+
+        # this_grid should have been determined to be cooperative
+        for key, overload in this_grid.overloads.items():
+            self.assertTrue(overload.cooperative)
+
+    @skip_unless_cc_60
+    def test_sync_group(self):
+        A = np.full(1, fill_value=np.nan)
+        sync_group[1, 1](A)
+
+        # Ensure the kernel executed beyond the call to cuda.sync_group()
+        self.assertFalse(np.isnan(A[0]), 'Value was not set')
+
+    @skip_unless_cc_60
+    @skip_on_cudasim("Simulator doesn't differentiate between normal and "
+                     "cooperative kernels")
+    def test_sync_group_is_cooperative(self):
+        A = np.full(1, fill_value=np.nan)
+        sync_group[1, 1](A)
+        # sync_group should have been determined to be cooperative
+        for key, overload in sync_group.overloads.items():
+            self.assertTrue(overload.cooperative)
+
+    @skip_on_cudasim("Simulator does not implement linking")
+    def test_false_cooperative_doesnt_link_cudadevrt(self):
+        """
+        We should only mark a kernel as cooperative and link cudadevrt if the
+        kernel uses grid sync. Here we ensure that one that doesn't use grid
+        sync isn't marked as such.
+        """
+        A = np.full(1, fill_value=np.nan)
+        no_sync[1, 1](A)
+
+        for key, overload in no_sync.overloads.items():
+            self.assertFalse(overload.cooperative)
+            for link in overload._codelibrary._linking_files:
+                self.assertNotIn('cudadevrt', link)
+
+    @skip_unless_cc_60
+    def test_sync_at_matrix_row(self):
+        if config.ENABLE_CUDASIM:
+            # Use a small matrix to compute using a single block in a
+            # reasonable amount of time
+            shape = (32, 32)
+        else:
+            shape = (1024, 1024)
+        A = np.zeros(shape, dtype=np.int32)
+        blockdim = 32
+        griddim = A.shape[1] // blockdim
+
+        sig = (int32[:,::1],)
+        c_sequential_rows = cuda.jit(sig)(sequential_rows)
+
+        overload = c_sequential_rows.overloads[sig]
+        mb = overload.max_cooperative_grid_blocks(blockdim)
+        if griddim > mb:
+            unittest.skip("GPU cannot support enough cooperative grid blocks")
+
+        c_sequential_rows[griddim, blockdim](A)
+
+        reference = np.tile(np.arange(shape[0]), (shape[1], 1)).T
+        np.testing.assert_equal(A, reference)
+
+    @skip_unless_cc_60
+    def test_max_cooperative_grid_blocks(self):
+        # The maximum number of blocks will vary based on the device so we
+        # can't test for an expected value, but we can check that the function
+        # doesn't error, and that varying the number of dimensions of the block
+        # whilst keeping the total number of threads constant doesn't change
+        # the maximum to validate some of the logic.
+        sig = (int32[:,::1],)
+        c_sequential_rows = cuda.jit(sig)(sequential_rows)
+        overload = c_sequential_rows.overloads[sig]
+        blocks1d = overload.max_cooperative_grid_blocks(256)
+        blocks2d = overload.max_cooperative_grid_blocks((16, 16))
+        blocks3d = overload.max_cooperative_grid_blocks((16, 4, 4))
+        self.assertEqual(blocks1d, blocks2d)
+        self.assertEqual(blocks1d, blocks3d)
+
+
+if __name__ == '__main__':
+    unittest.main()
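For context, the cooperative-groups pattern exercised by this test file can be sketched roughly as follows. This is a minimal illustrative example, not part of the packaged diff; it assumes a GPU of compute capability 6.0 or later with cudadevrt available for linking, and the names `sweep_rows` and `kernel` are hypothetical.

```python
import numpy as np
from numba import cuda, int32

def sweep_rows(M):
    # Each thread owns one column; the whole grid advances one row per step.
    col = cuda.grid(1)
    g = cuda.cg.this_grid()          # requesting a grid group makes the kernel cooperative
    for row in range(1, M.shape[0]):
        M[row, col] = M[row - 1, M.shape[1] - col - 1] + 1
        g.sync()                     # grid-wide barrier: all blocks finish a row before the next

sig = (int32[:, ::1],)
kernel = cuda.jit(sig)(sweep_rows)   # eager compilation, as in the tests above

A = np.zeros((256, 256), dtype=np.int32)
blockdim = 32
griddim = A.shape[1] // blockdim

# A cooperative launch only works if the whole grid can be resident at once;
# the compiled overload exposes the same occupancy query the tests use.
overload = kernel.overloads[sig]
if griddim <= overload.max_cooperative_grid_blocks(blockdim):
    kernel[griddim, blockdim](A)
    # Every element of row r should equal r after the sweep.
    np.testing.assert_equal(A, np.tile(np.arange(256), (256, 1)).T)
```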
numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py (new file)
@@ -0,0 +1,435 @@
+import numpy as np
+
+from numba import vectorize, guvectorize
+from numba import cuda
+from numba.cuda.cudadrv import driver
+from numba.cuda.testing import unittest, ContextResettingTestCase, ForeignArray
+from numba.cuda.testing import skip_on_cudasim, skip_if_external_memmgr
+from numba.tests.support import linux_only, override_config
+from unittest.mock import call, patch
+
+
+@skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
+class TestCudaArrayInterface(ContextResettingTestCase):
+    def assertPointersEqual(self, a, b):
+        if driver.USE_NV_BINDING:
+            self.assertEqual(int(a.device_ctypes_pointer),
+                             int(b.device_ctypes_pointer))
+
+    def test_as_cuda_array(self):
+        h_arr = np.arange(10)
+        self.assertFalse(cuda.is_cuda_array(h_arr))
+        d_arr = cuda.to_device(h_arr)
+        self.assertTrue(cuda.is_cuda_array(d_arr))
+        my_arr = ForeignArray(d_arr)
+        self.assertTrue(cuda.is_cuda_array(my_arr))
+        wrapped = cuda.as_cuda_array(my_arr)
+        self.assertTrue(cuda.is_cuda_array(wrapped))
+        # Their values must equal the original array
+        np.testing.assert_array_equal(wrapped.copy_to_host(), h_arr)
+        np.testing.assert_array_equal(d_arr.copy_to_host(), h_arr)
+        # d_arr and wrapped must be the same buffer
+        self.assertPointersEqual(wrapped, d_arr)
+
+    def get_stream_value(self, stream):
+        if driver.USE_NV_BINDING:
+            return int(stream.handle)
+        else:
+            return stream.handle.value
+
+    @skip_if_external_memmgr('Ownership not relevant with external memmgr')
+    def test_ownership(self):
+        # Get the deallocation queue
+        ctx = cuda.current_context()
+        deallocs = ctx.memory_manager.deallocations
+        # Flush all deallocations
+        deallocs.clear()
+        self.assertEqual(len(deallocs), 0)
+        # Make new device array
+        d_arr = cuda.to_device(np.arange(100))
+        # Convert it
+        cvted = cuda.as_cuda_array(d_arr)
+        # Drop reference to the original object such that
+        # only `cvted` has a reference to it.
+        del d_arr
+        # There shouldn't be any new deallocations
+        self.assertEqual(len(deallocs), 0)
+        # Try to access the memory and verify its content
+        np.testing.assert_equal(cvted.copy_to_host(), np.arange(100))
+        # Drop last reference to the memory
+        del cvted
+        self.assertEqual(len(deallocs), 1)
+        # Flush
+        deallocs.clear()
+
+    def test_kernel_arg(self):
+        h_arr = np.arange(10)
+        d_arr = cuda.to_device(h_arr)
+        my_arr = ForeignArray(d_arr)
+        wrapped = cuda.as_cuda_array(my_arr)
+
+        @cuda.jit
+        def mutate(arr, val):
+            i = cuda.grid(1)
+            if i >= len(arr):
+                return
+            arr[i] += val
+
+        val = 7
+        mutate.forall(wrapped.size)(wrapped, val)
+
+        np.testing.assert_array_equal(wrapped.copy_to_host(), h_arr + val)
+        np.testing.assert_array_equal(d_arr.copy_to_host(), h_arr + val)
+
+    def test_ufunc_arg(self):
+        @vectorize(['f8(f8, f8)'], target='cuda')
+        def vadd(a, b):
+            return a + b
+
+        # Case 1: use custom array as argument
+        h_arr = np.random.random(10)
+        arr = ForeignArray(cuda.to_device(h_arr))
+        val = 6
+        out = vadd(arr, val)
+        np.testing.assert_array_equal(out.copy_to_host(), h_arr + val)
+
+        # Case 2: use custom array as return
+        out = ForeignArray(cuda.device_array(h_arr.shape))
+        returned = vadd(h_arr, val, out=out)
+        np.testing.assert_array_equal(returned.copy_to_host(), h_arr + val)
+
+    def test_gufunc_arg(self):
+        @guvectorize(['(f8, f8, f8[:])'], '(),()->()', target='cuda')
+        def vadd(inp, val, out):
+            out[0] = inp + val
+
+        # Case 1: use custom array as argument
+        h_arr = np.random.random(10)
+        arr = ForeignArray(cuda.to_device(h_arr))
+        val = np.float64(7)
+        out = vadd(arr, val)
+        np.testing.assert_array_equal(out.copy_to_host(), h_arr + val)
+
+        # Case 2: use custom array as return
+        out = ForeignArray(cuda.device_array(h_arr.shape))
+        returned = vadd(h_arr, val, out=out)
+        np.testing.assert_array_equal(returned.copy_to_host(), h_arr + val)
+        self.assertPointersEqual(returned, out._arr)
+
+    def test_array_views(self):
+        """Views created via array interface support:
+            - Strided slices
+            - Strided slices
+        """
+        h_arr = np.random.random(10)
+        c_arr = cuda.to_device(h_arr)
+
+        arr = cuda.as_cuda_array(c_arr)
+
+        # __getitem__ interface accesses expected data
+
+        # Direct views
+        np.testing.assert_array_equal(arr.copy_to_host(), h_arr)
+        np.testing.assert_array_equal(arr[:].copy_to_host(), h_arr)
+
+        # Slicing
+        np.testing.assert_array_equal(arr[:5].copy_to_host(), h_arr[:5])
+
+        # Strided view
+        np.testing.assert_array_equal(arr[::2].copy_to_host(), h_arr[::2])
+
+        # View of strided array
+        arr_strided = cuda.as_cuda_array(c_arr[::2])
+        np.testing.assert_array_equal(arr_strided.copy_to_host(), h_arr[::2])
+
+        # A strided-view-of-array and view-of-strided-array have the same
+        # shape, strides, itemsize, and alloc_size
+        self.assertEqual(arr[::2].shape, arr_strided.shape)
+        self.assertEqual(arr[::2].strides, arr_strided.strides)
+        self.assertEqual(arr[::2].dtype.itemsize, arr_strided.dtype.itemsize)
+        self.assertEqual(arr[::2].alloc_size, arr_strided.alloc_size)
+        self.assertEqual(arr[::2].nbytes,
+                         arr_strided.size * arr_strided.dtype.itemsize)
+
+        # __setitem__ interface propagates into external array
+
+        # Writes to a slice
+        arr[:5] = np.pi
+        np.testing.assert_array_equal(
+            c_arr.copy_to_host(),
+            np.concatenate((np.full(5, np.pi), h_arr[5:]))
+        )
+
+        # Writes to a slice from a view
+        arr[:5] = arr[5:]
+        np.testing.assert_array_equal(
+            c_arr.copy_to_host(),
+            np.concatenate((h_arr[5:], h_arr[5:]))
+        )
+
+        # Writes through a view
+        arr[:] = cuda.to_device(h_arr)
+        np.testing.assert_array_equal(c_arr.copy_to_host(), h_arr)
+
+        # Writes to a strided slice
+        arr[::2] = np.pi
+        np.testing.assert_array_equal(
+            c_arr.copy_to_host()[::2],
+            np.full(5, np.pi),
+        )
+        np.testing.assert_array_equal(
+            c_arr.copy_to_host()[1::2],
+            h_arr[1::2]
+        )
+
+    def test_negative_strided_issue(self):
+        # issue #3705
+        h_arr = np.random.random(10)
+        c_arr = cuda.to_device(h_arr)
+
+        def base_offset(orig, sliced):
+            return sliced['data'][0] - orig['data'][0]
+
+        h_ai = h_arr.__array_interface__
+        c_ai = c_arr.__cuda_array_interface__
+
+        h_ai_sliced = h_arr[::-1].__array_interface__
+        c_ai_sliced = c_arr[::-1].__cuda_array_interface__
+
+        # Check data offset is correct
+        self.assertEqual(
+            base_offset(h_ai, h_ai_sliced),
+            base_offset(c_ai, c_ai_sliced),
+        )
+        # Check shape and strides are correct
+        self.assertEqual(h_ai_sliced['shape'], c_ai_sliced['shape'])
+        self.assertEqual(h_ai_sliced['strides'], c_ai_sliced['strides'])
+
+    def test_negative_strided_copy_to_host(self):
+        # issue #3705
+        h_arr = np.random.random(10)
+        c_arr = cuda.to_device(h_arr)
+        sliced = c_arr[::-1]
+        with self.assertRaises(NotImplementedError) as raises:
+            sliced.copy_to_host()
+        expected_msg = 'D->H copy not implemented for negative strides'
+        self.assertIn(expected_msg, str(raises.exception))
+
+    def test_masked_array(self):
+        h_arr = np.random.random(10)
+        h_mask = np.random.randint(2, size=10, dtype='bool')
+        c_arr = cuda.to_device(h_arr)
+        c_mask = cuda.to_device(h_mask)
+
+        # Manually create a masked CUDA Array Interface dictionary
+        masked_cuda_array_interface = c_arr.__cuda_array_interface__.copy()
+        masked_cuda_array_interface['mask'] = c_mask
+
+        with self.assertRaises(NotImplementedError) as raises:
+            cuda.from_cuda_array_interface(masked_cuda_array_interface)
+        expected_msg = 'Masked arrays are not supported'
+        self.assertIn(expected_msg, str(raises.exception))
+
+    def test_zero_size_array(self):
+        # for #4175
+        c_arr = cuda.device_array(0)
+        self.assertEqual(c_arr.__cuda_array_interface__['data'][0], 0)
+
+        @cuda.jit
+        def add_one(arr):
+            x = cuda.grid(1)
+            N = arr.shape[0]
+            if x < N:
+                arr[x] += 1
+
+        d_arr = ForeignArray(c_arr)
+        add_one[1, 10](d_arr)  # this should pass
+
+    def test_strides(self):
+        # for #4175
+        # First, test C-contiguous array
+        c_arr = cuda.device_array((2, 3, 4))
+        self.assertEqual(c_arr.__cuda_array_interface__['strides'], None)
+
+        # Second, test non C-contiguous array
+        c_arr = c_arr[:, 1, :]
+        self.assertNotEqual(c_arr.__cuda_array_interface__['strides'], None)
+
+    def test_consuming_strides(self):
+        hostarray = np.arange(10).reshape(2, 5)
+        devarray = cuda.to_device(hostarray)
+        face = devarray.__cuda_array_interface__
+        self.assertIsNone(face['strides'])
+        got = cuda.from_cuda_array_interface(face).copy_to_host()
+        np.testing.assert_array_equal(got, hostarray)
+        self.assertTrue(got.flags['C_CONTIGUOUS'])
+        # Try non-NULL strides
+        face['strides'] = hostarray.strides
+        self.assertIsNotNone(face['strides'])
+        got = cuda.from_cuda_array_interface(face).copy_to_host()
+        np.testing.assert_array_equal(got, hostarray)
+        self.assertTrue(got.flags['C_CONTIGUOUS'])
+
+    def test_produce_no_stream(self):
+        c_arr = cuda.device_array(10)
+        self.assertIsNone(c_arr.__cuda_array_interface__['stream'])
+
+        mapped_arr = cuda.mapped_array(10)
+        self.assertIsNone(mapped_arr.__cuda_array_interface__['stream'])
+
+    @linux_only
+    def test_produce_managed_no_stream(self):
+        managed_arr = cuda.managed_array(10)
+        self.assertIsNone(managed_arr.__cuda_array_interface__['stream'])
+
+    def test_produce_stream(self):
+        s = cuda.stream()
+        c_arr = cuda.device_array(10, stream=s)
+        cai_stream = c_arr.__cuda_array_interface__['stream']
+        stream_value = self.get_stream_value(s)
+        self.assertEqual(stream_value, cai_stream)
+
+        s = cuda.stream()
+        mapped_arr = cuda.mapped_array(10, stream=s)
+        cai_stream = mapped_arr.__cuda_array_interface__['stream']
+        stream_value = self.get_stream_value(s)
+        self.assertEqual(stream_value, cai_stream)
+
+    @linux_only
+    def test_produce_managed_stream(self):
+        s = cuda.stream()
+        managed_arr = cuda.managed_array(10, stream=s)
+        cai_stream = managed_arr.__cuda_array_interface__['stream']
+        stream_value = self.get_stream_value(s)
+        self.assertEqual(stream_value, cai_stream)
+
+    def test_consume_no_stream(self):
+        # Create a foreign array with no stream
+        f_arr = ForeignArray(cuda.device_array(10))
+
+        # Ensure that the imported array has no default stream
+        c_arr = cuda.as_cuda_array(f_arr)
+        self.assertEqual(c_arr.stream, 0)
+
+    def test_consume_stream(self):
+        # Create a foreign array with a stream
+        s = cuda.stream()
+        f_arr = ForeignArray(cuda.device_array(10, stream=s))
+
+        # Ensure that an imported array has the stream as its default stream
+        c_arr = cuda.as_cuda_array(f_arr)
+        self.assertTrue(c_arr.stream.external)
+        stream_value = self.get_stream_value(s)
+        imported_stream_value = self.get_stream_value(c_arr.stream)
+        self.assertEqual(stream_value, imported_stream_value)
+
+    def test_consume_no_sync(self):
+        # Create a foreign array with no stream
+        f_arr = ForeignArray(cuda.device_array(10))
+
+        with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
+                          return_value=None) as mock_sync:
+            cuda.as_cuda_array(f_arr)
+
+        # Ensure the synchronize method of a stream was not called
+        mock_sync.assert_not_called()
+
+    def test_consume_sync(self):
+        # Create a foreign array with a stream
+        s = cuda.stream()
+        f_arr = ForeignArray(cuda.device_array(10, stream=s))
+
+        with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
+                          return_value=None) as mock_sync:
+            cuda.as_cuda_array(f_arr)
+
+        # Ensure the synchronize method of a stream was called
+        mock_sync.assert_called_once_with()
+
+    def test_consume_sync_disabled(self):
+        # Create a foreign array with a stream
+        s = cuda.stream()
+        f_arr = ForeignArray(cuda.device_array(10, stream=s))
+
+        # Set sync to false before testing. The test suite should generally be
+        # run with sync enabled, but stash the old value just in case it is
+        # not.
+        with override_config('CUDA_ARRAY_INTERFACE_SYNC', False):
+            with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
+                              return_value=None) as mock_sync:
+                cuda.as_cuda_array(f_arr)
+
+            # Ensure the synchronize method of a stream was not called
+            mock_sync.assert_not_called()
+
+    def test_launch_no_sync(self):
+        # Create a foreign array with no stream
+        f_arr = ForeignArray(cuda.device_array(10))
+
+        @cuda.jit
+        def f(x):
+            pass
+
+        with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
+                          return_value=None) as mock_sync:
+            f[1, 1](f_arr)
+
+        # Ensure the synchronize method of a stream was not called
+        mock_sync.assert_not_called()
+
+    def test_launch_sync(self):
+        # Create a foreign array with a stream
+        s = cuda.stream()
+        f_arr = ForeignArray(cuda.device_array(10, stream=s))
+
+        @cuda.jit
+        def f(x):
+            pass
+
+        with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
+                          return_value=None) as mock_sync:
+            f[1, 1](f_arr)
+
+        # Ensure the synchronize method of a stream was called
+        mock_sync.assert_called_once_with()
+
+    def test_launch_sync_two_streams(self):
+        # Create two foreign arrays with streams
+        s1 = cuda.stream()
+        s2 = cuda.stream()
+        f_arr1 = ForeignArray(cuda.device_array(10, stream=s1))
+        f_arr2 = ForeignArray(cuda.device_array(10, stream=s2))
+
+        @cuda.jit
+        def f(x, y):
+            pass
+
+        with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
+                          return_value=None) as mock_sync:
+            f[1, 1](f_arr1, f_arr2)
+
+        # Ensure that synchronize was called twice
+        mock_sync.assert_has_calls([call(), call()])
+
+    def test_launch_sync_disabled(self):
+        # Create two foreign arrays with streams
+        s1 = cuda.stream()
+        s2 = cuda.stream()
+        f_arr1 = ForeignArray(cuda.device_array(10, stream=s1))
+        f_arr2 = ForeignArray(cuda.device_array(10, stream=s2))
+
+        with override_config('CUDA_ARRAY_INTERFACE_SYNC', False):
+            @cuda.jit
+            def f(x, y):
+                pass
+
+            with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
+                              return_value=None) as mock_sync:
+                f[1, 1](f_arr1, f_arr2)
+
+            # Ensure that synchronize was not called
+            mock_sync.assert_not_called()
+
+
+if __name__ == "__main__":
+    unittest.main()
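For context, the producer/consumer round trip these tests cover can be sketched roughly as follows. This is a minimal illustrative example, not part of the packaged diff; `WrappedDeviceArray` and `add_val` are hypothetical stand-ins for the `ForeignArray` test helper and the test kernels, and any object exposing `__cuda_array_interface__` behaves the same way.

```python
import numpy as np
from numba import cuda

class WrappedDeviceArray:
    """Hypothetical foreign array: only exposes the CUDA Array Interface."""
    def __init__(self, dev_arr):
        self._arr = dev_arr

    @property
    def __cuda_array_interface__(self):
        return self._arr.__cuda_array_interface__

h_arr = np.arange(10.0)
foreign = WrappedDeviceArray(cuda.to_device(h_arr))

assert cuda.is_cuda_array(foreign)       # recognised via the interface alone
wrapped = cuda.as_cuda_array(foreign)    # zero-copy view over the same device buffer

@cuda.jit
def add_val(arr, val):
    i = cuda.grid(1)
    if i < arr.size:
        arr[i] += val

# Launch against the wrapped view; the write is visible through the foreign object's buffer.
add_val.forall(wrapped.size)(wrapped, 3.0)
np.testing.assert_array_equal(wrapped.copy_to_host(), h_arr + 3.0)
```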
numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py (new file)
@@ -0,0 +1,90 @@
+from numba import cuda
+import numpy as np
+from numba.cuda.testing import CUDATestCase
+from numba.tests.support import override_config
+import unittest
+
+
+class TestCudaJitNoTypes(CUDATestCase):
+    """
+    Tests the jit decorator with no types provided.
+    """
+
+    def test_device_array(self):
+        @cuda.jit
+        def foo(x, y):
+            i = cuda.grid(1)
+            y[i] = x[i]
+
+        x = np.arange(10)
+        y = np.empty_like(x)
+
+        dx = cuda.to_device(x)
+        dy = cuda.to_device(y)
+
+        foo[10, 1](dx, dy)
+
+        dy.copy_to_host(y)
+
+        self.assertTrue(np.all(x == y))
+
+    def test_device_jit(self):
+        @cuda.jit(device=True)
+        def mapper(args):
+            a, b, c = args
+            return a + b + c
+
+        @cuda.jit(device=True)
+        def reducer(a, b):
+            return a + b
+
+        @cuda.jit
+        def driver(A, B):
+            i = cuda.grid(1)
+            if i < B.size:
+                args = A[i], A[i] + B[i], B[i]
+                B[i] = reducer(mapper(args), 1)
+
+        A = np.arange(100, dtype=np.float32)
+        B = np.arange(100, dtype=np.float32)
+
+        Acopy = A.copy()
+        Bcopy = B.copy()
+
+        driver[1, 100](A, B)
+
+        np.testing.assert_allclose(Acopy + Acopy + Bcopy + Bcopy + 1, B)
+
+    def test_device_jit_2(self):
+        @cuda.jit(device=True)
+        def inner(arg):
+            return arg + 1
+
+        @cuda.jit
+        def outer(argin, argout):
+            argout[0] = inner(argin[0]) + inner(2)
+
+        a = np.zeros(1)
+        b = np.zeros(1)
+
+        stream = cuda.stream()
+        d_a = cuda.to_device(a, stream)
+        d_b = cuda.to_device(b, stream)
+
+        outer[1, 1, stream](d_a, d_b)
+
+        d_b.copy_to_host(b, stream)
+
+        self.assertEqual(b[0], (a[0] + 1) + (2 + 1))
+
+    def test_jit_debug_simulator(self):
+        # Ensure that the jit decorator accepts the debug kwarg when the
+        # simulator is in use - see Issue #6615.
+        with override_config('ENABLE_CUDASIM', 1):
+            @cuda.jit(debug=True)
+            def f(x):
+                pass
+
+
+if __name__ == '__main__':
+    unittest.main()
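For context, the signature-less ("lazy") compilation behaviour these tests rely on can be sketched roughly as follows. This is a minimal illustrative example, not part of the packaged diff; the names `plus_one` and `apply_plus_one` are hypothetical.

```python
import numpy as np
from numba import cuda

@cuda.jit(device=True)
def plus_one(x):
    # Device function: callable from kernels, typed per call site.
    return x + 1

@cuda.jit                       # no signature: argument types are inferred at launch time
def apply_plus_one(src, dst):
    i = cuda.grid(1)
    if i < src.size:
        dst[i] = plus_one(src[i])

for dtype in (np.float32, np.int64):
    src = np.arange(16, dtype=dtype)
    dst = np.zeros_like(src)
    # Host arrays are transferred automatically and copied back after the launch.
    apply_plus_one[1, 16](src, dst)
    np.testing.assert_array_equal(dst, src + 1)

# One compiled specialization per distinct argument-type signature.
assert len(apply_plus_one.overloads) == 2
```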