numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.0.dist-info/METADATA +0 -6
- numba_cuda-0.0.0.dist-info/RECORD +0 -5
- {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
import _numba_cuda_redirector
|
@@ -0,0 +1,74 @@
|
|
1
|
+
import importlib
|
2
|
+
import importlib.abc
|
3
|
+
import pathlib
|
4
|
+
import sys
|
5
|
+
import warnings
|
6
|
+
|
7
|
+
multiple_locations_msg = ("Multiple submodule search locations for {}. "
|
8
|
+
"Cannot redirect numba.cuda to numba_cuda")
|
9
|
+
|
10
|
+
no_spec_msg = ("Couldn't get spec for {}. "
|
11
|
+
"Cannot redirect numba.cuda to numba_cuda")
|
12
|
+
|
13
|
+
|
14
|
+
class NumbaCudaFinder(importlib.abc.MetaPathFinder):
|
15
|
+
def __init__(self):
|
16
|
+
self.initialized = None
|
17
|
+
|
18
|
+
def ensure_initialized(self):
|
19
|
+
if self.initialized is not None:
|
20
|
+
return self.initialized
|
21
|
+
|
22
|
+
numba_spec = importlib.util.find_spec('numba')
|
23
|
+
|
24
|
+
if numba_spec is None:
|
25
|
+
warnings.warn(no_spec_msg.format('numba'))
|
26
|
+
self.initialized = False
|
27
|
+
return False
|
28
|
+
|
29
|
+
numba_cuda_spec = importlib.util.find_spec('numba_cuda')
|
30
|
+
|
31
|
+
if numba_spec is None:
|
32
|
+
warnings.warn(no_spec_msg.format('numba_cuda'))
|
33
|
+
self.initialized = False
|
34
|
+
return False
|
35
|
+
|
36
|
+
numba_search_locations = numba_spec.submodule_search_locations
|
37
|
+
numba_cuda_search_locations = numba_cuda_spec.submodule_search_locations
|
38
|
+
|
39
|
+
if len(numba_search_locations) != 1:
|
40
|
+
warnings.warn(multiple_locations_msg.format('numba'))
|
41
|
+
self.initialized = False
|
42
|
+
return False
|
43
|
+
|
44
|
+
if len(numba_cuda_search_locations) != 1:
|
45
|
+
warnings.warn(multiple_locations_msg.format('numba_cuda'))
|
46
|
+
self.initialized = False
|
47
|
+
return False
|
48
|
+
|
49
|
+
self.numba_path = numba_search_locations[0]
|
50
|
+
self.numba_cuda_path = str((pathlib.Path(numba_cuda_search_locations[0]) /
|
51
|
+
'numba'))
|
52
|
+
self.initialized = True
|
53
|
+
return True
|
54
|
+
|
55
|
+
def find_spec(self, name, path, target=None):
|
56
|
+
if "numba.cuda" in name:
|
57
|
+
initialized = self.ensure_initialized()
|
58
|
+
if not initialized:
|
59
|
+
return None
|
60
|
+
|
61
|
+
if any(self.numba_cuda_path in p for p in path):
|
62
|
+
# Re-entrancy - return and carry on
|
63
|
+
return None
|
64
|
+
|
65
|
+
oot_path = [p.replace(self.numba_path, self.numba_cuda_path)
|
66
|
+
for p in path]
|
67
|
+
for finder in sys.meta_path:
|
68
|
+
spec = finder.find_spec(name, oot_path, target)
|
69
|
+
if spec is not None:
|
70
|
+
return spec
|
71
|
+
|
72
|
+
|
73
|
+
finder = NumbaCudaFinder()
|
74
|
+
sys.meta_path.insert(0, finder)
|
numba_cuda/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.12
|
numba_cuda/__init__.py
CHANGED
numba_cuda/_version.py
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# Copyright (c) 2024, NVIDIA CORPORATION.
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
import importlib.resources
|
16
|
+
|
17
|
+
__version__ = (
|
18
|
+
importlib.resources.files("numba_cuda").joinpath("VERSION").read_text().strip()
|
19
|
+
)
|
@@ -0,0 +1,22 @@
|
|
1
|
+
from numba import runtests
|
2
|
+
from numba.core import config
|
3
|
+
|
4
|
+
if config.ENABLE_CUDASIM:
|
5
|
+
from .simulator_init import *
|
6
|
+
else:
|
7
|
+
from .device_init import *
|
8
|
+
from .device_init import _auto_device
|
9
|
+
|
10
|
+
from numba.cuda.compiler import (compile, compile_for_current_device,
|
11
|
+
compile_ptx, compile_ptx_for_current_device)
|
12
|
+
|
13
|
+
# This is the out-of-tree NVIDIA-maintained target. This is reported in Numba
|
14
|
+
# sysinfo (`numba -s`):
|
15
|
+
implementation = "NVIDIA"
|
16
|
+
|
17
|
+
|
18
|
+
def test(*args, **kwargs):
|
19
|
+
if not is_available():
|
20
|
+
raise cuda_error()
|
21
|
+
|
22
|
+
return runtests.main("numba.cuda.tests", *args, **kwargs)
|
@@ -0,0 +1,526 @@
|
|
1
|
+
"""
|
2
|
+
API that are reported to numba.cuda
|
3
|
+
"""
|
4
|
+
|
5
|
+
|
6
|
+
import contextlib
|
7
|
+
import os
|
8
|
+
|
9
|
+
import numpy as np
|
10
|
+
|
11
|
+
from .cudadrv import devicearray, devices, driver
|
12
|
+
from numba.core import config
|
13
|
+
from numba.cuda.api_util import prepare_shape_strides_dtype
|
14
|
+
|
15
|
+
# NDarray device helper
|
16
|
+
|
17
|
+
require_context = devices.require_context
|
18
|
+
current_context = devices.get_context
|
19
|
+
gpus = devices.gpus
|
20
|
+
|
21
|
+
|
22
|
+
@require_context
|
23
|
+
def from_cuda_array_interface(desc, owner=None, sync=True):
|
24
|
+
"""Create a DeviceNDArray from a cuda-array-interface description.
|
25
|
+
The ``owner`` is the owner of the underlying memory.
|
26
|
+
The resulting DeviceNDArray will acquire a reference from it.
|
27
|
+
|
28
|
+
If ``sync`` is ``True``, then the imported stream (if present) will be
|
29
|
+
synchronized.
|
30
|
+
"""
|
31
|
+
version = desc.get('version')
|
32
|
+
# Mask introduced in version 1
|
33
|
+
if 1 <= version:
|
34
|
+
mask = desc.get('mask')
|
35
|
+
# Would ideally be better to detect if the mask is all valid
|
36
|
+
if mask is not None:
|
37
|
+
raise NotImplementedError('Masked arrays are not supported')
|
38
|
+
|
39
|
+
shape = desc['shape']
|
40
|
+
strides = desc.get('strides')
|
41
|
+
dtype = np.dtype(desc['typestr'])
|
42
|
+
|
43
|
+
shape, strides, dtype = prepare_shape_strides_dtype(
|
44
|
+
shape, strides, dtype, order='C')
|
45
|
+
size = driver.memory_size_from_info(shape, strides, dtype.itemsize)
|
46
|
+
|
47
|
+
devptr = driver.get_devptr_for_active_ctx(desc['data'][0])
|
48
|
+
data = driver.MemoryPointer(
|
49
|
+
current_context(), devptr, size=size, owner=owner)
|
50
|
+
stream_ptr = desc.get('stream', None)
|
51
|
+
if stream_ptr is not None:
|
52
|
+
stream = external_stream(stream_ptr)
|
53
|
+
if sync and config.CUDA_ARRAY_INTERFACE_SYNC:
|
54
|
+
stream.synchronize()
|
55
|
+
else:
|
56
|
+
stream = 0 # No "Numba default stream", not the CUDA default stream
|
57
|
+
da = devicearray.DeviceNDArray(shape=shape, strides=strides,
|
58
|
+
dtype=dtype, gpu_data=data,
|
59
|
+
stream=stream)
|
60
|
+
return da
|
61
|
+
|
62
|
+
|
63
|
+
def as_cuda_array(obj, sync=True):
|
64
|
+
"""Create a DeviceNDArray from any object that implements
|
65
|
+
the :ref:`cuda array interface <cuda-array-interface>`.
|
66
|
+
|
67
|
+
A view of the underlying GPU buffer is created. No copying of the data
|
68
|
+
is done. The resulting DeviceNDArray will acquire a reference from `obj`.
|
69
|
+
|
70
|
+
If ``sync`` is ``True``, then the imported stream (if present) will be
|
71
|
+
synchronized.
|
72
|
+
"""
|
73
|
+
if not is_cuda_array(obj):
|
74
|
+
raise TypeError("*obj* doesn't implement the cuda array interface.")
|
75
|
+
else:
|
76
|
+
return from_cuda_array_interface(obj.__cuda_array_interface__,
|
77
|
+
owner=obj, sync=sync)
|
78
|
+
|
79
|
+
|
80
|
+
def is_cuda_array(obj):
|
81
|
+
"""Test if the object has defined the `__cuda_array_interface__` attribute.
|
82
|
+
|
83
|
+
Does not verify the validity of the interface.
|
84
|
+
"""
|
85
|
+
return hasattr(obj, '__cuda_array_interface__')
|
86
|
+
|
87
|
+
|
88
|
+
def is_float16_supported():
|
89
|
+
"""Whether 16-bit floats are supported.
|
90
|
+
|
91
|
+
float16 is always supported in current versions of Numba - returns True.
|
92
|
+
"""
|
93
|
+
return True
|
94
|
+
|
95
|
+
|
96
|
+
@require_context
|
97
|
+
def to_device(obj, stream=0, copy=True, to=None):
|
98
|
+
"""to_device(obj, stream=0, copy=True, to=None)
|
99
|
+
|
100
|
+
Allocate and transfer a numpy ndarray or structured scalar to the device.
|
101
|
+
|
102
|
+
To copy host->device a numpy array::
|
103
|
+
|
104
|
+
ary = np.arange(10)
|
105
|
+
d_ary = cuda.to_device(ary)
|
106
|
+
|
107
|
+
To enqueue the transfer to a stream::
|
108
|
+
|
109
|
+
stream = cuda.stream()
|
110
|
+
d_ary = cuda.to_device(ary, stream=stream)
|
111
|
+
|
112
|
+
The resulting ``d_ary`` is a ``DeviceNDArray``.
|
113
|
+
|
114
|
+
To copy device->host::
|
115
|
+
|
116
|
+
hary = d_ary.copy_to_host()
|
117
|
+
|
118
|
+
To copy device->host to an existing array::
|
119
|
+
|
120
|
+
ary = np.empty(shape=d_ary.shape, dtype=d_ary.dtype)
|
121
|
+
d_ary.copy_to_host(ary)
|
122
|
+
|
123
|
+
To enqueue the transfer to a stream::
|
124
|
+
|
125
|
+
hary = d_ary.copy_to_host(stream=stream)
|
126
|
+
"""
|
127
|
+
if to is None:
|
128
|
+
to, new = devicearray.auto_device(obj, stream=stream, copy=copy,
|
129
|
+
user_explicit=True)
|
130
|
+
return to
|
131
|
+
if copy:
|
132
|
+
to.copy_to_device(obj, stream=stream)
|
133
|
+
return to
|
134
|
+
|
135
|
+
|
136
|
+
@require_context
|
137
|
+
def device_array(shape, dtype=np.float64, strides=None, order='C', stream=0):
|
138
|
+
"""device_array(shape, dtype=np.float64, strides=None, order='C', stream=0)
|
139
|
+
|
140
|
+
Allocate an empty device ndarray. Similar to :meth:`numpy.empty`.
|
141
|
+
"""
|
142
|
+
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
|
143
|
+
order)
|
144
|
+
return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype,
|
145
|
+
stream=stream)
|
146
|
+
|
147
|
+
|
148
|
+
@require_context
|
149
|
+
def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
|
150
|
+
attach_global=True):
|
151
|
+
"""managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
|
152
|
+
attach_global=True)
|
153
|
+
|
154
|
+
Allocate a np.ndarray with a buffer that is managed.
|
155
|
+
Similar to np.empty().
|
156
|
+
|
157
|
+
Managed memory is supported on Linux / x86 and PowerPC, and is considered
|
158
|
+
experimental on Windows and Linux / AArch64.
|
159
|
+
|
160
|
+
:param attach_global: A flag indicating whether to attach globally. Global
|
161
|
+
attachment implies that the memory is accessible from
|
162
|
+
any stream on any device. If ``False``, attachment is
|
163
|
+
*host*, and memory is only accessible by devices
|
164
|
+
with Compute Capability 6.0 and later.
|
165
|
+
"""
|
166
|
+
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
|
167
|
+
order)
|
168
|
+
bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
|
169
|
+
buffer = current_context().memallocmanaged(bytesize,
|
170
|
+
attach_global=attach_global)
|
171
|
+
npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
|
172
|
+
buffer=buffer)
|
173
|
+
managedview = np.ndarray.view(npary, type=devicearray.ManagedNDArray)
|
174
|
+
managedview.device_setup(buffer, stream=stream)
|
175
|
+
return managedview
|
176
|
+
|
177
|
+
|
178
|
+
@require_context
|
179
|
+
def pinned_array(shape, dtype=np.float64, strides=None, order='C'):
|
180
|
+
"""pinned_array(shape, dtype=np.float64, strides=None, order='C')
|
181
|
+
|
182
|
+
Allocate an :class:`ndarray <numpy.ndarray>` with a buffer that is pinned
|
183
|
+
(pagelocked). Similar to :func:`np.empty() <numpy.empty>`.
|
184
|
+
"""
|
185
|
+
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
|
186
|
+
order)
|
187
|
+
bytesize = driver.memory_size_from_info(shape, strides,
|
188
|
+
dtype.itemsize)
|
189
|
+
buffer = current_context().memhostalloc(bytesize)
|
190
|
+
return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
|
191
|
+
buffer=buffer)
|
192
|
+
|
193
|
+
|
194
|
+
@require_context
|
195
|
+
def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
|
196
|
+
portable=False, wc=False):
|
197
|
+
"""mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
|
198
|
+
portable=False, wc=False)
|
199
|
+
|
200
|
+
Allocate a mapped ndarray with a buffer that is pinned and mapped on
|
201
|
+
to the device. Similar to np.empty()
|
202
|
+
|
203
|
+
:param portable: a boolean flag to allow the allocated device memory to be
|
204
|
+
usable in multiple devices.
|
205
|
+
:param wc: a boolean flag to enable writecombined allocation which is faster
|
206
|
+
to write by the host and to read by the device, but slower to
|
207
|
+
write by the host and slower to write by the device.
|
208
|
+
"""
|
209
|
+
shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
|
210
|
+
order)
|
211
|
+
bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
|
212
|
+
buffer = current_context().memhostalloc(bytesize, mapped=True)
|
213
|
+
npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
|
214
|
+
buffer=buffer)
|
215
|
+
mappedview = np.ndarray.view(npary, type=devicearray.MappedNDArray)
|
216
|
+
mappedview.device_setup(buffer, stream=stream)
|
217
|
+
return mappedview
|
218
|
+
|
219
|
+
|
220
|
+
@contextlib.contextmanager
|
221
|
+
@require_context
|
222
|
+
def open_ipc_array(handle, shape, dtype, strides=None, offset=0):
|
223
|
+
"""
|
224
|
+
A context manager that opens a IPC *handle* (*CUipcMemHandle*) that is
|
225
|
+
represented as a sequence of bytes (e.g. *bytes*, tuple of int)
|
226
|
+
and represent it as an array of the given *shape*, *strides* and *dtype*.
|
227
|
+
The *strides* can be omitted. In that case, it is assumed to be a 1D
|
228
|
+
C contiguous array.
|
229
|
+
|
230
|
+
Yields a device array.
|
231
|
+
|
232
|
+
The IPC handle is closed automatically when context manager exits.
|
233
|
+
"""
|
234
|
+
dtype = np.dtype(dtype)
|
235
|
+
# compute size
|
236
|
+
size = np.prod(shape) * dtype.itemsize
|
237
|
+
# manually recreate the IPC mem handle
|
238
|
+
if driver.USE_NV_BINDING:
|
239
|
+
driver_handle = driver.binding.CUipcMemHandle()
|
240
|
+
driver_handle.reserved = handle
|
241
|
+
else:
|
242
|
+
driver_handle = driver.drvapi.cu_ipc_mem_handle()
|
243
|
+
driver_handle.reserved[:] = handle
|
244
|
+
# use *IpcHandle* to open the IPC memory
|
245
|
+
ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset)
|
246
|
+
yield ipchandle.open_array(current_context(), shape=shape,
|
247
|
+
strides=strides, dtype=dtype)
|
248
|
+
ipchandle.close()
|
249
|
+
|
250
|
+
|
251
|
+
def synchronize():
|
252
|
+
"Synchronize the current context."
|
253
|
+
return current_context().synchronize()
|
254
|
+
|
255
|
+
|
256
|
+
def _contiguous_strides_like_array(ary):
|
257
|
+
"""
|
258
|
+
Given an array, compute strides for a new contiguous array of the same
|
259
|
+
shape.
|
260
|
+
"""
|
261
|
+
# Don't recompute strides if the default strides will be sufficient to
|
262
|
+
# create a contiguous array.
|
263
|
+
if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1:
|
264
|
+
return None
|
265
|
+
|
266
|
+
# Otherwise, we need to compute new strides using an algorithm adapted from
|
267
|
+
# NumPy v1.17.4's PyArray_NewLikeArrayWithShape in
|
268
|
+
# core/src/multiarray/ctors.c. We permute the strides in ascending order
|
269
|
+
# then compute the stride for the dimensions with the same permutation.
|
270
|
+
|
271
|
+
# Stride permutation. E.g. a stride array (4, -2, 12) becomes
|
272
|
+
# [(1, -2), (0, 4), (2, 12)]
|
273
|
+
strideperm = [ x for x in enumerate(ary.strides) ]
|
274
|
+
strideperm.sort(key=lambda x: x[1])
|
275
|
+
|
276
|
+
# Compute new strides using permutation
|
277
|
+
strides = [0] * len(ary.strides)
|
278
|
+
stride = ary.dtype.itemsize
|
279
|
+
for i_perm, _ in strideperm:
|
280
|
+
strides[i_perm] = stride
|
281
|
+
stride *= ary.shape[i_perm]
|
282
|
+
return tuple(strides)
|
283
|
+
|
284
|
+
|
285
|
+
def _order_like_array(ary):
|
286
|
+
if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']:
|
287
|
+
return 'F'
|
288
|
+
else:
|
289
|
+
return 'C'
|
290
|
+
|
291
|
+
|
292
|
+
def device_array_like(ary, stream=0):
|
293
|
+
"""
|
294
|
+
Call :func:`device_array() <numba.cuda.device_array>` with information from
|
295
|
+
the array.
|
296
|
+
"""
|
297
|
+
strides = _contiguous_strides_like_array(ary)
|
298
|
+
order = _order_like_array(ary)
|
299
|
+
return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
|
300
|
+
order=order, stream=stream)
|
301
|
+
|
302
|
+
|
303
|
+
def mapped_array_like(ary, stream=0, portable=False, wc=False):
|
304
|
+
"""
|
305
|
+
Call :func:`mapped_array() <numba.cuda.mapped_array>` with the information
|
306
|
+
from the array.
|
307
|
+
"""
|
308
|
+
strides = _contiguous_strides_like_array(ary)
|
309
|
+
order = _order_like_array(ary)
|
310
|
+
return mapped_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
|
311
|
+
order=order, stream=stream, portable=portable, wc=wc)
|
312
|
+
|
313
|
+
|
314
|
+
def pinned_array_like(ary):
|
315
|
+
"""
|
316
|
+
Call :func:`pinned_array() <numba.cuda.pinned_array>` with the information
|
317
|
+
from the array.
|
318
|
+
"""
|
319
|
+
strides = _contiguous_strides_like_array(ary)
|
320
|
+
order = _order_like_array(ary)
|
321
|
+
return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
|
322
|
+
order=order)
|
323
|
+
|
324
|
+
|
325
|
+
# Stream helper
|
326
|
+
@require_context
|
327
|
+
def stream():
|
328
|
+
"""
|
329
|
+
Create a CUDA stream that represents a command queue for the device.
|
330
|
+
"""
|
331
|
+
return current_context().create_stream()
|
332
|
+
|
333
|
+
|
334
|
+
@require_context
|
335
|
+
def default_stream():
|
336
|
+
"""
|
337
|
+
Get the default CUDA stream. CUDA semantics in general are that the default
|
338
|
+
stream is either the legacy default stream or the per-thread default stream
|
339
|
+
depending on which CUDA APIs are in use. In Numba, the APIs for the legacy
|
340
|
+
default stream are always the ones in use, but an option to use APIs for
|
341
|
+
the per-thread default stream may be provided in future.
|
342
|
+
"""
|
343
|
+
return current_context().get_default_stream()
|
344
|
+
|
345
|
+
|
346
|
+
@require_context
|
347
|
+
def legacy_default_stream():
|
348
|
+
"""
|
349
|
+
Get the legacy default CUDA stream.
|
350
|
+
"""
|
351
|
+
return current_context().get_legacy_default_stream()
|
352
|
+
|
353
|
+
|
354
|
+
@require_context
|
355
|
+
def per_thread_default_stream():
|
356
|
+
"""
|
357
|
+
Get the per-thread default CUDA stream.
|
358
|
+
"""
|
359
|
+
return current_context().get_per_thread_default_stream()
|
360
|
+
|
361
|
+
|
362
|
+
@require_context
|
363
|
+
def external_stream(ptr):
|
364
|
+
"""Create a Numba stream object for a stream allocated outside Numba.
|
365
|
+
|
366
|
+
:param ptr: Pointer to the external stream to wrap in a Numba Stream
|
367
|
+
:type ptr: int
|
368
|
+
"""
|
369
|
+
return current_context().create_external_stream(ptr)
|
370
|
+
|
371
|
+
|
372
|
+
# Page lock
|
373
|
+
@require_context
|
374
|
+
@contextlib.contextmanager
|
375
|
+
def pinned(*arylist):
|
376
|
+
"""A context manager for temporary pinning a sequence of host ndarrays.
|
377
|
+
"""
|
378
|
+
pmlist = []
|
379
|
+
for ary in arylist:
|
380
|
+
pm = current_context().mempin(ary, driver.host_pointer(ary),
|
381
|
+
driver.host_memory_size(ary),
|
382
|
+
mapped=False)
|
383
|
+
pmlist.append(pm)
|
384
|
+
yield
|
385
|
+
|
386
|
+
|
387
|
+
@require_context
|
388
|
+
@contextlib.contextmanager
|
389
|
+
def mapped(*arylist, **kws):
|
390
|
+
"""A context manager for temporarily mapping a sequence of host ndarrays.
|
391
|
+
"""
|
392
|
+
assert not kws or 'stream' in kws, "Only accept 'stream' as keyword."
|
393
|
+
stream = kws.get('stream', 0)
|
394
|
+
pmlist = []
|
395
|
+
devarylist = []
|
396
|
+
for ary in arylist:
|
397
|
+
pm = current_context().mempin(ary, driver.host_pointer(ary),
|
398
|
+
driver.host_memory_size(ary),
|
399
|
+
mapped=True)
|
400
|
+
pmlist.append(pm)
|
401
|
+
devary = devicearray.from_array_like(ary, gpu_data=pm, stream=stream)
|
402
|
+
devarylist.append(devary)
|
403
|
+
try:
|
404
|
+
if len(devarylist) == 1:
|
405
|
+
yield devarylist[0]
|
406
|
+
else:
|
407
|
+
yield devarylist
|
408
|
+
finally:
|
409
|
+
# When exiting from `with cuda.mapped(*arrs) as mapped_arrs:`, the name
|
410
|
+
# `mapped_arrs` stays in scope, blocking automatic unmapping based on
|
411
|
+
# reference count. We therefore invoke the finalizer manually.
|
412
|
+
for pm in pmlist:
|
413
|
+
pm.free()
|
414
|
+
|
415
|
+
|
416
|
+
def event(timing=True):
|
417
|
+
"""
|
418
|
+
Create a CUDA event. Timing data is only recorded by the event if it is
|
419
|
+
created with ``timing=True``.
|
420
|
+
"""
|
421
|
+
evt = current_context().create_event(timing=timing)
|
422
|
+
return evt
|
423
|
+
|
424
|
+
|
425
|
+
event_elapsed_time = driver.event_elapsed_time
|
426
|
+
|
427
|
+
|
428
|
+
# Device selection
|
429
|
+
|
430
|
+
def select_device(device_id):
|
431
|
+
"""
|
432
|
+
Make the context associated with device *device_id* the current context.
|
433
|
+
|
434
|
+
Returns a Device instance.
|
435
|
+
|
436
|
+
Raises exception on error.
|
437
|
+
"""
|
438
|
+
context = devices.get_context(device_id)
|
439
|
+
return context.device
|
440
|
+
|
441
|
+
|
442
|
+
def get_current_device():
|
443
|
+
"Get current device associated with the current thread"
|
444
|
+
return current_context().device
|
445
|
+
|
446
|
+
|
447
|
+
def list_devices():
|
448
|
+
"Return a list of all detected devices"
|
449
|
+
return devices.gpus
|
450
|
+
|
451
|
+
|
452
|
+
def close():
|
453
|
+
"""
|
454
|
+
Explicitly clears all contexts in the current thread, and destroys all
|
455
|
+
contexts if the current thread is the main thread.
|
456
|
+
"""
|
457
|
+
devices.reset()
|
458
|
+
|
459
|
+
|
460
|
+
def _auto_device(ary, stream=0, copy=True):
|
461
|
+
return devicearray.auto_device(ary, stream=stream, copy=copy)
|
462
|
+
|
463
|
+
|
464
|
+
def detect():
|
465
|
+
"""
|
466
|
+
Detect supported CUDA hardware and print a summary of the detected hardware.
|
467
|
+
|
468
|
+
Returns a boolean indicating whether any supported devices were detected.
|
469
|
+
"""
|
470
|
+
devlist = list_devices()
|
471
|
+
print('Found %d CUDA devices' % len(devlist))
|
472
|
+
supported_count = 0
|
473
|
+
for dev in devlist:
|
474
|
+
attrs = []
|
475
|
+
cc = dev.compute_capability
|
476
|
+
kernel_timeout = dev.KERNEL_EXEC_TIMEOUT
|
477
|
+
tcc = dev.TCC_DRIVER
|
478
|
+
fp32_to_fp64_ratio = dev.SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO
|
479
|
+
attrs += [('Compute Capability', '%d.%d' % cc)]
|
480
|
+
attrs += [('PCI Device ID', dev.PCI_DEVICE_ID)]
|
481
|
+
attrs += [('PCI Bus ID', dev.PCI_BUS_ID)]
|
482
|
+
attrs += [('UUID', dev.uuid)]
|
483
|
+
attrs += [('Watchdog', 'Enabled' if kernel_timeout else 'Disabled')]
|
484
|
+
if os.name == "nt":
|
485
|
+
attrs += [('Compute Mode', 'TCC' if tcc else 'WDDM')]
|
486
|
+
attrs += [('FP32/FP64 Performance Ratio', fp32_to_fp64_ratio)]
|
487
|
+
if cc < (3, 5):
|
488
|
+
support = '[NOT SUPPORTED: CC < 3.5]'
|
489
|
+
elif cc < (5, 0):
|
490
|
+
support = '[SUPPORTED (DEPRECATED)]'
|
491
|
+
supported_count += 1
|
492
|
+
else:
|
493
|
+
support = '[SUPPORTED]'
|
494
|
+
supported_count += 1
|
495
|
+
|
496
|
+
print('id %d %20s %40s' % (dev.id, dev.name, support))
|
497
|
+
for key, val in attrs:
|
498
|
+
print('%40s: %s' % (key, val))
|
499
|
+
|
500
|
+
print('Summary:')
|
501
|
+
print('\t%d/%d devices are supported' % (supported_count, len(devlist)))
|
502
|
+
return supported_count > 0
|
503
|
+
|
504
|
+
|
505
|
+
@contextlib.contextmanager
|
506
|
+
def defer_cleanup():
|
507
|
+
"""
|
508
|
+
Temporarily disable memory deallocation.
|
509
|
+
Use this to prevent resource deallocation breaking asynchronous execution.
|
510
|
+
|
511
|
+
For example::
|
512
|
+
|
513
|
+
with defer_cleanup():
|
514
|
+
# all cleanup is deferred in here
|
515
|
+
do_speed_critical_code()
|
516
|
+
# cleanup can occur here
|
517
|
+
|
518
|
+
Note: this context manager can be nested.
|
519
|
+
"""
|
520
|
+
with current_context().defer_cleanup():
|
521
|
+
yield
|
522
|
+
|
523
|
+
|
524
|
+
profiling = require_context(driver.profiling)
|
525
|
+
profile_start = require_context(driver.profile_start)
|
526
|
+
profile_stop = require_context(driver.profile_stop)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
import numpy as np
|
2
|
+
|
3
|
+
|
4
|
+
def prepare_shape_strides_dtype(shape, strides, dtype, order):
|
5
|
+
dtype = np.dtype(dtype)
|
6
|
+
if isinstance(shape, int):
|
7
|
+
shape = (shape,)
|
8
|
+
if isinstance(strides, int):
|
9
|
+
strides = (strides,)
|
10
|
+
else:
|
11
|
+
strides = strides or _fill_stride_by_order(shape, dtype, order)
|
12
|
+
return shape, strides, dtype
|
13
|
+
|
14
|
+
|
15
|
+
def _fill_stride_by_order(shape, dtype, order):
|
16
|
+
nd = len(shape)
|
17
|
+
if nd == 0:
|
18
|
+
return ()
|
19
|
+
strides = [0] * nd
|
20
|
+
if order == 'C':
|
21
|
+
strides[-1] = dtype.itemsize
|
22
|
+
for d in reversed(range(nd - 1)):
|
23
|
+
strides[d] = strides[d + 1] * shape[d + 1]
|
24
|
+
elif order == 'F':
|
25
|
+
strides[0] = dtype.itemsize
|
26
|
+
for d in range(1, nd):
|
27
|
+
strides[d] = strides[d - 1] * shape[d - 1]
|
28
|
+
else:
|
29
|
+
raise ValueError('must be either C/F order')
|
30
|
+
return tuple(strides)
|