numba-cuda 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +232 -113
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_fp16.h +661 -661
- numba_cuda/numba/cuda/cuda_fp16.hpp +3 -3
- numba_cuda/numba/cuda/cuda_paths.py +291 -99
- numba_cuda/numba/cuda/cudadecl.py +125 -69
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +463 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +16 -1
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +138 -29
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +317 -233
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +8 -6
- numba_cuda/numba/cuda/decorators.py +75 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +69 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +1 -1
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +3 -3
- numba_cuda/numba/cuda/intrinsics.py +31 -27
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +139 -102
- numba_cuda/numba/cuda/target.py +64 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +7 -6
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +57 -21
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +31 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +19 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +31 -25
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +2 -2
- numba_cuda/numba/cuda/vectorizers.py +37 -32
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/METADATA +1 -1
- numba_cuda-0.9.0.dist-info/RECORD +253 -0
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.0.dist-info/RECORD +0 -251
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/top_level.txt +0 -0
_numba_cuda_redirector.py
CHANGED
@@ -4,11 +4,14 @@ import pathlib
 import sys
 import warnings
 
-multiple_locations_msg = ("Multiple submodule search locations for {}. "
-                          "Cannot redirect numba.cuda to numba_cuda")
+multiple_locations_msg = (
+    "Multiple submodule search locations for {}. "
+    "Cannot redirect numba.cuda to numba_cuda"
+)
 
-no_spec_msg = ("Couldn't get spec for {}. "
-               "Cannot redirect numba.cuda to numba_cuda")
+no_spec_msg = (
+    "Couldn't get spec for {}. Cannot redirect numba.cuda to numba_cuda"
+)
 
 
 class NumbaCudaFinder(importlib.abc.MetaPathFinder):
@@ -19,17 +22,17 @@ class NumbaCudaFinder(importlib.abc.MetaPathFinder):
        if self.initialized is not None:
            return self.initialized

-        numba_spec = importlib.util.find_spec('numba')
+        numba_spec = importlib.util.find_spec("numba")

        if numba_spec is None:
-            warnings.warn(no_spec_msg.format('numba'))
+            warnings.warn(no_spec_msg.format("numba"))
            self.initialized = False
            return False

-        numba_cuda_spec = importlib.util.find_spec('numba_cuda')
+        numba_cuda_spec = importlib.util.find_spec("numba_cuda")

        if numba_spec is None:
-            warnings.warn(no_spec_msg.format('numba_cuda'))
+            warnings.warn(no_spec_msg.format("numba_cuda"))
            self.initialized = False
            return False

@@ -37,19 +40,19 @@ class NumbaCudaFinder(importlib.abc.MetaPathFinder):
        numba_cuda_search_locations = numba_cuda_spec.submodule_search_locations

        if len(numba_search_locations) != 1:
-            warnings.warn(multiple_locations_msg.format('numba'))
+            warnings.warn(multiple_locations_msg.format("numba"))
            self.initialized = False
            return False

        if len(numba_cuda_search_locations) != 1:
-            warnings.warn(multiple_locations_msg.format('numba_cuda'))
+            warnings.warn(multiple_locations_msg.format("numba_cuda"))
            self.initialized = False
            return False

        self.numba_path = numba_search_locations[0]

        location = numba_cuda_search_locations[0]
-        self.numba_cuda_path = str((pathlib.Path(location) / 'numba'))
+        self.numba_cuda_path = str((pathlib.Path(location) / "numba"))

        self.initialized = True
        return True
@@ -64,8 +67,9 @@ class NumbaCudaFinder(importlib.abc.MetaPathFinder):
            # Re-entrancy - return and carry on
            return None

-        oot_path = [p.replace(self.numba_path, self.numba_cuda_path)
-                    for p in path]
+        oot_path = [
+            p.replace(self.numba_path, self.numba_cuda_path) for p in path
+        ]
        for finder in sys.meta_path:
            try:
                spec = finder.find_spec(name, oot_path, target)
numba_cuda/VERSION
CHANGED
@@ -1 +1 @@
-0.8.0
+0.9.0
numba_cuda/numba/cuda/__init__.py
CHANGED
@@ -7,8 +7,12 @@ else:
     from .device_init import *
     from .device_init import _auto_device
 
-from numba.cuda.compiler import (compile, compile_for_current_device,
-                                 compile_ptx, compile_ptx_for_current_device)
+from numba.cuda.compiler import (
+    compile,
+    compile_for_current_device,
+    compile_ptx,
+    compile_ptx_for_current_device,
+)
 
 # This is the out-of-tree NVIDIA-maintained target. This is reported in Numba
 # sysinfo (`numba -s`):
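These re-exports keep the compilation entry points available directly on `numba.cuda`. A hedged usage sketch of `compile_ptx` (the kernel below is illustrative; the function returns the PTX source and the inferred return type):

```python
from numba import cuda

def axpy(r, a, x, y):
    i = cuda.grid(1)
    if i < r.size:
        r[i] = a * x[i] + y[i]

# Compile for an explicit float32 kernel signature; kernels return void.
sig = "void(float32[::1], float32, float32[::1], float32[::1])"
ptx, resty = cuda.compile_ptx(axpy, sig)
print(ptx.splitlines()[0], resty)
```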
numba_cuda/numba/cuda/api.py
CHANGED
@@ -2,7 +2,6 @@
 API that are reported to numba.cuda
 """
 
-
 import contextlib
 import os
 
@@ -28,35 +27,37 @@ def from_cuda_array_interface(desc, owner=None, sync=True):
    If ``sync`` is ``True``, then the imported stream (if present) will be
    synchronized.
    """
-    version = desc.get('version')
+    version = desc.get("version")
    # Mask introduced in version 1
    if 1 <= version:
-        mask = desc.get('mask')
+        mask = desc.get("mask")
        # Would ideally be better to detect if the mask is all valid
        if mask is not None:
-            raise NotImplementedError('Masked arrays are not supported')
+            raise NotImplementedError("Masked arrays are not supported")

-    shape = desc['shape']
-    strides = desc.get('strides')
-    dtype = np.dtype(desc['typestr'])
+    shape = desc["shape"]
+    strides = desc.get("strides")
+    dtype = np.dtype(desc["typestr"])

    shape, strides, dtype = prepare_shape_strides_dtype(
-        shape, strides, dtype, order='C')
+        shape, strides, dtype, order="C"
+    )
    size = driver.memory_size_from_info(shape, strides, dtype.itemsize)

-    devptr = driver.get_devptr_for_active_ctx(desc['data'][0])
+    devptr = driver.get_devptr_for_active_ctx(desc["data"][0])
    data = driver.MemoryPointer(
-        current_context(), devptr, size=size, owner=owner)
-    stream_ptr = desc.get('stream', None)
+        current_context(), devptr, size=size, owner=owner
+    )
+    stream_ptr = desc.get("stream", None)
    if stream_ptr is not None:
        stream = external_stream(stream_ptr)
        if sync and config.CUDA_ARRAY_INTERFACE_SYNC:
            stream.synchronize()
    else:
-        stream = 0 # No "Numba default stream", not the CUDA default stream
-    da = devicearray.DeviceNDArray(shape=shape, strides=strides,
-                                   dtype=dtype, gpu_data=data,
-                                   stream=stream)
+        stream = 0  # No "Numba default stream", not the CUDA default stream
+    da = devicearray.DeviceNDArray(
+        shape=shape, strides=strides, dtype=dtype, gpu_data=data, stream=stream
+    )
    return da


@@ -73,8 +74,9 @@ def as_cuda_array(obj, sync=True):
    if not is_cuda_array(obj):
        raise TypeError("*obj* doesn't implement the cuda array interface.")
    else:
-        return from_cuda_array_interface(obj.__cuda_array_interface__,
-                                         owner=obj, sync=sync)
+        return from_cuda_array_interface(
+            obj.__cuda_array_interface__, owner=obj, sync=sync
+        )


def is_cuda_array(obj):
@@ -82,7 +84,7 @@ def is_cuda_array(obj):

    Does not verify the validity of the interface.
    """
-    return hasattr(obj, '__cuda_array_interface__')
+    return hasattr(obj, "__cuda_array_interface__")


def is_float16_supported():
@@ -125,8 +127,9 @@ def to_device(obj, stream=0, copy=True, to=None):
        hary = d_ary.copy_to_host(stream=stream)
    """
    if to is None:
-        to, new = devicearray.auto_device(obj, stream=stream, copy=copy,
-                                          user_explicit=True)
+        to, new = devicearray.auto_device(
+            obj, stream=stream, copy=copy, user_explicit=True
+        )
        return to
    if copy:
        to.copy_to_device(obj, stream=stream)
@@ -134,20 +137,28 @@


@require_context
-def device_array(shape, dtype=np.float64, strides=None, order='C', stream=0):
+def device_array(shape, dtype=np.float64, strides=None, order="C", stream=0):
    """device_array(shape, dtype=np.float64, strides=None, order='C', stream=0)

    Allocate an empty device ndarray. Similar to :meth:`numpy.empty`.
    """
-    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
-                                                        order)
-    return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype,
-                                     stream=stream)
+    shape, strides, dtype = prepare_shape_strides_dtype(
+        shape, strides, dtype, order
+    )
+    return devicearray.DeviceNDArray(
+        shape=shape, strides=strides, dtype=dtype, stream=stream
+    )


@require_context
-def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
-                  attach_global=True):
+def managed_array(
+    shape,
+    dtype=np.float64,
+    strides=None,
+    order="C",
+    stream=0,
+    attach_global=True,
+):
    """managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
                     attach_global=True)

@@ -163,37 +174,48 @@ def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
                *host*, and memory is only accessible by devices
                with Compute Capability 6.0 and later.
    """
-    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
-                                                        order)
+    shape, strides, dtype = prepare_shape_strides_dtype(
+        shape, strides, dtype, order
+    )
    bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
-    buffer = current_context().memallocmanaged(bytesize,
-                                               attach_global=attach_global)
-    npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
-                       buffer=buffer)
+    buffer = current_context().memallocmanaged(
+        bytesize, attach_global=attach_global
+    )
+    npary = np.ndarray(
+        shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer
+    )
    managedview = np.ndarray.view(npary, type=devicearray.ManagedNDArray)
    managedview.device_setup(buffer, stream=stream)
    return managedview


@require_context
-def pinned_array(shape, dtype=np.float64, strides=None, order='C'):
+def pinned_array(shape, dtype=np.float64, strides=None, order="C"):
    """pinned_array(shape, dtype=np.float64, strides=None, order='C')

    Allocate an :class:`ndarray <numpy.ndarray>` with a buffer that is pinned
    (pagelocked). Similar to :func:`np.empty() <numpy.empty>`.
    """
-    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
-                                                        order)
-    bytesize = driver.memory_size_from_info(shape, strides,
-                                            dtype.itemsize)
+    shape, strides, dtype = prepare_shape_strides_dtype(
+        shape, strides, dtype, order
+    )
+    bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
    buffer = current_context().memhostalloc(bytesize)
-    return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
-                      buffer=buffer)
+    return np.ndarray(
+        shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer
+    )


@require_context
-def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
-                 portable=False, wc=False):
+def mapped_array(
+    shape,
+    dtype=np.float64,
+    strides=None,
+    order="C",
+    stream=0,
+    portable=False,
+    wc=False,
+):
    """mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
                    portable=False, wc=False)

@@ -206,12 +228,14 @@ def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
              to write by the host and to read by the device, but slower to
              write by the host and slower to write by the device.
    """
-    shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
-                                                        order)
+    shape, strides, dtype = prepare_shape_strides_dtype(
+        shape, strides, dtype, order
+    )
    bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
    buffer = current_context().memhostalloc(bytesize, mapped=True)
-    npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
-                       buffer=buffer)
+    npary = np.ndarray(
+        shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer
+    )
    mappedview = np.ndarray.view(npary, type=devicearray.MappedNDArray)
    mappedview.device_setup(buffer, stream=stream)
    return mappedview
@@ -243,8 +267,9 @@ def open_ipc_array(handle, shape, dtype, strides=None, offset=0):
    driver_handle.reserved[:] = handle
    # use *IpcHandle* to open the IPC memory
    ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset)
-    yield ipchandle.open_array(current_context(), shape=shape,
-                               strides=strides, dtype=dtype)
+    yield ipchandle.open_array(
+        current_context(), shape=shape, strides=strides, dtype=dtype
+    )
    ipchandle.close()


@@ -260,7 +285,7 @@ def _contiguous_strides_like_array(ary):
    """
    # Don't recompute strides if the default strides will be sufficient to
    # create a contiguous array.
-    if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1:
+    if ary.flags["C_CONTIGUOUS"] or ary.flags["F_CONTIGUOUS"] or ary.ndim <= 1:
        return None

    # Otherwise, we need to compute new strides using an algorithm adapted from
@@ -270,7 +295,7 @@ def _contiguous_strides_like_array(ary):

    # Stride permutation. E.g. a stride array (4, -2, 12) becomes
    # [(1, -2), (0, 4), (2, 12)]
-    strideperm = [ x for x in enumerate(ary.strides) ]
+    strideperm = [x for x in enumerate(ary.strides)]
    strideperm.sort(key=lambda x: x[1])

    # Compute new strides using permutation
@@ -283,10 +308,10 @@ def _contiguous_strides_like_array(ary):


def _order_like_array(ary):
-    if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']:
-        return 'F'
+    if ary.flags["F_CONTIGUOUS"] and not ary.flags["C_CONTIGUOUS"]:
+        return "F"
    else:
-        return 'C'
+        return "C"


def device_array_like(ary, stream=0):
@@ -296,8 +321,13 @@ def device_array_like(ary, stream=0):
    """
    strides = _contiguous_strides_like_array(ary)
    order = _order_like_array(ary)
-    return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
-                        order=order, stream=stream)
+    return device_array(
+        shape=ary.shape,
+        dtype=ary.dtype,
+        strides=strides,
+        order=order,
+        stream=stream,
+    )


def mapped_array_like(ary, stream=0, portable=False, wc=False):
@@ -307,8 +337,15 @@ def mapped_array_like(ary, stream=0, portable=False, wc=False):
    """
    strides = _contiguous_strides_like_array(ary)
    order = _order_like_array(ary)
-    return mapped_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
-                        order=order, stream=stream, portable=portable, wc=wc)
+    return mapped_array(
+        shape=ary.shape,
+        dtype=ary.dtype,
+        strides=strides,
+        order=order,
+        stream=stream,
+        portable=portable,
+        wc=wc,
+    )


def pinned_array_like(ary):
@@ -318,8 +355,9 @@ def pinned_array_like(ary):
    """
    strides = _contiguous_strides_like_array(ary)
    order = _order_like_array(ary)
-    return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
-                        order=order)
+    return pinned_array(
+        shape=ary.shape, dtype=ary.dtype, strides=strides, order=order
+    )


# Stream helper
@@ -373,13 +411,15 @@ def external_stream(ptr):
@require_context
@contextlib.contextmanager
def pinned(*arylist):
-    """A context manager for temporary pinning a sequence of host ndarrays.
-    """
+    """A context manager for temporary pinning a sequence of host ndarrays."""
    pmlist = []
    for ary in arylist:
-        pm = current_context().mempin(ary, driver.host_pointer(ary),
-                                      driver.host_memory_size(ary),
-                                      mapped=False)
+        pm = current_context().mempin(
+            ary,
+            driver.host_pointer(ary),
+            driver.host_memory_size(ary),
+            mapped=False,
+        )
        pmlist.append(pm)
    yield

@@ -387,16 +427,18 @@ def pinned(*arylist):
@require_context
@contextlib.contextmanager
def mapped(*arylist, **kws):
-    """A context manager for temporarily mapping a sequence of host ndarrays.
-    """
-    assert not kws or 'stream' in kws, "Only accept 'stream' as keyword."
-    stream = kws.get('stream', 0)
+    """A context manager for temporarily mapping a sequence of host ndarrays."""
+    assert not kws or "stream" in kws, "Only accept 'stream' as keyword."
+    stream = kws.get("stream", 0)
    pmlist = []
    devarylist = []
    for ary in arylist:
-        pm = current_context().mempin(ary, driver.host_pointer(ary),
-                                      driver.host_memory_size(ary),
-                                      mapped=True)
+        pm = current_context().mempin(
+            ary,
+            driver.host_pointer(ary),
+            driver.host_memory_size(ary),
+            mapped=True,
+        )
        pmlist.append(pm)
        devary = devicearray.from_array_like(ary, gpu_data=pm, stream=stream)
        devarylist.append(devary)
@@ -427,6 +469,7 @@ event_elapsed_time = driver.event_elapsed_time

# Device selection

+
def select_device(device_id):
    """
    Make the context associated with device *device_id* the current context.
@@ -468,7 +511,7 @@ def detect():
    Returns a boolean indicating whether any supported devices were detected.
    """
    devlist = list_devices()
-    print('Found %d CUDA devices' % len(devlist))
+    print("Found %d CUDA devices" % len(devlist))
    supported_count = 0
    for dev in devlist:
        attrs = []
@@ -476,29 +519,29 @@ def detect():
        kernel_timeout = dev.KERNEL_EXEC_TIMEOUT
        tcc = dev.TCC_DRIVER
        fp32_to_fp64_ratio = dev.SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO
-        attrs += [('Compute Capability', '%d.%d' % cc)]
-        attrs += [('PCI Device ID', dev.PCI_DEVICE_ID)]
-        attrs += [('PCI Bus ID', dev.PCI_BUS_ID)]
-        attrs += [('UUID', dev.uuid)]
-        attrs += [('Watchdog', 'Enabled' if kernel_timeout else 'Disabled')]
+        attrs += [("Compute Capability", "%d.%d" % cc)]
+        attrs += [("PCI Device ID", dev.PCI_DEVICE_ID)]
+        attrs += [("PCI Bus ID", dev.PCI_BUS_ID)]
+        attrs += [("UUID", dev.uuid)]
+        attrs += [("Watchdog", "Enabled" if kernel_timeout else "Disabled")]
        if os.name == "nt":
-            attrs += [('Compute Mode', 'TCC' if tcc else 'WDDM')]
-        attrs += [('FP32/FP64 Performance Ratio', fp32_to_fp64_ratio)]
+            attrs += [("Compute Mode", "TCC" if tcc else "WDDM")]
+        attrs += [("FP32/FP64 Performance Ratio", fp32_to_fp64_ratio)]
        if cc < (3, 5):
-            support = '[NOT SUPPORTED: CC < 3.5]'
+            support = "[NOT SUPPORTED: CC < 3.5]"
        elif cc < (5, 0):
-            support = '[SUPPORTED (DEPRECATED)]'
+            support = "[SUPPORTED (DEPRECATED)]"
            supported_count += 1
        else:
-            support = '[SUPPORTED]'
+            support = "[SUPPORTED]"
            supported_count += 1

-        print('id %d %20s %40s' % (dev.id, dev.name, support))
+        print("id %d %20s %40s" % (dev.id, dev.name, support))
        for key, val in attrs:
-            print('%40s: %s' % (key, val))
+            print("%40s: %s" % (key, val))

-    print('Summary:')
-    print('\t%d/%d devices are supported' % (supported_count, len(devlist)))
+    print("Summary:")
+    print("\t%d/%d devices are supported" % (supported_count, len(devlist)))
    return supported_count > 0
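The `api.py` changes above are mechanical reformatting (double quotes, re-wrapped calls). For orientation, a hedged sketch of the CUDA Array Interface path that `from_cuda_array_interface`/`as_cuda_array` implement (requires a CUDA device; values are illustrative):

```python
import numpy as np
from numba import cuda

d_arr = cuda.to_device(np.arange(10, dtype=np.float32))

# Device arrays export the dict that from_cuda_array_interface() consumes:
# "version", "shape", "typestr", "strides", "data", and optionally "stream".
desc = d_arr.__cuda_array_interface__
print(desc["shape"], desc["typestr"])

# as_cuda_array() wraps any exporter of the interface without copying.
view = cuda.as_cuda_array(d_arr)
```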
numba_cuda/numba/cuda/api_util.py
CHANGED
@@ -17,14 +17,14 @@ def _fill_stride_by_order(shape, dtype, order):
    if nd == 0:
        return ()
    strides = [0] * nd
-    if order == 'C':
+    if order == "C":
        strides[-1] = dtype.itemsize
        for d in reversed(range(nd - 1)):
            strides[d] = strides[d + 1] * shape[d + 1]
-    elif order == 'F':
+    elif order == "F":
        strides[0] = dtype.itemsize
        for d in range(1, nd):
            strides[d] = strides[d - 1] * shape[d - 1]
    else:
-        raise ValueError('must be either C/F order')
+        raise ValueError("must be either C/F order")
    return tuple(strides)
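Only the quote style changed in `_fill_stride_by_order`; the logic is the standard contiguous-stride computation. A worked check of what it produces, using NumPy's own stride calculation (shape and dtype are illustrative):

```python
import numpy as np

shape = (2, 3, 4)  # float32 itemsize is 4 bytes

# C order: the last axis is contiguous -> strides (48, 16, 4)
print(np.empty(shape, np.float32, order="C").strides)

# F order: the first axis is contiguous -> strides (4, 8, 24)
print(np.empty(shape, np.float32, order="F").strides)
```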
numba_cuda/numba/cuda/args.py
CHANGED
@@ -2,6 +2,7 @@
 Hints to wrap Kernel arguments to indicate how to manage host-device
 memory transfers before & after the kernel call.
 """
+
 import abc
 
 from numba.core.typing.typeof import typeof, Purpose
@@ -31,9 +32,8 @@ class ArgHint(metaclass=abc.ABCMeta):
class In(ArgHint):
    def to_device(self, retr, stream=0):
        from .cudadrv.devicearray import auto_device
-        devary, _ = auto_device(
-            self.value,
-            stream=stream)
+
+        devary, _ = auto_device(self.value, stream=stream)
        # A dummy writeback functor to keep devary alive until the kernel
        # is called.
        retr.append(lambda: devary)
@@ -43,10 +43,8 @@ class In(ArgHint):
class Out(ArgHint):
    def to_device(self, retr, stream=0):
        from .cudadrv.devicearray import auto_device
-        devary, conv = auto_device(
-            self.value,
-            copy=False,
-            stream=stream)
+
+        devary, conv = auto_device(self.value, copy=False, stream=stream)
        if conv:
            retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
        return devary
@@ -55,9 +53,8 @@ class Out(ArgHint):
class InOut(ArgHint):
    def to_device(self, retr, stream=0):
        from .cudadrv.devicearray import auto_device
-        devary, conv = auto_device(
-            self.value,
-            stream=stream)
+
+        devary, conv = auto_device(self.value, stream=stream)
        if conv:
            retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
        return devary
@@ -68,10 +65,9 @@ def wrap_arg(value, default=InOut):


__all__ = [
-    'In',
-    'Out',
-    'InOut',
-    'ArgHint',
-    'wrap_arg',
+    "In",
+    "Out",
+    "InOut",
+    "ArgHint",
+    "wrap_arg",
]
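The `In`/`Out`/`InOut` hints control transfer direction when host arrays are passed at kernel launch. A hedged usage sketch (the kernel is illustrative):

```python
import numpy as np
from numba import cuda

@cuda.jit
def scale(out, inp):
    i = cuda.grid(1)
    if i < inp.size:
        out[i] = 2.0 * inp[i]

a = np.arange(16, dtype=np.float32)
r = np.zeros_like(a)

# In: copy host->device only; Out: skip the initial copy and copy back
# after the launch; InOut (the default wrapping) does both.
scale[1, 16](cuda.Out(r), cuda.In(a))
```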
numba_cuda/numba/cuda/cg.py
CHANGED
@@ -26,13 +26,13 @@ def _this_grid(typingctx):
        one = context.get_constant(types.int32, 1)
        mod = builder.module
        return builder.call(
-            nvvmutils.declare_cudaCGGetIntrinsicHandle(mod),
-            (one,))
+            nvvmutils.declare_cudaCGGetIntrinsicHandle(mod), (one,)
+        )

    return sig, codegen


-@overload(this_grid, target='cuda')
+@overload(this_grid, target="cuda")
def _ol_this_grid():
    def impl():
        return _this_grid()
@@ -48,13 +48,13 @@ def _grid_group_sync(typingctx, group):
        flags = context.get_constant(types.int32, 0)
        mod = builder.module
        return builder.call(
-            nvvmutils.declare_cudaCGSynchronize(mod),
-            (*args, flags))
+            nvvmutils.declare_cudaCGSynchronize(mod), (*args, flags)
+        )

    return sig, codegen


-@overload_method(GridGroupClass, 'sync', target='cuda')
+@overload_method(GridGroupClass, "sync", target="cuda")
def _ol_grid_group_sync(group):
    def impl(group):
        return _grid_group_sync(group)