numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +4 -1
- numba_cuda/numba/cuda/_compat.py +47 -0
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
- numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
- numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
- numba_cuda/numba/cuda/codegen.py +46 -12
- numba_cuda/numba/cuda/compiler.py +15 -9
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +12 -11
- numba_cuda/numba/cuda/core/bytecode.py +21 -13
- numba_cuda/numba/cuda/core/byteflow.py +336 -90
- numba_cuda/numba/cuda/core/compiler.py +3 -4
- numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
- numba_cuda/numba/cuda/core/config.py +5 -7
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/controlflow.py +17 -9
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
- numba_cuda/numba/cuda/core/interpreter.py +334 -160
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +149 -128
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/pythonapi.py +3 -0
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +5 -5
- numba_cuda/numba/cuda/core/transforms.py +29 -16
- numba_cuda/numba/cuda/core/typed_passes.py +10 -10
- numba_cuda/numba/cuda/core/typeinfer.py +42 -27
- numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
- numba_cuda/numba/cuda/cpython/unicode.py +2 -2
- numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
- numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +25 -0
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/deviceufunc.py +3 -6
- numba_cuda/numba/cuda/dispatcher.py +39 -49
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
- numba_cuda/numba/cuda/lowering.py +36 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +61 -9
- numba_cuda/numba/cuda/np/numpy_support.py +32 -9
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/testing.py +4 -8
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
- numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +51 -2
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.
|
|
1
|
+
0.24.0
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: BSD-2-Clause
|
|
3
3
|
|
|
4
|
+
# delvewheel: patch
|
|
5
|
+
|
|
4
6
|
import importlib
|
|
5
7
|
from numba.cuda.core import config
|
|
6
8
|
from .utils import _readenv
|
|
@@ -23,7 +25,8 @@ if not (
|
|
|
23
25
|
):
|
|
24
26
|
raise ImportError(
|
|
25
27
|
"NVIDIA CUDA Python bindings not found. Install the 'cuda' package "
|
|
26
|
-
|
|
28
|
+
'(e.g. pip install "cuda-bindings==XY.*" or "numba-cuda[cuXY]", '
|
|
29
|
+
"with XY=12 or XY=13)."
|
|
27
30
|
)
|
|
28
31
|
|
|
29
32
|
if config.ENABLE_CUDASIM:
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: BSD-2-Clause
|
|
3
|
+
from packaging import version
|
|
4
|
+
from cuda import core
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
CUDA_CORE_VERSION = version.parse(core.__version__)
|
|
8
|
+
if CUDA_CORE_VERSION < version.parse("0.5.0"):
|
|
9
|
+
from cuda.core.experimental import (
|
|
10
|
+
Program,
|
|
11
|
+
ProgramOptions,
|
|
12
|
+
Linker,
|
|
13
|
+
LinkerOptions,
|
|
14
|
+
Stream,
|
|
15
|
+
Device,
|
|
16
|
+
launch,
|
|
17
|
+
ObjectCode,
|
|
18
|
+
LaunchConfig,
|
|
19
|
+
)
|
|
20
|
+
from cuda.core.experimental._utils.cuda_utils import CUDAError, NVRTCError
|
|
21
|
+
else:
|
|
22
|
+
from cuda.core import (
|
|
23
|
+
Program,
|
|
24
|
+
ProgramOptions,
|
|
25
|
+
Linker,
|
|
26
|
+
LinkerOptions,
|
|
27
|
+
Stream,
|
|
28
|
+
Device,
|
|
29
|
+
launch,
|
|
30
|
+
ObjectCode,
|
|
31
|
+
LaunchConfig,
|
|
32
|
+
)
|
|
33
|
+
from cuda.core._utils.cuda_utils import CUDAError, NVRTCError
|
|
34
|
+
|
|
35
|
+
__all__ = [
|
|
36
|
+
"Program",
|
|
37
|
+
"ProgramOptions",
|
|
38
|
+
"Linker",
|
|
39
|
+
"LinkerOptions",
|
|
40
|
+
"Stream",
|
|
41
|
+
"Device",
|
|
42
|
+
"launch",
|
|
43
|
+
"CUDAError",
|
|
44
|
+
"NVRTCError",
|
|
45
|
+
"ObjectCode",
|
|
46
|
+
"LaunchConfig",
|
|
47
|
+
]
|
numba_cuda/numba/cuda/api.py
CHANGED
|
@@ -21,6 +21,7 @@ current_context = devices.get_context
|
|
|
21
21
|
gpus = devices.gpus
|
|
22
22
|
|
|
23
23
|
|
|
24
|
+
@require_context
|
|
24
25
|
def from_cuda_array_interface(desc, owner=None, sync=True):
|
|
25
26
|
"""Create a DeviceNDArray from a cuda-array-interface description.
|
|
26
27
|
The ``owner`` is the owner of the underlying memory.
|
|
@@ -47,7 +48,9 @@ def from_cuda_array_interface(desc, owner=None, sync=True):
|
|
|
47
48
|
|
|
48
49
|
cudevptr_class = driver.binding.CUdeviceptr
|
|
49
50
|
devptr = cudevptr_class(desc["data"][0])
|
|
50
|
-
data = driver.MemoryPointer(
|
|
51
|
+
data = driver.MemoryPointer(
|
|
52
|
+
current_context(), devptr, size=size, owner=owner
|
|
53
|
+
)
|
|
51
54
|
stream_ptr = desc.get("stream", None)
|
|
52
55
|
if stream_ptr is not None:
|
|
53
56
|
stream = external_stream(stream_ptr)
|
|
Binary file
|
|
@@ -12,7 +12,6 @@
|
|
|
12
12
|
#include "frameobject.h"
|
|
13
13
|
#include "traceback.h"
|
|
14
14
|
#include "typeconv.hpp"
|
|
15
|
-
#include "_devicearray.h"
|
|
16
15
|
|
|
17
16
|
/*
|
|
18
17
|
* Notes on the C_TRACE macro:
|
|
@@ -30,7 +29,7 @@
|
|
|
30
29
|
*
|
|
31
30
|
*/
|
|
32
31
|
|
|
33
|
-
#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13))
|
|
32
|
+
#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13) || (PY_MINOR_VERSION == 14))
|
|
34
33
|
|
|
35
34
|
#ifndef Py_BUILD_CORE
|
|
36
35
|
#define Py_BUILD_CORE 1
|
|
@@ -940,37 +939,6 @@ CLEANUP:
|
|
|
940
939
|
return retval;
|
|
941
940
|
}
|
|
942
941
|
|
|
943
|
-
static int
|
|
944
|
-
import_devicearray(void)
|
|
945
|
-
{
|
|
946
|
-
PyObject *devicearray = PyImport_ImportModule(NUMBA_DEVICEARRAY_IMPORT_NAME);
|
|
947
|
-
if (devicearray == NULL) {
|
|
948
|
-
return -1;
|
|
949
|
-
}
|
|
950
|
-
|
|
951
|
-
PyObject *d = PyModule_GetDict(devicearray);
|
|
952
|
-
if (d == NULL) {
|
|
953
|
-
Py_DECREF(devicearray);
|
|
954
|
-
return -1;
|
|
955
|
-
}
|
|
956
|
-
|
|
957
|
-
PyObject *key = PyUnicode_FromString("_DEVICEARRAY_API");
|
|
958
|
-
PyObject *c_api = PyDict_GetItemWithError(d, key);
|
|
959
|
-
int retcode = 0;
|
|
960
|
-
if (PyCapsule_IsValid(c_api, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API")) {
|
|
961
|
-
DeviceArray_API = (void**)PyCapsule_GetPointer(c_api, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API");
|
|
962
|
-
if (DeviceArray_API == NULL) {
|
|
963
|
-
retcode = -1;
|
|
964
|
-
}
|
|
965
|
-
} else {
|
|
966
|
-
retcode = -1;
|
|
967
|
-
}
|
|
968
|
-
|
|
969
|
-
Py_DECREF(key);
|
|
970
|
-
Py_DECREF(devicearray);
|
|
971
|
-
return retcode;
|
|
972
|
-
}
|
|
973
|
-
|
|
974
942
|
static PyMethodDef Dispatcher_methods[] = {
|
|
975
943
|
{ "_clear", (PyCFunction)Dispatcher_clear, METH_NOARGS, NULL },
|
|
976
944
|
{ "_insert", (PyCFunction)Dispatcher_Insert, METH_VARARGS | METH_KEYWORDS,
|
|
@@ -1036,12 +1004,18 @@ static PyTypeObject DispatcherType = {
|
|
|
1036
1004
|
0, /* tp_version_tag */
|
|
1037
1005
|
0, /* tp_finalize */
|
|
1038
1006
|
0, /* tp_vectorcall */
|
|
1039
|
-
#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION
|
|
1007
|
+
#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION >= 12)
|
|
1040
1008
|
/* This was introduced first in 3.12
|
|
1041
1009
|
* https://github.com/python/cpython/issues/91051
|
|
1042
1010
|
*/
|
|
1043
1011
|
0, /* tp_watched */
|
|
1044
1012
|
#endif
|
|
1013
|
+
#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION >= 13)
|
|
1014
|
+
/* This was introduced in 3.13
|
|
1015
|
+
* https://github.com/python/cpython/pull/114900
|
|
1016
|
+
*/
|
|
1017
|
+
0, /* tp_versions_used */
|
|
1018
|
+
#endif
|
|
1045
1019
|
|
|
1046
1020
|
/* WARNING: Do not remove this, only modify it! It is a version guard to
|
|
1047
1021
|
* act as a reminder to update this struct on Python version update! */
|
|
@@ -1076,12 +1050,6 @@ static PyMethodDef ext_methods[] = {
|
|
|
1076
1050
|
|
|
1077
1051
|
|
|
1078
1052
|
MOD_INIT(_dispatcher) {
|
|
1079
|
-
if (import_devicearray() < 0) {
|
|
1080
|
-
PyErr_Print();
|
|
1081
|
-
PyErr_SetString(PyExc_ImportError, NUMBA_DEVICEARRAY_IMPORT_NAME " failed to import");
|
|
1082
|
-
return MOD_ERROR_VAL;
|
|
1083
|
-
}
|
|
1084
|
-
|
|
1085
1053
|
PyObject *m;
|
|
1086
1054
|
MOD_DEF(m, "_dispatcher", "No docs", ext_methods)
|
|
1087
1055
|
if (m == NULL)
|
|
@@ -110,7 +110,12 @@ _Numba_hashtable_hash_int(const void *key)
|
|
|
110
110
|
extern "C" Py_uhash_t
|
|
111
111
|
_Numba_hashtable_hash_ptr(const void *key)
|
|
112
112
|
{
|
|
113
|
+
/* Use public API on Python 3.13+; _Py_HashPointer is deprecated on 3.14+ */
|
|
114
|
+
#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION >= 13)
|
|
115
|
+
return (Py_uhash_t)Py_HashPointer((void *)key);
|
|
116
|
+
#else
|
|
113
117
|
return (Py_uhash_t)_Py_HashPointer((void *)key);
|
|
118
|
+
#endif
|
|
114
119
|
}
|
|
115
120
|
|
|
116
121
|
extern "C" int
|
|
Binary file
|
|
@@ -33,6 +33,6 @@
|
|
|
33
33
|
Py_DECREF(tmp); } while (0)
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13))
|
|
36
|
+
#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13) || (PY_MINOR_VERSION == 14))
|
|
37
37
|
|
|
38
38
|
#endif /* NUMBA_PY_MODULE_H_ */
|
|
Binary file
|
|
@@ -9,7 +9,6 @@
|
|
|
9
9
|
|
|
10
10
|
#include "_typeof.h"
|
|
11
11
|
#include "_hashtable.h"
|
|
12
|
-
#include "_devicearray.h"
|
|
13
12
|
#include "pyerrors.h"
|
|
14
13
|
|
|
15
14
|
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
|
|
@@ -18,11 +17,24 @@
|
|
|
18
17
|
#include <numpy/npy_2_compat.h>
|
|
19
18
|
#endif
|
|
20
19
|
|
|
21
|
-
#
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
20
|
+
#ifndef Py_BUILD_CORE
|
|
21
|
+
#define Py_BUILD_CORE 1
|
|
22
|
+
#endif
|
|
23
|
+
|
|
24
|
+
#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 13)
|
|
25
|
+
// required include from Python 3.13+
|
|
26
|
+
#include "internal/pycore_setobject.h"
|
|
27
|
+
#ifndef PySet_NextEntry
|
|
28
|
+
#define PySet_NextEntry _PySet_NextEntryRef
|
|
29
|
+
#endif
|
|
30
|
+
#else
|
|
31
|
+
#ifndef PySet_NextEntry
|
|
32
|
+
#define PySet_NextEntry _PySet_NextEntry
|
|
33
|
+
#endif
|
|
34
|
+
#endif
|
|
35
|
+
|
|
36
|
+
#ifdef Py_BUILD_CORE
|
|
37
|
+
#undef Py_BUILD_CORE
|
|
26
38
|
#endif
|
|
27
39
|
|
|
28
40
|
|
|
@@ -56,9 +68,6 @@ static PyObject *str_typeof_pyval = NULL;
|
|
|
56
68
|
static PyObject *str_value = NULL;
|
|
57
69
|
static PyObject *str_numba_type = NULL;
|
|
58
70
|
|
|
59
|
-
/* CUDA device array API */
|
|
60
|
-
void **DeviceArray_API;
|
|
61
|
-
|
|
62
71
|
/*
|
|
63
72
|
* Type fingerprint computation.
|
|
64
73
|
*/
|
|
@@ -414,17 +423,52 @@ compute_fingerprint(string_writer_t *w, PyObject *val)
|
|
|
414
423
|
Py_hash_t h;
|
|
415
424
|
PyObject *item;
|
|
416
425
|
Py_ssize_t pos = 0;
|
|
426
|
+
int rc;
|
|
427
|
+
|
|
428
|
+
#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 13
|
|
429
|
+
// needed when using _PySet_NextEntryRef
|
|
430
|
+
Py_BEGIN_CRITICAL_SECTION(val);
|
|
431
|
+
#endif
|
|
417
432
|
/* Only one item is considered, as in typeof.py */
|
|
418
|
-
|
|
433
|
+
rc = PySet_NextEntry(val, &pos, &item, &h);
|
|
434
|
+
|
|
435
|
+
#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 13
|
|
436
|
+
// needed when using _PySet_NextEntryRef
|
|
437
|
+
Py_END_CRITICAL_SECTION();
|
|
438
|
+
#endif
|
|
439
|
+
|
|
440
|
+
if (!rc) {
|
|
419
441
|
/* Empty set */
|
|
420
442
|
PyErr_SetString(PyExc_ValueError,
|
|
421
443
|
"cannot compute fingerprint of empty set");
|
|
422
444
|
return -1;
|
|
423
445
|
}
|
|
424
|
-
|
|
425
|
-
|
|
446
|
+
|
|
447
|
+
if (string_writer_put_char(w, OP_SET)) {
|
|
448
|
+
goto fingerprint_error;
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
if (compute_fingerprint(w, item)) {
|
|
452
|
+
goto fingerprint_error;
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
goto fingerprint_success;
|
|
456
|
+
|
|
457
|
+
fingerprint_error:
|
|
458
|
+
#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 13
|
|
459
|
+
// extra ref if using python >= 3.13
|
|
460
|
+
Py_XDECREF(item);
|
|
461
|
+
#endif
|
|
462
|
+
return -1;
|
|
463
|
+
|
|
464
|
+
fingerprint_success:
|
|
465
|
+
#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 13
|
|
466
|
+
// extra ref if using python >= 3.13
|
|
467
|
+
Py_XDECREF(item);
|
|
468
|
+
#endif
|
|
426
469
|
return 0;
|
|
427
470
|
}
|
|
471
|
+
|
|
428
472
|
if (PyObject_CheckBuffer(val)) {
|
|
429
473
|
Py_buffer buf;
|
|
430
474
|
int flags = PyBUF_ND | PyBUF_STRIDES | PyBUF_FORMAT;
|
|
@@ -857,109 +901,6 @@ int typecode_arrayscalar(PyObject *dispatcher, PyObject* aryscalar) {
|
|
|
857
901
|
return BASIC_TYPECODES[typecode];
|
|
858
902
|
}
|
|
859
903
|
|
|
860
|
-
static
|
|
861
|
-
int typecode_devicendarray(PyObject *dispatcher, PyObject *ary)
|
|
862
|
-
{
|
|
863
|
-
int typecode;
|
|
864
|
-
int dtype;
|
|
865
|
-
int ndim;
|
|
866
|
-
int layout = 0;
|
|
867
|
-
PyObject *ndim_obj = nullptr;
|
|
868
|
-
PyObject *num_obj = nullptr;
|
|
869
|
-
PyObject *dtype_obj = nullptr;
|
|
870
|
-
int dtype_num = 0;
|
|
871
|
-
|
|
872
|
-
PyObject* flags = PyObject_GetAttrString(ary, "flags");
|
|
873
|
-
if (flags == NULL)
|
|
874
|
-
{
|
|
875
|
-
PyErr_Clear();
|
|
876
|
-
goto FALLBACK;
|
|
877
|
-
}
|
|
878
|
-
|
|
879
|
-
if (PyDict_GetItemString(flags, "C_CONTIGUOUS") == Py_True) {
|
|
880
|
-
layout = 1;
|
|
881
|
-
} else if (PyDict_GetItemString(flags, "F_CONTIGUOUS") == Py_True) {
|
|
882
|
-
layout = 2;
|
|
883
|
-
}
|
|
884
|
-
|
|
885
|
-
Py_DECREF(flags);
|
|
886
|
-
|
|
887
|
-
ndim_obj = PyObject_GetAttrString(ary, "ndim");
|
|
888
|
-
if (ndim_obj == NULL) {
|
|
889
|
-
/* If there's no ndim, try to proceed by clearing the error and using the
|
|
890
|
-
* fallback. */
|
|
891
|
-
PyErr_Clear();
|
|
892
|
-
goto FALLBACK;
|
|
893
|
-
}
|
|
894
|
-
|
|
895
|
-
ndim = PyLong_AsLong(ndim_obj);
|
|
896
|
-
Py_DECREF(ndim_obj);
|
|
897
|
-
|
|
898
|
-
if (PyErr_Occurred()) {
|
|
899
|
-
/* ndim wasn't an integer for some reason - unlikely to happen, but try
|
|
900
|
-
* the fallback. */
|
|
901
|
-
PyErr_Clear();
|
|
902
|
-
goto FALLBACK;
|
|
903
|
-
}
|
|
904
|
-
|
|
905
|
-
if (ndim <= 0 || ndim > N_NDIM)
|
|
906
|
-
goto FALLBACK;
|
|
907
|
-
|
|
908
|
-
dtype_obj = PyObject_GetAttrString(ary, "dtype");
|
|
909
|
-
if (dtype_obj == NULL) {
|
|
910
|
-
/* No dtype: try the fallback. */
|
|
911
|
-
PyErr_Clear();
|
|
912
|
-
goto FALLBACK;
|
|
913
|
-
}
|
|
914
|
-
|
|
915
|
-
num_obj = PyObject_GetAttrString(dtype_obj, "num");
|
|
916
|
-
Py_DECREF(dtype_obj);
|
|
917
|
-
|
|
918
|
-
if (num_obj == NULL) {
|
|
919
|
-
/* This strange dtype has no num - try the fallback. */
|
|
920
|
-
PyErr_Clear();
|
|
921
|
-
goto FALLBACK;
|
|
922
|
-
}
|
|
923
|
-
|
|
924
|
-
dtype_num = PyLong_AsLong(num_obj);
|
|
925
|
-
Py_DECREF(num_obj);
|
|
926
|
-
|
|
927
|
-
if (PyErr_Occurred()) {
|
|
928
|
-
/* num wasn't an integer for some reason - unlikely to happen, but try
|
|
929
|
-
* the fallback. */
|
|
930
|
-
PyErr_Clear();
|
|
931
|
-
goto FALLBACK;
|
|
932
|
-
}
|
|
933
|
-
|
|
934
|
-
dtype = dtype_num_to_typecode(dtype_num);
|
|
935
|
-
if (dtype == -1) {
|
|
936
|
-
/* Not a dtype we have in the global lookup table. */
|
|
937
|
-
goto FALLBACK;
|
|
938
|
-
}
|
|
939
|
-
|
|
940
|
-
/* Fast path, using direct table lookup */
|
|
941
|
-
assert(layout < N_LAYOUT);
|
|
942
|
-
assert(ndim <= N_NDIM);
|
|
943
|
-
assert(dtype < N_DTYPES);
|
|
944
|
-
typecode = cached_arycode[ndim - 1][layout][dtype];
|
|
945
|
-
|
|
946
|
-
if (typecode == -1) {
|
|
947
|
-
/* First use of this table entry, so it requires populating */
|
|
948
|
-
typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary);
|
|
949
|
-
cached_arycode[ndim - 1][layout][dtype] = typecode;
|
|
950
|
-
}
|
|
951
|
-
|
|
952
|
-
return typecode;
|
|
953
|
-
|
|
954
|
-
FALLBACK:
|
|
955
|
-
/* Slower path, for non-trivial array types. At present this always uses
|
|
956
|
-
the fingerprinting to get the typecode. Future optimization might
|
|
957
|
-
implement a cache, but this would require some fast equivalent of
|
|
958
|
-
PyArray_DESCR for a device array. */
|
|
959
|
-
|
|
960
|
-
return typecode_using_fingerprint(dispatcher, (PyObject *) ary);
|
|
961
|
-
}
|
|
962
|
-
|
|
963
904
|
extern "C" int
|
|
964
905
|
typeof_typecode(PyObject *dispatcher, PyObject *val)
|
|
965
906
|
{
|
|
@@ -994,10 +935,6 @@ typeof_typecode(PyObject *dispatcher, PyObject *val)
|
|
|
994
935
|
else if (tyobj == &PyArray_Type) {
|
|
995
936
|
return typecode_ndarray(dispatcher, (PyArrayObject*)val);
|
|
996
937
|
}
|
|
997
|
-
/* Subtype of CUDA device array */
|
|
998
|
-
else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) {
|
|
999
|
-
return typecode_devicendarray(dispatcher, val);
|
|
1000
|
-
}
|
|
1001
938
|
/* Subtypes of Array handling */
|
|
1002
939
|
else if (PyType_IsSubtype(tyobj, &PyArray_Type)) {
|
|
1003
940
|
/* By default, Numba will treat all numpy.ndarray subtypes as if they
|
|
@@ -337,12 +337,18 @@ static PyTypeObject MemAllocType = {
|
|
|
337
337
|
0, /* tp_version_tag */
|
|
338
338
|
0, /* tp_finalize */
|
|
339
339
|
0, /* tp_vectorcall */
|
|
340
|
-
#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION
|
|
340
|
+
#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION >= 12)
|
|
341
341
|
/* This was introduced first in 3.12
|
|
342
342
|
* https://github.com/python/cpython/issues/91051
|
|
343
343
|
*/
|
|
344
344
|
0, /* tp_watched */
|
|
345
345
|
#endif
|
|
346
|
+
#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION >= 13)
|
|
347
|
+
/* This was introduced in 3.13
|
|
348
|
+
* https://github.com/python/cpython/pull/114900
|
|
349
|
+
*/
|
|
350
|
+
0, /* tp_versions_used */
|
|
351
|
+
#endif
|
|
346
352
|
|
|
347
353
|
/* WARNING: Do not remove this, only modify it! It is a version guard to
|
|
348
354
|
* act as a reminder to update this struct on Python version update! */
|
|
Binary file
|
|
@@ -422,10 +422,9 @@ HAVE_ARGUMENT = dis.HAVE_ARGUMENT
|
|
|
422
422
|
EXTENDED_ARG = dis.EXTENDED_ARG
|
|
423
423
|
|
|
424
424
|
|
|
425
|
-
_BUILTIN_TYPE_NAMES = {
|
|
426
|
-
for k, v in types.__dict__.items()
|
|
427
|
-
|
|
428
|
-
_BUILTIN_TYPE_NAMES[v] = k
|
|
425
|
+
_BUILTIN_TYPE_NAMES = {
|
|
426
|
+
v: k for k, v in types.__dict__.items() if type(v) is type
|
|
427
|
+
}
|
|
429
428
|
|
|
430
429
|
|
|
431
430
|
def _builtin_type(name):
|
|
@@ -463,7 +462,7 @@ def _extract_class_dict(cls):
|
|
|
463
462
|
base_value = inherited_dict[name]
|
|
464
463
|
if value is base_value:
|
|
465
464
|
to_remove.append(name)
|
|
466
|
-
except KeyError:
|
|
465
|
+
except KeyError: # noqa: PERF203
|
|
467
466
|
pass
|
|
468
467
|
for name in to_remove:
|
|
469
468
|
clsdict.pop(name)
|
numba_cuda/numba/cuda/codegen.py
CHANGED
|
@@ -12,6 +12,7 @@ from numba.cuda.cudadrv.linkable_code import LinkableCode
|
|
|
12
12
|
from numba.cuda.memory_management.nrt import NRT_LIBRARY
|
|
13
13
|
|
|
14
14
|
import os
|
|
15
|
+
import pickle
|
|
15
16
|
import subprocess
|
|
16
17
|
import tempfile
|
|
17
18
|
|
|
@@ -189,6 +190,11 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
189
190
|
|
|
190
191
|
self.use_cooperative = False
|
|
191
192
|
|
|
193
|
+
# Objects that need to be kept alive for the lifetime of the
|
|
194
|
+
# kernels or device functions generated by this code library,
|
|
195
|
+
# e.g., device arrays captured from global scope.
|
|
196
|
+
self.referenced_objects = {}
|
|
197
|
+
|
|
192
198
|
@property
|
|
193
199
|
def llvm_strs(self):
|
|
194
200
|
if self._llvm_strs is None:
|
|
@@ -203,9 +209,14 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
203
209
|
return cc
|
|
204
210
|
|
|
205
211
|
device = devices.get_context().device
|
|
206
|
-
|
|
212
|
+
cc = device.compute_capability
|
|
213
|
+
cc = (cc[0], cc[1], "a" if cc >= (9, 0) else "")
|
|
214
|
+
return cc
|
|
207
215
|
|
|
208
216
|
def get_asm_str(self, cc=None):
|
|
217
|
+
return "\n".join(self.get_asm_strs(cc=cc))
|
|
218
|
+
|
|
219
|
+
def get_asm_strs(self, cc=None):
|
|
209
220
|
cc = self._ensure_cc(cc)
|
|
210
221
|
|
|
211
222
|
ptxes = self._ptx_cache.get(cc, None)
|
|
@@ -218,21 +229,25 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
218
229
|
|
|
219
230
|
irs = self.llvm_strs
|
|
220
231
|
|
|
221
|
-
|
|
232
|
+
if "g" in options:
|
|
233
|
+
ptxes = [nvvm.compile_ir(ir, **options) for ir in irs]
|
|
234
|
+
else:
|
|
235
|
+
ptxes = [nvvm.compile_ir(irs, **options)]
|
|
222
236
|
|
|
223
237
|
# Sometimes the result from NVVM contains trailing whitespace and
|
|
224
238
|
# nulls, which we strip so that the assembly dump looks a little
|
|
225
239
|
# tidier.
|
|
226
|
-
|
|
240
|
+
ptxes = [ptx.decode().strip("\x00").strip() for ptx in ptxes]
|
|
227
241
|
|
|
228
242
|
if config.DUMP_ASSEMBLY:
|
|
229
243
|
print(("ASSEMBLY %s" % self._name).center(80, "-"))
|
|
230
|
-
|
|
244
|
+
for ptx in ptxes:
|
|
245
|
+
print(ptx)
|
|
231
246
|
print("=" * 80)
|
|
232
247
|
|
|
233
|
-
self._ptx_cache[cc] =
|
|
248
|
+
self._ptx_cache[cc] = ptxes
|
|
234
249
|
|
|
235
|
-
return
|
|
250
|
+
return ptxes
|
|
236
251
|
|
|
237
252
|
def get_lto_ptx(self, cc=None):
|
|
238
253
|
"""
|
|
@@ -247,7 +262,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
247
262
|
|
|
248
263
|
cc = self._ensure_cc(cc)
|
|
249
264
|
|
|
250
|
-
linker = driver._Linker
|
|
265
|
+
linker = driver._Linker(
|
|
251
266
|
max_registers=self._max_registers,
|
|
252
267
|
cc=cc,
|
|
253
268
|
additional_flags=["-ptx"],
|
|
@@ -284,8 +299,9 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
284
299
|
ltoir = self.get_ltoir(cc=cc)
|
|
285
300
|
linker.add_ltoir(ltoir)
|
|
286
301
|
else:
|
|
287
|
-
|
|
288
|
-
|
|
302
|
+
ptxes = self.get_asm_strs(cc=cc)
|
|
303
|
+
for ptx in ptxes:
|
|
304
|
+
linker.add_ptx(ptx.encode())
|
|
289
305
|
|
|
290
306
|
for path in self._linking_files:
|
|
291
307
|
linker.add_file_guess_ext(path, ignore_nonlto)
|
|
@@ -308,7 +324,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
308
324
|
print(ptx)
|
|
309
325
|
print("=" * 80)
|
|
310
326
|
|
|
311
|
-
linker = driver._Linker
|
|
327
|
+
linker = driver._Linker(
|
|
312
328
|
max_registers=self._max_registers, cc=cc, lto=self._lto
|
|
313
329
|
)
|
|
314
330
|
self._link_all(linker, cc, ignore_nonlto=False)
|
|
@@ -333,7 +349,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
333
349
|
cufunc = self._cufunc_cache.get(device.id, None)
|
|
334
350
|
if cufunc:
|
|
335
351
|
return cufunc
|
|
336
|
-
cubin = self.get_cubin(
|
|
352
|
+
cubin = self.get_cubin()
|
|
337
353
|
module = ctx.create_module_image(
|
|
338
354
|
cubin, self._setup_functions, self._teardown_functions
|
|
339
355
|
)
|
|
@@ -377,6 +393,9 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
377
393
|
self._setup_functions.extend(library._setup_functions)
|
|
378
394
|
self._teardown_functions.extend(library._teardown_functions)
|
|
379
395
|
self.use_cooperative |= library.use_cooperative
|
|
396
|
+
self.referenced_objects.update(
|
|
397
|
+
getattr(library, "referenced_objects", {})
|
|
398
|
+
)
|
|
380
399
|
|
|
381
400
|
def add_linking_file(self, path_or_obj):
|
|
382
401
|
if isinstance(path_or_obj, LinkableCode):
|
|
@@ -432,7 +451,10 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
432
451
|
for mod in library.modules:
|
|
433
452
|
for fn in mod.functions:
|
|
434
453
|
if not fn.is_declaration:
|
|
435
|
-
|
|
454
|
+
if "g" in self._nvvm_options:
|
|
455
|
+
fn.linkage = "weak_odr"
|
|
456
|
+
else:
|
|
457
|
+
fn.linkage = "linkonce_odr"
|
|
436
458
|
|
|
437
459
|
self._finalized = True
|
|
438
460
|
|
|
@@ -442,6 +464,18 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
442
464
|
but loaded functions are discarded. They are recreated when needed
|
|
443
465
|
after deserialization.
|
|
444
466
|
"""
|
|
467
|
+
# Check for captured device arrays that cannot be safely cached.
|
|
468
|
+
if self.referenced_objects:
|
|
469
|
+
if any(
|
|
470
|
+
getattr(obj, "__cuda_array_interface__", None) is not None
|
|
471
|
+
for obj in self.referenced_objects.values()
|
|
472
|
+
):
|
|
473
|
+
raise pickle.PicklingError(
|
|
474
|
+
"Cannot serialize kernels or device functions referencing "
|
|
475
|
+
"global device arrays. Pass the array(s) as arguments "
|
|
476
|
+
"to the kernel instead."
|
|
477
|
+
)
|
|
478
|
+
|
|
445
479
|
nrt = False
|
|
446
480
|
if self._linking_files:
|
|
447
481
|
if (
|
|
@@ -852,15 +852,15 @@ def kernel_fixup(kernel, debug):
|
|
|
852
852
|
return_value = kernel.args[0]
|
|
853
853
|
|
|
854
854
|
for block in kernel.blocks:
|
|
855
|
-
remove_list = []
|
|
856
|
-
|
|
857
855
|
# Find all stores first
|
|
858
|
-
|
|
856
|
+
remove_list = [
|
|
857
|
+
inst
|
|
858
|
+
for inst in block.instructions
|
|
859
859
|
if (
|
|
860
860
|
isinstance(inst, ir.StoreInstr)
|
|
861
861
|
and inst.operands[1] == return_value
|
|
862
|
-
)
|
|
863
|
-
|
|
862
|
+
)
|
|
863
|
+
]
|
|
864
864
|
|
|
865
865
|
# Remove all stores
|
|
866
866
|
for to_remove in remove_list:
|
|
@@ -1023,10 +1023,9 @@ def compile_all(
|
|
|
1023
1023
|
)
|
|
1024
1024
|
|
|
1025
1025
|
if lto:
|
|
1026
|
-
|
|
1026
|
+
codes = [lib.get_ltoir(cc=cc)]
|
|
1027
1027
|
else:
|
|
1028
|
-
|
|
1029
|
-
codes = [code]
|
|
1028
|
+
codes = lib.get_asm_strs(cc=cc)
|
|
1030
1029
|
|
|
1031
1030
|
# linking_files
|
|
1032
1031
|
is_ltoir = output == "ltoir"
|
|
@@ -1241,7 +1240,14 @@ def compile(
|
|
|
1241
1240
|
if lto:
|
|
1242
1241
|
code = lib.get_ltoir(cc=cc)
|
|
1243
1242
|
else:
|
|
1244
|
-
|
|
1243
|
+
codes = lib.get_asm_strs(cc=cc)
|
|
1244
|
+
if len(codes) == 1:
|
|
1245
|
+
code = codes[0]
|
|
1246
|
+
else:
|
|
1247
|
+
raise RuntimeError(
|
|
1248
|
+
"Compiling this function results in multiple "
|
|
1249
|
+
"PTX files. Use compile_all() instead"
|
|
1250
|
+
)
|
|
1245
1251
|
return code, resty
|
|
1246
1252
|
|
|
1247
1253
|
|