numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +4 -1
  3. numba_cuda/numba/cuda/_compat.py +47 -0
  4. numba_cuda/numba/cuda/api.py +4 -1
  5. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
  7. numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
  8. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
  10. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  11. numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
  12. numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
  13. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  14. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
  15. numba_cuda/numba/cuda/codegen.py +46 -12
  16. numba_cuda/numba/cuda/compiler.py +15 -9
  17. numba_cuda/numba/cuda/core/analysis.py +29 -21
  18. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
  19. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  20. numba_cuda/numba/cuda/core/base.py +12 -11
  21. numba_cuda/numba/cuda/core/bytecode.py +21 -13
  22. numba_cuda/numba/cuda/core/byteflow.py +336 -90
  23. numba_cuda/numba/cuda/core/compiler.py +3 -4
  24. numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
  25. numba_cuda/numba/cuda/core/config.py +5 -7
  26. numba_cuda/numba/cuda/core/consts.py +1 -1
  27. numba_cuda/numba/cuda/core/controlflow.py +17 -9
  28. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  29. numba_cuda/numba/cuda/core/errors.py +4 -912
  30. numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
  31. numba_cuda/numba/cuda/core/interpreter.py +334 -160
  32. numba_cuda/numba/cuda/core/ir.py +191 -119
  33. numba_cuda/numba/cuda/core/ir_utils.py +149 -128
  34. numba_cuda/numba/cuda/core/postproc.py +8 -8
  35. numba_cuda/numba/cuda/core/pythonapi.py +3 -0
  36. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  37. numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
  38. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  39. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  40. numba_cuda/numba/cuda/core/ssa.py +5 -5
  41. numba_cuda/numba/cuda/core/transforms.py +29 -16
  42. numba_cuda/numba/cuda/core/typed_passes.py +10 -10
  43. numba_cuda/numba/cuda/core/typeinfer.py +42 -27
  44. numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
  45. numba_cuda/numba/cuda/cpython/unicode.py +2 -2
  46. numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
  47. numba_cuda/numba/cuda/cudadecl.py +0 -13
  48. numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
  49. numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
  50. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  51. numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
  52. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  53. numba_cuda/numba/cuda/debuginfo.py +25 -0
  54. numba_cuda/numba/cuda/descriptor.py +1 -1
  55. numba_cuda/numba/cuda/device_init.py +4 -7
  56. numba_cuda/numba/cuda/deviceufunc.py +3 -6
  57. numba_cuda/numba/cuda/dispatcher.py +39 -49
  58. numba_cuda/numba/cuda/intrinsics.py +150 -1
  59. numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
  60. numba_cuda/numba/cuda/lowering.py +36 -29
  61. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  62. numba_cuda/numba/cuda/np/arrayobj.py +61 -9
  63. numba_cuda/numba/cuda/np/numpy_support.py +32 -9
  64. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
  65. numba_cuda/numba/cuda/printimpl.py +20 -0
  66. numba_cuda/numba/cuda/serialize.py +10 -0
  67. numba_cuda/numba/cuda/stubs.py +0 -11
  68. numba_cuda/numba/cuda/testing.py +4 -8
  69. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  70. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  71. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
  72. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  73. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  74. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  75. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
  76. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  77. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
  78. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
  79. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
  80. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  81. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
  82. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
  83. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
  85. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
  86. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  87. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  88. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
  89. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  90. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  91. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
  92. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
  93. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  94. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  95. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  96. numba_cuda/numba/cuda/tests/support.py +11 -0
  97. numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
  98. numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
  99. numba_cuda/numba/cuda/typing/context.py +3 -1
  100. numba_cuda/numba/cuda/typing/typeof.py +51 -2
  101. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
  102. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
  103. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  104. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  105. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  106. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  107. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
  108. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
  109. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
  110. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION CHANGED
@@ -1 +1 @@
1
- 0.21.1
1
+ 0.24.0
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: BSD-2-Clause
3
3
 
4
+ # delvewheel: patch
5
+
4
6
  import importlib
5
7
  from numba.cuda.core import config
6
8
  from .utils import _readenv
@@ -23,7 +25,8 @@ if not (
23
25
  ):
24
26
  raise ImportError(
25
27
  "NVIDIA CUDA Python bindings not found. Install the 'cuda' package "
26
- "(e.g. pip install nvidia-cuda-python or numba-cuda[cuXY])."
28
+ '(e.g. pip install "cuda-bindings==XY.*" or "numba-cuda[cuXY]", '
29
+ "with XY=12 or XY=13)."
27
30
  )
28
31
 
29
32
  if config.ENABLE_CUDASIM:
@@ -0,0 +1,47 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+ from packaging import version
4
+ from cuda import core
5
+
6
+
7
+ CUDA_CORE_VERSION = version.parse(core.__version__)
8
+ if CUDA_CORE_VERSION < version.parse("0.5.0"):
9
+ from cuda.core.experimental import (
10
+ Program,
11
+ ProgramOptions,
12
+ Linker,
13
+ LinkerOptions,
14
+ Stream,
15
+ Device,
16
+ launch,
17
+ ObjectCode,
18
+ LaunchConfig,
19
+ )
20
+ from cuda.core.experimental._utils.cuda_utils import CUDAError, NVRTCError
21
+ else:
22
+ from cuda.core import (
23
+ Program,
24
+ ProgramOptions,
25
+ Linker,
26
+ LinkerOptions,
27
+ Stream,
28
+ Device,
29
+ launch,
30
+ ObjectCode,
31
+ LaunchConfig,
32
+ )
33
+ from cuda.core._utils.cuda_utils import CUDAError, NVRTCError
34
+
35
+ __all__ = [
36
+ "Program",
37
+ "ProgramOptions",
38
+ "Linker",
39
+ "LinkerOptions",
40
+ "Stream",
41
+ "Device",
42
+ "launch",
43
+ "CUDAError",
44
+ "NVRTCError",
45
+ "ObjectCode",
46
+ "LaunchConfig",
47
+ ]
@@ -21,6 +21,7 @@ current_context = devices.get_context
21
21
  gpus = devices.gpus
22
22
 
23
23
 
24
+ @require_context
24
25
  def from_cuda_array_interface(desc, owner=None, sync=True):
25
26
  """Create a DeviceNDArray from a cuda-array-interface description.
26
27
  The ``owner`` is the owner of the underlying memory.
@@ -47,7 +48,9 @@ def from_cuda_array_interface(desc, owner=None, sync=True):
47
48
 
48
49
  cudevptr_class = driver.binding.CUdeviceptr
49
50
  devptr = cudevptr_class(desc["data"][0])
50
- data = driver.MemoryPointer(devptr, size=size, owner=owner)
51
+ data = driver.MemoryPointer(
52
+ current_context(), devptr, size=size, owner=owner
53
+ )
51
54
  stream_ptr = desc.get("stream", None)
52
55
  if stream_ptr is not None:
53
56
  stream = external_stream(stream_ptr)
@@ -12,7 +12,6 @@
12
12
  #include "frameobject.h"
13
13
  #include "traceback.h"
14
14
  #include "typeconv.hpp"
15
- #include "_devicearray.h"
16
15
 
17
16
  /*
18
17
  * Notes on the C_TRACE macro:
@@ -30,7 +29,7 @@
30
29
  *
31
30
  */
32
31
 
33
- #if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13))
32
+ #if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13) || (PY_MINOR_VERSION == 14))
34
33
 
35
34
  #ifndef Py_BUILD_CORE
36
35
  #define Py_BUILD_CORE 1
@@ -940,37 +939,6 @@ CLEANUP:
940
939
  return retval;
941
940
  }
942
941
 
943
- static int
944
- import_devicearray(void)
945
- {
946
- PyObject *devicearray = PyImport_ImportModule(NUMBA_DEVICEARRAY_IMPORT_NAME);
947
- if (devicearray == NULL) {
948
- return -1;
949
- }
950
-
951
- PyObject *d = PyModule_GetDict(devicearray);
952
- if (d == NULL) {
953
- Py_DECREF(devicearray);
954
- return -1;
955
- }
956
-
957
- PyObject *key = PyUnicode_FromString("_DEVICEARRAY_API");
958
- PyObject *c_api = PyDict_GetItemWithError(d, key);
959
- int retcode = 0;
960
- if (PyCapsule_IsValid(c_api, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API")) {
961
- DeviceArray_API = (void**)PyCapsule_GetPointer(c_api, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API");
962
- if (DeviceArray_API == NULL) {
963
- retcode = -1;
964
- }
965
- } else {
966
- retcode = -1;
967
- }
968
-
969
- Py_DECREF(key);
970
- Py_DECREF(devicearray);
971
- return retcode;
972
- }
973
-
974
942
  static PyMethodDef Dispatcher_methods[] = {
975
943
  { "_clear", (PyCFunction)Dispatcher_clear, METH_NOARGS, NULL },
976
944
  { "_insert", (PyCFunction)Dispatcher_Insert, METH_VARARGS | METH_KEYWORDS,
@@ -1036,12 +1004,18 @@ static PyTypeObject DispatcherType = {
1036
1004
  0, /* tp_version_tag */
1037
1005
  0, /* tp_finalize */
1038
1006
  0, /* tp_vectorcall */
1039
- #if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12)
1007
+ #if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION >= 12)
1040
1008
  /* This was introduced first in 3.12
1041
1009
  * https://github.com/python/cpython/issues/91051
1042
1010
  */
1043
1011
  0, /* tp_watched */
1044
1012
  #endif
1013
+ #if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION >= 13)
1014
+ /* This was introduced in 3.13
1015
+ * https://github.com/python/cpython/pull/114900
1016
+ */
1017
+ 0, /* tp_versions_used */
1018
+ #endif
1045
1019
 
1046
1020
  /* WARNING: Do not remove this, only modify it! It is a version guard to
1047
1021
  * act as a reminder to update this struct on Python version update! */
@@ -1076,12 +1050,6 @@ static PyMethodDef ext_methods[] = {
1076
1050
 
1077
1051
 
1078
1052
  MOD_INIT(_dispatcher) {
1079
- if (import_devicearray() < 0) {
1080
- PyErr_Print();
1081
- PyErr_SetString(PyExc_ImportError, NUMBA_DEVICEARRAY_IMPORT_NAME " failed to import");
1082
- return MOD_ERROR_VAL;
1083
- }
1084
-
1085
1053
  PyObject *m;
1086
1054
  MOD_DEF(m, "_dispatcher", "No docs", ext_methods)
1087
1055
  if (m == NULL)
@@ -110,7 +110,12 @@ _Numba_hashtable_hash_int(const void *key)
110
110
  extern "C" Py_uhash_t
111
111
  _Numba_hashtable_hash_ptr(const void *key)
112
112
  {
113
+ /* Use public API on Python 3.13+; _Py_HashPointer is deprecated on 3.14+ */
114
+ #if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION >= 13)
115
+ return (Py_uhash_t)Py_HashPointer((void *)key);
116
+ #else
113
117
  return (Py_uhash_t)_Py_HashPointer((void *)key);
118
+ #endif
114
119
  }
115
120
 
116
121
  extern "C" int
@@ -33,6 +33,6 @@
33
33
  Py_DECREF(tmp); } while (0)
34
34
 
35
35
 
36
- #define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13))
36
+ #define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13) || (PY_MINOR_VERSION == 14))
37
37
 
38
38
  #endif /* NUMBA_PY_MODULE_H_ */
@@ -9,7 +9,6 @@
9
9
 
10
10
  #include "_typeof.h"
11
11
  #include "_hashtable.h"
12
- #include "_devicearray.h"
13
12
  #include "pyerrors.h"
14
13
 
15
14
  #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
@@ -18,11 +17,24 @@
18
17
  #include <numpy/npy_2_compat.h>
19
18
  #endif
20
19
 
21
- #if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 13)
22
- #ifndef Py_BUILD_CORE
23
- #define Py_BUILD_CORE 1
24
- #endif
25
- #include "internal/pycore_setobject.h" // _PySet_NextEntry()
20
+ #ifndef Py_BUILD_CORE
21
+ #define Py_BUILD_CORE 1
22
+ #endif
23
+
24
+ #if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 13)
25
+ // required include from Python 3.13+
26
+ #include "internal/pycore_setobject.h"
27
+ #ifndef PySet_NextEntry
28
+ #define PySet_NextEntry _PySet_NextEntryRef
29
+ #endif
30
+ #else
31
+ #ifndef PySet_NextEntry
32
+ #define PySet_NextEntry _PySet_NextEntry
33
+ #endif
34
+ #endif
35
+
36
+ #ifdef Py_BUILD_CORE
37
+ #undef Py_BUILD_CORE
26
38
  #endif
27
39
 
28
40
 
@@ -56,9 +68,6 @@ static PyObject *str_typeof_pyval = NULL;
56
68
  static PyObject *str_value = NULL;
57
69
  static PyObject *str_numba_type = NULL;
58
70
 
59
- /* CUDA device array API */
60
- void **DeviceArray_API;
61
-
62
71
  /*
63
72
  * Type fingerprint computation.
64
73
  */
@@ -414,17 +423,52 @@ compute_fingerprint(string_writer_t *w, PyObject *val)
414
423
  Py_hash_t h;
415
424
  PyObject *item;
416
425
  Py_ssize_t pos = 0;
426
+ int rc;
427
+
428
+ #if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 13
429
+ // needed when using _PySet_NextEntryRef
430
+ Py_BEGIN_CRITICAL_SECTION(val);
431
+ #endif
417
432
  /* Only one item is considered, as in typeof.py */
418
- if (!_PySet_NextEntry(val, &pos, &item, &h)) {
433
+ rc = PySet_NextEntry(val, &pos, &item, &h);
434
+
435
+ #if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 13
436
+ // needed when using _PySet_NextEntryRef
437
+ Py_END_CRITICAL_SECTION();
438
+ #endif
439
+
440
+ if (!rc) {
419
441
  /* Empty set */
420
442
  PyErr_SetString(PyExc_ValueError,
421
443
  "cannot compute fingerprint of empty set");
422
444
  return -1;
423
445
  }
424
- TRY(string_writer_put_char, w, OP_SET);
425
- TRY(compute_fingerprint, w, item);
446
+
447
+ if (string_writer_put_char(w, OP_SET)) {
448
+ goto fingerprint_error;
449
+ }
450
+
451
+ if (compute_fingerprint(w, item)) {
452
+ goto fingerprint_error;
453
+ }
454
+
455
+ goto fingerprint_success;
456
+
457
+ fingerprint_error:
458
+ #if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 13
459
+ // extra ref if using python >= 3.13
460
+ Py_XDECREF(item);
461
+ #endif
462
+ return -1;
463
+
464
+ fingerprint_success:
465
+ #if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 13
466
+ // extra ref if using python >= 3.13
467
+ Py_XDECREF(item);
468
+ #endif
426
469
  return 0;
427
470
  }
471
+
428
472
  if (PyObject_CheckBuffer(val)) {
429
473
  Py_buffer buf;
430
474
  int flags = PyBUF_ND | PyBUF_STRIDES | PyBUF_FORMAT;
@@ -857,109 +901,6 @@ int typecode_arrayscalar(PyObject *dispatcher, PyObject* aryscalar) {
857
901
  return BASIC_TYPECODES[typecode];
858
902
  }
859
903
 
860
- static
861
- int typecode_devicendarray(PyObject *dispatcher, PyObject *ary)
862
- {
863
- int typecode;
864
- int dtype;
865
- int ndim;
866
- int layout = 0;
867
- PyObject *ndim_obj = nullptr;
868
- PyObject *num_obj = nullptr;
869
- PyObject *dtype_obj = nullptr;
870
- int dtype_num = 0;
871
-
872
- PyObject* flags = PyObject_GetAttrString(ary, "flags");
873
- if (flags == NULL)
874
- {
875
- PyErr_Clear();
876
- goto FALLBACK;
877
- }
878
-
879
- if (PyDict_GetItemString(flags, "C_CONTIGUOUS") == Py_True) {
880
- layout = 1;
881
- } else if (PyDict_GetItemString(flags, "F_CONTIGUOUS") == Py_True) {
882
- layout = 2;
883
- }
884
-
885
- Py_DECREF(flags);
886
-
887
- ndim_obj = PyObject_GetAttrString(ary, "ndim");
888
- if (ndim_obj == NULL) {
889
- /* If there's no ndim, try to proceed by clearing the error and using the
890
- * fallback. */
891
- PyErr_Clear();
892
- goto FALLBACK;
893
- }
894
-
895
- ndim = PyLong_AsLong(ndim_obj);
896
- Py_DECREF(ndim_obj);
897
-
898
- if (PyErr_Occurred()) {
899
- /* ndim wasn't an integer for some reason - unlikely to happen, but try
900
- * the fallback. */
901
- PyErr_Clear();
902
- goto FALLBACK;
903
- }
904
-
905
- if (ndim <= 0 || ndim > N_NDIM)
906
- goto FALLBACK;
907
-
908
- dtype_obj = PyObject_GetAttrString(ary, "dtype");
909
- if (dtype_obj == NULL) {
910
- /* No dtype: try the fallback. */
911
- PyErr_Clear();
912
- goto FALLBACK;
913
- }
914
-
915
- num_obj = PyObject_GetAttrString(dtype_obj, "num");
916
- Py_DECREF(dtype_obj);
917
-
918
- if (num_obj == NULL) {
919
- /* This strange dtype has no num - try the fallback. */
920
- PyErr_Clear();
921
- goto FALLBACK;
922
- }
923
-
924
- dtype_num = PyLong_AsLong(num_obj);
925
- Py_DECREF(num_obj);
926
-
927
- if (PyErr_Occurred()) {
928
- /* num wasn't an integer for some reason - unlikely to happen, but try
929
- * the fallback. */
930
- PyErr_Clear();
931
- goto FALLBACK;
932
- }
933
-
934
- dtype = dtype_num_to_typecode(dtype_num);
935
- if (dtype == -1) {
936
- /* Not a dtype we have in the global lookup table. */
937
- goto FALLBACK;
938
- }
939
-
940
- /* Fast path, using direct table lookup */
941
- assert(layout < N_LAYOUT);
942
- assert(ndim <= N_NDIM);
943
- assert(dtype < N_DTYPES);
944
- typecode = cached_arycode[ndim - 1][layout][dtype];
945
-
946
- if (typecode == -1) {
947
- /* First use of this table entry, so it requires populating */
948
- typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary);
949
- cached_arycode[ndim - 1][layout][dtype] = typecode;
950
- }
951
-
952
- return typecode;
953
-
954
- FALLBACK:
955
- /* Slower path, for non-trivial array types. At present this always uses
956
- the fingerprinting to get the typecode. Future optimization might
957
- implement a cache, but this would require some fast equivalent of
958
- PyArray_DESCR for a device array. */
959
-
960
- return typecode_using_fingerprint(dispatcher, (PyObject *) ary);
961
- }
962
-
963
904
  extern "C" int
964
905
  typeof_typecode(PyObject *dispatcher, PyObject *val)
965
906
  {
@@ -994,10 +935,6 @@ typeof_typecode(PyObject *dispatcher, PyObject *val)
994
935
  else if (tyobj == &PyArray_Type) {
995
936
  return typecode_ndarray(dispatcher, (PyArrayObject*)val);
996
937
  }
997
- /* Subtype of CUDA device array */
998
- else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) {
999
- return typecode_devicendarray(dispatcher, val);
1000
- }
1001
938
  /* Subtypes of Array handling */
1002
939
  else if (PyType_IsSubtype(tyobj, &PyArray_Type)) {
1003
940
  /* By default, Numba will treat all numpy.ndarray subtypes as if they
@@ -337,12 +337,18 @@ static PyTypeObject MemAllocType = {
337
337
  0, /* tp_version_tag */
338
338
  0, /* tp_finalize */
339
339
  0, /* tp_vectorcall */
340
- #if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12)
340
+ #if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION >= 12)
341
341
  /* This was introduced first in 3.12
342
342
  * https://github.com/python/cpython/issues/91051
343
343
  */
344
344
  0, /* tp_watched */
345
345
  #endif
346
+ #if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION >= 13)
347
+ /* This was introduced in 3.13
348
+ * https://github.com/python/cpython/pull/114900
349
+ */
350
+ 0, /* tp_versions_used */
351
+ #endif
346
352
 
347
353
  /* WARNING: Do not remove this, only modify it! It is a version guard to
348
354
  * act as a reminder to update this struct on Python version update! */
@@ -422,10 +422,9 @@ HAVE_ARGUMENT = dis.HAVE_ARGUMENT
422
422
  EXTENDED_ARG = dis.EXTENDED_ARG
423
423
 
424
424
 
425
- _BUILTIN_TYPE_NAMES = {}
426
- for k, v in types.__dict__.items():
427
- if type(v) is type:
428
- _BUILTIN_TYPE_NAMES[v] = k
425
+ _BUILTIN_TYPE_NAMES = {
426
+ v: k for k, v in types.__dict__.items() if type(v) is type
427
+ }
429
428
 
430
429
 
431
430
  def _builtin_type(name):
@@ -463,7 +462,7 @@ def _extract_class_dict(cls):
463
462
  base_value = inherited_dict[name]
464
463
  if value is base_value:
465
464
  to_remove.append(name)
466
- except KeyError:
465
+ except KeyError: # noqa: PERF203
467
466
  pass
468
467
  for name in to_remove:
469
468
  clsdict.pop(name)
@@ -12,6 +12,7 @@ from numba.cuda.cudadrv.linkable_code import LinkableCode
12
12
  from numba.cuda.memory_management.nrt import NRT_LIBRARY
13
13
 
14
14
  import os
15
+ import pickle
15
16
  import subprocess
16
17
  import tempfile
17
18
 
@@ -189,6 +190,11 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
189
190
 
190
191
  self.use_cooperative = False
191
192
 
193
+ # Objects that need to be kept alive for the lifetime of the
194
+ # kernels or device functions generated by this code library,
195
+ # e.g., device arrays captured from global scope.
196
+ self.referenced_objects = {}
197
+
192
198
  @property
193
199
  def llvm_strs(self):
194
200
  if self._llvm_strs is None:
@@ -203,9 +209,14 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
203
209
  return cc
204
210
 
205
211
  device = devices.get_context().device
206
- return device.compute_capability
212
+ cc = device.compute_capability
213
+ cc = (cc[0], cc[1], "a" if cc >= (9, 0) else "")
214
+ return cc
207
215
 
208
216
  def get_asm_str(self, cc=None):
217
+ return "\n".join(self.get_asm_strs(cc=cc))
218
+
219
+ def get_asm_strs(self, cc=None):
209
220
  cc = self._ensure_cc(cc)
210
221
 
211
222
  ptxes = self._ptx_cache.get(cc, None)
@@ -218,21 +229,25 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
218
229
 
219
230
  irs = self.llvm_strs
220
231
 
221
- ptx = nvvm.compile_ir(irs, **options)
232
+ if "g" in options:
233
+ ptxes = [nvvm.compile_ir(ir, **options) for ir in irs]
234
+ else:
235
+ ptxes = [nvvm.compile_ir(irs, **options)]
222
236
 
223
237
  # Sometimes the result from NVVM contains trailing whitespace and
224
238
  # nulls, which we strip so that the assembly dump looks a little
225
239
  # tidier.
226
- ptx = ptx.decode().strip("\x00").strip()
240
+ ptxes = [ptx.decode().strip("\x00").strip() for ptx in ptxes]
227
241
 
228
242
  if config.DUMP_ASSEMBLY:
229
243
  print(("ASSEMBLY %s" % self._name).center(80, "-"))
230
- print(ptx)
244
+ for ptx in ptxes:
245
+ print(ptx)
231
246
  print("=" * 80)
232
247
 
233
- self._ptx_cache[cc] = ptx
248
+ self._ptx_cache[cc] = ptxes
234
249
 
235
- return ptx
250
+ return ptxes
236
251
 
237
252
  def get_lto_ptx(self, cc=None):
238
253
  """
@@ -247,7 +262,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
247
262
 
248
263
  cc = self._ensure_cc(cc)
249
264
 
250
- linker = driver._Linker.new(
265
+ linker = driver._Linker(
251
266
  max_registers=self._max_registers,
252
267
  cc=cc,
253
268
  additional_flags=["-ptx"],
@@ -284,8 +299,9 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
284
299
  ltoir = self.get_ltoir(cc=cc)
285
300
  linker.add_ltoir(ltoir)
286
301
  else:
287
- ptx = self.get_asm_str(cc=cc)
288
- linker.add_ptx(ptx.encode())
302
+ ptxes = self.get_asm_strs(cc=cc)
303
+ for ptx in ptxes:
304
+ linker.add_ptx(ptx.encode())
289
305
 
290
306
  for path in self._linking_files:
291
307
  linker.add_file_guess_ext(path, ignore_nonlto)
@@ -308,7 +324,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
308
324
  print(ptx)
309
325
  print("=" * 80)
310
326
 
311
- linker = driver._Linker.new(
327
+ linker = driver._Linker(
312
328
  max_registers=self._max_registers, cc=cc, lto=self._lto
313
329
  )
314
330
  self._link_all(linker, cc, ignore_nonlto=False)
@@ -333,7 +349,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
333
349
  cufunc = self._cufunc_cache.get(device.id, None)
334
350
  if cufunc:
335
351
  return cufunc
336
- cubin = self.get_cubin(cc=device.compute_capability)
352
+ cubin = self.get_cubin()
337
353
  module = ctx.create_module_image(
338
354
  cubin, self._setup_functions, self._teardown_functions
339
355
  )
@@ -377,6 +393,9 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
377
393
  self._setup_functions.extend(library._setup_functions)
378
394
  self._teardown_functions.extend(library._teardown_functions)
379
395
  self.use_cooperative |= library.use_cooperative
396
+ self.referenced_objects.update(
397
+ getattr(library, "referenced_objects", {})
398
+ )
380
399
 
381
400
  def add_linking_file(self, path_or_obj):
382
401
  if isinstance(path_or_obj, LinkableCode):
@@ -432,7 +451,10 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
432
451
  for mod in library.modules:
433
452
  for fn in mod.functions:
434
453
  if not fn.is_declaration:
435
- fn.linkage = "linkonce_odr"
454
+ if "g" in self._nvvm_options:
455
+ fn.linkage = "weak_odr"
456
+ else:
457
+ fn.linkage = "linkonce_odr"
436
458
 
437
459
  self._finalized = True
438
460
 
@@ -442,6 +464,18 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
442
464
  but loaded functions are discarded. They are recreated when needed
443
465
  after deserialization.
444
466
  """
467
+ # Check for captured device arrays that cannot be safely cached.
468
+ if self.referenced_objects:
469
+ if any(
470
+ getattr(obj, "__cuda_array_interface__", None) is not None
471
+ for obj in self.referenced_objects.values()
472
+ ):
473
+ raise pickle.PicklingError(
474
+ "Cannot serialize kernels or device functions referencing "
475
+ "global device arrays. Pass the array(s) as arguments "
476
+ "to the kernel instead."
477
+ )
478
+
445
479
  nrt = False
446
480
  if self._linking_files:
447
481
  if (
@@ -852,15 +852,15 @@ def kernel_fixup(kernel, debug):
852
852
  return_value = kernel.args[0]
853
853
 
854
854
  for block in kernel.blocks:
855
- remove_list = []
856
-
857
855
  # Find all stores first
858
- for inst in block.instructions:
856
+ remove_list = [
857
+ inst
858
+ for inst in block.instructions
859
859
  if (
860
860
  isinstance(inst, ir.StoreInstr)
861
861
  and inst.operands[1] == return_value
862
- ):
863
- remove_list.append(inst)
862
+ )
863
+ ]
864
864
 
865
865
  # Remove all stores
866
866
  for to_remove in remove_list:
@@ -1023,10 +1023,9 @@ def compile_all(
1023
1023
  )
1024
1024
 
1025
1025
  if lto:
1026
- code = lib.get_ltoir(cc=cc)
1026
+ codes = [lib.get_ltoir(cc=cc)]
1027
1027
  else:
1028
- code = lib.get_asm_str(cc=cc)
1029
- codes = [code]
1028
+ codes = lib.get_asm_strs(cc=cc)
1030
1029
 
1031
1030
  # linking_files
1032
1031
  is_ltoir = output == "ltoir"
@@ -1241,7 +1240,14 @@ def compile(
1241
1240
  if lto:
1242
1241
  code = lib.get_ltoir(cc=cc)
1243
1242
  else:
1244
- code = lib.get_asm_str(cc=cc)
1243
+ codes = lib.get_asm_strs(cc=cc)
1244
+ if len(codes) == 1:
1245
+ code = codes[0]
1246
+ else:
1247
+ raise RuntimeError(
1248
+ "Compiling this function results in multiple "
1249
+ "PTX files. Use compile_all() instead"
1250
+ )
1245
1251
  return code, resty
1246
1252
 
1247
1253