numba-cuda 0.23.0__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +4 -1
  3. numba_cuda/numba/cuda/_compat.py +47 -0
  4. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  5. numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -2
  6. numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
  7. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  8. numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
  9. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  10. numba_cuda/numba/cuda/cext/_typeof.cpp +56 -8
  11. numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
  12. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  13. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
  14. numba_cuda/numba/cuda/codegen.py +4 -2
  15. numba_cuda/numba/cuda/compiler.py +5 -5
  16. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
  17. numba_cuda/numba/cuda/core/base.py +6 -10
  18. numba_cuda/numba/cuda/core/bytecode.py +21 -13
  19. numba_cuda/numba/cuda/core/byteflow.py +336 -90
  20. numba_cuda/numba/cuda/core/compiler.py +3 -4
  21. numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
  22. numba_cuda/numba/cuda/core/config.py +5 -7
  23. numba_cuda/numba/cuda/core/controlflow.py +17 -9
  24. numba_cuda/numba/cuda/core/inline_closurecall.py +11 -10
  25. numba_cuda/numba/cuda/core/interpreter.py +255 -96
  26. numba_cuda/numba/cuda/core/ir_utils.py +8 -17
  27. numba_cuda/numba/cuda/core/pythonapi.py +3 -0
  28. numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
  29. numba_cuda/numba/cuda/core/ssa.py +2 -2
  30. numba_cuda/numba/cuda/core/transforms.py +4 -6
  31. numba_cuda/numba/cuda/core/typed_passes.py +1 -1
  32. numba_cuda/numba/cuda/core/typeinfer.py +3 -3
  33. numba_cuda/numba/cuda/core/untyped_passes.py +11 -10
  34. numba_cuda/numba/cuda/cpython/unicode.py +2 -2
  35. numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
  36. numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -4
  37. numba_cuda/numba/cuda/cudadrv/driver.py +13 -11
  38. numba_cuda/numba/cuda/cudadrv/nvrtc.py +71 -32
  39. numba_cuda/numba/cuda/debuginfo.py +10 -79
  40. numba_cuda/numba/cuda/deviceufunc.py +3 -6
  41. numba_cuda/numba/cuda/dispatcher.py +5 -19
  42. numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
  43. numba_cuda/numba/cuda/lowering.py +0 -28
  44. numba_cuda/numba/cuda/memory_management/nrt.py +1 -1
  45. numba_cuda/numba/cuda/np/arrayobj.py +7 -9
  46. numba_cuda/numba/cuda/np/numpy_support.py +7 -10
  47. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
  48. numba_cuda/numba/cuda/testing.py +4 -8
  49. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +66 -4
  50. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  51. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +2 -2
  52. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +1 -1
  53. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +26 -4
  54. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
  55. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
  56. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +12 -1
  57. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
  58. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
  59. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
  60. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +12 -7
  61. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +1 -1
  62. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
  63. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
  64. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +8 -7
  65. numba_cuda/numba/cuda/tests/support.py +11 -0
  66. numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
  67. numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
  68. numba_cuda/numba/cuda/typing/typeof.py +9 -16
  69. {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
  70. {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +74 -73
  71. {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
  72. {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
  73. {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
  74. {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/core/pythonapi.py
@@ -875,6 +875,9 @@ class PythonAPI(object):
  self.py_hash_t.as_pointer(),
  ],
  )
+ # `_PySet_NextEntry` returns a borrowed reference to the key, which is
+ # generally not expected for iterators--which is the place where this
+ # is used internally. Perhaps we should revisit this at some point
  fn = self._get_function(fnty, name="_PySet_NextEntry")
  return self.builder.call(fn, (set, posptr, keyptr, hashptr))

numba_cuda/numba/cuda/core/rewrites/static_binop.py
@@ -27,7 +27,7 @@ class DetectStaticBinops(Rewrite):
  and expr.static_rhs is ir.UNDEFINED
  ):
  self.static_rhs[expr] = func_ir.infer_constant(expr.rhs)
- except errors.ConstantInferenceError:
+ except errors.ConstantInferenceError: # noqa: PERF203
  continue

  return len(self.static_lhs) > 0 or len(self.static_rhs) > 0
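
Note: several hunks in this release add `# noqa: PERF203` suppressions. PERF203 is the Ruff perflint rule that flags a `try`/`except` inside a loop body, since exception-handler setup repeats on every iteration; here the exception-driven control flow is intentional, so the lint is silenced rather than restructured. A minimal sketch of the pattern the rule flags:

    values = ["1", "x", "3"]
    parsed = []
    for v in values:
        try:  # PERF203 fires on try/except inside a loop body
            parsed.append(int(v))
        except ValueError:  # suppressed in this diff with `# noqa: PERF203`
            continue
    # parsed == [1, 3]
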
numba_cuda/numba/cuda/core/ssa.py
@@ -113,8 +113,8 @@ def _iterated_domfronts(cfg):
  keep_going = True
  while keep_going:
  keep_going = False
- for k, vs in domfronts.items():
- inner = reduce(operator.or_, [domfronts[v] for v in vs], set())
+ for vs in domfronts.values():
+ inner = reduce(operator.or_, map(domfronts.__getitem__, vs), set())
  if inner.difference(vs):
  vs |= inner
  keep_going = True
numba_cuda/numba/cuda/core/transforms.py
@@ -436,9 +436,7 @@ def with_lifting(func_ir, typingctx, targetctx, flags, locals):
  # the kind of contextmanager
  sub_irs = []
  for blk_start, blk_end in withs:
- body_blocks = []
- for node in _cfg_nodes_in_region(cfg, blk_start, blk_end):
- body_blocks.append(node)
+ body_blocks = _cfg_nodes_in_region(cfg, blk_start, blk_end).copy()
  _legalize_with_head(blocks[blk_start])
  # Find the contextmanager
  cmkind, extra = _get_with_contextmanager(func_ir, blocks, blk_start)
@@ -631,9 +629,9 @@ def find_setupwiths(func_ir):
  # add all its targets to the to_visit stack, unless we
  # have seen them already
  if ir_utils.is_terminator(stmt):
- for t in stmt.get_targets():
- if t not in seen:
- to_visit.append(t)
+ to_visit.extend(
+ t for t in stmt.get_targets() if t not in seen
+ )

  return setup_with_to_pop_blocks_map

numba_cuda/numba/cuda/core/typed_passes.py
@@ -164,7 +164,7 @@ class BaseTypeInference(FunctionPass):
  retstmts = []
  caststmts = {}
  argvars = set()
- for bid, blk in interp.blocks.items():
+ for blk in interp.blocks.values():
  for inst in blk.body:
  if isinstance(inst, ir.return_types):
  retstmts.append(inst.value.name)
numba_cuda/numba/cuda/core/typeinfer.py
@@ -1464,9 +1464,9 @@ https://numba.readthedocs.io/en/stable/user/troubleshoot.html#my-code-has-an-unt
  interped = msg % (name, atype, loc.strformat())
  return interped

- problem_str = []
- for xtype in rettypes:
- problem_str.append(_termcolor.errmsg(check_type(xtype)))
+ problem_str = [
+ _termcolor.errmsg(check_type(xtype)) for xtype in rettypes
+ ]

  raise TypingError(
  "Can't unify return type from the "
numba_cuda/numba/cuda/core/untyped_passes.py
@@ -632,7 +632,7 @@ class MakeFunctionToJitFunction(FunctionPass):
  def run_pass(self, state):
  func_ir = state.func_ir
  mutated = False
- for idx, blk in func_ir.blocks.items():
+ for blk in func_ir.blocks.values():
  for stmt in blk.body:
  if isinstance(stmt, ir.assign_types):
  if isinstance(stmt.value, ir.expr_types):
@@ -696,7 +696,7 @@ class TransformLiteralUnrollConstListToTuple(FunctionPass):
  def run_pass(self, state):
  mutated = False
  func_ir = state.func_ir
- for label, blk in func_ir.blocks.items():
+ for blk in func_ir.blocks.values():
  calls = [_ for _ in blk.find_exprs("call")]
  for call in calls:
  glbl = guard(get_definition, func_ir, call.func)
@@ -1123,16 +1123,17 @@ class MixedContainerUnroller(FunctionPass):
  )
  keys = [k for k in data.keys()]

- elifs = []
- for i in range(1, len(keys)):
- elifs.append(elif_tplt % ",".join(map(str, data[keys[i]])))
+ elifs = [
+ elif_tplt % ",".join(map(str, data[keys[i]]))
+ for i in range(1, len(keys))
+ ]
  src = b % (",".join(map(str, data[keys[0]])), "".join(elifs))
  wstr = src
  l = {}
  exec(wstr, {}, l)
  bfunc = l["foo"]
  branches = compile_to_numba_ir(bfunc, {})
- for lbl, blk in branches.blocks.items():
+ for blk in branches.blocks.values():
  for stmt in blk.body:
  if isinstance(stmt, ir.assign_types):
  if isinstance(stmt.value, ir.global_types):
@@ -1173,7 +1174,7 @@ class MixedContainerUnroller(FunctionPass):
  """This finds loops which are compliant with the form:
  for i in range(len(literal_unroll(<something>>)))"""
  unroll_loops = {}
- for header_lbl, loop in loops.items():
+ for loop in loops.values():
  # TODO: check the loop head has literal_unroll, if it does but
  # does not conform to the following then raise

@@ -1605,7 +1606,7 @@ class IterLoopCanonicalization(FunctionPass):
  for x in induction_vars:
  try: # there's not always an alias, e.g. loop from inlined closure
  tmp.add(func_ir.get_assignee(x, loop.header))
- except ValueError:
+ except ValueError: # noqa: PERF203
  pass
  induction_vars |= tmp
  induction_var_names = set([x.name for x in induction_vars])
@@ -1639,7 +1640,7 @@ class IterLoopCanonicalization(FunctionPass):
  loops = cfg.loops()

  mutated = False
- for header, loop in loops.items():
+ for loop in loops.values():
  stat = self.assess_loop(loop, func_ir, state.typemap)
  if stat:
  if self._DEBUG:
@@ -1979,7 +1980,7 @@ class RewriteDynamicRaises(FunctionPass):
  try:
  const = func_ir.infer_constant(exc_arg)
  exc_args.append(const)
- except consts.ConstantInferenceError:
+ except consts.ConstantInferenceError: # noqa: PERF203
  exc_args.append(exc_arg)
  loc = raise_.loc

numba_cuda/numba/cuda/cpython/unicode.py
@@ -394,7 +394,7 @@ def _set_code_point(a, i, ch):
  )


- if PYVERSION in ((3, 12), (3, 13)):
+ if PYVERSION in ((3, 12), (3, 13), (3, 14)):

  @register_jitable
  def _pick_kind(kind1, kind2):
@@ -442,7 +442,7 @@ def _pick_ascii(is_ascii1, is_ascii2):
  return types.uint32(0)


- if PYVERSION in ((3, 12), (3, 13)):
+ if PYVERSION in ((3, 12), (3, 13), (3, 14)):

  @register_jitable
  def _kind_to_byte_width(kind):
numba_cuda/numba/cuda/cpython/unicode_support.py
@@ -125,9 +125,7 @@ def _gettyperecord_impl(typingctx, codepoint):

  byref = [upper, lower, title, decimal, digit, flags]
  builder.call(fn, [args[0]] + byref)
- buf = []
- for x in byref:
- buf.append(builder.load(x))
+ buf = list(map(builder.load, byref))

  res = context.make_tuple(builder, signature.return_type, tuple(buf))
  return impl_ret_untracked(context, builder, signature.return_type, res)
numba_cuda/numba/cuda/cudadrv/devicearray.py
@@ -852,10 +852,10 @@ def array_core(ary):
  """
  if not ary.strides or not ary.size:
  return ary
- core_index = []
- for stride in ary.strides:
- core_index.append(0 if stride == 0 else slice(None))
- return ary[tuple(core_index)]
+ core_index = tuple(
+ 0 if stride == 0 else slice(None) for stride in ary.strides
+ )
+ return ary[core_index]


  def is_contiguous(ary):
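
In `array_core`, a zero stride marks a broadcast dimension, which the new one-expression tuple collapses to index 0 while real dimensions keep a full slice. An illustrative NumPy example of the same indexing (the array `a` is hypothetical):

    import numpy as np

    a = np.broadcast_to(np.arange(3), (4, 3))  # strides == (0, 8) for int64
    core_index = tuple(0 if s == 0 else slice(None) for s in a.strides)
    assert a[core_index].shape == (3,)  # the broadcast axis is dropped
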
numba_cuda/numba/cuda/cudadrv/driver.py
@@ -54,18 +54,16 @@ from numba.cuda.utils import cached_file_read
  from numba.cuda.cudadrv import enums, drvapi, nvrtc

  from cuda.bindings import driver as binding
- from cuda.core.experimental import (
+ from numba.cuda._compat import (
  Linker,
  LinkerOptions,
  ObjectCode,
- )
-
- from cuda.bindings.utils import get_cuda_native_handle
- from cuda.core.experimental import (
  Stream as ExperimentalStream,
  Device as ExperimentalDevice,
  )

+ from cuda.bindings.utils import get_cuda_native_handle
+

  # There is no definition of the default stream in the Nvidia bindings (nor
  # is there at the C/C++ level), so we define it here so we don't need to
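
This hunk is the first use of the new `numba/cuda/_compat.py` module (+47 lines in the file list), which centralizes the names previously imported directly from `cuda.core.experimental` across driver.py, nvrtc.py, and dispatcher.py. The file's contents are not shown in this diff; a hypothetical sketch of such a shim (the error message is an assumption for illustration) might be:

    # Hypothetical sketch only -- the real _compat.py is not shown in this diff.
    try:
        from cuda.core.experimental import (
            Device,
            LaunchConfig,
            Linker,
            LinkerOptions,
            ObjectCode,
            Program,
            ProgramOptions,
            Stream,
            launch,
        )
    except ImportError as e:
        raise ImportError(
            "numba-cuda requires the cuda-core package to be installed"
        ) from e
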
@@ -184,7 +182,7 @@ def load_driver(dlloader, candidates):
  for path in candidates:
  try:
  dll = dlloader(path)
- except OSError as e:
+ except OSError as e: # noqa: PERF203
  # Problem opening the DLL
  path_not_exist.append(not os.path.isfile(path))
  driver_load_error.append(e)
@@ -375,10 +373,10 @@ class Driver(object):
  return getattr(self.lib, fname)

  for variant in variants:
- try:
- return getattr(self.lib, f"{fname}{variant}")
- except AttributeError:
- pass
+ if (
+ value := getattr(self.lib, f"{fname}{variant}", None)
+ ) is not None:
+ return value

  # Not found.
  # Delay missing function error to use
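
The variant lookup above swaps `try`/`except AttributeError` for `getattr` with a `None` default plus an assignment expression, so no exception machinery runs inside the loop. A self-contained sketch of the pattern (`FakeLib` and the symbol name are made up for illustration):

    class FakeLib:
        cuCtxCreate_v2 = "<function pointer>"

    lib = FakeLib()
    fn = None
    for variant in ("_v3", "_v2", ""):
        if (fn := getattr(lib, f"cuCtxCreate{variant}", None)) is not None:
            break
    assert fn == "<function pointer>"  # found via the _v2 variant

This is equivalent to the old code as long as a present attribute is never literally None, which holds for symbols looked up on a driver library.
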
@@ -2305,7 +2303,11 @@ class _Linker:
  lto=None,
  additional_flags=None,
  ):
- arch = f"sm_{cc[0]}{cc[1]}"
+ if len(cc) == 3:
+ arch = f"sm_{cc[0]}{cc[1]}{cc[2]}"
+ else:
+ arch = f"sm_{cc[0]}{cc[1]}"
+
  self.max_registers = max_registers if max_registers else None
  self.lineinfo = lineinfo
  self.cc = cc
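
`_Linker` now also accepts a three-element compute capability tuple whose third element is an architecture suffix (see the `''`/`'a'`/`'f'` validation in nvrtc.py below). An illustrative mapping mirroring the branch above:

    def arch_string(cc):
        if len(cc) == 3:
            return f"sm_{cc[0]}{cc[1]}{cc[2]}"
        return f"sm_{cc[0]}{cc[1]}"

    assert arch_string((8, 0)) == "sm_80"
    assert arch_string((9, 0, "a")) == "sm_90a"    # architecture-specific
    assert arch_string((10, 0, "f")) == "sm_100f"  # family-specific
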
numba_cuda/numba/cuda/cudadrv/nvrtc.py
@@ -12,7 +12,7 @@ import os
  import warnings
  import functools

- from cuda.core.experimental import Program, ProgramOptions
+ from numba.cuda._compat import Program, ProgramOptions
  from cuda.bindings import nvrtc as bindings_nvrtc

  NVRTC_EXTRA_SEARCH_PATHS = _readenv(
@@ -30,6 +30,44 @@ def _get_nvrtc_version():
  return (major, minor)


+ def _verify_cc_tuple(cc):
+ version = _get_nvrtc_version()
+ ver_str = lambda version: ".".join(str(v) for v in version)
+
+ if len(cc) == 3:
+ cc, arch = (cc[0], cc[1]), cc[2]
+ else:
+ arch = ""
+
+ if arch not in ("", "a", "f"):
+ raise ValueError(
+ f"Invalid architecture suffix '{arch}' in compute capability "
+ f"{ver_str(cc)}{arch}. Expected '', 'a', or 'f'."
+ )
+
+ supported_ccs = get_supported_ccs()
+ try:
+ found = max(filter(lambda v: v <= cc, [v for v in supported_ccs]))
+ except ValueError:
+ raise RuntimeError(
+ f"Device compute capability {ver_str(cc)} is less than the "
+ f"minimum supported by NVRTC {ver_str(version)}. Supported "
+ "compute capabilities are "
+ f"{', '.join([ver_str(v) for v in supported_ccs])}."
+ )
+
+ if found != cc:
+ found = (found[0], found[1], arch)
+ warnings.warn(
+ f"Device compute capability {ver_str(cc)} is not supported by "
+ f"NVRTC {ver_str(version)}. Using {ver_str(found)} instead."
+ )
+ else:
+ found = (cc[0], cc[1], arch)
+
+ return found
+
+
  def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
  """
  Compile a CUDA C/C++ source to PTX or LTOIR for a given compute capability.
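
Based on the hunk above, `_verify_cc_tuple` normalizes both accepted shapes to `(major, minor, arch)` and validates the suffix. Expected behaviour, assuming NVRTC supports the requested capability:

    # _verify_cc_tuple((9, 0))      -> (9, 0, "")
    # _verify_cc_tuple((9, 0, "a")) -> (9, 0, "a")
    # _verify_cc_tuple((9, 0, "x")) -> ValueError (suffix must be "", "a" or "f")
    # An unsupported CC falls back to the closest lower supported one with a
    # warning; a CC below the NVRTC minimum raises RuntimeError.
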
@@ -38,7 +76,8 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
  :type src: str
  :param name: The filename of the source (for information only)
  :type name: str
- :param cc: A tuple ``(major, minor)`` of the compute capability
+ :param cc: A tuple ``(major, minor)`` or ``(major, minor, arch)`` of the
+ compute capability
  :type cc: tuple
  :param ltoir: Compile into LTOIR if True, otherwise into PTX
  :type ltoir: bool
@@ -49,34 +88,18 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
  :return: The compiled PTX or LTOIR and compilation log
  :rtype: tuple
  """
+ found = _verify_cc_tuple(cc)
  version = _get_nvrtc_version()

- ver_str = lambda version: ".".join(str(v) for v in version)
- supported_ccs = get_supported_ccs()
- try:
- found = max(filter(lambda v: v <= cc, [v for v in supported_ccs]))
- except ValueError:
- raise RuntimeError(
- f"Device compute capability {ver_str(cc)} is less than the "
- f"minimum supported by NVRTC {ver_str(version)}. Supported "
- "compute capabilities are "
- f"{', '.join([ver_str(v) for v in supported_ccs])}."
- )
-
- if found != cc:
- warnings.warn(
- f"Device compute capability {ver_str(cc)} is not supported by "
- f"NVRTC {ver_str(version)}. Using {ver_str(found)} instead."
- )
-
  # Compilation options:
  # - Compile for the current device's compute capability.
  # - The CUDA include path is added.
  # - Relocatable Device Code (rdc) is needed to prevent device functions
  # being optimized away.
- major, minor = found
+ major, minor = found[0], found[1]
+ cc_arch = found[2] if len(found) == 3 else ""

- arch = f"sm_{major}{minor}"
+ arch = f"sm_{major}{minor}{cc_arch}"

  cuda_include_dir = get_cuda_paths()["include_dir"].info
  cuda_includes = [f"{cuda_include_dir}"]
@@ -156,7 +179,7 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
  return result, log


- def find_closest_arch(mycc):
+ def find_closest_arch(cc):
  """
  Given a compute capability, return the closest compute capability supported
  by the CUDA toolkit.
@@ -166,17 +189,17 @@ def find_closest_arch(mycc):
  """
  supported_ccs = get_supported_ccs()

- for i, cc in enumerate(supported_ccs):
- if cc == mycc:
+ for i, supported_cc in enumerate(supported_ccs):
+ if supported_cc == cc:
  # Matches
- return cc
- elif cc > mycc:
+ return supported_cc
+ elif supported_cc > cc:
  # Exceeded
  if i == 0:
  # CC lower than supported
  msg = (
  "GPU compute capability %d.%d is not supported"
- "(requires >=%d.%d)" % (mycc + cc)
+ "(requires >=%d.%d)" % (cc + supported_cc)
  )
  raise CCSupportError(msg)
  else:
@@ -187,13 +210,29 @@ def find_closest_arch(mycc):
  return supported_ccs[-1] # Choose the highest


- def get_arch_option(major, minor):
+ def get_arch_option(major, minor, arch=""):
  """Matches with the closest architecture option"""
  if config.FORCE_CUDA_CC:
- arch = config.FORCE_CUDA_CC
+ fcc = config.FORCE_CUDA_CC
+ major, minor = fcc[0], fcc[1]
+ if len(fcc) == 3:
+ arch = fcc[2]
+ else:
+ arch = ""
  else:
- arch = find_closest_arch((major, minor))
- return "compute_%d%d" % arch
+ new_major, new_minor = find_closest_arch((major, minor))
+ if (new_major, new_minor) != (major, minor):
+ # If we picked a different major / minor, then using an
+ # arch-specific version is invalid
+ if arch != "":
+ raise ValueError(
+ f"Can't use arch-specific compute_{major}{minor}{arch} with "
+ "closest found compute capability "
+ f"compute_{new_major}{new_minor}"
+ )
+ major, minor = new_major, new_minor
+
+ return f"compute_{major}{minor}{arch}"


  def get_lowest_supported_cc():
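
With the suffix threaded through, `get_arch_option` now produces `compute_{major}{minor}{arch}` and rejects a suffix whenever `find_closest_arch` retargets the major/minor pair. Expected behaviour, assuming the toolkit supports compute capability 9.0:

    # get_arch_option(9, 0)           -> "compute_90"
    # get_arch_option(9, 0, arch="a") -> "compute_90a"
    # If only 8.9 were supported, (9, 0) would be retargeted to "compute_89",
    # while (9, 0, "a") would raise ValueError: an arch-specific target cannot
    # be silently moved to a different major/minor.
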
numba_cuda/numba/cuda/debuginfo.py
@@ -646,11 +646,6 @@ class CUDADIBuilder(DIBuilder):
  super().__init__(module, filepath, cgctx, directives_only)
  # Cache for local variable metadata type and line deduplication
  self._vartypelinemap = {}
- # Variable address space dictionary
- self._var_addrspace_map = {}
-
- def _set_addrspace_map(self, map):
- self._var_addrspace_map = map

  def _var_type(self, lltype, size, datamodel=None):
  is_bool = False
@@ -826,64 +821,6 @@ class CUDADIBuilder(DIBuilder):
  is_distinct=True,
  )

- # Check if there's actually address space info to handle
- addrspace = getattr(self, "_addrspace", None)
- if (
- isinstance(lltype, ir.LiteralStructType)
- and datamodel is not None
- and datamodel.inner_models()
- and addrspace not in (None, 0)
- ):
- # Process struct with datamodel that has address space info
- meta = []
- offset = 0
- for element, field, model in zip(
- lltype.elements, datamodel._fields, datamodel.inner_models()
- ):
- size_field = self.cgctx.get_abi_sizeof(element)
- if isinstance(element, ir.PointerType) and field == "data":
- # Create pointer type with correct address space
- pointee_size = self.cgctx.get_abi_sizeof(element.pointee)
- pointee_model = getattr(model, "_pointee_model", None)
- pointee_type = self._var_type(
- element.pointee, pointee_size, datamodel=pointee_model
- )
- meta_ptr = {
- "tag": ir.DIToken("DW_TAG_pointer_type"),
- "baseType": pointee_type,
- "size": _BYTE_SIZE * size_field,
- }
- dwarf_addrclass = self.get_dwarf_address_class(addrspace)
- if dwarf_addrclass is not None:
- meta_ptr["dwarfAddressSpace"] = int(dwarf_addrclass)
- basetype = m.add_debug_info("DIDerivedType", meta_ptr)
- else:
- basetype = self._var_type(
- element, size_field, datamodel=model
- )
- derived_type = m.add_debug_info(
- "DIDerivedType",
- {
- "tag": ir.DIToken("DW_TAG_member"),
- "name": field,
- "baseType": basetype,
- "size": _BYTE_SIZE * size_field,
- "offset": offset,
- },
- )
- meta.append(derived_type)
- offset += _BYTE_SIZE * size_field
-
- return m.add_debug_info(
- "DICompositeType",
- {
- "tag": ir.DIToken("DW_TAG_structure_type"),
- "name": f"{datamodel.fe_type}",
- "elements": m.add_metadata(meta),
- "size": offset,
- },
- is_distinct=True,
- )
  # For other cases, use upstream Numba implementation
  return super()._var_type(lltype, size, datamodel=datamodel)

@@ -936,22 +873,16 @@ class CUDADIBuilder(DIBuilder):
  # to llvm.dbg.value
  return
  else:
- # Look up address space for this variable
- self._addrspace = self._var_addrspace_map.get(name)
- try:
- return super().mark_variable(
- builder,
- allocavalue,
- name,
- lltype,
- size,
- line,
- datamodel,
- argidx,
- )
- finally:
- # Clean up address space info
- self._addrspace = None
+ return super().mark_variable(
+ builder,
+ allocavalue,
+ name,
+ lltype,
+ size,
+ line,
+ datamodel,
+ argidx,
+ )

  def update_variable(
  self,
numba_cuda/numba/cuda/deviceufunc.py
@@ -682,12 +682,9 @@ class GUFuncEngine(object):
  inner_shapes.append(inner_shape)

  # solve output shape
- oshapes = []
- for outsig in self.sout:
- oshape = []
- for sym in outsig:
- oshape.append(symbolmap[sym])
- oshapes.append(tuple(oshape))
+ oshapes = [
+ tuple(map(symbolmap.__getitem__, outsig)) for outsig in self.sout
+ ]

  # find the biggest outershape as looping dimension
  sizes = [reduce(operator.mul, s, 1) for s in outer_shapes]
numba_cuda/numba/cuda/dispatcher.py
@@ -15,7 +15,7 @@ import uuid
  import re
  from warnings import warn

- from cuda.core.experimental import launch
+ from numba.cuda._compat import launch, LaunchConfig

  from numba.cuda.core import errors
  from numba.cuda import serialize, utils
@@ -41,7 +41,7 @@ from numba.cuda.compiler import (
  from numba.cuda.core import sigutils, config, entrypoints
  from numba.cuda.flags import Flags
  from numba.cuda.cudadrv import driver, nvvm
- from cuda.core.experimental import LaunchConfig
+
  from numba.cuda.locks import module_init_lock
  from numba.cuda.core.caching import Cache, CacheImpl, NullCache
  from numba.cuda.descriptor import cuda_target
@@ -858,7 +858,7 @@ class _DispatcherBase(_dispatcher.Dispatcher):
  for cres in overloads.values():
  try:
  targetctx.remove_user_function(cres.entry_point)
- except KeyError:
+ except KeyError: # noqa: PERF203
  pass

  return finalizer
@@ -1626,21 +1626,7 @@ class CUDADispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase):
  def typeof_pyval(self, val):
  # Based on _DispatcherBase.typeof_pyval, but differs from it to support
  # the CUDA Array Interface.
- try:
- return typeof(val, Purpose.argument)
- except ValueError:
- if (
- interface := getattr(val, "__cuda_array_interface__")
- ) is not None:
- # When typing, we don't need to synchronize on the array's
- # stream - this is done when the kernel is launched.
-
- return typeof(
- cuda.from_cuda_array_interface(interface, sync=False),
- Purpose.argument,
- )
- else:
- raise
+ return typeof(val, Purpose.argument)

  def specialize(self, *args):
  """
@@ -2104,7 +2090,7 @@ class CUDADispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase):
  if file is None:
  file = sys.stdout

- for _, defn in self.overloads.items():
+ for defn in self.overloads.values():
  defn.inspect_types(file=file)

  @classmethod
numba_cuda/numba/cuda/libdeviceimpl.py
@@ -69,8 +69,7 @@ def libdevice_implement_multiple_returns(func, retty, prototype_args):
  tuple_args = []
  if retty != types.void:
  tuple_args.append(ret)
- for arg in virtual_args:
- tuple_args.append(builder.load(arg))
+ tuple_args.extend(map(builder.load, virtual_args))

  if isinstance(nb_retty, types.UniTuple):
  return cgutils.pack_array(builder, tuple_args)