numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +4 -1
  3. numba_cuda/numba/cuda/_compat.py +47 -0
  4. numba_cuda/numba/cuda/api.py +4 -1
  5. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
  7. numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
  8. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
  10. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  11. numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
  12. numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
  13. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  14. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
  15. numba_cuda/numba/cuda/codegen.py +46 -12
  16. numba_cuda/numba/cuda/compiler.py +15 -9
  17. numba_cuda/numba/cuda/core/analysis.py +29 -21
  18. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
  19. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  20. numba_cuda/numba/cuda/core/base.py +12 -11
  21. numba_cuda/numba/cuda/core/bytecode.py +21 -13
  22. numba_cuda/numba/cuda/core/byteflow.py +336 -90
  23. numba_cuda/numba/cuda/core/compiler.py +3 -4
  24. numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
  25. numba_cuda/numba/cuda/core/config.py +5 -7
  26. numba_cuda/numba/cuda/core/consts.py +1 -1
  27. numba_cuda/numba/cuda/core/controlflow.py +17 -9
  28. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  29. numba_cuda/numba/cuda/core/errors.py +4 -912
  30. numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
  31. numba_cuda/numba/cuda/core/interpreter.py +334 -160
  32. numba_cuda/numba/cuda/core/ir.py +191 -119
  33. numba_cuda/numba/cuda/core/ir_utils.py +149 -128
  34. numba_cuda/numba/cuda/core/postproc.py +8 -8
  35. numba_cuda/numba/cuda/core/pythonapi.py +3 -0
  36. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  37. numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
  38. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  39. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  40. numba_cuda/numba/cuda/core/ssa.py +5 -5
  41. numba_cuda/numba/cuda/core/transforms.py +29 -16
  42. numba_cuda/numba/cuda/core/typed_passes.py +10 -10
  43. numba_cuda/numba/cuda/core/typeinfer.py +42 -27
  44. numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
  45. numba_cuda/numba/cuda/cpython/unicode.py +2 -2
  46. numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
  47. numba_cuda/numba/cuda/cudadecl.py +0 -13
  48. numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
  49. numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
  50. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  51. numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
  52. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  53. numba_cuda/numba/cuda/debuginfo.py +25 -0
  54. numba_cuda/numba/cuda/descriptor.py +1 -1
  55. numba_cuda/numba/cuda/device_init.py +4 -7
  56. numba_cuda/numba/cuda/deviceufunc.py +3 -6
  57. numba_cuda/numba/cuda/dispatcher.py +39 -49
  58. numba_cuda/numba/cuda/intrinsics.py +150 -1
  59. numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
  60. numba_cuda/numba/cuda/lowering.py +36 -29
  61. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  62. numba_cuda/numba/cuda/np/arrayobj.py +61 -9
  63. numba_cuda/numba/cuda/np/numpy_support.py +32 -9
  64. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
  65. numba_cuda/numba/cuda/printimpl.py +20 -0
  66. numba_cuda/numba/cuda/serialize.py +10 -0
  67. numba_cuda/numba/cuda/stubs.py +0 -11
  68. numba_cuda/numba/cuda/testing.py +4 -8
  69. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  70. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  71. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
  72. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  73. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  74. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  75. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
  76. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  77. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
  78. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
  79. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
  80. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  81. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
  82. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
  83. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
  85. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
  86. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  87. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  88. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
  89. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  90. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  91. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
  92. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
  93. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  94. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  95. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  96. numba_cuda/numba/cuda/tests/support.py +11 -0
  97. numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
  98. numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
  99. numba_cuda/numba/cuda/typing/context.py +3 -1
  100. numba_cuda/numba/cuda/typing/typeof.py +51 -2
  101. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
  102. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
  103. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  104. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  105. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  106. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  107. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
  108. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
  109. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
  110. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
@@ -441,7 +441,9 @@ class Lower(BaseLower):
441
441
  # Ensure that the variable is not defined multiple times
442
442
  # in the block
443
443
  [defblk] = var_assign_map[var]
444
- assign_stmts = self.blocks[defblk].find_insts(ir.Assign)
444
+ assign_stmts = self.blocks[defblk].find_insts(
445
+ ir.assign_types
446
+ )
445
447
  assigns = [
446
448
  stmt
447
449
  for stmt in assign_stmts
@@ -468,7 +470,7 @@ class Lower(BaseLower):
468
470
  self.builder.position_at_end(bb)
469
471
  all_names = set()
470
472
  for block in self.blocks.values():
471
- for x in block.find_insts(ir.Del):
473
+ for x in block.find_insts(ir.del_types):
472
474
  if x.value not in all_names:
473
475
  all_names.add(x.value)
474
476
  for name in all_names:
@@ -483,9 +485,9 @@ class Lower(BaseLower):
483
485
  self.func_ir,
484
486
  call.func,
485
487
  )
486
- if defn is not None and isinstance(defn, ir.Global):
488
+ if defn is not None and isinstance(defn, ir.global_types):
487
489
  if defn.value is eh.exception_check:
488
- if isinstance(block.terminator, ir.Branch):
490
+ if isinstance(block.terminator, ir.branch_types):
489
491
  targetblk = self.blkmap[block.terminator.truebr]
490
492
  # NOTE: This hacks in an attribute for call_conv to
491
493
  # pick up. This hack is no longer needed when
@@ -505,19 +507,19 @@ class Lower(BaseLower):
505
507
  self.debuginfo.mark_location(self.builder, self.loc.line)
506
508
  self.notify_loc(self.loc)
507
509
  self.debug_print(str(inst))
508
- if isinstance(inst, ir.Assign):
510
+ if isinstance(inst, ir.assign_types):
509
511
  ty = self.typeof(inst.target.name)
510
512
  val = self.lower_assign(ty, inst)
511
513
  argidx = None
512
514
  # If this is a store from an arg, like x = arg.x then tell debuginfo
513
515
  # that this is the arg
514
- if isinstance(inst.value, ir.Arg):
516
+ if isinstance(inst.value, ir.arg_types):
515
517
  # NOTE: debug location is the `def <func>` line
516
518
  self.debuginfo.mark_location(self.builder, self.defn_loc.line)
517
519
  argidx = inst.value.index + 1 # args start at 1
518
520
  self.storevar(val, inst.target.name, argidx=argidx)
519
521
 
520
- elif isinstance(inst, ir.Branch):
522
+ elif isinstance(inst, ir.branch_types):
521
523
  cond = self.loadvar(inst.cond.name)
522
524
  tr = self.blkmap[inst.truebr]
523
525
  fl = self.blkmap[inst.falsebr]
@@ -529,11 +531,11 @@ class Lower(BaseLower):
529
531
  )
530
532
  self.builder.cbranch(pred, tr, fl)
531
533
 
532
- elif isinstance(inst, ir.Jump):
534
+ elif isinstance(inst, ir.jump_types):
533
535
  target = self.blkmap[inst.target]
534
536
  self.builder.branch(target)
535
537
 
536
- elif isinstance(inst, ir.Return):
538
+ elif isinstance(inst, ir.return_types):
537
539
  if self.generator_info:
538
540
  # StopIteration
539
541
  self.genlower.return_from_generator(self)
@@ -551,10 +553,10 @@ class Lower(BaseLower):
551
553
  retval = self.context.get_return_value(self.builder, ty, val)
552
554
  self.call_conv.return_value(self.builder, retval)
553
555
 
554
- elif isinstance(inst, ir.PopBlock):
556
+ elif isinstance(inst, ir.popblock_types):
555
557
  pass # this is just a marker
556
558
 
557
- elif isinstance(inst, ir.StaticSetItem):
559
+ elif isinstance(inst, ir.staticsetitem_types):
558
560
  signature = self.fndesc.calltypes[inst]
559
561
  assert signature is not None
560
562
  try:
@@ -572,22 +574,22 @@ class Lower(BaseLower):
572
574
  )
573
575
  return impl(self.builder, (target, inst.index, value))
574
576
 
575
- elif isinstance(inst, ir.Print):
577
+ elif isinstance(inst, ir.print_types):
576
578
  self.lower_print(inst)
577
579
 
578
- elif isinstance(inst, ir.SetItem):
580
+ elif isinstance(inst, ir.setitem_types):
579
581
  signature = self.fndesc.calltypes[inst]
580
582
  assert signature is not None
581
583
  return self.lower_setitem(
582
584
  inst.target, inst.index, inst.value, signature
583
585
  )
584
586
 
585
- elif isinstance(inst, ir.StoreMap):
587
+ elif isinstance(inst, ir.storemap_types):
586
588
  signature = self.fndesc.calltypes[inst]
587
589
  assert signature is not None
588
590
  return self.lower_setitem(inst.dct, inst.key, inst.value, signature)
589
591
 
590
- elif isinstance(inst, ir.DelItem):
592
+ elif isinstance(inst, ir.delitem_types):
591
593
  target = self.loadvar(inst.target.name)
592
594
  index = self.loadvar(inst.index.name)
593
595
 
@@ -613,10 +615,10 @@ class Lower(BaseLower):
613
615
 
614
616
  return impl(self.builder, (target, index))
615
617
 
616
- elif isinstance(inst, ir.Del):
618
+ elif isinstance(inst, ir.del_types):
617
619
  self.delvar(inst.value)
618
620
 
619
- elif isinstance(inst, ir.SetAttr):
621
+ elif isinstance(inst, ir.setattr_types):
620
622
  target = self.loadvar(inst.target.name)
621
623
  value = self.loadvar(inst.value.name)
622
624
  signature = self.fndesc.calltypes[inst]
@@ -634,16 +636,16 @@ class Lower(BaseLower):
634
636
 
635
637
  return impl(self.builder, (target, value))
636
638
 
637
- elif isinstance(inst, ir.DynamicRaise):
639
+ elif isinstance(inst, ir.dynamicraise_types):
638
640
  self.lower_dynamic_raise(inst)
639
641
 
640
- elif isinstance(inst, ir.DynamicTryRaise):
642
+ elif isinstance(inst, ir.dynamictryraise_types):
641
643
  self.lower_try_dynamic_raise(inst)
642
644
 
643
- elif isinstance(inst, ir.StaticRaise):
645
+ elif isinstance(inst, ir.staticraise_types):
644
646
  self.lower_static_raise(inst)
645
647
 
646
- elif isinstance(inst, ir.StaticTryRaise):
648
+ elif isinstance(inst, ir.statictryraise_types):
647
649
  self.lower_static_try_raise(inst)
648
650
 
649
651
  else:
@@ -695,7 +697,7 @@ class Lower(BaseLower):
695
697
  args = []
696
698
  nb_types = []
697
699
  for exc_arg in exc_args:
698
- if isinstance(exc_arg, ir.Var):
700
+ if isinstance(exc_arg, ir.var_types):
699
701
  # dynamic values
700
702
  typ = self.typeof(exc_arg.name)
701
703
  val = self.loadvar(exc_arg.name)
@@ -727,24 +729,28 @@ class Lower(BaseLower):
727
729
  def lower_assign(self, ty, inst):
728
730
  value = inst.value
729
731
  # In nopython mode, closure vars are frozen like globals
730
- if isinstance(value, (ir.Const, ir.Global, ir.FreeVar)):
732
+ if (
733
+ isinstance(value, ir.const_types)
734
+ or isinstance(value, ir.global_types)
735
+ or isinstance(value, ir.freevar_types)
736
+ ):
731
737
  res = self.context.get_constant_generic(
732
738
  self.builder, ty, value.value
733
739
  )
734
740
  self.incref(ty, res)
735
741
  return res
736
742
 
737
- elif isinstance(value, ir.Expr):
743
+ elif isinstance(value, ir.expr_types):
738
744
  return self.lower_expr(ty, value)
739
745
 
740
- elif isinstance(value, ir.Var):
746
+ elif isinstance(value, ir.var_types):
741
747
  val = self.loadvar(value.name)
742
748
  oty = self.typeof(value.name)
743
749
  res = self.context.cast(self.builder, val, oty, ty)
744
750
  self.incref(ty, res)
745
751
  return res
746
752
 
747
- elif isinstance(value, ir.Arg):
753
+ elif isinstance(value, ir.arg_types):
748
754
  # Suspend debug info else all the arg repacking ends up being
749
755
  # associated with some line or other and it's actually just a detail
750
756
  # of Numba's CC.
@@ -770,7 +776,7 @@ class Lower(BaseLower):
770
776
  self.incref(ty, res)
771
777
  return res
772
778
 
773
- elif isinstance(value, ir.Yield):
779
+ elif isinstance(value, ir.yield_types):
774
780
  res = self.lower_yield(ty, value)
775
781
  self.incref(ty, res)
776
782
  return res
@@ -1792,7 +1798,7 @@ class CUDALower(Lower):
1792
1798
  self.dbg_val_names = set()
1793
1799
 
1794
1800
  if self.context.enable_debuginfo and self._disable_sroa_like_opt:
1795
- for x in block.find_insts(ir.Assign):
1801
+ for x in block.find_insts(ir.assign_types):
1796
1802
  if x.target.name.startswith("$"):
1797
1803
  continue
1798
1804
  ssa_name = x.target.name
@@ -1806,6 +1812,7 @@ class CUDALower(Lower):
1806
1812
  """
1807
1813
  super().pre_lower()
1808
1814
 
1815
+ # Track polymorphic variables for debug info
1809
1816
  self.poly_var_typ_map = {}
1810
1817
  self.poly_var_loc_map = {}
1811
1818
  self.poly_var_set = set()
@@ -1818,7 +1825,7 @@ class CUDALower(Lower):
1818
1825
  poly_map = {}
1819
1826
  # pre-scan all blocks
1820
1827
  for block in self.blocks.values():
1821
- for x in block.find_insts(ir.Assign):
1828
+ for x in block.find_insts(ir.assign_types):
1822
1829
  if x.target.name.startswith("$"):
1823
1830
  continue
1824
1831
  ssa_name = x.target.name
@@ -13,9 +13,10 @@ from numba.cuda import config, types
13
13
  from numba.cuda.cudadrv.driver import (
14
14
  _Linker,
15
15
  driver,
16
- launch_kernel,
16
+ _to_core_stream,
17
17
  _have_nvjitlink,
18
18
  )
19
+ from numba.cuda._compat import LaunchConfig, launch
19
20
  from numba.cuda.cudadrv import devices
20
21
  from numba.cuda.api import get_current_device
21
22
  from numba.cuda.utils import _readenv, cached_file_read
@@ -126,7 +127,7 @@ class _Runtime:
126
127
  cc = get_current_device().compute_capability
127
128
 
128
129
  # Create a new linker instance and add the cu file
129
- linker = _Linker.new(cc=cc, lto=_have_nvjitlink())
130
+ linker = _Linker(max_registers=0, cc=cc, lto=_have_nvjitlink())
130
131
  linker.add_cu_file(memsys_mod)
131
132
 
132
133
  # Complete the linker and create a module from it
@@ -179,20 +180,15 @@ class _Runtime:
179
180
  stream = cuda.default_stream()
180
181
 
181
182
  func = module.get_function(name)
182
- launch_kernel(
183
- func.handle,
184
- 1,
185
- 1,
186
- 1,
187
- 1,
188
- 1,
189
- 1,
190
- 0,
191
- stream.handle.value,
192
- params,
193
- cooperative=False,
183
+ config = LaunchConfig(
184
+ grid=(1, 1, 1),
185
+ block=(1, 1, 1),
186
+ shmem_size=0,
187
+ cooperative_launch=False,
194
188
  )
195
189
 
190
+ launch(_to_core_stream(stream), config, func.kernel, *params)
191
+
196
192
  def ensure_initialized(self, stream=None):
197
193
  """
198
194
  If memsys is not initialized, initialize memsys
@@ -31,6 +31,7 @@ from numba.cuda.np.numpy_support import (
31
31
  type_is_scalar,
32
32
  lt_complex,
33
33
  lt_floats,
34
+ strides_from_shape,
34
35
  )
35
36
  from numba.cuda.np.numpy_support import (
36
37
  type_can_asarray,
@@ -1797,10 +1798,10 @@ def numpy_broadcast_arrays(*args):
1797
1798
  tup = tuple_setitem(tup, i, shape[i])
1798
1799
 
1799
1800
  # numpy checks if the input arrays have the same shape as `shape`
1800
- outs = []
1801
- for array in literal_unroll(args):
1802
- outs.append(np.broadcast_to(np.asarray(array), tup))
1803
- return outs
1801
+ return [
1802
+ np.broadcast_to(np.asarray(array), tup)
1803
+ for array in literal_unroll(args)
1804
+ ]
1804
1805
 
1805
1806
  return impl
1806
1807
 
@@ -3642,10 +3643,63 @@ def record_static_setitem_int(context, builder, sig, args):
3642
3643
  def constant_array(context, builder, ty, pyval):
3643
3644
  """
3644
3645
  Create a constant array (mechanism is target-dependent).
3646
+
3647
+ For objects implementing __cuda_array_interface__,
3648
+ the device pointer is embedded directly as a constant. For other arrays,
3649
+ the target-specific mechanism is used.
3645
3650
  """
3651
+ # Check if this is a device array (implements __cuda_array_interface__)
3652
+ if getattr(pyval, "__cuda_array_interface__", None) is not None:
3653
+ return _lower_constant_device_array(context, builder, ty, pyval)
3654
+
3646
3655
  return context.make_constant_array(builder, ty, pyval)
3647
3656
 
3648
3657
 
3658
+ def _lower_constant_device_array(context, builder, ty, pyval):
3659
+ """
3660
+ Lower objects with __cuda_array_interface__ by embedding the device
3661
+ pointer as a constant.
3662
+
3663
+ This allows device arrays captured from globals to be used in CUDA
3664
+ kernels and device functions.
3665
+ """
3666
+ interface = pyval.__cuda_array_interface__
3667
+
3668
+ # Hold on to the device array to prevent garbage collection.
3669
+ context.active_code_library.referenced_objects[id(pyval)] = pyval
3670
+
3671
+ shape = interface["shape"]
3672
+ strides = interface.get("strides")
3673
+ data_ptr = interface["data"][0]
3674
+ typestr = interface["typestr"]
3675
+ itemsize = np.dtype(typestr).itemsize
3676
+
3677
+ # Calculate strides if not provided (C-contiguous)
3678
+ if strides is None:
3679
+ strides = strides_from_shape(shape, itemsize, order="C")
3680
+
3681
+ # Embed device pointer as constant
3682
+ llvoidptr = context.get_value_type(types.voidptr)
3683
+ data = context.get_constant(types.uintp, data_ptr).inttoptr(llvoidptr)
3684
+
3685
+ # Build array structure
3686
+ ary = context.make_array(ty)(context, builder)
3687
+ kshape = [context.get_constant(types.intp, s) for s in shape]
3688
+ kstrides = [context.get_constant(types.intp, s) for s in strides]
3689
+
3690
+ context.populate_array(
3691
+ ary,
3692
+ data=builder.bitcast(data, ary.data.type),
3693
+ shape=kshape,
3694
+ strides=kstrides,
3695
+ itemsize=context.get_constant(types.intp, itemsize),
3696
+ parent=None,
3697
+ meminfo=None,
3698
+ )
3699
+
3700
+ return ary._getvalue()
3701
+
3702
+
3649
3703
  @lower_constant(types.Record)
3650
3704
  def constant_record(context, builder, ty, pyval):
3651
3705
  """
@@ -4768,13 +4822,11 @@ def _parse_shape(context, builder, ty, val):
4768
4822
  ndim = ty.count
4769
4823
  passed_shapes = cgutils.unpack_tuple(builder, val, count=ndim)
4770
4824
 
4771
- shapes = []
4772
- for s in passed_shapes:
4773
- shapes.append(safecast_intp(context, builder, s.type, s))
4825
+ shapes = [safecast_intp(context, builder, s.type, s) for s in passed_shapes]
4774
4826
 
4775
4827
  zero = context.get_constant_generic(builder, types.intp, 0)
4776
- for dim in range(ndim):
4777
- is_neg = builder.icmp_signed("<", shapes[dim], zero)
4828
+ for shape in shapes:
4829
+ is_neg = builder.icmp_signed("<", shape, zero)
4778
4830
  with cgutils.if_unlikely(builder, is_neg):
4779
4831
  context.call_conv.return_user_exc(
4780
4832
  builder, ValueError, ("negative dimensions not allowed",)
@@ -3,7 +3,11 @@
3
3
 
4
4
  import collections
5
5
  import ctypes
6
+ import itertools
7
+ import functools
8
+ import operator
6
9
  import re
10
+
7
11
  import numpy as np
8
12
 
9
13
  from numba.cuda import types
@@ -17,6 +21,30 @@ from numba.cuda.cgutils import is_nonelike # noqa: F401
17
21
 
18
22
  numpy_version = tuple(map(int, np.__version__.split(".")[:2]))
19
23
 
24
+
25
+ @functools.lru_cache
26
+ def strides_from_shape(
27
+ shape: tuple[int, ...], itemsize: int, *, order: str
28
+ ) -> tuple[int, ...]:
29
+ """Compute strides for a contiguous array with given shape and order."""
30
+ if not shape:
31
+ # 0-D arrays have empty strides
32
+ return ()
33
+ limits = slice(1, None) if order == "C" else slice(None, -1)
34
+ transform = reversed if order == "C" else lambda x: x
35
+ strides = tuple(
36
+ map(
37
+ itemsize.__mul__,
38
+ itertools.accumulate(
39
+ transform(shape[limits]), operator.mul, initial=1
40
+ ),
41
+ )
42
+ )
43
+ if order == "F":
44
+ return strides
45
+ return strides[::-1]
46
+
47
+
20
48
  FROM_DTYPE = {
21
49
  np.dtype("bool"): types.boolean,
22
50
  np.dtype("int8"): types.int8,
@@ -92,16 +120,11 @@ def from_dtype(dtype):
92
120
  elif getattr(dtype, "fields", None) is not None:
93
121
  return from_struct_dtype(dtype)
94
122
 
95
- try:
96
- return FROM_DTYPE[dtype]
97
- except KeyError:
98
- pass
123
+ result = FROM_DTYPE.get(dtype)
124
+ if result is not None:
125
+ return result
99
126
 
100
- try:
101
- char = dtype.char
102
- except AttributeError:
103
- pass
104
- else:
127
+ if (char := getattr(dtype, "char", None)) is not None:
105
128
  if char in "SU":
106
129
  return _from_str_dtype(dtype)
107
130
  if char in "mM":
@@ -122,9 +122,10 @@ def polyutils_as_series(alist, trim=True):
122
122
 
123
123
  def impl(alist, trim=True):
124
124
  if tuple_input:
125
- arrays = []
126
- for item in literal_unroll(alist):
127
- arrays.append(np.atleast_1d(np.asarray(item)).astype(res_dtype))
125
+ arrays = [
126
+ np.atleast_1d(np.asarray(item)).astype(res_dtype)
127
+ for item in literal_unroll(alist)
128
+ ]
128
129
 
129
130
  elif list_input:
130
131
  arrays = [
@@ -32,6 +32,26 @@ def print_item(ty, context, builder, val):
32
32
  )
33
33
 
34
34
 
35
+ @print_item.register(types.Tuple)
36
+ @print_item.register(types.UniTuple)
37
+ def tuple_print_impl(ty, context, builder, val):
38
+ formats = []
39
+ values = []
40
+
41
+ for i, argtyp in enumerate(ty.types):
42
+ argval = builder.extract_value(val, i)
43
+ argfmt, argvals = print_item(argtyp, context, builder, argval)
44
+ formats.append(argfmt)
45
+ values.extend(argvals)
46
+
47
+ if len(formats) == 1:
48
+ base = "({},)"
49
+ else:
50
+ base = "({})"
51
+ rawfmt = base.format(", ".join(formats))
52
+ return rawfmt, values
53
+
54
+
35
55
  @print_item.register(types.Integer)
36
56
  @print_item.register(types.IntegerLiteral)
37
57
  def int_print_impl(ty, context, builder, val):
@@ -197,6 +197,16 @@ class NumbaPickler(cloudpickle.CloudPickler):
197
197
  # Overridden to disable pickling of certain types
198
198
  if type(obj) in self.disabled_types:
199
199
  _no_pickle(obj) # noreturn
200
+
201
+ # Prevent pickling of objects implementing __cuda_array_interface__
202
+ # These contain device pointers that would become stale after unpickling
203
+ if getattr(obj, "__cuda_array_interface__", None) is not None:
204
+ raise pickle.PicklingError(
205
+ "Cannot serialize kernels or device functions referencing "
206
+ "global device arrays. Pass the array(s) as arguments "
207
+ "to the kernel instead."
208
+ )
209
+
200
210
  return super().reducer_override(obj)
201
211
 
202
212
 
@@ -200,17 +200,6 @@ class syncwarp(Stub):
200
200
  _description_ = "<warp_sync()>"
201
201
 
202
202
 
203
- class vote_sync_intrinsic(Stub):
204
- """
205
- vote_sync_intrinsic(mask, mode, predictate)
206
-
207
- Nvvm intrinsic for performing a reduce and broadcast across a warp
208
- docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-vote
209
- """
210
-
211
- _description_ = "<vote_sync()>"
212
-
213
-
214
203
  class match_any_sync(Stub):
215
204
  """
216
205
  match_any_sync(mask, value)
@@ -276,14 +276,6 @@ def skip_if_curand_kernel_missing(fn):
276
276
  return unittest.skipUnless(curand_kernel_h_file, reason)(fn)
277
277
 
278
278
 
279
- def skip_if_mvc_enabled(reason):
280
- """Skip a test if Minor Version Compatibility is enabled"""
281
- assert isinstance(reason, str)
282
- return unittest.skipIf(
283
- config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY, reason
284
- )
285
-
286
-
287
279
  def cc_X_or_above(major, minor):
288
280
  if not config.ENABLE_CUDASIM:
289
281
  cc = devices.get_context().device.compute_capability
@@ -308,6 +300,10 @@ def skip_unless_cc_75(fn):
308
300
  return unittest.skipUnless(cc_X_or_above(7, 5), "requires cc >= 7.5")(fn)
309
301
 
310
302
 
303
+ def skip_unless_cc_90(fn):
304
+ return unittest.skipUnless(cc_X_or_above(9, 0), "requires cc >= 9.0")(fn)
305
+
306
+
311
307
  def xfail_unless_cudasim(fn):
312
308
  if config.ENABLE_CUDASIM:
313
309
  return fn
@@ -36,8 +36,13 @@ pytestmark = pytest.mark.skipif(
36
36
  ),
37
37
  ],
38
38
  )
39
- def test_one_arg(benchmark, array_func):
40
- @cuda.jit("void(float32[:])")
39
+ @pytest.mark.parametrize(
40
+ "jit",
41
+ [cuda.jit, cuda.jit("void(float32[::1])")],
42
+ ids=["dispatch", "signature"],
43
+ )
44
+ def test_one_arg(benchmark, array_func, jit):
45
+ @jit
41
46
  def one_arg(arr1):
42
47
  return
43
48
 
@@ -78,10 +83,22 @@ def test_one_arg(benchmark, array_func):
78
83
  ),
79
84
  ],
80
85
  )
81
- def test_many_args(benchmark, array_func):
86
+ @pytest.mark.parametrize(
87
+ "jit",
88
+ [
89
+ cuda.jit,
90
+ cuda.jit(
91
+ "void({})".format(
92
+ ", ".join(["float32[::1]"] * len(string.ascii_lowercase))
93
+ )
94
+ ),
95
+ ],
96
+ ids=["dispatch", "signature"],
97
+ )
98
+ def test_many_args(benchmark, array_func, jit):
82
99
  many_arrs = array_func()
83
100
 
84
- @cuda.jit("void({})".format(", ".join(["float32[:]"] * len(many_arrs))))
101
+ @jit
85
102
  def many_args(
86
103
  a,
87
104
  b,
@@ -2,7 +2,6 @@
2
2
  # SPDX-License-Identifier: BSD-2-Clause
3
3
 
4
4
  import numbers
5
- import weakref
6
5
 
7
6
  from numba import cuda
8
7
  from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
@@ -90,8 +89,8 @@ class Test3rdPartyContext(CUDATestCase):
90
89
  dev = driver.binding.CUdevice(0)
91
90
  binding_hctx = the_driver.cuDevicePrimaryCtxRetain(dev)
92
91
  hctx = driver.drvapi.cu_context(int(binding_hctx))
92
+ ctx = driver.Context(dev, hctx)
93
93
  try:
94
- ctx = driver.Context(weakref.proxy(self), hctx)
95
94
  ctx.push()
96
95
  # Check that the context from numba matches the created primary
97
96
  # context.