numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.23.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/api.py +4 -1
  3. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  4. numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
  5. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  7. numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
  8. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/codegen.py +42 -10
  10. numba_cuda/numba/cuda/compiler.py +10 -4
  11. numba_cuda/numba/cuda/core/analysis.py +29 -21
  12. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  13. numba_cuda/numba/cuda/core/base.py +6 -1
  14. numba_cuda/numba/cuda/core/consts.py +1 -1
  15. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  16. numba_cuda/numba/cuda/core/errors.py +4 -912
  17. numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
  18. numba_cuda/numba/cuda/core/interpreter.py +79 -64
  19. numba_cuda/numba/cuda/core/ir.py +191 -119
  20. numba_cuda/numba/cuda/core/ir_utils.py +142 -112
  21. numba_cuda/numba/cuda/core/postproc.py +8 -8
  22. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  23. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  24. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  25. numba_cuda/numba/cuda/core/ssa.py +3 -3
  26. numba_cuda/numba/cuda/core/transforms.py +25 -10
  27. numba_cuda/numba/cuda/core/typed_passes.py +9 -9
  28. numba_cuda/numba/cuda/core/typeinfer.py +39 -24
  29. numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
  30. numba_cuda/numba/cuda/cudadecl.py +0 -13
  31. numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
  32. numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
  33. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  34. numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
  35. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  36. numba_cuda/numba/cuda/debuginfo.py +104 -10
  37. numba_cuda/numba/cuda/descriptor.py +1 -1
  38. numba_cuda/numba/cuda/device_init.py +4 -7
  39. numba_cuda/numba/cuda/dispatcher.py +36 -32
  40. numba_cuda/numba/cuda/intrinsics.py +150 -1
  41. numba_cuda/numba/cuda/lowering.py +64 -29
  42. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  43. numba_cuda/numba/cuda/np/arrayobj.py +54 -0
  44. numba_cuda/numba/cuda/np/numpy_support.py +26 -0
  45. numba_cuda/numba/cuda/printimpl.py +20 -0
  46. numba_cuda/numba/cuda/serialize.py +10 -0
  47. numba_cuda/numba/cuda/stubs.py +0 -11
  48. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  49. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  50. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
  51. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  52. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  53. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
  54. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  55. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
  56. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  57. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
  58. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
  59. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  60. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  61. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  62. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  63. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
  64. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  65. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  66. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  67. numba_cuda/numba/cuda/typing/context.py +3 -1
  68. numba_cuda/numba/cuda/typing/typeof.py +56 -0
  69. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
  70. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
  71. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  72. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  73. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  74. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  75. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
  76. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
  77. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
  78. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/lowering.py

@@ -11,6 +11,7 @@ from llvmlite import ir as llvm_ir
 from numba.cuda import HAS_NUMBA
 from numba.cuda.core import ir
 from numba.cuda import debuginfo, cgutils, utils, typing, types
+from numba import cuda
 from numba.cuda.core import (
     ir_utils,
     targetconfig,
@@ -441,7 +442,9 @@ class Lower(BaseLower):
             # Ensure that the variable is not defined multiple times
             # in the block
             [defblk] = var_assign_map[var]
-            assign_stmts = self.blocks[defblk].find_insts(ir.Assign)
+            assign_stmts = self.blocks[defblk].find_insts(
+                ir.assign_types
+            )
             assigns = [
                 stmt
                 for stmt in assign_stmts
@@ -468,7 +471,7 @@ class Lower(BaseLower):
         self.builder.position_at_end(bb)
         all_names = set()
         for block in self.blocks.values():
-            for x in block.find_insts(ir.Del):
+            for x in block.find_insts(ir.del_types):
                 if x.value not in all_names:
                     all_names.add(x.value)
         for name in all_names:
@@ -483,9 +486,9 @@
                     self.func_ir,
                     call.func,
                 )
-                if defn is not None and isinstance(defn, ir.Global):
+                if defn is not None and isinstance(defn, ir.global_types):
                     if defn.value is eh.exception_check:
-                        if isinstance(block.terminator, ir.Branch):
+                        if isinstance(block.terminator, ir.branch_types):
                             targetblk = self.blkmap[block.terminator.truebr]
                             # NOTE: This hacks in an attribute for call_conv to
                             # pick up. This hack is no longer needed when
@@ -505,19 +508,19 @@
         self.debuginfo.mark_location(self.builder, self.loc.line)
         self.notify_loc(self.loc)
         self.debug_print(str(inst))
-        if isinstance(inst, ir.Assign):
+        if isinstance(inst, ir.assign_types):
             ty = self.typeof(inst.target.name)
             val = self.lower_assign(ty, inst)
             argidx = None
             # If this is a store from an arg, like x = arg.x then tell debuginfo
             # that this is the arg
-            if isinstance(inst.value, ir.Arg):
+            if isinstance(inst.value, ir.arg_types):
                 # NOTE: debug location is the `def <func>` line
                 self.debuginfo.mark_location(self.builder, self.defn_loc.line)
                 argidx = inst.value.index + 1  # args start at 1
             self.storevar(val, inst.target.name, argidx=argidx)
 
-        elif isinstance(inst, ir.Branch):
+        elif isinstance(inst, ir.branch_types):
             cond = self.loadvar(inst.cond.name)
             tr = self.blkmap[inst.truebr]
             fl = self.blkmap[inst.falsebr]
@@ -529,11 +532,11 @@
             )
             self.builder.cbranch(pred, tr, fl)
 
-        elif isinstance(inst, ir.Jump):
+        elif isinstance(inst, ir.jump_types):
             target = self.blkmap[inst.target]
             self.builder.branch(target)
 
-        elif isinstance(inst, ir.Return):
+        elif isinstance(inst, ir.return_types):
             if self.generator_info:
                 # StopIteration
                 self.genlower.return_from_generator(self)
@@ -551,10 +554,10 @@
             retval = self.context.get_return_value(self.builder, ty, val)
             self.call_conv.return_value(self.builder, retval)
 
-        elif isinstance(inst, ir.PopBlock):
+        elif isinstance(inst, ir.popblock_types):
            pass  # this is just a marker
 
-        elif isinstance(inst, ir.StaticSetItem):
+        elif isinstance(inst, ir.staticsetitem_types):
            signature = self.fndesc.calltypes[inst]
            assert signature is not None
            try:
@@ -572,22 +575,22 @@
            )
            return impl(self.builder, (target, inst.index, value))
 
-        elif isinstance(inst, ir.Print):
+        elif isinstance(inst, ir.print_types):
            self.lower_print(inst)
 
-        elif isinstance(inst, ir.SetItem):
+        elif isinstance(inst, ir.setitem_types):
            signature = self.fndesc.calltypes[inst]
            assert signature is not None
            return self.lower_setitem(
                inst.target, inst.index, inst.value, signature
            )
 
-        elif isinstance(inst, ir.StoreMap):
+        elif isinstance(inst, ir.storemap_types):
            signature = self.fndesc.calltypes[inst]
            assert signature is not None
            return self.lower_setitem(inst.dct, inst.key, inst.value, signature)
 
-        elif isinstance(inst, ir.DelItem):
+        elif isinstance(inst, ir.delitem_types):
            target = self.loadvar(inst.target.name)
            index = self.loadvar(inst.index.name)
 
@@ -613,10 +616,10 @@
 
            return impl(self.builder, (target, index))
 
-        elif isinstance(inst, ir.Del):
+        elif isinstance(inst, ir.del_types):
            self.delvar(inst.value)
 
-        elif isinstance(inst, ir.SetAttr):
+        elif isinstance(inst, ir.setattr_types):
            target = self.loadvar(inst.target.name)
            value = self.loadvar(inst.value.name)
            signature = self.fndesc.calltypes[inst]
@@ -634,16 +637,16 @@
 
            return impl(self.builder, (target, value))
 
-        elif isinstance(inst, ir.DynamicRaise):
+        elif isinstance(inst, ir.dynamicraise_types):
            self.lower_dynamic_raise(inst)
 
-        elif isinstance(inst, ir.DynamicTryRaise):
+        elif isinstance(inst, ir.dynamictryraise_types):
            self.lower_try_dynamic_raise(inst)
 
-        elif isinstance(inst, ir.StaticRaise):
+        elif isinstance(inst, ir.staticraise_types):
            self.lower_static_raise(inst)
 
-        elif isinstance(inst, ir.StaticTryRaise):
+        elif isinstance(inst, ir.statictryraise_types):
            self.lower_static_try_raise(inst)
 
        else:
@@ -695,7 +698,7 @@
        args = []
        nb_types = []
        for exc_arg in exc_args:
-            if isinstance(exc_arg, ir.Var):
+            if isinstance(exc_arg, ir.var_types):
                # dynamic values
                typ = self.typeof(exc_arg.name)
                val = self.loadvar(exc_arg.name)
@@ -727,24 +730,28 @@
    def lower_assign(self, ty, inst):
        value = inst.value
        # In nopython mode, closure vars are frozen like globals
-        if isinstance(value, (ir.Const, ir.Global, ir.FreeVar)):
+        if (
+            isinstance(value, ir.const_types)
+            or isinstance(value, ir.global_types)
+            or isinstance(value, ir.freevar_types)
+        ):
            res = self.context.get_constant_generic(
                self.builder, ty, value.value
            )
            self.incref(ty, res)
            return res
 
-        elif isinstance(value, ir.Expr):
+        elif isinstance(value, ir.expr_types):
            return self.lower_expr(ty, value)
 
-        elif isinstance(value, ir.Var):
+        elif isinstance(value, ir.var_types):
            val = self.loadvar(value.name)
            oty = self.typeof(value.name)
            res = self.context.cast(self.builder, val, oty, ty)
            self.incref(ty, res)
            return res
 
-        elif isinstance(value, ir.Arg):
+        elif isinstance(value, ir.arg_types):
            # Suspend debug info else all the arg repacking ends up being
            # associated with some line or other and it's actually just a detail
            # of Numba's CC.
@@ -770,7 +777,7 @@
            self.incref(ty, res)
            return res
 
-        elif isinstance(value, ir.Yield):
+        elif isinstance(value, ir.yield_types):
            res = self.lower_yield(ty, value)
            self.incref(ty, res)
            return res
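
All of the `ir.Assign` → `ir.assign_types`, `ir.Del` → `ir.del_types`, and similar rewrites above share one mechanism: each `isinstance` check against a single vendored IR class becomes a check against a tuple of accepted classes, so IR nodes built by an installed `numba` and by the vendored `numba.cuda.core.ir` are both handled (compare the new `test_numba_interop.py` in the file list). A minimal sketch of the pattern, using hypothetical stand-in classes since the real tuples live in `numba.cuda.core.ir`:

```python
# Hypothetical stand-ins: in numba-cuda, tuples such as ir.assign_types
# are assumed to group the vendored IR node class with upstream numba's
# equivalent so that IR from either library passes the check.
class CoreAssign:  # stand-in for an upstream numba IR node
    pass


class CudaAssign:  # stand-in for the vendored numba-cuda IR node
    pass


assign_types = (CoreAssign, CudaAssign)


def is_assign(inst):
    # isinstance accepts a tuple of classes, so a single check
    # covers both IR flavors.
    return isinstance(inst, assign_types)


assert is_assign(CoreAssign()) and is_assign(CudaAssign())
```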
numba_cuda/numba/cuda/lowering.py (continued)

@@ -1677,10 +1684,31 @@
 
 
 class CUDALower(Lower):
+    def _is_shared_array_call(self, fnty):
+        # Check if function type is a cuda.shared.array call
+        if not hasattr(fnty, "typing_key"):
+            return False
+        return fnty.typing_key is cuda.shared.array
+
+    def _lower_call_normal(self, fnty, expr, signature):
+        # Set flag for subsequent store to track shared address space
+        if self.context.enable_debuginfo and self._is_shared_array_call(fnty):
+            self._pending_shared_store = True
+
+        return super()._lower_call_normal(fnty, expr, signature)
+
     def storevar(self, value, name, argidx=None):
         """
         Store the value into the given variable.
         """
+        # Track address space for debug info
+        if self.context.enable_debuginfo and self._pending_shared_store:
+            from numba.cuda.cudadrv import nvvm
+
+            self._addrspace_map[name] = nvvm.ADDRSPACE_SHARED
+            if not name.startswith("$") and not name.startswith("."):
+                self._pending_shared_store = False
+
         # Handle polymorphic variables with CUDA_DEBUG_POLY enabled
         if config.CUDA_DEBUG_POLY:
             src_name = name.split(".")[0]
@@ -1792,7 +1820,7 @@ class CUDALower(Lower):
         self.dbg_val_names = set()
 
         if self.context.enable_debuginfo and self._disable_sroa_like_opt:
-            for x in block.find_insts(ir.Assign):
+            for x in block.find_insts(ir.assign_types):
                 if x.target.name.startswith("$"):
                     continue
                 ssa_name = x.target.name
@@ -1806,6 +1834,13 @@
         """
         super().pre_lower()
 
+        # Track address space for debug info
+        self._addrspace_map = {}
+        self._pending_shared_store = False
+        if self.context.enable_debuginfo:
+            self.debuginfo._set_addrspace_map(self._addrspace_map)
+
+        # Track polymorphic variables for debug info
         self.poly_var_typ_map = {}
         self.poly_var_loc_map = {}
         self.poly_var_set = set()
@@ -1818,7 +1853,7 @@
         poly_map = {}
         # pre-scan all blocks
         for block in self.blocks.values():
-            for x in block.find_insts(ir.Assign):
+            for x in block.find_insts(ir.assign_types):
                 if x.target.name.startswith("$"):
                     continue
                 ssa_name = x.target.name
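
The `_pending_shared_store` flag and `_addrspace_map` record, during lowering, which user-visible variables hold the result of a `cuda.shared.array` call, so the debug-info writer (see the `debuginfo.py` changes in the file list) can place them in the shared address space. A usage sketch of a kernel that would exercise this path; `debug=True, opt=False` is the usual combination for full debug info:

```python
import numpy as np
from numba import cuda, float32


# With debug info enabled, the store into `tile` below is what the new
# _pending_shared_store flag tags as living in shared memory.
@cuda.jit(debug=True, opt=False)
def reverse_tile(arr):
    tile = cuda.shared.array(32, dtype=float32)
    i = cuda.threadIdx.x
    tile[i] = arr[i]
    cuda.syncthreads()
    arr[i] = tile[31 - i]


arr = cuda.to_device(np.arange(32, dtype=np.float32))
reverse_tile[1, 32](arr)
print(arr.copy_to_host())  # expected: 31.0 down to 0.0
```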
numba_cuda/numba/cuda/memory_management/nrt.py

@@ -13,9 +13,10 @@ from numba.cuda import config, types
 from numba.cuda.cudadrv.driver import (
     _Linker,
     driver,
-    launch_kernel,
+    _to_core_stream,
     _have_nvjitlink,
 )
+from cuda.core.experimental import LaunchConfig, launch
 from numba.cuda.cudadrv import devices
 from numba.cuda.api import get_current_device
 from numba.cuda.utils import _readenv, cached_file_read
@@ -126,7 +127,7 @@ class _Runtime:
         cc = get_current_device().compute_capability
 
         # Create a new linker instance and add the cu file
-        linker = _Linker.new(cc=cc, lto=_have_nvjitlink())
+        linker = _Linker(max_registers=0, cc=cc, lto=_have_nvjitlink())
         linker.add_cu_file(memsys_mod)
 
         # Complete the linker and create a module from it
@@ -179,20 +180,15 @@ class _Runtime:
             stream = cuda.default_stream()
 
         func = module.get_function(name)
-        launch_kernel(
-            func.handle,
-            1,
-            1,
-            1,
-            1,
-            1,
-            1,
-            0,
-            stream.handle.value,
-            params,
-            cooperative=False,
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(1, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
         )
 
+        launch(_to_core_stream(stream), config, func.kernel, *params)
+
     def ensure_initialized(self, stream=None):
         """
         If memsys is not initialized, initialize memsys
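
The memsys launch now goes through `cuda.core.experimental` instead of the removed `launch_kernel` driver wrapper, with `_to_core_stream` adapting a Numba stream to a `cuda.core` stream. For orientation, a self-contained sketch of that launch pattern, independent of numba-cuda (exact `ProgramOptions` fields may vary across cuda.core versions):

```python
from cuda.core.experimental import (
    Device,
    LaunchConfig,
    Program,
    ProgramOptions,
    launch,
)

# Compile a trivial kernel with NVRTC and launch it the cuda.core way.
code = 'extern "C" __global__ void noop() {}'

dev = Device()
dev.set_current()
stream = dev.create_stream()

arch = "".join(str(part) for part in dev.compute_capability)
program = Program(code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}"))
kernel = program.compile("cubin").get_kernel("noop")

# Grid, block, and shared-memory settings travel in one config object,
# mirroring the LaunchConfig built in the diff above.
config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), shmem_size=0)
launch(stream, config, kernel)  # kernel parameters would follow `kernel`
stream.sync()
```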
numba_cuda/numba/cuda/np/arrayobj.py

@@ -31,6 +31,7 @@ from numba.cuda.np.numpy_support import (
     type_is_scalar,
     lt_complex,
     lt_floats,
+    strides_from_shape,
 )
 from numba.cuda.np.numpy_support import (
     type_can_asarray,
@@ -3642,10 +3643,63 @@ def record_static_setitem_int(context, builder, sig, args):
 def constant_array(context, builder, ty, pyval):
     """
     Create a constant array (mechanism is target-dependent).
+
+    For objects implementing __cuda_array_interface__,
+    the device pointer is embedded directly as a constant. For other arrays,
+    the target-specific mechanism is used.
     """
+    # Check if this is a device array (implements __cuda_array_interface__)
+    if getattr(pyval, "__cuda_array_interface__", None) is not None:
+        return _lower_constant_device_array(context, builder, ty, pyval)
+
     return context.make_constant_array(builder, ty, pyval)
 
 
+def _lower_constant_device_array(context, builder, ty, pyval):
+    """
+    Lower objects with __cuda_array_interface__ by embedding the device
+    pointer as a constant.
+
+    This allows device arrays captured from globals to be used in CUDA
+    kernels and device functions.
+    """
+    interface = pyval.__cuda_array_interface__
+
+    # Hold on to the device array to prevent garbage collection.
+    context.active_code_library.referenced_objects[id(pyval)] = pyval
+
+    shape = interface["shape"]
+    strides = interface.get("strides")
+    data_ptr = interface["data"][0]
+    typestr = interface["typestr"]
+    itemsize = np.dtype(typestr).itemsize
+
+    # Calculate strides if not provided (C-contiguous)
+    if strides is None:
+        strides = strides_from_shape(shape, itemsize, order="C")
+
+    # Embed device pointer as constant
+    llvoidptr = context.get_value_type(types.voidptr)
+    data = context.get_constant(types.uintp, data_ptr).inttoptr(llvoidptr)
+
+    # Build array structure
+    ary = context.make_array(ty)(context, builder)
+    kshape = [context.get_constant(types.intp, s) for s in shape]
+    kstrides = [context.get_constant(types.intp, s) for s in strides]
+
+    context.populate_array(
+        ary,
+        data=builder.bitcast(data, ary.data.type),
+        shape=kshape,
+        strides=kstrides,
+        itemsize=context.get_constant(types.intp, itemsize),
+        parent=None,
+        meminfo=None,
+    )
+
+    return ary._getvalue()
+
+
 @lower_constant(types.Record)
 def constant_record(context, builder, ty, pyval):
     """
numba_cuda/numba/cuda/np/numpy_support.py

@@ -3,7 +3,10 @@
 
 import collections
 import ctypes
+import itertools
+import operator
 import re
+
 import numpy as np
 
 from numba.cuda import types
@@ -17,6 +20,29 @@ from numba.cuda.cgutils import is_nonelike  # noqa: F401
 
 numpy_version = tuple(map(int, np.__version__.split(".")[:2]))
 
+
+def strides_from_shape(
+    shape: tuple[int, ...], itemsize: int, *, order: str
+) -> tuple[int, ...]:
+    """Compute strides for a contiguous array with given shape and order."""
+    if len(shape) == 0:
+        # 0-D arrays have empty strides
+        return ()
+    limits = slice(1, None) if order == "C" else slice(None, -1)
+    transform = reversed if order == "C" else lambda x: x
+    strides = tuple(
+        map(
+            itemsize.__mul__,
+            itertools.accumulate(
+                transform(shape[limits]), operator.mul, initial=1
+            ),
+        )
+    )
+    if order == "F":
+        return strides
+    return strides[::-1]
+
+
 FROM_DTYPE = {
     np.dtype("bool"): types.boolean,
     np.dtype("int8"): types.int8,
numba_cuda/numba/cuda/printimpl.py

@@ -32,6 +32,26 @@ def print_item(ty, context, builder, val):
     )
 
 
+@print_item.register(types.Tuple)
+@print_item.register(types.UniTuple)
+def tuple_print_impl(ty, context, builder, val):
+    formats = []
+    values = []
+
+    for i, argtyp in enumerate(ty.types):
+        argval = builder.extract_value(val, i)
+        argfmt, argvals = print_item(argtyp, context, builder, argval)
+        formats.append(argfmt)
+        values.extend(argvals)
+
+    if len(formats) == 1:
+        base = "({},)"
+    else:
+        base = "({})"
+    rawfmt = base.format(", ".join(formats))
+    return rawfmt, values
+
+
 @print_item.register(types.Integer)
 @print_item.register(types.IntegerLiteral)
 def int_print_impl(ty, context, builder, val):
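
This registers a `print_item` implementation for tuples, so `print` inside a kernel can now format tuple values, including the single-element `(x,)` form (compare the expanded `test_print.py` in the file list). A usage sketch:

```python
from numba import cuda


@cuda.jit
def show(a, b):
    print((a, b))  # formatted via the new tuple print_item registration
    print((a,))    # 1-tuples keep the trailing comma


show[1, 1](1, 2.5)
cuda.synchronize()  # flush the kernel's printf output
```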
numba_cuda/numba/cuda/serialize.py

@@ -197,6 +197,16 @@ class NumbaPickler(cloudpickle.CloudPickler):
         # Overridden to disable pickling of certain types
         if type(obj) in self.disabled_types:
             _no_pickle(obj)  # noreturn
+
+        # Prevent pickling of objects implementing __cuda_array_interface__
+        # These contain device pointers that would become stale after unpickling
+        if getattr(obj, "__cuda_array_interface__", None) is not None:
+            raise pickle.PicklingError(
+                "Cannot serialize kernels or device functions referencing "
+                "global device arrays. Pass the array(s) as arguments "
+                "to the kernel instead."
+            )
+
         return super().reducer_override(obj)
 
 
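With this guard, serializing a kernel that references a global device array fails fast rather than pickling a device pointer that would be stale in another process. A sketch of the failure mode; `serialize.dumps` is assumed here to be the module's `NumbaPickler`-based entry point:

```python
import numpy as np
from numba import cuda
from numba.cuda import serialize

d_scale = cuda.to_device(np.ones(4, dtype=np.float32))


@cuda.jit("void(float32[::1])")
def uses_capture(out):
    out[0] = d_scale[0]


try:
    serialize.dumps(uses_capture)  # assumed NumbaPickler entry point
except Exception as exc:
    # Expected: a PicklingError advising to pass the array as an argument
    print(type(exc).__name__, exc)
```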
numba_cuda/numba/cuda/stubs.py

@@ -200,17 +200,6 @@ class syncwarp(Stub):
     _description_ = "<warp_sync()>"
 
 
-class vote_sync_intrinsic(Stub):
-    """
-    vote_sync_intrinsic(mask, mode, predictate)
-
-    Nvvm intrinsic for performing a reduce and broadcast across a warp
-    docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-vote
-    """
-
-    _description_ = "<vote_sync()>"
-
-
 class match_any_sync(Stub):
     """
     match_any_sync(mask, value)
numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py

@@ -36,8 +36,13 @@ pytestmark = pytest.mark.skipif(
         ),
     ],
 )
-def test_one_arg(benchmark, array_func):
-    @cuda.jit("void(float32[:])")
+@pytest.mark.parametrize(
+    "jit",
+    [cuda.jit, cuda.jit("void(float32[::1])")],
+    ids=["dispatch", "signature"],
+)
+def test_one_arg(benchmark, array_func, jit):
+    @jit
     def one_arg(arr1):
         return
 
@@ -78,10 +83,22 @@ def test_one_arg(benchmark, array_func):
         ),
     ],
 )
-def test_many_args(benchmark, array_func):
+@pytest.mark.parametrize(
+    "jit",
+    [
+        cuda.jit,
+        cuda.jit(
+            "void({})".format(
+                ", ".join(["float32[::1]"] * len(string.ascii_lowercase))
+            )
+        ),
+    ],
+    ids=["dispatch", "signature"],
+)
+def test_many_args(benchmark, array_func, jit):
     many_arrs = array_func()
 
-    @cuda.jit("void({})".format(", ".join(["float32[:]"] * len(many_arrs))))
+    @jit
     def many_args(
         a,
         b,
numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py

@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 import numbers
-import weakref
 
 from numba import cuda
 from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
@@ -90,8 +89,8 @@ class Test3rdPartyContext(CUDATestCase):
         dev = driver.binding.CUdevice(0)
         binding_hctx = the_driver.cuDevicePrimaryCtxRetain(dev)
         hctx = driver.drvapi.cu_context(int(binding_hctx))
+        ctx = driver.Context(dev, hctx)
         try:
-            ctx = driver.Context(weakref.proxy(self), hctx)
             ctx.push()
             # Check that the context from numba matches the created primary
             # context.