numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +4 -1
  3. numba_cuda/numba/cuda/_compat.py +47 -0
  4. numba_cuda/numba/cuda/api.py +4 -1
  5. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
  7. numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
  8. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
  10. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  11. numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
  12. numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
  13. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  14. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
  15. numba_cuda/numba/cuda/codegen.py +46 -12
  16. numba_cuda/numba/cuda/compiler.py +15 -9
  17. numba_cuda/numba/cuda/core/analysis.py +29 -21
  18. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
  19. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  20. numba_cuda/numba/cuda/core/base.py +12 -11
  21. numba_cuda/numba/cuda/core/bytecode.py +21 -13
  22. numba_cuda/numba/cuda/core/byteflow.py +336 -90
  23. numba_cuda/numba/cuda/core/compiler.py +3 -4
  24. numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
  25. numba_cuda/numba/cuda/core/config.py +5 -7
  26. numba_cuda/numba/cuda/core/consts.py +1 -1
  27. numba_cuda/numba/cuda/core/controlflow.py +17 -9
  28. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  29. numba_cuda/numba/cuda/core/errors.py +4 -912
  30. numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
  31. numba_cuda/numba/cuda/core/interpreter.py +334 -160
  32. numba_cuda/numba/cuda/core/ir.py +191 -119
  33. numba_cuda/numba/cuda/core/ir_utils.py +149 -128
  34. numba_cuda/numba/cuda/core/postproc.py +8 -8
  35. numba_cuda/numba/cuda/core/pythonapi.py +3 -0
  36. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  37. numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
  38. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  39. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  40. numba_cuda/numba/cuda/core/ssa.py +5 -5
  41. numba_cuda/numba/cuda/core/transforms.py +29 -16
  42. numba_cuda/numba/cuda/core/typed_passes.py +10 -10
  43. numba_cuda/numba/cuda/core/typeinfer.py +42 -27
  44. numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
  45. numba_cuda/numba/cuda/cpython/unicode.py +2 -2
  46. numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
  47. numba_cuda/numba/cuda/cudadecl.py +0 -13
  48. numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
  49. numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
  50. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  51. numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
  52. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  53. numba_cuda/numba/cuda/debuginfo.py +25 -0
  54. numba_cuda/numba/cuda/descriptor.py +1 -1
  55. numba_cuda/numba/cuda/device_init.py +4 -7
  56. numba_cuda/numba/cuda/deviceufunc.py +3 -6
  57. numba_cuda/numba/cuda/dispatcher.py +39 -49
  58. numba_cuda/numba/cuda/intrinsics.py +150 -1
  59. numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
  60. numba_cuda/numba/cuda/lowering.py +36 -29
  61. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  62. numba_cuda/numba/cuda/np/arrayobj.py +61 -9
  63. numba_cuda/numba/cuda/np/numpy_support.py +32 -9
  64. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
  65. numba_cuda/numba/cuda/printimpl.py +20 -0
  66. numba_cuda/numba/cuda/serialize.py +10 -0
  67. numba_cuda/numba/cuda/stubs.py +0 -11
  68. numba_cuda/numba/cuda/testing.py +4 -8
  69. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  70. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  71. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
  72. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  73. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  74. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  75. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
  76. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  77. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
  78. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
  79. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
  80. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  81. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
  82. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
  83. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
  85. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
  86. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  87. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  88. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
  89. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  90. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  91. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
  92. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
  93. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  94. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  95. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  96. numba_cuda/numba/cuda/tests/support.py +11 -0
  97. numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
  98. numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
  99. numba_cuda/numba/cuda/typing/context.py +3 -1
  100. numba_cuda/numba/cuda/typing/typeof.py +51 -2
  101. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
  102. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
  103. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  104. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  105. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  106. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  107. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
  108. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
  109. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
  110. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
@@ -2,21 +2,25 @@
2
2
  # SPDX-License-Identifier: BSD-2-Clause
3
3
 
4
4
  from ctypes import c_int, sizeof
5
-
6
- from numba.cuda.cudadrv.driver import (
7
- host_to_device,
8
- device_to_host,
9
- driver,
10
- launch_kernel,
5
+ import cffi
6
+ import numpy as np
7
+
8
+ from numba.cuda.cudadrv.driver import host_to_device, device_to_host, driver
9
+ from numba.cuda._compat import (
10
+ LaunchConfig,
11
+ Device,
12
+ Stream as ExperimentalStream,
13
+ launch,
11
14
  )
12
15
 
13
16
  from numba import cuda
14
- from numba.cuda.cudadrv import devices, driver as _driver
15
- from numba.cuda.testing import unittest, CUDATestCase
17
+ from numba.cuda.cudadrv import devices, nvrtc
18
+ from numba.cuda.testing import unittest, CUDATestCase, skip_unless_cc_90
16
19
  from numba.cuda.testing import skip_on_cudasim
20
+ from numba.cuda.tests.support import override_config
21
+ from numba.core import types
17
22
  import contextlib
18
23
 
19
- from cuda.core.experimental import Device
20
24
 
21
25
  ptx1 = """
22
26
  .version 1.4
@@ -98,22 +102,15 @@ class TestCudaDriver(CUDATestCase):
98
102
  host_to_device(memory, array, sizeof(array))
99
103
 
100
104
  ptr = memory.device_ctypes_pointer
101
- stream = 0
102
-
103
- stream = _driver.binding.CUstream(stream)
104
-
105
- launch_kernel(
106
- function.handle, # Kernel
107
- 1,
108
- 1,
109
- 1, # gx, gy, gz
110
- 100,
111
- 1,
112
- 1, # bx, by, bz
113
- 0, # dynamic shared mem
114
- stream, # stream
115
- [ptr],
116
- ) # arguments
105
+
106
+ config = LaunchConfig(
107
+ grid=(1, 1, 1),
108
+ block=(100, 1, 1),
109
+ shmem_size=0,
110
+ cooperative_launch=False,
111
+ )
112
+ exp_stream = ExperimentalStream.from_handle(0)
113
+ launch(exp_stream, config, function.kernel, ptr)
117
114
 
118
115
  device_to_host(array, memory, sizeof(array))
119
116
  for i, v in enumerate(array):
@@ -122,6 +119,8 @@ class TestCudaDriver(CUDATestCase):
122
119
  module.unload()
123
120
 
124
121
  def test_cuda_driver_stream_operations(self):
122
+ from numba.cuda.cudadrv.driver import _to_core_stream
123
+
125
124
  module = self.context.create_module_ptx(self.ptx)
126
125
  function = module.get_function("_Z10helloworldPi")
127
126
 
@@ -135,21 +134,14 @@ class TestCudaDriver(CUDATestCase):
135
134
 
136
135
  ptr = memory.device_ctypes_pointer
137
136
 
138
- stream_handle = stream.handle
139
- stream_handle = stream_handle.value
140
-
141
- launch_kernel(
142
- function.handle, # Kernel
143
- 1,
144
- 1,
145
- 1, # gx, gy, gz
146
- 100,
147
- 1,
148
- 1, # bx, by, bz
149
- 0, # dynamic shared mem
150
- stream_handle, # stream
151
- [ptr],
152
- ) # arguments
137
+ config = LaunchConfig(
138
+ grid=(1, 1, 1),
139
+ block=(100, 1, 1),
140
+ shmem_size=0,
141
+ cooperative_launch=False,
142
+ )
143
+ # Convert numba Stream to ExperimentalStream
144
+ launch(_to_core_stream(stream), config, function.kernel, ptr)
153
145
 
154
146
  device_to_host(array, memory, sizeof(array), stream=stream)
155
147
 
@@ -177,18 +169,13 @@ class TestCudaDriver(CUDATestCase):
177
169
 
178
170
  ptr = memory.device_ctypes_pointer
179
171
 
180
- launch_kernel(
181
- function.handle, # Kernel
182
- 1,
183
- 1,
184
- 1, # gx, gy, gz
185
- 100,
186
- 1,
187
- 1, # bx, by, bz
188
- 0, # dynamic shared mem
189
- stream.handle, # stream
190
- [ptr],
172
+ config = LaunchConfig(
173
+ grid=(1, 1, 1),
174
+ block=(100, 1, 1),
175
+ shmem_size=0,
176
+ cooperative_launch=False,
191
177
  )
178
+ launch(stream, config, function.kernel, ptr)
192
179
 
193
180
  device_to_host(array, memory, sizeof(array), stream=stream)
194
181
  for i, v in enumerate(array):
@@ -285,6 +272,105 @@ class TestCudaDriver(CUDATestCase):
285
272
  self.assertTrue(grid > 0)
286
273
  self.assertTrue(block > 0)
287
274
 
275
+ def test_cuda_cache_config(self):
276
+ from numba import types
277
+ import numpy as np
278
+
279
+ sig = (types.float32[::1], types.float32[::1])
280
+
281
+ @cuda.jit(sig)
282
+ def add_one(r, x):
283
+ i = cuda.grid(1)
284
+ if i < len(r):
285
+ r[i] = x[i] + 1
286
+
287
+ kernel = add_one.overloads[sig]
288
+ cufunc = kernel._codelibrary.get_cufunc()
289
+
290
+ configs_to_test = [
291
+ ("prefer_shared", dict(prefer_shared=True)),
292
+ ("prefer_cache", dict(prefer_cache=True)),
293
+ ("prefer_equal", dict(prefer_equal=True)),
294
+ ("default", dict()),
295
+ ]
296
+
297
+ for name, kwargs in configs_to_test:
298
+ with self.subTest(config=name):
299
+ try:
300
+ cufunc.cache_config(**kwargs)
301
+ except Exception as e:
302
+ self.fail(f"cache_config({name}) failed: {e}")
303
+
304
+ x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
305
+ r = np.zeros_like(x)
306
+
307
+ d_x = cuda.to_device(x)
308
+ d_r = cuda.to_device(r)
309
+
310
+ cufunc.cache_config(prefer_shared=True)
311
+ add_one[1, 5](d_r, d_x)
312
+
313
+ result = d_r.copy_to_host()
314
+ expected = x + 1
315
+
316
+ np.testing.assert_array_almost_equal(
317
+ result,
318
+ expected,
319
+ err_msg="Kernel produced incorrect results after cache_config",
320
+ )
321
+
322
+ def test_cuda_set_shared_memory_carveout(self):
323
+ from numba import types
324
+ import numpy as np
325
+
326
+ sig = (types.float32[::1], types.float32[::1])
327
+
328
+ @cuda.jit(sig)
329
+ def add_one(r, x):
330
+ i = cuda.grid(1)
331
+ if i < len(r):
332
+ r[i] = x[i] + 1
333
+
334
+ kernel = add_one.overloads[sig]
335
+ cufunc = kernel._codelibrary.get_cufunc()
336
+
337
+ # valid carveout values
338
+ carveout_values = [-1, 0, 50, 100]
339
+ for value in carveout_values:
340
+ with self.subTest(carveout=value):
341
+ try:
342
+ cufunc.set_shared_memory_carveout(value)
343
+ except Exception as e:
344
+ self.fail(
345
+ f"set_shared_memory_carveout({value}) failed: {e}"
346
+ )
347
+
348
+ # invalid carveout values
349
+ invalid_values = [-2, 101, 150]
350
+ for value in invalid_values:
351
+ with self.subTest(invalid_carveout=value):
352
+ with self.assertRaises(ValueError):
353
+ cufunc.set_shared_memory_carveout(value)
354
+
355
+ # test the kernel
356
+ x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
357
+ r = np.zeros_like(x)
358
+
359
+ d_x = cuda.to_device(x)
360
+ d_r = cuda.to_device(r)
361
+
362
+ cufunc.set_shared_memory_carveout(75)
363
+ add_one[1, 5](d_r, d_x)
364
+
365
+ result = d_r.copy_to_host()
366
+ expected = x + 1
367
+
368
+ np.testing.assert_array_almost_equal(
369
+ result,
370
+ expected,
371
+ err_msg="Kernel produced incorrect results after set_shared_memory_carveout",
372
+ )
373
+
288
374
 
289
375
  class TestDevice(CUDATestCase):
290
376
  def test_device_get_uuid(self):
@@ -309,5 +395,63 @@ class TestDevice(CUDATestCase):
309
395
  self.assertRegex(dev.uuid, uuid_format)
310
396
 
311
397
 
398
+ @skip_on_cudasim("CUDA asm unsupported in the simulator")
399
+ class TestAcceleratedArchitecture(CUDATestCase):
400
+ @skip_unless_cc_90
401
+ def test_device_arch_specific(self):
402
+ set_desc = cuda.CUSource("""
403
+ #include <cuda_fp16.h>
404
+
405
+ extern "C" __device__
406
+ int set_descriptor(int *out, int* smem) {
407
+ unsigned usmem = __cvta_generic_to_shared(smem);
408
+ asm volatile("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], 2;" :: "r"(usmem));
409
+ return 0;
410
+ }
411
+ """)
412
+
413
+ set_descriptor = cuda.declare_device(
414
+ "set_descriptor",
415
+ types.int32(types.CPointer(types.int32)),
416
+ link=[set_desc],
417
+ )
418
+
419
+ ffi = cffi.FFI()
420
+
421
+ @cuda.jit
422
+ def kernel(a):
423
+ sm = cuda.shared.array(1, dtype=np.int32)
424
+ data_ptr = ffi.from_buffer(sm)
425
+ set_descriptor(data_ptr)
426
+
427
+ # just to prevent optimization:
428
+ sm[0] = 2
429
+ cuda.syncthreads()
430
+ a[0] = sm[0]
431
+
432
+ a = np.ones(1, dtype=np.int32)
433
+
434
+ kernel[1, 1](a)
435
+
436
+ assert a[0] == 2
437
+
438
+ def test_get_arch_option_force_cc(self):
439
+ with override_config("FORCE_CUDA_CC", (8, 0)):
440
+ arch = nvrtc.get_arch_option(9, 0, "a")
441
+ self.assertEqual("compute_80", arch)
442
+
443
+ def test_get_arch_option_force_cc_arch_specific(self):
444
+ with override_config("FORCE_CUDA_CC", (9, 0, "a")):
445
+ arch = nvrtc.get_arch_option(9, 0)
446
+ self.assertEqual("compute_90a", arch)
447
+
448
+ def test_get_arch_option_illegal_arch_specific(self):
449
+ # Using a fictitious very high compute capability (major 99) for this
450
+ # test to ensure future toolkits are unlikely to provide an exact match
451
+ msg = "Can't use arch-specific compute_990a with"
452
+ with self.assertRaisesRegex(ValueError, msg):
453
+ nvrtc.get_arch_option(99, 0, "a")
454
+
455
+
312
456
  if __name__ == "__main__":
313
457
  unittest.main()
@@ -87,13 +87,17 @@ class TestCudaMemory(CUDATestCase):
87
87
  dtor_invoked[0] += 1
88
88
 
89
89
  # Ensure finalizer is called when pointer is deleted
90
- ptr = driver.MemoryPointer(pointer=fake_ptr, size=40, finalizer=dtor)
90
+ ptr = driver.MemoryPointer(
91
+ context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
92
+ )
91
93
  self.assertEqual(dtor_invoked[0], 0)
92
94
  del ptr
93
95
  self.assertEqual(dtor_invoked[0], 1)
94
96
 
95
97
  # Ensure removing derived pointer doesn't call finalizer
96
- ptr = driver.MemoryPointer(pointer=fake_ptr, size=40, finalizer=dtor)
98
+ ptr = driver.MemoryPointer(
99
+ context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
100
+ )
97
101
  owned = ptr.own()
98
102
  del owned
99
103
  self.assertEqual(dtor_invoked[0], 1)
@@ -3,6 +3,7 @@
3
3
 
4
4
  import ctypes
5
5
  import numpy as np
6
+ import weakref
6
7
 
7
8
  from numba import cuda
8
9
  from numba.cuda.core import config
@@ -57,9 +58,10 @@ if not config.ENABLE_CUDASIM:
57
58
 
58
59
  # We use an AutoFreePointer so that the finalizer will be run when
59
60
  # the reference count drops to zero.
61
+ ctx = weakref.proxy(self.context)
60
62
  ptr = ctypes.c_void_p(alloc_count)
61
63
  return cuda.cudadrv.driver.AutoFreePointer(
62
- ptr, size, finalizer=finalizer
64
+ ctx, ptr, size, finalizer=finalizer
63
65
  )
64
66
 
65
67
  def initialize(self):
@@ -4,7 +4,7 @@
4
4
  import numpy as np
5
5
  from numba import cuda
6
6
  from numba.cuda.testing import unittest, CUDATestCase
7
- from cuda.core.experimental import Device
7
+ from numba.cuda._compat import Device
8
8
  from numba.cuda.testing import skip_on_cudasim
9
9
 
10
10
 
@@ -10,11 +10,12 @@ from numba.cuda.testing import (
10
10
  skip_if_nvjitlink_missing,
11
11
  )
12
12
  from numba.cuda.testing import CUDATestCase, test_data_dir
13
- from numba.cuda.cudadrv.driver import CudaAPIError, _Linker, LinkerError
13
+ from numba.cuda.cudadrv.driver import _Linker, LinkerError
14
14
  from numba.cuda import require_context
15
15
  from numba import cuda
16
16
  from numba.cuda import void, float64, int64, int32, float32
17
17
  from numba.cuda.typing.typeof import typeof
18
+ from numba.cuda._compat import CUDAError
18
19
 
19
20
  CONST1D = np.arange(10, dtype=np.float64)
20
21
 
@@ -113,7 +114,7 @@ class TestLinker(CUDATestCase):
113
114
  @require_context
114
115
  def test_linker_basic(self):
115
116
  """Simply go through the constructor and destructor"""
116
- linker = _Linker.new(cc=(7, 5))
117
+ linker = _Linker(max_registers=0, cc=(7, 5))
117
118
  del linker
118
119
 
119
120
  def _test_linking(self, eager):
@@ -195,7 +196,7 @@ class TestLinker(CUDATestCase):
195
196
 
196
197
  link = str(test_data_dir / "error.cu")
197
198
 
198
- from cuda.core.experimental._utils.cuda_utils import NVRTCError
199
+ from numba.cuda._compat import NVRTCError
199
200
 
200
201
  errty = NVRTCError
201
202
  with self.assertRaises(errty) as e:
@@ -308,10 +309,8 @@ class TestLinker(CUDATestCase):
308
309
  max_threads = compiled.get_max_threads_per_block()
309
310
  nelem = max_threads + 1
310
311
  ary = np.empty(nelem, dtype=np.int32)
311
- try:
312
+ with self.assertRaisesRegex(CUDAError, "CUDA_ERROR_INVALID_VALUE"):
312
313
  compiled[1, nelem](ary)
313
- except CudaAPIError as e:
314
- self.assertIn("cuLaunchKernel", e.msg)
315
314
 
316
315
  def test_get_local_mem_per_thread(self):
317
316
  sig = void(int32[::1], int32[::1], typeof(np.int32))
@@ -333,7 +332,7 @@ class TestLinker(CUDATestCase):
333
332
 
334
333
  @skip_if_nvjitlink_missing("nvJitLink not installed or new enough (>12.3)")
335
334
  def test_link_for_different_cc(self):
336
- linker = _Linker.new(cc=(7, 5), lto=True)
335
+ linker = _Linker(max_registers=0, cc=(7, 5), lto=True)
337
336
  code = """
338
337
  __device__ int foo(int x) {
339
338
  return x + 1;
@@ -13,11 +13,10 @@ from numba.cuda.testing import (
13
13
  CUDATestCase,
14
14
  skip_on_cudasim,
15
15
  )
16
+ from numba.cuda._compat import ObjectCode
16
17
 
17
18
  if not config.ENABLE_CUDASIM:
18
- from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
19
-
20
- from cuda.bindings.driver import CUmodule as cu_module_type
19
+ from cuda.bindings.driver import cuLibraryGetGlobal, cuMemcpyHtoD
21
20
 
22
21
 
23
22
  def wipe_all_modules_in_context():
@@ -31,8 +30,8 @@ def wipe_all_modules_in_context():
31
30
  ctx.reset()
32
31
 
33
32
 
34
- def get_hashable_handle_value(handle):
35
- return handle
33
+ def get_hashable_handle_value(object_code):
34
+ return object_code.handle
36
35
 
37
36
 
38
37
  @skip_on_cudasim("Module loading not implemented in the simulator")
@@ -40,13 +39,13 @@ class TestModuleCallbacksBasic(CUDATestCase):
40
39
  def test_basic(self):
41
40
  counter = 0
42
41
 
43
- def setup(handle):
44
- self.assertTrue(isinstance(handle, cu_module_type))
42
+ def setup(object_code):
43
+ self.assertIsInstance(object_code, ObjectCode)
45
44
  nonlocal counter
46
45
  counter += 1
47
46
 
48
- def teardown(handle):
49
- self.assertTrue(isinstance(handle, cu_module_type))
47
+ def teardown(object_code):
48
+ self.assertIsInstance(object_code, ObjectCode)
50
49
  nonlocal counter
51
50
  counter -= 1
52
51
 
@@ -183,10 +182,10 @@ __device__ int get_num(int &retval) {
183
182
  }
184
183
  """
185
184
 
186
- def set_forty_two(handle):
185
+ def set_forty_two(object_code):
187
186
  # Initialize 42 to global variable `num`
188
- res, dptr, size = cuModuleGetGlobal(
189
- get_hashable_handle_value(handle), "num".encode()
187
+ res, dptr, size = cuLibraryGetGlobal(
188
+ get_hashable_handle_value(object_code), b"num"
190
189
  )
191
190
 
192
191
  arr = np.array([42], np.int32)
@@ -43,6 +43,12 @@ if TEST_BIN_DIR:
43
43
  TEST_BIN_DIR, "test_device_functions.ltoir"
44
44
  )
45
45
 
46
+ require_cuobjdump = (
47
+ test_device_functions_fatbin_multi,
48
+ test_device_functions_fatbin,
49
+ test_device_functions_o,
50
+ )
51
+
46
52
 
47
53
  @unittest.skipIf(
48
54
  not TEST_BIN_DIR or not _have_nvjitlink(),
@@ -99,17 +105,50 @@ class TestLinker(CUDATestCase):
99
105
  kernel[1, 1](result)
100
106
  assert result[0] == 3
101
107
 
108
+ def test_nvjitlink_jit_with_invalid_linkable_code(self):
109
+ with open(test_device_functions_cubin, "rb") as f:
110
+ content = f.read()
111
+ with self.assertRaisesRegex(
112
+ TypeError, "Expected path to file or a LinkableCode"
113
+ ):
114
+
115
+ @cuda.jit("void()", link=[content])
116
+ def kernel():
117
+ pass
118
+
119
+
120
+ @unittest.skipIf(
121
+ not TEST_BIN_DIR or not _have_nvjitlink(),
122
+ "nvJitLink not installed or new enough (>12.3)",
123
+ )
124
+ @skip_on_cudasim("Linking unsupported in the simulator")
125
+ class TestLinkerDumpAssembly(CUDATestCase):
126
+ def setUp(self):
127
+ super().setUp()
128
+ self._prev_dump_assembly = config.DUMP_ASSEMBLY
129
+ config.DUMP_ASSEMBLY = True
130
+
131
+ def tearDown(self):
132
+ config.DUMP_ASSEMBLY = self._prev_dump_assembly
133
+ super().tearDown()
134
+
102
135
  def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self):
103
- files = [
136
+ files = (
104
137
  test_device_functions_cu,
105
138
  test_device_functions_ltoir,
106
139
  test_device_functions_fatbin_multi,
107
- ]
108
-
109
- config.DUMP_ASSEMBLY = True
140
+ )
110
141
 
111
142
  for file in files:
112
143
  with self.subTest(file=file):
144
+ if (
145
+ file in require_cuobjdump
146
+ and os.getenv("NUMBA_CUDA_TEST_WHEEL_ONLY") is not None
147
+ ):
148
+ self.skipTest(
149
+ "wheel-only environments do not have cuobjdump"
150
+ )
151
+
113
152
  f = io.StringIO()
114
153
  with contextlib.redirect_stdout(f):
115
154
  sig = "uint32(uint32, uint32)"
@@ -125,21 +164,25 @@ class TestLinker(CUDATestCase):
125
164
 
126
165
  self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue())
127
166
 
128
- config.DUMP_ASSEMBLY = False
129
-
130
167
  def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self):
131
- files = [
168
+ files = (
132
169
  test_device_functions_a,
133
170
  test_device_functions_cubin,
134
171
  test_device_functions_fatbin,
135
172
  test_device_functions_o,
136
173
  test_device_functions_ptx,
137
- ]
138
-
139
- config.DUMP_ASSEMBLY = True
174
+ )
140
175
 
141
176
  for file in files:
142
177
  with self.subTest(file=file):
178
+ if (
179
+ file in require_cuobjdump
180
+ and os.getenv("NUMBA_CUDA_TEST_WHEEL_ONLY") is not None
181
+ ):
182
+ self.skipTest(
183
+ "wheel-only environments do not have cuobjdump"
184
+ )
185
+
143
186
  sig = "uint32(uint32, uint32)"
144
187
  add_from_numba = cuda.declare_device("add_from_numba", sig)
145
188
 
@@ -156,19 +199,6 @@ class TestLinker(CUDATestCase):
156
199
  func(result)
157
200
  assert result[0] == 3
158
201
 
159
- config.DUMP_ASSEMBLY = False
160
-
161
- def test_nvjitlink_jit_with_invalid_linkable_code(self):
162
- with open(test_device_functions_cubin, "rb") as f:
163
- content = f.read()
164
- with self.assertRaisesRegex(
165
- TypeError, "Expected path to file or a LinkableCode"
166
- ):
167
-
168
- @cuda.jit("void()", link=[content])
169
- def kernel():
170
- pass
171
-
172
202
 
173
203
  if __name__ == "__main__":
174
204
  unittest.main()
@@ -854,13 +854,25 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
854
854
  _CONST2 = "PLACEHOLDER2"
855
855
  return _CONST2 + 4
856
856
 
857
- new = self._literal_const_sample_generator(impl, {1: 0, 3: 20})
857
+ if PYVERSION in ((3, 14),):
858
+ # The order of the __code__.co_consts changes with 3.14
859
+ new = self._literal_const_sample_generator(impl, {0: 0, 2: 20})
860
+ elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
861
+ new = self._literal_const_sample_generator(impl, {1: 0, 3: 20})
862
+ else:
863
+ raise NotImplementedError(PYVERSION)
858
864
  iconst = impl.__code__.co_consts
859
865
  nconst = new.__code__.co_consts
860
- self.assertEqual(
861
- iconst, (None, "PLACEHOLDER1", 3.14159, "PLACEHOLDER2", 4)
862
- )
863
- self.assertEqual(nconst, (None, 0, 3.14159, 20, 4))
866
+ if PYVERSION in ((3, 14),):
867
+ self.assertEqual(iconst, ("PLACEHOLDER1", 3.14159, "PLACEHOLDER2"))
868
+ self.assertEqual(nconst, (0, 3.14159, 20))
869
+ elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
870
+ self.assertEqual(
871
+ iconst, (None, "PLACEHOLDER1", 3.14159, "PLACEHOLDER2", 4)
872
+ )
873
+ self.assertEqual(nconst, (None, 0, 3.14159, 20, 4))
874
+ else:
875
+ raise NotImplementedError(PYVERSION)
864
876
  self.assertEqual(impl(None), 3.14159)
865
877
  self.assertEqual(new(None), 24)
866
878
 
@@ -872,7 +884,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
872
884
 
873
885
  for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
874
886
  for const in c_inp:
875
- func = self._literal_const_sample_generator(impl, {1: const})
887
+ if PYVERSION in ((3, 14),):
888
+ # The order of the __code__.co_consts changes with 3.14
889
+ func = self._literal_const_sample_generator(
890
+ impl, {0: const}
891
+ )
892
+ elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
893
+ func = self._literal_const_sample_generator(
894
+ impl, {1: const}
895
+ )
896
+ else:
897
+ raise NotImplementedError(PYVERSION)
876
898
  self.assert_prune(
877
899
  func, (types.NoneType("none"),), [prune], None
878
900
  )
@@ -885,7 +907,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
885
907
 
886
908
  for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
887
909
  for const in c_inp:
888
- func = self._literal_const_sample_generator(impl, {1: const})
910
+ if PYVERSION in ((3, 14),):
911
+ # The order of the __code__.co_consts changes with 3.14
912
+ func = self._literal_const_sample_generator(
913
+ impl, {0: const}
914
+ )
915
+ elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
916
+ func = self._literal_const_sample_generator(
917
+ impl, {1: const}
918
+ )
919
+ else:
920
+ raise NotImplementedError(PYVERSION)
889
921
  self.assert_prune(
890
922
  func, (types.NoneType("none"),), [prune], None
891
923
  )
@@ -900,7 +932,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
900
932
 
901
933
  for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
902
934
  for const in c_inp:
903
- func = self._literal_const_sample_generator(impl, {1: const})
935
+ if PYVERSION in ((3, 14),):
936
+ # The order of the __code__.co_consts changes with 3.14
937
+ func = self._literal_const_sample_generator(
938
+ impl, {0: const}
939
+ )
940
+ elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
941
+ func = self._literal_const_sample_generator(
942
+ impl, {1: const}
943
+ )
944
+ else:
945
+ raise NotImplementedError(PYVERSION)
904
946
  self.assert_prune(
905
947
  func, (types.NoneType("none"),), [prune], None
906
948
  )
@@ -915,7 +957,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
915
957
 
916
958
  for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
917
959
  for const in c_inp:
918
- func = self._literal_const_sample_generator(impl, {1: const})
960
+ if PYVERSION in ((3, 14),):
961
+ # The order of the __code__.co_consts changes with 3.14
962
+ func = self._literal_const_sample_generator(
963
+ impl, {0: const}
964
+ )
965
+ elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
966
+ func = self._literal_const_sample_generator(
967
+ impl, {1: const}
968
+ )
969
+ else:
970
+ raise NotImplementedError(PYVERSION)
919
971
  self.assert_prune(
920
972
  func, (types.NoneType("none"),), [prune], None
921
973
  )