numba-cuda 0.21.1-cp313-cp313-win_amd64.whl → 0.23.0-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/api.py +4 -1
  3. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  4. numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
  5. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  7. numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
  8. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/codegen.py +42 -10
  10. numba_cuda/numba/cuda/compiler.py +10 -4
  11. numba_cuda/numba/cuda/core/analysis.py +29 -21
  12. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  13. numba_cuda/numba/cuda/core/base.py +6 -1
  14. numba_cuda/numba/cuda/core/consts.py +1 -1
  15. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  16. numba_cuda/numba/cuda/core/errors.py +4 -912
  17. numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
  18. numba_cuda/numba/cuda/core/interpreter.py +79 -64
  19. numba_cuda/numba/cuda/core/ir.py +191 -119
  20. numba_cuda/numba/cuda/core/ir_utils.py +142 -112
  21. numba_cuda/numba/cuda/core/postproc.py +8 -8
  22. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  23. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  24. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  25. numba_cuda/numba/cuda/core/ssa.py +3 -3
  26. numba_cuda/numba/cuda/core/transforms.py +25 -10
  27. numba_cuda/numba/cuda/core/typed_passes.py +9 -9
  28. numba_cuda/numba/cuda/core/typeinfer.py +39 -24
  29. numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
  30. numba_cuda/numba/cuda/cudadecl.py +0 -13
  31. numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
  32. numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
  33. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  34. numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
  35. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  36. numba_cuda/numba/cuda/debuginfo.py +104 -10
  37. numba_cuda/numba/cuda/descriptor.py +1 -1
  38. numba_cuda/numba/cuda/device_init.py +4 -7
  39. numba_cuda/numba/cuda/dispatcher.py +36 -32
  40. numba_cuda/numba/cuda/intrinsics.py +150 -1
  41. numba_cuda/numba/cuda/lowering.py +64 -29
  42. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  43. numba_cuda/numba/cuda/np/arrayobj.py +54 -0
  44. numba_cuda/numba/cuda/np/numpy_support.py +26 -0
  45. numba_cuda/numba/cuda/printimpl.py +20 -0
  46. numba_cuda/numba/cuda/serialize.py +10 -0
  47. numba_cuda/numba/cuda/stubs.py +0 -11
  48. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  49. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  50. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
  51. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  52. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  53. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
  54. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  55. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
  56. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  57. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
  58. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
  59. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  60. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  61. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  62. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  63. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
  64. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  65. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  66. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  67. numba_cuda/numba/cuda/typing/context.py +3 -1
  68. numba_cuda/numba/cuda/typing/typeof.py +56 -0
  69. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
  70. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
  71. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  72. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  73. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  74. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  75. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
  76. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
  77. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
  78. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py
@@ -3,15 +3,15 @@
 
 from ctypes import c_int, sizeof
 
-from numba.cuda.cudadrv.driver import (
-    host_to_device,
-    device_to_host,
-    driver,
-    launch_kernel,
+from numba.cuda.cudadrv.driver import host_to_device, device_to_host, driver
+from cuda.core.experimental import (
+    LaunchConfig,
+    Stream as ExperimentalStream,
+    launch,
 )
 
 from numba import cuda
-from numba.cuda.cudadrv import devices, driver as _driver
+from numba.cuda.cudadrv import devices
 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda.testing import skip_on_cudasim
 import contextlib
@@ -98,22 +98,15 @@ class TestCudaDriver(CUDATestCase):
         host_to_device(memory, array, sizeof(array))
 
         ptr = memory.device_ctypes_pointer
-        stream = 0
-
-        stream = _driver.binding.CUstream(stream)
-
-        launch_kernel(
-            function.handle,  # Kernel
-            1,
-            1,
-            1,  # gx, gy, gz
-            100,
-            1,
-            1,  # bx, by, bz
-            0,  # dynamic shared mem
-            stream,  # stream
-            [ptr],
-        )  # arguments
+
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
+        )
+        exp_stream = ExperimentalStream.from_handle(0)
+        launch(exp_stream, config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array))
         for i, v in enumerate(array):
@@ -122,6 +115,8 @@ class TestCudaDriver(CUDATestCase):
         module.unload()
 
     def test_cuda_driver_stream_operations(self):
+        from numba.cuda.cudadrv.driver import _to_core_stream
+
         module = self.context.create_module_ptx(self.ptx)
         function = module.get_function("_Z10helloworldPi")
 
@@ -135,21 +130,14 @@ class TestCudaDriver(CUDATestCase):
 
             ptr = memory.device_ctypes_pointer
 
-            stream_handle = stream.handle
-            stream_handle = stream_handle.value
-
-            launch_kernel(
-                function.handle,  # Kernel
-                1,
-                1,
-                1,  # gx, gy, gz
-                100,
-                1,
-                1,  # bx, by, bz
-                0,  # dynamic shared mem
-                stream_handle,  # stream
-                [ptr],
-            )  # arguments
+            config = LaunchConfig(
+                grid=(1, 1, 1),
+                block=(100, 1, 1),
+                shmem_size=0,
+                cooperative_launch=False,
+            )
+            # Convert numba Stream to ExperimentalStream
+            launch(_to_core_stream(stream), config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array), stream=stream)
 
@@ -177,18 +165,13 @@ class TestCudaDriver(CUDATestCase):
 
         ptr = memory.device_ctypes_pointer
 
-        launch_kernel(
-            function.handle,  # Kernel
-            1,
-            1,
-            1,  # gx, gy, gz
-            100,
-            1,
-            1,  # bx, by, bz
-            0,  # dynamic shared mem
-            stream.handle,  # stream
-            [ptr],
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
         )
+        launch(stream, config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array), stream=stream)
         for i, v in enumerate(array):
@@ -285,6 +268,105 @@ class TestCudaDriver(CUDATestCase):
         self.assertTrue(grid > 0)
         self.assertTrue(block > 0)
 
+    def test_cuda_cache_config(self):
+        from numba import types
+        import numpy as np
+
+        sig = (types.float32[::1], types.float32[::1])
+
+        @cuda.jit(sig)
+        def add_one(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + 1
+
+        kernel = add_one.overloads[sig]
+        cufunc = kernel._codelibrary.get_cufunc()
+
+        configs_to_test = [
+            ("prefer_shared", dict(prefer_shared=True)),
+            ("prefer_cache", dict(prefer_cache=True)),
+            ("prefer_equal", dict(prefer_equal=True)),
+            ("default", dict()),
+        ]
+
+        for name, kwargs in configs_to_test:
+            with self.subTest(config=name):
+                try:
+                    cufunc.cache_config(**kwargs)
+                except Exception as e:
+                    self.fail(f"cache_config({name}) failed: {e}")
+
+        x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+        r = np.zeros_like(x)
+
+        d_x = cuda.to_device(x)
+        d_r = cuda.to_device(r)
+
+        cufunc.cache_config(prefer_shared=True)
+        add_one[1, 5](d_r, d_x)
+
+        result = d_r.copy_to_host()
+        expected = x + 1
+
+        np.testing.assert_array_almost_equal(
+            result,
+            expected,
+            err_msg="Kernel produced incorrect results after cache_config",
+        )
+
+    def test_cuda_set_shared_memory_carveout(self):
+        from numba import types
+        import numpy as np
+
+        sig = (types.float32[::1], types.float32[::1])
+
+        @cuda.jit(sig)
+        def add_one(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + 1
+
+        kernel = add_one.overloads[sig]
+        cufunc = kernel._codelibrary.get_cufunc()
+
+        # valid carveout values
+        carveout_values = [-1, 0, 50, 100]
+        for value in carveout_values:
+            with self.subTest(carveout=value):
+                try:
+                    cufunc.set_shared_memory_carveout(value)
+                except Exception as e:
+                    self.fail(
+                        f"set_shared_memory_carveout({value}) failed: {e}"
+                    )
+
+        # invalid carveout values
+        invalid_values = [-2, 101, 150]
+        for value in invalid_values:
+            with self.subTest(invalid_carveout=value):
+                with self.assertRaises(ValueError):
+                    cufunc.set_shared_memory_carveout(value)
+
+        # test the kernel
+        x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+        r = np.zeros_like(x)
+
+        d_x = cuda.to_device(x)
+        d_r = cuda.to_device(r)
+
+        cufunc.set_shared_memory_carveout(75)
+        add_one[1, 5](d_r, d_x)
+
+        result = d_r.copy_to_host()
+        expected = x + 1
+
+        np.testing.assert_array_almost_equal(
+            result,
+            expected,
+            err_msg="Kernel produced incorrect results after set_shared_memory_carveout",
+        )
+
 
 class TestDevice(CUDATestCase):
     def test_device_get_uuid(self):
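
Note: the driver tests above drop the removed numba.cuda.cudadrv.driver.launch_kernel helper in favour of cuda.core.experimental's LaunchConfig and launch. A minimal sketch of that pattern, assuming a kernel already loaded the way the tests load the helloworld PTX (the helper name below is illustrative, not part of the package):

    from cuda.core.experimental import LaunchConfig, Stream, launch

    def launch_hello(function, arg_ptr):
        # Grid/block shape and dynamic shared memory now live on a config
        # object instead of being positional arguments to launch_kernel.
        config = LaunchConfig(
            grid=(1, 1, 1),
            block=(100, 1, 1),
            shmem_size=0,
            cooperative_launch=False,
        )
        # Handle 0 is the default stream, wrapped the same way the tests do.
        stream = Stream.from_handle(0)
        # function.kernel is the cuda.core kernel object; arg_ptr is the
        # device pointer passed as the kernel's single argument.
        launch(stream, config, function.kernel, arg_ptr)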
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py
@@ -87,13 +87,17 @@ class TestCudaMemory(CUDATestCase):
             dtor_invoked[0] += 1
 
         # Ensure finalizer is called when pointer is deleted
-        ptr = driver.MemoryPointer(pointer=fake_ptr, size=40, finalizer=dtor)
+        ptr = driver.MemoryPointer(
+            context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
+        )
         self.assertEqual(dtor_invoked[0], 0)
         del ptr
         self.assertEqual(dtor_invoked[0], 1)
 
         # Ensure removing derived pointer doesn't call finalizer
-        ptr = driver.MemoryPointer(pointer=fake_ptr, size=40, finalizer=dtor)
+        ptr = driver.MemoryPointer(
+            context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
+        )
         owned = ptr.own()
         del owned
         self.assertEqual(dtor_invoked[0], 1)
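
Note: MemoryPointer now takes its owning context explicitly. A short sketch mirroring the test above, assuming an active context (fake_ptr and dtor stand in for a real device address and finalizer):

    import ctypes
    from numba import cuda
    from numba.cuda.cudadrv import driver

    ctx = cuda.current_context()
    fake_ptr = ctypes.c_void_p(0x1234)  # placeholder address, never dereferenced
    dtor_invoked = [0]

    def dtor():
        dtor_invoked[0] += 1

    # context= is now passed alongside pointer, size and finalizer.
    ptr = driver.MemoryPointer(
        context=ctx, pointer=fake_ptr, size=40, finalizer=dtor
    )
    del ptr  # the finalizer runs once the last reference is gone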
numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py
@@ -3,6 +3,7 @@
 
 import ctypes
 import numpy as np
+import weakref
 
 from numba import cuda
 from numba.cuda.core import config
@@ -57,9 +58,10 @@ if not config.ENABLE_CUDASIM:
 
             # We use an AutoFreePointer so that the finalizer will be run when
             # the reference count drops to zero.
+            ctx = weakref.proxy(self.context)
             ptr = ctypes.c_void_p(alloc_count)
             return cuda.cudadrv.driver.AutoFreePointer(
-                ptr, size, finalizer=finalizer
+                ctx, ptr, size, finalizer=finalizer
             )
 
         def initialize(self):
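
Note: AutoFreePointer, used by the EMM plugin test, gains the same leading context argument; the test passes a weakref proxy so the plugin does not keep the context alive. A sketch of a memalloc in that style, under the assumption that the plugin class (like the test's) derives from HostOnlyCUDAMemoryManager and exposes self.context (the class name is illustrative):

    import ctypes
    import weakref
    from numba import cuda

    class SketchEMMPlugin(cuda.HostOnlyCUDAMemoryManager):
        def memalloc(self, size):
            # A real plugin would allocate device memory here; this mirrors
            # the test, which fakes the allocation.
            alloc = ctypes.c_void_p(1)

            def finalizer():
                pass  # free the allocation in a real plugin

            # The context goes first, as a weakref proxy so the pointer does
            # not extend the context's lifetime.
            ctx = weakref.proxy(self.context)
            return cuda.cudadrv.driver.AutoFreePointer(
                ctx, alloc, size, finalizer=finalizer
            )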
numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
@@ -10,11 +10,12 @@ from numba.cuda.testing import (
     skip_if_nvjitlink_missing,
 )
 from numba.cuda.testing import CUDATestCase, test_data_dir
-from numba.cuda.cudadrv.driver import CudaAPIError, _Linker, LinkerError
+from numba.cuda.cudadrv.driver import _Linker, LinkerError
 from numba.cuda import require_context
 from numba import cuda
 from numba.cuda import void, float64, int64, int32, float32
 from numba.cuda.typing.typeof import typeof
+from cuda.core.experimental._utils.cuda_utils import CUDAError
 
 CONST1D = np.arange(10, dtype=np.float64)
 
@@ -113,7 +114,7 @@ class TestLinker(CUDATestCase):
     @require_context
     def test_linker_basic(self):
         """Simply go through the constructor and destructor"""
-        linker = _Linker.new(cc=(7, 5))
+        linker = _Linker(max_registers=0, cc=(7, 5))
         del linker
 
     def _test_linking(self, eager):
@@ -308,10 +309,8 @@ class TestLinker(CUDATestCase):
         max_threads = compiled.get_max_threads_per_block()
         nelem = max_threads + 1
         ary = np.empty(nelem, dtype=np.int32)
-        try:
+        with self.assertRaisesRegex(CUDAError, "CUDA_ERROR_INVALID_VALUE"):
             compiled[1, nelem](ary)
-        except CudaAPIError as e:
-            self.assertIn("cuLaunchKernel", e.msg)
 
     def test_get_local_mem_per_thread(self):
         sig = void(int32[::1], int32[::1], typeof(np.int32))
@@ -333,7 +332,7 @@ class TestLinker(CUDATestCase):
 
     @skip_if_nvjitlink_missing("nvJitLink not installed or new enough (>12.3)")
     def test_link_for_different_cc(self):
-        linker = _Linker.new(cc=(7, 5), lto=True)
+        linker = _Linker(max_registers=0, cc=(7, 5), lto=True)
         code = """
         __device__ int foo(int x) {
             return x + 1;
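
Note: the linker tests construct _Linker directly (the _Linker.new factory is gone) and expect cuda.core's CUDAError from a failed launch instead of CudaAPIError. A hedged sketch of both patterns, assuming an active context and the new binding layer:

    from numba.cuda.cudadrv.driver import _Linker
    from cuda.core.experimental._utils.cuda_utils import CUDAError

    # Direct construction; max_registers=0 and cc=(7, 5) match what the
    # updated tests pass.
    linker = _Linker(max_registers=0, cc=(7, 5))
    del linker

    # Launch failures now surface as CUDAError carrying the raw driver error
    # name, so the tests match on it directly, e.g.:
    #     with self.assertRaisesRegex(CUDAError, "CUDA_ERROR_INVALID_VALUE"):
    #         compiled[1, too_many_threads](ary)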
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
@@ -13,11 +13,10 @@ from numba.cuda.testing import (
     CUDATestCase,
     skip_on_cudasim,
 )
+from cuda.core.experimental import ObjectCode
 
 if not config.ENABLE_CUDASIM:
-    from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
-
-    from cuda.bindings.driver import CUmodule as cu_module_type
+    from cuda.bindings.driver import cuLibraryGetGlobal, cuMemcpyHtoD
 
 
 def wipe_all_modules_in_context():
@@ -31,8 +30,8 @@ def wipe_all_modules_in_context():
     ctx.reset()
 
 
-def get_hashable_handle_value(handle):
-    return handle
+def get_hashable_handle_value(object_code):
+    return object_code.handle
 
 
 @skip_on_cudasim("Module loading not implemented in the simulator")
@@ -40,13 +39,13 @@ class TestModuleCallbacksBasic(CUDATestCase):
     def test_basic(self):
         counter = 0
 
-        def setup(handle):
-            self.assertTrue(isinstance(handle, cu_module_type))
+        def setup(object_code):
+            self.assertIsInstance(object_code, ObjectCode)
             nonlocal counter
             counter += 1
 
-        def teardown(handle):
-            self.assertTrue(isinstance(handle, cu_module_type))
+        def teardown(object_code):
+            self.assertIsInstance(object_code, ObjectCode)
             nonlocal counter
             counter -= 1
 
@@ -183,10 +182,10 @@ __device__ int get_num(int &retval) {
 }
 """
 
-        def set_forty_two(handle):
+        def set_forty_two(object_code):
             # Initialize 42 to global variable `num`
-            res, dptr, size = cuModuleGetGlobal(
-                get_hashable_handle_value(handle), "num".encode()
+            res, dptr, size = cuLibraryGetGlobal(
+                get_hashable_handle_value(object_code), b"num"
             )
 
             arr = np.array([42], np.int32)
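
Note: module load/unload callbacks now receive a cuda.core ObjectCode rather than a raw CUmodule, and device globals are resolved with cuLibraryGetGlobal against object_code.handle. A sketch of a setup callback in that style; the global name "num" comes from the test's CUDA source, and the trailing cuMemcpyHtoD call mirrors the driver API (treat that exact copy step as an assumption):

    import numpy as np
    from cuda.core.experimental import ObjectCode
    from cuda.bindings.driver import cuLibraryGetGlobal, cuMemcpyHtoD

    def setup(object_code):
        # The callback argument is an ObjectCode; .handle is the loaded
        # library that the driver API operates on.
        assert isinstance(object_code, ObjectCode)
        res, dptr, size = cuLibraryGetGlobal(object_code.handle, b"num")
        # Write 42 into the module-level __device__ variable.
        arr = np.array([42], np.int32)
        cuMemcpyHtoD(dptr, arr.ctypes.data, size)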
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
@@ -99,6 +99,33 @@ class TestLinker(CUDATestCase):
         kernel[1, 1](result)
         assert result[0] == 3
 
+    def test_nvjitlink_jit_with_invalid_linkable_code(self):
+        with open(test_device_functions_cubin, "rb") as f:
+            content = f.read()
+        with self.assertRaisesRegex(
+            TypeError, "Expected path to file or a LinkableCode"
+        ):
+
+            @cuda.jit("void()", link=[content])
+            def kernel():
+                pass
+
+
+@unittest.skipIf(
+    not TEST_BIN_DIR or not _have_nvjitlink(),
+    "nvJitLink not installed or new enough (>12.3)",
+)
+@skip_on_cudasim("Linking unsupported in the simulator")
+class TestLinkerDumpAssembly(CUDATestCase):
+    def setUp(self):
+        super().setUp()
+        self._prev_dump_assembly = config.DUMP_ASSEMBLY
+        config.DUMP_ASSEMBLY = True
+
+    def tearDown(self):
+        config.DUMP_ASSEMBLY = self._prev_dump_assembly
+        super().tearDown()
+
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self):
         files = [
             test_device_functions_cu,
@@ -106,8 +133,6 @@ class TestLinker(CUDATestCase):
             test_device_functions_fatbin_multi,
         ]
 
-        config.DUMP_ASSEMBLY = True
-
         for file in files:
             with self.subTest(file=file):
                 f = io.StringIO()
@@ -125,8 +150,6 @@ class TestLinker(CUDATestCase):
 
                 self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue())
 
-        config.DUMP_ASSEMBLY = False
-
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self):
         files = [
             test_device_functions_a,
@@ -136,8 +159,6 @@ class TestLinker(CUDATestCase):
             test_device_functions_ptx,
         ]
 
-        config.DUMP_ASSEMBLY = True
-
        for file in files:
            with self.subTest(file=file):
                sig = "uint32(uint32, uint32)"
@@ -156,19 +177,6 @@ class TestLinker(CUDATestCase):
                 func(result)
                 assert result[0] == 3
 
-        config.DUMP_ASSEMBLY = False
-
-    def test_nvjitlink_jit_with_invalid_linkable_code(self):
-        with open(test_device_functions_cubin, "rb") as f:
-            content = f.read()
-        with self.assertRaisesRegex(
-            TypeError, "Expected path to file or a LinkableCode"
-        ):
-
-            @cuda.jit("void()", link=[content])
-            def kernel():
-                pass
-
 
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_caching.py
@@ -25,6 +25,11 @@ from numba.cuda.tests.support import (
     temp_directory,
     import_dynamic,
 )
+import numpy as np
+from pickle import PicklingError
+
+# Module-level global for testing that caching rejects global device arrays
+GLOBAL_DEVICE_ARRAY = None
 
 
 class BaseCacheTest(TestCase):
@@ -368,6 +373,48 @@ class CUDACachingTest(DispatcherCacheUsecasesTest):
         def f():
             pass
 
+    def test_cannot_cache_captured_device_array(self):
+        # Test that kernels capturing device arrays from closures cannot
+        # be cached. The error can come from either NumbaPickler (for closure
+        # variables) or CUDACodeLibrary._reduce_states (for referenced objects).
+        host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        captured_arr = cuda.to_device(host_data)
+
+        msg = "global device arrays"
+        with self.assertRaisesRegex(PicklingError, msg):
+
+            @cuda.jit(cache=True)
+            def cached_kernel(output):
+                i = cuda.grid(1)
+                if i < output.size:
+                    output[i] = captured_arr[i] * 2.0
+
+            output = cuda.device_array(3, dtype=np.float32)
+            cached_kernel[1, 3](output)
+
+    def test_cannot_cache_global_device_array(self):
+        # Test that kernels referencing module-level global device arrays
+        # cannot be cached.
+        global GLOBAL_DEVICE_ARRAY
+
+        host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        GLOBAL_DEVICE_ARRAY = cuda.to_device(host_data)
+
+        try:
+            msg = "global device arrays"
+            with self.assertRaisesRegex(PicklingError, msg):
+
+                @cuda.jit(cache=True)
+                def cached_kernel_global(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = GLOBAL_DEVICE_ARRAY[i] * 2.0
+
+                output = cuda.device_array(3, dtype=np.float32)
+                cached_kernel_global[1, 3](output)
+        finally:
+            GLOBAL_DEVICE_ARRAY = None
+
 
 @skip_on_cudasim("Simulator does not implement caching")
 class CUDACooperativeGroupTest(DispatcherCacheUsecasesTest):
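
Note: the new caching tests pin down that a kernel which closes over a device array, or reads one from a module-level global, raises pickle.PicklingError when cache=True. The cacheable alternative is to pass device arrays as kernel arguments; a minimal sketch:

    import numpy as np
    from numba import cuda

    @cuda.jit(cache=True)
    def scale_by_two(output, values):
        i = cuda.grid(1)
        if i < output.size:
            output[i] = values[i] * 2.0

    values = cuda.to_device(np.array([1.0, 2.0, 3.0], dtype=np.float32))
    output = cuda.device_array(3, dtype=np.float32)
    # The device array enters only as an argument, so no device-side object
    # has to be pickled into the cache entry.
    scale_by_two[1, 3](output, values)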
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
@@ -169,6 +169,16 @@ class TestCompile(unittest.TestCase):
         # ending in the filename of this module.
         self.assertRegex(ptx, '\\.file.*test_compiler.py"')
 
+    # We did test for the presence of debuginfo here, but in practice it made
+    # no sense - the C ABI wrapper generates a call instruction that has
+    # nothing to correlate with the DWARF, so it would confuse the debugger
+    # immediately anyway. With the resolution of Issue #588 (using separate
+    # translation of each IR module when debuginfo is enabled) the debuginfo
+    # isn't even produced for the ABI wrapper, because there was none present
+    # in that module anyway. So this test can only be expected to fail until we
+    # have a proper way of generating device functions with the C ABI without
+    # requiring the hack of generating a wrapper.
+    @unittest.expectedFailure
     def test_device_function_with_debug(self):
         # See Issue #6719 - this ensures that compilation with debug succeeds
         # with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
@@ -6,6 +6,7 @@ from numba.cuda.tests.support import override_config, captured_stdout
 from numba.cuda.testing import skip_on_cudasim
 from numba import cuda
 from numba.cuda import types
+from numba.cuda.np import numpy_support
 from numba.cuda.testing import CUDATestCase
 from numba.cuda.core import config
 from textwrap import dedent
@@ -884,6 +885,94 @@ class TestCudaDebugInfo(CUDATestCase):
            """,
        )
 
+    # shared_arr -> composite -> elements[4] (data field at index 4) -> pointer with dwarfAddressSpace: 8
+    # local_arr -> composite -> elements[4] (data field at index 4) -> pointer without dwarfAddressSpace: 8
+    address_class_filechecks = r"""
+    CHECK-DAG: [[SHARED_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "shared_arr"{{.*}}type: [[SHARED_COMPOSITE:![0-9]+]]
+    CHECK-DAG: [[SHARED_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[SHARED_ELEMENTS:![0-9]+]]
+    CHECK-DAG: [[SHARED_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[SHARED_DATA:![0-9]+]], {{.*}}, {{.*}}}
+    CHECK-DAG: [[SHARED_DATA]] = !DIDerivedType(baseType: [[SHARED_PTR:![0-9]+]], name: "data"
+    CHECK-DAG: [[SHARED_PTR]] = !DIDerivedType({{.*}}dwarfAddressSpace: 8{{.*}}tag: DW_TAG_pointer_type
+
+    CHECK-DAG: [[LOCAL_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "local_arr"{{.*}}type: [[LOCAL_COMPOSITE:![0-9]+]]
+    CHECK-DAG: [[LOCAL_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[LOCAL_ELEMENTS:![0-9]+]]
+    CHECK-DAG: [[LOCAL_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[LOCAL_DATA:![0-9]+]], {{.*}}, {{.*}}}
+    CHECK-DAG: [[LOCAL_DATA]] = !DIDerivedType(baseType: [[LOCAL_PTR:![0-9]+]], name: "data"
+    CHECK-DAG: [[LOCAL_PTR]] = !DIDerivedType(baseType: {{.*}}tag: DW_TAG_pointer_type
+    CHECK-NOT: [[LOCAL_PTR]]{{.*}}dwarfAddressSpace: 8
+    """
+
+    def _test_shared_memory_address_class(self, dtype):
+        """Test that shared memory arrays have correct DWARF address class.
+
+        Shared memory pointers should have addressClass: 8 (DW_AT_address_class
+        for CUDA shared memory) in their debug metadata, while regular local
+        arrays should not have this annotation.
+        """
+        sig = (numpy_support.from_dtype(dtype),)
+
+        @cuda.jit(sig, debug=True, opt=False)
+        def kernel_with_shared(data):
+            shared_arr = cuda.shared.array(32, dtype=dtype)
+            local_arr = cuda.local.array(32, dtype=dtype)
+            idx = cuda.grid(1)
+            if idx < 32:
+                shared_arr[idx] = data + idx
+                local_arr[idx] = data * 2 + idx
+            cuda.syncthreads()
+            if idx == 0:
+                result = dtype(0)
+                for i in range(32):
+                    result += shared_arr[i] + local_arr[i]
+
+        llvm_ir = kernel_with_shared.inspect_llvm(sig)
+
+        self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)
+
+    def test_shared_memory_address_class_int32(self):
+        self._test_shared_memory_address_class(np.int32)
+
+    def test_shared_memory_address_class_complex64(self):
+        self._test_shared_memory_address_class(np.complex64)
+
+    def test_shared_memory_address_class_boolean(self):
+        self._test_shared_memory_address_class(np.bool)
+
+    def test_shared_memory_address_class_float16(self):
+        self._test_shared_memory_address_class(np.float16)
+
+    def test_shared_memory_address_class_record(self):
+        dtype = np.dtype(
+            [
+                ("a", np.int32),
+                ("b", np.float32),
+            ]
+        )
+        sig = (numpy_support.from_dtype(dtype),)
+
+        @cuda.jit(sig, debug=True, opt=False)
+        def kernel_with_shared(data):
+            shared_arr = cuda.shared.array(32, dtype=dtype)
+            local_arr = cuda.local.array(32, dtype=dtype)
+            result = cuda.local.array(1, dtype=dtype)
+            idx = cuda.grid(1)
+            if idx < 32:
+                shared_arr[idx].a = data.a + idx
+                local_arr[idx].a = data.a * 2 + idx
+                shared_arr[idx].b = data.b + idx
+                local_arr[idx].b = data.b * 2 + idx
+            cuda.syncthreads()
+            if idx == 0:
+                result[0].a = 0
+                result[0].b = 0.0
+                for i in range(32):
+                    result[0].a += shared_arr[i].a + local_arr[i].a
+                    result[0].b += shared_arr[i].b + local_arr[i].b
+
+        llvm_ir = kernel_with_shared.inspect_llvm(sig)
+
+        self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)
+
 
 if __name__ == "__main__":
     unittest.main()
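
Note: the debuginfo tests assert, via FileCheck, that the "data" member of a shared-memory array's debug metadata is a pointer type with dwarfAddressSpace: 8, while a local array's is not. The same metadata can be inspected outside the test suite from the dispatcher's LLVM IR; a small sketch (the kernel body is illustrative):

    import numpy as np
    from numba import cuda
    from numba.cuda import types

    sig = (types.float32,)

    @cuda.jit(sig, debug=True, opt=False)
    def kernel(value):
        shared_arr = cuda.shared.array(32, dtype=np.float32)
        idx = cuda.grid(1)
        if idx < 32:
            shared_arr[idx] = value

    llvm_ir = kernel.inspect_llvm(sig)
    # The shared array's data pointer should carry the CUDA shared-memory
    # address class in its DWARF metadata.
    print("dwarfAddressSpace: 8" in llvm_ir)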