numba-cuda 0.12.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (60)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/codegen.py +1 -1
  3. numba_cuda/numba/cuda/compiler.py +24 -1
  4. numba_cuda/numba/cuda/cudadrv/driver.py +15 -3
  5. numba_cuda/numba/cuda/cudadrv/nvrtc.py +1 -1
  6. numba_cuda/numba/cuda/cudadrv/nvvm.py +126 -25
  7. numba_cuda/numba/cuda/debuginfo.py +52 -1
  8. numba_cuda/numba/cuda/decorators.py +14 -0
  9. numba_cuda/numba/cuda/dispatcher.py +9 -2
  10. numba_cuda/numba/cuda/lowering.py +83 -4
  11. numba_cuda/numba/cuda/memory_management/__init__.py +1 -0
  12. numba_cuda/numba/cuda/simulator/__init__.py +10 -1
  13. numba_cuda/numba/cuda/simulator/_internal/__init__.py +1 -0
  14. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +0 -0
  15. numba_cuda/numba/cuda/simulator/api.py +17 -0
  16. numba_cuda/numba/cuda/simulator/bf16.py +1 -0
  17. numba_cuda/numba/cuda/simulator/compiler.py +1 -0
  18. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +7 -0
  19. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +4 -0
  20. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +57 -0
  21. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +8 -0
  22. numba_cuda/numba/cuda/simulator/kernel.py +1 -1
  23. numba_cuda/numba/cuda/simulator/kernelapi.py +8 -2
  24. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +1 -0
  25. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +6 -0
  26. numba_cuda/numba/cuda/testing.py +10 -4
  27. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +2 -0
  28. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +15 -6
  29. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +3 -2
  30. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -2
  31. numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -3
  32. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +25 -1
  33. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +11 -4
  34. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +34 -21
  35. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +62 -2
  36. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +4 -2
  37. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +43 -4
  38. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +106 -2
  39. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -0
  40. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -0
  41. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +8 -21
  42. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +6 -6
  43. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +7 -7
  44. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +64 -0
  45. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +60 -58
  46. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +3 -2
  47. numba_cuda/numba/cuda/tests/support.py +1 -1
  48. numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +1 -1
  49. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
  50. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/METADATA +22 -1
  51. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/RECORD +59 -51
  52. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/WHEEL +1 -1
  53. numba_cuda/numba/cuda/runtime/__init__.py +0 -1
  54. /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cu +0 -0
  55. /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cuh +0 -0
  56. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cu +0 -0
  57. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cuh +0 -0
  58. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.py +0 -0
  59. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/licenses/LICENSE +0 -0
  60. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/simulator/bf16.py
@@ -0,0 +1 @@
+bfloat16 = None
numba_cuda/numba/cuda/simulator/compiler.py
@@ -7,3 +7,4 @@ compile = None
 compile_for_current_device = None
 compile_ptx = None
 compile_ptx_for_current_device = None
+declare_device_function = None
numba_cuda/numba/cuda/simulator/cudadrv/driver.py
@@ -3,6 +3,8 @@ Most of the driver API is unsupported in the simulator, but some stubs are
 provided to allow tests to import correctly.
 """
 
+from numba import config
+
 
 def device_memset(dst, val, size, stream=0):
     dst.view("u1")[:size].fill(bytes([val])[0])
@@ -60,3 +62,8 @@ def launch_kernel(*args, **kwargs):
 
 
 USE_NV_BINDING = False
+
+PyNvJitLinker = None
+
+if config.ENABLE_CUDASIM:
+    config.CUDA_ENABLE_PYNVJITLINK = False
numba_cuda/numba/cuda/simulator/cudadrv/libs.py
@@ -1,2 +1,6 @@
 def check_static_lib(lib):
     raise FileNotFoundError("Linking libraries not supported by cudasim")
+
+
+def get_cuda_include_dir():
+    raise FileNotFoundError("CUDA includes not supported by cudasim")
numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py
@@ -0,0 +1,57 @@
+class LinkableCode:
+    """An object that holds code to be linked from memory.
+
+    :param data: A buffer containing the data to link.
+    :param name: The name of the file to be referenced in any compilation or
+                 linking errors that may be produced.
+    """
+
+    def __init__(self, data, name=None):
+        self.data = data
+        self._name = name
+
+    @property
+    def name(self):
+        return self._name or self.default_name
+
+
+class PTXSource(LinkableCode):
+    """PTX source code in memory."""
+
+    default_name = "<unnamed-ptx>"
+
+
+class CUSource(LinkableCode):
+    """CUDA C/C++ source code in memory."""
+
+    default_name = "<unnamed-cu>"
+
+
+class Fatbin(LinkableCode):
+    """An ELF Fatbin in memory."""
+
+    default_name = "<unnamed-fatbin>"
+
+
+class Cubin(LinkableCode):
+    """An ELF Cubin in memory."""
+
+    default_name = "<unnamed-cubin>"
+
+
+class Archive(LinkableCode):
+    """An archive of objects in memory."""
+
+    default_name = "<unnamed-archive>"
+
+
+class Object(LinkableCode):
+    """An object file in memory."""
+
+    default_name = "<unnamed-object>"
+
+
+class LTOIR(LinkableCode):
+    """An LTOIR file in memory."""
+
+    default_name = "<unnamed-ltoir>"
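As a side note, here is a minimal sketch (not part of this diff) of how these in-memory code objects are typically constructed; the source string and the file name "axpy.cu" are made up for illustration, and the link= usage mentioned in the comment reflects the existing numba.cuda API rather than anything added by these simulator stubs, which only carry the data and name.

    from numba.cuda.cudadrv.linkable_code import CUSource, PTXSource

    # CUDA C source held in memory; the name only labels compile/link errors.
    # On real hardware such objects can be passed to the link= kwarg of cuda.jit.
    axpy_src = CUSource(
        'extern "C" __device__ int axpy(float *out, float a, float x, float y)'
        "{ *out = a * x + y; return 0; }",
        name="axpy.cu",
    )
    print(axpy_src.name)        # "axpy.cu"
    print(PTXSource(b"").name)  # falls back to "<unnamed-ptx>"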
numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py
@@ -0,0 +1,8 @@
+"""
+NVVM is not supported in the simulator, but stubs are provided to allow tests
+to import correctly.
+"""
+
+
+def compile(src, name, cc, ltoir=False):
+    raise RuntimeError("NVRTC is not supported in the simulator")
numba_cuda/numba/cuda/simulator/kernel.py
@@ -78,7 +78,7 @@ class FakeCUDAKernel(object):
         functools.update_wrapper(self, fn)
 
     def __call__(self, *args):
-        if self._device:
+        if self._device or _kernel_context:
            with swapped_cuda_module(self.fn, _get_kernel_context()):
                return self.fn(*args)
 
numba_cuda/numba/cuda/simulator/kernelapi.py
@@ -63,7 +63,10 @@ class FakeCUDALocal(object):
     CUDA Local arrays
     """
 
-    def array(self, shape, dtype):
+    def array(self, shape, dtype, alignment=None):
+        if alignment is not None:
+            raise RuntimeError("Array alignment is not supported in cudasim")
+
         if isinstance(dtype, types.Type):
             dtype = numpy_support.as_dtype(dtype)
         return np.empty(shape, dtype)
@@ -102,7 +105,10 @@ class FakeCUDAShared(object):
         self._dynshared_size = dynshared_size
         self._dynshared = np.zeros(dynshared_size, dtype=np.byte)
 
-    def array(self, shape, dtype):
+    def array(self, shape, dtype, alignment=None):
+        if alignment is not None:
+            raise RuntimeError("Array alignment is not supported in cudasim")
+
         if isinstance(dtype, types.Type):
             dtype = numpy_support.as_dtype(dtype)
         # Dynamic shared memory is requested with size 0 - this all shares the
numba_cuda/numba/cuda/simulator/memory_management/__init__.py
@@ -0,0 +1 @@
+from .nrt import rtsys  # noqa: F401
numba_cuda/numba/cuda/simulator/memory_management/nrt.py
@@ -0,0 +1,6 @@
+from numba import config
+
+rtsys = None
+
+config.CUDA_NRT_STATS = False
+config.CUDA_ENABLE_NRT = False
numba_cuda/numba/cuda/testing.py
@@ -116,20 +116,26 @@ def skip_on_arm(reason):
 def skip_if_cuda_includes_missing(fn):
     # Skip when cuda.h is not available - generally this should indicate
     # whether the CUDA includes are available or not
-    cuda_include_path = libs.get_cuda_include_dir()
+    reason = "CUDA include dir not available on this system"
+    try:
+        cuda_include_path = libs.get_cuda_include_dir()
+    except FileNotFoundError:
+        return unittest.skip(reason)(fn)
     cuda_h = os.path.join(cuda_include_path, "cuda.h")
     cuda_h_file = os.path.exists(cuda_h) and os.path.isfile(cuda_h)
-    reason = "CUDA include dir not available on this system"
     return unittest.skipUnless(cuda_h_file, reason)(fn)
 
 
 def skip_if_curand_kernel_missing(fn):
-    cuda_include_path = libs.get_cuda_include_dir()
+    reason = "curand_kernel.h not available on this system"
+    try:
+        cuda_include_path = libs.get_cuda_include_dir()
+    except FileNotFoundError:
+        return unittest.skip(reason)(fn)
     curand_kernel_h = os.path.join(cuda_include_path, "curand_kernel.h")
     curand_kernel_h_file = os.path.exists(curand_kernel_h) and os.path.isfile(
         curand_kernel_h
     )
-    reason = "curand_kernel.h not available on this system"
     return unittest.skipUnless(curand_kernel_h_file, reason)(fn)
 
 
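For context, a hedged sketch (not from the diff) of how these decorators are applied; the test class and method names are hypothetical. With the try/except above, importing a test module no longer fails when libs.get_cuda_include_dir() raises FileNotFoundError (e.g. under cudasim): the tests are skipped instead.

    from numba.cuda.testing import (
        CUDATestCase,
        skip_if_cuda_includes_missing,
        skip_if_curand_kernel_missing,
    )

    class TestNeedsHeaders(CUDATestCase):  # hypothetical test class
        @skip_if_cuda_includes_missing
        def test_uses_cuda_h(self):
            pass  # body would compile code that includes <cuda.h>

        @skip_if_curand_kernel_missing
        def test_uses_curand_kernel_h(self):
            pass  # body would compile code that includes <curand_kernel.h>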
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py
@@ -476,12 +476,14 @@ class TestArrayMethod(CUDATestCase):
            host_array, dev_array.copy_to_host().astype(dtype)
        )
 
+    @skip_on_cudasim("Simulator does not use __array__()")
     @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
     def test_np_array_copy_false(self):
         dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
         with self.assertRaisesRegex(ValueError, "`copy=False` is not"):
             np.array(dev_array, copy=False)
 
+    @skip_on_cudasim("Simulator does not use __array__()")
     @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
     def test_np_array_copy_true(self):
         dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
@@ -5,14 +5,19 @@ import numpy as np
 
 from numba import cuda, config
 from numba.cuda.cudadrv.linkable_code import CUSource
-from numba.cuda.testing import CUDATestCase, ContextResettingTestCase
+from numba.cuda.testing import (
+    CUDATestCase,
+    ContextResettingTestCase,
+    skip_on_cudasim,
+)
 
-from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
+if not config.ENABLE_CUDASIM:
+    from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
 
-if config.CUDA_USE_NVIDIA_BINDING:
-    from cuda.cuda import CUmodule as cu_module_type
-else:
-    from numba.cuda.cudadrv.drvapi import cu_module as cu_module_type
+    if config.CUDA_USE_NVIDIA_BINDING:
+        from cuda.cuda import CUmodule as cu_module_type
+    else:
+        from numba.cuda.cudadrv.drvapi import cu_module as cu_module_type
 
 
 def wipe_all_modules_in_context():
@@ -32,6 +37,7 @@ def get_hashable_handle_value(handle):
     return handle
 
 
+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestModuleCallbacksBasic(ContextResettingTestCase):
     def test_basic(self):
         counter = 0
@@ -136,6 +142,7 @@ class TestModuleCallbacksBasic(ContextResettingTestCase):
         self.assertEqual(len(teardown_seen), 2)
 
 
+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestModuleCallbacksAPICompleteness(CUDATestCase):
     def test_api(self):
         def setup(handle):
@@ -164,6 +171,7 @@ class TestModuleCallbacksAPICompleteness(CUDATestCase):
         kernel[1, 1]()
 
 
+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestModuleCallbacks(CUDATestCase):
     def setUp(self):
         super().setUp()
@@ -213,6 +221,7 @@ __device__ int get_num(int &retval) {
         self.assertEqual(arr[0], 42)
 
 
+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestMultithreadedCallbacks(CUDATestCase):
     def test_concurrent_initialization(self):
         seen_mods = set()
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
@@ -267,6 +267,7 @@ class TestLinker(CUDATestCase):
     not PYNVJITLINK_INSTALLED or not TEST_BIN_DIR,
     reason="pynvjitlink not enabled",
 )
+@skip_on_cudasim("Linking unsupported in the simulator")
 class TestLinkerUsage(CUDATestCase):
     """Test that whether pynvjitlink can be enabled by both environment variable
     and modification of config at runtime.
@@ -298,12 +299,12 @@ class TestLinkerUsage(CUDATestCase):
 
     def test_linker_enabled_envvar(self):
         env = os.environ.copy()
-        env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "1"
+        env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None)
         run_in_subprocess(self.src.format(config=""), env=env)
 
     def test_linker_disabled_envvar(self):
         env = os.environ.copy()
-        env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None)
+        env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0"
         with self.assertRaisesRegex(
             AssertionError, "LTO and additional flags require PyNvJitLinker"
         ):
numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py
@@ -30,7 +30,8 @@ class TestNvvmDriver(unittest.TestCase):
            self.skipTest("-gen-lto unavailable in this toolkit version")
 
        nvvmir = self.get_nvvmir()
-        ltoir = nvvm.compile_ir(nvvmir, opt=3, gen_lto=None, arch="compute_52")
+        arch = "compute_%d%d" % nvvm.LOWEST_CURRENT_CC
+        ltoir = nvvm.compile_ir(nvvmir, opt=3, gen_lto=None, arch=arch)
 
        # Verify we correctly passed the option by checking if we got LTOIR
        # from NVVM (by looking for the expected magic number for LTOIR)
@@ -138,9 +139,9 @@ class TestNvvmDriver(unittest.TestCase):
 class TestArchOption(unittest.TestCase):
     def test_get_arch_option(self):
         # Test returning the nearest lowest arch.
-        self.assertEqual(nvvm.get_arch_option(5, 3), "compute_53")
         self.assertEqual(nvvm.get_arch_option(7, 5), "compute_75")
         self.assertEqual(nvvm.get_arch_option(7, 7), "compute_75")
+        self.assertEqual(nvvm.get_arch_option(8, 8), "compute_87")
         # Test known arch.
         supported_cc = nvvm.get_supported_ccs()
         for arch in supported_cc:
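For reference, a small sketch (not from the diff) of the behaviour the updated assertions exercise; the exact strings depend on the toolkit's supported compute capabilities, and the values shown simply mirror the test's expectations.

    from numba.cuda.cudadrv import nvvm

    # get_arch_option falls back to the nearest lower supported arch.
    print(nvvm.get_arch_option(7, 5))  # "compute_75" (exact match)
    print(nvvm.get_arch_option(7, 7))  # "compute_75" (no sm_77)
    print(nvvm.get_arch_option(8, 8))  # "compute_87" per the new assertion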
numba_cuda/numba/cuda/tests/cudapy/test_array.py
@@ -310,9 +310,6 @@ class TestCudaArray(CUDATestCase):
         check(array_reshape, array_reshape1d, arr, 0)
         check(array_reshape, array_reshape1d, arr, (0,))
         check(array_reshape, array_reshape3d, arr, (1, 0, 2))
-        check_only_shape(array_reshape2d, arr, (0, -1), (0, 0))
-        check_only_shape(array_reshape2d, arr, (4, -1), (4, 0))
-        check_only_shape(array_reshape3d, arr, (-1, 0, 4), (0, 0, 4))
 
         # C-contiguous
         arr = np.arange(24)
numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py
@@ -3,7 +3,11 @@ import itertools
 import numpy as np
 from numba import cuda
 from numba.core.errors import TypingError
-from numba.cuda.testing import CUDATestCase
+from numba.cuda.testing import (
+    CUDATestCase,
+    skip_on_cudasim,
+    skip_unless_cudasim,
+)
 import unittest
 
 
@@ -65,6 +69,7 @@ for align in (True, False):
 # with the test_alignment.TestArrayAlignment class.
 
 
+@skip_on_cudasim("Array alignment not supported on cudasim")
 class TestArrayAddressAlignment(CUDATestCase):
     """
     Test cuda.local.array and cuda.shared.array support for an alignment
@@ -232,5 +237,24 @@ class TestArrayAddressAlignment(CUDATestCase):
                print(".", end="", flush=True)
 
 
+@skip_unless_cudasim("Only check for alignment unsupported in the simulator")
+class TestCudasimUnsupportedAlignment(CUDATestCase):
+    def test_local_unsupported(self):
+        @cuda.jit
+        def f():
+            cuda.local.array(1, dtype=np.uint8, alignment=16)
+
+        with self.assertRaisesRegex(RuntimeError, "not supported in cudasim"):
+            f[1, 1]()
+
+    def test_shared_unsupported(self):
+        @cuda.jit
+        def f():
+            cuda.shared.array(1, dtype=np.uint8, alignment=16)
+
+        with self.assertRaisesRegex(RuntimeError, "not supported in cudasim"):
+            f[1, 1]()
+
+
 if __name__ == "__main__":
     unittest.main()
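A minimal sketch (not from the diff) of the alignment keyword these tests cover: on hardware targets the request is accepted, while under cudasim the simulator stubs shown earlier raise RuntimeError. The kernel below is illustrative only.

    import numpy as np
    from numba import cuda

    @cuda.jit
    def uses_aligned_local(out):
        # Request a 16-byte aligned local scratch buffer (hardware targets only).
        buf = cuda.local.array(16, dtype=np.uint8, alignment=16)
        buf[0] = 1
        out[0] = buf[0]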
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py
@@ -23,6 +23,15 @@ class TestBfloat16HighLevelBindings(CUDATestCase):
 
     def test_math_bindings(self):
         self.skip_unsupported()
+
+        exp_functions = [math.exp]
+        try:
+            from math import exp2
+
+            exp_functions += [exp2]
+        except ImportError:
+            pass
+
         functions = [
             math.trunc,
             math.ceil,
@@ -33,9 +42,7 @@
             math.cos,
             math.sin,
             math.tanh,
-            math.exp,
-            math.exp2,
-        ]
+        ] + exp_functions
 
         for f in functions:
             with self.subTest(func=f):
@@ -49,7 +56,7 @@
                 arr = cuda.device_array((1,), dtype="float32")
                 kernel[1, 1](arr)
 
-                if f in (math.exp, math.exp2):
+                if f in exp_functions:
                     self.assertAlmostEqual(arr[0], f(3.14), delta=1e-1)
                 else:
                     self.assertAlmostEqual(arr[0], f(3.14), delta=1e-2)
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py
@@ -2,29 +2,40 @@ import numba.cuda as cuda
 from numba.cuda.testing import unittest, CUDATestCase
 import numpy as np
 
-from numba import int16, int32, int64, uint16, uint32, uint64, float32, float64
+from numba import (
+    config,
+    int16,
+    int32,
+    int64,
+    uint16,
+    uint32,
+    uint64,
+    float32,
+    float64,
+)
 from numba.types import float16
 
-from numba.cuda._internal.cuda_bf16 import (
-    nv_bfloat16,
-    htrunc,
-    hceil,
-    hfloor,
-    hrint,
-    hsqrt,
-    hrsqrt,
-    hrcp,
-    hlog,
-    hlog2,
-    hlog10,
-    hcos,
-    hsin,
-    hexp,
-    hexp2,
-    hexp10,
-    htanh,
-    htanh_approx,
-)
+if not config.ENABLE_CUDASIM:
+    from numba.cuda._internal.cuda_bf16 import (
+        nv_bfloat16,
+        htrunc,
+        hceil,
+        hfloor,
+        hrint,
+        hsqrt,
+        hrsqrt,
+        hrcp,
+        hlog,
+        hlog2,
+        hlog10,
+        hcos,
+        hsin,
+        hexp,
+        hexp2,
+        hexp10,
+        htanh,
+        htanh_approx,
+    )
 
 dtypes = [int16, int32, int64, uint16, uint32, uint64, float32]
 
@@ -263,6 +274,8 @@ class Bfloat16Test(CUDATestCase):
         np.testing.assert_allclose(arr, [8], atol=1e-2)
 
     def test_use_binding_inside_dfunc(self):
+        self.skip_unsupported()
+
         @cuda.jit(device=True)
         def f(arr):
             pi = nv_bfloat16(3.14)
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
@@ -1,5 +1,5 @@
 from math import sqrt
-from numba import cuda, float32, int16, int32, int64, uint32, void
+from numba import cuda, float32, int16, int32, int64, types, uint32, void
 from numba.cuda import (
     compile,
     compile_for_current_device,
@@ -288,7 +288,7 @@ class TestCompileOnlyTests(unittest.TestCase):
            # Sleep for a variable time
            cuda.nanosleep(x)
 
-        ptx, resty = compile_ptx(use_nanosleep, (uint32,), cc=(7, 0))
+        ptx, resty = compile_ptx(use_nanosleep, (uint32,))
 
         nanosleep_count = 0
         for line in ptx.split("\n"):
@@ -306,5 +306,65 @@
         )
 
 
+@skip_on_cudasim("Compilation unsupported in the simulator")
+class TestCompileWithLaunchBounds(unittest.TestCase):
+    def _test_launch_bounds_common(self, launch_bounds):
+        def f():
+            pass
+
+        sig = "void()"
+        ptx, resty = cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+        self.assertIsInstance(resty, types.NoneType)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+        return ptx
+
+    def test_launch_bounds_scalar(self):
+        launch_bounds = 128
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_tuple(self):
+        launch_bounds = (128,)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_min_cta(self):
+        launch_bounds = (128, 2)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_max_cluster_rank(self):
+        def f():
+            pass
+
+        launch_bounds = (128, 2, 4)
+        cc = (9, 0)
+        sig = "void()"
+        ptx, resty = cuda.compile_ptx(
+            f, sig, launch_bounds=launch_bounds, cc=cc
+        )
+        self.assertIsInstance(resty, types.NoneType)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+    def test_too_many_launch_bounds(self):
+        def f():
+            pass
+
+        sig = "void()"
+        launch_bounds = (128, 2, 4, 8)
+
+        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+            cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+
+
 if __name__ == "__main__":
     unittest.main()
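As a usage note (not from the diff), the new tests drive compile_ptx with a launch_bounds keyword; a minimal sketch under the same assumptions the tests make about the emitted PTX directives:

    from numba import cuda

    def f():
        pass

    # (128, 2) is expected to emit .maxntid 128, 1, 1 and .minnctapersm 2.
    ptx, resty = cuda.compile_ptx(f, "void()", launch_bounds=(128, 2))
    assert ".maxntid" in ptx
    assert ".minnctapersm" in ptx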
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py
@@ -157,6 +157,7 @@ class TestCudaCooperativeGroups(CUDATestCase):
         self.assertEqual(blocks1d, blocks2d)
         self.assertEqual(blocks1d, blocks3d)
 
+    @skip_on_cudasim("External code unsupported on cudasim")
     @skip_unless_cc_60
     def test_external_cooperative_func(self):
         cudapy_test_path = os.path.dirname(__file__)
@@ -171,12 +172,13 @@ class TestCudaCooperativeGroups(CUDATestCase):
            "cta_barrier", sig=sig, link=[src], use_cooperative=True
        )
 
-        @cuda.jit
+        @cuda.jit("void()")
        def kernel():
            cta_barrier()
 
+        overload = kernel.overloads[()]
         block_size = 32
-        grid_size = 1024
+        grid_size = overload.max_cooperative_grid_blocks(block_size)
 
         kernel[grid_size, block_size]()
 
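For context, a sketch (not from the diff) of the pattern the updated test adopts: sizing a cooperative launch from the kernel overload's occupancy query instead of a hard-coded grid size. The kernel body here is illustrative only.

    from numba import cuda

    @cuda.jit("void()")
    def kernel():
        g = cuda.cg.this_grid()
        g.sync()

    block_size = 32
    overload = kernel.overloads[()]
    grid_size = overload.max_cooperative_grid_blocks(block_size)
    kernel[grid_size, block_size]()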
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
@@ -332,10 +332,10 @@ class TestCudaDebugInfo(CUDATestCase):
 
         @cuda.jit("void(int32, int32)", debug=True, opt=False)
         def f(x, y):
-            z = x  # noqa: F841
-            z = 100  # noqa: F841
-            z = y  # noqa: F841
-            z = True  # noqa: F841
+            z1 = x  # noqa: F841
+            z2 = 100  # noqa: F841
+            z3 = y  # noqa: F841
+            z4 = True  # noqa: F841
 
         llvm_ir = f.inspect_llvm(sig)
         # Verify the call to llvm.dbg.declare is replaced by llvm.dbg.value
@@ -373,6 +373,45 @@
         match = re.compile(pat).search(llvm_ir)
         self.assertIsNone(match, msg=llvm_ir)
 
+    def test_union_poly_types(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            foo = 100  # noqa: F841
+            foo = 2.34  # noqa: F841
+            foo = True  # noqa: F841
+            foo = 200  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        # Extract the type node id
+        pat1 = r'!DILocalVariable\(.*name: "foo".*type: !(\d+)\)'
+        match = re.compile(pat1).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id = match.group(1)
+        # Verify the union type and extract the elements node id
+        pat2 = rf"!{mdnode_id} = distinct !DICompositeType\(elements: !(\d+),.*size: 64, tag: DW_TAG_union_type\)"  # noqa: E501
+        match = re.compile(pat2).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id = match.group(1)
+        # Extract the member node ids
+        pat3 = r"!{ !(\d+), !(\d+), !(\d+) }"
+        match = re.compile(pat3).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id1 = match.group(1)
+        mdnode_id2 = match.group(2)
+        mdnode_id3 = match.group(3)
+        # Verify the member nodes
+        pat4 = rf'!{mdnode_id1} = !DIDerivedType(.*name: "_bool", size: 8, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat4).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        pat5 = rf'!{mdnode_id2} = !DIDerivedType(.*name: "_float64", size: 64, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat5).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        pat6 = rf'!{mdnode_id3} = !DIDerivedType(.*name: "_int64", size: 64, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat6).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+
 
 if __name__ == "__main__":
     unittest.main()