numba-cuda 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (65):
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/{cuda_bf16.py → _internal/cuda_bf16.py} +1 -1
  3. numba_cuda/numba/cuda/api.py +13 -0
  4. numba_cuda/numba/cuda/bf16.py +112 -0
  5. numba_cuda/numba/cuda/cg.py +2 -0
  6. numba_cuda/numba/cuda/codegen.py +9 -1
  7. numba_cuda/numba/cuda/compiler.py +2 -1
  8. numba_cuda/numba/cuda/cudadecl.py +6 -1
  9. numba_cuda/numba/cuda/cudadrv/driver.py +4 -0
  10. numba_cuda/numba/cuda/cudadrv/nvrtc.py +24 -2
  11. numba_cuda/numba/cuda/debuginfo.py +27 -0
  12. numba_cuda/numba/cuda/decorators.py +5 -2
  13. numba_cuda/numba/cuda/dispatcher.py +3 -3
  14. numba_cuda/numba/cuda/memory_management/__init__.py +1 -0
  15. numba_cuda/numba/cuda/simulator/__init__.py +10 -1
  16. numba_cuda/numba/cuda/simulator/_internal/__init__.py +1 -0
  17. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +0 -0
  18. numba_cuda/numba/cuda/simulator/api.py +17 -0
  19. numba_cuda/numba/cuda/simulator/bf16.py +1 -0
  20. numba_cuda/numba/cuda/simulator/compiler.py +1 -0
  21. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +7 -0
  22. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +4 -0
  23. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +57 -0
  24. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +8 -0
  25. numba_cuda/numba/cuda/simulator/kernel.py +1 -1
  26. numba_cuda/numba/cuda/simulator/kernelapi.py +8 -2
  27. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +1 -0
  28. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +6 -0
  29. numba_cuda/numba/cuda/target.py +10 -1
  30. numba_cuda/numba/cuda/testing.py +10 -4
  31. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +2 -0
  32. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +15 -6
  33. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +1 -0
  34. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
  35. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
  36. numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -3
  37. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +25 -1
  38. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +62 -0
  39. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +80 -41
  40. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
  41. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +36 -0
  42. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
  43. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -0
  44. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -0
  45. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
  46. numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
  47. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
  48. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
  49. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
  50. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +60 -58
  51. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +3 -2
  52. numba_cuda/numba/cuda/tests/support.py +1 -1
  53. numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +1 -1
  54. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
  55. {numba_cuda-0.11.0.dist-info → numba_cuda-0.13.0.dist-info}/METADATA +1 -1
  56. {numba_cuda-0.11.0.dist-info → numba_cuda-0.13.0.dist-info}/RECORD +64 -50
  57. {numba_cuda-0.11.0.dist-info → numba_cuda-0.13.0.dist-info}/WHEEL +1 -1
  58. numba_cuda/numba/cuda/runtime/__init__.py +0 -1
  59. /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cu +0 -0
  60. /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cuh +0 -0
  61. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cu +0 -0
  62. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cuh +0 -0
  63. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.py +0 -0
  64. {numba_cuda-0.11.0.dist-info → numba_cuda-0.13.0.dist-info}/licenses/LICENSE +0 -0
  65. {numba_cuda-0.11.0.dist-info → numba_cuda-0.13.0.dist-info}/top_level.txt +0 -0
@@ -116,20 +116,26 @@ def skip_on_arm(reason):
116
116
  def skip_if_cuda_includes_missing(fn):
117
117
  # Skip when cuda.h is not available - generally this should indicate
118
118
  # whether the CUDA includes are available or not
119
- cuda_include_path = libs.get_cuda_include_dir()
119
+ reason = "CUDA include dir not available on this system"
120
+ try:
121
+ cuda_include_path = libs.get_cuda_include_dir()
122
+ except FileNotFoundError:
123
+ return unittest.skip(reason)(fn)
120
124
  cuda_h = os.path.join(cuda_include_path, "cuda.h")
121
125
  cuda_h_file = os.path.exists(cuda_h) and os.path.isfile(cuda_h)
122
- reason = "CUDA include dir not available on this system"
123
126
  return unittest.skipUnless(cuda_h_file, reason)(fn)
124
127
 
125
128
 
126
129
  def skip_if_curand_kernel_missing(fn):
127
- cuda_include_path = libs.get_cuda_include_dir()
130
+ reason = "curand_kernel.h not available on this system"
131
+ try:
132
+ cuda_include_path = libs.get_cuda_include_dir()
133
+ except FileNotFoundError:
134
+ return unittest.skip(reason)(fn)
128
135
  curand_kernel_h = os.path.join(cuda_include_path, "curand_kernel.h")
129
136
  curand_kernel_h_file = os.path.exists(curand_kernel_h) and os.path.isfile(
130
137
  curand_kernel_h
131
138
  )
132
- reason = "curand_kernel.h not available on this system"
133
139
  return unittest.skipUnless(curand_kernel_h_file, reason)(fn)
134
140
 
135
141
 
@@ -476,12 +476,14 @@ class TestArrayMethod(CUDATestCase):
476
476
  host_array, dev_array.copy_to_host().astype(dtype)
477
477
  )
478
478
 
479
+ @skip_on_cudasim("Simulator does not use __array__()")
479
480
  @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
480
481
  def test_np_array_copy_false(self):
481
482
  dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
482
483
  with self.assertRaisesRegex(ValueError, "`copy=False` is not"):
483
484
  np.array(dev_array, copy=False)
484
485
 
486
+ @skip_on_cudasim("Simulator does not use __array__()")
485
487
  @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
486
488
  def test_np_array_copy_true(self):
487
489
  dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
@@ -5,14 +5,19 @@ import numpy as np
5
5
 
6
6
  from numba import cuda, config
7
7
  from numba.cuda.cudadrv.linkable_code import CUSource
8
- from numba.cuda.testing import CUDATestCase, ContextResettingTestCase
8
+ from numba.cuda.testing import (
9
+ CUDATestCase,
10
+ ContextResettingTestCase,
11
+ skip_on_cudasim,
12
+ )
9
13
 
10
- from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
14
+ if not config.ENABLE_CUDASIM:
15
+ from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
11
16
 
12
- if config.CUDA_USE_NVIDIA_BINDING:
13
- from cuda.cuda import CUmodule as cu_module_type
14
- else:
15
- from numba.cuda.cudadrv.drvapi import cu_module as cu_module_type
17
+ if config.CUDA_USE_NVIDIA_BINDING:
18
+ from cuda.cuda import CUmodule as cu_module_type
19
+ else:
20
+ from numba.cuda.cudadrv.drvapi import cu_module as cu_module_type
16
21
 
17
22
 
18
23
  def wipe_all_modules_in_context():
@@ -32,6 +37,7 @@ def get_hashable_handle_value(handle):
32
37
  return handle
33
38
 
34
39
 
40
+ @skip_on_cudasim("Module loading not implemented in the simulator")
35
41
  class TestModuleCallbacksBasic(ContextResettingTestCase):
36
42
  def test_basic(self):
37
43
  counter = 0
@@ -136,6 +142,7 @@ class TestModuleCallbacksBasic(ContextResettingTestCase):
136
142
  self.assertEqual(len(teardown_seen), 2)
137
143
 
138
144
 
145
+ @skip_on_cudasim("Module loading not implemented in the simulator")
139
146
  class TestModuleCallbacksAPICompleteness(CUDATestCase):
140
147
  def test_api(self):
141
148
  def setup(handle):
@@ -164,6 +171,7 @@ class TestModuleCallbacksAPICompleteness(CUDATestCase):
164
171
  kernel[1, 1]()
165
172
 
166
173
 
174
+ @skip_on_cudasim("Module loading not implemented in the simulator")
167
175
  class TestModuleCallbacks(CUDATestCase):
168
176
  def setUp(self):
169
177
  super().setUp()
@@ -213,6 +221,7 @@ __device__ int get_num(int &retval) {
213
221
  self.assertEqual(arr[0], 42)
214
222
 
215
223
 
224
+ @skip_on_cudasim("Module loading not implemented in the simulator")
216
225
  class TestMultithreadedCallbacks(CUDATestCase):
217
226
  def test_concurrent_initialization(self):
218
227
  seen_mods = set()
@@ -267,6 +267,7 @@ class TestLinker(CUDATestCase):
267
267
  not PYNVJITLINK_INSTALLED or not TEST_BIN_DIR,
268
268
  reason="pynvjitlink not enabled",
269
269
  )
270
+ @skip_on_cudasim("Linking unsupported in the simulator")
270
271
  class TestLinkerUsage(CUDATestCase):
271
272
  """Test that whether pynvjitlink can be enabled by both environment variable
272
273
  and modification of config at runtime.
@@ -203,18 +203,6 @@ def simple_usecase_kernel(r, x):
203
203
  simple_usecase_caller = CUDAUseCase(simple_usecase_kernel)
204
204
 
205
205
 
206
- # Usecase with cooperative groups
207
-
208
-
209
- @cuda.jit(cache=True)
210
- def cg_usecase_kernel(r, x):
211
- grid = cuda.cg.this_grid()
212
- grid.sync()
213
-
214
-
215
- cg_usecase = CUDAUseCase(cg_usecase_kernel)
216
-
217
-
218
206
  class _TestModule(CUDATestCase):
219
207
  """
220
208
  Tests for functionality of this module's functions.
@@ -0,0 +1,33 @@
1
+ from numba import cuda
2
+ from numba.cuda.testing import CUDATestCase
3
+ import sys
4
+
5
+ from numba.cuda.tests.cudapy.cache_usecases import CUDAUseCase
6
+
7
+
8
+ # Usecase with cooperative groups
9
+
10
+
11
+ @cuda.jit(cache=True)
12
+ def cg_usecase_kernel(r, x):
13
+ grid = cuda.cg.this_grid()
14
+ grid.sync()
15
+
16
+
17
+ cg_usecase = CUDAUseCase(cg_usecase_kernel)
18
+
19
+
20
+ class _TestModule(CUDATestCase):
21
+ """
22
+ Tests for functionality of this module's functions.
23
+ Note this does not define any "test_*" method, instead check_module()
24
+ should be called by hand.
25
+ """
26
+
27
+ def check_module(self, mod):
28
+ mod.cg_usecase(0)
29
+
30
+
31
+ def self_test():
32
+ mod = sys.modules[__name__]
33
+ _TestModule().check_module(mod)
@@ -310,9 +310,6 @@ class TestCudaArray(CUDATestCase):
310
310
  check(array_reshape, array_reshape1d, arr, 0)
311
311
  check(array_reshape, array_reshape1d, arr, (0,))
312
312
  check(array_reshape, array_reshape3d, arr, (1, 0, 2))
313
- check_only_shape(array_reshape2d, arr, (0, -1), (0, 0))
314
- check_only_shape(array_reshape2d, arr, (4, -1), (4, 0))
315
- check_only_shape(array_reshape3d, arr, (-1, 0, 4), (0, 0, 4))
316
313
 
317
314
  # C-contiguous
318
315
  arr = np.arange(24)
@@ -3,7 +3,11 @@ import itertools
3
3
  import numpy as np
4
4
  from numba import cuda
5
5
  from numba.core.errors import TypingError
6
- from numba.cuda.testing import CUDATestCase
6
+ from numba.cuda.testing import (
7
+ CUDATestCase,
8
+ skip_on_cudasim,
9
+ skip_unless_cudasim,
10
+ )
7
11
  import unittest
8
12
 
9
13
 
@@ -65,6 +69,7 @@ for align in (True, False):
65
69
  # with the test_alignment.TestArrayAlignment class.
66
70
 
67
71
 
72
+ @skip_on_cudasim("Array alignment not supported on cudasim")
68
73
  class TestArrayAddressAlignment(CUDATestCase):
69
74
  """
70
75
  Test cuda.local.array and cuda.shared.array support for an alignment
@@ -232,5 +237,24 @@ class TestArrayAddressAlignment(CUDATestCase):
232
237
  print(".", end="", flush=True)
233
238
 
234
239
 
240
+ @skip_unless_cudasim("Only check for alignment unsupported in the simulator")
241
+ class TestCudasimUnsupportedAlignment(CUDATestCase):
242
+ def test_local_unsupported(self):
243
+ @cuda.jit
244
+ def f():
245
+ cuda.local.array(1, dtype=np.uint8, alignment=16)
246
+
247
+ with self.assertRaisesRegex(RuntimeError, "not supported in cudasim"):
248
+ f[1, 1]()
249
+
250
+ def test_shared_unsupported(self):
251
+ @cuda.jit
252
+ def f():
253
+ cuda.shared.array(1, dtype=np.uint8, alignment=16)
254
+
255
+ with self.assertRaisesRegex(RuntimeError, "not supported in cudasim"):
256
+ f[1, 1]()
257
+
258
+
235
259
  if __name__ == "__main__":
236
260
  unittest.main()
@@ -0,0 +1,62 @@
1
+ from numba import cuda, float32
2
+ from numba.cuda.bf16 import bfloat16
3
+ from numba.cuda.testing import CUDATestCase
4
+
5
+ import math
6
+
7
+
8
+ class TestBfloat16HighLevelBindings(CUDATestCase):
9
+ def skip_unsupported(self):
10
+ if not cuda.is_bfloat16_supported():
11
+ self.skipTest(
12
+ "bfloat16 requires compute capability 8.0+ and CUDA version>= 12.0"
13
+ )
14
+
15
+ def test_use_type_in_kernel(self):
16
+ self.skip_unsupported()
17
+
18
+ @cuda.jit
19
+ def kernel():
20
+ bfloat16(3.14)
21
+
22
+ kernel[1, 1]()
23
+
24
+ def test_math_bindings(self):
25
+ self.skip_unsupported()
26
+
27
+ exp_functions = [math.exp]
28
+ try:
29
+ from math import exp2
30
+
31
+ exp_functions += [exp2]
32
+ except ImportError:
33
+ pass
34
+
35
+ functions = [
36
+ math.trunc,
37
+ math.ceil,
38
+ math.floor,
39
+ math.sqrt,
40
+ math.log,
41
+ math.log10,
42
+ math.cos,
43
+ math.sin,
44
+ math.tanh,
45
+ ] + exp_functions
46
+
47
+ for f in functions:
48
+ with self.subTest(func=f):
49
+
50
+ @cuda.jit
51
+ def kernel(arr):
52
+ x = bfloat16(3.14)
53
+ y = f(x)
54
+ arr[0] = float32(y)
55
+
56
+ arr = cuda.device_array((1,), dtype="float32")
57
+ kernel[1, 1](arr)
58
+
59
+ if f in exp_functions:
60
+ self.assertAlmostEqual(arr[0], f(3.14), delta=1e-1)
61
+ else:
62
+ self.assertAlmostEqual(arr[0], f(3.14), delta=1e-2)
@@ -2,41 +2,54 @@ import numba.cuda as cuda
2
2
  from numba.cuda.testing import unittest, CUDATestCase
3
3
  import numpy as np
4
4
 
5
- from numba import int16, int32, int64, uint16, uint32, uint64, float32, float64
6
- from numba.types import float16
7
-
8
- from numba.cuda.cuda_bf16 import (
9
- nv_bfloat16,
10
- htrunc,
11
- hceil,
12
- hfloor,
13
- hrint,
14
- hsqrt,
15
- hrsqrt,
16
- hrcp,
17
- hlog,
18
- hlog2,
19
- hlog10,
20
- hcos,
21
- hsin,
22
- hexp,
23
- hexp2,
24
- hexp10,
5
+ from numba import (
6
+ config,
7
+ int16,
8
+ int32,
9
+ int64,
10
+ uint16,
11
+ uint32,
12
+ uint64,
13
+ float32,
14
+ float64,
25
15
  )
16
+ from numba.types import float16
26
17
 
27
- from numba.cuda.cudadrv.runtime import get_version
28
-
29
- cuda_version = get_version()
18
+ if not config.ENABLE_CUDASIM:
19
+ from numba.cuda._internal.cuda_bf16 import (
20
+ nv_bfloat16,
21
+ htrunc,
22
+ hceil,
23
+ hfloor,
24
+ hrint,
25
+ hsqrt,
26
+ hrsqrt,
27
+ hrcp,
28
+ hlog,
29
+ hlog2,
30
+ hlog10,
31
+ hcos,
32
+ hsin,
33
+ hexp,
34
+ hexp2,
35
+ hexp10,
36
+ htanh,
37
+ htanh_approx,
38
+ )
30
39
 
31
40
  dtypes = [int16, int32, int64, uint16, uint32, uint64, float32]
32
41
 
33
42
 
34
- @unittest.skipIf(
35
- (cuda.get_current_device().compute_capability < (8, 0)),
36
- "bfloat16 requires compute capability 8.0+",
37
- )
38
43
  class Bfloat16Test(CUDATestCase):
44
+ def skip_unsupported(self):
45
+ if not cuda.is_bfloat16_supported():
46
+ self.skipTest(
47
+ "bfloat16 requires compute capability 8.0+ and CUDA version>= 12.0"
48
+ )
49
+
39
50
  def test_ctor(self):
51
+ self.skip_unsupported()
52
+
40
53
  @cuda.jit
41
54
  def simple_kernel():
42
55
  a = nv_bfloat16(float64(1.0)) # noqa: F841
@@ -47,18 +60,13 @@ class Bfloat16Test(CUDATestCase):
47
60
  f = nv_bfloat16(uint16(6)) # noqa: F841
48
61
  g = nv_bfloat16(uint32(7)) # noqa: F841
49
62
  h = nv_bfloat16(uint64(8)) # noqa: F841
63
+ i = nv_bfloat16(float16(9)) # noqa: F841
50
64
 
51
65
  simple_kernel[1, 1]()
52
66
 
53
- if cuda_version >= (12, 0):
54
-
55
- @cuda.jit
56
- def simple_kernel_fp16():
57
- i = nv_bfloat16(float16(9)) # noqa: F841
58
-
59
- simple_kernel_fp16[1, 1]()
60
-
61
67
  def test_casts(self):
68
+ self.skip_unsupported()
69
+
62
70
  @cuda.jit
63
71
  def simple_kernel(b, c, d, e, f, g, h):
64
72
  a = nv_bfloat16(3.14)
@@ -90,6 +98,7 @@ class Bfloat16Test(CUDATestCase):
90
98
  assert h[0] == 3
91
99
 
92
100
  def test_ctor_cast_loop(self):
101
+ self.skip_unsupported()
93
102
  for dtype in dtypes:
94
103
  with self.subTest(dtype=dtype):
95
104
 
@@ -106,6 +115,8 @@ class Bfloat16Test(CUDATestCase):
106
115
  assert a[0] == 3
107
116
 
108
117
  def test_arithmetic(self):
118
+ self.skip_unsupported()
119
+
109
120
  @cuda.jit
110
121
  def simple_kernel(arith, logic):
111
122
  # Binary Arithmetic Operators
@@ -175,6 +186,8 @@ class Bfloat16Test(CUDATestCase):
175
186
  )
176
187
 
177
188
  def test_math_func(self):
189
+ self.skip_unsupported()
190
+
178
191
  @cuda.jit
179
192
  def simple_kernel(a):
180
193
  x = nv_bfloat16(3.14)
@@ -191,16 +204,18 @@ class Bfloat16Test(CUDATestCase):
191
204
  a[9] = float32(hlog10(x))
192
205
  a[10] = float32(hcos(x))
193
206
  a[11] = float32(hsin(x))
194
- a[12] = float32(hexp(x))
195
- a[13] = float32(hexp2(x))
196
- a[14] = float32(hexp10(x))
207
+ a[12] = float32(htanh(x))
208
+ a[13] = float32(htanh_approx(x))
209
+ a[14] = float32(hexp(x))
210
+ a[15] = float32(hexp2(x))
211
+ a[16] = float32(hexp10(x))
197
212
 
198
- a = np.zeros(15, dtype=np.float32)
213
+ a = np.zeros(17, dtype=np.float32)
199
214
  simple_kernel[1, 1](a)
200
215
 
201
216
  x = 3.14
202
217
  np.testing.assert_allclose(
203
- a[:12],
218
+ a[:14],
204
219
  [
205
220
  np.trunc(x),
206
221
  np.ceil(x),
@@ -214,15 +229,19 @@ class Bfloat16Test(CUDATestCase):
214
229
  np.log10(x),
215
230
  np.cos(x),
216
231
  np.sin(x),
232
+ np.tanh(x),
233
+ np.tanh(x),
217
234
  ],
218
235
  atol=1e-2,
219
236
  )
220
237
 
221
238
  np.testing.assert_allclose(
222
- a[12:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
239
+ a[14:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
223
240
  )
224
241
 
225
242
  def test_check_bfloat16_type(self):
243
+ self.skip_unsupported()
244
+
226
245
  @cuda.jit
227
246
  def kernel(arr):
228
247
  x = nv_bfloat16(3.14)
@@ -237,6 +256,8 @@ class Bfloat16Test(CUDATestCase):
237
256
  np.testing.assert_allclose(arr, [3.14], atol=1e-2)
238
257
 
239
258
  def test_use_within_device_func(self):
259
+ self.skip_unsupported()
260
+
240
261
  @cuda.jit(device=True)
241
262
  def add_bf16(a, b):
242
263
  return a + b
@@ -252,6 +273,24 @@ class Bfloat16Test(CUDATestCase):
252
273
 
253
274
  np.testing.assert_allclose(arr, [8], atol=1e-2)
254
275
 
276
+ def test_use_binding_inside_dfunc(self):
277
+ self.skip_unsupported()
278
+
279
+ @cuda.jit(device=True)
280
+ def f(arr):
281
+ pi = nv_bfloat16(3.14)
282
+ three = htrunc(pi)
283
+ arr[0] = float32(three)
284
+
285
+ @cuda.jit
286
+ def kernel(arr):
287
+ f(arr)
288
+
289
+ arr = np.zeros(1, np.float32)
290
+ kernel[1, 1](arr)
291
+
292
+ np.testing.assert_allclose(arr, [3], atol=1e-2)
293
+
255
294
 
256
295
  if __name__ == "__main__":
257
296
  unittest.main()
@@ -1,8 +1,6 @@
1
1
  import multiprocessing
2
2
  import os
3
3
  import shutil
4
- import subprocess
5
- import sys
6
4
  import unittest
7
5
  import warnings
8
6
 
@@ -163,55 +161,6 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
163
161
  f = mod.renamed_function2
164
162
  self.assertPreciseEqual(f(2), 8)
165
163
 
166
- @skip_unless_cc_60
167
- @skip_if_cudadevrt_missing
168
- @skip_if_mvc_enabled("CG not supported with MVC")
169
- def test_cache_cg(self):
170
- # Functions using cooperative groups should be cacheable. See Issue
171
- # #8888: https://github.com/numba/numba/issues/8888
172
- self.check_pycache(0)
173
- mod = self.import_module()
174
- self.check_pycache(0)
175
-
176
- mod.cg_usecase(0)
177
- self.check_pycache(2) # 1 index, 1 data
178
-
179
- # Check the code runs ok from another process
180
- self.run_in_separate_process()
181
-
182
- @skip_unless_cc_60
183
- @skip_if_cudadevrt_missing
184
- @skip_if_mvc_enabled("CG not supported with MVC")
185
- def test_cache_cg_clean_run(self):
186
- # See Issue #9432: https://github.com/numba/numba/issues/9432
187
- # If a cached function using CG sync was the first thing to compile,
188
- # the compile would fail.
189
- self.check_pycache(0)
190
-
191
- # This logic is modelled on run_in_separate_process(), but executes the
192
- # CG usecase directly in the subprocess.
193
- code = """if 1:
194
- import sys
195
-
196
- sys.path.insert(0, %(tempdir)r)
197
- mod = __import__(%(modname)r)
198
- mod.cg_usecase(0)
199
- """ % dict(tempdir=self.tempdir, modname=self.modname)
200
-
201
- popen = subprocess.Popen(
202
- [sys.executable, "-c", code],
203
- stdout=subprocess.PIPE,
204
- stderr=subprocess.PIPE,
205
- )
206
- out, err = popen.communicate(timeout=60)
207
- if popen.returncode != 0:
208
- raise AssertionError(
209
- "process failed with code %s: \n"
210
- "stdout follows\n%s\n"
211
- "stderr follows\n%s\n"
212
- % (popen.returncode, out.decode(), err.decode()),
213
- )
214
-
215
164
  def _test_pycache_fallback(self):
216
165
  """
217
166
  With a disabled __pycache__, test there is a working fallback
@@ -275,6 +224,40 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
275
224
  pass
276
225
 
277
226
 
227
+ @skip_on_cudasim("Simulator does not implement caching")
228
+ class CUDACooperativeGroupTest(SerialMixin, DispatcherCacheUsecasesTest):
229
+ # See Issue #9432: https://github.com/numba/numba/issues/9432
230
+ # If a cached function using CG sync was the first thing to compile,
231
+ # the compile would fail.
232
+ here = os.path.dirname(__file__)
233
+ usecases_file = os.path.join(here, "cg_cache_usecases.py")
234
+ modname = "cuda_cooperative_caching_test_fodder"
235
+
236
+ def setUp(self):
237
+ DispatcherCacheUsecasesTest.setUp(self)
238
+ CUDATestCase.setUp(self)
239
+
240
+ def tearDown(self):
241
+ CUDATestCase.tearDown(self)
242
+ DispatcherCacheUsecasesTest.tearDown(self)
243
+
244
+ @skip_unless_cc_60
245
+ @skip_if_cudadevrt_missing
246
+ @skip_if_mvc_enabled("CG not supported with MVC")
247
+ def test_cache_cg(self):
248
+ # Functions using cooperative groups should be cacheable. See Issue
249
+ # #8888: https://github.com/numba/numba/issues/8888
250
+ self.check_pycache(0)
251
+ mod = self.import_module()
252
+ self.check_pycache(0)
253
+
254
+ mod.cg_usecase(0)
255
+ self.check_pycache(2) # 1 index, 1 data
256
+
257
+ # Check the code runs ok from another process
258
+ self.run_in_separate_process()
259
+
260
+
278
261
  @skip_on_cudasim("Simulator does not implement caching")
279
262
  class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest):
280
263
  here = os.path.dirname(__file__)
@@ -1,8 +1,13 @@
1
1
  from __future__ import print_function
2
2
 
3
+ import os
4
+
5
+ import cffi
6
+
3
7
  import numpy as np
4
8
 
5
9
  from numba import config, cuda, int32
10
+ from numba.types import CPointer
6
11
  from numba.cuda.testing import (
7
12
  unittest,
8
13
  CUDATestCase,
@@ -11,6 +16,9 @@ from numba.cuda.testing import (
11
16
  skip_if_cudadevrt_missing,
12
17
  skip_if_mvc_enabled,
13
18
  )
19
+ from numba.core.typing import signature
20
+
21
+ ffi = cffi.FFI()
14
22
 
15
23
 
16
24
  @cuda.jit
@@ -149,6 +157,34 @@ class TestCudaCooperativeGroups(CUDATestCase):
149
157
  self.assertEqual(blocks1d, blocks2d)
150
158
  self.assertEqual(blocks1d, blocks3d)
151
159
 
160
+ @skip_on_cudasim("External code unsupported on cudasim")
161
+ @skip_unless_cc_60
162
+ def test_external_cooperative_func(self):
163
+ cudapy_test_path = os.path.dirname(__file__)
164
+ tests_path = os.path.dirname(cudapy_test_path)
165
+ data_path = os.path.join(tests_path, "data")
166
+ src = os.path.join(data_path, "cta_barrier.cu")
167
+
168
+ sig = signature(
169
+ CPointer(int32),
170
+ )
171
+ cta_barrier = cuda.declare_device(
172
+ "cta_barrier", sig=sig, link=[src], use_cooperative=True
173
+ )
174
+
175
+ @cuda.jit("void()")
176
+ def kernel():
177
+ cta_barrier()
178
+
179
+ overload = kernel.overloads[()]
180
+ block_size = 32
181
+ grid_size = overload.max_cooperative_grid_blocks(block_size)
182
+
183
+ kernel[grid_size, block_size]()
184
+
185
+ overload = kernel.overloads[()]
186
+ self.assertTrue(overload.cooperative)
187
+
152
188
 
153
189
  if __name__ == "__main__":
154
190
  unittest.main()
@@ -310,6 +310,23 @@ class TestCudaDebugInfo(CUDATestCase):
310
310
  with captured_stdout():
311
311
  self._test_kernel_args_types()
312
312
 
313
+ def test_kernel_args_names(self):
314
+ sig = (types.int32,)
315
+
316
+ @cuda.jit("void(int32)", debug=True, opt=False)
317
+ def f(x):
318
+ z = x # noqa: F841
319
+
320
+ llvm_ir = f.inspect_llvm(sig)
321
+
322
+ # Verify argument name is not prefixed with "arg."
323
+ pat = r"define void @.*\(i32 %\"x\"\)"
324
+ match = re.compile(pat).search(llvm_ir)
325
+ self.assertIsNotNone(match, msg=llvm_ir)
326
+ pat = r"define void @.*\(i32 %\"arg\.x\"\)"
327
+ match = re.compile(pat).search(llvm_ir)
328
+ self.assertIsNone(match, msg=llvm_ir)
329
+
313
330
  def test_llvm_dbg_value(self):
314
331
  sig = (types.int32, types.int32)
315
332
 
@@ -116,6 +116,7 @@ class EnumTest(CUDATestCase):
116
116
  got = cuda_func(arr)
117
117
  self.assertPreciseEqual(expected, got)
118
118
 
119
+ @skip_on_cudasim("No typing context in CUDA simulator")
119
120
  def test_int_enum_no_conversion(self):
120
121
  # Ported from Numba PR #10047: "Fix IntEnumMember.can_convert_to() when
121
122
  # no conversions found", https://github.com/numba/numba/pull/10047.