numba-cuda 0.12.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (60)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/codegen.py +1 -1
  3. numba_cuda/numba/cuda/compiler.py +24 -1
  4. numba_cuda/numba/cuda/cudadrv/driver.py +15 -3
  5. numba_cuda/numba/cuda/cudadrv/nvrtc.py +1 -1
  6. numba_cuda/numba/cuda/cudadrv/nvvm.py +126 -25
  7. numba_cuda/numba/cuda/debuginfo.py +52 -1
  8. numba_cuda/numba/cuda/decorators.py +14 -0
  9. numba_cuda/numba/cuda/dispatcher.py +9 -2
  10. numba_cuda/numba/cuda/lowering.py +83 -4
  11. numba_cuda/numba/cuda/memory_management/__init__.py +1 -0
  12. numba_cuda/numba/cuda/simulator/__init__.py +10 -1
  13. numba_cuda/numba/cuda/simulator/_internal/__init__.py +1 -0
  14. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +0 -0
  15. numba_cuda/numba/cuda/simulator/api.py +17 -0
  16. numba_cuda/numba/cuda/simulator/bf16.py +1 -0
  17. numba_cuda/numba/cuda/simulator/compiler.py +1 -0
  18. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +7 -0
  19. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +4 -0
  20. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +57 -0
  21. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +8 -0
  22. numba_cuda/numba/cuda/simulator/kernel.py +1 -1
  23. numba_cuda/numba/cuda/simulator/kernelapi.py +8 -2
  24. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +1 -0
  25. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +6 -0
  26. numba_cuda/numba/cuda/testing.py +10 -4
  27. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +2 -0
  28. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +15 -6
  29. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +3 -2
  30. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -2
  31. numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -3
  32. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +25 -1
  33. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +11 -4
  34. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +34 -21
  35. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +62 -2
  36. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +4 -2
  37. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +43 -4
  38. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +106 -2
  39. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -0
  40. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -0
  41. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +8 -21
  42. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +6 -6
  43. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +7 -7
  44. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +64 -0
  45. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +60 -58
  46. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +3 -2
  47. numba_cuda/numba/cuda/tests/support.py +1 -1
  48. numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +1 -1
  49. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
  50. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/METADATA +22 -1
  51. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/RECORD +59 -51
  52. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/WHEEL +1 -1
  53. numba_cuda/numba/cuda/runtime/__init__.py +0 -1
  54. /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cu +0 -0
  55. /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cuh +0 -0
  56. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cu +0 -0
  57. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cuh +0 -0
  58. /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.py +0 -0
  59. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/licenses/LICENSE +0 -0
  60. {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py
@@ -1,9 +1,26 @@
+from numba.cuda.cudadrv.driver import CudaAPIError
 import numpy as np
 import threading
 
-from numba import boolean, config, cuda, float32, float64, int32, int64, void
+from numba import (
+    boolean,
+    config,
+    cuda,
+    float32,
+    float64,
+    int32,
+    int64,
+    types,
+    uint32,
+    void,
+)
 from numba.core.errors import TypingError
-from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
+from numba.cuda.testing import (
+    cc_X_or_above,
+    skip_on_cudasim,
+    unittest,
+    CUDATestCase,
+)
 import math
 
 
@@ -466,6 +483,35 @@ class TestDispatcher(CUDATestCase):
         self.assertEqual("Add two integers, kernel version", add_kernel.__doc__)
         self.assertEqual("Add two integers, device version", add_device.__doc__)
 
+    @skip_on_cudasim("Cudasim does not have device pointers")
+    def test_dispatcher_cpointer_arguments(self):
+        ptr = types.CPointer(types.int32)
+        sig = void(ptr, int32, ptr, ptr, uint32)
+
+        @cuda.jit(sig)
+        def axpy(r, a, x, y, n):
+            i = cuda.grid(1)
+            if i < n:
+                r[i] = a * x[i] + y[i]
+
+        N = 16
+        a = 5
+        hx = np.arange(10, dtype=np.int32)
+        hy = np.arange(10, dtype=np.int32) * 2
+        dx = cuda.to_device(hx)
+        dy = cuda.to_device(hy)
+        dr = cuda.device_array_like(dx)
+
+        r_ptr = dr.__cuda_array_interface__["data"][0]
+        x_ptr = dx.__cuda_array_interface__["data"][0]
+        y_ptr = dy.__cuda_array_interface__["data"][0]
+
+        axpy[1, 32](r_ptr, a, x_ptr, y_ptr, N)
+
+        expected = a * hx + hy
+        actual = dr.copy_to_host()
+        np.testing.assert_equal(expected, actual)
+
 
 @skip_on_cudasim("CUDA simulator doesn't implement kernel properties")
 class TestDispatcherKernelProperties(CUDATestCase):
@@ -708,5 +754,63 @@ class TestDispatcherKernelProperties(CUDATestCase):
         self.assertGreaterEqual(local_mem_per_thread, N * 4)
 
 
+@skip_on_cudasim("Simulator does not support launch bounds")
+class TestLaunchBounds(CUDATestCase):
+    def _test_launch_bounds_common(self, launch_bounds):
+        @cuda.jit(launch_bounds=launch_bounds)
+        def f():
+            pass
+
+        # Test successful launch
+        f[1, 128]()
+
+        # Test launch bound exceeded
+        msg = "Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE"
+        with self.assertRaisesRegex(CudaAPIError, msg):
+            f[1, 256]()
+
+        sig = f.signatures[0]
+        ptx = f.inspect_asm(sig)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+        return ptx
+
+    def test_launch_bounds_scalar(self):
+        launch_bounds = 128
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_tuple(self):
+        launch_bounds = (128,)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_min_cta(self):
+        launch_bounds = (128, 2)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    @unittest.skipUnless(
+        cc_X_or_above(9, 0), "CC 9.0 needed for max cluster rank"
+    )
+    def test_launch_bounds_with_max_cluster_rank(self):
+        launch_bounds = (128, 2, 4)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+    def test_too_many_launch_bounds(self):
+        launch_bounds = (128, 2, 4, 8)
+        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+            cuda.jit("void()", launch_bounds=launch_bounds)(lambda: None)
+
+
 if __name__ == "__main__":
     unittest.main()
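
The TestLaunchBounds cases above exercise the new launch_bounds argument to cuda.jit, whose effect the tests check via the .maxntid, .minnctapersm and .maxclusterrank PTX directives. A minimal usage sketch, based only on what these tests show (the kernel body and launch configuration here are illustrative, not part of the package):

from numba import cuda

# A scalar gives the maximum threads per block; a tuple of up to three
# values adds the minimum blocks per SM and, on CC 9.0+, the maximum
# cluster rank.
@cuda.jit(launch_bounds=(128, 2))
def kernel():
    pass

kernel[1, 128]()    # within the declared bound
# kernel[1, 256]() would fail at launch with CUDA_ERROR_INVALID_VALUE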

numba_cuda/numba/cuda/tests/cudapy/test_enums.py
@@ -116,6 +116,7 @@ class EnumTest(CUDATestCase):
         got = cuda_func(arr)
         self.assertPreciseEqual(expected, got)
 
+    @skip_on_cudasim("No typing context in CUDA simulator")
     def test_int_enum_no_conversion(self):
         # Ported from Numba PR #10047: "Fix IntEnumMember.can_convert_to() when
         # no conversions found", https://github.com/numba/numba/pull/10047.

numba_cuda/numba/cuda/tests/cudapy/test_extending.py
@@ -191,7 +191,9 @@ if TEST_BIN_DIR:
     )
 
 
+@skip_on_cudasim("Extensions not supported in the simulator")
 class TestExtendingLinkage(CUDATestCase):
+    @unittest.skipUnless(TEST_BIN_DIR, "Necessary binaries are not available")
     def test_extension_adds_linkable_code(self):
         cuda_major_version = cuda.runtime.get_version()[0]
 

numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py
@@ -118,31 +118,18 @@ class TestFastMathOption(CUDATestCase):
         def tanh_kernel(r, x):
             r[0] = tanh(x)
 
-        def tanh_common_test(cc, criterion):
-            fastptx, _ = compile_ptx(
-                tanh_kernel, (float32[::1], float32), fastmath=True, cc=cc
-            )
-            precptx, _ = compile_ptx(
-                tanh_kernel, (float32[::1], float32), cc=cc
-            )
-            criterion.check(self, fastptx, precptx)
-
-        tanh_common_test(
-            cc=(7, 5),
-            criterion=FastMathCriterion(
-                fast_expected=["tanh.approx.f32 "],
-                prec_unexpected=["tanh.approx.f32 "],
-            ),
+        fastptx, _ = compile_ptx(
+            tanh_kernel, (float32[::1], float32), fastmath=True
         )
+        precptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32))
 
-        tanh_common_test(
-            cc=(7, 0),
-            criterion=FastMathCriterion(
-                fast_expected=["ex2.approx.ftz.f32 ", "rcp.approx.ftz.f32 "],
-                prec_unexpected=["tanh.approx.f32 "],
-            ),
+        criterion = FastMathCriterion(
+            fast_expected=["tanh.approx.f32 "],
+            prec_unexpected=["tanh.approx.f32 "],
         )
 
+        criterion.check(self, fastptx, precptx)
+
     def test_expf(self):
         self._test_fast_math_unary(
             exp,

numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py
@@ -641,7 +641,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hadd_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hadd_scalar, args, cc=(5, 3))
+        ptx, _ = compile_ptx(simple_hadd_scalar, args)
         self.assertIn("add.f16", ptx)
 
     @skip_unless_cc_53
@@ -668,7 +668,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hfma_ptx(self):
         args = (f2[:], f2, f2, f2)
-        ptx, _ = compile_ptx(simple_hfma_scalar, args, cc=(5, 3))
+        ptx, _ = compile_ptx(simple_hfma_scalar, args)
         self.assertIn("fma.rn.f16", ptx)
 
     @skip_unless_cc_53
@@ -693,7 +693,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hsub_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hsub_scalar, args, cc=(5, 3))
+        ptx, _ = compile_ptx(simple_hsub_scalar, args)
         self.assertIn("sub.f16", ptx)
 
     @skip_unless_cc_53
@@ -718,7 +718,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hmul_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hmul_scalar, args, cc=(5, 3))
+        ptx, _ = compile_ptx(simple_hmul_scalar, args)
         self.assertIn("mul.f16", ptx)
 
     @skip_unless_cc_53
@@ -763,7 +763,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hneg_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_hneg_scalar, args, cc=(5, 3))
+        ptx, _ = compile_ptx(simple_hneg_scalar, args)
         self.assertIn("neg.f16", ptx)
 
     @skip_unless_cc_53
@@ -786,7 +786,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_habs_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_habs_scalar, args, cc=(5, 3))
+        ptx, _ = compile_ptx(simple_habs_scalar, args)
         self.assertIn("abs.f16", ptx)
 
     @skip_unless_cc_53

numba_cuda/numba/cuda/tests/cudapy/test_operator.py
@@ -178,7 +178,7 @@ class TestOperatorModule(CUDATestCase):
         args = (f2[:], f2, f2)
         for fn, instr in zip(functions, instrs):
             with self.subTest(instr=instr):
-                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(instr, ptx)
 
     @skip_unless_cc_53
@@ -212,7 +212,7 @@ class TestOperatorModule(CUDATestCase):
 
         for fn, instr in zip(functions, instrs):
             with self.subTest(instr=instr):
-                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(instr, ptx)
 
     @skip_unless_cc_53
@@ -255,13 +255,13 @@ class TestOperatorModule(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_fp16_neg_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_fp16neg, args, cc=(5, 3))
+        ptx, _ = compile_ptx(simple_fp16neg, args)
         self.assertIn("neg.f16", ptx)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_fp16_abs_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_fp16abs, args, cc=(5, 3))
+        ptx, _ = compile_ptx(simple_fp16abs, args)
 
         self.assertIn("abs.f16", ptx)
 
@@ -396,7 +396,7 @@ class TestOperatorModule(CUDATestCase):
 
         for fn, op, s in zip(functions, ops, opstring):
             with self.subTest(op=op):
-                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(s, ptx)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")
@@ -431,7 +431,7 @@ class TestOperatorModule(CUDATestCase):
         for fn, op in zip(functions, ops):
             with self.subTest(op=op):
                 args = (b1[:], f2, from_dtype(np.int8))
-                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(opstring[op], ptx)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")
@@ -475,7 +475,7 @@ class TestOperatorModule(CUDATestCase):
                 with self.subTest(op=op, ty=ty):
                     arg2_ty = np.result_type(np.float16, ty)
                     args = (b1[:], f2, from_dtype(arg2_ty))
-                    ptx, _ = compile_ptx(fn, args, cc=(5, 3))
+                    ptx, _ = compile_ptx(fn, args)
 
                     ops = opstring[op] + opsuffix[arg2_ty]
                     self.assertIn(ops, ptx)

numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py (new file)
@@ -0,0 +1,64 @@
+import unittest
+
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.tests.support import captured_stdout
+
+
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestCPointer(CUDATestCase):
+    """
+    Test simple vector addition
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing
+        # up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_cpointer(self):
+        # ex_cpointer.sig.begin
+        import numpy as np
+        from numba import cuda, types
+
+        # The first kernel argument is a pointer to a uint8 array.
+        # The second argument holds the length as a uint32.
+        # The return type of a kernel is always void.
+        sig = types.void(types.CPointer(types.uint8), types.uint32)
+        # ex_cpointer.sig.end
+
+        # ex_cpointer.kernel.begin
+        @cuda.jit(sig)
+        def add_one(x, n):
+            i = cuda.grid(1)
+            if i < n:
+                x[i] += 1
+
+        # ex_cpointer.kernel.end
+
+        # ex_cpointer.launch.begin
+        x = cuda.to_device(np.arange(10, dtype=np.uint8))
+
+        # Print initial values of x
+        print(x.copy_to_host())  # [0 1 2 3 4 5 6 7 8 9]
+
+        # Obtain a pointer to the data from from the CUDA Array Interface
+        x_ptr = x.__cuda_array_interface__["data"][0]
+        x_len = len(x)
+
+        # Launch the kernel with the pointer and length
+        add_one[1, 32](x_ptr, x_len)
+
+        # Demonstrate that the data was updated by the kernel
+        print(x.copy_to_host())  # [ 1 2 3 4 5 6 7 8 9 10]
+        # ex_cpointer.launch.end
+
+
+if __name__ == "__main__":
+    unittest.main()

numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -3,16 +3,13 @@ import os
 
 import numpy as np
 import unittest
-from numba.cuda.testing import CUDATestCase
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
 from numba.tests.support import run_in_subprocess, override_config
 from numba.cuda import get_current_device
 from numba.cuda.cudadrv.nvrtc import compile
-from numba import types
-from numba.cuda.cudadecl import registry as cuda_decl_registry
+from numba import config, types
 from numba.core.typing import signature
-from numba.cuda.cudaimpl import lower as cuda_lower
 from numba import cuda
-from numba.cuda.runtime.nrt import rtsys, get_include
 from numba.core.typing.templates import AbstractTemplate
 from numba.cuda.cudadrv.linkable_code import (
     CUSource,
@@ -23,67 +20,68 @@ from numba.cuda.cudadrv.linkable_code import (
     Object,
 )
 
-
 TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
 
-if TEST_BIN_DIR:
-
-    def make_linkable_code(name, kind, mode):
-        path = os.path.join(TEST_BIN_DIR, name)
-        with open(path, mode) as f:
-            contents = f.read()
-        return kind(contents, nrt=True)
-
-    nrt_extern_a = make_linkable_code("nrt_extern.a", Archive, "rb")
-    nrt_extern_cubin = make_linkable_code("nrt_extern.cubin", Cubin, "rb")
-    nrt_extern_cu = make_linkable_code(
-        "nrt_extern.cu",
-        CUSource,
-        "rb",
-    )
-    nrt_extern_fatbin = make_linkable_code("nrt_extern.fatbin", Fatbin, "rb")
-    nrt_extern_fatbin_multi = make_linkable_code(
-        "nrt_extern_multi.fatbin", Fatbin, "rb"
-    )
-    nrt_extern_o = make_linkable_code("nrt_extern.o", Object, "rb")
-    nrt_extern_ptx = make_linkable_code("nrt_extern.ptx", PTXSource, "rb")
+if not config.ENABLE_CUDASIM:
+    from numba.cuda.memory_management.nrt import rtsys, get_include
+    from numba.cuda.cudadecl import registry as cuda_decl_registry
+    from numba.cuda.cudaimpl import lower as cuda_lower
 
+    def allocate_deallocate_handle():
+        """
+        Handle to call NRT_Allocate and NRT_Free
+        """
+        pass
 
-def allocate_deallocate_handle():
-    """
-    Handle to call NRT_Allocate and NRT_Free
-    """
-    pass
-
-
-@cuda_decl_registry.register_global(allocate_deallocate_handle)
-class AllocateShimImpl(AbstractTemplate):
-    def generic(self, args, kws):
-        return signature(types.void)
-
+    @cuda_decl_registry.register_global(allocate_deallocate_handle)
+    class AllocateShimImpl(AbstractTemplate):
+        def generic(self, args, kws):
+            return signature(types.void)
 
-device_fun_shim = cuda.declare_device(
-    "device_allocate_deallocate", types.int32()
-)
+    device_fun_shim = cuda.declare_device(
+        "device_allocate_deallocate", types.int32()
+    )
 
+    # wrapper to turn the above into a python callable
+    def call_device_fun_shim():
+        return device_fun_shim()
+
+    @cuda_lower(allocate_deallocate_handle)
+    def allocate_deallocate_impl(context, builder, sig, args):
+        sig_ = types.int32()
+        # call the external function, passing the pointer
+        result = context.compile_internal(
+            builder,
+            call_device_fun_shim,
+            sig_,
+            (),
+        )
 
-# wrapper to turn the above into a python callable
-def call_device_fun_shim():
-    return device_fun_shim()
+        return result
 
+    if TEST_BIN_DIR:
 
-@cuda_lower(allocate_deallocate_handle)
-def allocate_deallocate_impl(context, builder, sig, args):
-    sig_ = types.int32()
-    # call the external function, passing the pointer
-    result = context.compile_internal(
-        builder,
-        call_device_fun_shim,
-        sig_,
-        (),
-    )
+        def make_linkable_code(name, kind, mode):
+            path = os.path.join(TEST_BIN_DIR, name)
+            with open(path, mode) as f:
+                contents = f.read()
+            return kind(contents, nrt=True)
 
-    return result
+        nrt_extern_a = make_linkable_code("nrt_extern.a", Archive, "rb")
+        nrt_extern_cubin = make_linkable_code("nrt_extern.cubin", Cubin, "rb")
+        nrt_extern_cu = make_linkable_code(
+            "nrt_extern.cu",
+            CUSource,
+            "rb",
+        )
+        nrt_extern_fatbin = make_linkable_code(
+            "nrt_extern.fatbin", Fatbin, "rb"
+        )
+        nrt_extern_fatbin_multi = make_linkable_code(
+            "nrt_extern_multi.fatbin", Fatbin, "rb"
+        )
+        nrt_extern_o = make_linkable_code("nrt_extern.o", Object, "rb")
+        nrt_extern_ptx = make_linkable_code("nrt_extern.ptx", PTXSource, "rb")
 
 
 class TestNrtBasic(CUDATestCase):
@@ -104,6 +102,7 @@ class TestNrtBasic(CUDATestCase):
         g[1, 1]()
         cuda.synchronize()
 
+    @skip_on_cudasim("CUDA Simulator does not produce PTX")
    def test_nrt_ptx_contains_refcount(self):
        @cuda.jit
        def f(x):
@@ -157,6 +156,7 @@ class TestNrtLinking(CUDATestCase):
         with override_config("CUDA_ENABLE_NRT", True):
             super(TestNrtLinking, self).run(result)
 
+    @skip_on_cudasim("CUDA Simulator does not link PTX")
     def test_nrt_detect_linked_ptx_file(self):
         src = f"#include <{get_include()}/nrt.cuh>"
         src += """
@@ -176,6 +176,7 @@ class TestNrtLinking(CUDATestCase):
         kernel[1, 1]()
 
     @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    @skip_on_cudasim("CUDA Simulator does not link code")
     def test_nrt_detect_linkable_code(self):
         codes = (
             nrt_extern_a,
@@ -196,6 +197,7 @@ class TestNrtLinking(CUDATestCase):
                 kernel[1, 1]()
 
 
+@skip_on_cudasim("CUDASIM does not have NRT statistics")
 class TestNrtStatistics(CUDATestCase):
     def setUp(self):
         self._stream = cuda.default_stream()
@@ -213,7 +215,7 @@ class TestNrtStatistics(CUDATestCase):
         # Checks that explicitly turning the stats on via the env var works.
         src = """if 1:
         from numba import cuda
-        from numba.cuda.runtime import rtsys
+        from numba.cuda.memory_management import rtsys
        import numpy as np
 
         @cuda.jit
@@ -252,7 +254,7 @@ class TestNrtStatistics(CUDATestCase):
         src = """if 1:
         from numba import cuda
         import numpy as np
-        from numba.cuda.runtime import rtsys
+        from numba.cuda.memory_management import rtsys
 
         @cuda.jit
         def foo():

numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py
@@ -1,13 +1,14 @@
 import numpy as np
 import unittest
 from numba.tests.support import override_config
-from numba.cuda.runtime import rtsys
+from numba.cuda.memory_management import rtsys
 from numba.cuda.tests.support import EnableNRTStatsMixin
-from numba.cuda.testing import CUDATestCase
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
 
 from numba import cuda
 
 
+@skip_on_cudasim("No refcounting in the simulator")
 class TestNrtRefCt(EnableNRTStatsMixin, CUDATestCase):
     def setUp(self):
         super(TestNrtRefCt, self).setUp()

numba_cuda/numba/cuda/tests/support.py
@@ -1,4 +1,4 @@
-from numba.cuda.runtime.nrt import rtsys
+from numba.cuda.memory_management.nrt import rtsys
 
 
 class EnableNRTStatsMixin(object):

numba_cuda/numba/cuda/tests/test_binary_generation/Makefile
@@ -40,7 +40,7 @@ LTOIR_FLAGS := $(LTOIR_GENCODE) -dc
 
 OUTPUT_DIR := ./
 
-NRT_INCLUDE_DIR := $(shell python -c "from numba.cuda.runtime.nrt import get_include; print(get_include())")
+NRT_INCLUDE_DIR := $(shell python -c "from numba.cuda.memory_management.nrt import get_include; print(get_include())")
 
 all:
 	@echo "GPU CC: $(GPU_CC)"

numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py
@@ -7,7 +7,7 @@ import subprocess
 import sys
 
 from cuda import nvrtc
-from numba.cuda.runtime.nrt import get_include
+from numba.cuda.memory_management.nrt import get_include
 
 # Magic number found at the start of an LTO-IR file
 LTOIR_MAGIC = 0x7F4E43ED
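
The hunks above all follow the relocation of the NRT support code from numba.cuda.runtime to numba.cuda.memory_management (the {runtime → memory_management} renames in the file list). A minimal sketch of the updated import path, mirroring the imports used in the tests above (the print call is illustrative only):

# New location of the NRT runtime singleton and header helper in 0.14.0;
# these were previously imported from numba.cuda.runtime.nrt.
from numba.cuda.memory_management.nrt import rtsys, get_include

# get_include() returns the directory containing nrt.cuh, e.g. for use in
# an '#include <.../nrt.cuh>' line of a CUDA source string.
print(get_include())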

{numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: numba-cuda
-Version: 0.12.1
+Version: 0.14.0
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause
@@ -12,6 +12,27 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numba>=0.59.1
+Provides-Extra: cu11
+Requires-Dist: cuda-python==11.8.*; extra == "cu11"
+Requires-Dist: nvidia-cuda-nvcc-cu11; extra == "cu11"
+Requires-Dist: nvidia-cuda-runtime-cu11; extra == "cu11"
+Requires-Dist: nvidia-cuda-nvrtc-cu11; extra == "cu11"
+Provides-Extra: cu12
+Requires-Dist: cuda-python==12.9.*; extra == "cu12"
+Requires-Dist: nvidia-cuda-nvcc-cu12; extra == "cu12"
+Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
+Requires-Dist: nvidia-cuda-nvrtc-cu12; extra == "cu12"
+Provides-Extra: test
+Requires-Dist: psutil; extra == "test"
+Requires-Dist: cffi; extra == "test"
+Requires-Dist: pytest; extra == "test"
+Provides-Extra: test-cu11
+Requires-Dist: numba-cuda[test]; extra == "test-cu11"
+Requires-Dist: nvidia-curand-cu11; extra == "test-cu11"
+Provides-Extra: test-cu12
+Requires-Dist: numba-cuda[test]; extra == "test-cu12"
+Requires-Dist: nvidia-curand-cu12; extra == "test-cu12"
+Requires-Dist: pynvjitlink-cu12; extra == "test-cu12"
 Dynamic: license-file
 
 <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>