numba-cuda 0.12.1__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/codegen.py +1 -1
- numba_cuda/numba/cuda/compiler.py +24 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +15 -3
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +1 -1
- numba_cuda/numba/cuda/cudadrv/nvvm.py +126 -25
- numba_cuda/numba/cuda/debuginfo.py +52 -1
- numba_cuda/numba/cuda/decorators.py +14 -0
- numba_cuda/numba/cuda/dispatcher.py +9 -2
- numba_cuda/numba/cuda/lowering.py +83 -4
- numba_cuda/numba/cuda/memory_management/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/__init__.py +10 -1
- numba_cuda/numba/cuda/simulator/_internal/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +0 -0
- numba_cuda/numba/cuda/simulator/api.py +17 -0
- numba_cuda/numba/cuda/simulator/bf16.py +1 -0
- numba_cuda/numba/cuda/simulator/compiler.py +1 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +7 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +57 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +8 -0
- numba_cuda/numba/cuda/simulator/kernel.py +1 -1
- numba_cuda/numba/cuda/simulator/kernelapi.py +8 -2
- numba_cuda/numba/cuda/simulator/memory_management/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/memory_management/nrt.py +6 -0
- numba_cuda/numba/cuda/testing.py +10 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +2 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +15 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +3 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -3
- numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +25 -1
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +11 -4
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +34 -21
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +62 -2
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +4 -2
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +43 -4
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +106 -2
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +8 -21
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +7 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +64 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +60 -58
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +3 -2
- numba_cuda/numba/cuda/tests/support.py +1 -1
- numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +1 -1
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/METADATA +22 -1
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/RECORD +59 -51
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/WHEEL +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +0 -1
- /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cu +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cuh +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cu +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cuh +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.py +0 -0
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,26 @@
|
|
1
|
+
from numba.cuda.cudadrv.driver import CudaAPIError
|
1
2
|
import numpy as np
|
2
3
|
import threading
|
3
4
|
|
4
|
-
from numba import
|
5
|
+
from numba import (
|
6
|
+
boolean,
|
7
|
+
config,
|
8
|
+
cuda,
|
9
|
+
float32,
|
10
|
+
float64,
|
11
|
+
int32,
|
12
|
+
int64,
|
13
|
+
types,
|
14
|
+
uint32,
|
15
|
+
void,
|
16
|
+
)
|
5
17
|
from numba.core.errors import TypingError
|
6
|
-
from numba.cuda.testing import
|
18
|
+
from numba.cuda.testing import (
|
19
|
+
cc_X_or_above,
|
20
|
+
skip_on_cudasim,
|
21
|
+
unittest,
|
22
|
+
CUDATestCase,
|
23
|
+
)
|
7
24
|
import math
|
8
25
|
|
9
26
|
|
@@ -466,6 +483,35 @@ class TestDispatcher(CUDATestCase):
|
|
466
483
|
self.assertEqual("Add two integers, kernel version", add_kernel.__doc__)
|
467
484
|
self.assertEqual("Add two integers, device version", add_device.__doc__)
|
468
485
|
|
486
|
+
@skip_on_cudasim("Cudasim does not have device pointers")
|
487
|
+
def test_dispatcher_cpointer_arguments(self):
|
488
|
+
ptr = types.CPointer(types.int32)
|
489
|
+
sig = void(ptr, int32, ptr, ptr, uint32)
|
490
|
+
|
491
|
+
@cuda.jit(sig)
|
492
|
+
def axpy(r, a, x, y, n):
|
493
|
+
i = cuda.grid(1)
|
494
|
+
if i < n:
|
495
|
+
r[i] = a * x[i] + y[i]
|
496
|
+
|
497
|
+
N = 16
|
498
|
+
a = 5
|
499
|
+
hx = np.arange(10, dtype=np.int32)
|
500
|
+
hy = np.arange(10, dtype=np.int32) * 2
|
501
|
+
dx = cuda.to_device(hx)
|
502
|
+
dy = cuda.to_device(hy)
|
503
|
+
dr = cuda.device_array_like(dx)
|
504
|
+
|
505
|
+
r_ptr = dr.__cuda_array_interface__["data"][0]
|
506
|
+
x_ptr = dx.__cuda_array_interface__["data"][0]
|
507
|
+
y_ptr = dy.__cuda_array_interface__["data"][0]
|
508
|
+
|
509
|
+
axpy[1, 32](r_ptr, a, x_ptr, y_ptr, N)
|
510
|
+
|
511
|
+
expected = a * hx + hy
|
512
|
+
actual = dr.copy_to_host()
|
513
|
+
np.testing.assert_equal(expected, actual)
|
514
|
+
|
469
515
|
|
470
516
|
@skip_on_cudasim("CUDA simulator doesn't implement kernel properties")
|
471
517
|
class TestDispatcherKernelProperties(CUDATestCase):
|
@@ -708,5 +754,63 @@ class TestDispatcherKernelProperties(CUDATestCase):
|
|
708
754
|
self.assertGreaterEqual(local_mem_per_thread, N * 4)
|
709
755
|
|
710
756
|
|
757
|
+
@skip_on_cudasim("Simulator does not support launch bounds")
|
758
|
+
class TestLaunchBounds(CUDATestCase):
|
759
|
+
def _test_launch_bounds_common(self, launch_bounds):
|
760
|
+
@cuda.jit(launch_bounds=launch_bounds)
|
761
|
+
def f():
|
762
|
+
pass
|
763
|
+
|
764
|
+
# Test successful launch
|
765
|
+
f[1, 128]()
|
766
|
+
|
767
|
+
# Test launch bound exceeded
|
768
|
+
msg = "Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE"
|
769
|
+
with self.assertRaisesRegex(CudaAPIError, msg):
|
770
|
+
f[1, 256]()
|
771
|
+
|
772
|
+
sig = f.signatures[0]
|
773
|
+
ptx = f.inspect_asm(sig)
|
774
|
+
self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
|
775
|
+
|
776
|
+
return ptx
|
777
|
+
|
778
|
+
def test_launch_bounds_scalar(self):
|
779
|
+
launch_bounds = 128
|
780
|
+
ptx = self._test_launch_bounds_common(launch_bounds)
|
781
|
+
|
782
|
+
self.assertNotIn(".minnctapersm", ptx)
|
783
|
+
self.assertNotIn(".maxclusterrank", ptx)
|
784
|
+
|
785
|
+
def test_launch_bounds_tuple(self):
|
786
|
+
launch_bounds = (128,)
|
787
|
+
ptx = self._test_launch_bounds_common(launch_bounds)
|
788
|
+
|
789
|
+
self.assertNotIn(".minnctapersm", ptx)
|
790
|
+
self.assertNotIn(".maxclusterrank", ptx)
|
791
|
+
|
792
|
+
def test_launch_bounds_with_min_cta(self):
|
793
|
+
launch_bounds = (128, 2)
|
794
|
+
ptx = self._test_launch_bounds_common(launch_bounds)
|
795
|
+
|
796
|
+
self.assertRegex(ptx, r".minnctapersm\s+2")
|
797
|
+
self.assertNotIn(".maxclusterrank", ptx)
|
798
|
+
|
799
|
+
@unittest.skipUnless(
|
800
|
+
cc_X_or_above(9, 0), "CC 9.0 needed for max cluster rank"
|
801
|
+
)
|
802
|
+
def test_launch_bounds_with_max_cluster_rank(self):
|
803
|
+
launch_bounds = (128, 2, 4)
|
804
|
+
ptx = self._test_launch_bounds_common(launch_bounds)
|
805
|
+
|
806
|
+
self.assertRegex(ptx, r".minnctapersm\s+2")
|
807
|
+
self.assertRegex(ptx, r".maxclusterrank\s+4")
|
808
|
+
|
809
|
+
def test_too_many_launch_bounds(self):
|
810
|
+
launch_bounds = (128, 2, 4, 8)
|
811
|
+
with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
|
812
|
+
cuda.jit("void()", launch_bounds=launch_bounds)(lambda: None)
|
813
|
+
|
814
|
+
|
711
815
|
if __name__ == "__main__":
|
712
816
|
unittest.main()
|
@@ -116,6 +116,7 @@ class EnumTest(CUDATestCase):
|
|
116
116
|
got = cuda_func(arr)
|
117
117
|
self.assertPreciseEqual(expected, got)
|
118
118
|
|
119
|
+
@skip_on_cudasim("No typing context in CUDA simulator")
|
119
120
|
def test_int_enum_no_conversion(self):
|
120
121
|
# Ported from Numba PR #10047: "Fix IntEnumMember.can_convert_to() when
|
121
122
|
# no conversions found", https://github.com/numba/numba/pull/10047.
|
@@ -191,7 +191,9 @@ if TEST_BIN_DIR:
|
|
191
191
|
)
|
192
192
|
|
193
193
|
|
194
|
+
@skip_on_cudasim("Extensions not supported in the simulator")
|
194
195
|
class TestExtendingLinkage(CUDATestCase):
|
196
|
+
@unittest.skipUnless(TEST_BIN_DIR, "Necessary binaries are not available")
|
195
197
|
def test_extension_adds_linkable_code(self):
|
196
198
|
cuda_major_version = cuda.runtime.get_version()[0]
|
197
199
|
|
@@ -118,31 +118,18 @@ class TestFastMathOption(CUDATestCase):
|
|
118
118
|
def tanh_kernel(r, x):
|
119
119
|
r[0] = tanh(x)
|
120
120
|
|
121
|
-
|
122
|
-
|
123
|
-
tanh_kernel, (float32[::1], float32), fastmath=True, cc=cc
|
124
|
-
)
|
125
|
-
precptx, _ = compile_ptx(
|
126
|
-
tanh_kernel, (float32[::1], float32), cc=cc
|
127
|
-
)
|
128
|
-
criterion.check(self, fastptx, precptx)
|
129
|
-
|
130
|
-
tanh_common_test(
|
131
|
-
cc=(7, 5),
|
132
|
-
criterion=FastMathCriterion(
|
133
|
-
fast_expected=["tanh.approx.f32 "],
|
134
|
-
prec_unexpected=["tanh.approx.f32 "],
|
135
|
-
),
|
121
|
+
fastptx, _ = compile_ptx(
|
122
|
+
tanh_kernel, (float32[::1], float32), fastmath=True
|
136
123
|
)
|
124
|
+
precptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32))
|
137
125
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
fast_expected=["ex2.approx.ftz.f32 ", "rcp.approx.ftz.f32 "],
|
142
|
-
prec_unexpected=["tanh.approx.f32 "],
|
143
|
-
),
|
126
|
+
criterion = FastMathCriterion(
|
127
|
+
fast_expected=["tanh.approx.f32 "],
|
128
|
+
prec_unexpected=["tanh.approx.f32 "],
|
144
129
|
)
|
145
130
|
|
131
|
+
criterion.check(self, fastptx, precptx)
|
132
|
+
|
146
133
|
def test_expf(self):
|
147
134
|
self._test_fast_math_unary(
|
148
135
|
exp,
|
@@ -641,7 +641,7 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
641
641
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
642
642
|
def test_hadd_ptx(self):
|
643
643
|
args = (f2[:], f2, f2)
|
644
|
-
ptx, _ = compile_ptx(simple_hadd_scalar, args
|
644
|
+
ptx, _ = compile_ptx(simple_hadd_scalar, args)
|
645
645
|
self.assertIn("add.f16", ptx)
|
646
646
|
|
647
647
|
@skip_unless_cc_53
|
@@ -668,7 +668,7 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
668
668
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
669
669
|
def test_hfma_ptx(self):
|
670
670
|
args = (f2[:], f2, f2, f2)
|
671
|
-
ptx, _ = compile_ptx(simple_hfma_scalar, args
|
671
|
+
ptx, _ = compile_ptx(simple_hfma_scalar, args)
|
672
672
|
self.assertIn("fma.rn.f16", ptx)
|
673
673
|
|
674
674
|
@skip_unless_cc_53
|
@@ -693,7 +693,7 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
693
693
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
694
694
|
def test_hsub_ptx(self):
|
695
695
|
args = (f2[:], f2, f2)
|
696
|
-
ptx, _ = compile_ptx(simple_hsub_scalar, args
|
696
|
+
ptx, _ = compile_ptx(simple_hsub_scalar, args)
|
697
697
|
self.assertIn("sub.f16", ptx)
|
698
698
|
|
699
699
|
@skip_unless_cc_53
|
@@ -718,7 +718,7 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
718
718
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
719
719
|
def test_hmul_ptx(self):
|
720
720
|
args = (f2[:], f2, f2)
|
721
|
-
ptx, _ = compile_ptx(simple_hmul_scalar, args
|
721
|
+
ptx, _ = compile_ptx(simple_hmul_scalar, args)
|
722
722
|
self.assertIn("mul.f16", ptx)
|
723
723
|
|
724
724
|
@skip_unless_cc_53
|
@@ -763,7 +763,7 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
763
763
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
764
764
|
def test_hneg_ptx(self):
|
765
765
|
args = (f2[:], f2)
|
766
|
-
ptx, _ = compile_ptx(simple_hneg_scalar, args
|
766
|
+
ptx, _ = compile_ptx(simple_hneg_scalar, args)
|
767
767
|
self.assertIn("neg.f16", ptx)
|
768
768
|
|
769
769
|
@skip_unless_cc_53
|
@@ -786,7 +786,7 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
786
786
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
787
787
|
def test_habs_ptx(self):
|
788
788
|
args = (f2[:], f2)
|
789
|
-
ptx, _ = compile_ptx(simple_habs_scalar, args
|
789
|
+
ptx, _ = compile_ptx(simple_habs_scalar, args)
|
790
790
|
self.assertIn("abs.f16", ptx)
|
791
791
|
|
792
792
|
@skip_unless_cc_53
|
@@ -178,7 +178,7 @@ class TestOperatorModule(CUDATestCase):
|
|
178
178
|
args = (f2[:], f2, f2)
|
179
179
|
for fn, instr in zip(functions, instrs):
|
180
180
|
with self.subTest(instr=instr):
|
181
|
-
ptx, _ = compile_ptx(fn, args
|
181
|
+
ptx, _ = compile_ptx(fn, args)
|
182
182
|
self.assertIn(instr, ptx)
|
183
183
|
|
184
184
|
@skip_unless_cc_53
|
@@ -212,7 +212,7 @@ class TestOperatorModule(CUDATestCase):
|
|
212
212
|
|
213
213
|
for fn, instr in zip(functions, instrs):
|
214
214
|
with self.subTest(instr=instr):
|
215
|
-
ptx, _ = compile_ptx(fn, args
|
215
|
+
ptx, _ = compile_ptx(fn, args)
|
216
216
|
self.assertIn(instr, ptx)
|
217
217
|
|
218
218
|
@skip_unless_cc_53
|
@@ -255,13 +255,13 @@ class TestOperatorModule(CUDATestCase):
|
|
255
255
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
256
256
|
def test_fp16_neg_ptx(self):
|
257
257
|
args = (f2[:], f2)
|
258
|
-
ptx, _ = compile_ptx(simple_fp16neg, args
|
258
|
+
ptx, _ = compile_ptx(simple_fp16neg, args)
|
259
259
|
self.assertIn("neg.f16", ptx)
|
260
260
|
|
261
261
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
262
262
|
def test_fp16_abs_ptx(self):
|
263
263
|
args = (f2[:], f2)
|
264
|
-
ptx, _ = compile_ptx(simple_fp16abs, args
|
264
|
+
ptx, _ = compile_ptx(simple_fp16abs, args)
|
265
265
|
|
266
266
|
self.assertIn("abs.f16", ptx)
|
267
267
|
|
@@ -396,7 +396,7 @@ class TestOperatorModule(CUDATestCase):
|
|
396
396
|
|
397
397
|
for fn, op, s in zip(functions, ops, opstring):
|
398
398
|
with self.subTest(op=op):
|
399
|
-
ptx, _ = compile_ptx(fn, args
|
399
|
+
ptx, _ = compile_ptx(fn, args)
|
400
400
|
self.assertIn(s, ptx)
|
401
401
|
|
402
402
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
@@ -431,7 +431,7 @@ class TestOperatorModule(CUDATestCase):
|
|
431
431
|
for fn, op in zip(functions, ops):
|
432
432
|
with self.subTest(op=op):
|
433
433
|
args = (b1[:], f2, from_dtype(np.int8))
|
434
|
-
ptx, _ = compile_ptx(fn, args
|
434
|
+
ptx, _ = compile_ptx(fn, args)
|
435
435
|
self.assertIn(opstring[op], ptx)
|
436
436
|
|
437
437
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
@@ -475,7 +475,7 @@ class TestOperatorModule(CUDATestCase):
|
|
475
475
|
with self.subTest(op=op, ty=ty):
|
476
476
|
arg2_ty = np.result_type(np.float16, ty)
|
477
477
|
args = (b1[:], f2, from_dtype(arg2_ty))
|
478
|
-
ptx, _ = compile_ptx(fn, args
|
478
|
+
ptx, _ = compile_ptx(fn, args)
|
479
479
|
|
480
480
|
ops = opstring[op] + opsuffix[arg2_ty]
|
481
481
|
self.assertIn(ops, ptx)
|
@@ -0,0 +1,64 @@
|
|
1
|
+
import unittest
|
2
|
+
|
3
|
+
from numba.cuda.testing import CUDATestCase, skip_on_cudasim
|
4
|
+
from numba.tests.support import captured_stdout
|
5
|
+
|
6
|
+
|
7
|
+
@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
|
8
|
+
class TestCPointer(CUDATestCase):
|
9
|
+
"""
|
10
|
+
Test the CPointer kernel argument documentation example
|
11
|
+
"""
|
12
|
+
|
13
|
+
def setUp(self):
|
14
|
+
# Prevent output from this test showing
|
15
|
+
# up when running the test suite
|
16
|
+
self._captured_stdout = captured_stdout()
|
17
|
+
self._captured_stdout.__enter__()
|
18
|
+
super().setUp()
|
19
|
+
|
20
|
+
def tearDown(self):
|
21
|
+
# No exception type, value, or traceback
|
22
|
+
self._captured_stdout.__exit__(None, None, None)
|
23
|
+
super().tearDown()
|
24
|
+
|
25
|
+
def test_ex_cpointer(self):
|
26
|
+
# ex_cpointer.sig.begin
|
27
|
+
import numpy as np
|
28
|
+
from numba import cuda, types
|
29
|
+
|
30
|
+
# The first kernel argument is a pointer to a uint8 array.
|
31
|
+
# The second argument holds the length as a uint32.
|
32
|
+
# The return type of a kernel is always void.
|
33
|
+
sig = types.void(types.CPointer(types.uint8), types.uint32)
|
34
|
+
# ex_cpointer.sig.end
|
35
|
+
|
36
|
+
# ex_cpointer.kernel.begin
|
37
|
+
@cuda.jit(sig)
|
38
|
+
def add_one(x, n):
|
39
|
+
i = cuda.grid(1)
|
40
|
+
if i < n:
|
41
|
+
x[i] += 1
|
42
|
+
|
43
|
+
# ex_cpointer.kernel.end
|
44
|
+
|
45
|
+
# ex_cpointer.launch.begin
|
46
|
+
x = cuda.to_device(np.arange(10, dtype=np.uint8))
|
47
|
+
|
48
|
+
# Print initial values of x
|
49
|
+
print(x.copy_to_host()) # [0 1 2 3 4 5 6 7 8 9]
|
50
|
+
|
51
|
+
# Obtain a pointer to the data from the CUDA Array Interface
|
52
|
+
x_ptr = x.__cuda_array_interface__["data"][0]
|
53
|
+
x_len = len(x)
|
54
|
+
|
55
|
+
# Launch the kernel with the pointer and length
|
56
|
+
add_one[1, 32](x_ptr, x_len)
|
57
|
+
|
58
|
+
# Demonstrate that the data was updated by the kernel
|
59
|
+
print(x.copy_to_host()) # [ 1 2 3 4 5 6 7 8 9 10]
|
60
|
+
# ex_cpointer.launch.end
|
61
|
+
|
62
|
+
|
63
|
+
if __name__ == "__main__":
|
64
|
+
unittest.main()
|
@@ -3,16 +3,13 @@ import os
|
|
3
3
|
|
4
4
|
import numpy as np
|
5
5
|
import unittest
|
6
|
-
from numba.cuda.testing import CUDATestCase
|
6
|
+
from numba.cuda.testing import CUDATestCase, skip_on_cudasim
|
7
7
|
from numba.tests.support import run_in_subprocess, override_config
|
8
8
|
from numba.cuda import get_current_device
|
9
9
|
from numba.cuda.cudadrv.nvrtc import compile
|
10
|
-
from numba import types
|
11
|
-
from numba.cuda.cudadecl import registry as cuda_decl_registry
|
10
|
+
from numba import config, types
|
12
11
|
from numba.core.typing import signature
|
13
|
-
from numba.cuda.cudaimpl import lower as cuda_lower
|
14
12
|
from numba import cuda
|
15
|
-
from numba.cuda.runtime.nrt import rtsys, get_include
|
16
13
|
from numba.core.typing.templates import AbstractTemplate
|
17
14
|
from numba.cuda.cudadrv.linkable_code import (
|
18
15
|
CUSource,
|
@@ -23,67 +20,68 @@ from numba.cuda.cudadrv.linkable_code import (
|
|
23
20
|
Object,
|
24
21
|
)
|
25
22
|
|
26
|
-
|
27
23
|
TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
|
28
24
|
|
29
|
-
if
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
with open(path, mode) as f:
|
34
|
-
contents = f.read()
|
35
|
-
return kind(contents, nrt=True)
|
36
|
-
|
37
|
-
nrt_extern_a = make_linkable_code("nrt_extern.a", Archive, "rb")
|
38
|
-
nrt_extern_cubin = make_linkable_code("nrt_extern.cubin", Cubin, "rb")
|
39
|
-
nrt_extern_cu = make_linkable_code(
|
40
|
-
"nrt_extern.cu",
|
41
|
-
CUSource,
|
42
|
-
"rb",
|
43
|
-
)
|
44
|
-
nrt_extern_fatbin = make_linkable_code("nrt_extern.fatbin", Fatbin, "rb")
|
45
|
-
nrt_extern_fatbin_multi = make_linkable_code(
|
46
|
-
"nrt_extern_multi.fatbin", Fatbin, "rb"
|
47
|
-
)
|
48
|
-
nrt_extern_o = make_linkable_code("nrt_extern.o", Object, "rb")
|
49
|
-
nrt_extern_ptx = make_linkable_code("nrt_extern.ptx", PTXSource, "rb")
|
25
|
+
if not config.ENABLE_CUDASIM:
|
26
|
+
from numba.cuda.memory_management.nrt import rtsys, get_include
|
27
|
+
from numba.cuda.cudadecl import registry as cuda_decl_registry
|
28
|
+
from numba.cuda.cudaimpl import lower as cuda_lower
|
50
29
|
|
30
|
+
def allocate_deallocate_handle():
|
31
|
+
"""
|
32
|
+
Handle to call NRT_Allocate and NRT_Free
|
33
|
+
"""
|
34
|
+
pass
|
51
35
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
pass
|
57
|
-
|
58
|
-
|
59
|
-
@cuda_decl_registry.register_global(allocate_deallocate_handle)
|
60
|
-
class AllocateShimImpl(AbstractTemplate):
|
61
|
-
def generic(self, args, kws):
|
62
|
-
return signature(types.void)
|
63
|
-
|
36
|
+
@cuda_decl_registry.register_global(allocate_deallocate_handle)
|
37
|
+
class AllocateShimImpl(AbstractTemplate):
|
38
|
+
def generic(self, args, kws):
|
39
|
+
return signature(types.void)
|
64
40
|
|
65
|
-
device_fun_shim = cuda.declare_device(
|
66
|
-
|
67
|
-
)
|
41
|
+
device_fun_shim = cuda.declare_device(
|
42
|
+
"device_allocate_deallocate", types.int32()
|
43
|
+
)
|
68
44
|
|
45
|
+
# wrapper to turn the above into a python callable
|
46
|
+
def call_device_fun_shim():
|
47
|
+
return device_fun_shim()
|
48
|
+
|
49
|
+
@cuda_lower(allocate_deallocate_handle)
|
50
|
+
def allocate_deallocate_impl(context, builder, sig, args):
|
51
|
+
sig_ = types.int32()
|
52
|
+
# call the external function, passing the pointer
|
53
|
+
result = context.compile_internal(
|
54
|
+
builder,
|
55
|
+
call_device_fun_shim,
|
56
|
+
sig_,
|
57
|
+
(),
|
58
|
+
)
|
69
59
|
|
70
|
-
|
71
|
-
def call_device_fun_shim():
|
72
|
-
return device_fun_shim()
|
60
|
+
return result
|
73
61
|
|
62
|
+
if TEST_BIN_DIR:
|
74
63
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
builder,
|
81
|
-
call_device_fun_shim,
|
82
|
-
sig_,
|
83
|
-
(),
|
84
|
-
)
|
64
|
+
def make_linkable_code(name, kind, mode):
|
65
|
+
path = os.path.join(TEST_BIN_DIR, name)
|
66
|
+
with open(path, mode) as f:
|
67
|
+
contents = f.read()
|
68
|
+
return kind(contents, nrt=True)
|
85
69
|
|
86
|
-
|
70
|
+
nrt_extern_a = make_linkable_code("nrt_extern.a", Archive, "rb")
|
71
|
+
nrt_extern_cubin = make_linkable_code("nrt_extern.cubin", Cubin, "rb")
|
72
|
+
nrt_extern_cu = make_linkable_code(
|
73
|
+
"nrt_extern.cu",
|
74
|
+
CUSource,
|
75
|
+
"rb",
|
76
|
+
)
|
77
|
+
nrt_extern_fatbin = make_linkable_code(
|
78
|
+
"nrt_extern.fatbin", Fatbin, "rb"
|
79
|
+
)
|
80
|
+
nrt_extern_fatbin_multi = make_linkable_code(
|
81
|
+
"nrt_extern_multi.fatbin", Fatbin, "rb"
|
82
|
+
)
|
83
|
+
nrt_extern_o = make_linkable_code("nrt_extern.o", Object, "rb")
|
84
|
+
nrt_extern_ptx = make_linkable_code("nrt_extern.ptx", PTXSource, "rb")
|
87
85
|
|
88
86
|
|
89
87
|
class TestNrtBasic(CUDATestCase):
|
@@ -104,6 +102,7 @@ class TestNrtBasic(CUDATestCase):
|
|
104
102
|
g[1, 1]()
|
105
103
|
cuda.synchronize()
|
106
104
|
|
105
|
+
@skip_on_cudasim("CUDA Simulator does not produce PTX")
|
107
106
|
def test_nrt_ptx_contains_refcount(self):
|
108
107
|
@cuda.jit
|
109
108
|
def f(x):
|
@@ -157,6 +156,7 @@ class TestNrtLinking(CUDATestCase):
|
|
157
156
|
with override_config("CUDA_ENABLE_NRT", True):
|
158
157
|
super(TestNrtLinking, self).run(result)
|
159
158
|
|
159
|
+
@skip_on_cudasim("CUDA Simulator does not link PTX")
|
160
160
|
def test_nrt_detect_linked_ptx_file(self):
|
161
161
|
src = f"#include <{get_include()}/nrt.cuh>"
|
162
162
|
src += """
|
@@ -176,6 +176,7 @@ class TestNrtLinking(CUDATestCase):
|
|
176
176
|
kernel[1, 1]()
|
177
177
|
|
178
178
|
@unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
|
179
|
+
@skip_on_cudasim("CUDA Simulator does not link code")
|
179
180
|
def test_nrt_detect_linkable_code(self):
|
180
181
|
codes = (
|
181
182
|
nrt_extern_a,
|
@@ -196,6 +197,7 @@ class TestNrtLinking(CUDATestCase):
|
|
196
197
|
kernel[1, 1]()
|
197
198
|
|
198
199
|
|
200
|
+
@skip_on_cudasim("CUDASIM does not have NRT statistics")
|
199
201
|
class TestNrtStatistics(CUDATestCase):
|
200
202
|
def setUp(self):
|
201
203
|
self._stream = cuda.default_stream()
|
@@ -213,7 +215,7 @@ class TestNrtStatistics(CUDATestCase):
|
|
213
215
|
# Checks that explicitly turning the stats on via the env var works.
|
214
216
|
src = """if 1:
|
215
217
|
from numba import cuda
|
216
|
-
from numba.cuda.
|
218
|
+
from numba.cuda.memory_management import rtsys
|
217
219
|
import numpy as np
|
218
220
|
|
219
221
|
@cuda.jit
|
@@ -252,7 +254,7 @@ class TestNrtStatistics(CUDATestCase):
|
|
252
254
|
src = """if 1:
|
253
255
|
from numba import cuda
|
254
256
|
import numpy as np
|
255
|
-
from numba.cuda.
|
257
|
+
from numba.cuda.memory_management import rtsys
|
256
258
|
|
257
259
|
@cuda.jit
|
258
260
|
def foo():
|
@@ -1,13 +1,14 @@
|
|
1
1
|
import numpy as np
|
2
2
|
import unittest
|
3
3
|
from numba.tests.support import override_config
|
4
|
-
from numba.cuda.
|
4
|
+
from numba.cuda.memory_management import rtsys
|
5
5
|
from numba.cuda.tests.support import EnableNRTStatsMixin
|
6
|
-
from numba.cuda.testing import CUDATestCase
|
6
|
+
from numba.cuda.testing import CUDATestCase, skip_on_cudasim
|
7
7
|
|
8
8
|
from numba import cuda
|
9
9
|
|
10
10
|
|
11
|
+
@skip_on_cudasim("No refcounting in the simulator")
|
11
12
|
class TestNrtRefCt(EnableNRTStatsMixin, CUDATestCase):
|
12
13
|
def setUp(self):
|
13
14
|
super(TestNrtRefCt, self).setUp()
|
@@ -40,7 +40,7 @@ LTOIR_FLAGS := $(LTOIR_GENCODE) -dc
|
|
40
40
|
|
41
41
|
OUTPUT_DIR := ./
|
42
42
|
|
43
|
-
NRT_INCLUDE_DIR := $(shell python -c "from numba.cuda.
|
43
|
+
NRT_INCLUDE_DIR := $(shell python -c "from numba.cuda.memory_management.nrt import get_include; print(get_include())")
|
44
44
|
|
45
45
|
all:
|
46
46
|
@echo "GPU CC: $(GPU_CC)"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: numba-cuda
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.14.0
|
4
4
|
Summary: CUDA target for Numba
|
5
5
|
Author: Anaconda Inc., NVIDIA Corporation
|
6
6
|
License: BSD 2-clause
|
@@ -12,6 +12,27 @@ Requires-Python: >=3.9
|
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
License-File: LICENSE
|
14
14
|
Requires-Dist: numba>=0.59.1
|
15
|
+
Provides-Extra: cu11
|
16
|
+
Requires-Dist: cuda-python==11.8.*; extra == "cu11"
|
17
|
+
Requires-Dist: nvidia-cuda-nvcc-cu11; extra == "cu11"
|
18
|
+
Requires-Dist: nvidia-cuda-runtime-cu11; extra == "cu11"
|
19
|
+
Requires-Dist: nvidia-cuda-nvrtc-cu11; extra == "cu11"
|
20
|
+
Provides-Extra: cu12
|
21
|
+
Requires-Dist: cuda-python==12.9.*; extra == "cu12"
|
22
|
+
Requires-Dist: nvidia-cuda-nvcc-cu12; extra == "cu12"
|
23
|
+
Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
|
24
|
+
Requires-Dist: nvidia-cuda-nvrtc-cu12; extra == "cu12"
|
25
|
+
Provides-Extra: test
|
26
|
+
Requires-Dist: psutil; extra == "test"
|
27
|
+
Requires-Dist: cffi; extra == "test"
|
28
|
+
Requires-Dist: pytest; extra == "test"
|
29
|
+
Provides-Extra: test-cu11
|
30
|
+
Requires-Dist: numba-cuda[test]; extra == "test-cu11"
|
31
|
+
Requires-Dist: nvidia-curand-cu11; extra == "test-cu11"
|
32
|
+
Provides-Extra: test-cu12
|
33
|
+
Requires-Dist: numba-cuda[test]; extra == "test-cu12"
|
34
|
+
Requires-Dist: nvidia-curand-cu12; extra == "test-cu12"
|
35
|
+
Requires-Dist: pynvjitlink-cu12; extra == "test-cu12"
|
15
36
|
Dynamic: license-file
|
16
37
|
|
17
38
|
<div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
|