numba-cuda 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +31 -0
- numba_cuda/numba/cuda/compiler.py +24 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +10 -16
- numba_cuda/numba/cuda/cudadrv/mappings.py +2 -2
- numba_cuda/numba/cuda/cudadrv/nvvm.py +126 -25
- numba_cuda/numba/cuda/debuginfo.py +52 -1
- numba_cuda/numba/cuda/decorators.py +14 -0
- numba_cuda/numba/cuda/dispatcher.py +8 -1
- numba_cuda/numba/cuda/lowering.py +83 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +2 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +62 -2
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +43 -4
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +106 -2
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +8 -21
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +7 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +64 -0
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/METADATA +24 -1
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/RECORD +26 -25
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/top_level.txt +0 -0
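
Most of the hunks below are test and metadata changes that document the user-facing updates in this range: a new `launch_bounds` option for `cuda.jit` and `cuda.compile_ptx`, support for passing raw device pointers to kernels through `types.CPointer` arguments (with a new documentation example), debug info that models type-polymorphic locals as DWARF unions, changes to how the pynvjitlink-based linker is enabled, and new `cu11`/`cu12`/`test` dependency extras in the wheel metadata.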
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py:

```diff
@@ -57,8 +57,6 @@ if TEST_BIN_DIR:
     )
 @skip_on_cudasim("Linking unsupported in the simulator")
 class TestLinker(CUDATestCase):
-    _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"}
-
     def test_nvjitlink_create(self):
         patched_linker = PyNvJitLinker(cc=(7, 5))
         assert "-arch=sm_75" in patched_linker.options
@@ -299,12 +297,12 @@ class TestLinkerUsage(CUDATestCase):

     def test_linker_enabled_envvar(self):
         env = os.environ.copy()
-        env
+        env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None)
         run_in_subprocess(self.src.format(config=""), env=env)

     def test_linker_disabled_envvar(self):
         env = os.environ.copy()
-        env
+        env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0"
         with self.assertRaisesRegex(
             AssertionError, "LTO and additional flags require PyNvJitLinker"
         ):
```
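
The pair of `env` edits reads as the pynvjitlink linker now being exercised by default: the "enabled" test passes with `NUMBA_CUDA_ENABLE_PYNVJITLINK` unset, and the "disabled" test must set it to `"0"` explicitly to provoke the failure. A minimal sketch of opting out, assuming that reading of the tests (the variable must be set before the CUDA target is initialized):

```python
import os

# Opt out of the pynvjitlink-based linker. Per the tests above, leaving
# the variable unset keeps the default behavior (linker enabled when
# pynvjitlink is available).
os.environ["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0"

from numba import cuda  # noqa: E402 -- import after setting the variable
```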
numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py:

```diff
@@ -30,7 +30,8 @@ class TestNvvmDriver(unittest.TestCase):
            self.skipTest("-gen-lto unavailable in this toolkit version")

        nvvmir = self.get_nvvmir()
-
+        arch = "compute_%d%d" % nvvm.LOWEST_CURRENT_CC
+        ltoir = nvvm.compile_ir(nvvmir, opt=3, gen_lto=None, arch=arch)

        # Verify we correctly passed the option by checking if we got LTOIR
        # from NVVM (by looking for the expected magic number for LTOIR)
@@ -138,9 +139,9 @@ class TestNvvmDriver(unittest.TestCase):
 class TestArchOption(unittest.TestCase):
     def test_get_arch_option(self):
         # Test returning the nearest lowest arch.
-        self.assertEqual(nvvm.get_arch_option(5, 3), "compute_53")
         self.assertEqual(nvvm.get_arch_option(7, 5), "compute_75")
         self.assertEqual(nvvm.get_arch_option(7, 7), "compute_75")
+        self.assertEqual(nvvm.get_arch_option(8, 8), "compute_87")
         # Test known arch.
         supported_cc = nvvm.get_supported_ccs()
         for arch in supported_cc:
```
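
The new assertion documents the nearest-lowest-arch fallback: a compute capability with no exact NVVM arch rounds down to the closest supported one. A quick illustration, assuming a toolkit whose supported set includes `compute_75` and `compute_87` as the test does:

```python
from numba.cuda.cudadrv import nvvm

# Exact matches pass through; unknown CCs fall back to the nearest
# supported arch at or below them.
print(nvvm.get_arch_option(7, 5))  # compute_75
print(nvvm.get_arch_option(7, 7))  # compute_75 (no compute_77)
print(nvvm.get_arch_option(8, 8))  # compute_87 (no compute_88)
```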
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py:

```diff
@@ -1,5 +1,5 @@
 from math import sqrt
-from numba import cuda, float32, int16, int32, int64, uint32, void
+from numba import cuda, float32, int16, int32, int64, types, uint32, void
 from numba.cuda import (
     compile,
     compile_for_current_device,
@@ -288,7 +288,7 @@ class TestCompileOnlyTests(unittest.TestCase):
         # Sleep for a variable time
         cuda.nanosleep(x)

-        ptx, resty = compile_ptx(use_nanosleep, (uint32,)
+        ptx, resty = compile_ptx(use_nanosleep, (uint32,))

         nanosleep_count = 0
         for line in ptx.split("\n"):
@@ -306,5 +306,65 @@ class TestCompileOnlyTests(unittest.TestCase):
         )


+@skip_on_cudasim("Compilation unsupported in the simulator")
+class TestCompileWithLaunchBounds(unittest.TestCase):
+    def _test_launch_bounds_common(self, launch_bounds):
+        def f():
+            pass
+
+        sig = "void()"
+        ptx, resty = cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+        self.assertIsInstance(resty, types.NoneType)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+        return ptx
+
+    def test_launch_bounds_scalar(self):
+        launch_bounds = 128
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_tuple(self):
+        launch_bounds = (128,)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_min_cta(self):
+        launch_bounds = (128, 2)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_max_cluster_rank(self):
+        def f():
+            pass
+
+        launch_bounds = (128, 2, 4)
+        cc = (9, 0)
+        sig = "void()"
+        ptx, resty = cuda.compile_ptx(
+            f, sig, launch_bounds=launch_bounds, cc=cc
+        )
+        self.assertIsInstance(resty, types.NoneType)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+    def test_too_many_launch_bounds(self):
+        def f():
+            pass
+
+        sig = "void()"
+        launch_bounds = (128, 2, 4, 8)
+
+        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+            cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+
+
 if __name__ == "__main__":
     unittest.main()
```
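
These new tests document the `launch_bounds` parameter of `cuda.compile_ptx`: a scalar or 1-tuple sets only the maximum threads per block (PTX `.maxntid`), a 2-tuple adds the minimum CTAs per SM (`.minnctapersm`), a 3-tuple adds the maximum cluster rank (`.maxclusterrank`, CC 9.0+), and anything longer raises `ValueError`. A sketch of the API as exercised above:

```python
from numba import cuda

def f():
    pass

# Scalar form: cap the block size at 128 threads.
ptx, _ = cuda.compile_ptx(f, "void()", launch_bounds=128)

# Full tuple form: max threads, min CTAs per SM, max cluster rank.
# The third element requires targeting compute capability 9.0.
ptx, _ = cuda.compile_ptx(f, "void()", launch_bounds=(128, 2, 4), cc=(9, 0))
```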
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py:

```diff
@@ -332,10 +332,10 @@ class TestCudaDebugInfo(CUDATestCase):

         @cuda.jit("void(int32, int32)", debug=True, opt=False)
         def f(x, y):
-
-
-
-
+            z1 = x  # noqa: F841
+            z2 = 100  # noqa: F841
+            z3 = y  # noqa: F841
+            z4 = True  # noqa: F841

         llvm_ir = f.inspect_llvm(sig)
         # Verify the call to llvm.dbg.declare is replaced by llvm.dbg.value
@@ -373,6 +373,45 @@ class TestCudaDebugInfo(CUDATestCase):
         match = re.compile(pat).search(llvm_ir)
         self.assertIsNone(match, msg=llvm_ir)

+    def test_union_poly_types(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            foo = 100  # noqa: F841
+            foo = 2.34  # noqa: F841
+            foo = True  # noqa: F841
+            foo = 200  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        # Extract the type node id
+        pat1 = r'!DILocalVariable\(.*name: "foo".*type: !(\d+)\)'
+        match = re.compile(pat1).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id = match.group(1)
+        # Verify the union type and extract the elements node id
+        pat2 = rf"!{mdnode_id} = distinct !DICompositeType\(elements: !(\d+),.*size: 64, tag: DW_TAG_union_type\)"  # noqa: E501
+        match = re.compile(pat2).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id = match.group(1)
+        # Extract the member node ids
+        pat3 = r"!{ !(\d+), !(\d+), !(\d+) }"
+        match = re.compile(pat3).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id1 = match.group(1)
+        mdnode_id2 = match.group(2)
+        mdnode_id3 = match.group(3)
+        # Verify the member nodes
+        pat4 = rf'!{mdnode_id1} = !DIDerivedType(.*name: "_bool", size: 8, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat4).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        pat5 = rf'!{mdnode_id2} = !DIDerivedType(.*name: "_float64", size: 64, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat5).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        pat6 = rf'!{mdnode_id3} = !DIDerivedType(.*name: "_int64", size: 64, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat6).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+

 if __name__ == "__main__":
     unittest.main()
```
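
In short: when a single local variable is bound to values of several Numba types in one kernel (here `bool`, `float64`, and `int64`), the debug info now describes it as a DWARF union (`DW_TAG_union_type`) with one member per observed type, rather than forcing a single type on it.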
numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py:

```diff
@@ -1,9 +1,26 @@
+from numba.cuda.cudadrv.driver import CudaAPIError
 import numpy as np
 import threading

-from numba import
+from numba import (
+    boolean,
+    config,
+    cuda,
+    float32,
+    float64,
+    int32,
+    int64,
+    types,
+    uint32,
+    void,
+)
 from numba.core.errors import TypingError
-from numba.cuda.testing import
+from numba.cuda.testing import (
+    cc_X_or_above,
+    skip_on_cudasim,
+    unittest,
+    CUDATestCase,
+)
 import math


```
```diff
@@ -466,6 +483,35 @@ class TestDispatcher(CUDATestCase):
         self.assertEqual("Add two integers, kernel version", add_kernel.__doc__)
         self.assertEqual("Add two integers, device version", add_device.__doc__)

+    @skip_on_cudasim("Cudasim does not have device pointers")
+    def test_dispatcher_cpointer_arguments(self):
+        ptr = types.CPointer(types.int32)
+        sig = void(ptr, int32, ptr, ptr, uint32)
+
+        @cuda.jit(sig)
+        def axpy(r, a, x, y, n):
+            i = cuda.grid(1)
+            if i < n:
+                r[i] = a * x[i] + y[i]
+
+        N = 16
+        a = 5
+        hx = np.arange(10, dtype=np.int32)
+        hy = np.arange(10, dtype=np.int32) * 2
+        dx = cuda.to_device(hx)
+        dy = cuda.to_device(hy)
+        dr = cuda.device_array_like(dx)
+
+        r_ptr = dr.__cuda_array_interface__["data"][0]
+        x_ptr = dx.__cuda_array_interface__["data"][0]
+        y_ptr = dy.__cuda_array_interface__["data"][0]
+
+        axpy[1, 32](r_ptr, a, x_ptr, y_ptr, N)
+
+        expected = a * hx + hy
+        actual = dr.copy_to_host()
+        np.testing.assert_equal(expected, actual)
+

 @skip_on_cudasim("CUDA simulator doesn't implement kernel properties")
 class TestDispatcherKernelProperties(CUDATestCase):
```
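
This new test shows that kernels can now declare `types.CPointer` parameters and be launched with raw device pointers, such as the addresses exposed by `__cuda_array_interface__["data"]`, instead of full device array objects.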
```diff
@@ -708,5 +754,63 @@ class TestDispatcherKernelProperties(CUDATestCase):
         self.assertGreaterEqual(local_mem_per_thread, N * 4)


+@skip_on_cudasim("Simulator does not support launch bounds")
+class TestLaunchBounds(CUDATestCase):
+    def _test_launch_bounds_common(self, launch_bounds):
+        @cuda.jit(launch_bounds=launch_bounds)
+        def f():
+            pass
+
+        # Test successful launch
+        f[1, 128]()
+
+        # Test launch bound exceeded
+        msg = "Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE"
+        with self.assertRaisesRegex(CudaAPIError, msg):
+            f[1, 256]()
+
+        sig = f.signatures[0]
+        ptx = f.inspect_asm(sig)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+        return ptx
+
+    def test_launch_bounds_scalar(self):
+        launch_bounds = 128
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_tuple(self):
+        launch_bounds = (128,)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_min_cta(self):
+        launch_bounds = (128, 2)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    @unittest.skipUnless(
+        cc_X_or_above(9, 0), "CC 9.0 needed for max cluster rank"
+    )
+    def test_launch_bounds_with_max_cluster_rank(self):
+        launch_bounds = (128, 2, 4)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+    def test_too_many_launch_bounds(self):
+        launch_bounds = (128, 2, 4, 8)
+        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+            cuda.jit("void()", launch_bounds=launch_bounds)(lambda: None)
+
+
 if __name__ == "__main__":
     unittest.main()
```
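
This is the dispatcher-level counterpart of the compiler tests above: bounds declared at `@cuda.jit` time are both embedded in the PTX and enforced at launch. A sketch of the behavior the test pins down:

```python
from numba import cuda
from numba.cuda.cudadrv.driver import CudaAPIError

# Scalar form: at most 128 threads per block.
@cuda.jit(launch_bounds=128)
def f():
    pass

f[1, 128]()  # within bounds: launches normally

try:
    f[1, 256]()  # exceeds .maxntid
except CudaAPIError:
    print("launch rejected: CUDA_ERROR_INVALID_VALUE from cuLaunchKernel")
```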
numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py:

```diff
@@ -118,31 +118,18 @@ class TestFastMathOption(CUDATestCase):
         def tanh_kernel(r, x):
             r[0] = tanh(x)

-
-
-            tanh_kernel, (float32[::1], float32), fastmath=True, cc=cc
-        )
-        precptx, _ = compile_ptx(
-            tanh_kernel, (float32[::1], float32), cc=cc
-        )
-        criterion.check(self, fastptx, precptx)
-
-        tanh_common_test(
-            cc=(7, 5),
-            criterion=FastMathCriterion(
-                fast_expected=["tanh.approx.f32 "],
-                prec_unexpected=["tanh.approx.f32 "],
-            ),
+        fastptx, _ = compile_ptx(
+            tanh_kernel, (float32[::1], float32), fastmath=True
         )
+        precptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32))

-
-
-
-            fast_expected=["ex2.approx.ftz.f32 ", "rcp.approx.ftz.f32 "],
-            prec_unexpected=["tanh.approx.f32 "],
-        ),
+        criterion = FastMathCriterion(
+            fast_expected=["tanh.approx.f32 "],
+            prec_unexpected=["tanh.approx.f32 "],
         )

+        criterion.check(self, fastptx, precptx)
+
     def test_expf(self):
         self._test_fast_math_unary(
             exp,
```
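
The rewrite drops the explicit `cc` plumbing and the nested helper: `compile_ptx` is now called directly with and without `fastmath=True`, and the criterion only checks the tanh lowering. Roughly the behavior under test, assuming a default target recent enough for `tanh.approx.f32`:

```python
from math import tanh
from numba import float32
from numba.cuda import compile_ptx

def tanh_kernel(r, x):
    r[0] = tanh(x)

args = (float32[::1], float32)
fastptx, _ = compile_ptx(tanh_kernel, args, fastmath=True)
precptx, _ = compile_ptx(tanh_kernel, args)

# Per FastMathCriterion above: the approximate instruction appears only
# in the fastmath build.
assert "tanh.approx.f32" in fastptx
assert "tanh.approx.f32" not in precptx
```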
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py:

```diff
@@ -641,7 +641,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hadd_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hadd_scalar, args
+        ptx, _ = compile_ptx(simple_hadd_scalar, args)
         self.assertIn("add.f16", ptx)

     @skip_unless_cc_53
@@ -668,7 +668,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hfma_ptx(self):
         args = (f2[:], f2, f2, f2)
-        ptx, _ = compile_ptx(simple_hfma_scalar, args
+        ptx, _ = compile_ptx(simple_hfma_scalar, args)
         self.assertIn("fma.rn.f16", ptx)

     @skip_unless_cc_53
@@ -693,7 +693,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hsub_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hsub_scalar, args
+        ptx, _ = compile_ptx(simple_hsub_scalar, args)
         self.assertIn("sub.f16", ptx)

     @skip_unless_cc_53
@@ -718,7 +718,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hmul_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hmul_scalar, args
+        ptx, _ = compile_ptx(simple_hmul_scalar, args)
         self.assertIn("mul.f16", ptx)

     @skip_unless_cc_53
@@ -763,7 +763,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hneg_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_hneg_scalar, args
+        ptx, _ = compile_ptx(simple_hneg_scalar, args)
         self.assertIn("neg.f16", ptx)

     @skip_unless_cc_53
@@ -786,7 +786,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_habs_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_habs_scalar, args
+        ptx, _ = compile_ptx(simple_habs_scalar, args)
         self.assertIn("abs.f16", ptx)

     @skip_unless_cc_53
```
numba_cuda/numba/cuda/tests/cudapy/test_operator.py:

```diff
@@ -178,7 +178,7 @@ class TestOperatorModule(CUDATestCase):
         args = (f2[:], f2, f2)
         for fn, instr in zip(functions, instrs):
             with self.subTest(instr=instr):
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(instr, ptx)

     @skip_unless_cc_53
@@ -212,7 +212,7 @@ class TestOperatorModule(CUDATestCase):

         for fn, instr in zip(functions, instrs):
             with self.subTest(instr=instr):
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(instr, ptx)

     @skip_unless_cc_53
@@ -255,13 +255,13 @@ class TestOperatorModule(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_fp16_neg_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_fp16neg, args
+        ptx, _ = compile_ptx(simple_fp16neg, args)
         self.assertIn("neg.f16", ptx)

     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_fp16_abs_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_fp16abs, args
+        ptx, _ = compile_ptx(simple_fp16abs, args)

         self.assertIn("abs.f16", ptx)

@@ -396,7 +396,7 @@ class TestOperatorModule(CUDATestCase):

         for fn, op, s in zip(functions, ops, opstring):
             with self.subTest(op=op):
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(s, ptx)

     @skip_on_cudasim("Compilation unsupported in the simulator")
@@ -431,7 +431,7 @@ class TestOperatorModule(CUDATestCase):
         for fn, op in zip(functions, ops):
             with self.subTest(op=op):
                 args = (b1[:], f2, from_dtype(np.int8))
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(opstring[op], ptx)

     @skip_on_cudasim("Compilation unsupported in the simulator")
@@ -475,7 +475,7 @@ class TestOperatorModule(CUDATestCase):
             with self.subTest(op=op, ty=ty):
                 arg2_ty = np.result_type(np.float16, ty)
                 args = (b1[:], f2, from_dtype(arg2_ty))
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)

                 ops = opstring[op] + opsuffix[arg2_ty]
                 self.assertIn(ops, ptx)
```
numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py (new file):

```diff
@@ -0,0 +1,64 @@
+import unittest
+
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.tests.support import captured_stdout
+
+
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestCPointer(CUDATestCase):
+    """
+    Test simple vector addition
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing
+        # up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_cpointer(self):
+        # ex_cpointer.sig.begin
+        import numpy as np
+        from numba import cuda, types
+
+        # The first kernel argument is a pointer to a uint8 array.
+        # The second argument holds the length as a uint32.
+        # The return type of a kernel is always void.
+        sig = types.void(types.CPointer(types.uint8), types.uint32)
+        # ex_cpointer.sig.end
+
+        # ex_cpointer.kernel.begin
+        @cuda.jit(sig)
+        def add_one(x, n):
+            i = cuda.grid(1)
+            if i < n:
+                x[i] += 1
+
+        # ex_cpointer.kernel.end
+
+        # ex_cpointer.launch.begin
+        x = cuda.to_device(np.arange(10, dtype=np.uint8))
+
+        # Print initial values of x
+        print(x.copy_to_host())  # [0 1 2 3 4 5 6 7 8 9]
+
+        # Obtain a pointer to the data from the CUDA Array Interface
+        x_ptr = x.__cuda_array_interface__["data"][0]
+        x_len = len(x)
+
+        # Launch the kernel with the pointer and length
+        add_one[1, 32](x_ptr, x_len)
+
+        # Demonstrate that the data was updated by the kernel
+        print(x.copy_to_host())  # [ 1 2 3 4 5 6 7 8 9 10]
+        # ex_cpointer.launch.end
+
+
+if __name__ == "__main__":
+    unittest.main()
```
{numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/METADATA:

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: numba-cuda
-Version: 0.13.0
+Version: 0.15.0
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause
@@ -12,6 +12,29 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numba>=0.59.1
+Provides-Extra: cu11
+Requires-Dist: cuda-bindings==11.8.*; extra == "cu11"
+Requires-Dist: cuda-python==11.8.*; extra == "cu11"
+Requires-Dist: nvidia-cuda-nvcc-cu11; extra == "cu11"
+Requires-Dist: nvidia-cuda-runtime-cu11; extra == "cu11"
+Requires-Dist: nvidia-cuda-nvrtc-cu11; extra == "cu11"
+Provides-Extra: cu12
+Requires-Dist: cuda-bindings==12.9.*; extra == "cu12"
+Requires-Dist: cuda-python==12.9.*; extra == "cu12"
+Requires-Dist: nvidia-cuda-nvcc-cu12; extra == "cu12"
+Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
+Requires-Dist: nvidia-cuda-nvrtc-cu12; extra == "cu12"
+Provides-Extra: test
+Requires-Dist: psutil; extra == "test"
+Requires-Dist: cffi; extra == "test"
+Requires-Dist: pytest; extra == "test"
+Provides-Extra: test-cu11
+Requires-Dist: numba-cuda[test]; extra == "test-cu11"
+Requires-Dist: nvidia-curand-cu11; extra == "test-cu11"
+Provides-Extra: test-cu12
+Requires-Dist: numba-cuda[test]; extra == "test-cu12"
+Requires-Dist: nvidia-curand-cu12; extra == "test-cu12"
+Requires-Dist: pynvjitlink-cu12; extra == "test-cu12"
 Dynamic: license-file

 <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
```
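
With these extras, the matching CUDA toolkit components can be pulled in at install time using the standard extras syntax, e.g. `pip install "numba-cuda[cu12]"` for a CUDA 12 stack (or `[cu11]` for CUDA 11), and `"numba-cuda[test-cu12]"` to add the test dependencies on top.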
|