numba-cuda 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of numba-cuda has been flagged for review.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/compiler.py +24 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +15 -3
- numba_cuda/numba/cuda/cudadrv/nvvm.py +126 -25
- numba_cuda/numba/cuda/debuginfo.py +52 -1
- numba_cuda/numba/cuda/decorators.py +14 -0
- numba_cuda/numba/cuda/dispatcher.py +8 -1
- numba_cuda/numba/cuda/lowering.py +83 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +62 -2
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +43 -4
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +106 -2
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +8 -21
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +16 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +7 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +64 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.14.1.dist-info}/METADATA +23 -1
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.14.1.dist-info}/RECORD +24 -23
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.14.1.dist-info}/WHEEL +0 -0
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.14.1.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.14.1.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/tests/cudapy/test_compiler.py

@@ -1,5 +1,5 @@
 from math import sqrt
-from numba import cuda, float32, int16, int32, int64, uint32, void
+from numba import cuda, float32, int16, int32, int64, types, uint32, void
 from numba.cuda import (
     compile,
     compile_for_current_device,
@@ -288,7 +288,7 @@ class TestCompileOnlyTests(unittest.TestCase):
         # Sleep for a variable time
         cuda.nanosleep(x)

-        ptx, resty = compile_ptx(use_nanosleep, (uint32,)
+        ptx, resty = compile_ptx(use_nanosleep, (uint32,))

         nanosleep_count = 0
         for line in ptx.split("\n"):
@@ -306,5 +306,65 @@ class TestCompileOnlyTests(unittest.TestCase):
         )


+@skip_on_cudasim("Compilation unsupported in the simulator")
+class TestCompileWithLaunchBounds(unittest.TestCase):
+    def _test_launch_bounds_common(self, launch_bounds):
+        def f():
+            pass
+
+        sig = "void()"
+        ptx, resty = cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+        self.assertIsInstance(resty, types.NoneType)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+        return ptx
+
+    def test_launch_bounds_scalar(self):
+        launch_bounds = 128
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_tuple(self):
+        launch_bounds = (128,)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_min_cta(self):
+        launch_bounds = (128, 2)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_max_cluster_rank(self):
+        def f():
+            pass
+
+        launch_bounds = (128, 2, 4)
+        cc = (9, 0)
+        sig = "void()"
+        ptx, resty = cuda.compile_ptx(
+            f, sig, launch_bounds=launch_bounds, cc=cc
+        )
+        self.assertIsInstance(resty, types.NoneType)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+    def test_too_many_launch_bounds(self):
+        def f():
+            pass
+
+        sig = "void()"
+        launch_bounds = (128, 2, 4, 8)
+
+        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+            cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+
+
 if __name__ == "__main__":
     unittest.main()
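
Taken together, these assertions pin down how the new launch_bounds argument to cuda.compile_ptx surfaces in the generated PTX: a scalar or 1-tuple emits only .maxntid, a second element adds .minnctapersm, a third element (requiring CC 9.0) adds .maxclusterrank, and more than three values raise a ValueError. A minimal sketch of the call as these tests exercise it (illustrative only, assuming numba-cuda 0.14.1 is installed):

from numba import cuda

def f():
    pass

# Tuple elements: (max threads per block, min blocks per SM, max cluster rank)
ptx, resty = cuda.compile_ptx(f, "void()", launch_bounds=(128, 2))
assert ".minnctapersm" in ptx
assert ".maxclusterrank" not in ptx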

numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py

@@ -332,10 +332,10 @@ class TestCudaDebugInfo(CUDATestCase):

         @cuda.jit("void(int32, int32)", debug=True, opt=False)
         def f(x, y):
-
-
-
-
+            z1 = x  # noqa: F841
+            z2 = 100  # noqa: F841
+            z3 = y  # noqa: F841
+            z4 = True  # noqa: F841

         llvm_ir = f.inspect_llvm(sig)
         # Verify the call to llvm.dbg.declare is replaced by llvm.dbg.value
@@ -373,6 +373,45 @@ class TestCudaDebugInfo(CUDATestCase):
         match = re.compile(pat).search(llvm_ir)
         self.assertIsNone(match, msg=llvm_ir)

+    def test_union_poly_types(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            foo = 100  # noqa: F841
+            foo = 2.34  # noqa: F841
+            foo = True  # noqa: F841
+            foo = 200  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        # Extract the type node id
+        pat1 = r'!DILocalVariable\(.*name: "foo".*type: !(\d+)\)'
+        match = re.compile(pat1).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id = match.group(1)
+        # Verify the union type and extract the elements node id
+        pat2 = rf"!{mdnode_id} = distinct !DICompositeType\(elements: !(\d+),.*size: 64, tag: DW_TAG_union_type\)"  # noqa: E501
+        match = re.compile(pat2).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id = match.group(1)
+        # Extract the member node ids
+        pat3 = r"!{ !(\d+), !(\d+), !(\d+) }"
+        match = re.compile(pat3).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id1 = match.group(1)
+        mdnode_id2 = match.group(2)
+        mdnode_id3 = match.group(3)
+        # Verify the member nodes
+        pat4 = rf'!{mdnode_id1} = !DIDerivedType(.*name: "_bool", size: 8, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat4).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        pat5 = rf'!{mdnode_id2} = !DIDerivedType(.*name: "_float64", size: 64, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat5).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        pat6 = rf'!{mdnode_id3} = !DIDerivedType(.*name: "_int64", size: 64, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat6).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+

 if __name__ == "__main__":
     unittest.main()
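
In short: when one Python variable is re-bound to values of several types (int, float, and bool above), the generated debug info describes it as a DWARF union (DW_TAG_union_type) sized to its largest member (64 bits here), with one member per inferred type (_bool, _float64, _int64), which is what the regex assertions verify.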

numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py

@@ -1,9 +1,26 @@
+from numba.cuda.cudadrv.driver import CudaAPIError
 import numpy as np
 import threading

-from numba import
+from numba import (
+    boolean,
+    config,
+    cuda,
+    float32,
+    float64,
+    int32,
+    int64,
+    types,
+    uint32,
+    void,
+)
 from numba.core.errors import TypingError
-from numba.cuda.testing import
+from numba.cuda.testing import (
+    cc_X_or_above,
+    skip_on_cudasim,
+    unittest,
+    CUDATestCase,
+)
 import math


@@ -466,6 +483,35 @@ class TestDispatcher(CUDATestCase):
         self.assertEqual("Add two integers, kernel version", add_kernel.__doc__)
         self.assertEqual("Add two integers, device version", add_device.__doc__)

+    @skip_on_cudasim("Cudasim does not have device pointers")
+    def test_dispatcher_cpointer_arguments(self):
+        ptr = types.CPointer(types.int32)
+        sig = void(ptr, int32, ptr, ptr, uint32)
+
+        @cuda.jit(sig)
+        def axpy(r, a, x, y, n):
+            i = cuda.grid(1)
+            if i < n:
+                r[i] = a * x[i] + y[i]
+
+        N = 16
+        a = 5
+        hx = np.arange(10, dtype=np.int32)
+        hy = np.arange(10, dtype=np.int32) * 2
+        dx = cuda.to_device(hx)
+        dy = cuda.to_device(hy)
+        dr = cuda.device_array_like(dx)
+
+        r_ptr = dr.__cuda_array_interface__["data"][0]
+        x_ptr = dx.__cuda_array_interface__["data"][0]
+        y_ptr = dy.__cuda_array_interface__["data"][0]
+
+        axpy[1, 32](r_ptr, a, x_ptr, y_ptr, N)
+
+        expected = a * hx + hy
+        actual = dr.copy_to_host()
+        np.testing.assert_equal(expected, actual)
+

 @skip_on_cudasim("CUDA simulator doesn't implement kernel properties")
 class TestDispatcherKernelProperties(CUDATestCase):
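
Note that this test passes raw device addresses taken from __cuda_array_interface__ in place of array arguments, matching the CPointer types in the signature; the same pattern is walked through in the new doc example test_cpointer.py further below.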

@@ -708,5 +754,63 @@ class TestDispatcherKernelProperties(CUDATestCase):
         self.assertGreaterEqual(local_mem_per_thread, N * 4)


+@skip_on_cudasim("Simulator does not support launch bounds")
+class TestLaunchBounds(CUDATestCase):
+    def _test_launch_bounds_common(self, launch_bounds):
+        @cuda.jit(launch_bounds=launch_bounds)
+        def f():
+            pass
+
+        # Test successful launch
+        f[1, 128]()
+
+        # Test launch bound exceeded
+        msg = "Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE"
+        with self.assertRaisesRegex(CudaAPIError, msg):
+            f[1, 256]()
+
+        sig = f.signatures[0]
+        ptx = f.inspect_asm(sig)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+        return ptx
+
+    def test_launch_bounds_scalar(self):
+        launch_bounds = 128
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_tuple(self):
+        launch_bounds = (128,)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_min_cta(self):
+        launch_bounds = (128, 2)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    @unittest.skipUnless(
+        cc_X_or_above(9, 0), "CC 9.0 needed for max cluster rank"
+    )
+    def test_launch_bounds_with_max_cluster_rank(self):
+        launch_bounds = (128, 2, 4)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+    def test_too_many_launch_bounds(self):
+        launch_bounds = (128, 2, 4, 8)
+        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+            cuda.jit("void()", launch_bounds=launch_bounds)(lambda: None)
+
+
 if __name__ == "__main__":
     unittest.main()
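
For kernels, the same launch_bounds keyword is accepted by cuda.jit, and launches that exceed the bound fail with CudaAPIError at launch time rather than at compile time. A minimal sketch of the behaviour these tests assert (illustrative, assuming a CUDA-capable runtime and numba-cuda 0.14.1):

from numba import cuda

@cuda.jit(launch_bounds=128)  # scalar form: at most 128 threads per block
def noop():
    pass

noop[1, 128]()  # within the bound: launches normally
# noop[1, 256]() would raise CudaAPIError naming CUDA_ERROR_INVALID_VALUE,
# per the assertion in _test_launch_bounds_common above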

numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py

@@ -118,31 +118,18 @@ class TestFastMathOption(CUDATestCase):
         def tanh_kernel(r, x):
             r[0] = tanh(x)

-
-
-                tanh_kernel, (float32[::1], float32), fastmath=True, cc=cc
-            )
-            precptx, _ = compile_ptx(
-                tanh_kernel, (float32[::1], float32), cc=cc
-            )
-            criterion.check(self, fastptx, precptx)
-
-        tanh_common_test(
-            cc=(7, 5),
-            criterion=FastMathCriterion(
-                fast_expected=["tanh.approx.f32 "],
-                prec_unexpected=["tanh.approx.f32 "],
-            ),
+        fastptx, _ = compile_ptx(
+            tanh_kernel, (float32[::1], float32), fastmath=True
         )
+        precptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32))

-
-
-
-            fast_expected=["ex2.approx.ftz.f32 ", "rcp.approx.ftz.f32 "],
-            prec_unexpected=["tanh.approx.f32 "],
-        ),
+        criterion = FastMathCriterion(
+            fast_expected=["tanh.approx.f32 "],
+            prec_unexpected=["tanh.approx.f32 "],
         )

+        criterion.check(self, fastptx, precptx)
+
     def test_expf(self):
         self._test_fast_math_unary(
             exp,

numba_cuda/numba/cuda/tests/cudapy/test_inspect.py

@@ -16,6 +16,14 @@ class TestInspect(CUDATestCase):
     def cc(self):
         return cuda.current_context().device.compute_capability

+    def skip_on_cuda_version_issues(self):
+        # FIXME: This should be unskipped once the cause of certain nvdisasm
+        # versions failing to dump SASS with certain driver / nvJitLink
+        # versions is understood
+        self.skipTest(
+            "Relocation information required for analysis not preserved"
+        )
+
     def test_monotyped(self):
         sig = (float32, int32)

@@ -110,6 +118,8 @@ class TestInspect(CUDATestCase):

     @skip_without_nvdisasm("nvdisasm needed for inspect_sass()")
     def test_inspect_sass_eager(self):
+        self.skip_on_cuda_version_issues()
+
         sig = (float32[::1], int32[::1])

         @cuda.jit(sig, lineinfo=True)
@@ -122,6 +132,8 @@ class TestInspect(CUDATestCase):

     @skip_without_nvdisasm("nvdisasm needed for inspect_sass()")
     def test_inspect_sass_lazy(self):
+        self.skip_on_cuda_version_issues()
+
         @cuda.jit(lineinfo=True)
         def add(x, y):
             i = cuda.grid(1)
@@ -139,6 +151,8 @@ class TestInspect(CUDATestCase):
         "Missing nvdisasm exception only generated when it is not present"
     )
     def test_inspect_sass_nvdisasm_missing(self):
+        self.skip_on_cuda_version_issues()
+
         @cuda.jit((float32[::1],))
         def f(x):
             x[0] = 0
@@ -150,6 +164,8 @@ class TestInspect(CUDATestCase):

     @skip_without_nvdisasm("nvdisasm needed for inspect_sass_cfg()")
     def test_inspect_sass_cfg(self):
+        self.skip_on_cuda_version_issues()
+
         sig = (float32[::1], int32[::1])

         @cuda.jit(sig)

numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py

@@ -641,7 +641,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hadd_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hadd_scalar, args
+        ptx, _ = compile_ptx(simple_hadd_scalar, args)
         self.assertIn("add.f16", ptx)

     @skip_unless_cc_53
@@ -668,7 +668,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hfma_ptx(self):
         args = (f2[:], f2, f2, f2)
-        ptx, _ = compile_ptx(simple_hfma_scalar, args
+        ptx, _ = compile_ptx(simple_hfma_scalar, args)
         self.assertIn("fma.rn.f16", ptx)

     @skip_unless_cc_53
@@ -693,7 +693,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hsub_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hsub_scalar, args
+        ptx, _ = compile_ptx(simple_hsub_scalar, args)
         self.assertIn("sub.f16", ptx)

     @skip_unless_cc_53
@@ -718,7 +718,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hmul_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hmul_scalar, args
+        ptx, _ = compile_ptx(simple_hmul_scalar, args)
         self.assertIn("mul.f16", ptx)

     @skip_unless_cc_53
@@ -763,7 +763,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hneg_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_hneg_scalar, args
+        ptx, _ = compile_ptx(simple_hneg_scalar, args)
         self.assertIn("neg.f16", ptx)

     @skip_unless_cc_53
@@ -786,7 +786,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_habs_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_habs_scalar, args
+        ptx, _ = compile_ptx(simple_habs_scalar, args)
         self.assertIn("abs.f16", ptx)

     @skip_unless_cc_53

numba_cuda/numba/cuda/tests/cudapy/test_operator.py

@@ -178,7 +178,7 @@ class TestOperatorModule(CUDATestCase):
         args = (f2[:], f2, f2)
         for fn, instr in zip(functions, instrs):
             with self.subTest(instr=instr):
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(instr, ptx)

     @skip_unless_cc_53
@@ -212,7 +212,7 @@ class TestOperatorModule(CUDATestCase):

         for fn, instr in zip(functions, instrs):
             with self.subTest(instr=instr):
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(instr, ptx)

     @skip_unless_cc_53
@@ -255,13 +255,13 @@ class TestOperatorModule(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_fp16_neg_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_fp16neg, args
+        ptx, _ = compile_ptx(simple_fp16neg, args)
         self.assertIn("neg.f16", ptx)

     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_fp16_abs_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_fp16abs, args
+        ptx, _ = compile_ptx(simple_fp16abs, args)

         self.assertIn("abs.f16", ptx)

@@ -396,7 +396,7 @@ class TestOperatorModule(CUDATestCase):

         for fn, op, s in zip(functions, ops, opstring):
             with self.subTest(op=op):
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(s, ptx)

     @skip_on_cudasim("Compilation unsupported in the simulator")
@@ -431,7 +431,7 @@ class TestOperatorModule(CUDATestCase):
         for fn, op in zip(functions, ops):
             with self.subTest(op=op):
                 args = (b1[:], f2, from_dtype(np.int8))
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(opstring[op], ptx)

     @skip_on_cudasim("Compilation unsupported in the simulator")
@@ -475,7 +475,7 @@ class TestOperatorModule(CUDATestCase):
             with self.subTest(op=op, ty=ty):
                 arg2_ty = np.result_type(np.float16, ty)
                 args = (b1[:], f2, from_dtype(arg2_ty))
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)

                 ops = opstring[op] + opsuffix[arg2_ty]
                 self.assertIn(ops, ptx)

numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py

@@ -0,0 +1,64 @@
+import unittest
+
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.tests.support import captured_stdout
+
+
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestCPointer(CUDATestCase):
+    """
+    Test simple vector addition
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing
+        # up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_cpointer(self):
+        # ex_cpointer.sig.begin
+        import numpy as np
+        from numba import cuda, types
+
+        # The first kernel argument is a pointer to a uint8 array.
+        # The second argument holds the length as a uint32.
+        # The return type of a kernel is always void.
+        sig = types.void(types.CPointer(types.uint8), types.uint32)
+        # ex_cpointer.sig.end
+
+        # ex_cpointer.kernel.begin
+        @cuda.jit(sig)
+        def add_one(x, n):
+            i = cuda.grid(1)
+            if i < n:
+                x[i] += 1
+
+        # ex_cpointer.kernel.end
+
+        # ex_cpointer.launch.begin
+        x = cuda.to_device(np.arange(10, dtype=np.uint8))
+
+        # Print initial values of x
+        print(x.copy_to_host())  # [0 1 2 3 4 5 6 7 8 9]
+
+        # Obtain a pointer to the data from the CUDA Array Interface
+        x_ptr = x.__cuda_array_interface__["data"][0]
+        x_len = len(x)
+
+        # Launch the kernel with the pointer and length
+        add_one[1, 32](x_ptr, x_len)
+
+        # Demonstrate that the data was updated by the kernel
+        print(x.copy_to_host())  # [ 1 2 3 4 5 6 7 8 9 10]
+        # ex_cpointer.launch.end
+
+
+if __name__ == "__main__":
+    unittest.main()

{numba_cuda-0.13.0.dist-info → numba_cuda-0.14.1.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: numba-cuda
-Version: 0.13.0
+Version: 0.14.1
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause
@@ -12,6 +12,28 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numba>=0.59.1
+Provides-Extra: cu11
+Requires-Dist: cuda-python==11.8.*; extra == "cu11"
+Requires-Dist: nvidia-cuda-nvcc-cu11; extra == "cu11"
+Requires-Dist: nvidia-cuda-runtime-cu11; extra == "cu11"
+Requires-Dist: nvidia-cuda-nvrtc-cu11; extra == "cu11"
+Provides-Extra: cu12
+Requires-Dist: cuda-python==12.9.*; extra == "cu12"
+Requires-Dist: nvidia-cuda-nvcc-cu12; extra == "cu12"
+Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
+Requires-Dist: nvidia-cuda-nvrtc-cu12; extra == "cu12"
+Requires-Dist: nvidia-cuda-cccl-cu12; extra == "cu12"
+Provides-Extra: test
+Requires-Dist: psutil; extra == "test"
+Requires-Dist: cffi; extra == "test"
+Requires-Dist: pytest; extra == "test"
+Provides-Extra: test-cu11
+Requires-Dist: numba-cuda[test]; extra == "test-cu11"
+Requires-Dist: nvidia-curand-cu11; extra == "test-cu11"
+Provides-Extra: test-cu12
+Requires-Dist: numba-cuda[test]; extra == "test-cu12"
+Requires-Dist: nvidia-curand-cu12; extra == "test-cu12"
+Requires-Dist: pynvjitlink-cu12; extra == "test-cu12"
 Dynamic: license-file

 <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
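
As a practical note, the new extras use standard pip optional-dependency syntax: for example, pip install "numba-cuda[cu12]" pulls in the CUDA 12 toolkit components listed above, and "numba-cuda[test-cu12]" layers the test dependencies on top (standard pip behaviour, not something the diff itself states).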