numba-cuda 0.17.0__py3-none-any.whl → 0.18.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of numba-cuda might be problematic. Click here for more details.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +0 -8
- numba_cuda/numba/cuda/_internal/cuda_fp16.py +14225 -0
- numba_cuda/numba/cuda/api_util.py +6 -0
- numba_cuda/numba/cuda/cgutils.py +1291 -0
- numba_cuda/numba/cuda/codegen.py +32 -14
- numba_cuda/numba/cuda/compiler.py +113 -10
- numba_cuda/numba/cuda/core/caching.py +741 -0
- numba_cuda/numba/cuda/core/callconv.py +338 -0
- numba_cuda/numba/cuda/core/codegen.py +168 -0
- numba_cuda/numba/cuda/core/compiler.py +205 -0
- numba_cuda/numba/cuda/core/typed_passes.py +139 -0
- numba_cuda/numba/cuda/cudadecl.py +0 -268
- numba_cuda/numba/cuda/cudadrv/devicearray.py +3 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +2 -1
- numba_cuda/numba/cuda/cudadrv/nvvm.py +1 -1
- numba_cuda/numba/cuda/cudaimpl.py +4 -178
- numba_cuda/numba/cuda/debuginfo.py +469 -3
- numba_cuda/numba/cuda/device_init.py +0 -1
- numba_cuda/numba/cuda/dispatcher.py +310 -11
- numba_cuda/numba/cuda/extending.py +2 -1
- numba_cuda/numba/cuda/fp16.py +348 -0
- numba_cuda/numba/cuda/intrinsics.py +1 -1
- numba_cuda/numba/cuda/libdeviceimpl.py +2 -1
- numba_cuda/numba/cuda/lowering.py +1833 -8
- numba_cuda/numba/cuda/mathimpl.py +2 -90
- numba_cuda/numba/cuda/nvvmutils.py +2 -1
- numba_cuda/numba/cuda/printimpl.py +2 -1
- numba_cuda/numba/cuda/serialize.py +264 -0
- numba_cuda/numba/cuda/simulator/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/dispatcher.py +7 -0
- numba_cuda/numba/cuda/stubs.py +0 -308
- numba_cuda/numba/cuda/target.py +13 -5
- numba_cuda/numba/cuda/testing.py +156 -5
- numba_cuda/numba/cuda/tests/complex_usecases.py +113 -0
- numba_cuda/numba/cuda/tests/core/serialize_usecases.py +110 -0
- numba_cuda/numba/cuda/tests/core/test_serialize.py +359 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +10 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +33 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +5 -10
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +15 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +381 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +108 -24
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +37 -23
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +43 -27
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +26 -9
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +27 -2
- numba_cuda/numba/cuda/tests/enum_usecases.py +56 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +1 -2
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +1 -1
- numba_cuda/numba/cuda/utils.py +785 -0
- numba_cuda/numba/cuda/vector_types.py +1 -1
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/METADATA +18 -4
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/RECORD +63 -50
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -46
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/WHEEL +0 -0
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/top_level.txt +0 -0
|
@@ -3,7 +3,6 @@ import numpy as np
|
|
|
3
3
|
import operator
|
|
4
4
|
import re
|
|
5
5
|
from numba import cuda, int64
|
|
6
|
-
from numba.cuda import compile_ptx
|
|
7
6
|
from numba.core.errors import TypingError
|
|
8
7
|
from numba.core.types import f2
|
|
9
8
|
from numba.cuda.testing import (
|
|
@@ -11,6 +10,7 @@ from numba.cuda.testing import (
|
|
|
11
10
|
CUDATestCase,
|
|
12
11
|
skip_on_cudasim,
|
|
13
12
|
skip_unless_cc_53,
|
|
13
|
+
skip_if_nvjitlink_missing,
|
|
14
14
|
)
|
|
15
15
|
|
|
16
16
|
|
|
@@ -174,27 +174,27 @@ def hlt_func_2(x, y):
|
|
|
174
174
|
return cuda.fp16.hlt(x, y)
|
|
175
175
|
|
|
176
176
|
|
|
177
|
-
def
|
|
177
|
+
def multiple_hcmp_1(r, a, b, c):
|
|
178
178
|
# float16 predicates used in two separate functions
|
|
179
179
|
r[0] = hlt_func_1(a, b) and hlt_func_2(b, c)
|
|
180
180
|
|
|
181
181
|
|
|
182
|
-
def
|
|
182
|
+
def multiple_hcmp_2(r, a, b, c):
|
|
183
183
|
# The same float16 predicate used in the caller and callee
|
|
184
184
|
r[0] = hlt_func_1(a, b) and cuda.fp16.hlt(b, c)
|
|
185
185
|
|
|
186
186
|
|
|
187
|
-
def
|
|
187
|
+
def multiple_hcmp_3(r, a, b, c):
|
|
188
188
|
# Different float16 predicates used in the caller and callee
|
|
189
189
|
r[0] = hlt_func_1(a, b) and cuda.fp16.hge(c, b)
|
|
190
190
|
|
|
191
191
|
|
|
192
|
-
def
|
|
192
|
+
def multiple_hcmp_4(r, a, b, c):
|
|
193
193
|
# The same float16 predicates used twice in a function
|
|
194
194
|
r[0] = cuda.fp16.hlt(a, b) and cuda.fp16.hlt(b, c)
|
|
195
195
|
|
|
196
196
|
|
|
197
|
-
def
|
|
197
|
+
def multiple_hcmp_5(r, a, b, c):
|
|
198
198
|
# Different float16 predicates used in a function
|
|
199
199
|
r[0] = cuda.fp16.hlt(a, b) and cuda.fp16.hge(c, b)
|
|
200
200
|
|
|
@@ -561,13 +561,13 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
|
561
561
|
|
|
562
562
|
def test_popc_u1(self):
|
|
563
563
|
compiled = cuda.jit("void(int32[:], uint8)")(simple_popc)
|
|
564
|
-
ary = np.zeros(1, dtype=np.
|
|
564
|
+
ary = np.zeros(1, dtype=np.int32)
|
|
565
565
|
compiled[1, 1](ary, np.uint8(0xFF))
|
|
566
566
|
self.assertEqual(ary[0], 8)
|
|
567
567
|
|
|
568
568
|
def test_popc_u2(self):
|
|
569
569
|
compiled = cuda.jit("void(int32[:], uint16)")(simple_popc)
|
|
570
|
-
ary = np.zeros(1, dtype=np.
|
|
570
|
+
ary = np.zeros(1, dtype=np.int32)
|
|
571
571
|
compiled[1, 1](ary, np.uint16(0xFFFF))
|
|
572
572
|
self.assertEqual(ary[0], 16)
|
|
573
573
|
|
|
@@ -585,13 +585,13 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
|
585
585
|
|
|
586
586
|
def test_bit_count_u1(self):
|
|
587
587
|
compiled = cuda.jit("void(int32[:], uint8)")(simple_bit_count)
|
|
588
|
-
ary = np.zeros(1, dtype=np.
|
|
588
|
+
ary = np.zeros(1, dtype=np.int32)
|
|
589
589
|
compiled[1, 1](ary, np.uint8(0xFF))
|
|
590
590
|
self.assertEqual(ary[0], 8)
|
|
591
591
|
|
|
592
592
|
def test_bit_count_u2(self):
|
|
593
593
|
compiled = cuda.jit("void(int32[:], uint16)")(simple_bit_count)
|
|
594
|
-
ary = np.zeros(1, dtype=np.
|
|
594
|
+
ary = np.zeros(1, dtype=np.int32)
|
|
595
595
|
compiled[1, 1](ary, np.uint16(0xFFFF))
|
|
596
596
|
self.assertEqual(ary[0], 16)
|
|
597
597
|
|
|
@@ -639,9 +639,11 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
|
639
639
|
np.testing.assert_allclose(ary[0], ref)
|
|
640
640
|
|
|
641
641
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
642
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
642
643
|
def test_hadd_ptx(self):
|
|
644
|
+
compiled = cuda.jit("void(f2[:], f2, f2)", lto=True)(simple_hadd_scalar)
|
|
643
645
|
args = (f2[:], f2, f2)
|
|
644
|
-
ptx
|
|
646
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
645
647
|
self.assertIn("add.f16", ptx)
|
|
646
648
|
|
|
647
649
|
@skip_unless_cc_53
|
|
@@ -666,9 +668,13 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
|
666
668
|
np.testing.assert_allclose(ary[0], ref)
|
|
667
669
|
|
|
668
670
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
671
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
669
672
|
def test_hfma_ptx(self):
|
|
673
|
+
compiled = cuda.jit("void(f2[:], f2, f2, f2)", lto=True)(
|
|
674
|
+
simple_hfma_scalar
|
|
675
|
+
)
|
|
670
676
|
args = (f2[:], f2, f2, f2)
|
|
671
|
-
ptx
|
|
677
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
672
678
|
self.assertIn("fma.rn.f16", ptx)
|
|
673
679
|
|
|
674
680
|
@skip_unless_cc_53
|
|
@@ -691,14 +697,16 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
|
691
697
|
np.testing.assert_allclose(ary[0], ref)
|
|
692
698
|
|
|
693
699
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
700
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
694
701
|
def test_hsub_ptx(self):
|
|
702
|
+
compiled = cuda.jit("void(f2[:], f2, f2)", lto=True)(simple_hsub_scalar)
|
|
695
703
|
args = (f2[:], f2, f2)
|
|
696
|
-
ptx
|
|
704
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
697
705
|
self.assertIn("sub.f16", ptx)
|
|
698
706
|
|
|
699
707
|
@skip_unless_cc_53
|
|
700
708
|
def test_hmul(self):
|
|
701
|
-
compiled = cuda.jit(
|
|
709
|
+
compiled = cuda.jit(simple_hmul)
|
|
702
710
|
ary = np.zeros(1, dtype=np.float16)
|
|
703
711
|
arg1 = np.array([3.0], dtype=np.float16)
|
|
704
712
|
arg2 = np.array([4.0], dtype=np.float16)
|
|
@@ -716,9 +724,11 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
|
716
724
|
np.testing.assert_allclose(ary[0], ref)
|
|
717
725
|
|
|
718
726
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
727
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
719
728
|
def test_hmul_ptx(self):
|
|
729
|
+
compiled = cuda.jit("void(f2[:], f2, f2)", lto=True)(simple_hmul_scalar)
|
|
720
730
|
args = (f2[:], f2, f2)
|
|
721
|
-
ptx
|
|
731
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
722
732
|
self.assertIn("mul.f16", ptx)
|
|
723
733
|
|
|
724
734
|
@skip_unless_cc_53
|
|
@@ -761,14 +771,16 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
|
761
771
|
np.testing.assert_allclose(ary[0], ref)
|
|
762
772
|
|
|
763
773
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
774
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
764
775
|
def test_hneg_ptx(self):
|
|
776
|
+
compiled = cuda.jit("void(f2[:], f2)", lto=True)(simple_hneg_scalar)
|
|
765
777
|
args = (f2[:], f2)
|
|
766
|
-
ptx
|
|
778
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
767
779
|
self.assertIn("neg.f16", ptx)
|
|
768
780
|
|
|
769
781
|
@skip_unless_cc_53
|
|
770
782
|
def test_habs(self):
|
|
771
|
-
compiled = cuda.jit(
|
|
783
|
+
compiled = cuda.jit(simple_habs)
|
|
772
784
|
ary = np.zeros(1, dtype=np.float16)
|
|
773
785
|
arg1 = np.array([-3.0], dtype=np.float16)
|
|
774
786
|
compiled[1, 1](ary, arg1)
|
|
@@ -784,9 +796,11 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
|
784
796
|
np.testing.assert_allclose(ary[0], ref)
|
|
785
797
|
|
|
786
798
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
799
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
787
800
|
def test_habs_ptx(self):
|
|
801
|
+
compiled = cuda.jit("void(f2[:], f2)", lto=True)(simple_habs_scalar)
|
|
788
802
|
args = (f2[:], f2)
|
|
789
|
-
ptx
|
|
803
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
790
804
|
self.assertIn("abs.f16", ptx)
|
|
791
805
|
|
|
792
806
|
@skip_unless_cc_53
|
|
@@ -908,11 +922,11 @@ class TestCudaIntrinsic(CUDATestCase):
|
|
|
908
922
|
@skip_unless_cc_53
|
|
909
923
|
def test_multiple_float16_comparisons(self):
|
|
910
924
|
functions = (
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
925
|
+
multiple_hcmp_1,
|
|
926
|
+
multiple_hcmp_2,
|
|
927
|
+
multiple_hcmp_3,
|
|
928
|
+
multiple_hcmp_4,
|
|
929
|
+
multiple_hcmp_5,
|
|
916
930
|
)
|
|
917
931
|
for fn in functions:
|
|
918
932
|
with self.subTest(fn=fn):
|
|
@@ -4,10 +4,12 @@ from numba.cuda.testing import (
|
|
|
4
4
|
CUDATestCase,
|
|
5
5
|
skip_unless_cc_53,
|
|
6
6
|
skip_on_cudasim,
|
|
7
|
+
skip_if_nvjitlink_missing,
|
|
7
8
|
)
|
|
8
9
|
from numba import cuda
|
|
10
|
+
from numba.core import types
|
|
9
11
|
from numba.core.types import f2, b1
|
|
10
|
-
from numba.
|
|
12
|
+
from numba.core.typing import signature
|
|
11
13
|
import operator
|
|
12
14
|
import itertools
|
|
13
15
|
from numba.np.numpy_support import from_dtype
|
|
@@ -87,27 +89,27 @@ def hlt_func_2(x, y):
|
|
|
87
89
|
return x < y
|
|
88
90
|
|
|
89
91
|
|
|
90
|
-
def
|
|
92
|
+
def multiple_hcmp_1(r, a, b, c):
|
|
91
93
|
# float16 predicates used in two separate functions
|
|
92
94
|
r[0] = hlt_func_1(a, b) and hlt_func_2(b, c)
|
|
93
95
|
|
|
94
96
|
|
|
95
|
-
def
|
|
97
|
+
def multiple_hcmp_2(r, a, b, c):
|
|
96
98
|
# The same float16 predicate used in the caller and callee
|
|
97
99
|
r[0] = hlt_func_1(a, b) and b < c
|
|
98
100
|
|
|
99
101
|
|
|
100
|
-
def
|
|
102
|
+
def multiple_hcmp_3(r, a, b, c):
|
|
101
103
|
# Different float16 predicates used in the caller and callee
|
|
102
104
|
r[0] = hlt_func_1(a, b) and c >= b
|
|
103
105
|
|
|
104
106
|
|
|
105
|
-
def
|
|
107
|
+
def multiple_hcmp_4(r, a, b, c):
|
|
106
108
|
# The same float16 predicates used twice in a function
|
|
107
109
|
r[0] = a < b and b < c
|
|
108
110
|
|
|
109
111
|
|
|
110
|
-
def
|
|
112
|
+
def multiple_hcmp_5(r, a, b, c):
|
|
111
113
|
# Different float16 predicates used in a function
|
|
112
114
|
r[0] = a < b and c >= b
|
|
113
115
|
|
|
@@ -172,16 +174,19 @@ class TestOperatorModule(CUDATestCase):
|
|
|
172
174
|
np.testing.assert_allclose(got, expected)
|
|
173
175
|
|
|
174
176
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
177
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
175
178
|
def test_fp16_binary_ptx(self):
|
|
176
179
|
functions = (simple_fp16add, simple_fp16sub, simple_fp16mul)
|
|
177
180
|
instrs = ("add.f16", "sub.f16", "mul.f16")
|
|
178
181
|
args = (f2[:], f2, f2)
|
|
179
182
|
for fn, instr in zip(functions, instrs):
|
|
180
183
|
with self.subTest(instr=instr):
|
|
181
|
-
|
|
184
|
+
compiled = cuda.jit("void(f2[:], f2, f2)", lto=True)(fn)
|
|
185
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
182
186
|
self.assertIn(instr, ptx)
|
|
183
187
|
|
|
184
188
|
@skip_unless_cc_53
|
|
189
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
185
190
|
def test_mixed_fp16_binary_arithmetic(self):
|
|
186
191
|
functions = (
|
|
187
192
|
simple_fp16add,
|
|
@@ -193,7 +198,7 @@ class TestOperatorModule(CUDATestCase):
|
|
|
193
198
|
types = (np.int8, np.int16, np.int32, np.int64, np.float32, np.float64)
|
|
194
199
|
for (fn, op), ty in itertools.product(zip(functions, ops), types):
|
|
195
200
|
with self.subTest(op=op, ty=ty):
|
|
196
|
-
kernel = cuda.jit(fn)
|
|
201
|
+
kernel = cuda.jit(fn, lto=True)
|
|
197
202
|
|
|
198
203
|
arg1 = np.random.random(1).astype(np.float16)
|
|
199
204
|
arg2 = (np.random.random(1) * 100).astype(ty)
|
|
@@ -205,6 +210,7 @@ class TestOperatorModule(CUDATestCase):
|
|
|
205
210
|
np.testing.assert_allclose(got, expected)
|
|
206
211
|
|
|
207
212
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
213
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
208
214
|
def test_fp16_inplace_binary_ptx(self):
|
|
209
215
|
functions = (simple_fp16_iadd, simple_fp16_isub, simple_fp16_imul)
|
|
210
216
|
instrs = ("add.f16", "sub.f16", "mul.f16")
|
|
@@ -212,7 +218,8 @@ class TestOperatorModule(CUDATestCase):
|
|
|
212
218
|
|
|
213
219
|
for fn, instr in zip(functions, instrs):
|
|
214
220
|
with self.subTest(instr=instr):
|
|
215
|
-
|
|
221
|
+
compiled = cuda.jit("void(f2[:], f2)", lto=True)(fn)
|
|
222
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
216
223
|
self.assertIn(instr, ptx)
|
|
217
224
|
|
|
218
225
|
@skip_unless_cc_53
|
|
@@ -253,16 +260,19 @@ class TestOperatorModule(CUDATestCase):
|
|
|
253
260
|
np.testing.assert_allclose(got, expected)
|
|
254
261
|
|
|
255
262
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
263
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
256
264
|
def test_fp16_neg_ptx(self):
|
|
257
265
|
args = (f2[:], f2)
|
|
258
|
-
|
|
266
|
+
compiled = cuda.jit("void(f2[:], f2)", lto=True)(simple_fp16neg)
|
|
267
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
259
268
|
self.assertIn("neg.f16", ptx)
|
|
260
269
|
|
|
261
270
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
271
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
262
272
|
def test_fp16_abs_ptx(self):
|
|
263
273
|
args = (f2[:], f2)
|
|
264
|
-
|
|
265
|
-
|
|
274
|
+
compiled = cuda.jit("void(f2[:], f2)", lto=True)(simple_fp16abs)
|
|
275
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
266
276
|
self.assertIn("abs.f16", ptx)
|
|
267
277
|
|
|
268
278
|
@skip_unless_cc_53
|
|
@@ -331,11 +341,11 @@ class TestOperatorModule(CUDATestCase):
|
|
|
331
341
|
@skip_unless_cc_53
|
|
332
342
|
def test_multiple_float16_comparisons(self):
|
|
333
343
|
functions = (
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
344
|
+
multiple_hcmp_1,
|
|
345
|
+
multiple_hcmp_2,
|
|
346
|
+
multiple_hcmp_3,
|
|
347
|
+
multiple_hcmp_4,
|
|
348
|
+
multiple_hcmp_5,
|
|
339
349
|
)
|
|
340
350
|
for fn in functions:
|
|
341
351
|
with self.subTest(fn=fn):
|
|
@@ -350,11 +360,11 @@ class TestOperatorModule(CUDATestCase):
|
|
|
350
360
|
@skip_unless_cc_53
|
|
351
361
|
def test_multiple_float16_comparisons_false(self):
|
|
352
362
|
functions = (
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
363
|
+
multiple_hcmp_1,
|
|
364
|
+
multiple_hcmp_2,
|
|
365
|
+
multiple_hcmp_3,
|
|
366
|
+
multiple_hcmp_4,
|
|
367
|
+
multiple_hcmp_5,
|
|
358
368
|
)
|
|
359
369
|
for fn in functions:
|
|
360
370
|
with self.subTest(fn=fn):
|
|
@@ -367,6 +377,7 @@ class TestOperatorModule(CUDATestCase):
|
|
|
367
377
|
self.assertFalse(ary[0])
|
|
368
378
|
|
|
369
379
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
380
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
370
381
|
def test_fp16_comparison_ptx(self):
|
|
371
382
|
functions = (
|
|
372
383
|
simple_fp16_gt,
|
|
@@ -390,16 +401,18 @@ class TestOperatorModule(CUDATestCase):
|
|
|
390
401
|
"setp.lt.f16",
|
|
391
402
|
"setp.le.f16",
|
|
392
403
|
"setp.eq.f16",
|
|
393
|
-
"setp.
|
|
404
|
+
"setp.neu.f16",
|
|
394
405
|
)
|
|
395
406
|
args = (b1[:], f2, f2)
|
|
396
407
|
|
|
397
408
|
for fn, op, s in zip(functions, ops, opstring):
|
|
398
409
|
with self.subTest(op=op):
|
|
399
|
-
|
|
410
|
+
compiled = cuda.jit("void(b1[:], f2, f2)", lto=True)(fn)
|
|
411
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
400
412
|
self.assertIn(s, ptx)
|
|
401
413
|
|
|
402
414
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
415
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
403
416
|
def test_fp16_int8_comparison_ptx(self):
|
|
404
417
|
# Test that int8 can be safely converted to fp16
|
|
405
418
|
# in a comparison
|
|
@@ -426,15 +439,17 @@ class TestOperatorModule(CUDATestCase):
|
|
|
426
439
|
operator.lt: "setp.lt.f16",
|
|
427
440
|
operator.le: "setp.le.f16",
|
|
428
441
|
operator.eq: "setp.eq.f16",
|
|
429
|
-
operator.ne: "setp.
|
|
442
|
+
operator.ne: "setp.neu.f16",
|
|
430
443
|
}
|
|
431
444
|
for fn, op in zip(functions, ops):
|
|
432
445
|
with self.subTest(op=op):
|
|
433
446
|
args = (b1[:], f2, from_dtype(np.int8))
|
|
434
|
-
|
|
447
|
+
compiled = cuda.jit(signature(types.void, *args), lto=True)(fn)
|
|
448
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
435
449
|
self.assertIn(opstring[op], ptx)
|
|
436
450
|
|
|
437
451
|
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
452
|
+
@skip_if_nvjitlink_missing("Numbast generated bindings")
|
|
438
453
|
def test_mixed_fp16_comparison_promotion_ptx(self):
|
|
439
454
|
functions = (
|
|
440
455
|
simple_fp16_gt,
|
|
@@ -475,7 +490,8 @@ class TestOperatorModule(CUDATestCase):
|
|
|
475
490
|
with self.subTest(op=op, ty=ty):
|
|
476
491
|
arg2_ty = np.result_type(np.float16, ty)
|
|
477
492
|
args = (b1[:], f2, from_dtype(arg2_ty))
|
|
478
|
-
|
|
493
|
+
compiled = cuda.jit(signature(types.void, *args), lto=True)(fn)
|
|
494
|
+
ptx = compiled.inspect_lto_ptx(args)
|
|
479
495
|
|
|
480
496
|
ops = opstring[op] + opsuffix[arg2_ty]
|
|
481
497
|
self.assertIn(ops, ptx)
|
|
@@ -17,16 +17,16 @@ def _make_ufunc_usecase(ufunc):
|
|
|
17
17
|
return fn
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
# This
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
# class instead. These are:
|
|
20
|
+
# This class provides common functionality for UFunc tests. The UFunc tests
|
|
21
|
+
# are quite long-running in comparison to other tests, so we break the tests up
|
|
22
|
+
# into multiple test classes for distribution across workers.
|
|
24
23
|
#
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
24
|
+
# This class would also be a CUDATestCase, but to avoid a confusing and
|
|
25
|
+
# potentially dangerous inheritance diamond with setUp methods that modify
|
|
26
|
+
# global state, we implement the necessary part of CUDATestCase within this
|
|
27
|
+
# class instead. This disables CUDA performance warnings for the duration of
|
|
28
|
+
# tests.
|
|
29
|
+
class CUDAUFuncTestBase(BasicUFuncTest, TestCase):
|
|
30
30
|
def setUp(self):
|
|
31
31
|
BasicUFuncTest.setUp(self)
|
|
32
32
|
|
|
@@ -146,6 +146,8 @@ class TestUFuncs(BasicUFuncTest, TestCase):
|
|
|
146
146
|
############################################################################
|
|
147
147
|
# Trigonometric Functions
|
|
148
148
|
|
|
149
|
+
|
|
150
|
+
class TestBasicTrigUFuncs(CUDAUFuncTestBase):
|
|
149
151
|
def test_sin_ufunc(self):
|
|
150
152
|
self.basic_ufunc_test(np.sin, kinds="cf")
|
|
151
153
|
|
|
@@ -167,6 +169,8 @@ class TestUFuncs(BasicUFuncTest, TestCase):
|
|
|
167
169
|
def test_arctan2_ufunc(self):
|
|
168
170
|
self.basic_ufunc_test(np.arctan2, kinds="f")
|
|
169
171
|
|
|
172
|
+
|
|
173
|
+
class TestHypTrigUFuncs(CUDAUFuncTestBase):
|
|
170
174
|
def test_hypot_ufunc(self):
|
|
171
175
|
self.basic_ufunc_test(np.hypot, kinds="f")
|
|
172
176
|
|
|
@@ -207,6 +211,8 @@ class TestUFuncs(BasicUFuncTest, TestCase):
|
|
|
207
211
|
|
|
208
212
|
self.basic_ufunc_test(np.arctanh, skip_inputs=to_skip, kinds="cf")
|
|
209
213
|
|
|
214
|
+
|
|
215
|
+
class TestConversionUFuncs(CUDAUFuncTestBase):
|
|
210
216
|
def test_deg2rad_ufunc(self):
|
|
211
217
|
self.basic_ufunc_test(np.deg2rad, kinds="f")
|
|
212
218
|
|
|
@@ -221,6 +227,9 @@ class TestUFuncs(BasicUFuncTest, TestCase):
|
|
|
221
227
|
|
|
222
228
|
############################################################################
|
|
223
229
|
# Comparison functions
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class TestComparisonUFuncs1(CUDAUFuncTestBase):
|
|
224
233
|
def test_greater_ufunc(self):
|
|
225
234
|
self.signed_unsigned_cmp_test(np.greater)
|
|
226
235
|
|
|
@@ -239,6 +248,8 @@ class TestUFuncs(BasicUFuncTest, TestCase):
|
|
|
239
248
|
def test_equal_ufunc(self):
|
|
240
249
|
self.signed_unsigned_cmp_test(np.equal)
|
|
241
250
|
|
|
251
|
+
|
|
252
|
+
class TestLogicalUFuncs(CUDAUFuncTestBase):
|
|
242
253
|
def test_logical_and_ufunc(self):
|
|
243
254
|
self.basic_ufunc_test(np.logical_and)
|
|
244
255
|
|
|
@@ -251,6 +262,8 @@ class TestUFuncs(BasicUFuncTest, TestCase):
|
|
|
251
262
|
def test_logical_not_ufunc(self):
|
|
252
263
|
self.basic_ufunc_test(np.logical_not)
|
|
253
264
|
|
|
265
|
+
|
|
266
|
+
class TestMinmaxUFuncs(CUDAUFuncTestBase):
|
|
254
267
|
def test_maximum_ufunc(self):
|
|
255
268
|
self.basic_ufunc_test(np.maximum)
|
|
256
269
|
|
|
@@ -263,6 +276,8 @@ class TestUFuncs(BasicUFuncTest, TestCase):
|
|
|
263
276
|
def test_fmin_ufunc(self):
|
|
264
277
|
self.basic_ufunc_test(np.fmin)
|
|
265
278
|
|
|
279
|
+
|
|
280
|
+
class TestBitwiseUFuncs(CUDAUFuncTestBase):
|
|
266
281
|
def test_bitwise_and_ufunc(self):
|
|
267
282
|
self.basic_int_ufunc_test(np.bitwise_and)
|
|
268
283
|
|
|
@@ -286,6 +301,8 @@ class TestUFuncs(BasicUFuncTest, TestCase):
|
|
|
286
301
|
############################################################################
|
|
287
302
|
# Mathematical Functions
|
|
288
303
|
|
|
304
|
+
|
|
305
|
+
class TestLogUFuncs(CUDAUFuncTestBase):
|
|
289
306
|
def test_log_ufunc(self):
|
|
290
307
|
self.basic_ufunc_test(np.log, kinds="cf")
|
|
291
308
|
|
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
import numpy as np
|
|
2
2
|
from numba import cuda
|
|
3
|
-
from numba.cuda.
|
|
4
|
-
from numba.
|
|
3
|
+
from numba.cuda.cudadrv import driver
|
|
4
|
+
from numba.cuda.testing import (
|
|
5
|
+
unittest,
|
|
6
|
+
CUDATestCase,
|
|
7
|
+
skip_on_cudasim,
|
|
8
|
+
)
|
|
9
|
+
from numba.tests.support import linux_only, override_config, run_in_subprocess
|
|
5
10
|
from numba.core.errors import NumbaPerformanceWarning
|
|
6
11
|
from numba.core import config
|
|
7
12
|
import warnings
|
|
@@ -9,6 +14,26 @@ import warnings
|
|
|
9
14
|
|
|
10
15
|
@skip_on_cudasim("cudasim does not raise performance warnings")
|
|
11
16
|
class TestWarnings(CUDATestCase):
|
|
17
|
+
def test_float16_warn_if_lto_missing(self):
|
|
18
|
+
fp16_kernel_invocation = """
|
|
19
|
+
import math
|
|
20
|
+
from numba import cuda, core
|
|
21
|
+
|
|
22
|
+
@cuda.jit
|
|
23
|
+
def kernel():
|
|
24
|
+
x = core.types.float16(1.0)
|
|
25
|
+
y = math.sin(x)
|
|
26
|
+
|
|
27
|
+
kernel[1,1]()
|
|
28
|
+
kernel[1,1]()
|
|
29
|
+
"""
|
|
30
|
+
performance_warning = "float16 relies on LTO for performance"
|
|
31
|
+
expected_warning_count = 0 if driver._have_nvjitlink() else 1
|
|
32
|
+
_, err = run_in_subprocess(fp16_kernel_invocation)
|
|
33
|
+
self.assertEqual(
|
|
34
|
+
err.decode().count(performance_warning), expected_warning_count
|
|
35
|
+
)
|
|
36
|
+
|
|
12
37
|
def test_inefficient_launch_configuration(self):
|
|
13
38
|
@cuda.jit
|
|
14
39
|
def kernel():
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from enum import Enum, IntEnum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Color(Enum):
|
|
5
|
+
red = 1
|
|
6
|
+
green = 2
|
|
7
|
+
blue = 3
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Shake(Enum):
|
|
11
|
+
vanilla = 7
|
|
12
|
+
chocolate = 4
|
|
13
|
+
cookies = 9
|
|
14
|
+
# Same as Color.blue
|
|
15
|
+
mint = 3
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Planet(Enum):
|
|
19
|
+
MERCURY = (3.303e23, 2.4397e6)
|
|
20
|
+
VENUS = (4.869e24, 6.0518e6)
|
|
21
|
+
EARTH = (5.976e24, 6.37814e6)
|
|
22
|
+
MARS = (6.421e23, 3.3972e6)
|
|
23
|
+
JUPITER = (1.9e27, 7.1492e7)
|
|
24
|
+
SATURN = (5.688e26, 6.0268e7)
|
|
25
|
+
URANUS = (8.686e25, 2.5559e7)
|
|
26
|
+
NEPTUNE = (1.024e26, 2.4746e7)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class HeterogeneousEnum(Enum):
|
|
30
|
+
red = 1.0
|
|
31
|
+
green = 2.0
|
|
32
|
+
blue = 3j
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Shape(IntEnum):
|
|
36
|
+
# Same as Color.green
|
|
37
|
+
circle = 2
|
|
38
|
+
# Same as RequestError.internal_error
|
|
39
|
+
square = 500
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class RequestError(IntEnum):
|
|
43
|
+
dummy = 2
|
|
44
|
+
not_found = 404
|
|
45
|
+
internal_error = 500
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class IntEnumWithNegatives(IntEnum):
|
|
49
|
+
# Used for testing of hash, need to make sure -1 -> -2 to comply with CPy
|
|
50
|
+
one = 1
|
|
51
|
+
two = 2
|
|
52
|
+
too = 2
|
|
53
|
+
three = 3
|
|
54
|
+
negone = -1
|
|
55
|
+
negtwo = -2
|
|
56
|
+
negthree = -3
|
|
@@ -9,7 +9,6 @@ from numba.cuda.cudadrv import nvvm
|
|
|
9
9
|
from numba.cuda.testing import (
|
|
10
10
|
unittest,
|
|
11
11
|
skip_on_cudasim,
|
|
12
|
-
SerialMixin,
|
|
13
12
|
skip_unless_conda_cudatoolkit,
|
|
14
13
|
)
|
|
15
14
|
from numba.cuda.cuda_paths import (
|
|
@@ -24,7 +23,7 @@ has_cuda = nvvm.is_available()
|
|
|
24
23
|
has_mp_get_context = hasattr(mp, "get_context")
|
|
25
24
|
|
|
26
25
|
|
|
27
|
-
class LibraryLookupBase(
|
|
26
|
+
class LibraryLookupBase(unittest.TestCase):
|
|
28
27
|
def setUp(self):
|
|
29
28
|
ctx = mp.get_context("spawn")
|
|
30
29
|
|