numba-cuda 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/compiler.py +35 -3
- numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
- numba_cuda/numba/cuda/cuda_paths.py +2 -0
- numba_cuda/numba/cuda/cudadecl.py +0 -42
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +11 -2
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +10 -3
- numba_cuda/numba/cuda/cudaimpl.py +0 -63
- numba_cuda/numba/cuda/debuginfo.py +92 -2
- numba_cuda/numba/cuda/decorators.py +27 -1
- numba_cuda/numba/cuda/device_init.py +4 -5
- numba_cuda/numba/cuda/dispatcher.py +4 -3
- numba_cuda/numba/cuda/extending.py +54 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
- numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +550 -387
- numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +465 -316
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
- numba_cuda/numba/cuda/intrinsics.py +172 -1
- numba_cuda/numba/cuda/lowering.py +43 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/target.py +28 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +4 -2
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +18 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +4 -2
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +156 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +50 -5
- numba_cuda/numba/cuda/vector_types.py +3 -1
- numba_cuda/numba/cuda/vectorizers.py +1 -1
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/METADATA +1 -1
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/RECORD +43 -33
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/WHEEL +1 -1
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/intrinsic_wrapper.py
CHANGED
@@ -36,42 +36,3 @@ def ballot_sync(mask, predicate):
     and are within the given mask.
     """
     return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
-
-
-@jit(device=True)
-def shfl_sync(mask, value, src_lane):
-    """
-    Shuffles value across the masked warp and returns the value
-    from src_lane. If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_up_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid - delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
-
-
-@jit(device=True)
-def shfl_down_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid + delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_xor_sync(mask, value, lane_mask):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid ^ lane_mask).
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]
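The device-function wrappers removed above are superseded by typed `@intrinsic` implementations in `intrinsics.py` (next hunks), so the kernel-facing API is unchanged. A minimal usage sketch, not part of the diff (the kernel name and launch shape are illustrative):

```python
from numba import cuda
import numpy as np

@cuda.jit
def broadcast_lane0(out):
    # Every lane reads lane 0's value across the full warp mask.
    lane = cuda.laneid
    out[lane] = cuda.shfl_sync(0xFFFFFFFF, lane * 10, 0)

out = np.zeros(32, dtype=np.int64)
broadcast_lane0[1, 32](out)
print(out)  # expected: all zeros (lane 0 contributes 0 * 10)
```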
numba_cuda/numba/cuda/intrinsics.py
CHANGED
@@ -2,7 +2,7 @@ from llvmlite import ir
 
 from numba import cuda, types
 from numba.core import cgutils
-from numba.core.errors import RequireLiteralValue
+from numba.core.errors import RequireLiteralValue, TypingError
 from numba.core.typing import signature
 from numba.core.extending import overload_attribute, overload_method
 from numba.cuda import nvvmutils
@@ -205,3 +205,174 @@ def syncthreads_or(typingctx, predicate):
 @overload_method(types.Integer, "bit_count", target="cuda")
 def integer_bit_count(i):
     return lambda i: cuda.popc(i)
+
+
+# -------------------------------------------------------------------------------
+# Warp shuffle functions
+#
+# References:
+#
+# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
+# - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-movement
+#
+# Notes:
+#
+# - The public CUDA C/C++ and Numba Python APIs for these intrinsics use
+#   different names for parameters to the NVVM IR specification. So that we
+#   can correlate the implementation with the documentation, the @intrinsic
+#   API functions map the public API arguments to the NVVM intrinsic
+#   arguments.
+# - The NVVM IR specification requires some of the parameters (e.g. mode) to be
+#   constants. It's therefore essential that we pass in some values to the
+#   shfl_sync_intrinsic function (e.g. the mode and c values).
+# - Normally parameters for intrinsic functions in Numba would be given the
+#   same name as used in the API, and would contain a type. However, because we
+#   have to pass in some values and some types (and there is divergence between
+#   the names in the intrinsic documentation and the public APIs) we instead
+#   follow the convention of naming shfl_sync_intrinsic parameters with a
+#   suffix of _type or _value depending on whether they contain a type or a
+#   value.
+
+
+@intrinsic
+def shfl_sync(typingctx, mask, value, src_lane):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``src_lane``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 0
+    a_type = value
+    b_type = src_lane
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_up_sync(typingctx, mask, value, delta):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid - delta)``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 1
+    a_type = value
+    b_type = delta
+    c_value = 0
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_down_sync(typingctx, mask, value, delta):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid + delta)``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 2
+    a_type = value
+    b_type = delta
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_xor_sync(typingctx, mask, value, lane_mask):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid ^ lane_mask)``.
+    """
+    membermask_type = mask
+    mode_value = 3
+    a_type = value
+    b_type = lane_mask
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+def shfl_sync_intrinsic(
+    typingctx,
+    membermask_type,
+    mode_value,
+    a_type,
+    b_type,
+    c_value,
+):
+    if a_type not in (types.i4, types.i8, types.f4, types.f8):
+        raise TypingError(
+            "shfl_sync only supports 32- and 64-bit ints and floats"
+        )
+
+    def codegen(context, builder, sig, args):
+        """
+        The NVVM shfl_sync intrinsic only supports i32, but the CUDA C/C++
+        intrinsic supports both 32- and 64-bit ints and floats, so for feature
+        parity, i32, i64, f32, and f64 are implemented. Floats by way of
+        bitcasting the float to an int, then shuffling, then bitcasting
+        back."""
+        membermask, a, b = args
+
+        # Types
+        a_type = sig.args[1]
+        return_type = context.get_value_type(sig.return_type)
+        i32 = ir.IntType(32)
+        i64 = ir.IntType(64)
+
+        if a_type in types.real_domain:
+            a = builder.bitcast(a, ir.IntType(a_type.bitwidth))
+
+        # NVVM intrinsic definition
+        arg_types = (i32, i32, i32, i32, i32)
+        shfl_return_type = ir.LiteralStructType((i32, ir.IntType(1)))
+        fnty = ir.FunctionType(shfl_return_type, arg_types)
+
+        fname = "llvm.nvvm.shfl.sync.i32"
+        shfl_sync = cgutils.get_or_insert_function(builder.module, fnty, fname)
+
+        # Intrinsic arguments
+        mode = ir.Constant(i32, mode_value)
+        c = ir.Constant(i32, c_value)
+        membermask = builder.trunc(membermask, i32)
+        b = builder.trunc(b, i32)
+
+        if a_type.bitwidth == 32:
+            a = builder.trunc(a, i32)
+            ret = builder.call(shfl_sync, (membermask, mode, a, b, c))
+            d = builder.extract_value(ret, 0)
+        else:
+            # Handle 64-bit values by shuffling as two 32-bit values and
+            # packing the result into 64 bits.
+
+            # Extract high and low parts
+            lo = builder.trunc(a, i32)
+            a_lshr = builder.lshr(a, ir.Constant(i64, 32))
+            hi = builder.trunc(a_lshr, i32)
+
+            # Shuffle individual parts
+            ret_lo = builder.call(shfl_sync, (membermask, mode, lo, b, c))
+            ret_hi = builder.call(shfl_sync, (membermask, mode, hi, b, c))
+
+            # Combine individual result parts into a 64-bit result
+            d_lo = builder.extract_value(ret_lo, 0)
+            d_hi = builder.extract_value(ret_hi, 0)
+            d_lo_64 = builder.zext(d_lo, i64)
+            d_hi_64 = builder.zext(d_hi, i64)
+            d_shl = builder.shl(d_hi_64, ir.Constant(i64, 32))
+            d = builder.or_(d_shl, d_lo_64)
+
+        return builder.bitcast(d, return_type)
+
+    sig = signature(a_type, membermask_type, a_type, b_type)
+
+    return sig, codegen
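For context, a sketch of what the new intrinsics enable from user code, using a standard warp-sum reduction pattern (none of this appears in the diff; the kernel name is illustrative):

```python
from numba import cuda
import numpy as np

@cuda.jit
def warp_sum(out):
    # Tree reduction: each step adds the value held `offset` lanes above,
    # halving the stride until lane 0 holds the total for all 32 lanes.
    val = cuda.laneid + 1  # lanes contribute 1, 2, ..., 32
    offset = 16
    while offset > 0:
        val += cuda.shfl_down_sync(0xFFFFFFFF, val, offset)
        offset //= 2
    if cuda.laneid == 0:
        out[0] = val

out = np.zeros(1, dtype=np.int64)
warp_sum[1, 32](out)
print(out[0])  # expected: 528 == 32 * 33 // 2
```

Note how the `c_value` passed by each wrapper (0x1F for the idx, down, and xor modes; 0 for the up mode) matches the clamp constants the NVVM shuffle modes expect, which is why it must be a compile-time constant.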
numba_cuda/numba/cuda/lowering.py
ADDED
@@ -0,0 +1,43 @@
+from numba.core.lowering import Lower
+from llvmlite import ir
+
+
+class CUDALower(Lower):
+    def storevar(self, value, name, argidx=None):
+        """
+        Store the value into the given variable.
+        """
+        super().storevar(value, name, argidx)
+
+        # Emit llvm.dbg.value instead of llvm.dbg.declare for local scalar
+        # variables immediately after a store instruction.
+        if (
+            self.context.enable_debuginfo
+            # Conditions used to elide stores in parent method
+            and (
+                name not in self._singly_assigned_vars
+                or self._disable_sroa_like_opt
+            )
+            # No emission of debuginfo for internal names
+            and not name.startswith("$")
+        ):
+            # Emit debug value for user variable
+            fetype = self.typeof(name)
+            lltype = self.context.get_value_type(fetype)
+            int_type = (ir.IntType,)
+            real_type = ir.FloatType, ir.DoubleType
+            if isinstance(lltype, int_type + real_type):
+                # Emit debug value for scalar variable
+                sizeof = self.context.get_abi_sizeof(lltype)
+                datamodel = self.context.data_model_manager[fetype]
+                line = self.loc.line if argidx is None else self.defn_loc.line
+                self.debuginfo.update_variable(
+                    self.builder,
+                    value,
+                    name,
+                    lltype,
+                    sizeof,
+                    line,
+                    datamodel,
+                    argidx,
+                )
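A quick way to observe the new lowering behaviour, as a sketch mirroring the tests added to test_debuginfo.py later in this diff (requires a CUDA toolkit to compile):

```python
from numba import cuda, types

@cuda.jit("void(int32, int32)", debug=True, opt=False)
def f(x, y):
    # z is assigned more than once, so its stores are not elided and each
    # store should now be described with llvm.dbg.value rather than
    # llvm.dbg.declare.
    z = x  # noqa: F841
    z = y  # noqa: F841

llvm_ir = f.inspect_llvm((types.int32, types.int32))
print("llvm.dbg.value" in llvm_ir)  # expected: True
```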
numba_cuda/numba/cuda/stubs.py
CHANGED
@@ -185,17 +185,6 @@ class syncwarp(Stub):
     _description_ = "<warp_sync()>"
 
 
-class shfl_sync_intrinsic(Stub):
-    """
-    shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp)
-
-    Nvvm intrinsic for shuffling data across a warp
-    docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove
-    """
-
-    _description_ = "<shfl_sync()>"
-
-
 class vote_sync_intrinsic(Stub):
     """
     vote_sync_intrinsic(mask, mode, predictate)
numba_cuda/numba/cuda/target.py
CHANGED
@@ -59,6 +59,34 @@ class CUDATypingContext(typing.BaseContext):
         # continue with parent logic
         return super(CUDATypingContext, self).resolve_value_type(val)
 
+    def can_convert(self, fromty, toty):
+        """
+        Check whether conversion is possible from *fromty* to *toty*.
+        If successful, return a numba.typeconv.Conversion instance;
+        otherwise None is returned.
+        """
+
+        # This implementation works around the issue addressed in Numba PR
+        # #10047, "Fix IntEnumMember.can_convert_to() when no conversions
+        # found", https://github.com/numba/numba/pull/10047.
+        #
+        # This should be gated on the version of Numba that the fix is
+        # incorporated into, and eventually removed when the minimum supported
+        # Numba version includes the fix.
+
+        try:
+            return super().can_convert(fromty, toty)
+        except TypeError:
+            if isinstance(fromty, types.IntEnumMember):
+                # IntEnumMember fails to correctly handle impossible
+                # conversions - in this scenario the correct thing to do is to
+                # return None to signal that the conversion was not possible
+                return None
+            else:
+                # Any failure involving conversion from a non-IntEnumMember is
+                # almost certainly a real and separate issue
+                raise
+
 
 # -----------------------------------------------------------------------------
 # Implementation
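The behaviour this override restores can be checked directly against the typing context, as a sketch mirroring the test added to test_enums.py at the end of this diff:

```python
from numba import cuda, types
from numba.tests.enum_usecases import Shape

# An impossible conversion (int enum member -> 1D array) should now
# return None rather than propagating a TypeError out of IntEnumMember.
ctx = cuda.descriptor.cuda_target.typing_context
int_enum_type = types.IntEnumMember(Shape, types.int64)
print(ctx.can_convert(int_enum_type, types.int64[::1]))  # expected: None
```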
@@ -20,11 +20,13 @@ if not config.ENABLE_CUDASIM:
     from numba import int32
     from numba.core.extending import (
         models,
-        register_model,
-        make_attribute_wrapper,
         typeof_impl,
         type_callable,
    )
+    from numba.cuda.extending import (
+        register_model,
+        make_attribute_wrapper,
+    )
     from numba.cuda.cudaimpl import lower
     from numba.core import cgutils
 
numba_cuda/numba/cuda/tests/cudapy/test_array_args.py
CHANGED
@@ -7,7 +7,7 @@ from numba.cuda.testing import unittest, CUDATestCase
 
 class TestCudaArrayArg(CUDATestCase):
     def test_array_ary(self):
-        @cuda.jit("double(double[:],int64)", device=True, inline=True)
+        @cuda.jit("double(double[:],int64)", device=True, inline="always")
         def device_function(a, c):
             return a[c]
 
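This one-line test change tracks the decorator update in this release (decorators.py, +27 -1): the string form of `inline` replaces the former boolean. A sketch of the updated usage (signature copied from the test above):

```python
from numba import cuda

# inline="always" replaces the former inline=True for device functions.
@cuda.jit("double(double[:],int64)", device=True, inline="always")
def device_function(a, c):
    return a[c]
```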
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py
ADDED
@@ -0,0 +1,257 @@
+import numba.cuda as cuda
+from numba.cuda.testing import unittest, CUDATestCase
+import numpy as np
+
+from numba import int16, int32, int64, uint16, uint32, uint64, float32, float64
+from numba.types import float16
+
+from numba.cuda.cuda_bf16 import (
+    nv_bfloat16,
+    htrunc,
+    hceil,
+    hfloor,
+    hrint,
+    hsqrt,
+    hrsqrt,
+    hrcp,
+    hlog,
+    hlog2,
+    hlog10,
+    hcos,
+    hsin,
+    hexp,
+    hexp2,
+    hexp10,
+)
+
+from numba.cuda.cudadrv.runtime import get_version
+
+cuda_version = get_version()
+
+dtypes = [int16, int32, int64, uint16, uint32, uint64, float32]
+
+
+@unittest.skipIf(
+    (cuda.get_current_device().compute_capability < (8, 0)),
+    "bfloat16 requires compute capability 8.0+",
+)
+class Bfloat16Test(CUDATestCase):
+    def test_ctor(self):
+        @cuda.jit
+        def simple_kernel():
+            a = nv_bfloat16(float64(1.0))  # noqa: F841
+            b = nv_bfloat16(float32(2.0))  # noqa: F841
+            c = nv_bfloat16(int16(3))  # noqa: F841
+            d = nv_bfloat16(int32(4))  # noqa: F841
+            e = nv_bfloat16(int64(5))  # noqa: F841
+            f = nv_bfloat16(uint16(6))  # noqa: F841
+            g = nv_bfloat16(uint32(7))  # noqa: F841
+            h = nv_bfloat16(uint64(8))  # noqa: F841
+
+        simple_kernel[1, 1]()
+
+        if cuda_version >= (12, 0):
+
+            @cuda.jit
+            def simple_kernel_fp16():
+                i = nv_bfloat16(float16(9))  # noqa: F841
+
+            simple_kernel_fp16[1, 1]()
+
+    def test_casts(self):
+        @cuda.jit
+        def simple_kernel(b, c, d, e, f, g, h):
+            a = nv_bfloat16(3.14)
+
+            b[0] = float32(a)
+            c[0] = int16(a)
+            d[0] = int32(a)
+            e[0] = int64(a)
+            f[0] = uint16(a)
+            g[0] = uint32(a)
+            h[0] = uint64(a)
+
+        b = np.zeros(1, dtype=np.float32)
+        c = np.zeros(1, dtype=np.int16)
+        d = np.zeros(1, dtype=np.int32)
+        e = np.zeros(1, dtype=np.int64)
+        f = np.zeros(1, dtype=np.uint16)
+        g = np.zeros(1, dtype=np.uint32)
+        h = np.zeros(1, dtype=np.uint64)
+
+        simple_kernel[1, 1](b, c, d, e, f, g, h)
+
+        np.testing.assert_allclose(b[0], 3.14, atol=1e-2)
+        assert c[0] == 3
+        assert d[0] == 3
+        assert e[0] == 3
+        assert f[0] == 3
+        assert g[0] == 3
+        assert h[0] == 3
+
+    def test_ctor_cast_loop(self):
+        for dtype in dtypes:
+            with self.subTest(dtype=dtype):
+
+                @cuda.jit
+                def simple_kernel(a):
+                    a[0] = dtype(nv_bfloat16(dtype(3.14)))
+
+                a = np.zeros(1, dtype=str(dtype))
+                simple_kernel[1, 1](a)
+
+                if np.dtype(str(dtype)).kind == "f":
+                    np.testing.assert_allclose(a[0], 3.14, atol=1e-2)
+                else:
+                    assert a[0] == 3
+
+    def test_arithmetic(self):
+        @cuda.jit
+        def simple_kernel(arith, logic):
+            # Binary Arithmetic Operators
+            a = nv_bfloat16(1.0)
+            b = nv_bfloat16(2.0)
+
+            arith[0] = float32(a + b)
+            arith[1] = float32(a - b)
+            arith[2] = float32(a * b)
+            arith[3] = float32(a / b)
+
+            # Arithmetic Assignment Operators
+            a = nv_bfloat16(1.0)
+            b = nv_bfloat16(2.0)
+
+            a += b
+            arith[4] = float32(a)
+            a -= b
+            arith[5] = float32(a)
+            a *= b
+            arith[6] = float32(a)
+            a /= b
+            arith[7] = float32(a)
+
+            # Unary Arithmetic Operators
+            a = nv_bfloat16(1.0)
+
+            arith[8] = float32(+a)
+            arith[9] = float32(-a)
+
+            # Comparison Operators
+            a = nv_bfloat16(1.0)
+            b = nv_bfloat16(2.0)
+
+            logic[0] = a == b
+            logic[1] = a != b
+            logic[2] = a > b
+            logic[3] = a < b
+            logic[4] = a >= b
+            logic[5] = a <= b
+
+        arith = np.zeros(10, dtype=np.float32)
+        logic = np.zeros(6, dtype=np.bool_)
+
+        simple_kernel[1, 1](arith, logic)
+
+        a = 1.0
+        b = 2.0
+        np.testing.assert_allclose(
+            arith,
+            [
+                a + b,
+                a - b,
+                a * b,
+                a / b,
+                a + b,
+                a + b - b,
+                (a + b - b) * b,
+                (a + b - b) * b / b,
+                +a,
+                -a,
+            ],
+            atol=1e-2,
+        )
+        np.testing.assert_equal(
+            logic, [a == b, a != b, a > b, a < b, a >= b, a <= b]
+        )
+
+    def test_math_func(self):
+        @cuda.jit
+        def simple_kernel(a):
+            x = nv_bfloat16(3.14)
+
+            a[0] = float32(htrunc(x))
+            a[1] = float32(hceil(x))
+            a[2] = float32(hfloor(x))
+            a[3] = float32(hrint(x))
+            a[4] = float32(hsqrt(x))
+            a[5] = float32(hrsqrt(x))
+            a[6] = float32(hrcp(x))
+            a[7] = float32(hlog(x))
+            a[8] = float32(hlog2(x))
+            a[9] = float32(hlog10(x))
+            a[10] = float32(hcos(x))
+            a[11] = float32(hsin(x))
+            a[12] = float32(hexp(x))
+            a[13] = float32(hexp2(x))
+            a[14] = float32(hexp10(x))
+
+        a = np.zeros(15, dtype=np.float32)
+        simple_kernel[1, 1](a)
+
+        x = 3.14
+        np.testing.assert_allclose(
+            a[:12],
+            [
+                np.trunc(x),
+                np.ceil(x),
+                np.floor(x),
+                np.rint(x),
+                np.sqrt(x),
+                1 / np.sqrt(x),
+                1 / x,
+                np.log(x),
+                np.log2(x),
+                np.log10(x),
+                np.cos(x),
+                np.sin(x),
+            ],
+            atol=1e-2,
+        )
+
+        np.testing.assert_allclose(
+            a[12:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
+        )
+
+    def test_check_bfloat16_type(self):
+        @cuda.jit
+        def kernel(arr):
+            x = nv_bfloat16(3.14)
+            if isinstance(x, nv_bfloat16):
+                arr[0] = float32(x)
+            else:
+                arr[0] = float32(0.0)
+
+        arr = np.zeros(1, np.float32)
+        kernel[1, 1](arr)
+
+        np.testing.assert_allclose(arr, [3.14], atol=1e-2)
+
+    def test_use_within_device_func(self):
+        @cuda.jit(device=True)
+        def add_bf16(a, b):
+            return a + b
+
+        @cuda.jit
+        def kernel(arr):
+            a = nv_bfloat16(3.14)
+            b = nv_bfloat16(5)
+            arr[0] = float32(hfloor(add_bf16(a, b)))
+
+        arr = np.zeros(1, np.float32)
+        kernel[1, 1](arr)
+
+        np.testing.assert_allclose(arr, [8], atol=1e-2)
+
+
+if __name__ == "__main__":
+    unittest.main()
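A condensed standalone sketch of the new bfloat16 bindings exercised by this test file (requires compute capability 8.0+; the kernel and array names are illustrative):

```python
import numpy as np
from numba import cuda, float32
from numba.cuda.cuda_bf16 import nv_bfloat16, hsqrt

@cuda.jit
def bf16_sqrt(out):
    # Construct a bfloat16 on device, apply a bound header math function,
    # and widen back to float32 for storage.
    x = nv_bfloat16(2.0)
    out[0] = float32(hsqrt(x))

out = np.zeros(1, dtype=np.float32)
bf16_sqrt[1, 1](out)
print(out[0])  # approximately 1.414, to bfloat16 precision
```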
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
CHANGED
@@ -310,6 +310,52 @@ class TestCudaDebugInfo(CUDATestCase):
         with captured_stdout():
             self._test_kernel_args_types()
 
+    def test_llvm_dbg_value(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            z = x  # noqa: F841
+            z = 100  # noqa: F841
+            z = y  # noqa: F841
+            z = True  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        # Verify the call to llvm.dbg.declare is replaced by llvm.dbg.value
+        pat1 = r'call void @"llvm.dbg.declare"'
+        match = re.compile(pat1).search(llvm_ir)
+        self.assertIsNone(match, msg=llvm_ir)
+        pat2 = r'call void @"llvm.dbg.value"'
+        match = re.compile(pat2).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+
+    def test_no_user_var_alias(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            z = x  # noqa: F841
+            z = y  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        pat = r'!DILocalVariable.*name:\s+"z\$1".*'
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNone(match, msg=llvm_ir)
+
+    def test_no_literal_type(self):
+        sig = (types.int32,)
+
+        @cuda.jit("void(int32)", debug=True, opt=False)
+        def f(x):
+            z = x  # noqa: F841
+            z = 100  # noqa: F841
+            z = True  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        pat = r'!DIBasicType.*name:\s+"Literal.*'
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNone(match, msg=llvm_ir)
+
 
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_enums.py
CHANGED
@@ -6,6 +6,7 @@ import numpy as np
 
 from numba import int16, int32
 from numba import cuda, vectorize, njit
+from numba.core import types
 from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
 from numba.tests.enum_usecases import (
     Color,
@@ -115,6 +116,23 @@ class EnumTest(CUDATestCase):
         got = cuda_func(arr)
         self.assertPreciseEqual(expected, got)
 
+    def test_int_enum_no_conversion(self):
+        # Ported from Numba PR #10047: "Fix IntEnumMember.can_convert_to() when
+        # no conversions found", https://github.com/numba/numba/pull/10047.
+
+        # The original test is intended to ensures that
+        # IntEnumMember.can_convert_to() handles the case when the typing
+        # context's can_convert() method returns None to signal no possible
+        # conversion. In Numba-CUDA, we had to patch the CUDA target context to
+        # work around this issue, because we cannot guarantee that the
+        # IntEnumMember method can be patched before instances are created.
+        ctx = cuda.descriptor.cuda_target.typing_context
+
+        int_enum_type = types.IntEnumMember(Shape, types.int64)
+        # Conversion of an int enum member to a 1D array would be invalid
+        invalid_toty = types.int64[::1]
+        self.assertIsNone(ctx.can_convert(int_enum_type, invalid_toty))
+
 
 if __name__ == "__main__":
     unittest.main()