numba_cuda-0.9.0-py3-none-any.whl → numba_cuda-0.10.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (43)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/compiler.py +35 -3
  3. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  4. numba_cuda/numba/cuda/cuda_paths.py +2 -0
  5. numba_cuda/numba/cuda/cudadecl.py +0 -42
  6. numba_cuda/numba/cuda/cudadrv/linkable_code.py +11 -2
  7. numba_cuda/numba/cuda/cudadrv/nvrtc.py +10 -3
  8. numba_cuda/numba/cuda/cudaimpl.py +0 -63
  9. numba_cuda/numba/cuda/debuginfo.py +92 -2
  10. numba_cuda/numba/cuda/decorators.py +27 -1
  11. numba_cuda/numba/cuda/device_init.py +4 -5
  12. numba_cuda/numba/cuda/dispatcher.py +4 -3
  13. numba_cuda/numba/cuda/extending.py +54 -0
  14. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  15. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  16. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +550 -387
  17. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +465 -316
  18. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  19. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  20. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  21. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  22. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  23. numba_cuda/numba/cuda/intrinsics.py +172 -1
  24. numba_cuda/numba/cuda/lowering.py +43 -0
  25. numba_cuda/numba/cuda/stubs.py +0 -11
  26. numba_cuda/numba/cuda/target.py +28 -0
  27. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +4 -2
  28. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +1 -1
  29. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  30. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +1 -1
  31. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +46 -0
  32. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +18 -0
  33. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +4 -2
  34. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +156 -0
  35. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +1 -1
  36. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +50 -5
  37. numba_cuda/numba/cuda/vector_types.py +3 -1
  38. numba_cuda/numba/cuda/vectorizers.py +1 -1
  39. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/METADATA +1 -1
  40. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/RECORD +43 -33
  41. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/WHEEL +1 -1
  42. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/licenses/LICENSE +0 -0
  43. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/intrinsic_wrapper.py
@@ -36,42 +36,3 @@ def ballot_sync(mask, predicate):
     and are within the given mask.
     """
     return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
-
-
-@jit(device=True)
-def shfl_sync(mask, value, src_lane):
-    """
-    Shuffles value across the masked warp and returns the value
-    from src_lane. If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_up_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid - delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
-
-
-@jit(device=True)
-def shfl_down_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid + delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_xor_sync(mask, value, lane_mask):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid ^ lane_mask).
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]

numba_cuda/numba/cuda/intrinsics.py
@@ -2,7 +2,7 @@ from llvmlite import ir
 
 from numba import cuda, types
 from numba.core import cgutils
-from numba.core.errors import RequireLiteralValue
+from numba.core.errors import RequireLiteralValue, TypingError
 from numba.core.typing import signature
 from numba.core.extending import overload_attribute, overload_method
 from numba.cuda import nvvmutils

@@ -205,3 +205,174 @@ def syncthreads_or(typingctx, predicate):
 @overload_method(types.Integer, "bit_count", target="cuda")
 def integer_bit_count(i):
     return lambda i: cuda.popc(i)
+
+
+# -------------------------------------------------------------------------------
+# Warp shuffle functions
+#
+# References:
+#
+# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
+# - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-movement
+#
+# Notes:
+#
+# - The public CUDA C/C++ and Numba Python APIs for these intrinsics use
+#   parameter names that differ from the NVVM IR specification. So that we
+#   can correlate the implementation with the documentation, the @intrinsic
+#   API functions map the public API arguments to the NVVM intrinsic
+#   arguments.
+# - The NVVM IR specification requires some of the parameters (e.g. mode) to be
+#   constants. It's therefore essential that we pass in some values to the
+#   shfl_sync_intrinsic function (e.g. the mode and c values).
+# - Normally parameters for intrinsic functions in Numba would be given the
+#   same name as used in the API, and would contain a type. However, because we
+#   have to pass in some values and some types (and there is divergence between
+#   the names in the intrinsic documentation and the public APIs) we instead
+#   follow the convention of naming shfl_sync_intrinsic parameters with a
+#   suffix of _type or _value depending on whether they contain a type or a
+#   value.
+
+
+@intrinsic
+def shfl_sync(typingctx, mask, value, src_lane):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``src_lane``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 0
+    a_type = value
+    b_type = src_lane
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_up_sync(typingctx, mask, value, delta):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid - delta)``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 1
+    a_type = value
+    b_type = delta
+    c_value = 0
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_down_sync(typingctx, mask, value, delta):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid + delta)``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 2
+    a_type = value
+    b_type = delta
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_xor_sync(typingctx, mask, value, lane_mask):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid ^ lane_mask)``.
+    """
+    membermask_type = mask
+    mode_value = 3
+    a_type = value
+    b_type = lane_mask
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+def shfl_sync_intrinsic(
+    typingctx,
+    membermask_type,
+    mode_value,
+    a_type,
+    b_type,
+    c_value,
+):
+    if a_type not in (types.i4, types.i8, types.f4, types.f8):
+        raise TypingError(
+            "shfl_sync only supports 32- and 64-bit ints and floats"
+        )
+
+    def codegen(context, builder, sig, args):
+        """
+        The NVVM shfl_sync intrinsic only supports i32, but the CUDA C/C++
+        intrinsic supports both 32- and 64-bit ints and floats, so for feature
+        parity, i32, i64, f32, and f64 are implemented. Floats are handled by
+        bitcasting the float to an int, then shuffling, then bitcasting
+        back."""
+        membermask, a, b = args
+
+        # Types
+        a_type = sig.args[1]
+        return_type = context.get_value_type(sig.return_type)
+        i32 = ir.IntType(32)
+        i64 = ir.IntType(64)
+
+        if a_type in types.real_domain:
+            a = builder.bitcast(a, ir.IntType(a_type.bitwidth))
+
+        # NVVM intrinsic definition
+        arg_types = (i32, i32, i32, i32, i32)
+        shfl_return_type = ir.LiteralStructType((i32, ir.IntType(1)))
+        fnty = ir.FunctionType(shfl_return_type, arg_types)
+
+        fname = "llvm.nvvm.shfl.sync.i32"
+        shfl_sync = cgutils.get_or_insert_function(builder.module, fnty, fname)
+
+        # Intrinsic arguments
+        mode = ir.Constant(i32, mode_value)
+        c = ir.Constant(i32, c_value)
+        membermask = builder.trunc(membermask, i32)
+        b = builder.trunc(b, i32)
+
+        if a_type.bitwidth == 32:
+            a = builder.trunc(a, i32)
+            ret = builder.call(shfl_sync, (membermask, mode, a, b, c))
+            d = builder.extract_value(ret, 0)
+        else:
+            # Handle 64-bit values by shuffling as two 32-bit values and
+            # packing the result into 64 bits.
+
+            # Extract high and low parts
+            lo = builder.trunc(a, i32)
+            a_lshr = builder.lshr(a, ir.Constant(i64, 32))
+            hi = builder.trunc(a_lshr, i32)
+
+            # Shuffle individual parts
+            ret_lo = builder.call(shfl_sync, (membermask, mode, lo, b, c))
+            ret_hi = builder.call(shfl_sync, (membermask, mode, hi, b, c))
+
+            # Combine individual result parts into a 64-bit result
+            d_lo = builder.extract_value(ret_lo, 0)
+            d_hi = builder.extract_value(ret_hi, 0)
+            d_lo_64 = builder.zext(d_lo, i64)
+            d_hi_64 = builder.zext(d_hi, i64)
+            d_shl = builder.shl(d_hi_64, ir.Constant(i64, 32))
+            d = builder.or_(d_shl, d_lo_64)
+
+        return builder.bitcast(d, return_type)
+
+    sig = signature(a_type, membermask_type, a_type, b_type)
+
+    return sig, codegen
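
The reworked intrinsics above keep the public shuffle call signatures unchanged; only the lowering moves out of the old stub/typing/impl split (note the cudadecl.py, cudaimpl.py and stubs.py removals in the file list) into intrinsics.py. As a quick orientation, a kernel exercising them looks like the following sketch (not part of the diff; assumes a single full warp and mask 0xFFFFFFFF):

import numpy as np
from numba import cuda


@cuda.jit
def warp_sum(out, values):
    # Butterfly reduction: after five shfl_xor_sync rounds, every lane
    # of the warp holds the sum of all 32 input values.
    v = values[cuda.laneid]
    offset = 16
    while offset > 0:
        v += cuda.shfl_xor_sync(0xFFFFFFFF, v, offset)
        offset //= 2
    out[cuda.laneid] = v


values = np.arange(32, dtype=np.int32)
out = np.zeros(32, dtype=np.int32)
warp_sum[1, 32](out, values)
assert out[0] == values.sum()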

numba_cuda/numba/cuda/lowering.py
@@ -0,0 +1,43 @@
+from numba.core.lowering import Lower
+from llvmlite import ir
+
+
+class CUDALower(Lower):
+    def storevar(self, value, name, argidx=None):
+        """
+        Store the value into the given variable.
+        """
+        super().storevar(value, name, argidx)
+
+        # Emit llvm.dbg.value instead of llvm.dbg.declare for local scalar
+        # variables immediately after a store instruction.
+        if (
+            self.context.enable_debuginfo
+            # Conditions used to elide stores in parent method
+            and (
+                name not in self._singly_assigned_vars
+                or self._disable_sroa_like_opt
+            )
+            # No emission of debuginfo for internal names
+            and not name.startswith("$")
+        ):
+            # Emit debug value for user variable
+            fetype = self.typeof(name)
+            lltype = self.context.get_value_type(fetype)
+            int_type = (ir.IntType,)
+            real_type = ir.FloatType, ir.DoubleType
+            if isinstance(lltype, int_type + real_type):
+                # Emit debug value for scalar variable
+                sizeof = self.context.get_abi_sizeof(lltype)
+                datamodel = self.context.data_model_manager[fetype]
+                line = self.loc.line if argidx is None else self.defn_loc.line
+                self.debuginfo.update_variable(
+                    self.builder,
+                    value,
+                    name,
+                    lltype,
+                    sizeof,
+                    line,
+                    datamodel,
+                    argidx,
+                )
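
CUDALower.storevar only fires for user variables whose stores survive the elision conditions of the parent method: names assigned more than once (or compiled with the SROA-like optimization disabled) that hold scalar integer or floating-point LLVM values. A minimal way to observe the effect (a sketch, not part of the diff; the test_debuginfo.py additions below check the same thing more thoroughly):

from numba import cuda, types


@cuda.jit("void(int32)", debug=True, opt=False)
def f(x):
    z = x      # z is assigned twice, so its stores are not elided
    z = x + 1  # and each store is annotated for the debugger


# With this change, the generated IR uses llvm.dbg.value rather than
# llvm.dbg.declare for the re-assigned scalar z.
print("llvm.dbg.value" in f.inspect_llvm((types.int32,)))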

numba_cuda/numba/cuda/stubs.py
@@ -185,17 +185,6 @@ class syncwarp(Stub):
     _description_ = "<warp_sync()>"
 
 
-class shfl_sync_intrinsic(Stub):
-    """
-    shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp)
-
-    Nvvm intrinsic for shuffling data across a warp
-    docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove
-    """
-
-    _description_ = "<shfl_sync()>"
-
-
 class vote_sync_intrinsic(Stub):
     """
     vote_sync_intrinsic(mask, mode, predictate)

numba_cuda/numba/cuda/target.py
@@ -59,6 +59,34 @@ class CUDATypingContext(typing.BaseContext):
         # continue with parent logic
         return super(CUDATypingContext, self).resolve_value_type(val)
 
+    def can_convert(self, fromty, toty):
+        """
+        Check whether conversion is possible from *fromty* to *toty*.
+        If successful, return a numba.typeconv.Conversion instance;
+        otherwise None is returned.
+        """
+
+        # This implementation works around the issue addressed in Numba PR
+        # #10047, "Fix IntEnumMember.can_convert_to() when no conversions
+        # found", https://github.com/numba/numba/pull/10047.
+        #
+        # This should be gated on the version of Numba that the fix is
+        # incorporated into, and eventually removed when the minimum supported
+        # Numba version includes the fix.
+
+        try:
+            return super().can_convert(fromty, toty)
+        except TypeError:
+            if isinstance(fromty, types.IntEnumMember):
+                # IntEnumMember fails to correctly handle impossible
+                # conversions - in this scenario the correct thing to do is to
+                # return None to signal that the conversion was not possible
+                return None
+            else:
+                # Any failure involving conversion from a non-IntEnumMember is
+                # almost certainly a real and separate issue
+                raise
+
 
 # -----------------------------------------------------------------------------
 # Implementation
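
With this override in place, an impossible conversion involving an IntEnumMember reports None like any other impossible conversion instead of propagating a TypeError. A sketch of the observable contract (the Shape enum here is a hypothetical stand-in for numba.tests.enum_usecases.Shape; the test added to test_enums.py below performs the same check):

from enum import IntEnum

from numba import cuda, types


class Shape(IntEnum):  # hypothetical stand-in
    circle = 1
    square = 2


ctx = cuda.descriptor.cuda_target.typing_context
fromty = types.IntEnumMember(Shape, types.int64)

# Enum member to 1D array: no conversion exists, so None is returned
assert ctx.can_convert(fromty, types.int64[::1]) is None
# Enum member to its underlying int64 is still found
assert ctx.can_convert(fromty, types.int64) is not None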

numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py
@@ -20,11 +20,13 @@ if not config.ENABLE_CUDASIM:
     from numba import int32
     from numba.core.extending import (
         models,
-        register_model,
-        make_attribute_wrapper,
        typeof_impl,
        type_callable,
    )
+    from numba.cuda.extending import (
+        register_model,
+        make_attribute_wrapper,
+    )
     from numba.cuda.cudaimpl import lower
     from numba.core import cgutils
 

numba_cuda/numba/cuda/tests/cudapy/test_array_args.py
@@ -7,7 +7,7 @@ from numba.cuda.testing import unittest, CUDATestCase
 
 class TestCudaArrayArg(CUDATestCase):
     def test_array_ary(self):
-        @cuda.jit("double(double[:],int64)", device=True, inline=True)
+        @cuda.jit("double(double[:],int64)", device=True, inline="always")
         def device_function(a, c):
             return a[c]
 
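This and the test_blackscholes.py change below replace the boolean inline=True with the string form, matching the updated decorators.py (+27 -1 in the file list), which per these test updates now accepts string values for the inline option. A sketch of the new spelling (hypothetical device function, not from the diff):

from numba import cuda


# inline="always" requests inlining of the device function during Numba
# lowering; the boolean form used by the 0.9.x tests is replaced.
@cuda.jit(device=True, inline="always")
def clamp(x, lo, hi):
    return max(lo, min(x, hi))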

numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py
@@ -0,0 +1,257 @@
+import numba.cuda as cuda
+from numba.cuda.testing import unittest, CUDATestCase
+import numpy as np
+
+from numba import int16, int32, int64, uint16, uint32, uint64, float32, float64
+from numba.types import float16
+
+from numba.cuda.cuda_bf16 import (
+    nv_bfloat16,
+    htrunc,
+    hceil,
+    hfloor,
+    hrint,
+    hsqrt,
+    hrsqrt,
+    hrcp,
+    hlog,
+    hlog2,
+    hlog10,
+    hcos,
+    hsin,
+    hexp,
+    hexp2,
+    hexp10,
+)
+
+from numba.cuda.cudadrv.runtime import get_version
+
+cuda_version = get_version()
+
+dtypes = [int16, int32, int64, uint16, uint32, uint64, float32]
+
+
+@unittest.skipIf(
+    (cuda.get_current_device().compute_capability < (8, 0)),
+    "bfloat16 requires compute capability 8.0+",
+)
+class Bfloat16Test(CUDATestCase):
+    def test_ctor(self):
+        @cuda.jit
+        def simple_kernel():
+            a = nv_bfloat16(float64(1.0))  # noqa: F841
+            b = nv_bfloat16(float32(2.0))  # noqa: F841
+            c = nv_bfloat16(int16(3))  # noqa: F841
+            d = nv_bfloat16(int32(4))  # noqa: F841
+            e = nv_bfloat16(int64(5))  # noqa: F841
+            f = nv_bfloat16(uint16(6))  # noqa: F841
+            g = nv_bfloat16(uint32(7))  # noqa: F841
+            h = nv_bfloat16(uint64(8))  # noqa: F841
+
+        simple_kernel[1, 1]()
+
+        if cuda_version >= (12, 0):
+
+            @cuda.jit
+            def simple_kernel_fp16():
+                i = nv_bfloat16(float16(9))  # noqa: F841
+
+            simple_kernel_fp16[1, 1]()
+
+    def test_casts(self):
+        @cuda.jit
+        def simple_kernel(b, c, d, e, f, g, h):
+            a = nv_bfloat16(3.14)
+
+            b[0] = float32(a)
+            c[0] = int16(a)
+            d[0] = int32(a)
+            e[0] = int64(a)
+            f[0] = uint16(a)
+            g[0] = uint32(a)
+            h[0] = uint64(a)
+
+        b = np.zeros(1, dtype=np.float32)
+        c = np.zeros(1, dtype=np.int16)
+        d = np.zeros(1, dtype=np.int32)
+        e = np.zeros(1, dtype=np.int64)
+        f = np.zeros(1, dtype=np.uint16)
+        g = np.zeros(1, dtype=np.uint32)
+        h = np.zeros(1, dtype=np.uint64)
+
+        simple_kernel[1, 1](b, c, d, e, f, g, h)
+
+        np.testing.assert_allclose(b[0], 3.14, atol=1e-2)
+        assert c[0] == 3
+        assert d[0] == 3
+        assert e[0] == 3
+        assert f[0] == 3
+        assert g[0] == 3
+        assert h[0] == 3
+
+    def test_ctor_cast_loop(self):
+        for dtype in dtypes:
+            with self.subTest(dtype=dtype):
+
+                @cuda.jit
+                def simple_kernel(a):
+                    a[0] = dtype(nv_bfloat16(dtype(3.14)))
+
+                a = np.zeros(1, dtype=str(dtype))
+                simple_kernel[1, 1](a)
+
+                if np.dtype(str(dtype)).kind == "f":
+                    np.testing.assert_allclose(a[0], 3.14, atol=1e-2)
+                else:
+                    assert a[0] == 3
+
+    def test_arithmetic(self):
+        @cuda.jit
+        def simple_kernel(arith, logic):
+            # Binary Arithmetic Operators
+            a = nv_bfloat16(1.0)
+            b = nv_bfloat16(2.0)
+
+            arith[0] = float32(a + b)
+            arith[1] = float32(a - b)
+            arith[2] = float32(a * b)
+            arith[3] = float32(a / b)
+
+            # Arithmetic Assignment Operators
+            a = nv_bfloat16(1.0)
+            b = nv_bfloat16(2.0)
+
+            a += b
+            arith[4] = float32(a)
+            a -= b
+            arith[5] = float32(a)
+            a *= b
+            arith[6] = float32(a)
+            a /= b
+            arith[7] = float32(a)
+
+            # Unary Arithmetic Operators
+            a = nv_bfloat16(1.0)
+
+            arith[8] = float32(+a)
+            arith[9] = float32(-a)
+
+            # Comparison Operators
+            a = nv_bfloat16(1.0)
+            b = nv_bfloat16(2.0)
+
+            logic[0] = a == b
+            logic[1] = a != b
+            logic[2] = a > b
+            logic[3] = a < b
+            logic[4] = a >= b
+            logic[5] = a <= b
+
+        arith = np.zeros(10, dtype=np.float32)
+        logic = np.zeros(6, dtype=np.bool_)
+
+        simple_kernel[1, 1](arith, logic)
+
+        a = 1.0
+        b = 2.0
+        np.testing.assert_allclose(
+            arith,
+            [
+                a + b,
+                a - b,
+                a * b,
+                a / b,
+                a + b,
+                a + b - b,
+                (a + b - b) * b,
+                (a + b - b) * b / b,
+                +a,
+                -a,
+            ],
+            atol=1e-2,
+        )
+        np.testing.assert_equal(
+            logic, [a == b, a != b, a > b, a < b, a >= b, a <= b]
+        )
+
+    def test_math_func(self):
+        @cuda.jit
+        def simple_kernel(a):
+            x = nv_bfloat16(3.14)
+
+            a[0] = float32(htrunc(x))
+            a[1] = float32(hceil(x))
+            a[2] = float32(hfloor(x))
+            a[3] = float32(hrint(x))
+            a[4] = float32(hsqrt(x))
+            a[5] = float32(hrsqrt(x))
+            a[6] = float32(hrcp(x))
+            a[7] = float32(hlog(x))
+            a[8] = float32(hlog2(x))
+            a[9] = float32(hlog10(x))
+            a[10] = float32(hcos(x))
+            a[11] = float32(hsin(x))
+            a[12] = float32(hexp(x))
+            a[13] = float32(hexp2(x))
+            a[14] = float32(hexp10(x))
+
+        a = np.zeros(15, dtype=np.float32)
+        simple_kernel[1, 1](a)
+
+        x = 3.14
+        np.testing.assert_allclose(
+            a[:12],
+            [
+                np.trunc(x),
+                np.ceil(x),
+                np.floor(x),
+                np.rint(x),
+                np.sqrt(x),
+                1 / np.sqrt(x),
+                1 / x,
+                np.log(x),
+                np.log2(x),
+                np.log10(x),
+                np.cos(x),
+                np.sin(x),
+            ],
+            atol=1e-2,
+        )
+
+        np.testing.assert_allclose(
+            a[12:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
+        )
+
+    def test_check_bfloat16_type(self):
+        @cuda.jit
+        def kernel(arr):
+            x = nv_bfloat16(3.14)
+            if isinstance(x, nv_bfloat16):
+                arr[0] = float32(x)
+            else:
+                arr[0] = float32(0.0)
+
+        arr = np.zeros(1, np.float32)
+        kernel[1, 1](arr)
+
+        np.testing.assert_allclose(arr, [3.14], atol=1e-2)
+
+    def test_use_within_device_func(self):
+        @cuda.jit(device=True)
+        def add_bf16(a, b):
+            return a + b
+
+        @cuda.jit
+        def kernel(arr):
+            a = nv_bfloat16(3.14)
+            b = nv_bfloat16(5)
+            arr[0] = float32(hfloor(add_bf16(a, b)))
+
+        arr = np.zeros(1, np.float32)
+        kernel[1, 1](arr)
+
+        np.testing.assert_allclose(arr, [8], atol=1e-2)
+
+
+if __name__ == "__main__":
+    unittest.main()

numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py
@@ -81,7 +81,7 @@ class TestBlackScholes(CUDATestCase):
             VOLATILITY,
         )
 
-        @cuda.jit(double(double), device=True, inline=True)
+        @cuda.jit(double(double), device=True, inline="always")
         def cnd_cuda(d):
            K = 1.0 / (1.0 + 0.2316419 * math.fabs(d))
            ret_val = (

numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
@@ -310,6 +310,52 @@ class TestCudaDebugInfo(CUDATestCase):
         with captured_stdout():
             self._test_kernel_args_types()
 
+    def test_llvm_dbg_value(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            z = x  # noqa: F841
+            z = 100  # noqa: F841
+            z = y  # noqa: F841
+            z = True  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        # Verify the call to llvm.dbg.declare is replaced by llvm.dbg.value
+        pat1 = r'call void @"llvm.dbg.declare"'
+        match = re.compile(pat1).search(llvm_ir)
+        self.assertIsNone(match, msg=llvm_ir)
+        pat2 = r'call void @"llvm.dbg.value"'
+        match = re.compile(pat2).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+
+    def test_no_user_var_alias(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            z = x  # noqa: F841
+            z = y  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        pat = r'!DILocalVariable.*name:\s+"z\$1".*'
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNone(match, msg=llvm_ir)
+
+    def test_no_literal_type(self):
+        sig = (types.int32,)
+
+        @cuda.jit("void(int32)", debug=True, opt=False)
+        def f(x):
+            z = x  # noqa: F841
+            z = 100  # noqa: F841
+            z = True  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        pat = r'!DIBasicType.*name:\s+"Literal.*'
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNone(match, msg=llvm_ir)
+
 
 if __name__ == "__main__":
     unittest.main()

numba_cuda/numba/cuda/tests/cudapy/test_enums.py
@@ -6,6 +6,7 @@ import numpy as np
 
 from numba import int16, int32
 from numba import cuda, vectorize, njit
+from numba.core import types
 from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
 from numba.tests.enum_usecases import (
     Color,

@@ -115,6 +116,23 @@ class EnumTest(CUDATestCase):
         got = cuda_func(arr)
         self.assertPreciseEqual(expected, got)
 
+    def test_int_enum_no_conversion(self):
+        # Ported from Numba PR #10047: "Fix IntEnumMember.can_convert_to() when
+        # no conversions found", https://github.com/numba/numba/pull/10047.
+
+        # The original test is intended to ensure that
+        # IntEnumMember.can_convert_to() handles the case when the typing
+        # context's can_convert() method returns None to signal no possible
+        # conversion. In Numba-CUDA, we had to patch the CUDA target context to
+        # work around this issue, because we cannot guarantee that the
+        # IntEnumMember method can be patched before instances are created.
+        ctx = cuda.descriptor.cuda_target.typing_context
+
+        int_enum_type = types.IntEnumMember(Shape, types.int64)
+        # Conversion of an int enum member to a 1D array would be invalid
+        invalid_toty = types.int64[::1]
+        self.assertIsNone(ctx.can_convert(int_enum_type, invalid_toty))
+
 
 if __name__ == "__main__":
     unittest.main()