numba-cuda 0.19.1__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of numba-cuda might be problematic. Click here for more details.

Files changed (171)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +1 -1
  3. numba_cuda/numba/cuda/_internal/cuda_bf16.py +12706 -1470
  4. numba_cuda/numba/cuda/_internal/cuda_fp16.py +2653 -8769
  5. numba_cuda/numba/cuda/api.py +6 -1
  6. numba_cuda/numba/cuda/bf16.py +285 -2
  7. numba_cuda/numba/cuda/cgutils.py +2 -2
  8. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  9. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  10. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  11. numba_cuda/numba/cuda/codegen.py +1 -1
  12. numba_cuda/numba/cuda/compiler.py +373 -30
  13. numba_cuda/numba/cuda/core/analysis.py +319 -0
  14. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  15. numba_cuda/numba/cuda/core/annotations/type_annotations.py +304 -0
  16. numba_cuda/numba/cuda/core/base.py +1289 -0
  17. numba_cuda/numba/cuda/core/bytecode.py +727 -0
  18. numba_cuda/numba/cuda/core/caching.py +2 -2
  19. numba_cuda/numba/cuda/core/compiler.py +6 -14
  20. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  21. numba_cuda/numba/cuda/core/config.py +747 -0
  22. numba_cuda/numba/cuda/core/consts.py +124 -0
  23. numba_cuda/numba/cuda/core/cpu.py +370 -0
  24. numba_cuda/numba/cuda/core/environment.py +68 -0
  25. numba_cuda/numba/cuda/core/event.py +511 -0
  26. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  27. numba_cuda/numba/cuda/core/inline_closurecall.py +1889 -0
  28. numba_cuda/numba/cuda/core/interpreter.py +48 -26
  29. numba_cuda/numba/cuda/core/ir_utils.py +15 -26
  30. numba_cuda/numba/cuda/core/options.py +262 -0
  31. numba_cuda/numba/cuda/core/postproc.py +249 -0
  32. numba_cuda/numba/cuda/core/pythonapi.py +1868 -0
  33. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  34. numba_cuda/numba/cuda/core/rewrites/ir_print.py +90 -0
  35. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  36. numba_cuda/numba/cuda/core/rewrites/static_binop.py +40 -0
  37. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +187 -0
  38. numba_cuda/numba/cuda/core/rewrites/static_raise.py +98 -0
  39. numba_cuda/numba/cuda/core/ssa.py +496 -0
  40. numba_cuda/numba/cuda/core/targetconfig.py +329 -0
  41. numba_cuda/numba/cuda/core/tracing.py +231 -0
  42. numba_cuda/numba/cuda/core/transforms.py +952 -0
  43. numba_cuda/numba/cuda/core/typed_passes.py +738 -7
  44. numba_cuda/numba/cuda/core/typeinfer.py +1948 -0
  45. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  46. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  47. numba_cuda/numba/cuda/core/unsafe/eh.py +66 -0
  48. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  49. numba_cuda/numba/cuda/core/untyped_passes.py +1983 -0
  50. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  51. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  52. numba_cuda/numba/cuda/cpython/numbers.py +1474 -0
  53. numba_cuda/numba/cuda/cuda_paths.py +422 -246
  54. numba_cuda/numba/cuda/cudadecl.py +1 -1
  55. numba_cuda/numba/cuda/cudadrv/__init__.py +1 -1
  56. numba_cuda/numba/cuda/cudadrv/devicearray.py +2 -1
  57. numba_cuda/numba/cuda/cudadrv/driver.py +11 -140
  58. numba_cuda/numba/cuda/cudadrv/dummyarray.py +111 -24
  59. numba_cuda/numba/cuda/cudadrv/libs.py +5 -5
  60. numba_cuda/numba/cuda/cudadrv/mappings.py +1 -1
  61. numba_cuda/numba/cuda/cudadrv/nvrtc.py +19 -8
  62. numba_cuda/numba/cuda/cudadrv/nvvm.py +1 -4
  63. numba_cuda/numba/cuda/cudadrv/runtime.py +1 -1
  64. numba_cuda/numba/cuda/cudaimpl.py +5 -1
  65. numba_cuda/numba/cuda/debuginfo.py +85 -2
  66. numba_cuda/numba/cuda/decorators.py +3 -3
  67. numba_cuda/numba/cuda/descriptor.py +3 -4
  68. numba_cuda/numba/cuda/deviceufunc.py +66 -2
  69. numba_cuda/numba/cuda/dispatcher.py +18 -39
  70. numba_cuda/numba/cuda/flags.py +141 -1
  71. numba_cuda/numba/cuda/fp16.py +0 -2
  72. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  73. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  74. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  75. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  76. numba_cuda/numba/cuda/lowering.py +7 -144
  77. numba_cuda/numba/cuda/mathimpl.py +2 -1
  78. numba_cuda/numba/cuda/memory_management/nrt.py +43 -17
  79. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  80. numba_cuda/numba/cuda/models.py +9 -1
  81. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  82. numba_cuda/numba/cuda/np/npyfuncs.py +1807 -0
  83. numba_cuda/numba/cuda/np/numpy_support.py +553 -0
  84. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +59 -0
  85. numba_cuda/numba/cuda/nvvmutils.py +1 -1
  86. numba_cuda/numba/cuda/printimpl.py +12 -1
  87. numba_cuda/numba/cuda/random.py +1 -1
  88. numba_cuda/numba/cuda/serialize.py +1 -1
  89. numba_cuda/numba/cuda/simulator/__init__.py +1 -1
  90. numba_cuda/numba/cuda/simulator/api.py +1 -1
  91. numba_cuda/numba/cuda/simulator/compiler.py +4 -0
  92. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +1 -1
  93. numba_cuda/numba/cuda/simulator/kernelapi.py +1 -1
  94. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +14 -2
  95. numba_cuda/numba/cuda/target.py +35 -17
  96. numba_cuda/numba/cuda/testing.py +4 -19
  97. numba_cuda/numba/cuda/tests/__init__.py +1 -1
  98. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  99. numba_cuda/numba/cuda/tests/core/test_serialize.py +4 -4
  100. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +1 -1
  101. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +1 -1
  102. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +1 -1
  103. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +6 -3
  104. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +1 -1
  105. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +18 -2
  106. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +2 -1
  107. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +1 -1
  108. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +1 -1
  109. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +1 -1
  110. numba_cuda/numba/cuda/tests/cudapy/test_array.py +2 -1
  111. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1 -1
  112. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +539 -2
  113. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +81 -1
  114. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +1 -3
  115. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +1 -1
  116. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +1 -1
  117. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +2 -3
  118. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +130 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +1 -1
  120. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
  121. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +293 -4
  122. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +1 -1
  123. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +1 -1
  124. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +1 -1
  125. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +1 -1
  126. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -1
  127. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +18 -8
  128. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +10 -37
  129. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +1 -1
  130. numba_cuda/numba/cuda/tests/cudapy/test_math.py +1 -1
  131. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -1
  132. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +1 -1
  133. numba_cuda/numba/cuda/tests/cudapy/test_print.py +20 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +1 -1
  135. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +1 -1
  136. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +1 -1
  137. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +1 -1
  138. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +453 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +1 -1
  140. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +263 -2
  142. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +1 -1
  143. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +1 -1
  144. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +112 -6
  145. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +1 -1
  146. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +1 -1
  147. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +0 -2
  148. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +3 -2
  149. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +0 -2
  150. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +0 -2
  151. numba_cuda/numba/cuda/tests/nocuda/test_import.py +3 -1
  152. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +24 -12
  153. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +2 -1
  154. numba_cuda/numba/cuda/tests/support.py +55 -15
  155. numba_cuda/numba/cuda/tests/test_tracing.py +200 -0
  156. numba_cuda/numba/cuda/types.py +56 -0
  157. numba_cuda/numba/cuda/typing/__init__.py +9 -1
  158. numba_cuda/numba/cuda/typing/cffi_utils.py +55 -0
  159. numba_cuda/numba/cuda/typing/context.py +751 -0
  160. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  161. numba_cuda/numba/cuda/typing/npydecl.py +658 -0
  162. numba_cuda/numba/cuda/typing/templates.py +7 -6
  163. numba_cuda/numba/cuda/ufuncs.py +3 -3
  164. numba_cuda/numba/cuda/utils.py +6 -112
  165. {numba_cuda-0.19.1.dist-info → numba_cuda-0.20.0.dist-info}/METADATA +2 -1
  166. {numba_cuda-0.19.1.dist-info → numba_cuda-0.20.0.dist-info}/RECORD +170 -115
  167. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +0 -60
  168. {numba_cuda-0.19.1.dist-info → numba_cuda-0.20.0.dist-info}/WHEEL +0 -0
  169. {numba_cuda-0.19.1.dist-info → numba_cuda-0.20.0.dist-info}/licenses/LICENSE +0 -0
  170. {numba_cuda-0.19.1.dist-info → numba_cuda-0.20.0.dist-info}/licenses/LICENSE.numba +0 -0
  171. {numba_cuda-0.19.1.dist-info → numba_cuda-0.20.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,117 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: BSD-2-Clause
3
3
 
4
- from numba import cuda, float32
5
- from numba.cuda.bf16 import bfloat16
4
+ import numpy as np
5
+ from ml_dtypes import bfloat16 as mldtypes_bf16
6
+
7
+ from numba import (
8
+ cuda,
9
+ float32,
10
+ float64,
11
+ int16,
12
+ int32,
13
+ int64,
14
+ uint16,
15
+ uint32,
16
+ uint64,
17
+ )
18
+ from numba.cuda import config
19
+
20
+ if not config.ENABLE_CUDASIM:
21
+ from numba.cuda.bf16 import (
22
+ bfloat16,
23
+ habs,
24
+ hadd,
25
+ hsub,
26
+ hmul,
27
+ hadd_rn,
28
+ hsub_rn,
29
+ hmul_rn,
30
+ hdiv,
31
+ hadd_sat,
32
+ hsub_sat,
33
+ hmul_sat,
34
+ hfma,
35
+ hfma_sat,
36
+ hneg,
37
+ hfma_relu,
38
+ # Comparison intrinsics
39
+ heq,
40
+ hne,
41
+ hge,
42
+ hgt,
43
+ hle,
44
+ hlt,
45
+ hmax,
46
+ hmin,
47
+ hmax_nan,
48
+ hmin_nan,
49
+ hisnan,
50
+ hisinf,
51
+ # Conversion intrinsics (NumPy-style names)
52
+ bfloat16_to_int8_rz,
53
+ bfloat16_to_uint8_rz,
54
+ int16_to_bfloat16_rn,
55
+ int16_to_bfloat16_rz,
56
+ int16_to_bfloat16_rd,
57
+ int16_to_bfloat16_ru,
58
+ bfloat16_to_int16_rn,
59
+ bfloat16_to_int16_rz,
60
+ bfloat16_to_int16_rd,
61
+ bfloat16_to_int16_ru,
62
+ uint16_to_bfloat16_rn,
63
+ uint16_to_bfloat16_rz,
64
+ uint16_to_bfloat16_rd,
65
+ uint16_to_bfloat16_ru,
66
+ bfloat16_to_uint16_rn,
67
+ bfloat16_to_uint16_rz,
68
+ bfloat16_to_uint16_rd,
69
+ bfloat16_to_uint16_ru,
70
+ int32_to_bfloat16_rn,
71
+ int32_to_bfloat16_rz,
72
+ int32_to_bfloat16_rd,
73
+ int32_to_bfloat16_ru,
74
+ bfloat16_to_int32_rn,
75
+ bfloat16_to_int32_rz,
76
+ bfloat16_to_int32_rd,
77
+ bfloat16_to_int32_ru,
78
+ uint32_to_bfloat16_rn,
79
+ uint32_to_bfloat16_rz,
80
+ uint32_to_bfloat16_rd,
81
+ uint32_to_bfloat16_ru,
82
+ bfloat16_to_uint32_rn,
83
+ bfloat16_to_uint32_rz,
84
+ bfloat16_to_uint32_rd,
85
+ bfloat16_to_uint32_ru,
86
+ bfloat16_to_int64_rn,
87
+ bfloat16_to_int64_rz,
88
+ bfloat16_to_int64_rd,
89
+ bfloat16_to_int64_ru,
90
+ int64_to_bfloat16_rn,
91
+ int64_to_bfloat16_rz,
92
+ int64_to_bfloat16_rd,
93
+ int64_to_bfloat16_ru,
94
+ bfloat16_to_uint64_rn,
95
+ bfloat16_to_uint64_rz,
96
+ bfloat16_to_uint64_rd,
97
+ bfloat16_to_uint64_ru,
98
+ uint64_to_bfloat16_rn,
99
+ uint64_to_bfloat16_rz,
100
+ uint64_to_bfloat16_rd,
101
+ uint64_to_bfloat16_ru,
102
+ bfloat16_as_int16,
103
+ int16_as_bfloat16,
104
+ bfloat16_as_uint16,
105
+ uint16_as_bfloat16,
106
+ bfloat16_to_float32,
107
+ float32_to_bfloat16,
108
+ float64_to_bfloat16,
109
+ float32_to_bfloat16_rn,
110
+ float32_to_bfloat16_rz,
111
+ float32_to_bfloat16_rd,
112
+ float32_to_bfloat16_ru,
113
+ )
114
+
6
115
  from numba.cuda.testing import CUDATestCase
7
116
 
8
117
  import math
@@ -61,3 +170,431 @@ class TestBfloat16HighLevelBindings(CUDATestCase):
61
170
  self.assertAlmostEqual(arr[0], f(3.14), delta=1e-1)
62
171
  else:
63
172
  self.assertAlmostEqual(arr[0], f(3.14), delta=1e-2)
173
+
174
def test_arithmetic_intrinsics_basic(self):
    # Evaluate each basic bfloat16 arithmetic intrinsic in a one-thread
    # kernel, widen the result to float32, and compare it on the host with
    # the same expression computed with Python floats (tolerance 1e-2).
    self.skip_unsupported()

    @cuda.jit
    def kernel(out):
        x = bfloat16(1.25)
        y = bfloat16(-2.5)

        out[0] = float32(habs(y))
        out[1] = float32(hadd(x, y))
        out[2] = float32(hsub(x, y))
        out[3] = float32(hmul(x, y))
        out[4] = float32(hdiv(y, x))
        out[5] = float32(hneg(x))
        out[6] = float32(hfma(x, y, y))

        # The *_rn variants are expected to give the same values as the
        # plain add/sub/mul above for these operands.
        out[7] = float32(hadd_rn(x, y))
        out[8] = float32(hsub_rn(x, y))
        out[9] = float32(hmul_rn(x, y))

    out = cuda.device_array((10,), dtype="float32")
    kernel[1, 1](out)

    lhs, rhs = 1.25, -2.5
    # One reference value per output slot, in slot order.
    references = (
        abs(rhs),
        lhs + rhs,
        lhs - rhs,
        lhs * rhs,
        rhs / lhs,
        -lhs,
        lhs * rhs + rhs,
        lhs + rhs,
        lhs - rhs,
        lhs * rhs,
    )
    for slot, want in enumerate(references):
        self.assertAlmostEqual(out[slot], want, delta=1e-2)
213
+
214
def test_arithmetic_intrinsics_saturating(self):
    # The *_sat intrinsics are expected to clamp their result to [0, 1];
    # each case below lands outside that range before clamping.
    self.skip_unsupported()

    @cuda.jit
    def kernel(out):
        big = bfloat16(1.5)
        small = bfloat16(0.75)

        out[0] = float32(hadd_sat(big, small))  # 2.25 -> 1.0
        out[1] = float32(hsub_sat(small, big))  # -0.75 -> 0.0
        out[2] = float32(hmul_sat(big, small))  # 1.125 -> 1.0
        out[3] = float32(hfma_sat(big, small, big))  # 1.125 + 1.5 -> 1.0

    out = cuda.device_array((4,), dtype="float32")
    kernel[1, 1](out)

    # Exact clamped value expected in each slot.
    for slot, clamped in enumerate((1.0, 0.0, 1.0, 1.0)):
        self.assertAlmostEqual(out[slot], clamped, delta=1e-3)

    # Also check they are clamped within [0, 1]
    for slot in range(4):
        self.assertGreaterEqual(out[slot], 0.0)
        self.assertLessEqual(out[slot], 1.0)
239
+
240
def test_fma_relu_intrinsic(self):
    # hfma_relu on (-1.5, 2.0, 0.0): the fused result is negative, so the
    # value stored by the kernel is expected to be 0.0.
    self.skip_unsupported()

    @cuda.jit
    def kernel(out):
        factor = bfloat16(-1.5)
        scale = bfloat16(2.0)
        addend = bfloat16(0.0)

        out[0] = float32(hfma_relu(factor, scale, addend))  # -3.0 -> relu -> 0.0

    out = cuda.device_array((1,), dtype="float32")
    kernel[1, 1](out)

    self.assertAlmostEqual(out[0], 0.0, delta=1e-3)
255
+
256
def test_comparison_intrinsics(self):
    # Each bfloat16 comparison intrinsic is paired with the equivalent
    # Python operator and exercised with equal, ascending and descending
    # operand pairs.
    self.skip_unsupported()

    def make_kernel(cmpfn):
        # Build a kernel specialised for one comparison intrinsic.
        @cuda.jit
        def kernel(out, a, b):
            out[0] = cmpfn(bfloat16(a), bfloat16(b))

        return kernel

    cases = [
        (heq, lambda x, y: x == y),
        (hne, lambda x, y: x != y),
        (hge, lambda x, y: x >= y),
        (hgt, lambda x, y: x > y),
        (hle, lambda x, y: x <= y),
        (hlt, lambda x, y: x < y),
    ]
    operand_pairs = ((3.0, 3.0), (3.0, 4.0), (4.0, 3.0))

    for cmpfn, op in cases:
        with self.subTest(cmpfn=cmpfn):
            kernel = make_kernel(cmpfn)
            out = cuda.device_array((1,), dtype="bool")

            for a, b in operand_pairs:
                kernel[1, 1](out, a, b)
                self.assertEqual(bool(out[0]), op(a, b))
297
+
298
def test_hmax_hmin_intrinsics(self):
    # hmax/hmin of (3.0, 4.0) must yield 4.0 and 3.0 respectively.
    self.skip_unsupported()

    @cuda.jit
    def kernel(out):
        lower = bfloat16(3.0)
        upper = bfloat16(4.0)
        out[0] = float32(hmax(lower, upper))
        out[1] = float32(hmin(lower, upper))

    out = cuda.device_array((2,), dtype="float32")
    kernel[1, 1](out)

    for slot, want in enumerate((4.0, 3.0)):
        self.assertAlmostEqual(out[slot], want, delta=1e-3)
312
+
313
def test_nan_and_inf_intrinsics(self):
    # hisnan(NaN) must be truthy; hisinf(+inf) is stored into an int32
    # slot and must be non-zero.
    self.skip_unsupported()

    @cuda.jit
    def kernel(out_bool, out_int):
        out_bool[0] = hisnan(bfloat16(float("nan")))
        out_int[0] = hisinf(bfloat16(float("inf")))

    nan_flag = cuda.device_array((1,), dtype="bool")
    inf_flag = cuda.device_array((1,), dtype="int32")
    kernel[1, 1](nan_flag, inf_flag)

    self.assertTrue(bool(nan_flag[0]))
    self.assertNotEqual(int(inf_flag[0]), 0)
328
+
329
def test_hmax_nan_hmin_nan_intrinsics(self):
    # With one NaN operand, the *_nan variants must return NaN while the
    # plain hmax/hmin must return the other (non-NaN) operand.
    self.skip_unsupported()

    @cuda.jit
    def kernel(out):
        not_a_number = bfloat16(float("nan"))
        two = bfloat16(2.0)
        out[0] = float32(hmax_nan(not_a_number, two))
        out[1] = float32(hmin_nan(not_a_number, two))
        out[2] = float32(hmax(not_a_number, two))
        out[3] = float32(hmin(not_a_number, two))

    out = cuda.device_array((4,), dtype="float32")
    kernel[1, 1](out)

    # NaN-propagating variants should produce NaN
    for slot in (0, 1):
        self.assertTrue(math.isnan(out[slot]))

    # Non-NaN variants should return the non-NaN operand
    for slot in (2, 3):
        self.assertAlmostEqual(out[slot], 2.0, delta=1e-3)
349
+
350
def test_bfloat16_as_bitcast(self):
    # Push a 16-bit pattern through the bfloat16 <-> (u)int16 bit-cast
    # intrinsics and expect to read the same pattern back.
    self.skip_unsupported()

    @cuda.jit
    def roundtrip_kernel(test_val, i2, u2):
        # NOTE(review): test_val arrives as an int16 but is passed to the
        # bfloat16_as_* intrinsic first -- confirm this is the intended
        # direction of the round trip.
        i2[0] = int16_as_bfloat16(bfloat16_as_int16(test_val))
        u2[0] = uint16_as_bfloat16(bfloat16_as_uint16(test_val))

    bits = np.int16(0x3FC0)  # 1.5 in bfloat16
    signed_out = cuda.device_array((1,), dtype="int16")
    unsigned_out = cuda.device_array((1,), dtype="uint16")
    roundtrip_kernel[1, 1](bits, signed_out, unsigned_out)

    self.assertEqual(signed_out[0], bits)
    self.assertEqual(unsigned_out[0], bits)
365
+
366
def test_to_integer_conversions(self):
    # Convert the bfloat16 value 1.5 to every supported integer width.
    # The 8-bit targets only have an rz variant; the wider targets are
    # checked for rn, rz, rd and ru, in that slot order.
    self.skip_unsupported()

    @cuda.jit
    def kernel(test_val, i1, i2, i3, i4, u1, u2, u3, u4):
        val = int16_as_bfloat16(test_val)

        i1[0] = bfloat16_to_int8_rz(val)
        u1[0] = bfloat16_to_uint8_rz(val)

        i2[0] = bfloat16_to_int16_rn(val)
        i2[1] = bfloat16_to_int16_rz(val)
        i2[2] = bfloat16_to_int16_rd(val)
        i2[3] = bfloat16_to_int16_ru(val)

        u2[0] = bfloat16_to_uint16_rn(val)
        u2[1] = bfloat16_to_uint16_rz(val)
        u2[2] = bfloat16_to_uint16_rd(val)
        u2[3] = bfloat16_to_uint16_ru(val)

        i3[0] = bfloat16_to_int32_rn(val)
        i3[1] = bfloat16_to_int32_rz(val)
        i3[2] = bfloat16_to_int32_rd(val)
        i3[3] = bfloat16_to_int32_ru(val)

        u3[0] = bfloat16_to_uint32_rn(val)
        u3[1] = bfloat16_to_uint32_rz(val)
        u3[2] = bfloat16_to_uint32_rd(val)
        u3[3] = bfloat16_to_uint32_ru(val)

        i4[0] = bfloat16_to_int64_rn(val)
        i4[1] = bfloat16_to_int64_rz(val)
        i4[2] = bfloat16_to_int64_rd(val)
        i4[3] = bfloat16_to_int64_ru(val)

        u4[0] = bfloat16_to_uint64_rn(val)
        u4[1] = bfloat16_to_uint64_rz(val)
        u4[2] = bfloat16_to_uint64_rd(val)
        u4[3] = bfloat16_to_uint64_ru(val)

    # Signed outputs: one slot for int8 (rz), four for the wider types.
    i1 = cuda.device_array((1,), dtype="int8")
    i2 = cuda.device_array((4,), dtype="int16")
    i3 = cuda.device_array((4,), dtype="int32")
    i4 = cuda.device_array((4,), dtype="int64")
    # Unsigned outputs, same layout.
    u1 = cuda.device_array((1,), dtype="uint8")
    u2 = cuda.device_array((4,), dtype="uint16")
    u3 = cuda.device_array((4,), dtype="uint32")
    u4 = cuda.device_array((4,), dtype="uint64")

    test_val = np.int16(0x3FC0)  # 1.5 in bfloat16

    kernel[1, 1](test_val, i1, i2, i3, i4, u1, u2, u3, u4)

    self.assertEqual(i1[0], 1)
    self.assertEqual(u1[0], 1)

    # 1.5 -> rn: 2, rz: 1, rd: 1, ru: 2 for every wider integer type.
    wide_results = (
        (i2, "int16"),
        (i3, "int32"),
        (i4, "int64"),
        (u2, "uint16"),
        (u3, "uint32"),
        (u4, "uint64"),
    )
    for result, dtype_name in wide_results:
        np.testing.assert_equal(result, np.array([2, 1, 1, 2], dtype_name))
426
+
427
def test_from_integer_conversions(self):
    # Convert the integer 789 to bfloat16 from every supported integer
    # width with each rounding suffix (rn, rz, rd, ru) and compare the raw
    # bits with the ml_dtypes reference conversion.
    self.skip_unsupported()

    test_val = 789

    @cuda.jit
    def kernel(out):
        i2 = int16(test_val)
        i3 = int32(test_val)
        i4 = int64(test_val)
        u2 = uint16(test_val)
        u3 = uint32(test_val)
        u4 = uint64(test_val)

        i2rn = int16_to_bfloat16_rn(i2)
        i2rz = int16_to_bfloat16_rz(i2)
        i2rd = int16_to_bfloat16_rd(i2)
        i2ru = int16_to_bfloat16_ru(i2)

        u2rn = uint16_to_bfloat16_rn(u2)
        u2rz = uint16_to_bfloat16_rz(u2)
        u2rd = uint16_to_bfloat16_rd(u2)
        u2ru = uint16_to_bfloat16_ru(u2)

        i3rn = int32_to_bfloat16_rn(i3)
        i3rz = int32_to_bfloat16_rz(i3)
        i3rd = int32_to_bfloat16_rd(i3)
        i3ru = int32_to_bfloat16_ru(i3)

        u3rn = uint32_to_bfloat16_rn(u3)
        u3rz = uint32_to_bfloat16_rz(u3)
        u3rd = uint32_to_bfloat16_rd(u3)
        u3ru = uint32_to_bfloat16_ru(u3)

        i4rn = int64_to_bfloat16_rn(i4)
        i4rz = int64_to_bfloat16_rz(i4)
        i4rd = int64_to_bfloat16_rd(i4)
        i4ru = int64_to_bfloat16_ru(i4)

        u4rn = uint64_to_bfloat16_rn(u4)
        u4rz = uint64_to_bfloat16_rz(u4)
        u4rd = uint64_to_bfloat16_rd(u4)
        u4ru = uint64_to_bfloat16_ru(u4)

        # Output layout: four slots (rn, rz, rd, ru) per source type, in
        # the order int16, uint16, int32, uint32, int64, uint64.
        out[0] = bfloat16_as_int16(i2rn)
        out[1] = bfloat16_as_int16(i2rz)
        out[2] = bfloat16_as_int16(i2rd)
        out[3] = bfloat16_as_int16(i2ru)
        out[4] = bfloat16_as_int16(u2rn)
        out[5] = bfloat16_as_int16(u2rz)
        out[6] = bfloat16_as_int16(u2rd)
        out[7] = bfloat16_as_int16(u2ru)
        out[8] = bfloat16_as_int16(i3rn)
        out[9] = bfloat16_as_int16(i3rz)
        out[10] = bfloat16_as_int16(i3rd)
        out[11] = bfloat16_as_int16(i3ru)
        out[12] = bfloat16_as_int16(u3rn)
        out[13] = bfloat16_as_int16(u3rz)
        out[14] = bfloat16_as_int16(u3rd)
        out[15] = bfloat16_as_int16(u3ru)
        out[16] = bfloat16_as_int16(i4rn)
        out[17] = bfloat16_as_int16(i4rz)
        out[18] = bfloat16_as_int16(i4rd)
        out[19] = bfloat16_as_int16(i4ru)
        out[20] = bfloat16_as_int16(u4rn)
        out[21] = bfloat16_as_int16(u4rz)
        out[22] = bfloat16_as_int16(u4rd)
        out[23] = bfloat16_as_int16(u4ru)

    out = cuda.device_array((24,), dtype="int16")
    kernel[1, 1](out)
    res = out.copy_to_host()

    # Source types listed in the same order as the kernel's output groups,
    # so every slice is compared with the reference for its own type.
    source_types = (
        np.int16,
        np.uint16,
        np.int32,
        np.uint32,
        np.int64,
        np.uint64,
    )

    # The four rounding variants of one conversion may differ from the
    # reference conversion, so accept anything strictly below 2 ULPs.
    two = np.ones_like(res[0:4]) * 2
    for group, source_type in enumerate(source_types):
        ref_bits = source_type(test_val).astype(mldtypes_bf16).view("int16")
        expected = np.array([ref_bits] * 4)
        got = res[4 * group : 4 * group + 4]
        np.testing.assert_array_less(_bf16_ulp_distance(got, expected), two)
521
+
522
def test_to_float_conversions(self):
    # bfloat16 1.5 widened with bfloat16_to_float32 must read back as 1.5.
    self.skip_unsupported()

    @cuda.jit
    def kernel(out):
        out[0] = bfloat16_to_float32(bfloat16(1.5))

    widened = cuda.device_array((1,), dtype="float32")
    kernel[1, 1](widened)

    # conversion is exact
    self.assertAlmostEqual(widened[0], 1.5, delta=1e-7)
534
+
535
def test_from_float_conversions(self):
    # Convert 1.5 from float32 (all four rounding suffixes plus the default
    # conversion) and from float64 (default conversion) to bfloat16, then
    # compare the raw bits with the ml_dtypes reference conversion.
    self.skip_unsupported()

    test_val = 1.5

    @cuda.jit
    def kernel(out):
        f4 = float32(test_val)
        f8 = float64(test_val)

        f4rn = float32_to_bfloat16_rn(f4)
        f4rz = float32_to_bfloat16_rz(f4)
        f4rd = float32_to_bfloat16_rd(f4)
        f4ru = float32_to_bfloat16_ru(f4)

        f4_default = float32_to_bfloat16(f4)
        f8_default = float64_to_bfloat16(f8)

        # Slots 0-4 come from float32 input, slot 5 from float64 input.
        out[0] = bfloat16_as_int16(f4rn)
        out[1] = bfloat16_as_int16(f4rz)
        out[2] = bfloat16_as_int16(f4rd)
        out[3] = bfloat16_as_int16(f4ru)
        out[4] = bfloat16_as_int16(f4_default)
        out[5] = bfloat16_as_int16(f8_default)

    # The kernel stores six values, so the buffer needs six slots; a
    # one-element buffer makes the kernel write out of bounds and leaves
    # most of the checks below comparing nothing.
    out = cuda.device_array((6,), dtype="int16")
    kernel[1, 1](out)
    raw = out.copy_to_host()

    f4_expected = (
        np.array([test_val] * 5, "float32")
        .astype(mldtypes_bf16)
        .view("int16")
    )
    f8_expected = (
        np.array([test_val] * 1, "float64")
        .astype(mldtypes_bf16)
        .view("int16")
    )

    # Rounding variants may differ from the reference conversion, so
    # accept anything strictly below 2 ULPs.
    np.testing.assert_array_less(
        _bf16_ulp_distance(raw[0:5], f4_expected), 2
    )
    np.testing.assert_array_less(
        _bf16_ulp_distance(raw[5:], f8_expected), 2
    )
581
+
582
+
583
+ def _bf16_ulp_rank(bits_int16: np.ndarray) -> np.ndarray:
584
+ """
585
+ Compute the ULP rank of a bfloat16 value. Input is the bits of the bfloat16 value as an int16.
586
+ The ULP rank is the number of ULPs between the value and 0.
587
+ Negative values are performed the inverse of 2's complement before computing the rank.
588
+ """
589
+ u = bits_int16.view(np.uint16)
590
+ sign = u >> 15
591
+ return np.where(sign == 0, u + 0x8000, 0x8000 - u).astype(np.int32)
592
+
593
+
594
def _bf16_ulp_distance(
    a_bits_int16: np.ndarray, b_bits_int16: np.ndarray
) -> np.ndarray:
    """
    Return how many ULPs apart two bfloat16 values are.

    Both arguments are raw bfloat16 bit patterns stored as int16; the
    result is the absolute difference of their ULP ranks.
    """
    rank_a = _bf16_ulp_rank(a_bits_int16)
    rank_b = _bf16_ulp_rank(b_bits_int16)
    return np.abs(rank_a - rank_b)
@@ -1,12 +1,16 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: BSD-2-Clause
3
3
 
4
+ from collections import OrderedDict
5
+ import bisect
6
+
4
7
  import numba.cuda as cuda
5
8
  from numba.cuda.testing import unittest, CUDATestCase
6
9
  import numpy as np
10
+ import operator
11
+ from numba.cuda.testing import skip_if_nvjitlink_missing
7
12
 
8
13
  from numba import (
9
- config,
10
14
  int16,
11
15
  int32,
12
16
  int64,
@@ -17,6 +21,7 @@ from numba import (
17
21
  float64,
18
22
  )
19
23
  from numba.types import float16
24
+ from numba.cuda import config
20
25
 
21
26
  if not config.ENABLE_CUDASIM:
22
27
  from numba.cuda._internal.cuda_bf16 import (
@@ -292,6 +297,81 @@ class Bfloat16Test(CUDATestCase):
292
297
 
293
298
  np.testing.assert_allclose(arr, [3], atol=1e-2)
294
299
 
300
@skip_if_nvjitlink_missing("LTO is not supported without nvjitlink.")
def test_bf16_intrinsics_used_in_lto(self):
    # Compile a small bf16 kernel with LTO for each arithmetic operator,
    # check the numeric result, then check that the generated PTX contains
    # the instruction expected for the current device's compute capability.
    self.skip_unsupported()

    def arith_table(native_op):
        # Expected PTX marker keyed by the lowest compute capability that
        # emits it; keys must stay in ascending order for the bisect below.
        return OrderedDict(
            {
                (7, 0): ".s16",  # All CCs prior to 8.0 use bit operations
                (8, 0): "fma.rn.bf16",  # 8.0 uses fma
                (9, 0): native_op,  # 9.0 uses the native instruction
            }
        )

    operations = [
        (operator.add, arith_table("add.bf16")),
        (operator.sub, arith_table("sub.bf16")),
        (operator.mul, arith_table("mul.bf16")),
        # no native bf16 div, see cuda_bf16.hpp:L3067
        (operator.truediv, OrderedDict({(10, 0): "div.approx.f32"})),
    ]

    for op, ptx_op in operations:
        with self.subTest(op=op):

            @cuda.jit(lto=True)
            def kernel(arr):
                a = nv_bfloat16(3.14)
                b = nv_bfloat16(5)
                arr[0] = float32(op(a, b))

            arr = np.zeros(1, np.float32)
            kernel[1, 1](arr)
            np.testing.assert_allclose(arr, [op(3.14, 5)], atol=1e-1)

            ptx = next(iter(kernel.inspect_lto_ptx().values()))
            cc = cuda.get_current_device().compute_capability

            # Select the entry with the highest threshold that does not
            # exceed the device's compute capability; devices below every
            # threshold fall back to the first entry.
            thresholds = list(ptx_op.keys())
            idx = max(0, bisect.bisect_right(thresholds, cc) - 1)
            expected = ptx_op[thresholds[idx]]

            # assertIn rather than a bare assert: a bare assert is removed
            # under "python -O", which would silently skip this check.
            self.assertIn(expected, ptx)
374
+
295
375
 
296
376
  if __name__ == "__main__":
297
377
  unittest.main()
@@ -17,7 +17,6 @@ from numba.cuda.testing import (
17
17
  skip_on_cudasim,
18
18
  skip_unless_cc_60,
19
19
  skip_if_cudadevrt_missing,
20
- skip_if_mvc_enabled,
21
20
  test_data_dir,
22
21
  )
23
22
  from numba.cuda.tests.support import (
@@ -388,7 +387,6 @@ class CUDACooperativeGroupTest(DispatcherCacheUsecasesTest):
388
387
 
389
388
  @skip_unless_cc_60
390
389
  @skip_if_cudadevrt_missing
391
- @skip_if_mvc_enabled("CG not supported with MVC")
392
390
  def test_cache_cg(self):
393
391
  # Functions using cooperative groups should be cacheable. See Issue
394
392
  # #8888: https://github.com/numba/numba/issues/8888
@@ -622,7 +620,7 @@ class TestMultiCCCaching(DispatcherCacheUsecasesTest):
622
620
  def child_initializer():
623
621
  # Disable occupancy and implicit copy warnings in processes in a
624
622
  # multiprocessing pool.
625
- from numba.core import config
623
+ from numba.cuda.core import config
626
624
 
627
625
  config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
628
626
  config.CUDA_WARN_ON_IMPLICIT_COPY = 0
@@ -37,7 +37,7 @@ from numba.cuda.tests.complex_usecases import (
37
37
  sinh_usecase,
38
38
  tanh_usecase,
39
39
  )
40
- from numba.np import numpy_support
40
+ from numba.cuda.np import numpy_support
41
41
 
42
42
 
43
43
  def compile_scalar_func(pyfunc, argtypes, restype):
@@ -5,7 +5,7 @@ import numpy as np
5
5
 
6
6
  from numba import cuda, complex64, int32, float64
7
7
  from numba.cuda.testing import unittest, CUDATestCase
8
- from numba.core.config import ENABLE_CUDASIM
8
+ from numba.cuda.core.config import ENABLE_CUDASIM
9
9
 
10
10
  CONST_EMPTY = np.array([])
11
11
  CONST1D = np.arange(10, dtype=np.float64) / 2.0