numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +246 -114
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  13. numba_cuda/numba/cuda/cuda_paths.py +293 -99
  14. numba_cuda/numba/cuda/cudadecl.py +93 -79
  15. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  16. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  17. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  18. numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
  19. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  20. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  21. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  22. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  23. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  24. numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
  25. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  26. numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
  27. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  28. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  29. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  30. numba_cuda/numba/cuda/cudaimpl.py +296 -275
  31. numba_cuda/numba/cuda/cudamath.py +1 -1
  32. numba_cuda/numba/cuda/debuginfo.py +99 -7
  33. numba_cuda/numba/cuda/decorators.py +87 -45
  34. numba_cuda/numba/cuda/descriptor.py +1 -1
  35. numba_cuda/numba/cuda/device_init.py +68 -18
  36. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  37. numba_cuda/numba/cuda/dispatcher.py +300 -213
  38. numba_cuda/numba/cuda/errors.py +13 -10
  39. numba_cuda/numba/cuda/extending.py +55 -1
  40. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  41. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  42. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
  43. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
  44. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  45. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  46. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  47. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  48. numba_cuda/numba/cuda/initialize.py +5 -3
  49. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  50. numba_cuda/numba/cuda/intrinsics.py +203 -28
  51. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  52. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  53. numba_cuda/numba/cuda/libdevice.py +317 -317
  54. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  55. numba_cuda/numba/cuda/locks.py +16 -0
  56. numba_cuda/numba/cuda/lowering.py +43 -0
  57. numba_cuda/numba/cuda/mathimpl.py +62 -57
  58. numba_cuda/numba/cuda/models.py +1 -5
  59. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  60. numba_cuda/numba/cuda/printimpl.py +9 -5
  61. numba_cuda/numba/cuda/random.py +46 -36
  62. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  63. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  64. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  65. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  66. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  67. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  68. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  69. numba_cuda/numba/cuda/simulator/api.py +38 -22
  70. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  71. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  72. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  73. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  74. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  75. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  76. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  77. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  78. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  79. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  80. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  81. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  82. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  83. numba_cuda/numba/cuda/simulator_init.py +2 -4
  84. numba_cuda/numba/cuda/stubs.py +134 -108
  85. numba_cuda/numba/cuda/target.py +92 -47
  86. numba_cuda/numba/cuda/testing.py +24 -19
  87. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  88. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  89. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  90. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  91. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  92. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  93. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  94. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  95. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  96. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  97. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  98. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  99. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  100. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  101. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  102. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  103. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  104. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  105. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  106. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  107. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  108. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  109. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  110. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  111. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  112. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  113. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  114. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  115. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  116. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  117. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  118. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  119. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
  120. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  121. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  123. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  124. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  125. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  126. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  127. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
  129. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  130. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  131. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  132. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  133. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  134. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  135. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  136. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  137. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  138. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  139. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  140. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  141. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  142. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  143. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
  144. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  145. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  146. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
  147. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  148. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  149. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
  150. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  151. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  152. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  153. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  154. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  155. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  156. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  157. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  158. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  159. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  161. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  162. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  163. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  164. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  165. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
  166. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  167. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  168. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  169. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  170. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  171. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  172. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  173. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  174. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  175. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  176. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  178. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  179. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  180. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  181. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  182. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  183. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  184. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  185. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  186. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  187. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  188. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  189. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  190. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  191. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  192. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  193. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  194. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  195. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  196. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  197. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  198. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  199. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  200. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  201. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  202. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  203. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  204. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  205. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
  206. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  207. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  208. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  209. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  210. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  211. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  212. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  213. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  214. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  216. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  217. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  218. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  219. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  220. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  221. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  222. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  223. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  224. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  225. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  226. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  227. numba_cuda/numba/cuda/types.py +5 -2
  228. numba_cuda/numba/cuda/ufuncs.py +382 -362
  229. numba_cuda/numba/cuda/utils.py +2 -2
  230. numba_cuda/numba/cuda/vector_types.py +5 -3
  231. numba_cuda/numba/cuda/vectorizers.py +38 -33
  232. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
  233. numba_cuda-0.10.0.dist-info/RECORD +263 -0
  234. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
  235. numba_cuda-0.8.1.dist-info/RECORD +0 -251
  236. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
  237. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,257 @@
1
+ import numba.cuda as cuda
2
+ from numba.cuda.testing import unittest, CUDATestCase
3
+ import numpy as np
4
+
5
+ from numba import int16, int32, int64, uint16, uint32, uint64, float32, float64
6
+ from numba.types import float16
7
+
8
+ from numba.cuda.cuda_bf16 import (
9
+ nv_bfloat16,
10
+ htrunc,
11
+ hceil,
12
+ hfloor,
13
+ hrint,
14
+ hsqrt,
15
+ hrsqrt,
16
+ hrcp,
17
+ hlog,
18
+ hlog2,
19
+ hlog10,
20
+ hcos,
21
+ hsin,
22
+ hexp,
23
+ hexp2,
24
+ hexp10,
25
+ )
26
+
27
+ from numba.cuda.cudadrv.runtime import get_version
28
+
29
+ cuda_version = get_version()
30
+
31
+ dtypes = [int16, int32, int64, uint16, uint32, uint64, float32]
32
+
33
+
34
+ @unittest.skipIf(
35
+ (cuda.get_current_device().compute_capability < (8, 0)),
36
+ "bfloat16 requires compute capability 8.0+",
37
+ )
38
+ class Bfloat16Test(CUDATestCase):
39
+ def test_ctor(self):
40
+ @cuda.jit
41
+ def simple_kernel():
42
+ a = nv_bfloat16(float64(1.0)) # noqa: F841
43
+ b = nv_bfloat16(float32(2.0)) # noqa: F841
44
+ c = nv_bfloat16(int16(3)) # noqa: F841
45
+ d = nv_bfloat16(int32(4)) # noqa: F841
46
+ e = nv_bfloat16(int64(5)) # noqa: F841
47
+ f = nv_bfloat16(uint16(6)) # noqa: F841
48
+ g = nv_bfloat16(uint32(7)) # noqa: F841
49
+ h = nv_bfloat16(uint64(8)) # noqa: F841
50
+
51
+ simple_kernel[1, 1]()
52
+
53
+ if cuda_version >= (12, 0):
54
+
55
+ @cuda.jit
56
+ def simple_kernel_fp16():
57
+ i = nv_bfloat16(float16(9)) # noqa: F841
58
+
59
+ simple_kernel_fp16[1, 1]()
60
+
61
+ def test_casts(self):
62
+ @cuda.jit
63
+ def simple_kernel(b, c, d, e, f, g, h):
64
+ a = nv_bfloat16(3.14)
65
+
66
+ b[0] = float32(a)
67
+ c[0] = int16(a)
68
+ d[0] = int32(a)
69
+ e[0] = int64(a)
70
+ f[0] = uint16(a)
71
+ g[0] = uint32(a)
72
+ h[0] = uint64(a)
73
+
74
+ b = np.zeros(1, dtype=np.float32)
75
+ c = np.zeros(1, dtype=np.int16)
76
+ d = np.zeros(1, dtype=np.int32)
77
+ e = np.zeros(1, dtype=np.int64)
78
+ f = np.zeros(1, dtype=np.uint16)
79
+ g = np.zeros(1, dtype=np.uint32)
80
+ h = np.zeros(1, dtype=np.uint64)
81
+
82
+ simple_kernel[1, 1](b, c, d, e, f, g, h)
83
+
84
+ np.testing.assert_allclose(b[0], 3.14, atol=1e-2)
85
+ assert c[0] == 3
86
+ assert d[0] == 3
87
+ assert e[0] == 3
88
+ assert f[0] == 3
89
+ assert g[0] == 3
90
+ assert h[0] == 3
91
+
92
+ def test_ctor_cast_loop(self):
93
+ for dtype in dtypes:
94
+ with self.subTest(dtype=dtype):
95
+
96
+ @cuda.jit
97
+ def simple_kernel(a):
98
+ a[0] = dtype(nv_bfloat16(dtype(3.14)))
99
+
100
+ a = np.zeros(1, dtype=str(dtype))
101
+ simple_kernel[1, 1](a)
102
+
103
+ if np.dtype(str(dtype)).kind == "f":
104
+ np.testing.assert_allclose(a[0], 3.14, atol=1e-2)
105
+ else:
106
+ assert a[0] == 3
107
+
108
+ def test_arithmetic(self):
109
+ @cuda.jit
110
+ def simple_kernel(arith, logic):
111
+ # Binary Arithmetic Operators
112
+ a = nv_bfloat16(1.0)
113
+ b = nv_bfloat16(2.0)
114
+
115
+ arith[0] = float32(a + b)
116
+ arith[1] = float32(a - b)
117
+ arith[2] = float32(a * b)
118
+ arith[3] = float32(a / b)
119
+
120
+ # Arithmetic Assignment Operators
121
+ a = nv_bfloat16(1.0)
122
+ b = nv_bfloat16(2.0)
123
+
124
+ a += b
125
+ arith[4] = float32(a)
126
+ a -= b
127
+ arith[5] = float32(a)
128
+ a *= b
129
+ arith[6] = float32(a)
130
+ a /= b
131
+ arith[7] = float32(a)
132
+
133
+ # Unary Arithmetic Operators
134
+ a = nv_bfloat16(1.0)
135
+
136
+ arith[8] = float32(+a)
137
+ arith[9] = float32(-a)
138
+
139
+ # Comparison Operators
140
+ a = nv_bfloat16(1.0)
141
+ b = nv_bfloat16(2.0)
142
+
143
+ logic[0] = a == b
144
+ logic[1] = a != b
145
+ logic[2] = a > b
146
+ logic[3] = a < b
147
+ logic[4] = a >= b
148
+ logic[5] = a <= b
149
+
150
+ arith = np.zeros(10, dtype=np.float32)
151
+ logic = np.zeros(6, dtype=np.bool_)
152
+
153
+ simple_kernel[1, 1](arith, logic)
154
+
155
+ a = 1.0
156
+ b = 2.0
157
+ np.testing.assert_allclose(
158
+ arith,
159
+ [
160
+ a + b,
161
+ a - b,
162
+ a * b,
163
+ a / b,
164
+ a + b,
165
+ a + b - b,
166
+ (a + b - b) * b,
167
+ (a + b - b) * b / b,
168
+ +a,
169
+ -a,
170
+ ],
171
+ atol=1e-2,
172
+ )
173
+ np.testing.assert_equal(
174
+ logic, [a == b, a != b, a > b, a < b, a >= b, a <= b]
175
+ )
176
+
177
+ def test_math_func(self):
178
+ @cuda.jit
179
+ def simple_kernel(a):
180
+ x = nv_bfloat16(3.14)
181
+
182
+ a[0] = float32(htrunc(x))
183
+ a[1] = float32(hceil(x))
184
+ a[2] = float32(hfloor(x))
185
+ a[3] = float32(hrint(x))
186
+ a[4] = float32(hsqrt(x))
187
+ a[5] = float32(hrsqrt(x))
188
+ a[6] = float32(hrcp(x))
189
+ a[7] = float32(hlog(x))
190
+ a[8] = float32(hlog2(x))
191
+ a[9] = float32(hlog10(x))
192
+ a[10] = float32(hcos(x))
193
+ a[11] = float32(hsin(x))
194
+ a[12] = float32(hexp(x))
195
+ a[13] = float32(hexp2(x))
196
+ a[14] = float32(hexp10(x))
197
+
198
+ a = np.zeros(15, dtype=np.float32)
199
+ simple_kernel[1, 1](a)
200
+
201
+ x = 3.14
202
+ np.testing.assert_allclose(
203
+ a[:12],
204
+ [
205
+ np.trunc(x),
206
+ np.ceil(x),
207
+ np.floor(x),
208
+ np.rint(x),
209
+ np.sqrt(x),
210
+ 1 / np.sqrt(x),
211
+ 1 / x,
212
+ np.log(x),
213
+ np.log2(x),
214
+ np.log10(x),
215
+ np.cos(x),
216
+ np.sin(x),
217
+ ],
218
+ atol=1e-2,
219
+ )
220
+
221
+ np.testing.assert_allclose(
222
+ a[12:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
223
+ )
224
+
225
+ def test_check_bfloat16_type(self):
226
+ @cuda.jit
227
+ def kernel(arr):
228
+ x = nv_bfloat16(3.14)
229
+ if isinstance(x, nv_bfloat16):
230
+ arr[0] = float32(x)
231
+ else:
232
+ arr[0] = float32(0.0)
233
+
234
+ arr = np.zeros(1, np.float32)
235
+ kernel[1, 1](arr)
236
+
237
+ np.testing.assert_allclose(arr, [3.14], atol=1e-2)
238
+
239
+ def test_use_within_device_func(self):
240
+ @cuda.jit(device=True)
241
+ def add_bf16(a, b):
242
+ return a + b
243
+
244
+ @cuda.jit
245
+ def kernel(arr):
246
+ a = nv_bfloat16(3.14)
247
+ b = nv_bfloat16(5)
248
+ arr[0] = float32(hfloor(add_bf16(a, b)))
249
+
250
+ arr = np.zeros(1, np.float32)
251
+ kernel[1, 1](arr)
252
+
253
+ np.testing.assert_allclose(arr, [8], atol=1e-2)
254
+
255
+
256
+ if __name__ == "__main__":
257
+ unittest.main()
@@ -17,13 +17,23 @@ RSQRT2PI = 0.39894228040143267793994605993438
17
17
 
18
18
  def cnd(d):
19
19
  K = 1.0 / (1.0 + 0.2316419 * np.abs(d))
20
- ret_val = (RSQRT2PI * np.exp(-0.5 * d * d) *
21
- (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))))
20
+ ret_val = (
21
+ RSQRT2PI
22
+ * np.exp(-0.5 * d * d)
23
+ * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
24
+ )
22
25
  return np.where(d > 0, 1.0 - ret_val, ret_val)
23
26
 
24
27
 
25
- def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears,
26
- Riskfree, Volatility):
28
+ def black_scholes(
29
+ callResult,
30
+ putResult,
31
+ stockPrice,
32
+ optionStrike,
33
+ optionYears,
34
+ Riskfree,
35
+ Volatility,
36
+ ):
27
37
  S = stockPrice
28
38
  X = optionStrike
29
39
  T = optionYears
@@ -35,9 +45,9 @@ def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears,
35
45
  cndd1 = cnd(d1)
36
46
  cndd2 = cnd(d2)
37
47
 
38
- expRT = np.exp(- R * T)
39
- callResult[:] = (S * cndd1 - X * expRT * cndd2)
40
- putResult[:] = (X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1))
48
+ expRT = np.exp(-R * T)
49
+ callResult[:] = S * cndd1 - X * expRT * cndd2
50
+ putResult[:] = X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1)
41
51
 
42
52
 
43
53
  def randfloat(rand_var, low, high):
@@ -61,34 +71,54 @@ class TestBlackScholes(CUDATestCase):
61
71
 
62
72
  # numpy
63
73
  for i in range(iterations):
64
- black_scholes(callResultNumpy, putResultNumpy, stockPrice,
65
- optionStrike, optionYears, RISKFREE, VOLATILITY)
66
-
67
- @cuda.jit(double(double), device=True, inline=True)
74
+ black_scholes(
75
+ callResultNumpy,
76
+ putResultNumpy,
77
+ stockPrice,
78
+ optionStrike,
79
+ optionYears,
80
+ RISKFREE,
81
+ VOLATILITY,
82
+ )
83
+
84
+ @cuda.jit(double(double), device=True, inline="always")
68
85
  def cnd_cuda(d):
69
86
  K = 1.0 / (1.0 + 0.2316419 * math.fabs(d))
70
- ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) *
71
- (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))))
87
+ ret_val = (
88
+ RSQRT2PI
89
+ * math.exp(-0.5 * d * d)
90
+ * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
91
+ )
72
92
  if d > 0:
73
93
  ret_val = 1.0 - ret_val
74
94
  return ret_val
75
95
 
76
- @cuda.jit(void(double[:], double[:], double[:], double[:], double[:],
77
- double, double))
96
+ @cuda.jit(
97
+ void(
98
+ double[:],
99
+ double[:],
100
+ double[:],
101
+ double[:],
102
+ double[:],
103
+ double,
104
+ double,
105
+ )
106
+ )
78
107
  def black_scholes_cuda(callResult, putResult, S, X, T, R, V):
79
108
  i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
80
109
  if i >= S.shape[0]:
81
110
  return
82
111
  sqrtT = math.sqrt(T[i])
83
- d1 = ((math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i])
84
- / (V * sqrtT))
112
+ d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (
113
+ V * sqrtT
114
+ )
85
115
  d2 = d1 - V * sqrtT
86
116
  cndd1 = cnd_cuda(d1)
87
117
  cndd2 = cnd_cuda(d2)
88
118
 
89
- expRT = math.exp((-1. * R) * T[i])
90
- callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2)
91
- putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1))
119
+ expRT = math.exp((-1.0 * R) * T[i])
120
+ callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2
121
+ putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)
92
122
 
93
123
  # numba
94
124
  blockdim = 512, 1
@@ -102,8 +132,14 @@ class TestBlackScholes(CUDATestCase):
102
132
 
103
133
  for i in range(iterations):
104
134
  black_scholes_cuda[griddim, blockdim, stream](
105
- d_callResult, d_putResult, d_stockPrice, d_optionStrike,
106
- d_optionYears, RISKFREE, VOLATILITY)
135
+ d_callResult,
136
+ d_putResult,
137
+ d_stockPrice,
138
+ d_optionStrike,
139
+ d_optionYears,
140
+ RISKFREE,
141
+ VOLATILITY,
142
+ )
107
143
  d_callResult.copy_to_host(callResultNumba, stream)
108
144
  d_putResult.copy_to_host(putResultNumba, stream)
109
145
  stream.synchronize()
@@ -116,5 +152,5 @@ class TestBlackScholes(CUDATestCase):
116
152
  self.assertTrue(max_abs_err < 1e-13)
117
153
 
118
154
 
119
- if __name__ == '__main__':
155
+ if __name__ == "__main__":
120
156
  unittest.main()
@@ -12,13 +12,13 @@ def boolean_func(A, vertial):
12
12
 
13
13
  class TestCudaBoolean(CUDATestCase):
14
14
  def test_boolean(self):
15
- func = cuda.jit('void(float64[:], bool_)')(boolean_func)
16
- A = np.array([0], dtype='float64')
15
+ func = cuda.jit("void(float64[:], bool_)")(boolean_func)
16
+ A = np.array([0], dtype="float64")
17
17
  func[1, 1](A, True)
18
18
  self.assertTrue(A[0] == 123)
19
19
  func[1, 1](A, False)
20
20
  self.assertTrue(A[0] == 321)
21
21
 
22
22
 
23
- if __name__ == '__main__':
23
+ if __name__ == "__main__":
24
24
  unittest.main()
@@ -8,15 +8,22 @@ import warnings
8
8
 
9
9
  from numba import cuda
10
10
  from numba.core.errors import NumbaWarning
11
- from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
12
- skip_unless_cc_60, skip_if_cudadevrt_missing,
13
- skip_if_mvc_enabled, test_data_dir)
11
+ from numba.cuda.testing import (
12
+ CUDATestCase,
13
+ skip_on_cudasim,
14
+ skip_unless_cc_60,
15
+ skip_if_cudadevrt_missing,
16
+ skip_if_mvc_enabled,
17
+ test_data_dir,
18
+ )
14
19
  from numba.tests.support import SerialMixin
15
- from numba.tests.test_caching import (DispatcherCacheUsecasesTest,
16
- skip_bad_access)
20
+ from numba.tests.test_caching import (
21
+ DispatcherCacheUsecasesTest,
22
+ skip_bad_access,
23
+ )
17
24
 
18
25
 
19
- @skip_on_cudasim('Simulator does not implement caching')
26
+ @skip_on_cudasim("Simulator does not implement caching")
20
27
  class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
21
28
  here = os.path.dirname(__file__)
22
29
  usecases_file = os.path.join(here, "cache_usecases.py")
@@ -72,23 +79,23 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
72
79
  mod = self.import_module()
73
80
  f = mod.many_locals
74
81
  f[1, 1]()
75
- self.check_pycache(2) # 1 index, 1 data
82
+ self.check_pycache(2) # 1 index, 1 data
76
83
 
77
84
  def test_closure(self):
78
85
  mod = self.import_module()
79
86
 
80
87
  with warnings.catch_warnings():
81
- warnings.simplefilter('error', NumbaWarning)
88
+ warnings.simplefilter("error", NumbaWarning)
82
89
 
83
90
  f = mod.closure1
84
- self.assertPreciseEqual(f(3), 6) # 3 + 3 = 6
91
+ self.assertPreciseEqual(f(3), 6) # 3 + 3 = 6
85
92
  f = mod.closure2
86
- self.assertPreciseEqual(f(3), 8) # 3 + 5 = 8
93
+ self.assertPreciseEqual(f(3), 8) # 3 + 5 = 8
87
94
  f = mod.closure3
88
- self.assertPreciseEqual(f(3), 10) # 3 + 7 = 10
95
+ self.assertPreciseEqual(f(3), 10) # 3 + 7 = 10
89
96
  f = mod.closure4
90
- self.assertPreciseEqual(f(3), 12) # 3 + 9 = 12
91
- self.check_pycache(5) # 1 nbi, 4 nbc
97
+ self.assertPreciseEqual(f(3), 12) # 3 + 9 = 12
98
+ self.check_pycache(5) # 1 nbi, 4 nbc
92
99
 
93
100
  def test_cache_reuse(self):
94
101
  mod = self.import_module()
@@ -158,7 +165,7 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
158
165
 
159
166
  @skip_unless_cc_60
160
167
  @skip_if_cudadevrt_missing
161
- @skip_if_mvc_enabled('CG not supported with MVC')
168
+ @skip_if_mvc_enabled("CG not supported with MVC")
162
169
  def test_cache_cg(self):
163
170
  # Functions using cooperative groups should be cacheable. See Issue
164
171
  # #8888: https://github.com/numba/numba/issues/8888
@@ -174,7 +181,7 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
174
181
 
175
182
  @skip_unless_cc_60
176
183
  @skip_if_cudadevrt_missing
177
- @skip_if_mvc_enabled('CG not supported with MVC')
184
+ @skip_if_mvc_enabled("CG not supported with MVC")
178
185
  def test_cache_cg_clean_run(self):
179
186
  # See Issue #9432: https://github.com/numba/numba/issues/9432
180
187
  # If a cached function using CG sync was the first thing to compile,
@@ -191,9 +198,11 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
191
198
  mod.cg_usecase(0)
192
199
  """ % dict(tempdir=self.tempdir, modname=self.modname)
193
200
 
194
- popen = subprocess.Popen([sys.executable, "-c", code],
195
- stdout=subprocess.PIPE,
196
- stderr=subprocess.PIPE)
201
+ popen = subprocess.Popen(
202
+ [sys.executable, "-c", code],
203
+ stdout=subprocess.PIPE,
204
+ stderr=subprocess.PIPE,
205
+ )
197
206
  out, err = popen.communicate(timeout=60)
198
207
  if popen.returncode != 0:
199
208
  raise AssertionError(
@@ -212,8 +221,9 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
212
221
  f = mod.add_usecase
213
222
  # Remove this function's cache files at the end, to avoid accumulation
214
223
  # across test calls.
215
- self.addCleanup(shutil.rmtree, f.func.stats.cache_path,
216
- ignore_errors=True)
224
+ self.addCleanup(
225
+ shutil.rmtree, f.func.stats.cache_path, ignore_errors=True
226
+ )
217
227
 
218
228
  self.assertPreciseEqual(f(2, 3), 6)
219
229
  # It's a cache miss since the file was copied to a new temp location
@@ -230,8 +240,9 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
230
240
  self.check_pycache(0)
231
241
 
232
242
  @skip_bad_access
233
- @unittest.skipIf(os.name == "nt",
234
- "cannot easily make a directory read-only on Windows")
243
+ @unittest.skipIf(
244
+ os.name == "nt", "cannot easily make a directory read-only on Windows"
245
+ )
235
246
  def test_non_creatable_pycache(self):
236
247
  # Make it impossible to create the __pycache__ directory
237
248
  old_perms = os.stat(self.tempdir).st_mode
@@ -241,11 +252,12 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
241
252
  self._test_pycache_fallback()
242
253
 
243
254
  @skip_bad_access
244
- @unittest.skipIf(os.name == "nt",
245
- "cannot easily make a directory read-only on Windows")
255
+ @unittest.skipIf(
256
+ os.name == "nt", "cannot easily make a directory read-only on Windows"
257
+ )
246
258
  def test_non_writable_pycache(self):
247
259
  # Make it impossible to write to the __pycache__ directory
248
- pycache = os.path.join(self.tempdir, '__pycache__')
260
+ pycache = os.path.join(self.tempdir, "__pycache__")
249
261
  os.mkdir(pycache)
250
262
  old_perms = os.stat(pycache).st_mode
251
263
  os.chmod(pycache, 0o500)
@@ -254,15 +266,16 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
254
266
  self._test_pycache_fallback()
255
267
 
256
268
  def test_cannot_cache_linking_libraries(self):
257
- link = str(test_data_dir / 'jitlink.ptx')
258
- msg = 'Cannot pickle CUDACodeLibrary with linking files'
269
+ link = str(test_data_dir / "jitlink.ptx")
270
+ msg = "Cannot pickle CUDACodeLibrary with linking files"
259
271
  with self.assertRaisesRegex(RuntimeError, msg):
260
- @cuda.jit('void()', cache=True, link=[link])
272
+
273
+ @cuda.jit("void()", cache=True, link=[link])
261
274
  def f():
262
275
  pass
263
276
 
264
277
 
265
- @skip_on_cudasim('Simulator does not implement caching')
278
+ @skip_on_cudasim("Simulator does not implement caching")
266
279
  class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest):
267
280
  here = os.path.dirname(__file__)
268
281
  usecases_file = os.path.join(here, "cache_with_cpu_usecases.py")
@@ -353,7 +366,7 @@ def get_different_cc_gpus():
353
366
  return None
354
367
 
355
368
 
356
- @skip_on_cudasim('Simulator does not implement caching')
369
+ @skip_on_cudasim("Simulator does not implement caching")
357
370
  class TestMultiCCCaching(SerialMixin, DispatcherCacheUsecasesTest):
358
371
  here = os.path.dirname(__file__)
359
372
  usecases_file = os.path.join(here, "cache_usecases.py")
@@ -370,7 +383,7 @@ class TestMultiCCCaching(SerialMixin, DispatcherCacheUsecasesTest):
370
383
  def test_cache(self):
371
384
  gpus = get_different_cc_gpus()
372
385
  if not gpus:
373
- self.skipTest('Need two different CCs for multi-CC cache test')
386
+ self.skipTest("Need two different CCs for multi-CC cache test")
374
387
 
375
388
  self.check_pycache(0)
376
389
  mod = self.import_module()
@@ -482,13 +495,13 @@ def child_initializer():
482
495
  # Disable occupancy and implicit copy warnings in processes in a
483
496
  # multiprocessing pool.
484
497
  from numba.core import config
498
+
485
499
  config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
486
500
  config.CUDA_WARN_ON_IMPLICIT_COPY = 0
487
501
 
488
502
 
489
- @skip_on_cudasim('Simulator does not implement caching')
503
+ @skip_on_cudasim("Simulator does not implement caching")
490
504
  class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest):
491
-
492
505
  # Nested multiprocessing.Pool raises AssertionError:
493
506
  # "daemonic processes are not allowed to have children"
494
507
  _numba_parallel_test_ = False
@@ -513,7 +526,7 @@ class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest):
513
526
  f = mod.simple_usecase_caller
514
527
  n = 3
515
528
  try:
516
- ctx = multiprocessing.get_context('spawn')
529
+ ctx = multiprocessing.get_context("spawn")
517
530
  except AttributeError:
518
531
  ctx = multiprocessing
519
532
 
@@ -526,7 +539,7 @@ class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest):
526
539
  self.assertEqual(res, n * (n - 1) // 2)
527
540
 
528
541
 
529
- @skip_on_cudasim('Simulator does not implement the CUDACodeLibrary')
542
+ @skip_on_cudasim("Simulator does not implement the CUDACodeLibrary")
530
543
  class TestCUDACodeLibrary(CUDATestCase):
531
544
  # For tests of miscellaneous CUDACodeLibrary behaviour that we wish to
532
545
  # explicitly check
@@ -539,7 +552,7 @@ class TestCUDACodeLibrary(CUDATestCase):
539
552
  # Usually a CodeLibrary requires a real CodeGen, but since we don't
540
553
  # interact with it, anything will do
541
554
  codegen = object()
542
- name = 'library'
555
+ name = "library"
543
556
  cl = CUDACodeLibrary(codegen, name)
544
- with self.assertRaisesRegex(RuntimeError, 'Cannot pickle unfinalized'):
557
+ with self.assertRaisesRegex(RuntimeError, "Cannot pickle unfinalized"):
545
558
  cl._reduce_states()