numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +246 -114
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  13. numba_cuda/numba/cuda/cuda_paths.py +293 -99
  14. numba_cuda/numba/cuda/cudadecl.py +93 -79
  15. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  16. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  17. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  18. numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
  19. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  20. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  21. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  22. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  23. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  24. numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
  25. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  26. numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
  27. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  28. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  29. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  30. numba_cuda/numba/cuda/cudaimpl.py +296 -275
  31. numba_cuda/numba/cuda/cudamath.py +1 -1
  32. numba_cuda/numba/cuda/debuginfo.py +99 -7
  33. numba_cuda/numba/cuda/decorators.py +87 -45
  34. numba_cuda/numba/cuda/descriptor.py +1 -1
  35. numba_cuda/numba/cuda/device_init.py +68 -18
  36. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  37. numba_cuda/numba/cuda/dispatcher.py +300 -213
  38. numba_cuda/numba/cuda/errors.py +13 -10
  39. numba_cuda/numba/cuda/extending.py +55 -1
  40. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  41. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  42. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
  43. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
  44. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  45. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  46. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  47. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  48. numba_cuda/numba/cuda/initialize.py +5 -3
  49. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  50. numba_cuda/numba/cuda/intrinsics.py +203 -28
  51. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  52. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  53. numba_cuda/numba/cuda/libdevice.py +317 -317
  54. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  55. numba_cuda/numba/cuda/locks.py +16 -0
  56. numba_cuda/numba/cuda/lowering.py +43 -0
  57. numba_cuda/numba/cuda/mathimpl.py +62 -57
  58. numba_cuda/numba/cuda/models.py +1 -5
  59. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  60. numba_cuda/numba/cuda/printimpl.py +9 -5
  61. numba_cuda/numba/cuda/random.py +46 -36
  62. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  63. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  64. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  65. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  66. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  67. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  68. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  69. numba_cuda/numba/cuda/simulator/api.py +38 -22
  70. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  71. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  72. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  73. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  74. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  75. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  76. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  77. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  78. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  79. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  80. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  81. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  82. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  83. numba_cuda/numba/cuda/simulator_init.py +2 -4
  84. numba_cuda/numba/cuda/stubs.py +134 -108
  85. numba_cuda/numba/cuda/target.py +92 -47
  86. numba_cuda/numba/cuda/testing.py +24 -19
  87. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  88. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  89. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  90. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  91. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  92. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  93. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  94. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  95. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  96. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  97. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  98. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  99. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  100. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  101. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  102. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  103. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  104. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  105. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  106. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  107. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  108. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  109. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  110. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  111. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  112. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  113. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  114. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  115. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  116. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  117. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  118. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  119. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
  120. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  121. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  123. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  124. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  125. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  126. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  127. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
  129. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  130. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  131. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  132. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  133. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  134. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  135. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  136. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  137. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  138. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  139. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  140. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  141. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  142. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  143. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
  144. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  145. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  146. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
  147. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  148. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  149. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
  150. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  151. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  152. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  153. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  154. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  155. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  156. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  157. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  158. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  159. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  161. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  162. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  163. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  164. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  165. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
  166. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  167. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  168. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  169. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  170. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  171. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  172. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  173. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  174. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  175. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  176. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  178. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  179. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  180. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  181. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  182. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  183. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  184. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  185. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  186. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  187. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  188. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  189. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  190. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  191. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  192. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  193. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  194. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  195. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  196. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  197. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  198. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  199. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  200. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  201. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  202. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  203. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  204. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  205. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
  206. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  207. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  208. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  209. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  210. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  211. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  212. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  213. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  214. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  216. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  217. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  218. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  219. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  220. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  221. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  222. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  223. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  224. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  225. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  226. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  227. numba_cuda/numba/cuda/types.py +5 -2
  228. numba_cuda/numba/cuda/ufuncs.py +382 -362
  229. numba_cuda/numba/cuda/utils.py +2 -2
  230. numba_cuda/numba/cuda/vector_types.py +5 -3
  231. numba_cuda/numba/cuda/vectorizers.py +38 -33
  232. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
  233. numba_cuda-0.10.0.dist-info/RECORD +263 -0
  234. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
  235. numba_cuda-0.8.1.dist-info/RECORD +0 -251
  236. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
  237. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -4,8 +4,7 @@ from numba.cuda import compile_ptx
4
4
  from numba.core.types import f2, i1, i2, i4, i8, u1, u2, u4, u8
5
5
  from numba import cuda
6
6
  from numba.core import types
7
- from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
8
- skip_unless_cc_53)
7
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim, skip_unless_cc_53
9
8
  from numba.types import float16, float32
10
9
  import itertools
11
10
  import unittest
@@ -50,7 +49,7 @@ def to_uint64(x):
50
49
  def to_float16(x):
51
50
  # When division and operators on float16 types are supported, this should
52
51
  # be changed to match the implementation in to_float32.
53
- return (np.float16(x) * np.float16(0.5))
52
+ return np.float16(x) * np.float16(0.5)
54
53
 
55
54
 
56
55
  def to_float32(x):
@@ -76,6 +75,7 @@ def to_complex128(x):
76
75
  # - The device version uses cuda.fp16.hmul
77
76
  # - The host version uses the * operator
78
77
 
78
+
79
79
  def cuda_int_literal_to_float16(x):
80
80
  # Note that we need to use `2` and not `np.float16(2)` to ensure that this
81
81
  # types as a literal int and not a const float16.
@@ -128,7 +128,7 @@ class TestCasting(CUDATestCase):
128
128
  self.assertEqual(cfunc(-12.3), pyfunc(-12.3))
129
129
  self.assertEqual(cfunc(-12.3), int(-12.3))
130
130
 
131
- @skip_on_cudasim('Compilation unsupported in the simulator')
131
+ @skip_on_cudasim("Compilation unsupported in the simulator")
132
132
  def test_float16_to_int_ptx(self):
133
133
  pyfuncs = (to_int8, to_int16, to_int32, to_int64)
134
134
  sizes = (8, 16, 32, 64)
@@ -150,7 +150,7 @@ class TestCasting(CUDATestCase):
150
150
  self.assertEqual(cfunc(12.3), pyfunc(12.3))
151
151
  self.assertEqual(cfunc(12.3), int(12.3))
152
152
 
153
- @skip_on_cudasim('Compilation unsupported in the simulator')
153
+ @skip_on_cudasim("Compilation unsupported in the simulator")
154
154
  def test_float16_to_uint_ptx(self):
155
155
  pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64)
156
156
  sizes = (8, 16, 32, 64)
@@ -171,17 +171,18 @@ class TestCasting(CUDATestCase):
171
171
 
172
172
  @skip_unless_cc_53
173
173
  def test_literal_to_float16(self):
174
- cudafuncs = (cuda_int_literal_to_float16,
175
- cuda_float_literal_to_float16)
176
- hostfuncs = (reference_int_literal_to_float16,
177
- reference_float_literal_to_float16)
174
+ cudafuncs = (cuda_int_literal_to_float16, cuda_float_literal_to_float16)
175
+ hostfuncs = (
176
+ reference_int_literal_to_float16,
177
+ reference_float_literal_to_float16,
178
+ )
178
179
 
179
180
  for cudafunc, hostfunc in zip(cudafuncs, hostfuncs):
180
181
  with self.subTest(func=cudafunc):
181
182
  cfunc = self._create_wrapped(cudafunc, np.float16, np.float16)
182
183
  self.assertEqual(cfunc(321), hostfunc(321))
183
184
 
184
- @skip_on_cudasim('Compilation unsupported in the simulator')
185
+ @skip_on_cudasim("Compilation unsupported in the simulator")
185
186
  def test_int_to_float16_ptx(self):
186
187
  fromtys = (i1, i2, i4, i8)
187
188
  sizes = (8, 16, 32, 64)
@@ -190,7 +191,7 @@ class TestCasting(CUDATestCase):
190
191
  ptx, _ = compile_ptx(to_float16, (ty,), device=True)
191
192
  self.assertIn(f"cvt.rn.f16.s{size}", ptx)
192
193
 
193
- @skip_on_cudasim('Compilation unsupported in the simulator')
194
+ @skip_on_cudasim("Compilation unsupported in the simulator")
194
195
  def test_uint_to_float16_ptx(self):
195
196
  fromtys = (u1, u2, u4, u8)
196
197
  sizes = (8, 16, 32, 64)
@@ -211,12 +212,14 @@ class TestCasting(CUDATestCase):
211
212
  # the CUDA target doesn't yet implement division (or operators)
212
213
  # for float16 values, so we test by comparing with the computed
213
214
  # expression instead.
214
- np.testing.assert_allclose(cfunc(12.3),
215
- toty(12.3) / toty(2), rtol=0.0003)
216
- np.testing.assert_allclose(cfunc(-12.3),
217
- toty(-12.3) / toty(2), rtol=0.0003)
218
-
219
- @skip_on_cudasim('Compilation unsupported in the simulator')
215
+ np.testing.assert_allclose(
216
+ cfunc(12.3), toty(12.3) / toty(2), rtol=0.0003
217
+ )
218
+ np.testing.assert_allclose(
219
+ cfunc(-12.3), toty(-12.3) / toty(2), rtol=0.0003
220
+ )
221
+
222
+ @skip_on_cudasim("Compilation unsupported in the simulator")
220
223
  def test_float16_to_float_ptx(self):
221
224
  pyfuncs = (to_float32, to_float64)
222
225
  postfixes = ("f32", "f64")
@@ -239,12 +242,14 @@ class TestCasting(CUDATestCase):
239
242
  # to match the casting that is automatically applied when
240
243
  # passing the input to the cfunc as part of wrapping it in
241
244
  # an array of type fromtype.
242
- np.testing.assert_allclose(cfunc(3.21),
243
- pyfunc(fromty(3.21)))
244
- np.testing.assert_allclose(cfunc(-3.21),
245
- pyfunc(fromty(-3.21)) + 0j)
246
-
247
- @skip_on_cudasim('Compilation unsupported in the simulator')
245
+ np.testing.assert_allclose(
246
+ cfunc(3.21), pyfunc(fromty(3.21))
247
+ )
248
+ np.testing.assert_allclose(
249
+ cfunc(-3.21), pyfunc(fromty(-3.21)) + 0j
250
+ )
251
+
252
+ @skip_on_cudasim("Compilation unsupported in the simulator")
248
253
  def test_native_cast(self):
249
254
  float32_ptx, _ = cuda.compile_ptx(native_cast, (float32,), device=True)
250
255
  self.assertIn("st.f32", float32_ptx)
@@ -253,5 +258,5 @@ class TestCasting(CUDATestCase):
253
258
  self.assertIn("st.u16", float16_ptx)
254
259
 
255
260
 
256
- if __name__ == '__main__':
261
+ if __name__ == "__main__":
257
262
  unittest.main()
@@ -1,21 +1,26 @@
1
1
  import numpy as np
2
2
 
3
3
  from numba import cuda, types
4
- from numba.cuda.testing import (skip_on_cudasim, test_data_dir, unittest,
5
- CUDATestCase)
4
+ from numba.cuda.testing import (
5
+ skip_on_cudasim,
6
+ test_data_dir,
7
+ unittest,
8
+ CUDATestCase,
9
+ )
6
10
  from numba.tests.support import skip_unless_cffi
7
11
 
8
12
 
9
13
  @skip_unless_cffi
10
- @skip_on_cudasim('Simulator does not support linking')
14
+ @skip_on_cudasim("Simulator does not support linking")
11
15
  class TestCFFI(CUDATestCase):
12
16
  def test_from_buffer(self):
13
17
  import cffi
18
+
14
19
  ffi = cffi.FFI()
15
20
 
16
- link = str(test_data_dir / 'jitlink.ptx')
21
+ link = str(test_data_dir / "jitlink.ptx")
17
22
  sig = types.void(types.CPointer(types.int32))
18
- array_mutator = cuda.declare_device('array_mutator', sig)
23
+ array_mutator = cuda.declare_device("array_mutator", sig)
19
24
 
20
25
  @cuda.jit(link=[link])
21
26
  def mutate_array(x):
@@ -29,5 +34,5 @@ class TestCFFI(CUDATestCase):
29
34
  self.assertEqual(x[0], x[1])
30
35
 
31
36
 
32
- if __name__ == '__main__':
37
+ if __name__ == "__main__":
33
38
  unittest.main()
@@ -1,7 +1,11 @@
1
1
  from math import sqrt
2
2
  from numba import cuda, float32, int16, int32, int64, uint32, void
3
- from numba.cuda import (compile, compile_for_current_device, compile_ptx,
4
- compile_ptx_for_current_device)
3
+ from numba.cuda import (
4
+ compile,
5
+ compile_for_current_device,
6
+ compile_ptx,
7
+ compile_ptx_for_current_device,
8
+ )
5
9
  from numba.cuda.cudadrv import runtime
6
10
  from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
7
11
 
@@ -12,7 +16,7 @@ def f_module(x, y):
12
16
  return x + y
13
17
 
14
18
 
15
- @skip_on_cudasim('Compilation unsupported in the simulator')
19
+ @skip_on_cudasim("Compilation unsupported in the simulator")
16
20
  class TestCompile(unittest.TestCase):
17
21
  def test_global_kernel(self):
18
22
  def f(r, x, y):
@@ -24,11 +28,11 @@ class TestCompile(unittest.TestCase):
24
28
  ptx, resty = compile_ptx(f, args)
25
29
 
26
30
  # Kernels should not have a func_retval parameter
27
- self.assertNotIn('func_retval', ptx)
31
+ self.assertNotIn("func_retval", ptx)
28
32
  # .visible .func is used to denote a device function
29
- self.assertNotIn('.visible .func', ptx)
33
+ self.assertNotIn(".visible .func", ptx)
30
34
  # .visible .entry would denote the presence of a global function
31
- self.assertIn('.visible .entry', ptx)
35
+ self.assertIn(".visible .entry", ptx)
32
36
  # Return type for kernels should always be void
33
37
  self.assertEqual(resty, void)
34
38
 
@@ -41,11 +45,11 @@ class TestCompile(unittest.TestCase):
41
45
 
42
46
  # Device functions take a func_retval parameter for storing the
43
47
  # returned value in by reference
44
- self.assertIn('func_retval', ptx)
48
+ self.assertIn("func_retval", ptx)
45
49
  # .visible .func is used to denote a device function
46
- self.assertIn('.visible .func', ptx)
50
+ self.assertIn(".visible .func", ptx)
47
51
  # .visible .entry would denote the presence of a global function
48
- self.assertNotIn('.visible .entry', ptx)
52
+ self.assertNotIn(".visible .entry", ptx)
49
53
  # Inferred return type as expected?
50
54
  self.assertEqual(resty, float32)
51
55
 
@@ -71,21 +75,21 @@ class TestCompile(unittest.TestCase):
71
75
 
72
76
  # Without fastmath, fma contraction is enabled by default, but ftz and
73
77
  # approximate div / sqrt is not.
74
- self.assertIn('fma.rn.f32', ptx)
75
- self.assertIn('div.rn.f32', ptx)
76
- self.assertIn('sqrt.rn.f32', ptx)
78
+ self.assertIn("fma.rn.f32", ptx)
79
+ self.assertIn("div.rn.f32", ptx)
80
+ self.assertIn("sqrt.rn.f32", ptx)
77
81
 
78
82
  ptx, resty = compile_ptx(f, args, device=True, fastmath=True)
79
83
 
80
84
  # With fastmath, ftz and approximate div / sqrt are enabled
81
- self.assertIn('fma.rn.ftz.f32', ptx)
82
- self.assertIn('div.approx.ftz.f32', ptx)
83
- self.assertIn('sqrt.approx.ftz.f32', ptx)
85
+ self.assertIn("fma.rn.ftz.f32", ptx)
86
+ self.assertIn("div.approx.ftz.f32", ptx)
87
+ self.assertIn("sqrt.approx.ftz.f32", ptx)
84
88
 
85
89
  def check_debug_info(self, ptx):
86
90
  # A debug_info section should exist in the PTX. Whitespace varies
87
91
  # between CUDA toolkit versions.
88
- self.assertRegex(ptx, '\\.section\\s+\\.debug_info')
92
+ self.assertRegex(ptx, "\\.section\\s+\\.debug_info")
89
93
  # A .file directive should be produced and include the name of the
90
94
  # source. The path and whitespace may vary, so we accept anything
91
95
  # ending in the filename of this module.
@@ -136,23 +140,25 @@ class TestCompile(unittest.TestCase):
136
140
  def f(x, y):
137
141
  return x[0] + y[0]
138
142
 
139
- with self.assertRaisesRegex(TypeError, 'must have void return type'):
143
+ with self.assertRaisesRegex(TypeError, "must have void return type"):
140
144
  compile_ptx(f, (uint32[::1], uint32[::1]))
141
145
 
142
146
  def test_c_abi_disallowed_for_kernel(self):
143
147
  def f(x, y):
144
148
  return x + y
145
149
 
146
- with self.assertRaisesRegex(NotImplementedError,
147
- "The C ABI is not supported for kernels"):
150
+ with self.assertRaisesRegex(
151
+ NotImplementedError, "The C ABI is not supported for kernels"
152
+ ):
148
153
  compile_ptx(f, (int32, int32), abi="c")
149
154
 
150
155
  def test_unsupported_abi(self):
151
156
  def f(x, y):
152
157
  return x + y
153
158
 
154
- with self.assertRaisesRegex(NotImplementedError,
155
- "Unsupported ABI: fastcall"):
159
+ with self.assertRaisesRegex(
160
+ NotImplementedError, "Unsupported ABI: fastcall"
161
+ ):
156
162
  compile_ptx(f, (int32, int32), abi="fastcall")
157
163
 
158
164
  def test_c_abi_device_function(self):
@@ -166,8 +172,11 @@ class TestCompile(unittest.TestCase):
166
172
  # The function name should match the Python function name (not the
167
173
  # qualname, which includes additional info), and its return value
168
174
  # should be 32 bits
169
- self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
170
- r"func_retval0\)\s+f\(")
175
+ self.assertRegex(
176
+ ptx,
177
+ r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
178
+ r"func_retval0\)\s+f\(",
179
+ )
171
180
 
172
181
  # If we compile for 64-bit integers, the return type should be 64 bits
173
182
  # wide
@@ -175,44 +184,60 @@ class TestCompile(unittest.TestCase):
175
184
  self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b64")
176
185
 
177
186
  def test_c_abi_device_function_module_scope(self):
178
- ptx, resty = compile_ptx(f_module, int32(int32, int32), device=True,
179
- abi="c")
187
+ ptx, resty = compile_ptx(
188
+ f_module, int32(int32, int32), device=True, abi="c"
189
+ )
180
190
 
181
191
  # The function name should match the Python function name, and its
182
192
  # return value should be 32 bits
183
- self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
184
- r"func_retval0\)\s+f_module\(")
193
+ self.assertRegex(
194
+ ptx,
195
+ r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
196
+ r"func_retval0\)\s+f_module\(",
197
+ )
185
198
 
186
199
  def test_c_abi_with_abi_name(self):
187
- abi_info = {'abi_name': '_Z4funcii'}
188
- ptx, resty = compile_ptx(f_module, int32(int32, int32), device=True,
189
- abi="c", abi_info=abi_info)
200
+ abi_info = {"abi_name": "_Z4funcii"}
201
+ ptx, resty = compile_ptx(
202
+ f_module,
203
+ int32(int32, int32),
204
+ device=True,
205
+ abi="c",
206
+ abi_info=abi_info,
207
+ )
190
208
 
191
209
  # The function name should match the one given in the ABI info, and its
192
210
  # return value should be 32 bits
193
- self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
194
- r"func_retval0\)\s+_Z4funcii\(")
211
+ self.assertRegex(
212
+ ptx,
213
+ r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
214
+ r"func_retval0\)\s+_Z4funcii\(",
215
+ )
195
216
 
196
217
  def test_compile_defaults_to_c_abi(self):
197
218
  ptx, resty = compile(f_module, int32(int32, int32), device=True)
198
219
 
199
220
  # The function name should match the Python function name, and its
200
221
  # return value should be 32 bits
201
- self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
202
- r"func_retval0\)\s+f_module\(")
222
+ self.assertRegex(
223
+ ptx,
224
+ r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
225
+ r"func_retval0\)\s+f_module\(",
226
+ )
203
227
 
204
228
  def test_compile_to_ltoir(self):
205
229
  if runtime.get_version() < (11, 5):
206
230
  self.skipTest("-gen-lto unavailable in this toolkit version")
207
231
 
208
- ltoir, resty = compile(f_module, int32(int32, int32), device=True,
209
- output="ltoir")
232
+ ltoir, resty = compile(
233
+ f_module, int32(int32, int32), device=True, output="ltoir"
234
+ )
210
235
 
211
236
  # There are no tools to interpret the LTOIR output, but we can check
212
237
  # that we appear to have obtained an LTOIR file. This magic number is
213
238
  # not documented, but is expected to remain consistent.
214
239
  LTOIR_MAGIC = 0x7F4E43ED
215
- header = int.from_bytes(ltoir[:4], byteorder='little')
240
+ header = int.from_bytes(ltoir[:4], byteorder="little")
216
241
  self.assertEqual(header, LTOIR_MAGIC)
217
242
  self.assertEqual(resty, int32)
218
243
 
@@ -220,11 +245,15 @@ class TestCompile(unittest.TestCase):
220
245
  illegal_output = "illegal"
221
246
  msg = f"Unsupported output type: {illegal_output}"
222
247
  with self.assertRaisesRegex(NotImplementedError, msg):
223
- compile(f_module, int32(int32, int32), device=True,
224
- output=illegal_output)
248
+ compile(
249
+ f_module,
250
+ int32(int32, int32),
251
+ device=True,
252
+ output=illegal_output,
253
+ )
225
254
 
226
255
 
227
- @skip_on_cudasim('Compilation unsupported in the simulator')
256
+ @skip_on_cudasim("Compilation unsupported in the simulator")
228
257
  class TestCompileForCurrentDevice(CUDATestCase):
229
258
  def _check_ptx_for_current_device(self, compile_function):
230
259
  def add(x, y):
@@ -237,7 +266,7 @@ class TestCompileForCurrentDevice(CUDATestCase):
237
266
  # closest compute capability supported by the current toolkit.
238
267
  device_cc = cuda.get_current_device().compute_capability
239
268
  cc = cuda.cudadrv.nvvm.find_closest_arch(device_cc)
240
- target = f'.target sm_{cc[0]}{cc[1]}'
269
+ target = f".target sm_{cc[0]}{cc[1]}"
241
270
  self.assertIn(target, ptx)
242
271
 
243
272
  def test_compile_ptx_for_current_device(self):
@@ -247,10 +276,10 @@ class TestCompileForCurrentDevice(CUDATestCase):
247
276
  self._check_ptx_for_current_device(compile_for_current_device)
248
277
 
249
278
 
250
- @skip_on_cudasim('Compilation unsupported in the simulator')
279
+ @skip_on_cudasim("Compilation unsupported in the simulator")
251
280
  class TestCompileOnlyTests(unittest.TestCase):
252
- '''For tests where we can only check correctness by examining the compiler
253
- output rather than observing the effects of execution.'''
281
+ """For tests where we can only check correctness by examining the compiler
282
+ output rather than observing the effects of execution."""
254
283
 
255
284
  def test_nanosleep(self):
256
285
  def use_nanosleep(x):
@@ -262,15 +291,20 @@ class TestCompileOnlyTests(unittest.TestCase):
262
291
  ptx, resty = compile_ptx(use_nanosleep, (uint32,), cc=(7, 0))
263
292
 
264
293
  nanosleep_count = 0
265
- for line in ptx.split('\n'):
266
- if 'nanosleep.u32' in line:
294
+ for line in ptx.split("\n"):
295
+ if "nanosleep.u32" in line:
267
296
  nanosleep_count += 1
268
297
 
269
298
  expected = 2
270
- self.assertEqual(expected, nanosleep_count,
271
- (f'Got {nanosleep_count} nanosleep instructions, '
272
- f'expected {expected}'))
299
+ self.assertEqual(
300
+ expected,
301
+ nanosleep_count,
302
+ (
303
+ f"Got {nanosleep_count} nanosleep instructions, "
304
+ f"expected {expected}"
305
+ ),
306
+ )
273
307
 
274
308
 
275
- if __name__ == '__main__':
309
+ if __name__ == "__main__":
276
310
  unittest.main()