numba-cuda 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +232 -113
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_fp16.h +661 -661
  13. numba_cuda/numba/cuda/cuda_fp16.hpp +3 -3
  14. numba_cuda/numba/cuda/cuda_paths.py +291 -99
  15. numba_cuda/numba/cuda/cudadecl.py +125 -69
  16. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  17. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  18. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  19. numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
  20. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  21. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  22. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  23. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  24. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  25. numba_cuda/numba/cuda/cudadrv/linkable_code.py +16 -1
  26. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  27. numba_cuda/numba/cuda/cudadrv/nvrtc.py +138 -29
  28. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  29. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  30. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  31. numba_cuda/numba/cuda/cudaimpl.py +317 -233
  32. numba_cuda/numba/cuda/cudamath.py +1 -1
  33. numba_cuda/numba/cuda/debuginfo.py +8 -6
  34. numba_cuda/numba/cuda/decorators.py +75 -45
  35. numba_cuda/numba/cuda/descriptor.py +1 -1
  36. numba_cuda/numba/cuda/device_init.py +69 -18
  37. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  38. numba_cuda/numba/cuda/dispatcher.py +300 -213
  39. numba_cuda/numba/cuda/errors.py +13 -10
  40. numba_cuda/numba/cuda/extending.py +1 -1
  41. numba_cuda/numba/cuda/initialize.py +5 -3
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +3 -3
  43. numba_cuda/numba/cuda/intrinsics.py +31 -27
  44. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  45. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  46. numba_cuda/numba/cuda/libdevice.py +317 -317
  47. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  48. numba_cuda/numba/cuda/locks.py +16 -0
  49. numba_cuda/numba/cuda/mathimpl.py +62 -57
  50. numba_cuda/numba/cuda/models.py +1 -5
  51. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  52. numba_cuda/numba/cuda/printimpl.py +9 -5
  53. numba_cuda/numba/cuda/random.py +46 -36
  54. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  55. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  56. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  57. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  58. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  59. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  60. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  61. numba_cuda/numba/cuda/simulator/api.py +38 -22
  62. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  63. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  64. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  65. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  66. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  67. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  68. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  69. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  70. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  71. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  72. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  73. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  74. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  75. numba_cuda/numba/cuda/simulator_init.py +2 -4
  76. numba_cuda/numba/cuda/stubs.py +139 -102
  77. numba_cuda/numba/cuda/target.py +64 -47
  78. numba_cuda/numba/cuda/testing.py +24 -19
  79. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  80. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  81. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  88. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  89. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  90. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  91. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  92. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  93. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  94. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  95. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  98. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  100. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  101. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  102. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  103. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  104. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  105. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  106. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  107. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  108. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  109. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  110. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  111. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +7 -6
  112. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  113. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  115. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  117. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  118. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  119. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +57 -21
  120. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  121. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  122. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  123. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  124. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  126. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  127. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  128. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  129. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  131. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  132. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  133. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  134. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +31 -28
  135. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  136. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  137. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +6 -7
  138. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  139. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  140. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +19 -12
  141. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  142. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  143. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  144. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  145. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  148. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  149. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  150. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  151. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  152. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  153. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  154. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  155. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +6 -6
  156. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  157. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  158. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  159. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  160. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  161. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  162. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  163. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  164. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  165. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  166. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  167. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  168. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  169. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  170. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  171. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  172. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  173. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  174. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  175. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  176. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  178. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  179. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  180. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  182. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  183. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  184. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  185. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  186. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  187. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  188. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  192. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  193. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  194. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  195. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +31 -25
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  197. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  198. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  199. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  200. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  201. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  202. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  203. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  204. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  206. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  207. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  208. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  209. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  210. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  211. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  212. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  213. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  214. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  215. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  216. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  217. numba_cuda/numba/cuda/types.py +5 -2
  218. numba_cuda/numba/cuda/ufuncs.py +382 -362
  219. numba_cuda/numba/cuda/utils.py +2 -2
  220. numba_cuda/numba/cuda/vector_types.py +2 -2
  221. numba_cuda/numba/cuda/vectorizers.py +37 -32
  222. {numba_cuda-0.8.1.dist-info → numba_cuda-0.9.0.dist-info}/METADATA +1 -1
  223. numba_cuda-0.9.0.dist-info/RECORD +253 -0
  224. {numba_cuda-0.8.1.dist-info → numba_cuda-0.9.0.dist-info}/WHEEL +1 -1
  225. numba_cuda-0.8.1.dist-info/RECORD +0 -251
  226. {numba_cuda-0.8.1.dist-info → numba_cuda-0.9.0.dist-info}/licenses/LICENSE +0 -0
  227. {numba_cuda-0.8.1.dist-info → numba_cuda-0.9.0.dist-info}/top_level.txt +0 -0
@@ -6,8 +6,12 @@ from numba import cuda, int64
6
6
  from numba.cuda import compile_ptx
7
7
  from numba.core.errors import TypingError
8
8
  from numba.core.types import f2
9
- from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim,
10
- skip_unless_cc_53)
9
+ from numba.cuda.testing import (
10
+ unittest,
11
+ CUDATestCase,
12
+ skip_on_cudasim,
13
+ skip_unless_cc_53,
14
+ )
11
15
 
12
16
 
13
17
  def simple_threadidx(ary):
@@ -260,7 +264,6 @@ def simple_hsqrt(r, x):
260
264
 
261
265
 
262
266
  def simple_hrsqrt(r, x):
263
-
264
267
  i = cuda.grid(1)
265
268
 
266
269
  if i < len(r):
@@ -268,7 +271,7 @@ def simple_hrsqrt(r, x):
268
271
 
269
272
 
270
273
  def numpy_hrsqrt(x, dtype):
271
- return x ** -0.5
274
+ return x**-0.5
272
275
 
273
276
 
274
277
  def simple_hceil(r, x):
@@ -404,15 +407,15 @@ class TestCudaIntrinsic(CUDATestCase):
404
407
  f_res = f_contigous()
405
408
  self.assertTrue(np.all(c_res == f_res))
406
409
 
407
- @skip_on_cudasim('Cudasim does not check types')
410
+ @skip_on_cudasim("Cudasim does not check types")
408
411
  def test_nonliteral_grid_error(self):
409
- with self.assertRaisesRegex(TypingError, 'RequireLiteralValue'):
410
- cuda.jit('void(int32)')(nonliteral_grid)
412
+ with self.assertRaisesRegex(TypingError, "RequireLiteralValue"):
413
+ cuda.jit("void(int32)")(nonliteral_grid)
411
414
 
412
- @skip_on_cudasim('Cudasim does not check types')
415
+ @skip_on_cudasim("Cudasim does not check types")
413
416
  def test_nonliteral_gridsize_error(self):
414
- with self.assertRaisesRegex(TypingError, 'RequireLiteralValue'):
415
- cuda.jit('void(int32)')(nonliteral_gridsize)
417
+ with self.assertRaisesRegex(TypingError, "RequireLiteralValue"):
418
+ cuda.jit("void(int32)")(nonliteral_gridsize)
416
419
 
417
420
  def test_simple_grid1d(self):
418
421
  compiled = cuda.jit("void(int32[::1])")(simple_grid1d)
@@ -444,7 +447,7 @@ class TestCudaIntrinsic(CUDATestCase):
444
447
  compiled[nctaid, ntid](ary)
445
448
  self.assertEqual(ary[0], nctaid * ntid)
446
449
 
447
- @skip_on_cudasim('Requires too many threads')
450
+ @skip_on_cudasim("Requires too many threads")
448
451
  def test_issue_9229(self):
449
452
  # Ensure that grid and grid size are correct - #9229 showed that they
450
453
  # overflowed an int32.
@@ -469,7 +472,7 @@ class TestCudaIntrinsic(CUDATestCase):
469
472
  self.assertEqual(grid_error[0], 0)
470
473
  self.assertEqual(gridsize_error[0], 0)
471
474
 
472
- @skip_on_cudasim('Tests PTX emission')
475
+ @skip_on_cudasim("Tests PTX emission")
473
476
  def test_selp(self):
474
477
  sig = (int64[:], int64, int64[:])
475
478
  cu_branching_with_ifs = cuda.jit(sig)(branching_with_ifs)
@@ -485,14 +488,14 @@ class TestCudaIntrinsic(CUDATestCase):
485
488
  a = np.arange(n, dtype=np.int64)
486
489
  cu_branching_with_ifs[n, 1](a, b, c)
487
490
  ptx = cu_branching_with_ifs.inspect_asm(sig)
488
- self.assertEqual(2, len(re.findall(r'\s+bra\s+', ptx)))
489
- np.testing.assert_array_equal(a, expected, err_msg='branching')
491
+ self.assertEqual(2, len(re.findall(r"\s+bra\s+", ptx)))
492
+ np.testing.assert_array_equal(a, expected, err_msg="branching")
490
493
 
491
494
  a = np.arange(n, dtype=np.int64)
492
495
  cu_branching_with_selps[n, 1](a, b, c)
493
496
  ptx = cu_branching_with_selps.inspect_asm(sig)
494
- self.assertEqual(0, len(re.findall(r'\s+bra\s+', ptx)))
495
- np.testing.assert_array_equal(a, expected, err_msg='selp')
497
+ self.assertEqual(0, len(re.findall(r"\s+bra\s+", ptx)))
498
+ np.testing.assert_array_equal(a, expected, err_msg="selp")
496
499
 
497
500
  def test_simple_gridsize2d(self):
498
501
  compiled = cuda.jit("void(int32[::1])")(simple_gridsize2d)
@@ -528,10 +531,10 @@ class TestCudaIntrinsic(CUDATestCase):
528
531
  a, b, c = cuda.gridsize(3)
529
532
  out[x, y, z] = a * b * c
530
533
 
531
- arr = np.zeros(9 ** 3, dtype=np.int32).reshape(9, 9, 9)
534
+ arr = np.zeros(9**3, dtype=np.int32).reshape(9, 9, 9)
532
535
  foo[(3, 3, 3), (3, 3, 3)](arr)
533
536
 
534
- np.testing.assert_equal(arr, 9 ** 3)
537
+ np.testing.assert_equal(arr, 9**3)
535
538
 
536
539
  def test_3dgrid_2(self):
537
540
  @cuda.jit
@@ -539,13 +542,15 @@ class TestCudaIntrinsic(CUDATestCase):
539
542
  x, y, z = cuda.grid(3)
540
543
  a, b, c = cuda.gridsize(3)
541
544
  grid_is_right = (
542
- x == cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x and
543
- y == cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y and
544
- z == cuda.threadIdx.z + cuda.blockIdx.z * cuda.blockDim.z
545
+ x == cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
546
+ and y == cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y
547
+ and z == cuda.threadIdx.z + cuda.blockIdx.z * cuda.blockDim.z
548
+ )
549
+ gridsize_is_right = (
550
+ a == cuda.blockDim.x * cuda.gridDim.x
551
+ and b == cuda.blockDim.y * cuda.gridDim.y
552
+ and c == cuda.blockDim.z * cuda.gridDim.z
545
553
  )
546
- gridsize_is_right = (a == cuda.blockDim.x * cuda.gridDim.x and
547
- b == cuda.blockDim.y * cuda.gridDim.y and
548
- c == cuda.blockDim.z * cuda.gridDim.z)
549
554
  out[x, y, z] = grid_is_right and gridsize_is_right
550
555
 
551
556
  x, y, z = (4 * 3, 3 * 2, 2 * 4)
@@ -605,21 +610,21 @@ class TestCudaIntrinsic(CUDATestCase):
605
610
  def test_fma_f4(self):
606
611
  compiled = cuda.jit("void(f4[:], f4, f4, f4)")(simple_fma)
607
612
  ary = np.zeros(1, dtype=np.float32)
608
- compiled[1, 1](ary, 2., 3., 4.)
613
+ compiled[1, 1](ary, 2.0, 3.0, 4.0)
609
614
  np.testing.assert_allclose(ary[0], 2 * 3 + 4)
610
615
 
611
616
  def test_fma_f8(self):
612
617
  compiled = cuda.jit("void(f8[:], f8, f8, f8)")(simple_fma)
613
618
  ary = np.zeros(1, dtype=np.float64)
614
- compiled[1, 1](ary, 2., 3., 4.)
619
+ compiled[1, 1](ary, 2.0, 3.0, 4.0)
615
620
  np.testing.assert_allclose(ary[0], 2 * 3 + 4)
616
621
 
617
622
  @skip_unless_cc_53
618
623
  def test_hadd(self):
619
624
  compiled = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hadd)
620
625
  ary = np.zeros(1, dtype=np.float16)
621
- arg1 = np.array([3.], dtype=np.float16)
622
- arg2 = np.array([4.], dtype=np.float16)
626
+ arg1 = np.array([3.0], dtype=np.float16)
627
+ arg2 = np.array([4.0], dtype=np.float16)
623
628
  compiled[1, 1](ary, arg1, arg2)
624
629
  np.testing.assert_allclose(ary[0], arg1 + arg2)
625
630
 
@@ -628,24 +633,24 @@ class TestCudaIntrinsic(CUDATestCase):
628
633
  compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hadd_scalar)
629
634
  ary = np.zeros(1, dtype=np.float16)
630
635
  arg1 = np.float16(3.1415926)
631
- arg2 = np.float16(3.)
636
+ arg2 = np.float16(3.0)
632
637
  compiled[1, 1](ary, arg1, arg2)
633
638
  ref = arg1 + arg2
634
639
  np.testing.assert_allclose(ary[0], ref)
635
640
 
636
- @skip_on_cudasim('Compilation unsupported in the simulator')
641
+ @skip_on_cudasim("Compilation unsupported in the simulator")
637
642
  def test_hadd_ptx(self):
638
643
  args = (f2[:], f2, f2)
639
644
  ptx, _ = compile_ptx(simple_hadd_scalar, args, cc=(5, 3))
640
- self.assertIn('add.f16', ptx)
645
+ self.assertIn("add.f16", ptx)
641
646
 
642
647
  @skip_unless_cc_53
643
648
  def test_hfma(self):
644
649
  compiled = cuda.jit("void(f2[:], f2[:], f2[:], f2[:])")(simple_hfma)
645
650
  ary = np.zeros(1, dtype=np.float16)
646
- arg1 = np.array([2.], dtype=np.float16)
647
- arg2 = np.array([3.], dtype=np.float16)
648
- arg3 = np.array([4.], dtype=np.float16)
651
+ arg1 = np.array([2.0], dtype=np.float16)
652
+ arg2 = np.array([3.0], dtype=np.float16)
653
+ arg3 = np.array([4.0], dtype=np.float16)
649
654
  compiled[1, 1](ary, arg1, arg2, arg3)
650
655
  np.testing.assert_allclose(ary[0], arg1 * arg2 + arg3)
651
656
 
@@ -653,25 +658,25 @@ class TestCudaIntrinsic(CUDATestCase):
653
658
  def test_hfma_scalar(self):
654
659
  compiled = cuda.jit("void(f2[:], f2, f2, f2)")(simple_hfma_scalar)
655
660
  ary = np.zeros(1, dtype=np.float16)
656
- arg1 = np.float16(2.)
657
- arg2 = np.float16(3.)
658
- arg3 = np.float16(4.)
661
+ arg1 = np.float16(2.0)
662
+ arg2 = np.float16(3.0)
663
+ arg3 = np.float16(4.0)
659
664
  compiled[1, 1](ary, arg1, arg2, arg3)
660
665
  ref = arg1 * arg2 + arg3
661
666
  np.testing.assert_allclose(ary[0], ref)
662
667
 
663
- @skip_on_cudasim('Compilation unsupported in the simulator')
668
+ @skip_on_cudasim("Compilation unsupported in the simulator")
664
669
  def test_hfma_ptx(self):
665
670
  args = (f2[:], f2, f2, f2)
666
671
  ptx, _ = compile_ptx(simple_hfma_scalar, args, cc=(5, 3))
667
- self.assertIn('fma.rn.f16', ptx)
672
+ self.assertIn("fma.rn.f16", ptx)
668
673
 
669
674
  @skip_unless_cc_53
670
675
  def test_hsub(self):
671
676
  compiled = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hsub)
672
677
  ary = np.zeros(1, dtype=np.float16)
673
- arg1 = np.array([3.], dtype=np.float16)
674
- arg2 = np.array([4.], dtype=np.float16)
678
+ arg1 = np.array([3.0], dtype=np.float16)
679
+ arg2 = np.array([4.0], dtype=np.float16)
675
680
  compiled[1, 1](ary, arg1, arg2)
676
681
  np.testing.assert_allclose(ary[0], arg1 - arg2)
677
682
 
@@ -685,18 +690,18 @@ class TestCudaIntrinsic(CUDATestCase):
685
690
  ref = arg1 - arg2
686
691
  np.testing.assert_allclose(ary[0], ref)
687
692
 
688
- @skip_on_cudasim('Compilation unsupported in the simulator')
693
+ @skip_on_cudasim("Compilation unsupported in the simulator")
689
694
  def test_hsub_ptx(self):
690
695
  args = (f2[:], f2, f2)
691
696
  ptx, _ = compile_ptx(simple_hsub_scalar, args, cc=(5, 3))
692
- self.assertIn('sub.f16', ptx)
697
+ self.assertIn("sub.f16", ptx)
693
698
 
694
699
  @skip_unless_cc_53
695
700
  def test_hmul(self):
696
701
  compiled = cuda.jit()(simple_hmul)
697
702
  ary = np.zeros(1, dtype=np.float16)
698
- arg1 = np.array([3.], dtype=np.float16)
699
- arg2 = np.array([4.], dtype=np.float16)
703
+ arg1 = np.array([3.0], dtype=np.float16)
704
+ arg2 = np.array([4.0], dtype=np.float16)
700
705
  compiled[1, 1](ary, arg1, arg2)
701
706
  np.testing.assert_allclose(ary[0], arg1 * arg2)
702
707
 
@@ -710,11 +715,11 @@ class TestCudaIntrinsic(CUDATestCase):
710
715
  ref = arg1 * arg2
711
716
  np.testing.assert_allclose(ary[0], ref)
712
717
 
713
- @skip_on_cudasim('Compilation unsupported in the simulator')
718
+ @skip_on_cudasim("Compilation unsupported in the simulator")
714
719
  def test_hmul_ptx(self):
715
720
  args = (f2[:], f2, f2)
716
721
  ptx, _ = compile_ptx(simple_hmul_scalar, args, cc=(5, 3))
717
- self.assertIn('mul.f16', ptx)
722
+ self.assertIn("mul.f16", ptx)
718
723
 
719
724
  @skip_unless_cc_53
720
725
  def test_hdiv_scalar(self):
@@ -742,7 +747,7 @@ class TestCudaIntrinsic(CUDATestCase):
742
747
  def test_hneg(self):
743
748
  compiled = cuda.jit("void(f2[:], f2[:])")(simple_hneg)
744
749
  ary = np.zeros(1, dtype=np.float16)
745
- arg1 = np.array([3.], dtype=np.float16)
750
+ arg1 = np.array([3.0], dtype=np.float16)
746
751
  compiled[1, 1](ary, arg1)
747
752
  np.testing.assert_allclose(ary[0], -arg1)
748
753
 
@@ -755,17 +760,17 @@ class TestCudaIntrinsic(CUDATestCase):
755
760
  ref = -arg1
756
761
  np.testing.assert_allclose(ary[0], ref)
757
762
 
758
- @skip_on_cudasim('Compilation unsupported in the simulator')
763
+ @skip_on_cudasim("Compilation unsupported in the simulator")
759
764
  def test_hneg_ptx(self):
760
765
  args = (f2[:], f2)
761
766
  ptx, _ = compile_ptx(simple_hneg_scalar, args, cc=(5, 3))
762
- self.assertIn('neg.f16', ptx)
767
+ self.assertIn("neg.f16", ptx)
763
768
 
764
769
  @skip_unless_cc_53
765
770
  def test_habs(self):
766
771
  compiled = cuda.jit()(simple_habs)
767
772
  ary = np.zeros(1, dtype=np.float16)
768
- arg1 = np.array([-3.], dtype=np.float16)
773
+ arg1 = np.array([-3.0], dtype=np.float16)
769
774
  compiled[1, 1](ary, arg1)
770
775
  np.testing.assert_allclose(ary[0], abs(arg1))
771
776
 
@@ -778,25 +783,43 @@ class TestCudaIntrinsic(CUDATestCase):
778
783
  ref = abs(arg1)
779
784
  np.testing.assert_allclose(ary[0], ref)
780
785
 
781
- @skip_on_cudasim('Compilation unsupported in the simulator')
786
+ @skip_on_cudasim("Compilation unsupported in the simulator")
782
787
  def test_habs_ptx(self):
783
788
  args = (f2[:], f2)
784
789
  ptx, _ = compile_ptx(simple_habs_scalar, args, cc=(5, 3))
785
- self.assertIn('abs.f16', ptx)
790
+ self.assertIn("abs.f16", ptx)
786
791
 
787
792
  @skip_unless_cc_53
788
793
  def test_fp16_intrinsics_common(self):
789
- kernels = (simple_hsin, simple_hcos,
790
- simple_hlog, simple_hlog2, simple_hlog10,
791
- simple_hsqrt, simple_hceil, simple_hfloor,
792
- simple_hrcp, simple_htrunc, simple_hrint,
793
- simple_hrsqrt)
794
+ kernels = (
795
+ simple_hsin,
796
+ simple_hcos,
797
+ simple_hlog,
798
+ simple_hlog2,
799
+ simple_hlog10,
800
+ simple_hsqrt,
801
+ simple_hceil,
802
+ simple_hfloor,
803
+ simple_hrcp,
804
+ simple_htrunc,
805
+ simple_hrint,
806
+ simple_hrsqrt,
807
+ )
794
808
  exp_kernels = (simple_hexp, simple_hexp2)
795
- expected_functions = (np.sin, np.cos,
796
- np.log, np.log2, np.log10,
797
- np.sqrt, np.ceil, np.floor,
798
- np.reciprocal, np.trunc, np.rint,
799
- numpy_hrsqrt)
809
+ expected_functions = (
810
+ np.sin,
811
+ np.cos,
812
+ np.log,
813
+ np.log2,
814
+ np.log10,
815
+ np.sqrt,
816
+ np.ceil,
817
+ np.floor,
818
+ np.reciprocal,
819
+ np.trunc,
820
+ np.rint,
821
+ numpy_hrsqrt,
822
+ )
800
823
  expected_exp_functions = (np.exp, np.exp2)
801
824
 
802
825
  # Generate random data
@@ -807,7 +830,7 @@ class TestCudaIntrinsic(CUDATestCase):
807
830
  for kernel, fn in zip(kernels, expected_functions):
808
831
  with self.subTest(fn=fn):
809
832
  kernel = cuda.jit("void(f2[:], f2[:])")(kernel)
810
- kernel[1,N](r, x)
833
+ kernel[1, N](r, x)
811
834
  expected = fn(x, dtype=np.float16)
812
835
  np.testing.assert_allclose(r, expected)
813
836
 
@@ -815,7 +838,7 @@ class TestCudaIntrinsic(CUDATestCase):
815
838
  for kernel, fn in zip(exp_kernels, expected_exp_functions):
816
839
  with self.subTest(fn=fn):
817
840
  kernel = cuda.jit("void(f2[:], f2[:])")(kernel)
818
- kernel[1,N](r, x2)
841
+ kernel[1, N](r, x2)
819
842
  expected = fn(x2, dtype=np.float16)
820
843
  np.testing.assert_allclose(r, expected)
821
844
 
@@ -836,14 +859,26 @@ class TestCudaIntrinsic(CUDATestCase):
836
859
 
837
860
  # Run the kernel
838
861
  hexp10_vectors[1, N](r, x)
839
- np.testing.assert_allclose(r, 10 ** x)
862
+ np.testing.assert_allclose(r, 10**x)
840
863
 
841
864
  @skip_unless_cc_53
842
865
  def test_fp16_comparison(self):
843
- fns = (simple_heq_scalar, simple_hne_scalar, simple_hge_scalar,
844
- simple_hgt_scalar, simple_hle_scalar, simple_hlt_scalar)
845
- ops = (operator.eq, operator.ne, operator.ge,
846
- operator.gt, operator.le, operator.lt)
866
+ fns = (
867
+ simple_heq_scalar,
868
+ simple_hne_scalar,
869
+ simple_hge_scalar,
870
+ simple_hgt_scalar,
871
+ simple_hle_scalar,
872
+ simple_hlt_scalar,
873
+ )
874
+ ops = (
875
+ operator.eq,
876
+ operator.ne,
877
+ operator.ge,
878
+ operator.gt,
879
+ operator.le,
880
+ operator.lt,
881
+ )
847
882
 
848
883
  for fn, op in zip(fns, ops):
849
884
  with self.subTest(op=op):
@@ -872,18 +907,20 @@ class TestCudaIntrinsic(CUDATestCase):
872
907
 
873
908
  @skip_unless_cc_53
874
909
  def test_multiple_float16_comparisons(self):
875
- functions = (test_multiple_hcmp_1,
876
- test_multiple_hcmp_2,
877
- test_multiple_hcmp_3,
878
- test_multiple_hcmp_4,
879
- test_multiple_hcmp_5)
910
+ functions = (
911
+ test_multiple_hcmp_1,
912
+ test_multiple_hcmp_2,
913
+ test_multiple_hcmp_3,
914
+ test_multiple_hcmp_4,
915
+ test_multiple_hcmp_5,
916
+ )
880
917
  for fn in functions:
881
918
  with self.subTest(fn=fn):
882
919
  compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn)
883
920
  ary = np.zeros(1, dtype=np.bool_)
884
- arg1 = np.float16(2.)
885
- arg2 = np.float16(3.)
886
- arg3 = np.float16(4.)
921
+ arg1 = np.float16(2.0)
922
+ arg2 = np.float16(3.0)
923
+ arg3 = np.float16(4.0)
887
924
  compiled[1, 1](ary, arg1, arg2, arg3)
888
925
  self.assertTrue(ary[0])
889
926
 
@@ -891,11 +928,11 @@ class TestCudaIntrinsic(CUDATestCase):
891
928
  def test_hmax(self):
892
929
  compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hmax_scalar)
893
930
  ary = np.zeros(1, dtype=np.float16)
894
- arg1 = np.float16(3.)
895
- arg2 = np.float16(4.)
931
+ arg1 = np.float16(3.0)
932
+ arg2 = np.float16(4.0)
896
933
  compiled[1, 1](ary, arg1, arg2)
897
934
  np.testing.assert_allclose(ary[0], arg2)
898
- arg1 = np.float16(5.)
935
+ arg1 = np.float16(5.0)
899
936
  compiled[1, 1](ary, arg1, arg2)
900
937
  np.testing.assert_allclose(ary[0], arg1)
901
938
 
@@ -903,25 +940,25 @@ class TestCudaIntrinsic(CUDATestCase):
903
940
  def test_hmin(self):
904
941
  compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hmin_scalar)
905
942
  ary = np.zeros(1, dtype=np.float16)
906
- arg1 = np.float16(3.)
907
- arg2 = np.float16(4.)
943
+ arg1 = np.float16(3.0)
944
+ arg2 = np.float16(4.0)
908
945
  compiled[1, 1](ary, arg1, arg2)
909
946
  np.testing.assert_allclose(ary[0], arg1)
910
- arg1 = np.float16(5.)
947
+ arg1 = np.float16(5.0)
911
948
  compiled[1, 1](ary, arg1, arg2)
912
949
  np.testing.assert_allclose(ary[0], arg2)
913
950
 
914
951
  def test_cbrt_f32(self):
915
952
  compiled = cuda.jit("void(float32[:], float32)")(simple_cbrt)
916
953
  ary = np.zeros(1, dtype=np.float32)
917
- cbrt_arg = 2.
954
+ cbrt_arg = 2.0
918
955
  compiled[1, 1](ary, cbrt_arg)
919
956
  np.testing.assert_allclose(ary[0], cbrt_arg ** (1 / 3))
920
957
 
921
958
  def test_cbrt_f64(self):
922
959
  compiled = cuda.jit("void(float64[:], float64)")(simple_cbrt)
923
960
  ary = np.zeros(1, dtype=np.float64)
924
- cbrt_arg = 6.
961
+ cbrt_arg = 6.0
925
962
  compiled[1, 1](ary, cbrt_arg)
926
963
  np.testing.assert_allclose(ary[0], cbrt_arg ** (1 / 3))
927
964
 
@@ -1052,25 +1089,36 @@ class TestCudaIntrinsic(CUDATestCase):
1052
1089
  np.concatenate((vals, np.array([np.inf, -np.inf, np.nan])))
1053
1090
  digits = (
1054
1091
  # Common case branch of round_to_impl
1055
- -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5,
1092
+ -5,
1093
+ -4,
1094
+ -3,
1095
+ -2,
1096
+ -1,
1097
+ 0,
1098
+ 1,
1099
+ 2,
1100
+ 3,
1101
+ 4,
1102
+ 5,
1056
1103
  # The algorithm currently implemented can only round to 13 digits
1057
1104
  # with single precision. Note that this doesn't trigger the
1058
1105
  # "overflow safe" branch of the implementation, which can only be
1059
1106
  # hit when using double precision.
1060
- 13
1107
+ 13,
1061
1108
  )
1062
1109
  for val, ndigits in itertools.product(vals, digits):
1063
1110
  with self.subTest(val=val, ndigits=ndigits):
1064
1111
  compiled[1, 1](ary, val, ndigits)
1065
- self.assertPreciseEqual(ary[0], round(val, ndigits),
1066
- prec='single')
1112
+ self.assertPreciseEqual(
1113
+ ary[0], round(val, ndigits), prec="single"
1114
+ )
1067
1115
 
1068
1116
  # CPython on most platforms uses rounding based on dtoa.c, whereas the CUDA
1069
1117
  # round-to implementation uses CPython's fallback implementation, which has
1070
1118
  # slightly different behavior at the edges of the domain. Since the CUDA
1071
1119
  # simulator executes using CPython, we need to skip this test when the
1072
1120
  # simulator is active.
1073
- @skip_on_cudasim('Overflow behavior differs on CPython')
1121
+ @skip_on_cudasim("Overflow behavior differs on CPython")
1074
1122
  def test_round_to_f4_overflow(self):
1075
1123
  # Test that the input value is returned when y in round_ndigits
1076
1124
  # overflows.
@@ -1092,7 +1140,7 @@ class TestCudaIntrinsic(CUDATestCase):
1092
1140
  val = 0.3425
1093
1141
  ndigits = 3
1094
1142
  compiled[1, 1](ary, val, ndigits)
1095
- self.assertPreciseEqual(ary[0], round(val, ndigits), prec='single')
1143
+ self.assertPreciseEqual(ary[0], round(val, ndigits), prec="single")
1096
1144
 
1097
1145
  def test_round_to_f8(self):
1098
1146
  compiled = cuda.jit("void(float64[:], float64, int32)")(simple_round_to)
@@ -1105,19 +1153,19 @@ class TestCudaIntrinsic(CUDATestCase):
1105
1153
  for val, ndigits in itertools.product(vals, digits):
1106
1154
  with self.subTest(val=val, ndigits=ndigits):
1107
1155
  compiled[1, 1](ary, val, ndigits)
1108
- self.assertPreciseEqual(ary[0], round(val, ndigits),
1109
- prec='exact')
1156
+ self.assertPreciseEqual(
1157
+ ary[0], round(val, ndigits), prec="exact"
1158
+ )
1110
1159
 
1111
1160
  # Trigger the "overflow safe" branch of the implementation
1112
1161
  val = 0.12345678987654321 * 10e-15
1113
1162
  ndigits = 23
1114
1163
  with self.subTest(val=val, ndigits=ndigits):
1115
1164
  compiled[1, 1](ary, val, ndigits)
1116
- self.assertPreciseEqual(ary[0], round(val, ndigits),
1117
- prec='double')
1165
+ self.assertPreciseEqual(ary[0], round(val, ndigits), prec="double")
1118
1166
 
1119
1167
  # Skipped on cudasim for the same reasons as test_round_to_f4 above.
1120
- @skip_on_cudasim('Overflow behavior differs on CPython')
1168
+ @skip_on_cudasim("Overflow behavior differs on CPython")
1121
1169
  def test_round_to_f8_overflow(self):
1122
1170
  # Test that the input value is returned when y in round_ndigits
1123
1171
  # overflows.
@@ -1139,8 +1187,8 @@ class TestCudaIntrinsic(CUDATestCase):
1139
1187
  val = 0.5425
1140
1188
  ndigits = 3
1141
1189
  compiled[1, 1](ary, val, ndigits)
1142
- self.assertPreciseEqual(ary[0], round(val, ndigits), prec='double')
1190
+ self.assertPreciseEqual(ary[0], round(val, ndigits), prec="double")
1143
1191
 
1144
1192
 
1145
- if __name__ == '__main__':
1193
+ if __name__ == "__main__":
1146
1194
  unittest.main()