numba-cuda 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +232 -113
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_fp16.h +661 -661
  13. numba_cuda/numba/cuda/cuda_fp16.hpp +3 -3
  14. numba_cuda/numba/cuda/cuda_paths.py +291 -99
  15. numba_cuda/numba/cuda/cudadecl.py +125 -69
  16. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  17. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  18. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  19. numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
  20. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  21. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  22. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  23. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  24. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  25. numba_cuda/numba/cuda/cudadrv/linkable_code.py +16 -1
  26. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  27. numba_cuda/numba/cuda/cudadrv/nvrtc.py +138 -29
  28. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  29. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  30. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  31. numba_cuda/numba/cuda/cudaimpl.py +317 -233
  32. numba_cuda/numba/cuda/cudamath.py +1 -1
  33. numba_cuda/numba/cuda/debuginfo.py +8 -6
  34. numba_cuda/numba/cuda/decorators.py +75 -45
  35. numba_cuda/numba/cuda/descriptor.py +1 -1
  36. numba_cuda/numba/cuda/device_init.py +69 -18
  37. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  38. numba_cuda/numba/cuda/dispatcher.py +300 -213
  39. numba_cuda/numba/cuda/errors.py +13 -10
  40. numba_cuda/numba/cuda/extending.py +1 -1
  41. numba_cuda/numba/cuda/initialize.py +5 -3
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +3 -3
  43. numba_cuda/numba/cuda/intrinsics.py +31 -27
  44. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  45. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  46. numba_cuda/numba/cuda/libdevice.py +317 -317
  47. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  48. numba_cuda/numba/cuda/locks.py +16 -0
  49. numba_cuda/numba/cuda/mathimpl.py +62 -57
  50. numba_cuda/numba/cuda/models.py +1 -5
  51. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  52. numba_cuda/numba/cuda/printimpl.py +9 -5
  53. numba_cuda/numba/cuda/random.py +46 -36
  54. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  55. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  56. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  57. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  58. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  59. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  60. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  61. numba_cuda/numba/cuda/simulator/api.py +38 -22
  62. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  63. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  64. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  65. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  66. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  67. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  68. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  69. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  70. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  71. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  72. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  73. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  74. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  75. numba_cuda/numba/cuda/simulator_init.py +2 -4
  76. numba_cuda/numba/cuda/stubs.py +139 -102
  77. numba_cuda/numba/cuda/target.py +64 -47
  78. numba_cuda/numba/cuda/testing.py +24 -19
  79. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  80. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  81. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  88. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  89. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  90. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  91. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  92. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  93. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  94. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  95. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  98. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  100. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  101. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  102. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  103. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  104. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  105. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  106. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  107. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  108. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  109. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  110. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  111. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +7 -6
  112. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  113. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  115. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  117. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  118. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  119. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +57 -21
  120. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  121. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  122. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  123. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  124. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  126. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  127. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  128. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  129. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  131. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  132. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  133. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  134. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +31 -28
  135. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  136. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  137. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +6 -7
  138. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  139. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  140. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +19 -12
  141. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  142. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  143. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  144. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  145. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  148. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  149. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  150. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  151. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  152. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  153. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  154. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  155. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +6 -6
  156. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  157. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  158. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  159. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  160. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  161. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  162. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  163. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  164. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  165. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  166. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  167. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  168. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  169. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  170. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  171. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  172. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  173. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  174. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  175. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  176. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  178. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  179. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  180. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  182. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  183. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  184. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  185. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  186. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  187. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  188. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  192. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  193. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  194. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  195. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +31 -25
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  197. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  198. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  199. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  200. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  201. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  202. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  203. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  204. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  206. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  207. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  208. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  209. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  210. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  211. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  212. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  213. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  214. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  215. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  216. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  217. numba_cuda/numba/cuda/types.py +5 -2
  218. numba_cuda/numba/cuda/ufuncs.py +382 -362
  219. numba_cuda/numba/cuda/utils.py +2 -2
  220. numba_cuda/numba/cuda/vector_types.py +2 -2
  221. numba_cuda/numba/cuda/vectorizers.py +37 -32
  222. {numba_cuda-0.8.1.dist-info → numba_cuda-0.9.0.dist-info}/METADATA +1 -1
  223. numba_cuda-0.9.0.dist-info/RECORD +253 -0
  224. {numba_cuda-0.8.1.dist-info → numba_cuda-0.9.0.dist-info}/WHEEL +1 -1
  225. numba_cuda-0.8.1.dist-info/RECORD +0 -251
  226. {numba_cuda-0.8.1.dist-info → numba_cuda-0.9.0.dist-info}/licenses/LICENSE +0 -0
  227. {numba_cuda-0.8.1.dist-info → numba_cuda-0.9.0.dist-info}/top_level.txt +0 -0
@@ -8,73 +8,73 @@ def useful_syncwarp(ary):
8
8
  i = cuda.grid(1)
9
9
  if i == 0:
10
10
  ary[0] = 42
11
- cuda.syncwarp(0xffffffff)
11
+ cuda.syncwarp(0xFFFFFFFF)
12
12
  ary[i] = ary[0]
13
13
 
14
14
 
15
15
  def use_shfl_sync_idx(ary, idx):
16
16
  i = cuda.grid(1)
17
- val = cuda.shfl_sync(0xffffffff, i, idx)
17
+ val = cuda.shfl_sync(0xFFFFFFFF, i, idx)
18
18
  ary[i] = val
19
19
 
20
20
 
21
21
  def use_shfl_sync_up(ary, delta):
22
22
  i = cuda.grid(1)
23
- val = cuda.shfl_up_sync(0xffffffff, i, delta)
23
+ val = cuda.shfl_up_sync(0xFFFFFFFF, i, delta)
24
24
  ary[i] = val
25
25
 
26
26
 
27
27
  def use_shfl_sync_down(ary, delta):
28
28
  i = cuda.grid(1)
29
- val = cuda.shfl_down_sync(0xffffffff, i, delta)
29
+ val = cuda.shfl_down_sync(0xFFFFFFFF, i, delta)
30
30
  ary[i] = val
31
31
 
32
32
 
33
33
  def use_shfl_sync_xor(ary, xor):
34
34
  i = cuda.grid(1)
35
- val = cuda.shfl_xor_sync(0xffffffff, i, xor)
35
+ val = cuda.shfl_xor_sync(0xFFFFFFFF, i, xor)
36
36
  ary[i] = val
37
37
 
38
38
 
39
39
  def use_shfl_sync_with_val(ary, into):
40
40
  i = cuda.grid(1)
41
- val = cuda.shfl_sync(0xffffffff, into, 0)
41
+ val = cuda.shfl_sync(0xFFFFFFFF, into, 0)
42
42
  ary[i] = val
43
43
 
44
44
 
45
45
  def use_vote_sync_all(ary_in, ary_out):
46
46
  i = cuda.grid(1)
47
- pred = cuda.all_sync(0xffffffff, ary_in[i])
47
+ pred = cuda.all_sync(0xFFFFFFFF, ary_in[i])
48
48
  ary_out[i] = pred
49
49
 
50
50
 
51
51
  def use_vote_sync_any(ary_in, ary_out):
52
52
  i = cuda.grid(1)
53
- pred = cuda.any_sync(0xffffffff, ary_in[i])
53
+ pred = cuda.any_sync(0xFFFFFFFF, ary_in[i])
54
54
  ary_out[i] = pred
55
55
 
56
56
 
57
57
  def use_vote_sync_eq(ary_in, ary_out):
58
58
  i = cuda.grid(1)
59
- pred = cuda.eq_sync(0xffffffff, ary_in[i])
59
+ pred = cuda.eq_sync(0xFFFFFFFF, ary_in[i])
60
60
  ary_out[i] = pred
61
61
 
62
62
 
63
63
  def use_vote_sync_ballot(ary):
64
64
  i = cuda.threadIdx.x
65
- ballot = cuda.ballot_sync(0xffffffff, True)
65
+ ballot = cuda.ballot_sync(0xFFFFFFFF, True)
66
66
  ary[i] = ballot
67
67
 
68
68
 
69
69
  def use_match_any_sync(ary_in, ary_out):
70
70
  i = cuda.grid(1)
71
- ballot = cuda.match_any_sync(0xffffffff, ary_in[i])
71
+ ballot = cuda.match_any_sync(0xFFFFFFFF, ary_in[i])
72
72
  ary_out[i] = ballot
73
73
 
74
74
 
75
75
  def use_match_all_sync(ary_in, ary_out):
76
76
  i = cuda.grid(1)
77
- ballot, pred = cuda.match_all_sync(0xffffffff, ary_in[i])
77
+ ballot, pred = cuda.match_all_sync(0xFFFFFFFF, ary_in[i])
78
78
  ary_out[i] = ballot if pred else 0
79
79
 
80
80
 
@@ -146,8 +146,12 @@ class TestCudaWarpOperations(CUDATestCase):
146
146
 
147
147
  def test_shfl_sync_types(self):
148
148
  types = int32, int64, float32, float64
149
- values = (np.int32(-1), np.int64(1 << 42),
150
- np.float32(np.pi), np.float64(np.pi))
149
+ values = (
150
+ np.int32(-1),
151
+ np.int64(1 << 42),
152
+ np.float32(np.pi),
153
+ np.float64(np.pi),
154
+ )
151
155
  for typ, val in zip(types, values):
152
156
  compiled = cuda.jit((typ[:], typ))(use_shfl_sync_with_val)
153
157
  nelem = 32
@@ -197,10 +201,11 @@ class TestCudaWarpOperations(CUDATestCase):
197
201
  nelem = 32
198
202
  ary = np.empty(nelem, dtype=np.uint32)
199
203
  compiled[1, nelem](ary)
200
- self.assertTrue(np.all(ary == np.uint32(0xffffffff)))
204
+ self.assertTrue(np.all(ary == np.uint32(0xFFFFFFFF)))
201
205
 
202
- @unittest.skipUnless(_safe_cc_check((7, 0)),
203
- "Matching requires at least Volta Architecture")
206
+ @unittest.skipUnless(
207
+ _safe_cc_check((7, 0)), "Matching requires at least Volta Architecture"
208
+ )
204
209
  def test_match_any_sync(self):
205
210
  compiled = cuda.jit("void(int32[:], int32[:])")(use_match_any_sync)
206
211
  nelem = 10
@@ -210,8 +215,9 @@ class TestCudaWarpOperations(CUDATestCase):
210
215
  compiled[1, nelem](ary_in, ary_out)
211
216
  self.assertTrue(np.all(ary_out == exp))
212
217
 
213
- @unittest.skipUnless(_safe_cc_check((7, 0)),
214
- "Matching requires at least Volta Architecture")
218
+ @unittest.skipUnless(
219
+ _safe_cc_check((7, 0)), "Matching requires at least Volta Architecture"
220
+ )
215
221
  def test_match_all_sync(self):
216
222
  compiled = cuda.jit("void(int32[:], int32[:])")(use_match_all_sync)
217
223
  nelem = 10
@@ -223,9 +229,10 @@ class TestCudaWarpOperations(CUDATestCase):
223
229
  compiled[1, nelem](ary_in, ary_out)
224
230
  self.assertTrue(np.all(ary_out == 0))
225
231
 
226
- @unittest.skipUnless(_safe_cc_check((7, 0)),
227
- "Independent scheduling requires at least Volta "
228
- "Architecture")
232
+ @unittest.skipUnless(
233
+ _safe_cc_check((7, 0)),
234
+ "Independent scheduling requires at least Volta Architecture",
235
+ )
229
236
  def test_independent_scheduling(self):
230
237
  compiled = cuda.jit("void(uint32[:])")(use_independent_scheduling)
231
238
  arr = np.empty(32, dtype=np.uint32)
@@ -267,10 +274,9 @@ class TestCudaWarpOperations(CUDATestCase):
267
274
  # 0, 1, 3, 7, F, 1F, 3F, 7F, FF, 1FF, etc.
268
275
  # or in binary:
269
276
  # ...0001, ....0011, ...0111, etc.
270
- expected = np.asarray([(2 ** i) - 1 for i in range(32)],
271
- dtype=np.uint32)
277
+ expected = np.asarray([(2**i) - 1 for i in range(32)], dtype=np.uint32)
272
278
  np.testing.assert_equal(expected, out)
273
279
 
274
280
 
275
- if __name__ == '__main__':
281
+ if __name__ == "__main__":
276
282
  unittest.main()
@@ -10,12 +10,16 @@ import unittest
10
10
 
11
11
  class TestCudaSimIssues(CUDATestCase):
12
12
  def test_record_access(self):
13
- backyard_type = [('statue', np.float64),
14
- ('newspaper', np.float64, (6,))]
13
+ backyard_type = [
14
+ ("statue", np.float64),
15
+ ("newspaper", np.float64, (6,)),
16
+ ]
15
17
 
16
- goose_type = [('garden', np.float64, (12,)),
17
- ('town', np.float64, (42,)),
18
- ('backyard', backyard_type)]
18
+ goose_type = [
19
+ ("garden", np.float64, (12,)),
20
+ ("town", np.float64, (42,)),
21
+ ("backyard", backyard_type),
22
+ ]
19
23
 
20
24
  goose_np_type = np.dtype(goose_type, align=True)
21
25
 
@@ -27,20 +31,22 @@ class TestCudaSimIssues(CUDATestCase):
27
31
 
28
32
  item = np.recarray(1, dtype=goose_np_type)
29
33
  simple_kernel[1, 1](item[0])
30
- np.testing.assert_equal(item[0]['garden'][0], 45)
31
- np.testing.assert_equal(item[0]['backyard']['newspaper'][3], 5)
34
+ np.testing.assert_equal(item[0]["garden"][0], 45)
35
+ np.testing.assert_equal(item[0]["backyard"]["newspaper"][3], 5)
32
36
 
33
37
  def test_recarray_setting(self):
34
- recordwith2darray = np.dtype([('i', np.int32),
35
- ('j', np.float32, (3, 2))])
38
+ recordwith2darray = np.dtype(
39
+ [("i", np.int32), ("j", np.float32, (3, 2))]
40
+ )
36
41
  rec = np.recarray(2, dtype=recordwith2darray)
37
- rec[0]['i'] = 45
42
+ rec[0]["i"] = 45
38
43
 
39
44
  @cuda.jit
40
45
  def simple_kernel(f):
41
46
  f[1] = f[0]
47
+
42
48
  simple_kernel[1, 1](rec)
43
- np.testing.assert_equal(rec[0]['i'], rec[1]['i'])
49
+ np.testing.assert_equal(rec[0]["i"], rec[1]["i"])
44
50
 
45
51
  def test_cuda_module_in_device_function(self):
46
52
  """
@@ -63,7 +69,7 @@ class TestCudaSimIssues(CUDATestCase):
63
69
  expected = np.arange(arr.size, dtype=np.int32)
64
70
  np.testing.assert_equal(expected, arr)
65
71
 
66
- @skip_unless_cudasim('Only works on CUDASIM')
72
+ @skip_unless_cudasim("Only works on CUDASIM")
67
73
  def test_deadlock_on_exception(self):
68
74
  def assert_no_blockthreads():
69
75
  blockthreads = []
@@ -98,5 +104,5 @@ class TestCudaSimIssues(CUDATestCase):
98
104
  assert_no_blockthreads()
99
105
 
100
106
 
101
- if __name__ == '__main__':
107
+ if __name__ == "__main__":
102
108
  unittest.main()
@@ -20,4 +20,4 @@ int array_mutator(void *out, int *a)
20
20
  {
21
21
  a[0] = a[1];
22
22
  return 0;
23
- }
23
+ }
@@ -47,5 +47,3 @@
47
47
  st.param.b32 [func_retval0+0], %r2;
48
48
  ret;
49
49
  }
50
-
51
-
@@ -2,14 +2,18 @@
2
2
  # "magictoken" is used for markers as beginning and ending of example text.
3
3
 
4
4
  import unittest
5
- from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
6
- skip_if_cudadevrt_missing, skip_unless_cc_60,
7
- skip_if_mvc_enabled)
5
+ from numba.cuda.testing import (
6
+ CUDATestCase,
7
+ skip_on_cudasim,
8
+ skip_if_cudadevrt_missing,
9
+ skip_unless_cc_60,
10
+ skip_if_mvc_enabled,
11
+ )
8
12
 
9
13
 
10
14
  @skip_if_cudadevrt_missing
11
15
  @skip_unless_cc_60
12
- @skip_if_mvc_enabled('CG not supported with MVC')
16
+ @skip_if_mvc_enabled("CG not supported with MVC")
13
17
  @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
14
18
  class TestCooperativeGroups(CUDATestCase):
15
19
  def test_ex_grid_sync(self):
@@ -17,7 +21,7 @@ class TestCooperativeGroups(CUDATestCase):
17
21
  from numba import cuda, int32
18
22
  import numpy as np
19
23
 
20
- sig = (int32[:,::1],)
24
+ sig = (int32[:, ::1],)
21
25
 
22
26
  @cuda.jit(sig)
23
27
  def sequential_rows(M):
@@ -34,6 +38,7 @@ class TestCooperativeGroups(CUDATestCase):
34
38
  # Wait until all threads have written their column element,
35
39
  # and that the write is visible to all other threads
36
40
  g.sync()
41
+
37
42
  # magictoken.ex_grid_sync_kernel.end
38
43
 
39
44
  # magictoken.ex_grid_sync_data.begin
@@ -48,9 +53,11 @@ class TestCooperativeGroups(CUDATestCase):
48
53
 
49
54
  # Skip this test if the grid size used in the example is too large for
50
55
  # a cooperative launch on the current GPU
51
- mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks(blockdim)
56
+ mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks(
57
+ blockdim
58
+ )
52
59
  if mb < griddim:
53
- self.skipTest('Device does not support a large enough coop grid')
60
+ self.skipTest("Device does not support a large enough coop grid")
54
61
 
55
62
  # magictoken.ex_grid_sync_launch.begin
56
63
  # Kernel launch - this is implicitly a cooperative launch
@@ -73,5 +80,5 @@ class TestCooperativeGroups(CUDATestCase):
73
80
  np.testing.assert_equal(A, reference)
74
81
 
75
82
 
76
- if __name__ == '__main__':
83
+ if __name__ == "__main__":
77
84
  unittest.main()
@@ -41,6 +41,7 @@ class TestCpuGpuCompat(CUDATestCase):
41
41
  @numba.jit
42
42
  def business_logic(x, y, z):
43
43
  return 4 * z * (2 * x - (4 * y) / 2 * pi)
44
+
44
45
  # ex_cpu_gpu_compat.define.end
45
46
 
46
47
  # ex_cpu_gpu_compat.cpurun.begin
@@ -54,6 +55,7 @@ class TestCpuGpuCompat(CUDATestCase):
54
55
  if tid < len(xarr):
55
56
  # The function decorated with numba.jit may be directly reused
56
57
  res[tid] = business_logic(xarr[tid], yarr[tid], zarr[tid])
58
+
57
59
  # ex_cpu_gpu_compat.usegpu.end
58
60
 
59
61
  # ex_cpu_gpu_compat.launch.begin
@@ -62,14 +64,9 @@ class TestCpuGpuCompat(CUDATestCase):
62
64
  # [-126.79644737231007, 416.28324559588634, -218912930.2987788]
63
65
  # ex_cpu_gpu_compat.launch.end
64
66
 
65
- expect = [
66
- business_logic(x, y, z) for x, y, z in zip(X, Y, Z)
67
- ]
67
+ expect = [business_logic(x, y, z) for x, y, z in zip(X, Y, Z)]
68
68
 
69
- np.testing.assert_equal(
70
- expect,
71
- results.copy_to_host()
72
- )
69
+ np.testing.assert_equal(expect, results.copy_to_host())
73
70
 
74
71
 
75
72
  if __name__ == "__main__":
@@ -2,7 +2,7 @@
2
2
  # "magictoken" is used for markers as beginning and ending of example text.
3
3
 
4
4
  import unittest
5
- from numba.cuda.testing import (CUDATestCase, skip_on_cudasim)
5
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim
6
6
  from numba.tests.support import skip_unless_cffi
7
7
 
8
8
 
@@ -18,11 +18,12 @@ class TestFFI(CUDATestCase):
18
18
  # Path to the source containing the foreign function
19
19
  # (here assumed to be in a subdirectory called "ffi")
20
20
  basedir = os.path.dirname(os.path.abspath(__file__))
21
- functions_cu = os.path.join(basedir, 'ffi', 'functions.cu')
21
+ functions_cu = os.path.join(basedir, "ffi", "functions.cu")
22
22
 
23
23
  # Declaration of the foreign function
24
- mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)',
25
- link=functions_cu)
24
+ mul = cuda.declare_device(
25
+ "mul_f32_f32", "float32(float32, float32)", link=functions_cu
26
+ )
26
27
 
27
28
  # A kernel that calls mul; functions.cu is linked automatically due to
28
29
  # the call to mul.
@@ -52,25 +53,29 @@ class TestFFI(CUDATestCase):
52
53
  import os
53
54
 
54
55
  basedir = os.path.dirname(os.path.abspath(__file__))
55
- functions_cu = os.path.join(basedir, 'ffi', 'functions.cu')
56
+ functions_cu = os.path.join(basedir, "ffi", "functions.cu")
56
57
 
57
58
  # magictoken.ex_from_buffer_decl.begin
58
- signature = 'float32(CPointer(float32), int32)'
59
- sum_reduce = cuda.declare_device('sum_reduce', signature,
60
- link=functions_cu)
59
+ signature = "float32(CPointer(float32), int32)"
60
+ sum_reduce = cuda.declare_device(
61
+ "sum_reduce", signature, link=functions_cu
62
+ )
61
63
  # magictoken.ex_from_buffer_decl.end
62
64
 
63
65
  # magictoken.ex_from_buffer_kernel.begin
64
66
  import cffi
67
+
65
68
  ffi = cffi.FFI()
66
69
 
67
70
  @cuda.jit
68
71
  def reduction_caller(result, array):
69
72
  array_ptr = ffi.from_buffer(array)
70
73
  result[()] = sum_reduce(array_ptr, len(array))
74
+
71
75
  # magictoken.ex_from_buffer_kernel.end
72
76
 
73
77
  import numpy as np
78
+
74
79
  x = np.arange(10).astype(np.float32)
75
80
  r = np.ndarray((), dtype=np.float32)
76
81
 
@@ -81,5 +86,5 @@ class TestFFI(CUDATestCase):
81
86
  np.testing.assert_allclose(expected, actual)
82
87
 
83
88
 
84
- if __name__ == '__main__':
89
+ if __name__ == "__main__":
85
90
  unittest.main()
@@ -1,14 +1,18 @@
1
1
  import unittest
2
2
 
3
- from numba.cuda.testing import (CUDATestCase, skip_if_cudadevrt_missing,
4
- skip_on_cudasim, skip_unless_cc_60,
5
- skip_if_mvc_enabled)
3
+ from numba.cuda.testing import (
4
+ CUDATestCase,
5
+ skip_if_cudadevrt_missing,
6
+ skip_on_cudasim,
7
+ skip_unless_cc_60,
8
+ skip_if_mvc_enabled,
9
+ )
6
10
  from numba.tests.support import captured_stdout
7
11
 
8
12
 
9
13
  @skip_if_cudadevrt_missing
10
14
  @skip_unless_cc_60
11
- @skip_if_mvc_enabled('CG not supported with MVC')
15
+ @skip_if_mvc_enabled("CG not supported with MVC")
12
16
  @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
13
17
  class TestLaplace(CUDATestCase):
14
18
  """
@@ -27,7 +31,6 @@ class TestLaplace(CUDATestCase):
27
31
  super().tearDown()
28
32
 
29
33
  def test_ex_laplace(self):
30
-
31
34
  # set True to regenerate the figures that
32
35
  # accompany this example
33
36
  plot = False
@@ -55,24 +58,25 @@ class TestLaplace(CUDATestCase):
55
58
 
56
59
  if plot:
57
60
  import matplotlib.pyplot as plt
61
+
58
62
  fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
59
63
  plt.plot(
60
64
  np.arange(len(buf_0)),
61
65
  buf_0.copy_to_host(),
62
66
  lw=3,
63
67
  marker="*",
64
- color='black'
68
+ color="black",
65
69
  )
66
70
 
67
- plt.title('Initial State', fontsize=24)
68
- plt.xlabel('Position', fontsize=24)
69
- plt.ylabel('Temperature', fontsize=24)
71
+ plt.title("Initial State", fontsize=24)
72
+ plt.xlabel("Position", fontsize=24)
73
+ plt.ylabel("Temperature", fontsize=24)
70
74
 
71
75
  ax.set_xticks(ax.get_xticks(), fontsize=16)
72
76
  ax.set_yticks(ax.get_yticks(), fontsize=16)
73
77
  plt.xlim(0, len(data))
74
78
  plt.ylim(0, 10001)
75
- plt.savefig('laplace_initial.svg')
79
+ plt.savefig("laplace_initial.svg")
76
80
 
77
81
  # ex_laplace.kernel.begin
78
82
  @cuda.jit
@@ -116,12 +120,11 @@ class TestLaplace(CUDATestCase):
116
120
 
117
121
  # Wait for every thread to write before moving on
118
122
  grid.sync()
123
+
119
124
  # ex_laplace.kernel.end
120
125
 
121
126
  # ex_laplace.launch.begin
122
- solve_heat_equation.forall(len(data))(
123
- buf_0, buf_1, niter, 0.25
124
- )
127
+ solve_heat_equation.forall(len(data))(buf_0, buf_1, niter, 0.25)
125
128
  # ex_laplace.launch.end
126
129
 
127
130
  results = buf_1.copy_to_host()
@@ -129,20 +132,21 @@ class TestLaplace(CUDATestCase):
129
132
  fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
130
133
  plt.plot(
131
134
  np.arange(len(results)),
132
- results, lw=3,
135
+ results,
136
+ lw=3,
133
137
  marker="*",
134
- color='black'
138
+ color="black",
135
139
  )
136
140
  plt.title(f"T = {niter}", fontsize=24)
137
- plt.xlabel('Position', fontsize=24)
138
- plt.ylabel('Temperature', fontsize=24)
141
+ plt.xlabel("Position", fontsize=24)
142
+ plt.ylabel("Temperature", fontsize=24)
139
143
 
140
144
  ax.set_xticks(ax.get_xticks(), fontsize=16)
141
145
  ax.set_yticks(ax.get_yticks(), fontsize=16)
142
146
 
143
147
  plt.ylim(0, max(results))
144
148
  plt.xlim(0, len(results))
145
- plt.savefig('laplace_final.svg')
149
+ plt.savefig("laplace_final.svg")
146
150
 
147
151
  # Integral over the domain should be equal to its initial value.
148
152
  # Note that this should match the initial value of data[500] above, but
@@ -6,6 +6,7 @@ Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
6
6
  Contents in this file are referenced from the sphinx-generated docs.
7
7
  "magictoken" is used for markers as beginning and ending of example text.
8
8
  """
9
+
9
10
  import unittest
10
11
  from numba.cuda.testing import CUDATestCase, skip_on_cudasim
11
12
  from numba.tests.support import captured_stdout
@@ -43,10 +44,11 @@ class TestMatMul(CUDATestCase):
43
44
  """Perform square matrix multiplication of C = A * B."""
44
45
  i, j = cuda.grid(2)
45
46
  if i < C.shape[0] and j < C.shape[1]:
46
- tmp = 0.
47
+ tmp = 0.0
47
48
  for k in range(A.shape[1]):
48
49
  tmp += A[i, k] * B[k, j]
49
50
  C[i, j] = tmp
51
+
50
52
  # magictoken.ex_matmul.end
51
53
 
52
54
  # magictoken.ex_run_matmul.begin
@@ -91,11 +93,11 @@ class TestMatMul(CUDATestCase):
91
93
 
92
94
  tx = cuda.threadIdx.x
93
95
  ty = cuda.threadIdx.y
94
- bpg = cuda.gridDim.x # blocks per grid
96
+ bpg = cuda.gridDim.x # blocks per grid
95
97
 
96
98
  # Each thread computes one element in the result matrix.
97
99
  # The dot product is chunked into dot products of TPB-long vectors.
98
- tmp = float32(0.)
100
+ tmp = float32(0.0)
99
101
  for i in range(bpg):
100
102
  # Preload data into shared memory
101
103
  sA[ty, tx] = 0
@@ -116,6 +118,7 @@ class TestMatMul(CUDATestCase):
116
118
  cuda.syncthreads()
117
119
  if y < C.shape[0] and x < C.shape[1]:
118
120
  C[y, x] = tmp
121
+
119
122
  # magictoken.ex_fast_matmul.end
120
123
 
121
124
  # magictoken.ex_run_fast_matmul.begin
@@ -169,5 +172,5 @@ class TestMatMul(CUDATestCase):
169
172
  self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)
170
173
 
171
174
 
172
- if __name__ == '__main__':
175
+ if __name__ == "__main__":
173
176
  unittest.main()
@@ -59,6 +59,7 @@ class TestMonteCarlo(CUDATestCase):
59
59
  # value of the sample
60
60
  y = func(samp)
61
61
  out[gid] = y
62
+
62
63
  # ex_montecarlo.kernel.end
63
64
 
64
65
  # ex_montecarlo.callfunc.begin
@@ -84,6 +85,7 @@ class TestMonteCarlo(CUDATestCase):
84
85
  factor = (upper_lim - lower_lim) / (nsamps - 1)
85
86
 
86
87
  return sum_reduce(out) * factor
88
+
87
89
  # ex_montecarlo.callfunc.end
88
90
 
89
91
  # ex_montecarlo.launch.begin
@@ -10,8 +10,10 @@ class TestRandom(CUDATestCase):
10
10
  def test_ex_3d_grid(self):
11
11
  # magictoken.ex_3d_grid.begin
12
12
  from numba import cuda
13
- from numba.cuda.random import (create_xoroshiro128p_states,
14
- xoroshiro128p_uniform_float32)
13
+ from numba.cuda.random import (
14
+ create_xoroshiro128p_states,
15
+ xoroshiro128p_uniform_float32,
16
+ )
15
17
  import numpy as np
16
18
 
17
19
  @cuda.jit
@@ -27,7 +29,9 @@ class TestRandom(CUDATestCase):
27
29
  for i in range(startz, arr.shape[0], stridez):
28
30
  for j in range(starty, arr.shape[1], stridey):
29
31
  for k in range(startx, arr.shape[2], stridex):
30
- arr[i, j, k] = xoroshiro128p_uniform_float32(rng_states, tid)
32
+ arr[i, j, k] = xoroshiro128p_uniform_float32(
33
+ rng_states, tid
34
+ )
31
35
 
32
36
  # Array dimensions
33
37
  X, Y, Z = 701, 900, 719
@@ -55,5 +59,5 @@ class TestRandom(CUDATestCase):
55
59
  self.assertTrue(np.all(host_arr >= 0.0))
56
60
 
57
61
 
58
- if __name__ == '__main__':
62
+ if __name__ == "__main__":
59
63
  unittest.main()
@@ -61,11 +61,12 @@ class TestReduction(CUDATestCase):
61
61
  # After the loop, the zeroth element contains the sum
62
62
  if tid == 0:
63
63
  data[tid] = shr[tid]
64
+
64
65
  # ex_reduction.kernel.end
65
66
 
66
67
  # ex_reduction.launch.begin
67
68
  array_sum[1, nelem](a)
68
- print(a[0]) # 523776
69
+ print(a[0]) # 523776
69
70
  print(sum(np.arange(1024))) # 523776
70
71
  # ex_reduction.launch.end
71
72