numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (237):
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +246 -114
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  13. numba_cuda/numba/cuda/cuda_paths.py +293 -99
  14. numba_cuda/numba/cuda/cudadecl.py +93 -79
  15. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  16. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  17. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  18. numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
  19. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  20. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  21. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  22. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  23. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  24. numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
  25. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  26. numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
  27. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  28. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  29. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  30. numba_cuda/numba/cuda/cudaimpl.py +296 -275
  31. numba_cuda/numba/cuda/cudamath.py +1 -1
  32. numba_cuda/numba/cuda/debuginfo.py +99 -7
  33. numba_cuda/numba/cuda/decorators.py +87 -45
  34. numba_cuda/numba/cuda/descriptor.py +1 -1
  35. numba_cuda/numba/cuda/device_init.py +68 -18
  36. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  37. numba_cuda/numba/cuda/dispatcher.py +300 -213
  38. numba_cuda/numba/cuda/errors.py +13 -10
  39. numba_cuda/numba/cuda/extending.py +55 -1
  40. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  41. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  42. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
  43. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
  44. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  45. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  46. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  47. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  48. numba_cuda/numba/cuda/initialize.py +5 -3
  49. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  50. numba_cuda/numba/cuda/intrinsics.py +203 -28
  51. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  52. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  53. numba_cuda/numba/cuda/libdevice.py +317 -317
  54. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  55. numba_cuda/numba/cuda/locks.py +16 -0
  56. numba_cuda/numba/cuda/lowering.py +43 -0
  57. numba_cuda/numba/cuda/mathimpl.py +62 -57
  58. numba_cuda/numba/cuda/models.py +1 -5
  59. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  60. numba_cuda/numba/cuda/printimpl.py +9 -5
  61. numba_cuda/numba/cuda/random.py +46 -36
  62. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  63. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  64. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  65. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  66. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  67. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  68. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  69. numba_cuda/numba/cuda/simulator/api.py +38 -22
  70. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  71. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  72. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  73. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  74. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  75. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  76. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  77. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  78. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  79. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  80. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  81. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  82. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  83. numba_cuda/numba/cuda/simulator_init.py +2 -4
  84. numba_cuda/numba/cuda/stubs.py +134 -108
  85. numba_cuda/numba/cuda/target.py +92 -47
  86. numba_cuda/numba/cuda/testing.py +24 -19
  87. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  88. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  89. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  90. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  91. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  92. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  93. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  94. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  95. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  96. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  97. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  98. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  99. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  100. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  101. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  102. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  103. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  104. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  105. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  106. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  107. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  108. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  109. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  110. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  111. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  112. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  113. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  114. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  115. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  116. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  117. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  118. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  119. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
  120. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  121. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  123. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  124. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  125. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  126. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  127. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
  129. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  130. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  131. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  132. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  133. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  134. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  135. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  136. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  137. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  138. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  139. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  140. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  141. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  142. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  143. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
  144. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  145. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  146. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
  147. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  148. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  149. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
  150. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  151. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  152. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  153. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  154. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  155. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  156. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  157. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  158. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  159. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  161. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  162. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  163. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  164. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  165. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
  166. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  167. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  168. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  169. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  170. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  171. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  172. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  173. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  174. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  175. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  176. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  178. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  179. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  180. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  181. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  182. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  183. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  184. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  185. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  186. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  187. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  188. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  189. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  190. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  191. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  192. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  193. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  194. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  195. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  196. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  197. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  198. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  199. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  200. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  201. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  202. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  203. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  204. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  205. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
  206. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  207. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  208. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  209. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  210. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  211. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  212. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  213. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  214. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  216. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  217. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  218. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  219. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  220. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  221. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  222. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  223. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  224. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  225. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  226. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  227. numba_cuda/numba/cuda/types.py +5 -2
  228. numba_cuda/numba/cuda/ufuncs.py +382 -362
  229. numba_cuda/numba/cuda/utils.py +2 -2
  230. numba_cuda/numba/cuda/vector_types.py +5 -3
  231. numba_cuda/numba/cuda/vectorizers.py +38 -33
  232. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
  233. numba_cuda-0.10.0.dist-info/RECORD +263 -0
  234. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
  235. numba_cuda-0.8.1.dist-info/RECORD +0 -251
  236. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
  237. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,9 @@
1
+ import re
2
+
1
3
  import numpy as np
2
4
  from numba import cuda, int32, int64, float32, float64
3
5
  from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
6
+ from numba.cuda.compiler import compile_ptx
4
7
  from numba.core import config
5
8
 
6
9
 
@@ -8,73 +11,73 @@ def useful_syncwarp(ary):
8
11
  i = cuda.grid(1)
9
12
  if i == 0:
10
13
  ary[0] = 42
11
- cuda.syncwarp(0xffffffff)
14
+ cuda.syncwarp(0xFFFFFFFF)
12
15
  ary[i] = ary[0]
13
16
 
14
17
 
15
18
  def use_shfl_sync_idx(ary, idx):
16
19
  i = cuda.grid(1)
17
- val = cuda.shfl_sync(0xffffffff, i, idx)
20
+ val = cuda.shfl_sync(0xFFFFFFFF, i, idx)
18
21
  ary[i] = val
19
22
 
20
23
 
21
24
  def use_shfl_sync_up(ary, delta):
22
25
  i = cuda.grid(1)
23
- val = cuda.shfl_up_sync(0xffffffff, i, delta)
26
+ val = cuda.shfl_up_sync(0xFFFFFFFF, i, delta)
24
27
  ary[i] = val
25
28
 
26
29
 
27
30
  def use_shfl_sync_down(ary, delta):
28
31
  i = cuda.grid(1)
29
- val = cuda.shfl_down_sync(0xffffffff, i, delta)
32
+ val = cuda.shfl_down_sync(0xFFFFFFFF, i, delta)
30
33
  ary[i] = val
31
34
 
32
35
 
33
36
  def use_shfl_sync_xor(ary, xor):
34
37
  i = cuda.grid(1)
35
- val = cuda.shfl_xor_sync(0xffffffff, i, xor)
38
+ val = cuda.shfl_xor_sync(0xFFFFFFFF, i, xor)
36
39
  ary[i] = val
37
40
 
38
41
 
39
42
  def use_shfl_sync_with_val(ary, into):
40
43
  i = cuda.grid(1)
41
- val = cuda.shfl_sync(0xffffffff, into, 0)
44
+ val = cuda.shfl_sync(0xFFFFFFFF, into, 0)
42
45
  ary[i] = val
43
46
 
44
47
 
45
48
  def use_vote_sync_all(ary_in, ary_out):
46
49
  i = cuda.grid(1)
47
- pred = cuda.all_sync(0xffffffff, ary_in[i])
50
+ pred = cuda.all_sync(0xFFFFFFFF, ary_in[i])
48
51
  ary_out[i] = pred
49
52
 
50
53
 
51
54
  def use_vote_sync_any(ary_in, ary_out):
52
55
  i = cuda.grid(1)
53
- pred = cuda.any_sync(0xffffffff, ary_in[i])
56
+ pred = cuda.any_sync(0xFFFFFFFF, ary_in[i])
54
57
  ary_out[i] = pred
55
58
 
56
59
 
57
60
  def use_vote_sync_eq(ary_in, ary_out):
58
61
  i = cuda.grid(1)
59
- pred = cuda.eq_sync(0xffffffff, ary_in[i])
62
+ pred = cuda.eq_sync(0xFFFFFFFF, ary_in[i])
60
63
  ary_out[i] = pred
61
64
 
62
65
 
63
66
  def use_vote_sync_ballot(ary):
64
67
  i = cuda.threadIdx.x
65
- ballot = cuda.ballot_sync(0xffffffff, True)
68
+ ballot = cuda.ballot_sync(0xFFFFFFFF, True)
66
69
  ary[i] = ballot
67
70
 
68
71
 
69
72
  def use_match_any_sync(ary_in, ary_out):
70
73
  i = cuda.grid(1)
71
- ballot = cuda.match_any_sync(0xffffffff, ary_in[i])
74
+ ballot = cuda.match_any_sync(0xFFFFFFFF, ary_in[i])
72
75
  ary_out[i] = ballot
73
76
 
74
77
 
75
78
  def use_match_all_sync(ary_in, ary_out):
76
79
  i = cuda.grid(1)
77
- ballot, pred = cuda.match_all_sync(0xffffffff, ary_in[i])
80
+ ballot, pred = cuda.match_all_sync(0xFFFFFFFF, ary_in[i])
78
81
  ary_out[i] = ballot if pred else 0
79
82
 
80
83
 
@@ -144,16 +147,62 @@ class TestCudaWarpOperations(CUDATestCase):
144
147
  compiled[1, nelem](ary, xor)
145
148
  self.assertTrue(np.all(ary == exp))
146
149
 
150
+ def test_shfl_sync_const_mode_val(self):
151
+ # Test `mode` argument is constant in shfl_sync calls.
152
+ # Related to https://github.com/NVIDIA/numba-cuda/pull/231
153
+ subtest = [
154
+ (use_shfl_sync_idx, 4),
155
+ (use_shfl_sync_up, 4),
156
+ (use_shfl_sync_down, 4),
157
+ (use_shfl_sync_xor, 16),
158
+ ]
159
+
160
+ args_re = r"\((.*)\)"
161
+ m = re.compile(args_re)
162
+
163
+ for func, value in subtest:
164
+ with self.subTest(func=func.__name__):
165
+ compiled = cuda.jit("void(int32[:], int32)")(func)
166
+ nelem = 32
167
+ ary = np.empty(nelem, dtype=np.int32)
168
+ compiled[1, nelem](ary, value)
169
+ irs = next(iter(compiled.inspect_llvm().values()))
170
+
171
+ for ir in irs.split("\n"):
172
+ if "call" in ir and "llvm.nvvm.shfl.sync.i32" in ir:
173
+ args = m.search(ir).group(0)
174
+ arglist = args.split(",")
175
+ mode_arg = arglist[1]
176
+ self.assertNotIn("%", mode_arg)
177
+
178
+ def test_shfl_sync_const_mode_val_sm100(self):
179
+ # Test shfl_sync compiles with cc=(10, 0)
180
+ subtest = [
181
+ use_shfl_sync_idx,
182
+ use_shfl_sync_up,
183
+ use_shfl_sync_down,
184
+ use_shfl_sync_xor,
185
+ ]
186
+
187
+ for func in subtest:
188
+ with self.subTest(func=func.__name__):
189
+ compile_ptx(func, (int32[:], int32), cc=(10, 0))
190
+
147
191
  def test_shfl_sync_types(self):
148
192
  types = int32, int64, float32, float64
149
- values = (np.int32(-1), np.int64(1 << 42),
150
- np.float32(np.pi), np.float64(np.pi))
193
+ values = (
194
+ np.int32(-1),
195
+ np.int64(1 << 42),
196
+ np.float32(np.pi),
197
+ np.float64(np.pi),
198
+ )
151
199
  for typ, val in zip(types, values):
152
- compiled = cuda.jit((typ[:], typ))(use_shfl_sync_with_val)
153
- nelem = 32
154
- ary = np.empty(nelem, dtype=val.dtype)
155
- compiled[1, nelem](ary, val)
156
- self.assertTrue(np.all(ary == val))
200
+ with self.subTest(typ=typ):
201
+ compiled = cuda.jit((typ[:], typ))(use_shfl_sync_with_val)
202
+ nelem = 32
203
+ ary = np.empty(nelem, dtype=val.dtype)
204
+ compiled[1, nelem](ary, val)
205
+ self.assertTrue(np.all(ary == val))
157
206
 
158
207
  def test_vote_sync_all(self):
159
208
  compiled = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_all)
@@ -197,10 +246,11 @@ class TestCudaWarpOperations(CUDATestCase):
197
246
  nelem = 32
198
247
  ary = np.empty(nelem, dtype=np.uint32)
199
248
  compiled[1, nelem](ary)
200
- self.assertTrue(np.all(ary == np.uint32(0xffffffff)))
249
+ self.assertTrue(np.all(ary == np.uint32(0xFFFFFFFF)))
201
250
 
202
- @unittest.skipUnless(_safe_cc_check((7, 0)),
203
- "Matching requires at least Volta Architecture")
251
+ @unittest.skipUnless(
252
+ _safe_cc_check((7, 0)), "Matching requires at least Volta Architecture"
253
+ )
204
254
  def test_match_any_sync(self):
205
255
  compiled = cuda.jit("void(int32[:], int32[:])")(use_match_any_sync)
206
256
  nelem = 10
@@ -210,8 +260,9 @@ class TestCudaWarpOperations(CUDATestCase):
210
260
  compiled[1, nelem](ary_in, ary_out)
211
261
  self.assertTrue(np.all(ary_out == exp))
212
262
 
213
- @unittest.skipUnless(_safe_cc_check((7, 0)),
214
- "Matching requires at least Volta Architecture")
263
+ @unittest.skipUnless(
264
+ _safe_cc_check((7, 0)), "Matching requires at least Volta Architecture"
265
+ )
215
266
  def test_match_all_sync(self):
216
267
  compiled = cuda.jit("void(int32[:], int32[:])")(use_match_all_sync)
217
268
  nelem = 10
@@ -223,9 +274,10 @@ class TestCudaWarpOperations(CUDATestCase):
223
274
  compiled[1, nelem](ary_in, ary_out)
224
275
  self.assertTrue(np.all(ary_out == 0))
225
276
 
226
- @unittest.skipUnless(_safe_cc_check((7, 0)),
227
- "Independent scheduling requires at least Volta "
228
- "Architecture")
277
+ @unittest.skipUnless(
278
+ _safe_cc_check((7, 0)),
279
+ "Independent scheduling requires at least Volta Architecture",
280
+ )
229
281
  def test_independent_scheduling(self):
230
282
  compiled = cuda.jit("void(uint32[:])")(use_independent_scheduling)
231
283
  arr = np.empty(32, dtype=np.uint32)
@@ -267,10 +319,9 @@ class TestCudaWarpOperations(CUDATestCase):
267
319
  # 0, 1, 3, 7, F, 1F, 3F, 7F, FF, 1FF, etc.
268
320
  # or in binary:
269
321
  # ...0001, ....0011, ...0111, etc.
270
- expected = np.asarray([(2 ** i) - 1 for i in range(32)],
271
- dtype=np.uint32)
322
+ expected = np.asarray([(2**i) - 1 for i in range(32)], dtype=np.uint32)
272
323
  np.testing.assert_equal(expected, out)
273
324
 
274
325
 
275
- if __name__ == '__main__':
326
+ if __name__ == "__main__":
276
327
  unittest.main()
@@ -10,12 +10,16 @@ import unittest
10
10
 
11
11
  class TestCudaSimIssues(CUDATestCase):
12
12
  def test_record_access(self):
13
- backyard_type = [('statue', np.float64),
14
- ('newspaper', np.float64, (6,))]
13
+ backyard_type = [
14
+ ("statue", np.float64),
15
+ ("newspaper", np.float64, (6,)),
16
+ ]
15
17
 
16
- goose_type = [('garden', np.float64, (12,)),
17
- ('town', np.float64, (42,)),
18
- ('backyard', backyard_type)]
18
+ goose_type = [
19
+ ("garden", np.float64, (12,)),
20
+ ("town", np.float64, (42,)),
21
+ ("backyard", backyard_type),
22
+ ]
19
23
 
20
24
  goose_np_type = np.dtype(goose_type, align=True)
21
25
 
@@ -27,20 +31,22 @@ class TestCudaSimIssues(CUDATestCase):
27
31
 
28
32
  item = np.recarray(1, dtype=goose_np_type)
29
33
  simple_kernel[1, 1](item[0])
30
- np.testing.assert_equal(item[0]['garden'][0], 45)
31
- np.testing.assert_equal(item[0]['backyard']['newspaper'][3], 5)
34
+ np.testing.assert_equal(item[0]["garden"][0], 45)
35
+ np.testing.assert_equal(item[0]["backyard"]["newspaper"][3], 5)
32
36
 
33
37
  def test_recarray_setting(self):
34
- recordwith2darray = np.dtype([('i', np.int32),
35
- ('j', np.float32, (3, 2))])
38
+ recordwith2darray = np.dtype(
39
+ [("i", np.int32), ("j", np.float32, (3, 2))]
40
+ )
36
41
  rec = np.recarray(2, dtype=recordwith2darray)
37
- rec[0]['i'] = 45
42
+ rec[0]["i"] = 45
38
43
 
39
44
  @cuda.jit
40
45
  def simple_kernel(f):
41
46
  f[1] = f[0]
47
+
42
48
  simple_kernel[1, 1](rec)
43
- np.testing.assert_equal(rec[0]['i'], rec[1]['i'])
49
+ np.testing.assert_equal(rec[0]["i"], rec[1]["i"])
44
50
 
45
51
  def test_cuda_module_in_device_function(self):
46
52
  """
@@ -63,7 +69,7 @@ class TestCudaSimIssues(CUDATestCase):
63
69
  expected = np.arange(arr.size, dtype=np.int32)
64
70
  np.testing.assert_equal(expected, arr)
65
71
 
66
- @skip_unless_cudasim('Only works on CUDASIM')
72
+ @skip_unless_cudasim("Only works on CUDASIM")
67
73
  def test_deadlock_on_exception(self):
68
74
  def assert_no_blockthreads():
69
75
  blockthreads = []
@@ -98,5 +104,5 @@ class TestCudaSimIssues(CUDATestCase):
98
104
  assert_no_blockthreads()
99
105
 
100
106
 
101
- if __name__ == '__main__':
107
+ if __name__ == "__main__":
102
108
  unittest.main()
@@ -20,4 +20,4 @@ int array_mutator(void *out, int *a)
20
20
  {
21
21
  a[0] = a[1];
22
22
  return 0;
23
- }
23
+ }
@@ -47,5 +47,3 @@
47
47
  st.param.b32 [func_retval0+0], %r2;
48
48
  ret;
49
49
  }
50
-
51
-
@@ -2,14 +2,18 @@
2
2
  # "magictoken" is used for markers as beginning and ending of example text.
3
3
 
4
4
  import unittest
5
- from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
6
- skip_if_cudadevrt_missing, skip_unless_cc_60,
7
- skip_if_mvc_enabled)
5
+ from numba.cuda.testing import (
6
+ CUDATestCase,
7
+ skip_on_cudasim,
8
+ skip_if_cudadevrt_missing,
9
+ skip_unless_cc_60,
10
+ skip_if_mvc_enabled,
11
+ )
8
12
 
9
13
 
10
14
  @skip_if_cudadevrt_missing
11
15
  @skip_unless_cc_60
12
- @skip_if_mvc_enabled('CG not supported with MVC')
16
+ @skip_if_mvc_enabled("CG not supported with MVC")
13
17
  @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
14
18
  class TestCooperativeGroups(CUDATestCase):
15
19
  def test_ex_grid_sync(self):
@@ -17,7 +21,7 @@ class TestCooperativeGroups(CUDATestCase):
17
21
  from numba import cuda, int32
18
22
  import numpy as np
19
23
 
20
- sig = (int32[:,::1],)
24
+ sig = (int32[:, ::1],)
21
25
 
22
26
  @cuda.jit(sig)
23
27
  def sequential_rows(M):
@@ -34,6 +38,7 @@ class TestCooperativeGroups(CUDATestCase):
34
38
  # Wait until all threads have written their column element,
35
39
  # and that the write is visible to all other threads
36
40
  g.sync()
41
+
37
42
  # magictoken.ex_grid_sync_kernel.end
38
43
 
39
44
  # magictoken.ex_grid_sync_data.begin
@@ -48,9 +53,11 @@ class TestCooperativeGroups(CUDATestCase):
48
53
 
49
54
  # Skip this test if the grid size used in the example is too large for
50
55
  # a cooperative launch on the current GPU
51
- mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks(blockdim)
56
+ mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks(
57
+ blockdim
58
+ )
52
59
  if mb < griddim:
53
- self.skipTest('Device does not support a large enough coop grid')
60
+ self.skipTest("Device does not support a large enough coop grid")
54
61
 
55
62
  # magictoken.ex_grid_sync_launch.begin
56
63
  # Kernel launch - this is implicitly a cooperative launch
@@ -73,5 +80,5 @@ class TestCooperativeGroups(CUDATestCase):
73
80
  np.testing.assert_equal(A, reference)
74
81
 
75
82
 
76
- if __name__ == '__main__':
83
+ if __name__ == "__main__":
77
84
  unittest.main()
@@ -41,6 +41,7 @@ class TestCpuGpuCompat(CUDATestCase):
41
41
  @numba.jit
42
42
  def business_logic(x, y, z):
43
43
  return 4 * z * (2 * x - (4 * y) / 2 * pi)
44
+
44
45
  # ex_cpu_gpu_compat.define.end
45
46
 
46
47
  # ex_cpu_gpu_compat.cpurun.begin
@@ -54,6 +55,7 @@ class TestCpuGpuCompat(CUDATestCase):
54
55
  if tid < len(xarr):
55
56
  # The function decorated with numba.jit may be directly reused
56
57
  res[tid] = business_logic(xarr[tid], yarr[tid], zarr[tid])
58
+
57
59
  # ex_cpu_gpu_compat.usegpu.end
58
60
 
59
61
  # ex_cpu_gpu_compat.launch.begin
@@ -62,14 +64,9 @@ class TestCpuGpuCompat(CUDATestCase):
62
64
  # [-126.79644737231007, 416.28324559588634, -218912930.2987788]
63
65
  # ex_cpu_gpu_compat.launch.end
64
66
 
65
- expect = [
66
- business_logic(x, y, z) for x, y, z in zip(X, Y, Z)
67
- ]
67
+ expect = [business_logic(x, y, z) for x, y, z in zip(X, Y, Z)]
68
68
 
69
- np.testing.assert_equal(
70
- expect,
71
- results.copy_to_host()
72
- )
69
+ np.testing.assert_equal(expect, results.copy_to_host())
73
70
 
74
71
 
75
72
  if __name__ == "__main__":
@@ -2,7 +2,7 @@
2
2
  # "magictoken" is used for markers as beginning and ending of example text.
3
3
 
4
4
  import unittest
5
- from numba.cuda.testing import (CUDATestCase, skip_on_cudasim)
5
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim
6
6
  from numba.tests.support import skip_unless_cffi
7
7
 
8
8
 
@@ -18,11 +18,12 @@ class TestFFI(CUDATestCase):
18
18
  # Path to the source containing the foreign function
19
19
  # (here assumed to be in a subdirectory called "ffi")
20
20
  basedir = os.path.dirname(os.path.abspath(__file__))
21
- functions_cu = os.path.join(basedir, 'ffi', 'functions.cu')
21
+ functions_cu = os.path.join(basedir, "ffi", "functions.cu")
22
22
 
23
23
  # Declaration of the foreign function
24
- mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)',
25
- link=functions_cu)
24
+ mul = cuda.declare_device(
25
+ "mul_f32_f32", "float32(float32, float32)", link=functions_cu
26
+ )
26
27
 
27
28
  # A kernel that calls mul; functions.cu is linked automatically due to
28
29
  # the call to mul.
@@ -52,25 +53,29 @@ class TestFFI(CUDATestCase):
52
53
  import os
53
54
 
54
55
  basedir = os.path.dirname(os.path.abspath(__file__))
55
- functions_cu = os.path.join(basedir, 'ffi', 'functions.cu')
56
+ functions_cu = os.path.join(basedir, "ffi", "functions.cu")
56
57
 
57
58
  # magictoken.ex_from_buffer_decl.begin
58
- signature = 'float32(CPointer(float32), int32)'
59
- sum_reduce = cuda.declare_device('sum_reduce', signature,
60
- link=functions_cu)
59
+ signature = "float32(CPointer(float32), int32)"
60
+ sum_reduce = cuda.declare_device(
61
+ "sum_reduce", signature, link=functions_cu
62
+ )
61
63
  # magictoken.ex_from_buffer_decl.end
62
64
 
63
65
  # magictoken.ex_from_buffer_kernel.begin
64
66
  import cffi
67
+
65
68
  ffi = cffi.FFI()
66
69
 
67
70
  @cuda.jit
68
71
  def reduction_caller(result, array):
69
72
  array_ptr = ffi.from_buffer(array)
70
73
  result[()] = sum_reduce(array_ptr, len(array))
74
+
71
75
  # magictoken.ex_from_buffer_kernel.end
72
76
 
73
77
  import numpy as np
78
+
74
79
  x = np.arange(10).astype(np.float32)
75
80
  r = np.ndarray((), dtype=np.float32)
76
81
 
@@ -81,5 +86,5 @@ class TestFFI(CUDATestCase):
81
86
  np.testing.assert_allclose(expected, actual)
82
87
 
83
88
 
84
- if __name__ == '__main__':
89
+ if __name__ == "__main__":
85
90
  unittest.main()
@@ -1,14 +1,18 @@
1
1
  import unittest
2
2
 
3
- from numba.cuda.testing import (CUDATestCase, skip_if_cudadevrt_missing,
4
- skip_on_cudasim, skip_unless_cc_60,
5
- skip_if_mvc_enabled)
3
+ from numba.cuda.testing import (
4
+ CUDATestCase,
5
+ skip_if_cudadevrt_missing,
6
+ skip_on_cudasim,
7
+ skip_unless_cc_60,
8
+ skip_if_mvc_enabled,
9
+ )
6
10
  from numba.tests.support import captured_stdout
7
11
 
8
12
 
9
13
  @skip_if_cudadevrt_missing
10
14
  @skip_unless_cc_60
11
- @skip_if_mvc_enabled('CG not supported with MVC')
15
+ @skip_if_mvc_enabled("CG not supported with MVC")
12
16
  @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
13
17
  class TestLaplace(CUDATestCase):
14
18
  """
@@ -27,7 +31,6 @@ class TestLaplace(CUDATestCase):
27
31
  super().tearDown()
28
32
 
29
33
  def test_ex_laplace(self):
30
-
31
34
  # set True to regenerate the figures that
32
35
  # accompany this example
33
36
  plot = False
@@ -55,24 +58,25 @@ class TestLaplace(CUDATestCase):
55
58
 
56
59
  if plot:
57
60
  import matplotlib.pyplot as plt
61
+
58
62
  fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
59
63
  plt.plot(
60
64
  np.arange(len(buf_0)),
61
65
  buf_0.copy_to_host(),
62
66
  lw=3,
63
67
  marker="*",
64
- color='black'
68
+ color="black",
65
69
  )
66
70
 
67
- plt.title('Initial State', fontsize=24)
68
- plt.xlabel('Position', fontsize=24)
69
- plt.ylabel('Temperature', fontsize=24)
71
+ plt.title("Initial State", fontsize=24)
72
+ plt.xlabel("Position", fontsize=24)
73
+ plt.ylabel("Temperature", fontsize=24)
70
74
 
71
75
  ax.set_xticks(ax.get_xticks(), fontsize=16)
72
76
  ax.set_yticks(ax.get_yticks(), fontsize=16)
73
77
  plt.xlim(0, len(data))
74
78
  plt.ylim(0, 10001)
75
- plt.savefig('laplace_initial.svg')
79
+ plt.savefig("laplace_initial.svg")
76
80
 
77
81
  # ex_laplace.kernel.begin
78
82
  @cuda.jit
@@ -116,12 +120,11 @@ class TestLaplace(CUDATestCase):
116
120
 
117
121
  # Wait for every thread to write before moving on
118
122
  grid.sync()
123
+
119
124
  # ex_laplace.kernel.end
120
125
 
121
126
  # ex_laplace.launch.begin
122
- solve_heat_equation.forall(len(data))(
123
- buf_0, buf_1, niter, 0.25
124
- )
127
+ solve_heat_equation.forall(len(data))(buf_0, buf_1, niter, 0.25)
125
128
  # ex_laplace.launch.end
126
129
 
127
130
  results = buf_1.copy_to_host()
@@ -129,20 +132,21 @@ class TestLaplace(CUDATestCase):
129
132
  fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
130
133
  plt.plot(
131
134
  np.arange(len(results)),
132
- results, lw=3,
135
+ results,
136
+ lw=3,
133
137
  marker="*",
134
- color='black'
138
+ color="black",
135
139
  )
136
140
  plt.title(f"T = {niter}", fontsize=24)
137
- plt.xlabel('Position', fontsize=24)
138
- plt.ylabel('Temperature', fontsize=24)
141
+ plt.xlabel("Position", fontsize=24)
142
+ plt.ylabel("Temperature", fontsize=24)
139
143
 
140
144
  ax.set_xticks(ax.get_xticks(), fontsize=16)
141
145
  ax.set_yticks(ax.get_yticks(), fontsize=16)
142
146
 
143
147
  plt.ylim(0, max(results))
144
148
  plt.xlim(0, len(results))
145
- plt.savefig('laplace_final.svg')
149
+ plt.savefig("laplace_final.svg")
146
150
 
147
151
  # Integral over the domain should be equal to its initial value.
148
152
  # Note that this should match the initial value of data[500] above, but
@@ -6,6 +6,7 @@ Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
6
6
  Contents in this file are referenced from the sphinx-generated docs.
7
7
  "magictoken" is used for markers as beginning and ending of example text.
8
8
  """
9
+
9
10
  import unittest
10
11
  from numba.cuda.testing import CUDATestCase, skip_on_cudasim
11
12
  from numba.tests.support import captured_stdout
@@ -43,10 +44,11 @@ class TestMatMul(CUDATestCase):
43
44
  """Perform square matrix multiplication of C = A * B."""
44
45
  i, j = cuda.grid(2)
45
46
  if i < C.shape[0] and j < C.shape[1]:
46
- tmp = 0.
47
+ tmp = 0.0
47
48
  for k in range(A.shape[1]):
48
49
  tmp += A[i, k] * B[k, j]
49
50
  C[i, j] = tmp
51
+
50
52
  # magictoken.ex_matmul.end
51
53
 
52
54
  # magictoken.ex_run_matmul.begin
@@ -91,11 +93,11 @@ class TestMatMul(CUDATestCase):
91
93
 
92
94
  tx = cuda.threadIdx.x
93
95
  ty = cuda.threadIdx.y
94
- bpg = cuda.gridDim.x # blocks per grid
96
+ bpg = cuda.gridDim.x # blocks per grid
95
97
 
96
98
  # Each thread computes one element in the result matrix.
97
99
  # The dot product is chunked into dot products of TPB-long vectors.
98
- tmp = float32(0.)
100
+ tmp = float32(0.0)
99
101
  for i in range(bpg):
100
102
  # Preload data into shared memory
101
103
  sA[ty, tx] = 0
@@ -116,6 +118,7 @@ class TestMatMul(CUDATestCase):
116
118
  cuda.syncthreads()
117
119
  if y < C.shape[0] and x < C.shape[1]:
118
120
  C[y, x] = tmp
121
+
119
122
  # magictoken.ex_fast_matmul.end
120
123
 
121
124
  # magictoken.ex_run_fast_matmul.begin
@@ -169,5 +172,5 @@ class TestMatMul(CUDATestCase):
169
172
  self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)
170
173
 
171
174
 
172
- if __name__ == '__main__':
175
+ if __name__ == "__main__":
173
176
  unittest.main()
@@ -59,6 +59,7 @@ class TestMonteCarlo(CUDATestCase):
59
59
  # value of the sample
60
60
  y = func(samp)
61
61
  out[gid] = y
62
+
62
63
  # ex_montecarlo.kernel.end
63
64
 
64
65
  # ex_montecarlo.callfunc.begin
@@ -84,6 +85,7 @@ class TestMonteCarlo(CUDATestCase):
84
85
  factor = (upper_lim - lower_lim) / (nsamps - 1)
85
86
 
86
87
  return sum_reduce(out) * factor
88
+
87
89
  # ex_montecarlo.callfunc.end
88
90
 
89
91
  # ex_montecarlo.launch.begin
@@ -10,8 +10,10 @@ class TestRandom(CUDATestCase):
10
10
  def test_ex_3d_grid(self):
11
11
  # magictoken.ex_3d_grid.begin
12
12
  from numba import cuda
13
- from numba.cuda.random import (create_xoroshiro128p_states,
14
- xoroshiro128p_uniform_float32)
13
+ from numba.cuda.random import (
14
+ create_xoroshiro128p_states,
15
+ xoroshiro128p_uniform_float32,
16
+ )
15
17
  import numpy as np
16
18
 
17
19
  @cuda.jit
@@ -27,7 +29,9 @@ class TestRandom(CUDATestCase):
27
29
  for i in range(startz, arr.shape[0], stridez):
28
30
  for j in range(starty, arr.shape[1], stridey):
29
31
  for k in range(startx, arr.shape[2], stridex):
30
- arr[i, j, k] = xoroshiro128p_uniform_float32(rng_states, tid)
32
+ arr[i, j, k] = xoroshiro128p_uniform_float32(
33
+ rng_states, tid
34
+ )
31
35
 
32
36
  # Array dimensions
33
37
  X, Y, Z = 701, 900, 719
@@ -55,5 +59,5 @@ class TestRandom(CUDATestCase):
55
59
  self.assertTrue(np.all(host_arr >= 0.0))
56
60
 
57
61
 
58
- if __name__ == '__main__':
62
+ if __name__ == "__main__":
59
63
  unittest.main()
@@ -61,11 +61,12 @@ class TestReduction(CUDATestCase):
61
61
  # After the loop, the zeroth element contains the sum
62
62
  if tid == 0:
63
63
  data[tid] = shr[tid]
64
+
64
65
  # ex_reduction.kernel.end
65
66
 
66
67
  # ex_reduction.launch.begin
67
68
  array_sum[1, nelem](a)
68
- print(a[0]) # 523776
69
+ print(a[0]) # 523776
69
70
  print(sum(np.arange(1024))) # 523776
70
71
  # ex_reduction.launch.end
71
72