numba-cuda 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +232 -113
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_fp16.h +661 -661
  13. numba_cuda/numba/cuda/cuda_fp16.hpp +3 -3
  14. numba_cuda/numba/cuda/cuda_paths.py +291 -99
  15. numba_cuda/numba/cuda/cudadecl.py +125 -69
  16. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  17. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  18. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  19. numba_cuda/numba/cuda/cudadrv/driver.py +463 -297
  20. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  21. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  22. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  23. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  24. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  25. numba_cuda/numba/cuda/cudadrv/linkable_code.py +16 -1
  26. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  27. numba_cuda/numba/cuda/cudadrv/nvrtc.py +138 -29
  28. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  29. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  30. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  31. numba_cuda/numba/cuda/cudaimpl.py +317 -233
  32. numba_cuda/numba/cuda/cudamath.py +1 -1
  33. numba_cuda/numba/cuda/debuginfo.py +8 -6
  34. numba_cuda/numba/cuda/decorators.py +75 -45
  35. numba_cuda/numba/cuda/descriptor.py +1 -1
  36. numba_cuda/numba/cuda/device_init.py +69 -18
  37. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  38. numba_cuda/numba/cuda/dispatcher.py +300 -213
  39. numba_cuda/numba/cuda/errors.py +13 -10
  40. numba_cuda/numba/cuda/extending.py +1 -1
  41. numba_cuda/numba/cuda/initialize.py +5 -3
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +3 -3
  43. numba_cuda/numba/cuda/intrinsics.py +31 -27
  44. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  45. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  46. numba_cuda/numba/cuda/libdevice.py +317 -317
  47. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  48. numba_cuda/numba/cuda/locks.py +16 -0
  49. numba_cuda/numba/cuda/mathimpl.py +62 -57
  50. numba_cuda/numba/cuda/models.py +1 -5
  51. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  52. numba_cuda/numba/cuda/printimpl.py +9 -5
  53. numba_cuda/numba/cuda/random.py +46 -36
  54. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  55. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  56. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  57. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  58. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  59. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  60. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  61. numba_cuda/numba/cuda/simulator/api.py +38 -22
  62. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  63. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  64. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  65. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  66. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  67. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  68. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  69. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  70. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  71. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  72. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  73. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  74. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  75. numba_cuda/numba/cuda/simulator_init.py +2 -4
  76. numba_cuda/numba/cuda/stubs.py +139 -102
  77. numba_cuda/numba/cuda/target.py +64 -47
  78. numba_cuda/numba/cuda/testing.py +24 -19
  79. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  80. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  81. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  88. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  89. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  90. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  91. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  92. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  93. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  94. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  95. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  98. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  100. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  101. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  102. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  103. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  104. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  105. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  106. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  107. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  108. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  109. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  110. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  111. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +7 -6
  112. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  113. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  115. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  117. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  118. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  119. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +57 -21
  120. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  121. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  122. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  123. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  124. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  126. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  127. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  128. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  129. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  131. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  132. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  133. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  134. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +31 -28
  135. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  136. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  137. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +6 -7
  138. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  139. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  140. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +19 -12
  141. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  142. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  143. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  144. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  145. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  148. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  149. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  150. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  151. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  152. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  153. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  154. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  155. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +6 -6
  156. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  157. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  158. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  159. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  160. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  161. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  162. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  163. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  164. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  165. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  166. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  167. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  168. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  169. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  170. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  171. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  172. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  173. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  174. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  175. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  176. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  178. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  179. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  180. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  182. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  183. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  184. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  185. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  186. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  187. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  188. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  192. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  193. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  194. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  195. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +31 -25
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  197. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  198. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  199. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  200. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  201. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  202. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  203. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  204. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  206. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  207. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  208. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  209. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  210. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  211. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  212. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  213. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  214. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  215. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  216. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  217. numba_cuda/numba/cuda/types.py +5 -2
  218. numba_cuda/numba/cuda/ufuncs.py +382 -362
  219. numba_cuda/numba/cuda/utils.py +2 -2
  220. numba_cuda/numba/cuda/vector_types.py +2 -2
  221. numba_cuda/numba/cuda/vectorizers.py +37 -32
  222. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/METADATA +1 -1
  223. numba_cuda-0.9.0.dist-info/RECORD +253 -0
  224. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/WHEEL +1 -1
  225. numba_cuda-0.8.0.dist-info/RECORD +0 -251
  226. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/licenses/LICENSE +0 -0
  227. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/top_level.txt +0 -0
@@ -22,9 +22,17 @@ def atomic_cast_none(num):
22
22
 
23
23
 
24
24
  @cuda.jit(device=True)
25
- def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements,
26
- binop_func, cast_func, initializer,
27
- neg_idx):
25
+ def atomic_binary_1dim_shared(
26
+ ary,
27
+ idx,
28
+ op2,
29
+ ary_dtype,
30
+ ary_nelements,
31
+ binop_func,
32
+ cast_func,
33
+ initializer,
34
+ neg_idx,
35
+ ):
28
36
  tid = cuda.threadIdx.x
29
37
  sm = cuda.shared.array(ary_nelements, ary_dtype)
30
38
  sm[tid] = initializer
@@ -38,8 +46,9 @@ def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements,
38
46
 
39
47
 
40
48
  @cuda.jit(device=True)
41
- def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements,
42
- binop_func, cast_func):
49
+ def atomic_binary_1dim_shared2(
50
+ ary, idx, op2, ary_dtype, ary_nelements, binop_func, cast_func
51
+ ):
43
52
  tid = cuda.threadIdx.x
44
53
  sm = cuda.shared.array(ary_nelements, ary_dtype)
45
54
  sm[tid] = ary[tid]
@@ -51,8 +60,9 @@ def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements,
51
60
 
52
61
 
53
62
  @cuda.jit(device=True)
54
- def atomic_binary_2dim_shared(ary, op2, ary_dtype, ary_shape,
55
- binop_func, y_cast_func, neg_idx):
63
+ def atomic_binary_2dim_shared(
64
+ ary, op2, ary_dtype, ary_shape, binop_func, y_cast_func, neg_idx
65
+ ):
56
66
  tx = cuda.threadIdx.x
57
67
  ty = cuda.threadIdx.y
58
68
  sm = cuda.shared.array(ary_shape, ary_dtype)
@@ -77,8 +87,9 @@ def atomic_binary_2dim_global(ary, op2, binop_func, y_cast_func, neg_idx):
77
87
 
78
88
 
79
89
  @cuda.jit(device=True)
80
- def atomic_binary_1dim_global(ary, idx, ary_nelements, op2,
81
- binop_func, neg_idx):
90
+ def atomic_binary_1dim_global(
91
+ ary, idx, ary_nelements, op2, binop_func, neg_idx
92
+ ):
82
93
  tid = cuda.threadIdx.x
83
94
  bin = int(idx[tid] % ary_nelements)
84
95
  if neg_idx:
@@ -87,53 +98,79 @@ def atomic_binary_1dim_global(ary, idx, ary_nelements, op2,
87
98
 
88
99
 
89
100
  def atomic_add(ary):
90
- atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
91
- cuda.atomic.add, atomic_cast_none, 0, False)
101
+ atomic_binary_1dim_shared(
102
+ ary, ary, 1, uint32, 32, cuda.atomic.add, atomic_cast_none, 0, False
103
+ )
92
104
 
93
105
 
94
106
  def atomic_add_wrap(ary):
95
- atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
96
- cuda.atomic.add, atomic_cast_none, 0, True)
107
+ atomic_binary_1dim_shared(
108
+ ary, ary, 1, uint32, 32, cuda.atomic.add, atomic_cast_none, 0, True
109
+ )
97
110
 
98
111
 
99
112
  def atomic_add2(ary):
100
- atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
101
- cuda.atomic.add, atomic_cast_none, False)
113
+ atomic_binary_2dim_shared(
114
+ ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_none, False
115
+ )
102
116
 
103
117
 
104
118
  def atomic_add2_wrap(ary):
105
- atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
106
- cuda.atomic.add, atomic_cast_none, True)
119
+ atomic_binary_2dim_shared(
120
+ ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_none, True
121
+ )
107
122
 
108
123
 
109
124
  def atomic_add3(ary):
110
- atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
111
- cuda.atomic.add, atomic_cast_to_uint64, False)
125
+ atomic_binary_2dim_shared(
126
+ ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False
127
+ )
112
128
 
113
129
 
114
130
  def atomic_add_float(ary):
115
- atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
116
- cuda.atomic.add, atomic_cast_to_int, 0.0, False)
131
+ atomic_binary_1dim_shared(
132
+ ary,
133
+ ary,
134
+ 1.0,
135
+ float32,
136
+ 32,
137
+ cuda.atomic.add,
138
+ atomic_cast_to_int,
139
+ 0.0,
140
+ False,
141
+ )
117
142
 
118
143
 
119
144
  def atomic_add_float_wrap(ary):
120
- atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
121
- cuda.atomic.add, atomic_cast_to_int, 0.0, True)
145
+ atomic_binary_1dim_shared(
146
+ ary,
147
+ ary,
148
+ 1.0,
149
+ float32,
150
+ 32,
151
+ cuda.atomic.add,
152
+ atomic_cast_to_int,
153
+ 0.0,
154
+ True,
155
+ )
122
156
 
123
157
 
124
158
  def atomic_add_float_2(ary):
125
- atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
126
- cuda.atomic.add, atomic_cast_none, False)
159
+ atomic_binary_2dim_shared(
160
+ ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_none, False
161
+ )
127
162
 
128
163
 
129
164
  def atomic_add_float_2_wrap(ary):
130
- atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
131
- cuda.atomic.add, atomic_cast_none, True)
165
+ atomic_binary_2dim_shared(
166
+ ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_none, True
167
+ )
132
168
 
133
169
 
134
170
  def atomic_add_float_3(ary):
135
- atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
136
- cuda.atomic.add, atomic_cast_to_uint64, False)
171
+ atomic_binary_2dim_shared(
172
+ ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False
173
+ )
137
174
 
138
175
 
139
176
  def atomic_add_double_global(idx, ary):
@@ -153,78 +190,117 @@ def atomic_add_double_global_2_wrap(ary):
153
190
 
154
191
 
155
192
  def atomic_add_double_global_3(ary):
156
- atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_to_uint64,
157
- False)
193
+ atomic_binary_2dim_global(
194
+ ary, 1, cuda.atomic.add, atomic_cast_to_uint64, False
195
+ )
158
196
 
159
197
 
160
198
  def atomic_add_double(idx, ary):
161
- atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
162
- cuda.atomic.add, atomic_cast_none, 0.0, False)
199
+ atomic_binary_1dim_shared(
200
+ ary,
201
+ idx,
202
+ 1.0,
203
+ float64,
204
+ 32,
205
+ cuda.atomic.add,
206
+ atomic_cast_none,
207
+ 0.0,
208
+ False,
209
+ )
163
210
 
164
211
 
165
212
  def atomic_add_double_wrap(idx, ary):
166
- atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
167
- cuda.atomic.add, atomic_cast_none, 0.0, True)
213
+ atomic_binary_1dim_shared(
214
+ ary, idx, 1.0, float64, 32, cuda.atomic.add, atomic_cast_none, 0.0, True
215
+ )
168
216
 
169
217
 
170
218
  def atomic_add_double_2(ary):
171
- atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
172
- cuda.atomic.add, atomic_cast_none, False)
219
+ atomic_binary_2dim_shared(
220
+ ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_none, False
221
+ )
173
222
 
174
223
 
175
224
  def atomic_add_double_2_wrap(ary):
176
- atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
177
- cuda.atomic.add, atomic_cast_none, True)
225
+ atomic_binary_2dim_shared(
226
+ ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_none, True
227
+ )
178
228
 
179
229
 
180
230
  def atomic_add_double_3(ary):
181
- atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
182
- cuda.atomic.add, atomic_cast_to_uint64, False)
231
+ atomic_binary_2dim_shared(
232
+ ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False
233
+ )
183
234
 
184
235
 
185
236
  def atomic_sub(ary):
186
- atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
187
- cuda.atomic.sub, atomic_cast_none, 0, False)
237
+ atomic_binary_1dim_shared(
238
+ ary, ary, 1, uint32, 32, cuda.atomic.sub, atomic_cast_none, 0, False
239
+ )
188
240
 
189
241
 
190
242
  def atomic_sub2(ary):
191
- atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
192
- cuda.atomic.sub, atomic_cast_none, False)
243
+ atomic_binary_2dim_shared(
244
+ ary, 1, uint32, (4, 8), cuda.atomic.sub, atomic_cast_none, False
245
+ )
193
246
 
194
247
 
195
248
  def atomic_sub3(ary):
196
- atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
197
- cuda.atomic.sub, atomic_cast_to_uint64, False)
249
+ atomic_binary_2dim_shared(
250
+ ary, 1, uint32, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False
251
+ )
198
252
 
199
253
 
200
254
  def atomic_sub_float(ary):
201
- atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
202
- cuda.atomic.sub, atomic_cast_to_int, 0.0, False)
255
+ atomic_binary_1dim_shared(
256
+ ary,
257
+ ary,
258
+ 1.0,
259
+ float32,
260
+ 32,
261
+ cuda.atomic.sub,
262
+ atomic_cast_to_int,
263
+ 0.0,
264
+ False,
265
+ )
203
266
 
204
267
 
205
268
  def atomic_sub_float_2(ary):
206
- atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
207
- cuda.atomic.sub, atomic_cast_none, False)
269
+ atomic_binary_2dim_shared(
270
+ ary, 1.0, float32, (4, 8), cuda.atomic.sub, atomic_cast_none, False
271
+ )
208
272
 
209
273
 
210
274
  def atomic_sub_float_3(ary):
211
- atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
212
- cuda.atomic.sub, atomic_cast_to_uint64, False)
275
+ atomic_binary_2dim_shared(
276
+ ary, 1.0, float32, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False
277
+ )
213
278
 
214
279
 
215
280
  def atomic_sub_double(idx, ary):
216
- atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
217
- cuda.atomic.sub, atomic_cast_none, 0.0, False)
281
+ atomic_binary_1dim_shared(
282
+ ary,
283
+ idx,
284
+ 1.0,
285
+ float64,
286
+ 32,
287
+ cuda.atomic.sub,
288
+ atomic_cast_none,
289
+ 0.0,
290
+ False,
291
+ )
218
292
 
219
293
 
220
294
  def atomic_sub_double_2(ary):
221
- atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
222
- cuda.atomic.sub, atomic_cast_none, False)
295
+ atomic_binary_2dim_shared(
296
+ ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_none, False
297
+ )
223
298
 
224
299
 
225
300
  def atomic_sub_double_3(ary):
226
- atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
227
- cuda.atomic.sub, atomic_cast_to_uint64, False)
301
+ atomic_binary_2dim_shared(
302
+ ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False
303
+ )
228
304
 
229
305
 
230
306
  def atomic_sub_double_global(idx, ary):
@@ -232,28 +308,33 @@ def atomic_sub_double_global(idx, ary):
232
308
 
233
309
 
234
310
  def atomic_sub_double_global_2(ary):
235
- atomic_binary_2dim_global(ary, 1.0, cuda.atomic.sub, atomic_cast_none,
236
- False)
311
+ atomic_binary_2dim_global(
312
+ ary, 1.0, cuda.atomic.sub, atomic_cast_none, False
313
+ )
237
314
 
238
315
 
239
316
  def atomic_sub_double_global_3(ary):
240
- atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
241
- cuda.atomic.sub, atomic_cast_to_uint64, False)
317
+ atomic_binary_2dim_shared(
318
+ ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False
319
+ )
242
320
 
243
321
 
244
322
  def atomic_and(ary, op2):
245
- atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
246
- cuda.atomic.and_, atomic_cast_none, 1, False)
323
+ atomic_binary_1dim_shared(
324
+ ary, ary, op2, uint32, 32, cuda.atomic.and_, atomic_cast_none, 1, False
325
+ )
247
326
 
248
327
 
249
328
  def atomic_and2(ary, op2):
250
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
251
- cuda.atomic.and_, atomic_cast_none, False)
329
+ atomic_binary_2dim_shared(
330
+ ary, op2, uint32, (4, 8), cuda.atomic.and_, atomic_cast_none, False
331
+ )
252
332
 
253
333
 
254
334
  def atomic_and3(ary, op2):
255
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
256
- cuda.atomic.and_, atomic_cast_to_uint64, False)
335
+ atomic_binary_2dim_shared(
336
+ ary, op2, uint32, (4, 8), cuda.atomic.and_, atomic_cast_to_uint64, False
337
+ )
257
338
 
258
339
 
259
340
  def atomic_and_global(idx, ary, op2):
@@ -261,23 +342,27 @@ def atomic_and_global(idx, ary, op2):
261
342
 
262
343
 
263
344
  def atomic_and_global_2(ary, op2):
264
- atomic_binary_2dim_global(ary, op2, cuda.atomic.and_,
265
- atomic_cast_none, False)
345
+ atomic_binary_2dim_global(
346
+ ary, op2, cuda.atomic.and_, atomic_cast_none, False
347
+ )
266
348
 
267
349
 
268
350
  def atomic_or(ary, op2):
269
- atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
270
- cuda.atomic.or_, atomic_cast_none, 0, False)
351
+ atomic_binary_1dim_shared(
352
+ ary, ary, op2, uint32, 32, cuda.atomic.or_, atomic_cast_none, 0, False
353
+ )
271
354
 
272
355
 
273
356
  def atomic_or2(ary, op2):
274
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
275
- cuda.atomic.or_, atomic_cast_none, False)
357
+ atomic_binary_2dim_shared(
358
+ ary, op2, uint32, (4, 8), cuda.atomic.or_, atomic_cast_none, False
359
+ )
276
360
 
277
361
 
278
362
  def atomic_or3(ary, op2):
279
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
280
- cuda.atomic.or_, atomic_cast_to_uint64, False)
363
+ atomic_binary_2dim_shared(
364
+ ary, op2, uint32, (4, 8), cuda.atomic.or_, atomic_cast_to_uint64, False
365
+ )
281
366
 
282
367
 
283
368
  def atomic_or_global(idx, ary, op2):
@@ -285,23 +370,27 @@ def atomic_or_global(idx, ary, op2):
285
370
 
286
371
 
287
372
  def atomic_or_global_2(ary, op2):
288
- atomic_binary_2dim_global(ary, op2, cuda.atomic.or_,
289
- atomic_cast_none, False)
373
+ atomic_binary_2dim_global(
374
+ ary, op2, cuda.atomic.or_, atomic_cast_none, False
375
+ )
290
376
 
291
377
 
292
378
  def atomic_xor(ary, op2):
293
- atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
294
- cuda.atomic.xor, atomic_cast_none, 0, False)
379
+ atomic_binary_1dim_shared(
380
+ ary, ary, op2, uint32, 32, cuda.atomic.xor, atomic_cast_none, 0, False
381
+ )
295
382
 
296
383
 
297
384
  def atomic_xor2(ary, op2):
298
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
299
- cuda.atomic.xor, atomic_cast_none, False)
385
+ atomic_binary_2dim_shared(
386
+ ary, op2, uint32, (4, 8), cuda.atomic.xor, atomic_cast_none, False
387
+ )
300
388
 
301
389
 
302
390
  def atomic_xor3(ary, op2):
303
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
304
- cuda.atomic.xor, atomic_cast_to_uint64, False)
391
+ atomic_binary_2dim_shared(
392
+ ary, op2, uint32, (4, 8), cuda.atomic.xor, atomic_cast_to_uint64, False
393
+ )
305
394
 
306
395
 
307
396
  def atomic_xor_global(idx, ary, op2):
@@ -309,33 +398,39 @@ def atomic_xor_global(idx, ary, op2):
309
398
 
310
399
 
311
400
  def atomic_xor_global_2(ary, op2):
312
- atomic_binary_2dim_global(ary, op2, cuda.atomic.xor,
313
- atomic_cast_none, False)
401
+ atomic_binary_2dim_global(
402
+ ary, op2, cuda.atomic.xor, atomic_cast_none, False
403
+ )
314
404
 
315
405
 
316
406
  def atomic_inc32(ary, idx, op2):
317
- atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
318
- cuda.atomic.inc, atomic_cast_none)
407
+ atomic_binary_1dim_shared2(
408
+ ary, idx, op2, uint32, 32, cuda.atomic.inc, atomic_cast_none
409
+ )
319
410
 
320
411
 
321
412
  def atomic_inc64(ary, idx, op2):
322
- atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32,
323
- cuda.atomic.inc, atomic_cast_to_int)
413
+ atomic_binary_1dim_shared2(
414
+ ary, idx, op2, uint64, 32, cuda.atomic.inc, atomic_cast_to_int
415
+ )
324
416
 
325
417
 
326
418
  def atomic_inc2_32(ary, op2):
327
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
328
- cuda.atomic.inc, atomic_cast_none, False)
419
+ atomic_binary_2dim_shared(
420
+ ary, op2, uint32, (4, 8), cuda.atomic.inc, atomic_cast_none, False
421
+ )
329
422
 
330
423
 
331
424
  def atomic_inc2_64(ary, op2):
332
- atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
333
- cuda.atomic.inc, atomic_cast_none, False)
425
+ atomic_binary_2dim_shared(
426
+ ary, op2, uint64, (4, 8), cuda.atomic.inc, atomic_cast_none, False
427
+ )
334
428
 
335
429
 
336
430
  def atomic_inc3(ary, op2):
337
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
338
- cuda.atomic.inc, atomic_cast_to_uint64, False)
431
+ atomic_binary_2dim_shared(
432
+ ary, op2, uint32, (4, 8), cuda.atomic.inc, atomic_cast_to_uint64, False
433
+ )
339
434
 
340
435
 
341
436
  def atomic_inc_global(idx, ary, op2):
@@ -343,33 +438,39 @@ def atomic_inc_global(idx, ary, op2):
343
438
 
344
439
 
345
440
  def atomic_inc_global_2(ary, op2):
346
- atomic_binary_2dim_global(ary, op2, cuda.atomic.inc,
347
- atomic_cast_none, False)
441
+ atomic_binary_2dim_global(
442
+ ary, op2, cuda.atomic.inc, atomic_cast_none, False
443
+ )
348
444
 
349
445
 
350
446
  def atomic_dec32(ary, idx, op2):
351
- atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
352
- cuda.atomic.dec, atomic_cast_none)
447
+ atomic_binary_1dim_shared2(
448
+ ary, idx, op2, uint32, 32, cuda.atomic.dec, atomic_cast_none
449
+ )
353
450
 
354
451
 
355
452
  def atomic_dec64(ary, idx, op2):
356
- atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32,
357
- cuda.atomic.dec, atomic_cast_to_int)
453
+ atomic_binary_1dim_shared2(
454
+ ary, idx, op2, uint64, 32, cuda.atomic.dec, atomic_cast_to_int
455
+ )
358
456
 
359
457
 
360
458
  def atomic_dec2_32(ary, op2):
361
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
362
- cuda.atomic.dec, atomic_cast_none, False)
459
+ atomic_binary_2dim_shared(
460
+ ary, op2, uint32, (4, 8), cuda.atomic.dec, atomic_cast_none, False
461
+ )
363
462
 
364
463
 
365
464
  def atomic_dec2_64(ary, op2):
366
- atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
367
- cuda.atomic.dec, atomic_cast_none, False)
465
+ atomic_binary_2dim_shared(
466
+ ary, op2, uint64, (4, 8), cuda.atomic.dec, atomic_cast_none, False
467
+ )
368
468
 
369
469
 
370
470
  def atomic_dec3(ary, op2):
371
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
372
- cuda.atomic.dec, atomic_cast_to_uint64, False)
471
+ atomic_binary_2dim_shared(
472
+ ary, op2, uint32, (4, 8), cuda.atomic.dec, atomic_cast_to_uint64, False
473
+ )
373
474
 
374
475
 
375
476
  def atomic_dec_global(idx, ary, op2):
@@ -377,23 +478,27 @@ def atomic_dec_global(idx, ary, op2):
377
478
 
378
479
 
379
480
  def atomic_dec_global_2(ary, op2):
380
- atomic_binary_2dim_global(ary, op2, cuda.atomic.dec,
381
- atomic_cast_none, False)
481
+ atomic_binary_2dim_global(
482
+ ary, op2, cuda.atomic.dec, atomic_cast_none, False
483
+ )
382
484
 
383
485
 
384
486
  def atomic_exch(ary, idx, op2):
385
- atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
386
- cuda.atomic.exch, atomic_cast_none)
487
+ atomic_binary_1dim_shared2(
488
+ ary, idx, op2, uint32, 32, cuda.atomic.exch, atomic_cast_none
489
+ )
387
490
 
388
491
 
389
492
  def atomic_exch2(ary, op2):
390
- atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
391
- cuda.atomic.exch, atomic_cast_none, False)
493
+ atomic_binary_2dim_shared(
494
+ ary, op2, uint32, (4, 8), cuda.atomic.exch, atomic_cast_none, False
495
+ )
392
496
 
393
497
 
394
498
  def atomic_exch3(ary, op2):
395
- atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
396
- cuda.atomic.exch, atomic_cast_none, False)
499
+ atomic_binary_2dim_shared(
500
+ ary, op2, uint64, (4, 8), cuda.atomic.exch, atomic_cast_none, False
501
+ )
397
502
 
398
503
 
399
504
  def atomic_exch_global(idx, ary, op2):
@@ -401,7 +506,6 @@ def atomic_exch_global(idx, ary, op2):
401
506
 
402
507
 
403
508
  def gen_atomic_extreme_funcs(func):
404
-
405
509
  fns = dedent("""
406
510
  def atomic(res, ary):
407
511
  tx = cuda.threadIdx.x
@@ -431,21 +535,39 @@ def gen_atomic_extreme_funcs(func):
431
535
  res[0] = smres[0]
432
536
  """).format(func=func)
433
537
  ld = {}
434
- exec(fns, {'cuda': cuda, 'float64': float64, 'uint64': uint64}, ld)
435
- return (ld['atomic'], ld['atomic_double_normalizedindex'],
436
- ld['atomic_double_oneindex'], ld['atomic_double_shared'])
437
-
438
-
439
- (atomic_max, atomic_max_double_normalizedindex, atomic_max_double_oneindex,
440
- atomic_max_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.max')
441
- (atomic_min, atomic_min_double_normalizedindex, atomic_min_double_oneindex,
442
- atomic_min_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.min')
443
- (atomic_nanmax, atomic_nanmax_double_normalizedindex,
444
- atomic_nanmax_double_oneindex, atomic_nanmax_double_shared) = \
445
- gen_atomic_extreme_funcs('cuda.atomic.nanmax')
446
- (atomic_nanmin, atomic_nanmin_double_normalizedindex,
447
- atomic_nanmin_double_oneindex, atomic_nanmin_double_shared) = \
448
- gen_atomic_extreme_funcs('cuda.atomic.nanmin')
538
+ exec(fns, {"cuda": cuda, "float64": float64, "uint64": uint64}, ld)
539
+ return (
540
+ ld["atomic"],
541
+ ld["atomic_double_normalizedindex"],
542
+ ld["atomic_double_oneindex"],
543
+ ld["atomic_double_shared"],
544
+ )
545
+
546
+
547
+ (
548
+ atomic_max,
549
+ atomic_max_double_normalizedindex,
550
+ atomic_max_double_oneindex,
551
+ atomic_max_double_shared,
552
+ ) = gen_atomic_extreme_funcs("cuda.atomic.max")
553
+ (
554
+ atomic_min,
555
+ atomic_min_double_normalizedindex,
556
+ atomic_min_double_oneindex,
557
+ atomic_min_double_shared,
558
+ ) = gen_atomic_extreme_funcs("cuda.atomic.min")
559
+ (
560
+ atomic_nanmax,
561
+ atomic_nanmax_double_normalizedindex,
562
+ atomic_nanmax_double_oneindex,
563
+ atomic_nanmax_double_shared,
564
+ ) = gen_atomic_extreme_funcs("cuda.atomic.nanmax")
565
+ (
566
+ atomic_nanmin,
567
+ atomic_nanmin_double_normalizedindex,
568
+ atomic_nanmin_double_oneindex,
569
+ atomic_nanmin_double_shared,
570
+ ) = gen_atomic_extreme_funcs("cuda.atomic.nanmin")
449
571
 
450
572
 
451
573
  def atomic_compare_and_swap(res, old, ary, fill_val):
@@ -476,10 +598,10 @@ class TestCudaAtomics(CUDATestCase):
476
598
  ary_wrap = ary.copy()
477
599
  orig = ary.copy()
478
600
 
479
- cuda_atomic_add = cuda.jit('void(uint32[:])')(atomic_add)
601
+ cuda_atomic_add = cuda.jit("void(uint32[:])")(atomic_add)
480
602
  cuda_atomic_add[1, 32](ary)
481
603
 
482
- cuda_atomic_add_wrap = cuda.jit('void(uint32[:])')(atomic_add_wrap)
604
+ cuda_atomic_add_wrap = cuda.jit("void(uint32[:])")(atomic_add_wrap)
483
605
  cuda_atomic_add_wrap[1, 32](ary_wrap)
484
606
 
485
607
  gold = np.zeros(32, dtype=np.uint32)
@@ -494,10 +616,10 @@ class TestCudaAtomics(CUDATestCase):
494
616
  ary_wrap = ary.copy()
495
617
  orig = ary.copy()
496
618
 
497
- cuda_atomic_add2 = cuda.jit('void(uint32[:,:])')(atomic_add2)
619
+ cuda_atomic_add2 = cuda.jit("void(uint32[:,:])")(atomic_add2)
498
620
  cuda_atomic_add2[1, (4, 8)](ary)
499
621
 
500
- cuda_atomic_add2_wrap = cuda.jit('void(uint32[:,:])')(atomic_add2_wrap)
622
+ cuda_atomic_add2_wrap = cuda.jit("void(uint32[:,:])")(atomic_add2_wrap)
501
623
  cuda_atomic_add2_wrap[1, (4, 8)](ary_wrap)
502
624
 
503
625
  self.assertTrue(np.all(ary == orig + 1))
@@ -506,7 +628,7 @@ class TestCudaAtomics(CUDATestCase):
506
628
  def test_atomic_add3(self):
507
629
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
508
630
  orig = ary.copy()
509
- cuda_atomic_add3 = cuda.jit('void(uint32[:,:])')(atomic_add3)
631
+ cuda_atomic_add3 = cuda.jit("void(uint32[:,:])")(atomic_add3)
510
632
  cuda_atomic_add3[1, (4, 8)](ary)
511
633
 
512
634
  self.assertTrue(np.all(ary == orig + 1))
@@ -516,10 +638,10 @@ class TestCudaAtomics(CUDATestCase):
516
638
  ary_wrap = ary.copy()
517
639
  orig = ary.copy().astype(np.intp)
518
640
 
519
- cuda_atomic_add_float = cuda.jit('void(float32[:])')(atomic_add_float)
641
+ cuda_atomic_add_float = cuda.jit("void(float32[:])")(atomic_add_float)
520
642
  cuda_atomic_add_float[1, 32](ary)
521
643
 
522
- add_float_wrap = cuda.jit('void(float32[:])')(atomic_add_float_wrap)
644
+ add_float_wrap = cuda.jit("void(float32[:])")(atomic_add_float_wrap)
523
645
  add_float_wrap[1, 32](ary_wrap)
524
646
 
525
647
  gold = np.zeros(32, dtype=np.uint32)
@@ -534,10 +656,10 @@ class TestCudaAtomics(CUDATestCase):
534
656
  ary_wrap = ary.copy()
535
657
  orig = ary.copy()
536
658
 
537
- cuda_atomic_add2 = cuda.jit('void(float32[:,:])')(atomic_add_float_2)
659
+ cuda_atomic_add2 = cuda.jit("void(float32[:,:])")(atomic_add_float_2)
538
660
  cuda_atomic_add2[1, (4, 8)](ary)
539
661
 
540
- cuda_func_wrap = cuda.jit('void(float32[:,:])')(atomic_add_float_2_wrap)
662
+ cuda_func_wrap = cuda.jit("void(float32[:,:])")(atomic_add_float_2_wrap)
541
663
  cuda_func_wrap[1, (4, 8)](ary_wrap)
542
664
 
543
665
  self.assertTrue(np.all(ary == orig + 1))
@@ -546,7 +668,7 @@ class TestCudaAtomics(CUDATestCase):
546
668
  def test_atomic_add_float_3(self):
547
669
  ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
548
670
  orig = ary.copy()
549
- cuda_atomic_add3 = cuda.jit('void(float32[:,:])')(atomic_add_float_3)
671
+ cuda_atomic_add3 = cuda.jit("void(float32[:,:])")(atomic_add_float_3)
550
672
  cuda_atomic_add3[1, (4, 8)](ary)
551
673
 
552
674
  self.assertTrue(np.all(ary == orig + 1))
@@ -561,24 +683,24 @@ class TestCudaAtomics(CUDATestCase):
561
683
  inst = "(red|atom)"
562
684
 
563
685
  if shared:
564
- inst = f'{inst}\\.shared'
686
+ inst = f"{inst}\\.shared"
565
687
 
566
- self.assertRegex(asm, f'{inst}.add.f64', asm)
688
+ self.assertRegex(asm, f"{inst}.add.f64", asm)
567
689
  else:
568
690
  if shared:
569
- self.assertIn('atom.shared.cas.b64', asm)
691
+ self.assertIn("atom.shared.cas.b64", asm)
570
692
  else:
571
- self.assertIn('atom.cas.b64', asm)
693
+ self.assertIn("atom.cas.b64", asm)
572
694
 
573
695
  def test_atomic_add_double(self):
574
696
  idx = np.random.randint(0, 32, size=32, dtype=np.int64)
575
697
  ary = np.zeros(32, np.float64)
576
698
  ary_wrap = ary.copy()
577
699
 
578
- cuda_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double)
700
+ cuda_fn = cuda.jit("void(int64[:], float64[:])")(atomic_add_double)
579
701
  cuda_fn[1, 32](idx, ary)
580
702
 
581
- wrap_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double_wrap)
703
+ wrap_fn = cuda.jit("void(int64[:], float64[:])")(atomic_add_double_wrap)
582
704
  wrap_fn[1, 32](idx, ary_wrap)
583
705
 
584
706
  gold = np.zeros(32, dtype=np.uint32)
@@ -595,10 +717,10 @@ class TestCudaAtomics(CUDATestCase):
595
717
  ary_wrap = ary.copy()
596
718
  orig = ary.copy()
597
719
 
598
- cuda_fn = cuda.jit('void(float64[:,:])')(atomic_add_double_2)
720
+ cuda_fn = cuda.jit("void(float64[:,:])")(atomic_add_double_2)
599
721
  cuda_fn[1, (4, 8)](ary)
600
722
 
601
- cuda_fn_wrap = cuda.jit('void(float64[:,:])')(atomic_add_double_2_wrap)
723
+ cuda_fn_wrap = cuda.jit("void(float64[:,:])")(atomic_add_double_2_wrap)
602
724
  cuda_fn_wrap[1, (4, 8)](ary_wrap)
603
725
 
604
726
  np.testing.assert_equal(ary, orig + 1)
@@ -609,7 +731,7 @@ class TestCudaAtomics(CUDATestCase):
609
731
  def test_atomic_add_double_3(self):
610
732
  ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
611
733
  orig = ary.copy()
612
- cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_3)
734
+ cuda_func = cuda.jit("void(float64[:,:])")(atomic_add_double_3)
613
735
  cuda_func[1, (4, 8)](ary)
614
736
 
615
737
  np.testing.assert_equal(ary, orig + 1)
@@ -620,7 +742,7 @@ class TestCudaAtomics(CUDATestCase):
620
742
  ary = np.zeros(32, np.float64)
621
743
  ary_wrap = ary.copy()
622
744
 
623
- sig = 'void(int64[:], float64[:])'
745
+ sig = "void(int64[:], float64[:])"
624
746
  cuda_func = cuda.jit(sig)(atomic_add_double_global)
625
747
  wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_wrap)
626
748
 
@@ -641,7 +763,7 @@ class TestCudaAtomics(CUDATestCase):
641
763
  ary_wrap = ary.copy()
642
764
  orig = ary.copy()
643
765
 
644
- sig = 'void(float64[:,:])'
766
+ sig = "void(float64[:,:])"
645
767
  cuda_func = cuda.jit(sig)(atomic_add_double_global_2)
646
768
  wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_2_wrap)
647
769
 
@@ -656,7 +778,7 @@ class TestCudaAtomics(CUDATestCase):
656
778
  def test_atomic_add_double_global_3(self):
657
779
  ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
658
780
  orig = ary.copy()
659
- cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_global_3)
781
+ cuda_func = cuda.jit("void(float64[:,:])")(atomic_add_double_global_3)
660
782
  cuda_func[1, (4, 8)](ary)
661
783
 
662
784
  np.testing.assert_equal(ary, orig + 1)
@@ -665,7 +787,7 @@ class TestCudaAtomics(CUDATestCase):
665
787
  def test_atomic_sub(self):
666
788
  ary = np.random.randint(0, 32, size=32).astype(np.uint32)
667
789
  orig = ary.copy()
668
- cuda_atomic_sub = cuda.jit('void(uint32[:])')(atomic_sub)
790
+ cuda_atomic_sub = cuda.jit("void(uint32[:])")(atomic_sub)
669
791
  cuda_atomic_sub[1, 32](ary)
670
792
 
671
793
  gold = np.zeros(32, dtype=np.uint32)
@@ -677,21 +799,21 @@ class TestCudaAtomics(CUDATestCase):
677
799
  def test_atomic_sub2(self):
678
800
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
679
801
  orig = ary.copy()
680
- cuda_atomic_sub2 = cuda.jit('void(uint32[:,:])')(atomic_sub2)
802
+ cuda_atomic_sub2 = cuda.jit("void(uint32[:,:])")(atomic_sub2)
681
803
  cuda_atomic_sub2[1, (4, 8)](ary)
682
804
  self.assertTrue(np.all(ary == orig - 1))
683
805
 
684
806
  def test_atomic_sub3(self):
685
807
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
686
808
  orig = ary.copy()
687
- cuda_atomic_sub3 = cuda.jit('void(uint32[:,:])')(atomic_sub3)
809
+ cuda_atomic_sub3 = cuda.jit("void(uint32[:,:])")(atomic_sub3)
688
810
  cuda_atomic_sub3[1, (4, 8)](ary)
689
811
  self.assertTrue(np.all(ary == orig - 1))
690
812
 
691
813
  def test_atomic_sub_float(self):
692
814
  ary = np.random.randint(0, 32, size=32).astype(np.float32)
693
815
  orig = ary.copy().astype(np.intp)
694
- cuda_atomic_sub_float = cuda.jit('void(float32[:])')(atomic_sub_float)
816
+ cuda_atomic_sub_float = cuda.jit("void(float32[:])")(atomic_sub_float)
695
817
  cuda_atomic_sub_float[1, 32](ary)
696
818
 
697
819
  gold = np.zeros(32, dtype=np.float32)
@@ -703,21 +825,21 @@ class TestCudaAtomics(CUDATestCase):
703
825
  def test_atomic_sub_float_2(self):
704
826
  ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
705
827
  orig = ary.copy()
706
- cuda_atomic_sub2 = cuda.jit('void(float32[:,:])')(atomic_sub_float_2)
828
+ cuda_atomic_sub2 = cuda.jit("void(float32[:,:])")(atomic_sub_float_2)
707
829
  cuda_atomic_sub2[1, (4, 8)](ary)
708
830
  self.assertTrue(np.all(ary == orig - 1))
709
831
 
710
832
  def test_atomic_sub_float_3(self):
711
833
  ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
712
834
  orig = ary.copy()
713
- cuda_atomic_sub3 = cuda.jit('void(float32[:,:])')(atomic_sub_float_3)
835
+ cuda_atomic_sub3 = cuda.jit("void(float32[:,:])")(atomic_sub_float_3)
714
836
  cuda_atomic_sub3[1, (4, 8)](ary)
715
837
  self.assertTrue(np.all(ary == orig - 1))
716
838
 
717
839
  def test_atomic_sub_double(self):
718
840
  idx = np.random.randint(0, 32, size=32, dtype=np.int64)
719
841
  ary = np.zeros(32, np.float64)
720
- cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_sub_double)
842
+ cuda_func = cuda.jit("void(int64[:], float64[:])")(atomic_sub_double)
721
843
  cuda_func[1, 32](idx, ary)
722
844
 
723
845
  gold = np.zeros(32, dtype=np.float64)
@@ -729,21 +851,21 @@ class TestCudaAtomics(CUDATestCase):
729
851
  def test_atomic_sub_double_2(self):
730
852
  ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
731
853
  orig = ary.copy()
732
- cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_2)
854
+ cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_2)
733
855
  cuda_func[1, (4, 8)](ary)
734
856
  np.testing.assert_equal(ary, orig - 1)
735
857
 
736
858
  def test_atomic_sub_double_3(self):
737
859
  ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
738
860
  orig = ary.copy()
739
- cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_3)
861
+ cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_3)
740
862
  cuda_func[1, (4, 8)](ary)
741
863
  np.testing.assert_equal(ary, orig - 1)
742
864
 
743
865
  def test_atomic_sub_double_global(self):
744
866
  idx = np.random.randint(0, 32, size=32, dtype=np.int64)
745
867
  ary = np.zeros(32, np.float64)
746
- sig = 'void(int64[:], float64[:])'
868
+ sig = "void(int64[:], float64[:])"
747
869
  cuda_func = cuda.jit(sig)(atomic_sub_double_global)
748
870
  cuda_func[1, 32](idx, ary)
749
871
 
@@ -756,14 +878,14 @@ class TestCudaAtomics(CUDATestCase):
756
878
  def test_atomic_sub_double_global_2(self):
757
879
  ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
758
880
  orig = ary.copy()
759
- cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_2)
881
+ cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_global_2)
760
882
  cuda_func[1, (4, 8)](ary)
761
883
  np.testing.assert_equal(ary, orig - 1)
762
884
 
763
885
  def test_atomic_sub_double_global_3(self):
764
886
  ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
765
887
  orig = ary.copy()
766
- cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_3)
888
+ cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_global_3)
767
889
  cuda_func[1, (4, 8)](ary)
768
890
  np.testing.assert_equal(ary, orig - 1)
769
891
 
@@ -771,7 +893,7 @@ class TestCudaAtomics(CUDATestCase):
771
893
  rand_const = np.random.randint(500)
772
894
  ary = np.random.randint(0, 32, size=32).astype(np.uint32)
773
895
  orig = ary.copy()
774
- cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_and)
896
+ cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_and)
775
897
  cuda_func[1, 32](ary, rand_const)
776
898
 
777
899
  gold = ary.copy()
@@ -784,7 +906,7 @@ class TestCudaAtomics(CUDATestCase):
784
906
  rand_const = np.random.randint(500)
785
907
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
786
908
  orig = ary.copy()
787
- cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and2)
909
+ cuda_atomic_and2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_and2)
788
910
  cuda_atomic_and2[1, (4, 8)](ary, rand_const)
789
911
  self.assertTrue(np.all(ary == orig & rand_const))
790
912
 
@@ -792,7 +914,7 @@ class TestCudaAtomics(CUDATestCase):
792
914
  rand_const = np.random.randint(500)
793
915
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
794
916
  orig = ary.copy()
795
- cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and3)
917
+ cuda_atomic_and3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_and3)
796
918
  cuda_atomic_and3[1, (4, 8)](ary, rand_const)
797
919
  self.assertTrue(np.all(ary == orig & rand_const))
798
920
 
@@ -800,7 +922,7 @@ class TestCudaAtomics(CUDATestCase):
800
922
  rand_const = np.random.randint(500)
801
923
  idx = np.random.randint(0, 32, size=32, dtype=np.int32)
802
924
  ary = np.random.randint(0, 32, size=32, dtype=np.int32)
803
- sig = 'void(int32[:], int32[:], int32)'
925
+ sig = "void(int32[:], int32[:], int32)"
804
926
  cuda_func = cuda.jit(sig)(atomic_and_global)
805
927
  cuda_func[1, 32](idx, ary, rand_const)
806
928
 
@@ -814,7 +936,7 @@ class TestCudaAtomics(CUDATestCase):
814
936
  rand_const = np.random.randint(500)
815
937
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
816
938
  orig = ary.copy()
817
- cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_and_global_2)
939
+ cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_and_global_2)
818
940
  cuda_func[1, (4, 8)](ary, rand_const)
819
941
  np.testing.assert_equal(ary, orig & rand_const)
820
942
 
@@ -822,7 +944,7 @@ class TestCudaAtomics(CUDATestCase):
822
944
  rand_const = np.random.randint(500)
823
945
  ary = np.random.randint(0, 32, size=32).astype(np.uint32)
824
946
  orig = ary.copy()
825
- cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_or)
947
+ cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_or)
826
948
  cuda_func[1, 32](ary, rand_const)
827
949
 
828
950
  gold = np.zeros(32, dtype=np.uint32)
@@ -835,7 +957,7 @@ class TestCudaAtomics(CUDATestCase):
835
957
  rand_const = np.random.randint(500)
836
958
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
837
959
  orig = ary.copy()
838
- cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or2)
960
+ cuda_atomic_and2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_or2)
839
961
  cuda_atomic_and2[1, (4, 8)](ary, rand_const)
840
962
  self.assertTrue(np.all(ary == orig | rand_const))
841
963
 
@@ -843,7 +965,7 @@ class TestCudaAtomics(CUDATestCase):
843
965
  rand_const = np.random.randint(500)
844
966
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
845
967
  orig = ary.copy()
846
- cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or3)
968
+ cuda_atomic_and3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_or3)
847
969
  cuda_atomic_and3[1, (4, 8)](ary, rand_const)
848
970
  self.assertTrue(np.all(ary == orig | rand_const))
849
971
 
@@ -851,7 +973,7 @@ class TestCudaAtomics(CUDATestCase):
851
973
  rand_const = np.random.randint(500)
852
974
  idx = np.random.randint(0, 32, size=32, dtype=np.int32)
853
975
  ary = np.random.randint(0, 32, size=32, dtype=np.int32)
854
- sig = 'void(int32[:], int32[:], int32)'
976
+ sig = "void(int32[:], int32[:], int32)"
855
977
  cuda_func = cuda.jit(sig)(atomic_or_global)
856
978
  cuda_func[1, 32](idx, ary, rand_const)
857
979
 
@@ -865,7 +987,7 @@ class TestCudaAtomics(CUDATestCase):
865
987
  rand_const = np.random.randint(500)
866
988
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
867
989
  orig = ary.copy()
868
- cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_or_global_2)
990
+ cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_or_global_2)
869
991
  cuda_func[1, (4, 8)](ary, rand_const)
870
992
  np.testing.assert_equal(ary, orig | rand_const)
871
993
 
@@ -873,7 +995,7 @@ class TestCudaAtomics(CUDATestCase):
873
995
  rand_const = np.random.randint(500)
874
996
  ary = np.random.randint(0, 32, size=32).astype(np.uint32)
875
997
  orig = ary.copy()
876
- cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_xor)
998
+ cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_xor)
877
999
  cuda_func[1, 32](ary, rand_const)
878
1000
 
879
1001
  gold = np.zeros(32, dtype=np.uint32)
@@ -886,7 +1008,7 @@ class TestCudaAtomics(CUDATestCase):
886
1008
  rand_const = np.random.randint(500)
887
1009
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
888
1010
  orig = ary.copy()
889
- cuda_atomic_xor2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor2)
1011
+ cuda_atomic_xor2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor2)
890
1012
  cuda_atomic_xor2[1, (4, 8)](ary, rand_const)
891
1013
  self.assertTrue(np.all(ary == orig ^ rand_const))
892
1014
 
@@ -894,7 +1016,7 @@ class TestCudaAtomics(CUDATestCase):
894
1016
  rand_const = np.random.randint(500)
895
1017
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
896
1018
  orig = ary.copy()
897
- cuda_atomic_xor3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor3)
1019
+ cuda_atomic_xor3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor3)
898
1020
  cuda_atomic_xor3[1, (4, 8)](ary, rand_const)
899
1021
  self.assertTrue(np.all(ary == orig ^ rand_const))
900
1022
 
@@ -903,7 +1025,7 @@ class TestCudaAtomics(CUDATestCase):
903
1025
  idx = np.random.randint(0, 32, size=32, dtype=np.int32)
904
1026
  ary = np.random.randint(0, 32, size=32, dtype=np.int32)
905
1027
  gold = ary.copy()
906
- sig = 'void(int32[:], int32[:], int32)'
1028
+ sig = "void(int32[:], int32[:], int32)"
907
1029
  cuda_func = cuda.jit(sig)(atomic_xor_global)
908
1030
  cuda_func[1, 32](idx, ary, rand_const)
909
1031
 
@@ -916,12 +1038,12 @@ class TestCudaAtomics(CUDATestCase):
916
1038
  rand_const = np.random.randint(500)
917
1039
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
918
1040
  orig = ary.copy()
919
- cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor_global_2)
1041
+ cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor_global_2)
920
1042
  cuda_func[1, (4, 8)](ary, rand_const)
921
1043
  np.testing.assert_equal(ary, orig ^ rand_const)
922
1044
 
923
1045
  def inc_dec_1dim_setup(self, dtype):
924
- rconst = np.random.randint(32, dtype=dtype)
1046
+ rconst = np.random.randint(32, dtype=dtype)
925
1047
  rary = np.random.randint(0, 32, size=32).astype(dtype)
926
1048
  ary_idx = np.arange(32, dtype=dtype)
927
1049
  return rconst, rary, ary_idx
@@ -951,131 +1073,141 @@ class TestCudaAtomics(CUDATestCase):
951
1073
 
952
1074
  def test_atomic_inc_32(self):
953
1075
  rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
954
- sig = 'void(uint32[:], uint32[:], uint32)'
1076
+ sig = "void(uint32[:], uint32[:], uint32)"
955
1077
  self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc32)
956
1078
 
957
1079
  def test_atomic_inc_64(self):
958
1080
  rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
959
- sig = 'void(uint64[:], uint64[:], uint64)'
1081
+ sig = "void(uint64[:], uint64[:], uint64)"
960
1082
  self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc64)
961
1083
 
962
1084
  def test_atomic_inc2_32(self):
963
1085
  rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
964
- sig = 'void(uint32[:,:], uint32)'
965
- self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_32)
1086
+ sig = "void(uint32[:,:], uint32)"
1087
+ self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc2_32)
966
1088
 
967
1089
  def test_atomic_inc2_64(self):
968
1090
  rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
969
- sig = 'void(uint64[:,:], uint64)'
970
- self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_64)
1091
+ sig = "void(uint64[:,:], uint64)"
1092
+ self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc2_64)
971
1093
 
972
1094
  def test_atomic_inc3(self):
973
1095
  rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
974
- sig = 'void(uint32[:,:], uint32)'
975
- self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc3)
1096
+ sig = "void(uint32[:,:], uint32)"
1097
+ self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc3)
976
1098
 
977
1099
  def test_atomic_inc_global_32(self):
978
1100
  rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
979
- sig = 'void(uint32[:], uint32[:], uint32)'
980
- self.check_inc_index2(ary, idx, rand_const, sig, 1, 32,
981
- atomic_inc_global)
1101
+ sig = "void(uint32[:], uint32[:], uint32)"
1102
+ self.check_inc_index2(
1103
+ ary, idx, rand_const, sig, 1, 32, atomic_inc_global
1104
+ )
982
1105
 
983
1106
  def test_atomic_inc_global_64(self):
984
1107
  rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
985
- sig = 'void(uint64[:], uint64[:], uint64)'
986
- self.check_inc_index2(ary, idx, rand_const, sig, 1, 32,
987
- atomic_inc_global)
1108
+ sig = "void(uint64[:], uint64[:], uint64)"
1109
+ self.check_inc_index2(
1110
+ ary, idx, rand_const, sig, 1, 32, atomic_inc_global
1111
+ )
988
1112
 
989
1113
  def test_atomic_inc_global_2_32(self):
990
1114
  rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
991
- sig = 'void(uint32[:,:], uint32)'
992
- self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2)
1115
+ sig = "void(uint32[:,:], uint32)"
1116
+ self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc_global_2)
993
1117
 
994
1118
  def test_atomic_inc_global_2_64(self):
995
1119
  rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
996
- sig = 'void(uint64[:,:], uint64)'
997
- self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2)
1120
+ sig = "void(uint64[:,:], uint64)"
1121
+ self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc_global_2)
998
1122
 
999
1123
  def check_dec_index(self, ary, idx, rconst, sig, nblocks, blksize, func):
1000
1124
  orig = ary.copy()
1001
1125
  cuda_func = cuda.jit(sig)(func)
1002
1126
  cuda_func[nblocks, blksize](ary, idx, rconst)
1003
- np.testing.assert_equal(ary, np.where(orig == 0, rconst,
1004
- np.where(orig > rconst,
1005
- rconst,
1006
- orig - 1)))
1127
+ np.testing.assert_equal(
1128
+ ary,
1129
+ np.where(
1130
+ orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1)
1131
+ ),
1132
+ )
1007
1133
 
1008
1134
  def check_dec_index2(self, ary, idx, rconst, sig, nblocks, blksize, func):
1009
1135
  orig = ary.copy()
1010
1136
  cuda_func = cuda.jit(sig)(func)
1011
1137
  cuda_func[nblocks, blksize](idx, ary, rconst)
1012
- np.testing.assert_equal(ary, np.where(orig == 0, rconst,
1013
- np.where(orig > rconst,
1014
- rconst,
1015
- orig - 1)))
1138
+ np.testing.assert_equal(
1139
+ ary,
1140
+ np.where(
1141
+ orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1)
1142
+ ),
1143
+ )
1016
1144
 
1017
1145
  def check_dec(self, ary, rconst, sig, nblocks, blksize, func):
1018
1146
  orig = ary.copy()
1019
1147
  cuda_func = cuda.jit(sig)(func)
1020
1148
  cuda_func[nblocks, blksize](ary, rconst)
1021
- np.testing.assert_equal(ary, np.where(orig == 0, rconst,
1022
- np.where(orig > rconst,
1023
- rconst,
1024
- orig - 1)))
1149
+ np.testing.assert_equal(
1150
+ ary,
1151
+ np.where(
1152
+ orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1)
1153
+ ),
1154
+ )
1025
1155
 
1026
1156
  def test_atomic_dec_32(self):
1027
1157
  rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
1028
- sig = 'void(uint32[:], uint32[:], uint32)'
1158
+ sig = "void(uint32[:], uint32[:], uint32)"
1029
1159
  self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec32)
1030
1160
 
1031
1161
  def test_atomic_dec_64(self):
1032
1162
  rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
1033
- sig = 'void(uint64[:], uint64[:], uint64)'
1163
+ sig = "void(uint64[:], uint64[:], uint64)"
1034
1164
  self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec64)
1035
1165
 
1036
1166
  def test_atomic_dec2_32(self):
1037
1167
  rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
1038
- sig = 'void(uint32[:,:], uint32)'
1039
- self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_32)
1168
+ sig = "void(uint32[:,:], uint32)"
1169
+ self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec2_32)
1040
1170
 
1041
1171
  def test_atomic_dec2_64(self):
1042
1172
  rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
1043
- sig = 'void(uint64[:,:], uint64)'
1044
- self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_64)
1173
+ sig = "void(uint64[:,:], uint64)"
1174
+ self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec2_64)
1045
1175
 
1046
1176
  def test_atomic_dec3_new(self):
1047
1177
  rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
1048
- sig = 'void(uint32[:,:], uint32)'
1049
- self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec3)
1178
+ sig = "void(uint32[:,:], uint32)"
1179
+ self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec3)
1050
1180
 
1051
1181
  def test_atomic_dec_global_32(self):
1052
1182
  rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
1053
- sig = 'void(uint32[:], uint32[:], uint32)'
1054
- self.check_dec_index2(ary, idx, rand_const, sig, 1, 32,
1055
- atomic_dec_global)
1183
+ sig = "void(uint32[:], uint32[:], uint32)"
1184
+ self.check_dec_index2(
1185
+ ary, idx, rand_const, sig, 1, 32, atomic_dec_global
1186
+ )
1056
1187
 
1057
1188
  def test_atomic_dec_global_64(self):
1058
1189
  rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
1059
- sig = 'void(uint64[:], uint64[:], uint64)'
1060
- self.check_dec_index2(ary, idx, rand_const, sig, 1, 32,
1061
- atomic_dec_global)
1190
+ sig = "void(uint64[:], uint64[:], uint64)"
1191
+ self.check_dec_index2(
1192
+ ary, idx, rand_const, sig, 1, 32, atomic_dec_global
1193
+ )
1062
1194
 
1063
1195
  def test_atomic_dec_global2_32(self):
1064
1196
  rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
1065
- sig = 'void(uint32[:,:], uint32)'
1066
- self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2)
1197
+ sig = "void(uint32[:,:], uint32)"
1198
+ self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec_global_2)
1067
1199
 
1068
1200
  def test_atomic_dec_global2_64(self):
1069
1201
  rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
1070
- sig = 'void(uint64[:,:], uint64)'
1071
- self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2)
1202
+ sig = "void(uint64[:,:], uint64)"
1203
+ self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec_global_2)
1072
1204
 
1073
1205
  def test_atomic_exch(self):
1074
1206
  rand_const = np.random.randint(50, 100, dtype=np.uint32)
1075
1207
  ary = np.random.randint(0, 32, size=32).astype(np.uint32)
1076
1208
  idx = np.arange(32, dtype=np.uint32)
1077
1209
 
1078
- cuda_func = cuda.jit('void(uint32[:], uint32[:], uint32)')(atomic_exch)
1210
+ cuda_func = cuda.jit("void(uint32[:], uint32[:], uint32)")(atomic_exch)
1079
1211
  cuda_func[1, 32](ary, idx, rand_const)
1080
1212
 
1081
1213
  np.testing.assert_equal(ary, rand_const)
@@ -1084,7 +1216,7 @@ class TestCudaAtomics(CUDATestCase):
1084
1216
  rand_const = np.random.randint(50, 100, dtype=np.uint32)
1085
1217
  ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
1086
1218
 
1087
- cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_exch2)
1219
+ cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_exch2)
1088
1220
  cuda_func[1, (4, 8)](ary, rand_const)
1089
1221
  np.testing.assert_equal(ary, rand_const)
1090
1222
 
@@ -1092,7 +1224,7 @@ class TestCudaAtomics(CUDATestCase):
1092
1224
  rand_const = np.random.randint(50, 100, dtype=np.uint64)
1093
1225
  ary = np.random.randint(0, 32, size=32).astype(np.uint64).reshape(4, 8)
1094
1226
 
1095
- cuda_func = cuda.jit('void(uint64[:,:], uint64)')(atomic_exch3)
1227
+ cuda_func = cuda.jit("void(uint64[:,:], uint64)")(atomic_exch3)
1096
1228
  cuda_func[1, (4, 8)](ary, rand_const)
1097
1229
  np.testing.assert_equal(ary, rand_const)
1098
1230
 
@@ -1101,7 +1233,7 @@ class TestCudaAtomics(CUDATestCase):
1101
1233
  idx = np.arange(32, dtype=np.uint32)
1102
1234
  ary = np.random.randint(0, 32, size=32, dtype=np.uint32)
1103
1235
 
1104
- sig = 'void(uint32[:], uint32[:], uint32)'
1236
+ sig = "void(uint32[:], uint32[:], uint32)"
1105
1237
  cuda_func = cuda.jit(sig)(atomic_exch_global)
1106
1238
  cuda_func[1, 32](idx, ary, rand_const)
1107
1239
  np.testing.assert_equal(ary, rand_const)
@@ -1135,8 +1267,9 @@ class TestCudaAtomics(CUDATestCase):
1135
1267
  def test_atomic_max_double_normalizedindex(self):
1136
1268
  vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
1137
1269
  res = np.zeros(1, np.float64)
1138
- cuda_func = cuda.jit('void(float64[:], float64[:,:])')(
1139
- atomic_max_double_normalizedindex)
1270
+ cuda_func = cuda.jit("void(float64[:], float64[:,:])")(
1271
+ atomic_max_double_normalizedindex
1272
+ )
1140
1273
  cuda_func[32, 32](res, vals)
1141
1274
 
1142
1275
  gold = np.max(vals)
@@ -1145,8 +1278,9 @@ class TestCudaAtomics(CUDATestCase):
1145
1278
  def test_atomic_max_double_oneindex(self):
1146
1279
  vals = np.random.randint(0, 128, size=32).astype(np.float64)
1147
1280
  res = np.zeros(1, np.float64)
1148
- cuda_func = cuda.jit('void(float64[:], float64[:])')(
1149
- atomic_max_double_oneindex)
1281
+ cuda_func = cuda.jit("void(float64[:], float64[:])")(
1282
+ atomic_max_double_oneindex
1283
+ )
1150
1284
  cuda_func[1, 32](res, vals)
1151
1285
 
1152
1286
  gold = np.max(vals)
@@ -1182,8 +1316,9 @@ class TestCudaAtomics(CUDATestCase):
1182
1316
  def test_atomic_min_double_normalizedindex(self):
1183
1317
  vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
1184
1318
  res = np.ones(1, np.float64) * 65535
1185
- cuda_func = cuda.jit('void(float64[:], float64[:,:])')(
1186
- atomic_min_double_normalizedindex)
1319
+ cuda_func = cuda.jit("void(float64[:], float64[:,:])")(
1320
+ atomic_min_double_normalizedindex
1321
+ )
1187
1322
  cuda_func[32, 32](res, vals)
1188
1323
 
1189
1324
  gold = np.min(vals)
@@ -1192,8 +1327,9 @@ class TestCudaAtomics(CUDATestCase):
1192
1327
  def test_atomic_min_double_oneindex(self):
1193
1328
  vals = np.random.randint(0, 128, size=32).astype(np.float64)
1194
1329
  res = np.ones(1, np.float64) * 128
1195
- cuda_func = cuda.jit('void(float64[:], float64[:])')(
1196
- atomic_min_double_oneindex)
1330
+ cuda_func = cuda.jit("void(float64[:], float64[:])")(
1331
+ atomic_min_double_oneindex
1332
+ )
1197
1333
  cuda_func[1, 32](res, vals)
1198
1334
 
1199
1335
  gold = np.min(vals)
@@ -1211,16 +1347,15 @@ class TestCudaAtomics(CUDATestCase):
1211
1347
  # the result will be ary[idx] for either of ary[idx] or val being NaN.
1212
1348
 
1213
1349
  def _test_atomic_minmax_nan_location(self, func):
1350
+ cuda_func = cuda.jit("void(float64[:], float64[:,:])")(func)
1214
1351
 
1215
- cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func)
1216
-
1217
- vals = np.random.randint(0, 128, size=(1,1)).astype(np.float64)
1352
+ vals = np.random.randint(0, 128, size=(1, 1)).astype(np.float64)
1218
1353
  res = np.zeros(1, np.float64) + np.nan
1219
1354
  cuda_func[1, 1](res, vals)
1220
1355
  np.testing.assert_equal(res, [np.nan])
1221
1356
 
1222
1357
  def _test_atomic_minmax_nan_val(self, func):
1223
- cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func)
1358
+ cuda_func = cuda.jit("void(float64[:], float64[:,:])")(func)
1224
1359
 
1225
1360
  res = np.random.randint(0, 128, size=1).astype(np.float64)
1226
1361
  gold = res.copy()
@@ -1244,7 +1379,7 @@ class TestCudaAtomics(CUDATestCase):
1244
1379
  def test_atomic_max_double_shared(self):
1245
1380
  vals = np.random.randint(0, 32, size=32).astype(np.float64)
1246
1381
  res = np.zeros(1, np.float64)
1247
- sig = 'void(float64[:], float64[:])'
1382
+ sig = "void(float64[:], float64[:])"
1248
1383
  cuda_func = cuda.jit(sig)(atomic_max_double_shared)
1249
1384
  cuda_func[1, 32](res, vals)
1250
1385
 
@@ -1254,7 +1389,7 @@ class TestCudaAtomics(CUDATestCase):
1254
1389
  def test_atomic_min_double_shared(self):
1255
1390
  vals = np.random.randint(0, 32, size=32).astype(np.float64)
1256
1391
  res = np.ones(1, np.float64) * 32
1257
- sig = 'void(float64[:], float64[:])'
1392
+ sig = "void(float64[:], float64[:])"
1258
1393
  cuda_func = cuda.jit(sig)(atomic_min_double_shared)
1259
1394
  cuda_func[1, 32](res, vals)
1260
1395
 
@@ -1289,64 +1424,120 @@ class TestCudaAtomics(CUDATestCase):
1289
1424
  np.testing.assert_array_equal(expect_out, out)
1290
1425
 
1291
1426
  def test_atomic_compare_and_swap(self):
1292
- self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
1293
- cas_func=atomic_compare_and_swap)
1427
+ self.check_cas(
1428
+ n=100,
1429
+ fill=-99,
1430
+ unfill=-1,
1431
+ dtype=np.int32,
1432
+ cas_func=atomic_compare_and_swap,
1433
+ )
1294
1434
 
1295
1435
  def test_atomic_compare_and_swap2(self):
1296
- self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
1297
- cas_func=atomic_compare_and_swap)
1436
+ self.check_cas(
1437
+ n=100,
1438
+ fill=-45,
1439
+ unfill=-1,
1440
+ dtype=np.int64,
1441
+ cas_func=atomic_compare_and_swap,
1442
+ )
1298
1443
 
1299
1444
  def test_atomic_compare_and_swap3(self):
1300
1445
  rfill = np.random.randint(50, 500, dtype=np.uint32)
1301
1446
  runfill = np.random.randint(1, 25, dtype=np.uint32)
1302
- self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
1303
- cas_func=atomic_compare_and_swap)
1447
+ self.check_cas(
1448
+ n=100,
1449
+ fill=rfill,
1450
+ unfill=runfill,
1451
+ dtype=np.uint32,
1452
+ cas_func=atomic_compare_and_swap,
1453
+ )
1304
1454
 
1305
1455
  def test_atomic_compare_and_swap4(self):
1306
1456
  rfill = np.random.randint(50, 500, dtype=np.uint64)
1307
1457
  runfill = np.random.randint(1, 25, dtype=np.uint64)
1308
- self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
1309
- cas_func=atomic_compare_and_swap)
1458
+ self.check_cas(
1459
+ n=100,
1460
+ fill=rfill,
1461
+ unfill=runfill,
1462
+ dtype=np.uint64,
1463
+ cas_func=atomic_compare_and_swap,
1464
+ )
1310
1465
 
1311
1466
  def test_atomic_cas_1dim(self):
1312
- self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
1313
- cas_func=atomic_cas_1dim)
1467
+ self.check_cas(
1468
+ n=100, fill=-99, unfill=-1, dtype=np.int32, cas_func=atomic_cas_1dim
1469
+ )
1314
1470
 
1315
1471
  def test_atomic_cas_2dim(self):
1316
- self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
1317
- cas_func=atomic_cas_2dim, ndim=2)
1472
+ self.check_cas(
1473
+ n=100,
1474
+ fill=-99,
1475
+ unfill=-1,
1476
+ dtype=np.int32,
1477
+ cas_func=atomic_cas_2dim,
1478
+ ndim=2,
1479
+ )
1318
1480
 
1319
1481
  def test_atomic_cas2_1dim(self):
1320
- self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
1321
- cas_func=atomic_cas_1dim)
1482
+ self.check_cas(
1483
+ n=100, fill=-45, unfill=-1, dtype=np.int64, cas_func=atomic_cas_1dim
1484
+ )
1322
1485
 
1323
1486
  def test_atomic_cas2_2dim(self):
1324
- self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
1325
- cas_func=atomic_cas_2dim, ndim=2)
1487
+ self.check_cas(
1488
+ n=100,
1489
+ fill=-45,
1490
+ unfill=-1,
1491
+ dtype=np.int64,
1492
+ cas_func=atomic_cas_2dim,
1493
+ ndim=2,
1494
+ )
1326
1495
 
1327
1496
  def test_atomic_cas3_1dim(self):
1328
1497
  rfill = np.random.randint(50, 500, dtype=np.uint32)
1329
1498
  runfill = np.random.randint(1, 25, dtype=np.uint32)
1330
- self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
1331
- cas_func=atomic_cas_1dim)
1499
+ self.check_cas(
1500
+ n=100,
1501
+ fill=rfill,
1502
+ unfill=runfill,
1503
+ dtype=np.uint32,
1504
+ cas_func=atomic_cas_1dim,
1505
+ )
1332
1506
 
1333
1507
  def test_atomic_cas3_2dim(self):
1334
1508
  rfill = np.random.randint(50, 500, dtype=np.uint32)
1335
1509
  runfill = np.random.randint(1, 25, dtype=np.uint32)
1336
- self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
1337
- cas_func=atomic_cas_2dim, ndim=2)
1510
+ self.check_cas(
1511
+ n=100,
1512
+ fill=rfill,
1513
+ unfill=runfill,
1514
+ dtype=np.uint32,
1515
+ cas_func=atomic_cas_2dim,
1516
+ ndim=2,
1517
+ )
1338
1518
 
1339
1519
  def test_atomic_cas4_1dim(self):
1340
1520
  rfill = np.random.randint(50, 500, dtype=np.uint64)
1341
1521
  runfill = np.random.randint(1, 25, dtype=np.uint64)
1342
- self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
1343
- cas_func=atomic_cas_1dim)
1522
+ self.check_cas(
1523
+ n=100,
1524
+ fill=rfill,
1525
+ unfill=runfill,
1526
+ dtype=np.uint64,
1527
+ cas_func=atomic_cas_1dim,
1528
+ )
1344
1529
 
1345
1530
  def test_atomic_cas4_2dim(self):
1346
1531
  rfill = np.random.randint(50, 500, dtype=np.uint64)
1347
1532
  runfill = np.random.randint(1, 25, dtype=np.uint64)
1348
- self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
1349
- cas_func=atomic_cas_2dim, ndim=2)
1533
+ self.check_cas(
1534
+ n=100,
1535
+ fill=rfill,
1536
+ unfill=runfill,
1537
+ dtype=np.uint64,
1538
+ cas_func=atomic_cas_2dim,
1539
+ ndim=2,
1540
+ )
1350
1541
 
1351
1542
  # Tests that the atomic add, min, and max operations return the old value -
1352
1543
  # in the simulator, they did not (see Issue #5458). The max and min have
@@ -1438,34 +1629,36 @@ class TestCudaAtomics(CUDATestCase):
1438
1629
  np.testing.assert_equal(res, gold)
1439
1630
 
1440
1631
  def test_atomic_nanmax_int32(self):
1441
- self.check_atomic_nanmax(dtype=np.int32, lo=-65535, hi=65535,
1442
- init_val=0)
1632
+ self.check_atomic_nanmax(
1633
+ dtype=np.int32, lo=-65535, hi=65535, init_val=0
1634
+ )
1443
1635
 
1444
1636
  def test_atomic_nanmax_uint32(self):
1445
- self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535,
1446
- init_val=0)
1637
+ self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535, init_val=0)
1447
1638
 
1448
1639
  def test_atomic_nanmax_int64(self):
1449
- self.check_atomic_nanmax(dtype=np.int64, lo=-65535, hi=65535,
1450
- init_val=0)
1640
+ self.check_atomic_nanmax(
1641
+ dtype=np.int64, lo=-65535, hi=65535, init_val=0
1642
+ )
1451
1643
 
1452
1644
  def test_atomic_nanmax_uint64(self):
1453
- self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535,
1454
- init_val=0)
1645
+ self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535, init_val=0)
1455
1646
 
1456
1647
  def test_atomic_nanmax_float32(self):
1457
- self.check_atomic_nanmax(dtype=np.float32, lo=-65535, hi=65535,
1458
- init_val=np.nan)
1648
+ self.check_atomic_nanmax(
1649
+ dtype=np.float32, lo=-65535, hi=65535, init_val=np.nan
1650
+ )
1459
1651
 
1460
1652
  def test_atomic_nanmax_double(self):
1461
- self.check_atomic_nanmax(dtype=np.float64, lo=-65535, hi=65535,
1462
- init_val=np.nan)
1653
+ self.check_atomic_nanmax(
1654
+ dtype=np.float64, lo=-65535, hi=65535, init_val=np.nan
1655
+ )
1463
1656
 
1464
1657
  def test_atomic_nanmax_double_shared(self):
1465
1658
  vals = np.random.randint(0, 32, size=32).astype(np.float64)
1466
1659
  vals[1::2] = np.nan
1467
1660
  res = np.array([0], dtype=vals.dtype)
1468
- sig = 'void(float64[:], float64[:])'
1661
+ sig = "void(float64[:], float64[:])"
1469
1662
  cuda_func = cuda.jit(sig)(atomic_nanmax_double_shared)
1470
1663
  cuda_func[1, 32](res, vals)
1471
1664
 
@@ -1476,8 +1669,9 @@ class TestCudaAtomics(CUDATestCase):
1476
1669
  vals = np.random.randint(0, 128, size=32).astype(np.float64)
1477
1670
  vals[1::2] = np.nan
1478
1671
  res = np.zeros(1, np.float64)
1479
- cuda_func = cuda.jit('void(float64[:], float64[:])')(
1480
- atomic_max_double_oneindex)
1672
+ cuda_func = cuda.jit("void(float64[:], float64[:])")(
1673
+ atomic_max_double_oneindex
1674
+ )
1481
1675
  cuda_func[1, 32](res, vals)
1482
1676
 
1483
1677
  gold = np.nanmax(vals)
@@ -1495,34 +1689,36 @@ class TestCudaAtomics(CUDATestCase):
1495
1689
  np.testing.assert_equal(res, gold)
1496
1690
 
1497
1691
  def test_atomic_nanmin_int32(self):
1498
- self.check_atomic_nanmin(dtype=np.int32, lo=-65535, hi=65535,
1499
- init_val=0)
1692
+ self.check_atomic_nanmin(
1693
+ dtype=np.int32, lo=-65535, hi=65535, init_val=0
1694
+ )
1500
1695
 
1501
1696
  def test_atomic_nanmin_uint32(self):
1502
- self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535,
1503
- init_val=0)
1697
+ self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535, init_val=0)
1504
1698
 
1505
1699
  def test_atomic_nanmin_int64(self):
1506
- self.check_atomic_nanmin(dtype=np.int64, lo=-65535, hi=65535,
1507
- init_val=0)
1700
+ self.check_atomic_nanmin(
1701
+ dtype=np.int64, lo=-65535, hi=65535, init_val=0
1702
+ )
1508
1703
 
1509
1704
  def test_atomic_nanmin_uint64(self):
1510
- self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535,
1511
- init_val=0)
1705
+ self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535, init_val=0)
1512
1706
 
1513
1707
  def test_atomic_nanmin_float(self):
1514
- self.check_atomic_nanmin(dtype=np.float32, lo=-65535, hi=65535,
1515
- init_val=np.nan)
1708
+ self.check_atomic_nanmin(
1709
+ dtype=np.float32, lo=-65535, hi=65535, init_val=np.nan
1710
+ )
1516
1711
 
1517
1712
  def test_atomic_nanmin_double(self):
1518
- self.check_atomic_nanmin(dtype=np.float64, lo=-65535, hi=65535,
1519
- init_val=np.nan)
1713
+ self.check_atomic_nanmin(
1714
+ dtype=np.float64, lo=-65535, hi=65535, init_val=np.nan
1715
+ )
1520
1716
 
1521
1717
  def test_atomic_nanmin_double_shared(self):
1522
1718
  vals = np.random.randint(0, 32, size=32).astype(np.float64)
1523
1719
  vals[1::2] = np.nan
1524
1720
  res = np.array([32], dtype=vals.dtype)
1525
- sig = 'void(float64[:], float64[:])'
1721
+ sig = "void(float64[:], float64[:])"
1526
1722
  cuda_func = cuda.jit(sig)(atomic_nanmin_double_shared)
1527
1723
  cuda_func[1, 32](res, vals)
1528
1724
 
@@ -1533,8 +1729,9 @@ class TestCudaAtomics(CUDATestCase):
1533
1729
  vals = np.random.randint(0, 128, size=32).astype(np.float64)
1534
1730
  vals[1::2] = np.nan
1535
1731
  res = np.array([128], np.float64)
1536
- cuda_func = cuda.jit('void(float64[:], float64[:])')(
1537
- atomic_min_double_oneindex)
1732
+ cuda_func = cuda.jit("void(float64[:], float64[:])")(
1733
+ atomic_min_double_oneindex
1734
+ )
1538
1735
  cuda_func[1, 32](res, vals)
1539
1736
 
1540
1737
  gold = np.nanmin(vals)
@@ -1610,5 +1807,5 @@ class TestCudaAtomics(CUDATestCase):
1610
1807
  self._test_atomic_nan_returns_old(kernel, 11)
1611
1808
 
1612
1809
 
1613
- if __name__ == '__main__':
1810
+ if __name__ == "__main__":
1614
1811
  unittest.main()