numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.0.dist-info/METADATA +0 -6
  232. numba_cuda-0.0.0.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1620 @@
1
+ import numpy as np
2
+ from textwrap import dedent
3
+
4
+ from numba import cuda, uint32, uint64, float32, float64
5
+ from numba.cuda.testing import unittest, CUDATestCase, cc_X_or_above
6
+ from numba.core import config
7
+
8
+
9
+ @cuda.jit(device=True)
10
+ def atomic_cast_to_uint64(num):
11
+ return uint64(num)
12
+
13
+
14
+ @cuda.jit(device=True)
15
+ def atomic_cast_to_int(num):
16
+ return int(num)
17
+
18
+
19
+ @cuda.jit(device=True)
20
+ def atomic_cast_none(num):
21
+ return num
22
+
23
+
24
+ @cuda.jit(device=True)
25
+ def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements,
26
+ binop_func, cast_func, initializer,
27
+ neg_idx):
28
+ tid = cuda.threadIdx.x
29
+ sm = cuda.shared.array(ary_nelements, ary_dtype)
30
+ sm[tid] = initializer
31
+ cuda.syncthreads()
32
+ bin = cast_func(idx[tid] % ary_nelements)
33
+ if neg_idx:
34
+ bin = bin % ary_nelements
35
+ binop_func(sm, bin, op2)
36
+ cuda.syncthreads()
37
+ ary[tid] = sm[tid]
38
+
39
+
40
+ @cuda.jit(device=True)
41
+ def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements,
42
+ binop_func, cast_func):
43
+ tid = cuda.threadIdx.x
44
+ sm = cuda.shared.array(ary_nelements, ary_dtype)
45
+ sm[tid] = ary[tid]
46
+ cuda.syncthreads()
47
+ bin = cast_func(idx[tid] % ary_nelements)
48
+ binop_func(sm, bin, op2)
49
+ cuda.syncthreads()
50
+ ary[tid] = sm[tid]
51
+
52
+
53
+ @cuda.jit(device=True)
54
+ def atomic_binary_2dim_shared(ary, op2, ary_dtype, ary_shape,
55
+ binop_func, y_cast_func, neg_idx):
56
+ tx = cuda.threadIdx.x
57
+ ty = cuda.threadIdx.y
58
+ sm = cuda.shared.array(ary_shape, ary_dtype)
59
+ sm[tx, ty] = ary[tx, ty]
60
+ cuda.syncthreads()
61
+ bin = (tx, y_cast_func(ty))
62
+ if neg_idx:
63
+ bin = (bin[0] % ary_shape[0], bin[1] % ary_shape[1])
64
+ binop_func(sm, bin, op2)
65
+ cuda.syncthreads()
66
+ ary[tx, ty] = sm[tx, ty]
67
+
68
+
69
+ @cuda.jit(device=True)
70
+ def atomic_binary_2dim_global(ary, op2, binop_func, y_cast_func, neg_idx):
71
+ tx = cuda.threadIdx.x
72
+ ty = cuda.threadIdx.y
73
+ bin = (tx, y_cast_func(ty))
74
+ if neg_idx:
75
+ bin = (bin[0] % ary.shape[0], bin[1] % ary.shape[1])
76
+ binop_func(ary, bin, op2)
77
+
78
+
79
+ @cuda.jit(device=True)
80
+ def atomic_binary_1dim_global(ary, idx, ary_nelements, op2,
81
+ binop_func, neg_idx):
82
+ tid = cuda.threadIdx.x
83
+ bin = int(idx[tid] % ary_nelements)
84
+ if neg_idx:
85
+ bin = bin % ary_nelements
86
+ binop_func(ary, bin, op2)
87
+
88
+
89
+ def atomic_add(ary):
90
+ atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
91
+ cuda.atomic.add, atomic_cast_none, 0, False)
92
+
93
+
94
+ def atomic_add_wrap(ary):
95
+ atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
96
+ cuda.atomic.add, atomic_cast_none, 0, True)
97
+
98
+
99
+ def atomic_add2(ary):
100
+ atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
101
+ cuda.atomic.add, atomic_cast_none, False)
102
+
103
+
104
+ def atomic_add2_wrap(ary):
105
+ atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
106
+ cuda.atomic.add, atomic_cast_none, True)
107
+
108
+
109
+ def atomic_add3(ary):
110
+ atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
111
+ cuda.atomic.add, atomic_cast_to_uint64, False)
112
+
113
+
114
+ def atomic_add_float(ary):
115
+ atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
116
+ cuda.atomic.add, atomic_cast_to_int, 0.0, False)
117
+
118
+
119
+ def atomic_add_float_wrap(ary):
120
+ atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
121
+ cuda.atomic.add, atomic_cast_to_int, 0.0, True)
122
+
123
+
124
+ def atomic_add_float_2(ary):
125
+ atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
126
+ cuda.atomic.add, atomic_cast_none, False)
127
+
128
+
129
+ def atomic_add_float_2_wrap(ary):
130
+ atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
131
+ cuda.atomic.add, atomic_cast_none, True)
132
+
133
+
134
+ def atomic_add_float_3(ary):
135
+ atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
136
+ cuda.atomic.add, atomic_cast_to_uint64, False)
137
+
138
+
139
+ def atomic_add_double_global(idx, ary):
140
+ atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.add, False)
141
+
142
+
143
+ def atomic_add_double_global_wrap(idx, ary):
144
+ atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.add, True)
145
+
146
+
147
+ def atomic_add_double_global_2(ary):
148
+ atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_none, False)
149
+
150
+
151
+ def atomic_add_double_global_2_wrap(ary):
152
+ atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_none, True)
153
+
154
+
155
+ def atomic_add_double_global_3(ary):
156
+ atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_to_uint64,
157
+ False)
158
+
159
+
160
+ def atomic_add_double(idx, ary):
161
+ atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
162
+ cuda.atomic.add, atomic_cast_none, 0.0, False)
163
+
164
+
165
+ def atomic_add_double_wrap(idx, ary):
166
+ atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
167
+ cuda.atomic.add, atomic_cast_none, 0.0, True)
168
+
169
+
170
+ def atomic_add_double_2(ary):
171
+ atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
172
+ cuda.atomic.add, atomic_cast_none, False)
173
+
174
+
175
+ def atomic_add_double_2_wrap(ary):
176
+ atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
177
+ cuda.atomic.add, atomic_cast_none, True)
178
+
179
+
180
+ def atomic_add_double_3(ary):
181
+ atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
182
+ cuda.atomic.add, atomic_cast_to_uint64, False)
183
+
184
+
185
+ def atomic_sub(ary):
186
+ atomic_binary_1dim_shared(ary, ary, 1, uint32, 32,
187
+ cuda.atomic.sub, atomic_cast_none, 0, False)
188
+
189
+
190
+ def atomic_sub2(ary):
191
+ atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
192
+ cuda.atomic.sub, atomic_cast_none, False)
193
+
194
+
195
+ def atomic_sub3(ary):
196
+ atomic_binary_2dim_shared(ary, 1, uint32, (4, 8),
197
+ cuda.atomic.sub, atomic_cast_to_uint64, False)
198
+
199
+
200
+ def atomic_sub_float(ary):
201
+ atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32,
202
+ cuda.atomic.sub, atomic_cast_to_int, 0.0, False)
203
+
204
+
205
+ def atomic_sub_float_2(ary):
206
+ atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
207
+ cuda.atomic.sub, atomic_cast_none, False)
208
+
209
+
210
+ def atomic_sub_float_3(ary):
211
+ atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8),
212
+ cuda.atomic.sub, atomic_cast_to_uint64, False)
213
+
214
+
215
+ def atomic_sub_double(idx, ary):
216
+ atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32,
217
+ cuda.atomic.sub, atomic_cast_none, 0.0, False)
218
+
219
+
220
+ def atomic_sub_double_2(ary):
221
+ atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
222
+ cuda.atomic.sub, atomic_cast_none, False)
223
+
224
+
225
+ def atomic_sub_double_3(ary):
226
+ atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
227
+ cuda.atomic.sub, atomic_cast_to_uint64, False)
228
+
229
+
230
+ def atomic_sub_double_global(idx, ary):
231
+ atomic_binary_1dim_global(ary, idx, 32, 1.0, cuda.atomic.sub, False)
232
+
233
+
234
+ def atomic_sub_double_global_2(ary):
235
+ atomic_binary_2dim_global(ary, 1.0, cuda.atomic.sub, atomic_cast_none,
236
+ False)
237
+
238
+
239
+ def atomic_sub_double_global_3(ary):
240
+ atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8),
241
+ cuda.atomic.sub, atomic_cast_to_uint64, False)
242
+
243
+
244
+ def atomic_and(ary, op2):
245
+ atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
246
+ cuda.atomic.and_, atomic_cast_none, 1, False)
247
+
248
+
249
+ def atomic_and2(ary, op2):
250
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
251
+ cuda.atomic.and_, atomic_cast_none, False)
252
+
253
+
254
+ def atomic_and3(ary, op2):
255
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
256
+ cuda.atomic.and_, atomic_cast_to_uint64, False)
257
+
258
+
259
+ def atomic_and_global(idx, ary, op2):
260
+ atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.and_, False)
261
+
262
+
263
+ def atomic_and_global_2(ary, op2):
264
+ atomic_binary_2dim_global(ary, op2, cuda.atomic.and_,
265
+ atomic_cast_none, False)
266
+
267
+
268
+ def atomic_or(ary, op2):
269
+ atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
270
+ cuda.atomic.or_, atomic_cast_none, 0, False)
271
+
272
+
273
+ def atomic_or2(ary, op2):
274
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
275
+ cuda.atomic.or_, atomic_cast_none, False)
276
+
277
+
278
+ def atomic_or3(ary, op2):
279
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
280
+ cuda.atomic.or_, atomic_cast_to_uint64, False)
281
+
282
+
283
+ def atomic_or_global(idx, ary, op2):
284
+ atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.or_, False)
285
+
286
+
287
+ def atomic_or_global_2(ary, op2):
288
+ atomic_binary_2dim_global(ary, op2, cuda.atomic.or_,
289
+ atomic_cast_none, False)
290
+
291
+
292
+ def atomic_xor(ary, op2):
293
+ atomic_binary_1dim_shared(ary, ary, op2, uint32, 32,
294
+ cuda.atomic.xor, atomic_cast_none, 0, False)
295
+
296
+
297
+ def atomic_xor2(ary, op2):
298
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
299
+ cuda.atomic.xor, atomic_cast_none, False)
300
+
301
+
302
+ def atomic_xor3(ary, op2):
303
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
304
+ cuda.atomic.xor, atomic_cast_to_uint64, False)
305
+
306
+
307
+ def atomic_xor_global(idx, ary, op2):
308
+ atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.xor, False)
309
+
310
+
311
+ def atomic_xor_global_2(ary, op2):
312
+ atomic_binary_2dim_global(ary, op2, cuda.atomic.xor,
313
+ atomic_cast_none, False)
314
+
315
+
316
+ def atomic_inc32(ary, idx, op2):
317
+ atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
318
+ cuda.atomic.inc, atomic_cast_none)
319
+
320
+
321
+ def atomic_inc64(ary, idx, op2):
322
+ atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32,
323
+ cuda.atomic.inc, atomic_cast_to_int)
324
+
325
+
326
+ def atomic_inc2_32(ary, op2):
327
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
328
+ cuda.atomic.inc, atomic_cast_none, False)
329
+
330
+
331
+ def atomic_inc2_64(ary, op2):
332
+ atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
333
+ cuda.atomic.inc, atomic_cast_none, False)
334
+
335
+
336
+ def atomic_inc3(ary, op2):
337
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
338
+ cuda.atomic.inc, atomic_cast_to_uint64, False)
339
+
340
+
341
+ def atomic_inc_global(idx, ary, op2):
342
+ atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.inc, False)
343
+
344
+
345
+ def atomic_inc_global_2(ary, op2):
346
+ atomic_binary_2dim_global(ary, op2, cuda.atomic.inc,
347
+ atomic_cast_none, False)
348
+
349
+
350
+ def atomic_dec32(ary, idx, op2):
351
+ atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
352
+ cuda.atomic.dec, atomic_cast_none)
353
+
354
+
355
+ def atomic_dec64(ary, idx, op2):
356
+ atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32,
357
+ cuda.atomic.dec, atomic_cast_to_int)
358
+
359
+
360
+ def atomic_dec2_32(ary, op2):
361
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
362
+ cuda.atomic.dec, atomic_cast_none, False)
363
+
364
+
365
+ def atomic_dec2_64(ary, op2):
366
+ atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
367
+ cuda.atomic.dec, atomic_cast_none, False)
368
+
369
+
370
+ def atomic_dec3(ary, op2):
371
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
372
+ cuda.atomic.dec, atomic_cast_to_uint64, False)
373
+
374
+
375
+ def atomic_dec_global(idx, ary, op2):
376
+ atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.dec, False)
377
+
378
+
379
+ def atomic_dec_global_2(ary, op2):
380
+ atomic_binary_2dim_global(ary, op2, cuda.atomic.dec,
381
+ atomic_cast_none, False)
382
+
383
+
384
+ def atomic_exch(ary, idx, op2):
385
+ atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32,
386
+ cuda.atomic.exch, atomic_cast_none)
387
+
388
+
389
+ def atomic_exch2(ary, op2):
390
+ atomic_binary_2dim_shared(ary, op2, uint32, (4, 8),
391
+ cuda.atomic.exch, atomic_cast_none, False)
392
+
393
+
394
+ def atomic_exch3(ary, op2):
395
+ atomic_binary_2dim_shared(ary, op2, uint64, (4, 8),
396
+ cuda.atomic.exch, atomic_cast_none, False)
397
+
398
+
399
+ def atomic_exch_global(idx, ary, op2):
400
+ atomic_binary_1dim_global(ary, idx, 32, op2, cuda.atomic.exch, False)
401
+
402
+
403
+ def gen_atomic_extreme_funcs(func):
404
+
405
+ fns = dedent("""
406
+ def atomic(res, ary):
407
+ tx = cuda.threadIdx.x
408
+ bx = cuda.blockIdx.x
409
+ {func}(res, 0, ary[tx, bx])
410
+
411
+ def atomic_double_normalizedindex(res, ary):
412
+ tx = cuda.threadIdx.x
413
+ bx = cuda.blockIdx.x
414
+ {func}(res, 0, ary[tx, uint64(bx)])
415
+
416
+ def atomic_double_oneindex(res, ary):
417
+ tx = cuda.threadIdx.x
418
+ {func}(res, 0, ary[tx])
419
+
420
+ def atomic_double_shared(res, ary):
421
+ tid = cuda.threadIdx.x
422
+ smary = cuda.shared.array(32, float64)
423
+ smary[tid] = ary[tid]
424
+ smres = cuda.shared.array(1, float64)
425
+ if tid == 0:
426
+ smres[0] = res[0]
427
+ cuda.syncthreads()
428
+ {func}(smres, 0, smary[tid])
429
+ cuda.syncthreads()
430
+ if tid == 0:
431
+ res[0] = smres[0]
432
+ """).format(func=func)
433
+ ld = {}
434
+ exec(fns, {'cuda': cuda, 'float64': float64, 'uint64': uint64}, ld)
435
+ return (ld['atomic'], ld['atomic_double_normalizedindex'],
436
+ ld['atomic_double_oneindex'], ld['atomic_double_shared'])
437
+
438
+
439
+ (atomic_max, atomic_max_double_normalizedindex, atomic_max_double_oneindex,
440
+ atomic_max_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.max')
441
+ (atomic_min, atomic_min_double_normalizedindex, atomic_min_double_oneindex,
442
+ atomic_min_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.min')
443
+ (atomic_nanmax, atomic_nanmax_double_normalizedindex,
444
+ atomic_nanmax_double_oneindex, atomic_nanmax_double_shared) = \
445
+ gen_atomic_extreme_funcs('cuda.atomic.nanmax')
446
+ (atomic_nanmin, atomic_nanmin_double_normalizedindex,
447
+ atomic_nanmin_double_oneindex, atomic_nanmin_double_shared) = \
448
+ gen_atomic_extreme_funcs('cuda.atomic.nanmin')
449
+
450
+
451
+ def atomic_compare_and_swap(res, old, ary, fill_val):
452
+ gid = cuda.grid(1)
453
+ if gid < res.size:
454
+ old[gid] = cuda.atomic.compare_and_swap(res[gid:], fill_val, ary[gid])
455
+
456
+
457
+ def atomic_cas_1dim(res, old, ary, fill_val):
458
+ gid = cuda.grid(1)
459
+ if gid < res.size:
460
+ old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])
461
+
462
+
463
+ def atomic_cas_2dim(res, old, ary, fill_val):
464
+ gid = cuda.grid(2)
465
+ if gid[0] < res.shape[0] and gid[1] < res.shape[1]:
466
+ old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])
467
+
468
+
469
+ class TestCudaAtomics(CUDATestCase):
470
+ def setUp(self):
471
+ super().setUp()
472
+ np.random.seed(0)
473
+
474
+ def test_atomic_add(self):
475
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32)
476
+ ary_wrap = ary.copy()
477
+ orig = ary.copy()
478
+
479
+ cuda_atomic_add = cuda.jit('void(uint32[:])')(atomic_add)
480
+ cuda_atomic_add[1, 32](ary)
481
+
482
+ cuda_atomic_add_wrap = cuda.jit('void(uint32[:])')(atomic_add_wrap)
483
+ cuda_atomic_add_wrap[1, 32](ary_wrap)
484
+
485
+ gold = np.zeros(32, dtype=np.uint32)
486
+ for i in range(orig.size):
487
+ gold[orig[i]] += 1
488
+
489
+ self.assertTrue(np.all(ary == gold))
490
+ self.assertTrue(np.all(ary_wrap == gold))
491
+
492
+ def test_atomic_add2(self):
493
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
494
+ ary_wrap = ary.copy()
495
+ orig = ary.copy()
496
+
497
+ cuda_atomic_add2 = cuda.jit('void(uint32[:,:])')(atomic_add2)
498
+ cuda_atomic_add2[1, (4, 8)](ary)
499
+
500
+ cuda_atomic_add2_wrap = cuda.jit('void(uint32[:,:])')(atomic_add2_wrap)
501
+ cuda_atomic_add2_wrap[1, (4, 8)](ary_wrap)
502
+
503
+ self.assertTrue(np.all(ary == orig + 1))
504
+ self.assertTrue(np.all(ary_wrap == orig + 1))
505
+
506
+ def test_atomic_add3(self):
507
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
508
+ orig = ary.copy()
509
+ cuda_atomic_add3 = cuda.jit('void(uint32[:,:])')(atomic_add3)
510
+ cuda_atomic_add3[1, (4, 8)](ary)
511
+
512
+ self.assertTrue(np.all(ary == orig + 1))
513
+
514
+ def test_atomic_add_float(self):
515
+ ary = np.random.randint(0, 32, size=32).astype(np.float32)
516
+ ary_wrap = ary.copy()
517
+ orig = ary.copy().astype(np.intp)
518
+
519
+ cuda_atomic_add_float = cuda.jit('void(float32[:])')(atomic_add_float)
520
+ cuda_atomic_add_float[1, 32](ary)
521
+
522
+ add_float_wrap = cuda.jit('void(float32[:])')(atomic_add_float_wrap)
523
+ add_float_wrap[1, 32](ary_wrap)
524
+
525
+ gold = np.zeros(32, dtype=np.uint32)
526
+ for i in range(orig.size):
527
+ gold[orig[i]] += 1.0
528
+
529
+ self.assertTrue(np.all(ary == gold))
530
+ self.assertTrue(np.all(ary_wrap == gold))
531
+
532
+ def test_atomic_add_float_2(self):
533
+ ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
534
+ ary_wrap = ary.copy()
535
+ orig = ary.copy()
536
+
537
+ cuda_atomic_add2 = cuda.jit('void(float32[:,:])')(atomic_add_float_2)
538
+ cuda_atomic_add2[1, (4, 8)](ary)
539
+
540
+ cuda_func_wrap = cuda.jit('void(float32[:,:])')(atomic_add_float_2_wrap)
541
+ cuda_func_wrap[1, (4, 8)](ary_wrap)
542
+
543
+ self.assertTrue(np.all(ary == orig + 1))
544
+ self.assertTrue(np.all(ary_wrap == orig + 1))
545
+
546
+ def test_atomic_add_float_3(self):
547
+ ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
548
+ orig = ary.copy()
549
+ cuda_atomic_add3 = cuda.jit('void(float32[:,:])')(atomic_add_float_3)
550
+ cuda_atomic_add3[1, (4, 8)](ary)
551
+
552
+ self.assertTrue(np.all(ary == orig + 1))
553
+
554
+ def assertCorrectFloat64Atomics(self, kernel, shared=True):
555
+ if config.ENABLE_CUDASIM:
556
+ return
557
+
558
+ # Use the first (and only) definition
559
+ asm = next(iter(kernel.inspect_asm().values()))
560
+ if cc_X_or_above(6, 0):
561
+ if cuda.runtime.get_version() > (12, 1):
562
+ # CUDA 12.2 and above generate a more optimized reduction
563
+ # instruction, because the result does not need to be
564
+ # placed in a register.
565
+ inst = 'red'
566
+ else:
567
+ inst = 'atom'
568
+
569
+ if shared:
570
+ inst = f'{inst}.shared'
571
+
572
+ self.assertIn(f'{inst}.add.f64', asm)
573
+ else:
574
+ if shared:
575
+ self.assertIn('atom.shared.cas.b64', asm)
576
+ else:
577
+ self.assertIn('atom.cas.b64', asm)
578
+
579
+ def test_atomic_add_double(self):
580
+ idx = np.random.randint(0, 32, size=32, dtype=np.int64)
581
+ ary = np.zeros(32, np.float64)
582
+ ary_wrap = ary.copy()
583
+
584
+ cuda_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double)
585
+ cuda_fn[1, 32](idx, ary)
586
+
587
+ wrap_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double_wrap)
588
+ wrap_fn[1, 32](idx, ary_wrap)
589
+
590
+ gold = np.zeros(32, dtype=np.uint32)
591
+ for i in range(idx.size):
592
+ gold[idx[i]] += 1.0
593
+
594
+ np.testing.assert_equal(ary, gold)
595
+ np.testing.assert_equal(ary_wrap, gold)
596
+ self.assertCorrectFloat64Atomics(cuda_fn)
597
+ self.assertCorrectFloat64Atomics(wrap_fn)
598
+
599
+ def test_atomic_add_double_2(self):
600
+ ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
601
+ ary_wrap = ary.copy()
602
+ orig = ary.copy()
603
+
604
+ cuda_fn = cuda.jit('void(float64[:,:])')(atomic_add_double_2)
605
+ cuda_fn[1, (4, 8)](ary)
606
+
607
+ cuda_fn_wrap = cuda.jit('void(float64[:,:])')(atomic_add_double_2_wrap)
608
+ cuda_fn_wrap[1, (4, 8)](ary_wrap)
609
+
610
+ np.testing.assert_equal(ary, orig + 1)
611
+ np.testing.assert_equal(ary_wrap, orig + 1)
612
+ self.assertCorrectFloat64Atomics(cuda_fn)
613
+ self.assertCorrectFloat64Atomics(cuda_fn_wrap)
614
+
615
+ def test_atomic_add_double_3(self):
616
+ ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
617
+ orig = ary.copy()
618
+ cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_3)
619
+ cuda_func[1, (4, 8)](ary)
620
+
621
+ np.testing.assert_equal(ary, orig + 1)
622
+ self.assertCorrectFloat64Atomics(cuda_func)
623
+
624
+ def test_atomic_add_double_global(self):
625
+ idx = np.random.randint(0, 32, size=32, dtype=np.int64)
626
+ ary = np.zeros(32, np.float64)
627
+ ary_wrap = ary.copy()
628
+
629
+ sig = 'void(int64[:], float64[:])'
630
+ cuda_func = cuda.jit(sig)(atomic_add_double_global)
631
+ wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_wrap)
632
+
633
+ cuda_func[1, 32](idx, ary)
634
+ wrap_cuda_func[1, 32](idx, ary_wrap)
635
+
636
+ gold = np.zeros(32, dtype=np.uint32)
637
+ for i in range(idx.size):
638
+ gold[idx[i]] += 1.0
639
+
640
+ np.testing.assert_equal(ary, gold)
641
+ np.testing.assert_equal(ary_wrap, gold)
642
+ self.assertCorrectFloat64Atomics(cuda_func, shared=False)
643
+ self.assertCorrectFloat64Atomics(wrap_cuda_func, shared=False)
644
+
645
+ def test_atomic_add_double_global_2(self):
646
+ ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
647
+ ary_wrap = ary.copy()
648
+ orig = ary.copy()
649
+
650
+ sig = 'void(float64[:,:])'
651
+ cuda_func = cuda.jit(sig)(atomic_add_double_global_2)
652
+ wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_2_wrap)
653
+
654
+ cuda_func[1, (4, 8)](ary)
655
+ wrap_cuda_func[1, (4, 8)](ary_wrap)
656
+
657
+ np.testing.assert_equal(ary, orig + 1)
658
+ np.testing.assert_equal(ary_wrap, orig + 1)
659
+ self.assertCorrectFloat64Atomics(cuda_func, shared=False)
660
+ self.assertCorrectFloat64Atomics(wrap_cuda_func, shared=False)
661
+
662
+ def test_atomic_add_double_global_3(self):
663
+ ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
664
+ orig = ary.copy()
665
+ cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_global_3)
666
+ cuda_func[1, (4, 8)](ary)
667
+
668
+ np.testing.assert_equal(ary, orig + 1)
669
+ self.assertCorrectFloat64Atomics(cuda_func, shared=False)
670
+
671
+ def test_atomic_sub(self):
672
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32)
673
+ orig = ary.copy()
674
+ cuda_atomic_sub = cuda.jit('void(uint32[:])')(atomic_sub)
675
+ cuda_atomic_sub[1, 32](ary)
676
+
677
+ gold = np.zeros(32, dtype=np.uint32)
678
+ for i in range(orig.size):
679
+ gold[orig[i]] -= 1
680
+
681
+ self.assertTrue(np.all(ary == gold))
682
+
683
+ def test_atomic_sub2(self):
684
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
685
+ orig = ary.copy()
686
+ cuda_atomic_sub2 = cuda.jit('void(uint32[:,:])')(atomic_sub2)
687
+ cuda_atomic_sub2[1, (4, 8)](ary)
688
+ self.assertTrue(np.all(ary == orig - 1))
689
+
690
+ def test_atomic_sub3(self):
691
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
692
+ orig = ary.copy()
693
+ cuda_atomic_sub3 = cuda.jit('void(uint32[:,:])')(atomic_sub3)
694
+ cuda_atomic_sub3[1, (4, 8)](ary)
695
+ self.assertTrue(np.all(ary == orig - 1))
696
+
697
+ def test_atomic_sub_float(self):
698
+ ary = np.random.randint(0, 32, size=32).astype(np.float32)
699
+ orig = ary.copy().astype(np.intp)
700
+ cuda_atomic_sub_float = cuda.jit('void(float32[:])')(atomic_sub_float)
701
+ cuda_atomic_sub_float[1, 32](ary)
702
+
703
+ gold = np.zeros(32, dtype=np.float32)
704
+ for i in range(orig.size):
705
+ gold[orig[i]] -= 1.0
706
+
707
+ self.assertTrue(np.all(ary == gold))
708
+
709
+ def test_atomic_sub_float_2(self):
710
+ ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
711
+ orig = ary.copy()
712
+ cuda_atomic_sub2 = cuda.jit('void(float32[:,:])')(atomic_sub_float_2)
713
+ cuda_atomic_sub2[1, (4, 8)](ary)
714
+ self.assertTrue(np.all(ary == orig - 1))
715
+
716
+ def test_atomic_sub_float_3(self):
717
+ ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
718
+ orig = ary.copy()
719
+ cuda_atomic_sub3 = cuda.jit('void(float32[:,:])')(atomic_sub_float_3)
720
+ cuda_atomic_sub3[1, (4, 8)](ary)
721
+ self.assertTrue(np.all(ary == orig - 1))
722
+
723
+ def test_atomic_sub_double(self):
724
+ idx = np.random.randint(0, 32, size=32, dtype=np.int64)
725
+ ary = np.zeros(32, np.float64)
726
+ cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_sub_double)
727
+ cuda_func[1, 32](idx, ary)
728
+
729
+ gold = np.zeros(32, dtype=np.float64)
730
+ for i in range(idx.size):
731
+ gold[idx[i]] -= 1.0
732
+
733
+ np.testing.assert_equal(ary, gold)
734
+
735
+ def test_atomic_sub_double_2(self):
736
+ ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
737
+ orig = ary.copy()
738
+ cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_2)
739
+ cuda_func[1, (4, 8)](ary)
740
+ np.testing.assert_equal(ary, orig - 1)
741
+
742
+ def test_atomic_sub_double_3(self):
743
+ ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
744
+ orig = ary.copy()
745
+ cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_3)
746
+ cuda_func[1, (4, 8)](ary)
747
+ np.testing.assert_equal(ary, orig - 1)
748
+
749
+ def test_atomic_sub_double_global(self):
750
+ idx = np.random.randint(0, 32, size=32, dtype=np.int64)
751
+ ary = np.zeros(32, np.float64)
752
+ sig = 'void(int64[:], float64[:])'
753
+ cuda_func = cuda.jit(sig)(atomic_sub_double_global)
754
+ cuda_func[1, 32](idx, ary)
755
+
756
+ gold = np.zeros(32, dtype=np.float64)
757
+ for i in range(idx.size):
758
+ gold[idx[i]] -= 1.0
759
+
760
+ np.testing.assert_equal(ary, gold)
761
+
762
+ def test_atomic_sub_double_global_2(self):
763
+ ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
764
+ orig = ary.copy()
765
+ cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_2)
766
+ cuda_func[1, (4, 8)](ary)
767
+ np.testing.assert_equal(ary, orig - 1)
768
+
769
+ def test_atomic_sub_double_global_3(self):
770
+ ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
771
+ orig = ary.copy()
772
+ cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_3)
773
+ cuda_func[1, (4, 8)](ary)
774
+ np.testing.assert_equal(ary, orig - 1)
775
+
776
+ def test_atomic_and(self):
777
+ rand_const = np.random.randint(500)
778
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32)
779
+ orig = ary.copy()
780
+ cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_and)
781
+ cuda_func[1, 32](ary, rand_const)
782
+
783
+ gold = ary.copy()
784
+ for i in range(orig.size):
785
+ gold[orig[i]] &= rand_const
786
+
787
+ self.assertTrue(np.all(ary == gold))
788
+
789
+ def test_atomic_and2(self):
790
+ rand_const = np.random.randint(500)
791
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
792
+ orig = ary.copy()
793
+ cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and2)
794
+ cuda_atomic_and2[1, (4, 8)](ary, rand_const)
795
+ self.assertTrue(np.all(ary == orig & rand_const))
796
+
797
+ def test_atomic_and3(self):
798
+ rand_const = np.random.randint(500)
799
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
800
+ orig = ary.copy()
801
+ cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and3)
802
+ cuda_atomic_and3[1, (4, 8)](ary, rand_const)
803
+ self.assertTrue(np.all(ary == orig & rand_const))
804
+
805
+ def test_atomic_and_global(self):
806
+ rand_const = np.random.randint(500)
807
+ idx = np.random.randint(0, 32, size=32, dtype=np.int32)
808
+ ary = np.random.randint(0, 32, size=32, dtype=np.int32)
809
+ sig = 'void(int32[:], int32[:], int32)'
810
+ cuda_func = cuda.jit(sig)(atomic_and_global)
811
+ cuda_func[1, 32](idx, ary, rand_const)
812
+
813
+ gold = ary.copy()
814
+ for i in range(idx.size):
815
+ gold[idx[i]] &= rand_const
816
+
817
+ np.testing.assert_equal(ary, gold)
818
+
819
+ def test_atomic_and_global_2(self):
820
+ rand_const = np.random.randint(500)
821
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
822
+ orig = ary.copy()
823
+ cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_and_global_2)
824
+ cuda_func[1, (4, 8)](ary, rand_const)
825
+ np.testing.assert_equal(ary, orig & rand_const)
826
+
827
+ def test_atomic_or(self):
828
+ rand_const = np.random.randint(500)
829
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32)
830
+ orig = ary.copy()
831
+ cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_or)
832
+ cuda_func[1, 32](ary, rand_const)
833
+
834
+ gold = np.zeros(32, dtype=np.uint32)
835
+ for i in range(orig.size):
836
+ gold[orig[i]] |= rand_const
837
+
838
+ self.assertTrue(np.all(ary == gold))
839
+
840
+ def test_atomic_or2(self):
841
+ rand_const = np.random.randint(500)
842
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
843
+ orig = ary.copy()
844
+ cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or2)
845
+ cuda_atomic_and2[1, (4, 8)](ary, rand_const)
846
+ self.assertTrue(np.all(ary == orig | rand_const))
847
+
848
+ def test_atomic_or3(self):
849
+ rand_const = np.random.randint(500)
850
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
851
+ orig = ary.copy()
852
+ cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or3)
853
+ cuda_atomic_and3[1, (4, 8)](ary, rand_const)
854
+ self.assertTrue(np.all(ary == orig | rand_const))
855
+
856
+ def test_atomic_or_global(self):
857
+ rand_const = np.random.randint(500)
858
+ idx = np.random.randint(0, 32, size=32, dtype=np.int32)
859
+ ary = np.random.randint(0, 32, size=32, dtype=np.int32)
860
+ sig = 'void(int32[:], int32[:], int32)'
861
+ cuda_func = cuda.jit(sig)(atomic_or_global)
862
+ cuda_func[1, 32](idx, ary, rand_const)
863
+
864
+ gold = ary.copy()
865
+ for i in range(idx.size):
866
+ gold[idx[i]] |= rand_const
867
+
868
+ np.testing.assert_equal(ary, gold)
869
+
870
+ def test_atomic_or_global_2(self):
871
+ rand_const = np.random.randint(500)
872
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
873
+ orig = ary.copy()
874
+ cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_or_global_2)
875
+ cuda_func[1, (4, 8)](ary, rand_const)
876
+ np.testing.assert_equal(ary, orig | rand_const)
877
+
878
+ def test_atomic_xor(self):
879
+ rand_const = np.random.randint(500)
880
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32)
881
+ orig = ary.copy()
882
+ cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_xor)
883
+ cuda_func[1, 32](ary, rand_const)
884
+
885
+ gold = np.zeros(32, dtype=np.uint32)
886
+ for i in range(orig.size):
887
+ gold[orig[i]] ^= rand_const
888
+
889
+ self.assertTrue(np.all(ary == gold))
890
+
891
+ def test_atomic_xor2(self):
892
+ rand_const = np.random.randint(500)
893
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
894
+ orig = ary.copy()
895
+ cuda_atomic_xor2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor2)
896
+ cuda_atomic_xor2[1, (4, 8)](ary, rand_const)
897
+ self.assertTrue(np.all(ary == orig ^ rand_const))
898
+
899
+ def test_atomic_xor3(self):
900
+ rand_const = np.random.randint(500)
901
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
902
+ orig = ary.copy()
903
+ cuda_atomic_xor3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor3)
904
+ cuda_atomic_xor3[1, (4, 8)](ary, rand_const)
905
+ self.assertTrue(np.all(ary == orig ^ rand_const))
906
+
907
+ def test_atomic_xor_global(self):
908
+ rand_const = np.random.randint(500)
909
+ idx = np.random.randint(0, 32, size=32, dtype=np.int32)
910
+ ary = np.random.randint(0, 32, size=32, dtype=np.int32)
911
+ gold = ary.copy()
912
+ sig = 'void(int32[:], int32[:], int32)'
913
+ cuda_func = cuda.jit(sig)(atomic_xor_global)
914
+ cuda_func[1, 32](idx, ary, rand_const)
915
+
916
+ for i in range(idx.size):
917
+ gold[idx[i]] ^= rand_const
918
+
919
+ np.testing.assert_equal(ary, gold)
920
+
921
+ def test_atomic_xor_global_2(self):
922
+ rand_const = np.random.randint(500)
923
+ ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
924
+ orig = ary.copy()
925
+ cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor_global_2)
926
+ cuda_func[1, (4, 8)](ary, rand_const)
927
+ np.testing.assert_equal(ary, orig ^ rand_const)
928
+
929
+ def inc_dec_1dim_setup(self, dtype):
930
+ rconst = np.random.randint(32, dtype=dtype)
931
+ rary = np.random.randint(0, 32, size=32).astype(dtype)
932
+ ary_idx = np.arange(32, dtype=dtype)
933
+ return rconst, rary, ary_idx
934
+
935
+ def inc_dec_2dim_setup(self, dtype):
936
+ rconst = np.random.randint(32, dtype=dtype)
937
+ rary = np.random.randint(0, 32, size=32).astype(dtype).reshape(4, 8)
938
+ return rconst, rary
939
+
940
+ def check_inc_index(self, ary, idx, rconst, sig, nblocks, blksize, func):
941
+ orig = ary.copy()
942
+ cuda_func = cuda.jit(sig)(func)
943
+ cuda_func[nblocks, blksize](ary, idx, rconst)
944
+ np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1))
945
+
946
+ def check_inc_index2(self, ary, idx, rconst, sig, nblocks, blksize, func):
947
+ orig = ary.copy()
948
+ cuda_func = cuda.jit(sig)(func)
949
+ cuda_func[nblocks, blksize](idx, ary, rconst)
950
+ np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1))
951
+
952
+ def check_inc(self, ary, rconst, sig, nblocks, blksize, func):
953
+ orig = ary.copy()
954
+ cuda_func = cuda.jit(sig)(func)
955
+ cuda_func[nblocks, blksize](ary, rconst)
956
+ np.testing.assert_equal(ary, np.where(orig >= rconst, 0, orig + 1))
957
+
958
+ def test_atomic_inc_32(self):
959
+ rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
960
+ sig = 'void(uint32[:], uint32[:], uint32)'
961
+ self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc32)
962
+
963
+ def test_atomic_inc_64(self):
964
+ rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
965
+ sig = 'void(uint64[:], uint64[:], uint64)'
966
+ self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc64)
967
+
968
+ def test_atomic_inc2_32(self):
969
+ rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
970
+ sig = 'void(uint32[:,:], uint32)'
971
+ self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_32)
972
+
973
+ def test_atomic_inc2_64(self):
974
+ rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
975
+ sig = 'void(uint64[:,:], uint64)'
976
+ self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_64)
977
+
978
+ def test_atomic_inc3(self):
979
+ rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
980
+ sig = 'void(uint32[:,:], uint32)'
981
+ self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc3)
982
+
983
+ def test_atomic_inc_global_32(self):
984
+ rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
985
+ sig = 'void(uint32[:], uint32[:], uint32)'
986
+ self.check_inc_index2(ary, idx, rand_const, sig, 1, 32,
987
+ atomic_inc_global)
988
+
989
+ def test_atomic_inc_global_64(self):
990
+ rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
991
+ sig = 'void(uint64[:], uint64[:], uint64)'
992
+ self.check_inc_index2(ary, idx, rand_const, sig, 1, 32,
993
+ atomic_inc_global)
994
+
995
+ def test_atomic_inc_global_2_32(self):
996
+ rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
997
+ sig = 'void(uint32[:,:], uint32)'
998
+ self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2)
999
+
1000
+ def test_atomic_inc_global_2_64(self):
1001
+ rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
1002
+ sig = 'void(uint64[:,:], uint64)'
1003
+ self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2)
1004
+
1005
+ def check_dec_index(self, ary, idx, rconst, sig, nblocks, blksize, func):
1006
+ orig = ary.copy()
1007
+ cuda_func = cuda.jit(sig)(func)
1008
+ cuda_func[nblocks, blksize](ary, idx, rconst)
1009
+ np.testing.assert_equal(ary, np.where(orig == 0, rconst,
1010
+ np.where(orig > rconst,
1011
+ rconst,
1012
+ orig - 1)))
1013
+
1014
+ def check_dec_index2(self, ary, idx, rconst, sig, nblocks, blksize, func):
1015
+ orig = ary.copy()
1016
+ cuda_func = cuda.jit(sig)(func)
1017
+ cuda_func[nblocks, blksize](idx, ary, rconst)
1018
+ np.testing.assert_equal(ary, np.where(orig == 0, rconst,
1019
+ np.where(orig > rconst,
1020
+ rconst,
1021
+ orig - 1)))
1022
+
1023
+ def check_dec(self, ary, rconst, sig, nblocks, blksize, func):
1024
+ orig = ary.copy()
1025
+ cuda_func = cuda.jit(sig)(func)
1026
+ cuda_func[nblocks, blksize](ary, rconst)
1027
+ np.testing.assert_equal(ary, np.where(orig == 0, rconst,
1028
+ np.where(orig > rconst,
1029
+ rconst,
1030
+ orig - 1)))
1031
+
1032
+ def test_atomic_dec_32(self):
1033
+ rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
1034
+ sig = 'void(uint32[:], uint32[:], uint32)'
1035
+ self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec32)
1036
+
1037
+ def test_atomic_dec_64(self):
1038
+ rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
1039
+ sig = 'void(uint64[:], uint64[:], uint64)'
1040
+ self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec64)
1041
+
1042
+ def test_atomic_dec2_32(self):
1043
+ rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
1044
+ sig = 'void(uint32[:,:], uint32)'
1045
+ self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_32)
1046
+
1047
+ def test_atomic_dec2_64(self):
1048
+ rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
1049
+ sig = 'void(uint64[:,:], uint64)'
1050
+ self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_64)
1051
+
1052
+ def test_atomic_dec3_new(self):
1053
+ rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
1054
+ sig = 'void(uint32[:,:], uint32)'
1055
+ self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec3)
1056
+
1057
+ def test_atomic_dec_global_32(self):
1058
+ rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32)
1059
+ sig = 'void(uint32[:], uint32[:], uint32)'
1060
+ self.check_dec_index2(ary, idx, rand_const, sig, 1, 32,
1061
+ atomic_dec_global)
1062
+
1063
+ def test_atomic_dec_global_64(self):
1064
+ rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64)
1065
+ sig = 'void(uint64[:], uint64[:], uint64)'
1066
+ self.check_dec_index2(ary, idx, rand_const, sig, 1, 32,
1067
+ atomic_dec_global)
1068
+
1069
+ def test_atomic_dec_global2_32(self):
1070
+ rand_const, ary = self.inc_dec_2dim_setup(np.uint32)
1071
+ sig = 'void(uint32[:,:], uint32)'
1072
+ self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2)
1073
+
1074
+ def test_atomic_dec_global2_64(self):
1075
+ rand_const, ary = self.inc_dec_2dim_setup(np.uint64)
1076
+ sig = 'void(uint64[:,:], uint64)'
1077
+ self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2)
1078
+
1079
+ def test_atomic_exch(self):
1080
+ rand_const = np.random.randint(50, 100, dtype=np.uint32)
1081
+        ary = np.random.randint(0, 32, size=32).astype(np.uint32)
+        idx = np.arange(32, dtype=np.uint32)
+
+        cuda_func = cuda.jit('void(uint32[:], uint32[:], uint32)')(atomic_exch)
+        cuda_func[1, 32](ary, idx, rand_const)
+
+        np.testing.assert_equal(ary, rand_const)
+
+    def test_atomic_exch2(self):
+        rand_const = np.random.randint(50, 100, dtype=np.uint32)
+        ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
+
+        cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_exch2)
+        cuda_func[1, (4, 8)](ary, rand_const)
+        np.testing.assert_equal(ary, rand_const)
+
+    def test_atomic_exch3(self):
+        rand_const = np.random.randint(50, 100, dtype=np.uint64)
+        ary = np.random.randint(0, 32, size=32).astype(np.uint64).reshape(4, 8)
+
+        cuda_func = cuda.jit('void(uint64[:,:], uint64)')(atomic_exch3)
+        cuda_func[1, (4, 8)](ary, rand_const)
+        np.testing.assert_equal(ary, rand_const)
+
+    def test_atomic_exch_global(self):
+        rand_const = np.random.randint(50, 100, dtype=np.uint32)
+        idx = np.arange(32, dtype=np.uint32)
+        ary = np.random.randint(0, 32, size=32, dtype=np.uint32)
+
+        sig = 'void(uint32[:], uint32[:], uint32)'
+        cuda_func = cuda.jit(sig)(atomic_exch_global)
+        cuda_func[1, 32](idx, ary, rand_const)
+        np.testing.assert_equal(ary, rand_const)
+
+    def check_atomic_max(self, dtype, lo, hi):
+        vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
+        res = np.zeros(1, dtype=vals.dtype)
+        cuda_func = cuda.jit(atomic_max)
+        cuda_func[32, 32](res, vals)
+        gold = np.max(vals)
+        np.testing.assert_equal(res, gold)
+
+    def test_atomic_max_int32(self):
+        self.check_atomic_max(dtype=np.int32, lo=-65535, hi=65535)
+
+    def test_atomic_max_uint32(self):
+        self.check_atomic_max(dtype=np.uint32, lo=0, hi=65535)
+
+    def test_atomic_max_int64(self):
+        self.check_atomic_max(dtype=np.int64, lo=-65535, hi=65535)
+
+    def test_atomic_max_uint64(self):
+        self.check_atomic_max(dtype=np.uint64, lo=0, hi=65535)
+
+    def test_atomic_max_float32(self):
+        self.check_atomic_max(dtype=np.float32, lo=-65535, hi=65535)
+
+    def test_atomic_max_double(self):
+        self.check_atomic_max(dtype=np.float64, lo=-65535, hi=65535)
+
+    def test_atomic_max_double_normalizedindex(self):
+        vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
+        res = np.zeros(1, np.float64)
+        cuda_func = cuda.jit('void(float64[:], float64[:,:])')(
+            atomic_max_double_normalizedindex)
+        cuda_func[32, 32](res, vals)
+
+        gold = np.max(vals)
+        np.testing.assert_equal(res, gold)
+
+    def test_atomic_max_double_oneindex(self):
+        vals = np.random.randint(0, 128, size=32).astype(np.float64)
+        res = np.zeros(1, np.float64)
+        cuda_func = cuda.jit('void(float64[:], float64[:])')(
+            atomic_max_double_oneindex)
+        cuda_func[1, 32](res, vals)
+
+        gold = np.max(vals)
+        np.testing.assert_equal(res, gold)
+
+    def check_atomic_min(self, dtype, lo, hi):
+        vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
+        res = np.array([65535], dtype=vals.dtype)
+        cuda_func = cuda.jit(atomic_min)
+        cuda_func[32, 32](res, vals)
+
+        gold = np.min(vals)
+        np.testing.assert_equal(res, gold)
+
+    def test_atomic_min_int32(self):
+        self.check_atomic_min(dtype=np.int32, lo=-65535, hi=65535)
+
+    def test_atomic_min_uint32(self):
+        self.check_atomic_min(dtype=np.uint32, lo=0, hi=65535)
+
+    def test_atomic_min_int64(self):
+        self.check_atomic_min(dtype=np.int64, lo=-65535, hi=65535)
+
+    def test_atomic_min_uint64(self):
+        self.check_atomic_min(dtype=np.uint64, lo=0, hi=65535)
+
+    def test_atomic_min_float(self):
+        self.check_atomic_min(dtype=np.float32, lo=-65535, hi=65535)
+
+    def test_atomic_min_double(self):
+        self.check_atomic_min(dtype=np.float64, lo=-65535, hi=65535)
+
+    def test_atomic_min_double_normalizedindex(self):
+        vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
+        res = np.ones(1, np.float64) * 65535
+        cuda_func = cuda.jit('void(float64[:], float64[:,:])')(
+            atomic_min_double_normalizedindex)
+        cuda_func[32, 32](res, vals)
+
+        gold = np.min(vals)
+        np.testing.assert_equal(res, gold)
+
+    def test_atomic_min_double_oneindex(self):
+        vals = np.random.randint(0, 128, size=32).astype(np.float64)
+        res = np.ones(1, np.float64) * 128
+        cuda_func = cuda.jit('void(float64[:], float64[:])')(
+            atomic_min_double_oneindex)
+        cuda_func[1, 32](res, vals)
+
+        gold = np.min(vals)
+        np.testing.assert_equal(res, gold)
+
+    # Taken together, _test_atomic_minmax_nan_location and
+    # _test_atomic_minmax_nan_val check that NaNs are treated similarly to the
+    # way they are in Python / NumPy - that is, {min,max}(a, b) == a if either
+    # a or b is a NaN. For the atomics, this means that the max is taken as the
+    # value stored in the memory location rather than the value supplied - i.e.
+    # for:
+    #
+    #    cuda.atomic.{min,max}(ary, idx, val)
+    #
+    # the result will be ary[idx] for either of ary[idx] or val being NaN.
+
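Note: the comment above describes the NaN rule these tests rely on: `cuda.atomic.min`/`cuda.atomic.max` leave the stored value untouched when either operand is a NaN. A minimal host-side Python sketch of that rule follows; it is illustrative only, not part of the diffed module, and the helper name `max_keeps_stored_on_nan` is invented for this note.

import math

def max_keeps_stored_on_nan(stored, val):
    # Any comparison involving a NaN is False, so the branch keeps the stored
    # value when either operand is NaN, mirroring the behaviour described in
    # the comment above for cuda.atomic.max.
    return val if val > stored else stored

assert max_keeps_stored_on_nan(10.0, 3.0) == 10.0
assert math.isnan(max_keeps_stored_on_nan(float('nan'), 3.0))
assert max_keeps_stored_on_nan(10.0, float('nan')) == 10.0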
+    def _test_atomic_minmax_nan_location(self, func):
+
+        cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func)
+
+        vals = np.random.randint(0, 128, size=(1,1)).astype(np.float64)
+        res = np.zeros(1, np.float64) + np.nan
+        cuda_func[1, 1](res, vals)
+        np.testing.assert_equal(res, [np.nan])
+
+    def _test_atomic_minmax_nan_val(self, func):
+        cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func)
+
+        res = np.random.randint(0, 128, size=1).astype(np.float64)
+        gold = res.copy()
+        vals = np.zeros((1, 1), np.float64) + np.nan
+        cuda_func[1, 1](res, vals)
+
+        np.testing.assert_equal(res, gold)
+
+    def test_atomic_min_nan_location(self):
+        self._test_atomic_minmax_nan_location(atomic_min)
+
+    def test_atomic_max_nan_location(self):
+        self._test_atomic_minmax_nan_location(atomic_max)
+
+    def test_atomic_min_nan_val(self):
+        self._test_atomic_minmax_nan_val(atomic_min)
+
+    def test_atomic_max_nan_val(self):
+        self._test_atomic_minmax_nan_val(atomic_max)
+
+    def test_atomic_max_double_shared(self):
+        vals = np.random.randint(0, 32, size=32).astype(np.float64)
+        res = np.zeros(1, np.float64)
+        sig = 'void(float64[:], float64[:])'
+        cuda_func = cuda.jit(sig)(atomic_max_double_shared)
+        cuda_func[1, 32](res, vals)
+
+        gold = np.max(vals)
+        np.testing.assert_equal(res, gold)
+
+    def test_atomic_min_double_shared(self):
+        vals = np.random.randint(0, 32, size=32).astype(np.float64)
+        res = np.ones(1, np.float64) * 32
+        sig = 'void(float64[:], float64[:])'
+        cuda_func = cuda.jit(sig)(atomic_min_double_shared)
+        cuda_func[1, 32](res, vals)
+
+        gold = np.min(vals)
+        np.testing.assert_equal(res, gold)
+
+    def check_cas(self, n, fill, unfill, dtype, cas_func, ndim=1):
+        res = [fill] * (n // 2) + [unfill] * (n // 2)
+        np.random.shuffle(res)
+        res = np.asarray(res, dtype=dtype)
+        if ndim == 2:
+            res.shape = (10, -1)
+        out = np.zeros_like(res)
+        ary = np.random.randint(1, 10, size=res.shape).astype(res.dtype)
+
+        fill_mask = res == fill
+        unfill_mask = res == unfill
+
+        expect_res = np.zeros_like(res)
+        expect_res[fill_mask] = ary[fill_mask]
+        expect_res[unfill_mask] = unfill
+
+        expect_out = res.copy()
+
+        cuda_func = cuda.jit(cas_func)
+        if ndim == 1:
+            cuda_func[10, 10](res, out, ary, fill)
+        else:
+            cuda_func[(10, 10), (10, 10)](res, out, ary, fill)
+
+        np.testing.assert_array_equal(expect_res, res)
+        np.testing.assert_array_equal(expect_out, out)
+
+    def test_atomic_compare_and_swap(self):
+        self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
+                       cas_func=atomic_compare_and_swap)
+
+    def test_atomic_compare_and_swap2(self):
+        self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
+                       cas_func=atomic_compare_and_swap)
+
+    def test_atomic_compare_and_swap3(self):
+        rfill = np.random.randint(50, 500, dtype=np.uint32)
+        runfill = np.random.randint(1, 25, dtype=np.uint32)
+        self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
+                       cas_func=atomic_compare_and_swap)
+
+    def test_atomic_compare_and_swap4(self):
+        rfill = np.random.randint(50, 500, dtype=np.uint64)
+        runfill = np.random.randint(1, 25, dtype=np.uint64)
+        self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
+                       cas_func=atomic_compare_and_swap)
+
+    def test_atomic_cas_1dim(self):
+        self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
+                       cas_func=atomic_cas_1dim)
+
+    def test_atomic_cas_2dim(self):
+        self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32,
+                       cas_func=atomic_cas_2dim, ndim=2)
+
+    def test_atomic_cas2_1dim(self):
+        self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
+                       cas_func=atomic_cas_1dim)
+
+    def test_atomic_cas2_2dim(self):
+        self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64,
+                       cas_func=atomic_cas_2dim, ndim=2)
+
+    def test_atomic_cas3_1dim(self):
+        rfill = np.random.randint(50, 500, dtype=np.uint32)
+        runfill = np.random.randint(1, 25, dtype=np.uint32)
+        self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
+                       cas_func=atomic_cas_1dim)
+
+    def test_atomic_cas3_2dim(self):
+        rfill = np.random.randint(50, 500, dtype=np.uint32)
+        runfill = np.random.randint(1, 25, dtype=np.uint32)
+        self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32,
+                       cas_func=atomic_cas_2dim, ndim=2)
+
+    def test_atomic_cas4_1dim(self):
+        rfill = np.random.randint(50, 500, dtype=np.uint64)
+        runfill = np.random.randint(1, 25, dtype=np.uint64)
+        self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
+                       cas_func=atomic_cas_1dim)
+
+    def test_atomic_cas4_2dim(self):
+        rfill = np.random.randint(50, 500, dtype=np.uint64)
+        runfill = np.random.randint(1, 25, dtype=np.uint64)
+        self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64,
+                       cas_func=atomic_cas_2dim, ndim=2)
+
+    # Tests that the atomic add, min, and max operations return the old value -
+    # in the simulator, they did not (see Issue #5458). The max and min have
+    # special handling for NaN values, so we explicitly test with a NaN in the
+    # array being modified and the value provided.
+
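Note: the contract exercised below is that an atomic operation performs a read-modify-write and hands back the value read before the update. A small host-side sketch of that pattern follows; it is purely illustrative and `emulate_atomic_add` is a name invented for this note, not an API of the package.

import numpy as np

def emulate_atomic_add(ary, idx, val):
    # Read the current value, write the updated one, and return what was
    # read -- the "old value" the device-side atomics are expected to return.
    old = ary[idx]
    ary[idx] = old + val
    return old

x = np.array([10.0], dtype=np.float32)
assert emulate_atomic_add(x, 0, 1) == 10.0 and x[0] == 11.0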
+    def _test_atomic_returns_old(self, kernel, initial):
+        x = np.zeros(2, dtype=np.float32)
+        x[0] = initial
+        kernel[1, 1](x)
+        if np.isnan(initial):
+            self.assertTrue(np.isnan(x[1]))
+        else:
+            self.assertEqual(x[1], initial)
+
+    def test_atomic_add_returns_old(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.add(x, 0, 1)
+
+        self._test_atomic_returns_old(kernel, 10)
+
+    def test_atomic_max_returns_no_replace(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.max(x, 0, 1)
+
+        self._test_atomic_returns_old(kernel, 10)
+
+    def test_atomic_max_returns_old_replace(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.max(x, 0, 10)
+
+        self._test_atomic_returns_old(kernel, 1)
+
+    def test_atomic_max_returns_old_nan_in_array(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.max(x, 0, 1)
+
+        self._test_atomic_returns_old(kernel, np.nan)
+
+    def test_atomic_max_returns_old_nan_val(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.max(x, 0, np.nan)
+
+        self._test_atomic_returns_old(kernel, 10)
+
+    def test_atomic_min_returns_old_no_replace(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.min(x, 0, 11)
+
+        self._test_atomic_returns_old(kernel, 10)
+
+    def test_atomic_min_returns_old_replace(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.min(x, 0, 10)
+
+        self._test_atomic_returns_old(kernel, 11)
+
+    def test_atomic_min_returns_old_nan_in_array(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.min(x, 0, 11)
+
+        self._test_atomic_returns_old(kernel, np.nan)
+
+    def test_atomic_min_returns_old_nan_val(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.min(x, 0, np.nan)
+
+        self._test_atomic_returns_old(kernel, 11)
+
+    # Tests for atomic nanmin/nanmax
+
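Note: the nanmin/nanmax variants tested below differ from plain min/max in that a NaN operand is ignored rather than preserved, which is why the expected values are computed with np.nanmin/np.nanmax. A tiny host-side illustration of the analogous NumPy behaviour follows (an assumption drawn from how the tests compute their expected results, not a statement of the device implementation).

import numpy as np

# np.maximum propagates NaN, while np.fmax ignores it; the nanmax tests below
# expect the NaN-ignoring flavour of behaviour from cuda.atomic.nanmax.
assert np.isnan(np.maximum(np.nan, 3.0))
assert np.fmax(np.nan, 3.0) == 3.0
assert np.fmax(3.0, np.nan) == 3.0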
+    # nanmax tests
+    def check_atomic_nanmax(self, dtype, lo, hi, init_val):
+        vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
+        vals[1::2] = init_val
+        res = np.zeros(1, dtype=vals.dtype)
+        cuda_func = cuda.jit(atomic_nanmax)
+        cuda_func[32, 32](res, vals)
+        gold = np.nanmax(vals)
+        np.testing.assert_equal(res, gold)
+
+    def test_atomic_nanmax_int32(self):
+        self.check_atomic_nanmax(dtype=np.int32, lo=-65535, hi=65535,
+                                 init_val=0)
+
+    def test_atomic_nanmax_uint32(self):
+        self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535,
+                                 init_val=0)
+
+    def test_atomic_nanmax_int64(self):
+        self.check_atomic_nanmax(dtype=np.int64, lo=-65535, hi=65535,
+                                 init_val=0)
+
+    def test_atomic_nanmax_uint64(self):
+        self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535,
+                                 init_val=0)
+
+    def test_atomic_nanmax_float32(self):
+        self.check_atomic_nanmax(dtype=np.float32, lo=-65535, hi=65535,
+                                 init_val=np.nan)
+
+    def test_atomic_nanmax_double(self):
+        self.check_atomic_nanmax(dtype=np.float64, lo=-65535, hi=65535,
+                                 init_val=np.nan)
+
+    def test_atomic_nanmax_double_shared(self):
+        vals = np.random.randint(0, 32, size=32).astype(np.float64)
+        vals[1::2] = np.nan
+        res = np.array([0], dtype=vals.dtype)
+        sig = 'void(float64[:], float64[:])'
+        cuda_func = cuda.jit(sig)(atomic_nanmax_double_shared)
+        cuda_func[1, 32](res, vals)
+
+        gold = np.nanmax(vals)
+        np.testing.assert_equal(res, gold)
+
+    def test_atomic_nanmax_double_oneindex(self):
+        vals = np.random.randint(0, 128, size=32).astype(np.float64)
+        vals[1::2] = np.nan
+        res = np.zeros(1, np.float64)
+        cuda_func = cuda.jit('void(float64[:], float64[:])')(
+            atomic_max_double_oneindex)
+        cuda_func[1, 32](res, vals)
+
+        gold = np.nanmax(vals)
+        np.testing.assert_equal(res, gold)
+
+    # nanmin tests
+    def check_atomic_nanmin(self, dtype, lo, hi, init_val):
+        vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
+        vals[1::2] = init_val
+        res = np.array([65535], dtype=vals.dtype)
+        cuda_func = cuda.jit(atomic_nanmin)
+        cuda_func[32, 32](res, vals)
+
+        gold = np.nanmin(vals)
+        np.testing.assert_equal(res, gold)
+
+    def test_atomic_nanmin_int32(self):
+        self.check_atomic_nanmin(dtype=np.int32, lo=-65535, hi=65535,
+                                 init_val=0)
+
+    def test_atomic_nanmin_uint32(self):
+        self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535,
+                                 init_val=0)
+
+    def test_atomic_nanmin_int64(self):
+        self.check_atomic_nanmin(dtype=np.int64, lo=-65535, hi=65535,
+                                 init_val=0)
+
+    def test_atomic_nanmin_uint64(self):
+        self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535,
+                                 init_val=0)
+
+    def test_atomic_nanmin_float(self):
+        self.check_atomic_nanmin(dtype=np.float32, lo=-65535, hi=65535,
+                                 init_val=np.nan)
+
+    def test_atomic_nanmin_double(self):
+        self.check_atomic_nanmin(dtype=np.float64, lo=-65535, hi=65535,
+                                 init_val=np.nan)
+
+    def test_atomic_nanmin_double_shared(self):
+        vals = np.random.randint(0, 32, size=32).astype(np.float64)
+        vals[1::2] = np.nan
+        res = np.array([32], dtype=vals.dtype)
+        sig = 'void(float64[:], float64[:])'
+        cuda_func = cuda.jit(sig)(atomic_nanmin_double_shared)
+        cuda_func[1, 32](res, vals)
+
+        gold = np.nanmin(vals)
+        np.testing.assert_equal(res, gold)
+
+    def test_atomic_nanmin_double_oneindex(self):
+        vals = np.random.randint(0, 128, size=32).astype(np.float64)
+        vals[1::2] = np.nan
+        res = np.array([128], np.float64)
+        cuda_func = cuda.jit('void(float64[:], float64[:])')(
+            atomic_min_double_oneindex)
+        cuda_func[1, 32](res, vals)
+
+        gold = np.nanmin(vals)
+        np.testing.assert_equal(res, gold)
+
+    # Returning old value tests
+
+    def _test_atomic_nan_returns_old(self, kernel, initial):
+        x = np.zeros(2, dtype=np.float32)
+        x[0] = initial
+        x[1] = np.nan
+        kernel[1, 1](x)
+        if np.isnan(initial):
+            self.assertFalse(np.isnan(x[0]))
+            self.assertTrue(np.isnan(x[1]))
+        else:
+            self.assertEqual(x[1], initial)
+
+    def test_atomic_nanmax_returns_old_no_replace(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.nanmax(x, 0, 1)
+
+        self._test_atomic_nan_returns_old(kernel, 10)
+
+    def test_atomic_nanmax_returns_old_replace(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.nanmax(x, 0, 10)
+
+        self._test_atomic_nan_returns_old(kernel, 1)
+
+    def test_atomic_nanmax_returns_old_nan_in_array(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.nanmax(x, 0, 1)
+
+        self._test_atomic_nan_returns_old(kernel, np.nan)
+
+    def test_atomic_nanmax_returns_old_nan_val(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.nanmax(x, 0, np.nan)
+
+        self._test_atomic_nan_returns_old(kernel, 10)
+
+    def test_atomic_nanmin_returns_old_no_replace(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.nanmin(x, 0, 11)
+
+        self._test_atomic_nan_returns_old(kernel, 10)
+
+    def test_atomic_nanmin_returns_old_replace(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.nanmin(x, 0, 10)
+
+        self._test_atomic_nan_returns_old(kernel, 11)
+
+    def test_atomic_nanmin_returns_old_nan_in_array(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.nanmin(x, 0, 11)
+
+        self._test_atomic_nan_returns_old(kernel, np.nan)
+
+    def test_atomic_nanmin_returns_old_nan_val(self):
+        @cuda.jit
+        def kernel(x):
+            x[1] = cuda.atomic.nanmin(x, 0, np.nan)
+
+        self._test_atomic_nan_returns_old(kernel, 11)
+
+
+if __name__ == '__main__':
+    unittest.main()