numba-cuda 0.22.0__cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (487)
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +580 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpython-313-aarch64-linux-gnu.so +0 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpython-313-aarch64-linux-gnu.so +0 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cpython-313-aarch64-linux-gnu.so +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpython-313-aarch64-linux-gnu.so +0 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cpython-313-aarch64-linux-gnu.so +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +543 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +983 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +997 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +155 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsics.py +531 -0
  161. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  162. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  163. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  164. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  165. numba_cuda/numba/cuda/libdevice.py +3386 -0
  166. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  167. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  168. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  169. numba_cuda/numba/cuda/locks.py +19 -0
  170. numba_cuda/numba/cuda/lowering.py +1980 -0
  171. numba_cuda/numba/cuda/mathimpl.py +374 -0
  172. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  173. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  175. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  178. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  179. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  180. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  181. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  182. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  183. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  184. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  185. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  186. numba_cuda/numba/cuda/misc/literal.py +28 -0
  187. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  188. numba_cuda/numba/cuda/misc/special.py +94 -0
  189. numba_cuda/numba/cuda/models.py +56 -0
  190. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  191. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  192. numba_cuda/numba/cuda/np/extensions.py +11 -0
  193. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  194. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  195. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  196. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  197. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  198. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  199. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  200. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  201. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  202. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  203. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  204. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  206. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  207. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  208. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  209. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  210. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  211. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  212. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  213. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  214. numba_cuda/numba/cuda/printimpl.py +126 -0
  215. numba_cuda/numba/cuda/random.py +308 -0
  216. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  217. numba_cuda/numba/cuda/serialize.py +267 -0
  218. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  219. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  220. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  221. numba_cuda/numba/cuda/simulator/api.py +179 -0
  222. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  223. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  224. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  236. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  237. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  238. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  239. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  241. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  242. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  243. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  244. numba_cuda/numba/cuda/simulator_init.py +18 -0
  245. numba_cuda/numba/cuda/stubs.py +624 -0
  246. numba_cuda/numba/cuda/target.py +505 -0
  247. numba_cuda/numba/cuda/testing.py +347 -0
  248. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  249. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  251. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  252. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  253. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  254. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  255. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  285. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  286. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  289. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  290. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  291. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  292. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  293. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  294. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  295. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
  396. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  397. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  399. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  400. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  401. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  402. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  403. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  404. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  406. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  407. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  424. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  425. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  430. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  431. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  433. numba_cuda/numba/cuda/tests/support.py +900 -0
  434. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  435. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  436. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  437. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  438. numba_cuda/numba/cuda/types/__init__.py +233 -0
  439. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  440. numba_cuda/numba/cuda/types/abstract.py +9 -0
  441. numba_cuda/numba/cuda/types/common.py +9 -0
  442. numba_cuda/numba/cuda/types/containers.py +9 -0
  443. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  444. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  445. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  446. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  447. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  448. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  449. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  450. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  451. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  452. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  453. numba_cuda/numba/cuda/types/function_type.py +11 -0
  454. numba_cuda/numba/cuda/types/functions.py +9 -0
  455. numba_cuda/numba/cuda/types/iterators.py +9 -0
  456. numba_cuda/numba/cuda/types/misc.py +9 -0
  457. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  458. numba_cuda/numba/cuda/types/scalars.py +9 -0
  459. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  460. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  461. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  462. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  463. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  464. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  465. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  466. numba_cuda/numba/cuda/typing/collections.py +138 -0
  467. numba_cuda/numba/cuda/typing/context.py +782 -0
  468. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  469. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  470. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  471. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  472. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  473. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  474. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  475. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  476. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  477. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  478. numba_cuda/numba/cuda/ufuncs.py +746 -0
  479. numba_cuda/numba/cuda/utils.py +724 -0
  480. numba_cuda/numba/cuda/vector_types.py +214 -0
  481. numba_cuda/numba/cuda/vectorizers.py +260 -0
  482. numba_cuda-0.22.0.dist-info/METADATA +109 -0
  483. numba_cuda-0.22.0.dist-info/RECORD +487 -0
  484. numba_cuda-0.22.0.dist-info/WHEEL +6 -0
  485. numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
  486. numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
  487. numba_cuda-0.22.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1815 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ import numpy as np
5
+ from textwrap import dedent
6
+
7
+ from numba import cuda
8
+ from numba.cuda import uint32, uint64, float32, float64, int32
9
+ from numba.cuda.testing import unittest, CUDATestCase, cc_X_or_above
10
+ from numba.cuda.core import config
11
+
12
+
13
@cuda.jit(device=True)
def atomic_cast_to_uint64(num):
    """Device helper: return *num* converted with ``uint64``."""
    converted = uint64(num)
    return converted
16
+
17
+
18
@cuda.jit(device=True)
def atomic_cast_to_int(num):
    """Device helper: return *num* converted with ``int``."""
    converted = int(num)
    return converted
21
+
22
+
23
@cuda.jit(device=True)
def atomic_cast_none(num):
    """Device helper: identity "cast" that hands *num* back unchanged."""
    return num
26
+
27
+
28
@cuda.jit(device=True)
def atomic_binary_1dim_shared(
    ary, idx, op2, ary_dtype, ary_nelements,
    binop_func, cast_func, initializer, neg_idx,
):
    """Run an atomic binary op on a 1D shared array, then copy it to *ary*.

    Every thread stores ``initializer`` into its own shared slot, then calls
    ``binop_func(shared, slot, op2)`` where ``slot`` is
    ``cast_func(idx[tid] % ary_nelements)``.  When ``neg_idx`` is truthy the
    slot is reduced modulo ``ary_nelements`` once more.  Afterwards each
    thread writes its own shared slot back into ``ary``.
    """
    lane = cuda.threadIdx.x
    scratch = cuda.shared.array(ary_nelements, ary_dtype)
    scratch[lane] = initializer
    # Every slot must be initialised before any thread updates one.
    cuda.syncthreads()
    slot = cast_func(idx[lane] % ary_nelements)
    if neg_idx:
        slot = slot % ary_nelements
    binop_func(scratch, slot, op2)
    # Wait for all atomic updates before reading the results back.
    cuda.syncthreads()
    ary[lane] = scratch[lane]
50
+
51
+
52
@cuda.jit(device=True)
def atomic_binary_1dim_shared2(
    ary,
    idx,
    op2,
    ary_dtype,
    ary_nelements,
    binop_func,
    cast_func,
):
    """Run an atomic binary op on a shared copy of *ary*, then copy it back.

    Unlike ``atomic_binary_1dim_shared`` the shared array is seeded from
    ``ary`` itself rather than from a constant initializer.
    """
    lane = cuda.threadIdx.x
    scratch = cuda.shared.array(ary_nelements, ary_dtype)
    scratch[lane] = ary[lane]
    # The shared copy must be complete before any atomic update.
    cuda.syncthreads()
    slot = cast_func(idx[lane] % ary_nelements)
    binop_func(scratch, slot, op2)
    # Wait for all atomic updates before reading the results back.
    cuda.syncthreads()
    ary[lane] = scratch[lane]
64
+
65
+
66
@cuda.jit(device=True)
def atomic_binary_2dim_shared(
    ary,
    op2,
    ary_dtype,
    ary_shape,
    binop_func,
    y_cast_func,
    neg_idx,
):
    """Run an atomic binary op on a 2D shared copy of *ary*, then copy back.

    Each thread targets the element at ``(tx, y_cast_func(ty))``.  When
    ``neg_idx`` is truthy both coordinates are reduced modulo the
    corresponding extent of ``ary_shape`` first.
    """
    ix = cuda.threadIdx.x
    iy = cuda.threadIdx.y
    scratch = cuda.shared.array(ary_shape, ary_dtype)
    scratch[ix, iy] = ary[ix, iy]
    # The shared copy must be complete before any atomic update.
    cuda.syncthreads()
    target = (ix, y_cast_func(iy))
    if neg_idx:
        target = (target[0] % ary_shape[0], target[1] % ary_shape[1])
    binop_func(scratch, target, op2)
    # Wait for all atomic updates before reading the results back.
    cuda.syncthreads()
    ary[ix, iy] = scratch[ix, iy]
81
+
82
+
83
@cuda.jit(device=True)
def atomic_binary_2dim_global(ary, op2, binop_func, y_cast_func, neg_idx):
    """Run an atomic binary op directly on the 2D array *ary*.

    Each thread targets ``(tx, y_cast_func(ty))``; when ``neg_idx`` is truthy
    both coordinates are first reduced modulo the matching ``ary.shape``
    extent.
    """
    ix = cuda.threadIdx.x
    iy = cuda.threadIdx.y
    target = (ix, y_cast_func(iy))
    if neg_idx:
        target = (target[0] % ary.shape[0], target[1] % ary.shape[1])
    binop_func(ary, target, op2)
91
+
92
+
93
@cuda.jit(device=True)
def atomic_binary_1dim_global(ary, idx, ary_nelements, op2, binop_func, neg_idx):
    """Run an atomic binary op directly on the 1D array *ary*.

    The target slot is ``int(idx[tid] % ary_nelements)``; when ``neg_idx`` is
    truthy it is reduced modulo ``ary_nelements`` once more.
    """
    lane = cuda.threadIdx.x
    slot = int(idx[lane] % ary_nelements)
    if neg_idx:
        slot = slot % ary_nelements
    binop_func(ary, slot, op2)
102
+
103
+
104
def atomic_add(ary):
    """Kernel body: shared uint32 atomic add of 1, indexed by *ary*'s values."""
    atomic_binary_1dim_shared(
        ary,
        ary,
        1,
        uint32,
        32,
        cuda.atomic.add,
        atomic_cast_none,
        0,
        False,
    )
108
+
109
+
110
def atomic_add_wrap(ary):
    """Kernel body: like ``atomic_add`` but with the extra index wrap on."""
    atomic_binary_1dim_shared(
        ary,
        ary,
        1,
        uint32,
        32,
        cuda.atomic.add,
        atomic_cast_none,
        0,
        True,
    )
114
+
115
+
116
def atomic_add2(ary):
    """Kernel body: 2D shared uint32 atomic add of 1 on a (4, 8) tile."""
    atomic_binary_2dim_shared(
        ary,
        1,
        uint32,
        (4, 8),
        cuda.atomic.add,
        atomic_cast_none,
        False,
    )
120
+
121
+
122
def atomic_add2_wrap(ary):
    """Kernel body: like ``atomic_add2`` but with index wrapping enabled."""
    atomic_binary_2dim_shared(
        ary,
        1,
        uint32,
        (4, 8),
        cuda.atomic.add,
        atomic_cast_none,
        True,
    )
126
+
127
+
128
def atomic_add3(ary):
    """Kernel body: 2D shared uint32 atomic add with a uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        1,
        uint32,
        (4, 8),
        cuda.atomic.add,
        atomic_cast_to_uint64,
        False,
    )
132
+
133
+
134
def atomic_add_float(ary):
    """Kernel body: shared float32 atomic add of 1.0, index cast via int."""
    atomic_binary_1dim_shared(
        ary, ary, 1.0, float32, 32,
        cuda.atomic.add, atomic_cast_to_int, 0.0, False,
    )
146
+
147
+
148
def atomic_add_float_wrap(ary):
    """Kernel body: like ``atomic_add_float`` but with index wrapping on."""
    atomic_binary_1dim_shared(
        ary, ary, 1.0, float32, 32,
        cuda.atomic.add, atomic_cast_to_int, 0.0, True,
    )
160
+
161
+
162
def atomic_add_float_2(ary):
    """Kernel body: 2D shared float32 atomic add of 1.0 on a (4, 8) tile."""
    atomic_binary_2dim_shared(
        ary,
        1.0,
        float32,
        (4, 8),
        cuda.atomic.add,
        atomic_cast_none,
        False,
    )
166
+
167
+
168
def atomic_add_float_2_wrap(ary):
    """Kernel body: like ``atomic_add_float_2`` with index wrapping on."""
    atomic_binary_2dim_shared(
        ary,
        1.0,
        float32,
        (4, 8),
        cuda.atomic.add,
        atomic_cast_none,
        True,
    )
172
+
173
+
174
def atomic_add_float_3(ary):
    """Kernel body: 2D shared float32 atomic add with a uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        1.0,
        float32,
        (4, 8),
        cuda.atomic.add,
        atomic_cast_to_uint64,
        False,
    )
178
+
179
+
180
def atomic_add_double_global(idx, ary):
    """Kernel body: atomic add of 1.0 straight into *ary* at ``idx`` slots."""
    atomic_binary_1dim_global(
        ary,
        idx,
        32,
        1.0,
        cuda.atomic.add,
        False,
    )
182
+
183
+
184
def atomic_add_double_global_wrap(idx, ary):
    """Kernel body: like ``atomic_add_double_global`` with index wrapping."""
    atomic_binary_1dim_global(
        ary,
        idx,
        32,
        1.0,
        cuda.atomic.add,
        True,
    )
186
+
187
+
188
def atomic_add_double_global_2(ary):
    """Kernel body: 2D atomic add of 1 performed directly on *ary*."""
    atomic_binary_2dim_global(
        ary,
        1,
        cuda.atomic.add,
        atomic_cast_none,
        False,
    )
190
+
191
+
192
def atomic_add_double_global_2_wrap(ary):
    """Kernel body: like ``atomic_add_double_global_2`` with index wrapping."""
    atomic_binary_2dim_global(
        ary,
        1,
        cuda.atomic.add,
        atomic_cast_none,
        True,
    )
194
+
195
+
196
def atomic_add_double_global_3(ary):
    """Kernel body: 2D atomic add on *ary* with a uint64-cast y index."""
    atomic_binary_2dim_global(
        ary,
        1,
        cuda.atomic.add,
        atomic_cast_to_uint64,
        False,
    )
200
+
201
+
202
def atomic_add_double(idx, ary):
    """Kernel body: shared float64 atomic add of 1.0 at ``idx`` slots."""
    atomic_binary_1dim_shared(
        ary, idx, 1.0, float64, 32,
        cuda.atomic.add, atomic_cast_none, 0.0, False,
    )
214
+
215
+
216
def atomic_add_double_wrap(idx, ary):
    """Kernel body: like ``atomic_add_double`` with index wrapping on."""
    atomic_binary_1dim_shared(
        ary,
        idx,
        1.0,
        float64,
        32,
        cuda.atomic.add,
        atomic_cast_none,
        0.0,
        True,
    )
220
+
221
+
222
def atomic_add_double_2(ary):
    """Kernel body: 2D shared float64 atomic add of 1.0 on a (4, 8) tile."""
    atomic_binary_2dim_shared(
        ary,
        1.0,
        float64,
        (4, 8),
        cuda.atomic.add,
        atomic_cast_none,
        False,
    )
226
+
227
+
228
def atomic_add_double_2_wrap(ary):
    """Kernel body: like ``atomic_add_double_2`` with index wrapping on."""
    atomic_binary_2dim_shared(
        ary,
        1.0,
        float64,
        (4, 8),
        cuda.atomic.add,
        atomic_cast_none,
        True,
    )
232
+
233
+
234
def atomic_add_double_3(ary):
    """Kernel body: 2D shared float64 atomic add with a uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        1.0,
        float64,
        (4, 8),
        cuda.atomic.add,
        atomic_cast_to_uint64,
        False,
    )
238
+
239
+
240
def atomic_sub(ary):
    """Kernel body: shared int32 atomic subtract of 1, indexed by *ary*."""
    atomic_binary_1dim_shared(
        ary,
        ary,
        1,
        int32,
        32,
        cuda.atomic.sub,
        atomic_cast_none,
        0,
        False,
    )
244
+
245
+
246
def atomic_sub2(ary):
    """Kernel body: 2D shared int32 atomic subtract of 1 on a (4, 8) tile."""
    atomic_binary_2dim_shared(
        ary,
        1,
        int32,
        (4, 8),
        cuda.atomic.sub,
        atomic_cast_none,
        False,
    )
250
+
251
+
252
def atomic_sub3(ary):
    """Kernel body: 2D shared int32 atomic subtract, uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        1,
        int32,
        (4, 8),
        cuda.atomic.sub,
        atomic_cast_to_uint64,
        False,
    )
256
+
257
+
258
def atomic_sub_float(ary):
    """Kernel body: shared float32 atomic subtract of 1.0, int-cast index."""
    atomic_binary_1dim_shared(
        ary, ary, 1.0, float32, 32,
        cuda.atomic.sub, atomic_cast_to_int, 0.0, False,
    )
270
+
271
+
272
def atomic_sub_float_2(ary):
    """Kernel body: 2D shared float32 atomic subtract of 1.0 on (4, 8)."""
    atomic_binary_2dim_shared(
        ary,
        1.0,
        float32,
        (4, 8),
        cuda.atomic.sub,
        atomic_cast_none,
        False,
    )
276
+
277
+
278
def atomic_sub_float_3(ary):
    """Kernel body: 2D shared float32 atomic subtract, uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        1.0,
        float32,
        (4, 8),
        cuda.atomic.sub,
        atomic_cast_to_uint64,
        False,
    )
282
+
283
+
284
def atomic_sub_double(idx, ary):
    """Kernel body: shared float64 atomic subtract of 1.0 at ``idx`` slots."""
    atomic_binary_1dim_shared(
        ary, idx, 1.0, float64, 32,
        cuda.atomic.sub, atomic_cast_none, 0.0, False,
    )
296
+
297
+
298
def atomic_sub_double_2(ary):
    """Kernel body: 2D shared float64 atomic subtract of 1.0 on (4, 8)."""
    atomic_binary_2dim_shared(
        ary,
        1.0,
        float64,
        (4, 8),
        cuda.atomic.sub,
        atomic_cast_none,
        False,
    )
302
+
303
+
304
def atomic_sub_double_3(ary):
    """Kernel body: 2D shared float64 atomic subtract, uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        1.0,
        float64,
        (4, 8),
        cuda.atomic.sub,
        atomic_cast_to_uint64,
        False,
    )
308
+
309
+
310
def atomic_sub_double_global(idx, ary):
    """Kernel body: atomic subtract of 1.0 straight from *ary* at ``idx``."""
    atomic_binary_1dim_global(
        ary,
        idx,
        32,
        1.0,
        cuda.atomic.sub,
        False,
    )
312
+
313
+
314
def atomic_sub_double_global_2(ary):
    """Kernel body: 2D atomic subtract of 1.0 performed directly on *ary*."""
    atomic_binary_2dim_global(
        ary,
        1.0,
        cuda.atomic.sub,
        atomic_cast_none,
        False,
    )
318
+
319
+
320
def atomic_sub_double_global_3(ary):
    """Kernel body: 2D atomic subtract on *ary* with a uint64-cast y index.

    This is the global-memory counterpart of ``atomic_sub_double_3``.  It
    operates on ``ary`` directly (mirroring ``atomic_add_double_global_3``)
    instead of going through a shared-memory staging array, so that the
    "global" test really exercises the global code path.
    """
    atomic_binary_2dim_global(
        ary, 1.0, cuda.atomic.sub, atomic_cast_to_uint64, False
    )
324
+
325
+
326
def atomic_and(ary, op2):
    """Kernel body: shared uint32 atomic AND with *op2*; slots start at 1."""
    atomic_binary_1dim_shared(
        ary,
        ary,
        op2,
        uint32,
        32,
        cuda.atomic.and_,
        atomic_cast_none,
        1,
        False,
    )
330
+
331
+
332
def atomic_and2(ary, op2):
    """Kernel body: 2D shared uint32 atomic AND with *op2* on (4, 8)."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.and_,
        atomic_cast_none,
        False,
    )
336
+
337
+
338
def atomic_and3(ary, op2):
    """Kernel body: 2D shared uint32 atomic AND, uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.and_,
        atomic_cast_to_uint64,
        False,
    )
342
+
343
+
344
def atomic_and_global(idx, ary, op2):
    """Kernel body: atomic AND with *op2* directly on *ary* at ``idx``."""
    atomic_binary_1dim_global(
        ary,
        idx,
        32,
        op2,
        cuda.atomic.and_,
        False,
    )
346
+
347
+
348
def atomic_and_global_2(ary, op2):
    """Kernel body: 2D atomic AND with *op2* performed directly on *ary*."""
    atomic_binary_2dim_global(
        ary,
        op2,
        cuda.atomic.and_,
        atomic_cast_none,
        False,
    )
352
+
353
+
354
def atomic_or(ary, op2):
    """Kernel body: shared uint32 atomic OR with *op2*; slots start at 0."""
    atomic_binary_1dim_shared(
        ary,
        ary,
        op2,
        uint32,
        32,
        cuda.atomic.or_,
        atomic_cast_none,
        0,
        False,
    )
358
+
359
+
360
def atomic_or2(ary, op2):
    """Kernel body: 2D shared uint32 atomic OR with *op2* on (4, 8)."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.or_,
        atomic_cast_none,
        False,
    )
364
+
365
+
366
def atomic_or3(ary, op2):
    """Kernel body: 2D shared uint32 atomic OR, uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.or_,
        atomic_cast_to_uint64,
        False,
    )
370
+
371
+
372
def atomic_or_global(idx, ary, op2):
    """Kernel body: atomic OR with *op2* directly on *ary* at ``idx``."""
    atomic_binary_1dim_global(
        ary,
        idx,
        32,
        op2,
        cuda.atomic.or_,
        False,
    )
374
+
375
+
376
def atomic_or_global_2(ary, op2):
    """Kernel body: 2D atomic OR with *op2* performed directly on *ary*."""
    atomic_binary_2dim_global(
        ary,
        op2,
        cuda.atomic.or_,
        atomic_cast_none,
        False,
    )
380
+
381
+
382
def atomic_xor(ary, op2):
    """Kernel body: shared uint32 atomic XOR with *op2*; slots start at 0."""
    atomic_binary_1dim_shared(
        ary,
        ary,
        op2,
        uint32,
        32,
        cuda.atomic.xor,
        atomic_cast_none,
        0,
        False,
    )
386
+
387
+
388
def atomic_xor2(ary, op2):
    """Kernel body: 2D shared uint32 atomic XOR with *op2* on (4, 8)."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.xor,
        atomic_cast_none,
        False,
    )
392
+
393
+
394
def atomic_xor3(ary, op2):
    """Kernel body: 2D shared uint32 atomic XOR, uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.xor,
        atomic_cast_to_uint64,
        False,
    )
398
+
399
+
400
def atomic_xor_global(idx, ary, op2):
    """Kernel body: atomic XOR with *op2* directly on *ary* at ``idx``."""
    atomic_binary_1dim_global(
        ary,
        idx,
        32,
        op2,
        cuda.atomic.xor,
        False,
    )
402
+
403
+
404
def atomic_xor_global_2(ary, op2):
    """Kernel body: 2D atomic XOR with *op2* performed directly on *ary*."""
    atomic_binary_2dim_global(
        ary,
        op2,
        cuda.atomic.xor,
        atomic_cast_none,
        False,
    )
408
+
409
+
410
def atomic_inc32(ary, idx, op2):
    """Kernel body: atomic inc on a uint32 shared copy of *ary*."""
    atomic_binary_1dim_shared2(
        ary,
        idx,
        op2,
        uint32,
        32,
        cuda.atomic.inc,
        atomic_cast_none,
    )
414
+
415
+
416
def atomic_inc64(ary, idx, op2):
    """Kernel body: atomic inc on a uint64 shared copy, int-cast index."""
    atomic_binary_1dim_shared2(
        ary,
        idx,
        op2,
        uint64,
        32,
        cuda.atomic.inc,
        atomic_cast_to_int,
    )
420
+
421
+
422
def atomic_inc2_32(ary, op2):
    """Kernel body: 2D atomic inc on a uint32 shared (4, 8) tile."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.inc,
        atomic_cast_none,
        False,
    )
426
+
427
+
428
def atomic_inc2_64(ary, op2):
    """Kernel body: 2D atomic inc on a uint64 shared (4, 8) tile."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint64,
        (4, 8),
        cuda.atomic.inc,
        atomic_cast_none,
        False,
    )
432
+
433
+
434
def atomic_inc3(ary, op2):
    """Kernel body: 2D shared uint32 atomic inc, uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.inc,
        atomic_cast_to_uint64,
        False,
    )
438
+
439
+
440
def atomic_inc_global(idx, ary, op2):
    """Kernel body: atomic inc performed directly on *ary* at ``idx``."""
    atomic_binary_1dim_global(
        ary,
        idx,
        32,
        op2,
        cuda.atomic.inc,
        False,
    )
442
+
443
+
444
def atomic_inc_global_2(ary, op2):
    """Kernel body: 2D atomic inc performed directly on *ary*."""
    atomic_binary_2dim_global(
        ary,
        op2,
        cuda.atomic.inc,
        atomic_cast_none,
        False,
    )
448
+
449
+
450
def atomic_dec32(ary, idx, op2):
    """Kernel body: atomic dec on a uint32 shared copy of *ary*."""
    atomic_binary_1dim_shared2(
        ary,
        idx,
        op2,
        uint32,
        32,
        cuda.atomic.dec,
        atomic_cast_none,
    )
454
+
455
+
456
def atomic_dec64(ary, idx, op2):
    """Kernel body: atomic dec on a uint64 shared copy, int-cast index."""
    atomic_binary_1dim_shared2(
        ary,
        idx,
        op2,
        uint64,
        32,
        cuda.atomic.dec,
        atomic_cast_to_int,
    )
460
+
461
+
462
def atomic_dec2_32(ary, op2):
    """Kernel body: 2D atomic dec on a uint32 shared (4, 8) tile."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.dec,
        atomic_cast_none,
        False,
    )
466
+
467
+
468
def atomic_dec2_64(ary, op2):
    """Kernel body: 2D atomic dec on a uint64 shared (4, 8) tile."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint64,
        (4, 8),
        cuda.atomic.dec,
        atomic_cast_none,
        False,
    )
472
+
473
+
474
def atomic_dec3(ary, op2):
    """Kernel body: 2D shared uint32 atomic dec, uint64-cast y index."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.dec,
        atomic_cast_to_uint64,
        False,
    )
478
+
479
+
480
def atomic_dec_global(idx, ary, op2):
    """Kernel body: atomic dec performed directly on *ary* at ``idx``."""
    atomic_binary_1dim_global(
        ary,
        idx,
        32,
        op2,
        cuda.atomic.dec,
        False,
    )
482
+
483
+
484
def atomic_dec_global_2(ary, op2):
    """Kernel body: 2D atomic dec performed directly on *ary*."""
    atomic_binary_2dim_global(
        ary,
        op2,
        cuda.atomic.dec,
        atomic_cast_none,
        False,
    )
488
+
489
+
490
def atomic_exch(ary, idx, op2):
    """Kernel body: atomic exchange on a uint32 shared copy of *ary*."""
    atomic_binary_1dim_shared2(
        ary,
        idx,
        op2,
        uint32,
        32,
        cuda.atomic.exch,
        atomic_cast_none,
    )
494
+
495
+
496
def atomic_exch2(ary, op2):
    """Kernel body: 2D atomic exchange on a uint32 shared (4, 8) tile."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint32,
        (4, 8),
        cuda.atomic.exch,
        atomic_cast_none,
        False,
    )
500
+
501
+
502
def atomic_exch3(ary, op2):
    """Kernel body: 2D atomic exchange on a uint64 shared (4, 8) tile."""
    atomic_binary_2dim_shared(
        ary,
        op2,
        uint64,
        (4, 8),
        cuda.atomic.exch,
        atomic_cast_none,
        False,
    )
506
+
507
+
508
def atomic_exch_global(idx, ary, op2):
    """Kernel body: atomic exchange performed directly on *ary* at ``idx``."""
    atomic_binary_1dim_global(
        ary,
        idx,
        32,
        op2,
        cuda.atomic.exch,
        False,
    )
510
+
511
+
512
def gen_atomic_extreme_funcs(func):
    """Generate four kernel bodies that call the atomic op named by *func*.

    *func* is the dotted source text of the operation (for example
    ``"cuda.atomic.max"``); it is substituted into a source template which is
    then executed with ``cuda``, ``float64`` and ``uint64`` as its globals.

    Returns a 4-tuple of plain Python functions, in this order: ``atomic``,
    ``atomic_double_normalizedindex``, ``atomic_double_oneindex`` and
    ``atomic_double_shared``.
    """
    template = dedent("""
    def atomic(res, ary):
        tx = cuda.threadIdx.x
        bx = cuda.blockIdx.x
        {func}(res, 0, ary[tx, bx])

    def atomic_double_normalizedindex(res, ary):
        tx = cuda.threadIdx.x
        bx = cuda.blockIdx.x
        {func}(res, 0, ary[tx, uint64(bx)])

    def atomic_double_oneindex(res, ary):
        tx = cuda.threadIdx.x
        {func}(res, 0, ary[tx])

    def atomic_double_shared(res, ary):
        tid = cuda.threadIdx.x
        smary = cuda.shared.array(32, float64)
        smary[tid] = ary[tid]
        smres = cuda.shared.array(1, float64)
        if tid == 0:
            smres[0] = res[0]
        cuda.syncthreads()
        {func}(smres, 0, smary[tid])
        cuda.syncthreads()
        if tid == 0:
            res[0] = smres[0]
    """)
    source = template.format(func=func)

    # The generated functions are collected from the exec() locals mapping.
    namespace = {}
    exec(source, {"cuda": cuda, "float64": float64, "uint64": uint64}, namespace)

    generated_names = (
        "atomic",
        "atomic_double_normalizedindex",
        "atomic_double_oneindex",
        "atomic_double_shared",
    )
    return tuple(namespace[name] for name in generated_names)
549
+
550
+
551
# Instantiate the four generated kernel bodies for each extreme operation.
(atomic_max, atomic_max_double_normalizedindex,
 atomic_max_double_oneindex, atomic_max_double_shared) = (
    gen_atomic_extreme_funcs("cuda.atomic.max")
)
(atomic_min, atomic_min_double_normalizedindex,
 atomic_min_double_oneindex, atomic_min_double_shared) = (
    gen_atomic_extreme_funcs("cuda.atomic.min")
)
(atomic_nanmax, atomic_nanmax_double_normalizedindex,
 atomic_nanmax_double_oneindex, atomic_nanmax_double_shared) = (
    gen_atomic_extreme_funcs("cuda.atomic.nanmax")
)
(atomic_nanmin, atomic_nanmin_double_normalizedindex,
 atomic_nanmin_double_oneindex, atomic_nanmin_double_shared) = (
    gen_atomic_extreme_funcs("cuda.atomic.nanmin")
)
575
+
576
+
577
def atomic_compare_and_swap(res, old, ary, fill_val):
    """Kernel body: compare-and-swap on the slice of *res* starting at ``gid``.

    The value returned by ``cuda.atomic.compare_and_swap`` is stored in
    ``old[gid]``.  Threads whose index is not below ``res.size`` do nothing.
    """
    gid = cuda.grid(1)
    if gid >= res.size:
        return
    old[gid] = cuda.atomic.compare_and_swap(res[gid:], fill_val, ary[gid])
581
+
582
+
583
def atomic_cas_1dim(res, old, ary, fill_val):
    """Kernel body: indexed ``cuda.atomic.cas`` on the 1D array *res*.

    The value returned by the CAS is stored in ``old[gid]``.  Threads whose
    index is not below ``res.size`` do nothing.
    """
    gid = cuda.grid(1)
    if gid >= res.size:
        return
    old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])
587
+
588
+
589
def atomic_cas_2dim(res, old, ary, fill_val):
    """Kernel body: indexed ``cuda.atomic.cas`` on the 2D array *res*.

    The value returned by the CAS is stored in ``old[gid]``.  Threads whose
    2D grid position falls outside ``res.shape`` do nothing.
    """
    gid = cuda.grid(2)
    if gid[0] >= res.shape[0] or gid[1] >= res.shape[1]:
        return
    old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])
593
+
594
+
595
+ class TestCudaAtomics(CUDATestCase):
596
def setUp(self):
    """Run the base-class setup, then seed NumPy's global RNG with 0."""
    super().setUp()
    # Fixed seed keeps the randomly generated test inputs reproducible.
    np.random.seed(0)
599
+
600
def test_atomic_add(self):
    """Shared uint32 atomic add builds a histogram, with and without wrap."""
    data = np.random.randint(0, 32, size=32).astype(np.uint32)
    data_wrap = data.copy()
    indices = data.copy()

    kernel = cuda.jit("void(uint32[:])")(atomic_add)
    kernel[1, 32](data)

    kernel_wrap = cuda.jit("void(uint32[:])")(atomic_add_wrap)
    kernel_wrap[1, 32](data_wrap)

    # Reference histogram of the original values.
    expected = np.zeros(32, dtype=np.uint32)
    for k in indices:
        expected[k] += 1

    self.assertTrue(np.all(data == expected))
    self.assertTrue(np.all(data_wrap == expected))
617
+
618
def test_atomic_add2(self):
    """2D shared uint32 atomic add increments every element by one."""
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    data_wrap = data.copy()
    before = data.copy()

    kernel = cuda.jit("void(uint32[:,:])")(atomic_add2)
    kernel[1, (4, 8)](data)

    kernel_wrap = cuda.jit("void(uint32[:,:])")(atomic_add2_wrap)
    kernel_wrap[1, (4, 8)](data_wrap)

    self.assertTrue(np.all(data == before + 1))
    self.assertTrue(np.all(data_wrap == before + 1))
631
+
632
def test_atomic_add3(self):
    """2D shared uint32 atomic add works with a uint64-cast y index."""
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(uint32[:,:])")(atomic_add3)
    kernel[1, (4, 8)](data)

    self.assertTrue(np.all(data == before + 1))
639
+
640
def test_atomic_add_float(self):
    """Shared float32 atomic add builds a histogram, with and without wrap.

    The reference histogram uses ``float32`` so that it has the same element
    type as the array the kernel writes (matching ``test_atomic_sub_float``).
    """
    ary = np.random.randint(0, 32, size=32).astype(np.float32)
    ary_wrap = ary.copy()
    # Integer view of the inputs, used as indices into the reference.
    orig = ary.copy().astype(np.intp)

    cuda_atomic_add_float = cuda.jit("void(float32[:])")(atomic_add_float)
    cuda_atomic_add_float[1, 32](ary)

    add_float_wrap = cuda.jit("void(float32[:])")(atomic_add_float_wrap)
    add_float_wrap[1, 32](ary_wrap)

    gold = np.zeros(32, dtype=np.float32)
    for k in orig:
        gold[k] += 1.0

    self.assertTrue(np.all(ary == gold))
    self.assertTrue(np.all(ary_wrap == gold))
657
+
658
def test_atomic_add_float_2(self):
    """2D shared float32 atomic add increments every element by one."""
    data = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
    data_wrap = data.copy()
    before = data.copy()

    kernel = cuda.jit("void(float32[:,:])")(atomic_add_float_2)
    kernel[1, (4, 8)](data)

    kernel_wrap = cuda.jit("void(float32[:,:])")(atomic_add_float_2_wrap)
    kernel_wrap[1, (4, 8)](data_wrap)

    self.assertTrue(np.all(data == before + 1))
    self.assertTrue(np.all(data_wrap == before + 1))
671
+
672
def test_atomic_add_float_3(self):
    """2D shared float32 atomic add works with a uint64-cast y index."""
    data = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(float32[:,:])")(atomic_add_float_3)
    kernel[1, (4, 8)](data)

    self.assertTrue(np.all(data == before + 1))
679
+
680
def assertCorrectFloat64Atomics(self, kernel, shared=True):
    """Check that *kernel*'s assembly uses the expected float64 atomic add.

    Nothing is checked when the CUDA simulator is enabled.  When
    ``cc_X_or_above(6, 0)`` is true the assembly must contain a ``red`` or
    ``atom`` ``add.f64`` instruction (``.shared``-qualified when *shared* is
    true); otherwise a 64-bit ``cas`` instruction is expected instead.

    :param kernel: compiled kernel exposing ``inspect_asm()``.
    :param shared: whether the atomic is expected to target shared memory.
    """
    if config.ENABLE_CUDASIM:
        return

    # Use the first (and only) definition
    asm = next(iter(kernel.inspect_asm().values()))
    if cc_X_or_above(6, 0):
        inst = r"(red|atom)"

        if shared:
            inst = rf"{inst}\.shared"

        # Dots are escaped so they only match the literal separators of the
        # instruction name rather than any character.
        self.assertRegex(asm, rf"{inst}\.add\.f64", asm)
    else:
        if shared:
            self.assertIn("atom.shared.cas.b64", asm)
        else:
            self.assertIn("atom.cas.b64", asm)
698
+
699
def test_atomic_add_double(self):
    """Shared float64 atomic add builds a histogram, with and without wrap.

    The reference histogram uses ``float64`` so that it has the same element
    type as the array the kernel writes (matching ``test_atomic_sub_double``).
    """
    idx = np.random.randint(0, 32, size=32, dtype=np.int64)
    ary = np.zeros(32, np.float64)
    ary_wrap = ary.copy()

    cuda_fn = cuda.jit("void(int64[:], float64[:])")(atomic_add_double)
    cuda_fn[1, 32](idx, ary)

    wrap_fn = cuda.jit("void(int64[:], float64[:])")(atomic_add_double_wrap)
    wrap_fn[1, 32](idx, ary_wrap)

    gold = np.zeros(32, dtype=np.float64)
    for k in idx:
        gold[k] += 1.0

    np.testing.assert_equal(ary, gold)
    np.testing.assert_equal(ary_wrap, gold)
    self.assertCorrectFloat64Atomics(cuda_fn)
    self.assertCorrectFloat64Atomics(wrap_fn)
718
+
719
def test_atomic_add_double_2(self):
    """2D shared float64 atomic add increments every element by one."""
    data = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
    data_wrap = data.copy()
    before = data.copy()

    kernel = cuda.jit("void(float64[:,:])")(atomic_add_double_2)
    kernel[1, (4, 8)](data)

    kernel_wrap = cuda.jit("void(float64[:,:])")(atomic_add_double_2_wrap)
    kernel_wrap[1, (4, 8)](data_wrap)

    np.testing.assert_equal(data, before + 1)
    np.testing.assert_equal(data_wrap, before + 1)
    self.assertCorrectFloat64Atomics(kernel)
    self.assertCorrectFloat64Atomics(kernel_wrap)
734
+
735
def test_atomic_add_double_3(self):
    """2D shared float64 atomic add works with a uint64-cast y index."""
    data = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(float64[:,:])")(atomic_add_double_3)
    kernel[1, (4, 8)](data)

    np.testing.assert_equal(data, before + 1)
    self.assertCorrectFloat64Atomics(kernel)
743
+
744
def test_atomic_add_double_global(self):
    """Global float64 atomic add builds a histogram, with and without wrap.

    The reference histogram uses ``float64`` so that it has the same element
    type as the array the kernel writes.
    """
    idx = np.random.randint(0, 32, size=32, dtype=np.int64)
    ary = np.zeros(32, np.float64)
    ary_wrap = ary.copy()

    sig = "void(int64[:], float64[:])"
    cuda_func = cuda.jit(sig)(atomic_add_double_global)
    wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_wrap)

    cuda_func[1, 32](idx, ary)
    wrap_cuda_func[1, 32](idx, ary_wrap)

    gold = np.zeros(32, dtype=np.float64)
    for k in idx:
        gold[k] += 1.0

    np.testing.assert_equal(ary, gold)
    np.testing.assert_equal(ary_wrap, gold)
    self.assertCorrectFloat64Atomics(cuda_func, shared=False)
    self.assertCorrectFloat64Atomics(wrap_cuda_func, shared=False)
764
+
765
def test_atomic_add_double_global_2(self):
    """2D global float64 atomic add increments every element by one."""
    data = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
    data_wrap = data.copy()
    before = data.copy()

    signature = "void(float64[:,:])"
    kernel = cuda.jit(signature)(atomic_add_double_global_2)
    kernel_wrap = cuda.jit(signature)(atomic_add_double_global_2_wrap)

    kernel[1, (4, 8)](data)
    kernel_wrap[1, (4, 8)](data_wrap)

    np.testing.assert_equal(data, before + 1)
    np.testing.assert_equal(data_wrap, before + 1)
    self.assertCorrectFloat64Atomics(kernel, shared=False)
    self.assertCorrectFloat64Atomics(kernel_wrap, shared=False)
781
+
782
def test_atomic_add_double_global_3(self):
    """2D global float64 atomic add works with a uint64-cast y index."""
    data = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(float64[:,:])")(atomic_add_double_global_3)
    kernel[1, (4, 8)](data)

    np.testing.assert_equal(data, before + 1)
    self.assertCorrectFloat64Atomics(kernel, shared=False)
790
+
791
def test_atomic_sub(self):
    """Shared int32 atomic subtract builds a negated histogram."""
    data = np.random.randint(0, 32, size=32, dtype=np.int32)
    indices = data.copy()

    kernel = cuda.jit("void(int32[:])")(atomic_sub)
    kernel[1, 32](data)

    expected = np.zeros(32, dtype=np.int32)
    for k in indices:
        expected[k] -= 1

    self.assertTrue(np.all(data == expected))
802
+
803
def test_atomic_sub2(self):
    """2D shared int32 atomic subtract decrements every element by one."""
    data = np.random.randint(0, 32, size=(4, 8), dtype=np.int32)
    before = data.copy()

    kernel = cuda.jit("void(int32[:,:])")(atomic_sub2)
    kernel[1, (4, 8)](data)

    self.assertTrue(np.all(data == before - 1))
809
+
810
def test_atomic_sub3(self):
    """2D shared int32 atomic subtract works with a uint64-cast y index.

    The input array is created as ``int32`` so that it agrees with the
    kernel signature and with the ``int32`` shared array that
    ``atomic_sub3`` operates on.
    """
    ary = np.random.randint(0, 32, size=(4, 8), dtype=np.int32)
    orig = ary.copy()
    cuda_atomic_sub3 = cuda.jit("void(int32[:,:])")(atomic_sub3)
    cuda_atomic_sub3[1, (4, 8)](ary)
    self.assertTrue(np.all(ary == orig - 1))
816
+
817
def test_atomic_sub_float(self):
    """Shared float32 atomic subtract builds a negated histogram."""
    data = np.random.randint(0, 32, size=32).astype(np.float32)
    # Integer view of the inputs, used as indices into the reference.
    indices = data.copy().astype(np.intp)

    kernel = cuda.jit("void(float32[:])")(atomic_sub_float)
    kernel[1, 32](data)

    expected = np.zeros(32, dtype=np.float32)
    for k in indices:
        expected[k] -= 1.0

    self.assertTrue(np.all(data == expected))
828
+
829
def test_atomic_sub_float_2(self):
    """2D shared float32 atomic subtract decrements every element by one."""
    data = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(float32[:,:])")(atomic_sub_float_2)
    kernel[1, (4, 8)](data)

    self.assertTrue(np.all(data == before - 1))
835
+
836
def test_atomic_sub_float_3(self):
    """2D shared float32 atomic subtract works with a uint64-cast y index."""
    data = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(float32[:,:])")(atomic_sub_float_3)
    kernel[1, (4, 8)](data)

    self.assertTrue(np.all(data == before - 1))
842
+
843
def test_atomic_sub_double(self):
    """Shared float64 atomic subtract builds a negated histogram."""
    indices = np.random.randint(0, 32, size=32, dtype=np.int64)
    data = np.zeros(32, np.float64)

    kernel = cuda.jit("void(int64[:], float64[:])")(atomic_sub_double)
    kernel[1, 32](indices, data)

    expected = np.zeros(32, dtype=np.float64)
    for k in indices:
        expected[k] -= 1.0

    np.testing.assert_equal(data, expected)
854
+
855
def test_atomic_sub_double_2(self):
    """2D shared float64 atomic subtract decrements every element by one."""
    data = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(float64[:,:])")(atomic_sub_double_2)
    kernel[1, (4, 8)](data)

    np.testing.assert_equal(data, before - 1)
861
+
862
def test_atomic_sub_double_3(self):
    """2D shared float64 atomic subtract works with a uint64-cast y index."""
    data = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(float64[:,:])")(atomic_sub_double_3)
    kernel[1, (4, 8)](data)

    np.testing.assert_equal(data, before - 1)
868
+
869
def test_atomic_sub_double_global(self):
    """Global float64 atomic subtract builds a negated histogram."""
    indices = np.random.randint(0, 32, size=32, dtype=np.int64)
    data = np.zeros(32, np.float64)

    signature = "void(int64[:], float64[:])"
    kernel = cuda.jit(signature)(atomic_sub_double_global)
    kernel[1, 32](indices, data)

    expected = np.zeros(32, dtype=np.float64)
    for k in indices:
        expected[k] -= 1.0

    np.testing.assert_equal(data, expected)
881
+
882
def test_atomic_sub_double_global_2(self):
    """2D global float64 atomic subtract decrements every element by one."""
    data = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(float64[:,:])")(atomic_sub_double_global_2)
    kernel[1, (4, 8)](data)

    np.testing.assert_equal(data, before - 1)
888
+
889
def test_atomic_sub_double_global_3(self):
    """``atomic_sub_double_global_3`` decrements every element by one."""
    data = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(float64[:,:])")(atomic_sub_double_global_3)
    kernel[1, (4, 8)](data)

    np.testing.assert_equal(data, before - 1)
895
+
896
def test_atomic_and(self):
    """Shared uint32 atomic AND applies the constant to every hit slot.

    The reference starts from all ones because ``atomic_and`` initialises
    each shared slot to 1.  It is built independently of the kernel output:
    deriving it from the output would make the check vacuous, since AND-ing
    a second time with the same constant changes nothing.
    """
    rand_const = np.random.randint(500)
    ary = np.random.randint(0, 32, size=32).astype(np.uint32)
    orig = ary.copy()
    cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_and)
    cuda_func[1, 32](ary, rand_const)

    gold = np.ones(32, dtype=np.uint32)
    for k in orig:
        gold[k] &= rand_const

    self.assertTrue(np.all(ary == gold))
908
+
909
def test_atomic_and2(self):
    """2D shared uint32 atomic AND masks every element with the constant."""
    mask = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(uint32[:,:], uint32)")(atomic_and2)
    kernel[1, (4, 8)](data, mask)

    self.assertTrue(np.all(data == before & mask))
916
+
917
def test_atomic_and3(self):
    """2D shared uint32 atomic AND works with a uint64-cast y index."""
    mask = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(uint32[:,:], uint32)")(atomic_and3)
    kernel[1, (4, 8)](data, mask)

    self.assertTrue(np.all(data == before & mask))
924
+
925
def test_atomic_and_global(self):
    """Global int32 atomic AND masks every slot referenced by ``idx``.

    The reference copy is taken BEFORE the kernel launch.  Copying the array
    after the launch would compare the kernel output against itself, because
    AND-ing a second time with the same constant changes nothing.
    """
    rand_const = np.random.randint(500)
    idx = np.random.randint(0, 32, size=32, dtype=np.int32)
    ary = np.random.randint(0, 32, size=32, dtype=np.int32)
    gold = ary.copy()
    sig = "void(int32[:], int32[:], int32)"
    cuda_func = cuda.jit(sig)(atomic_and_global)
    cuda_func[1, 32](idx, ary, rand_const)

    for k in idx:
        gold[k] &= rand_const

    np.testing.assert_equal(ary, gold)
938
+
939
def test_atomic_and_global_2(self):
    """2D global uint32 atomic AND masks every element with the constant."""
    mask = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(uint32[:,:], uint32)")(atomic_and_global_2)
    kernel[1, (4, 8)](data, mask)

    np.testing.assert_equal(data, before & mask)
946
+
947
def test_atomic_or(self):
    """Shared uint32 atomic OR sets the constant's bits in every hit slot."""
    bits = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32)
    indices = data.copy()

    kernel = cuda.jit("void(uint32[:], uint32)")(atomic_or)
    kernel[1, 32](data, bits)

    expected = np.zeros(32, dtype=np.uint32)
    for k in indices:
        expected[k] |= bits

    self.assertTrue(np.all(data == expected))
959
+
960
def test_atomic_or2(self):
    """2D shared uint32 atomic OR sets the constant's bits everywhere."""
    bits = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(uint32[:,:], uint32)")(atomic_or2)
    kernel[1, (4, 8)](data, bits)

    self.assertTrue(np.all(data == before | bits))
967
+
968
def test_atomic_or3(self):
    """2D shared uint32 atomic OR works with a uint64-cast y index."""
    bits = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(uint32[:,:], uint32)")(atomic_or3)
    kernel[1, (4, 8)](data, bits)

    self.assertTrue(np.all(data == before | bits))
975
+
976
def test_atomic_or_global(self):
    """Global int32 atomic OR sets bits in every slot referenced by ``idx``.

    The reference copy is taken BEFORE the kernel launch.  Copying the array
    after the launch would compare the kernel output against itself, because
    OR-ing a second time with the same constant changes nothing.
    """
    rand_const = np.random.randint(500)
    idx = np.random.randint(0, 32, size=32, dtype=np.int32)
    ary = np.random.randint(0, 32, size=32, dtype=np.int32)
    gold = ary.copy()
    sig = "void(int32[:], int32[:], int32)"
    cuda_func = cuda.jit(sig)(atomic_or_global)
    cuda_func[1, 32](idx, ary, rand_const)

    for k in idx:
        gold[k] |= rand_const

    np.testing.assert_equal(ary, gold)
989
+
990
def test_atomic_or_global_2(self):
    """2D global uint32 atomic OR sets the constant's bits everywhere."""
    bits = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(uint32[:,:], uint32)")(atomic_or_global_2)
    kernel[1, (4, 8)](data, bits)

    np.testing.assert_equal(data, before | bits)
997
+
998
def test_atomic_xor(self):
    """Shared uint32 atomic XOR toggles the constant once per hit."""
    bits = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32)
    indices = data.copy()

    kernel = cuda.jit("void(uint32[:], uint32)")(atomic_xor)
    kernel[1, 32](data, bits)

    # One XOR per input element, so repeated indices toggle repeatedly.
    expected = np.zeros(32, dtype=np.uint32)
    for k in indices:
        expected[k] ^= bits

    self.assertTrue(np.all(data == expected))
1010
+
1011
def test_atomic_xor2(self):
    """2D shared uint32 atomic XOR toggles the constant in every element."""
    bits = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor2)
    kernel[1, (4, 8)](data, bits)

    self.assertTrue(np.all(data == before ^ bits))
1018
+
1019
def test_atomic_xor3(self):
    """2D shared uint32 atomic XOR works with a uint64-cast y index."""
    bits = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    before = data.copy()

    kernel = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor3)
    kernel[1, (4, 8)](data, bits)

    self.assertTrue(np.all(data == before ^ bits))
1026
+
1027
def test_atomic_xor_global(self):
    """Global int32 atomic XOR toggles the constant once per ``idx`` hit."""
    bits = np.random.randint(500)
    indices = np.random.randint(0, 32, size=32, dtype=np.int32)
    data = np.random.randint(0, 32, size=32, dtype=np.int32)
    # Snapshot the input before the kernel modifies it in place.
    expected = data.copy()

    signature = "void(int32[:], int32[:], int32)"
    kernel = cuda.jit(signature)(atomic_xor_global)
    kernel[1, 32](indices, data, bits)

    for k in indices:
        expected[k] ^= bits

    np.testing.assert_equal(data, expected)
1040
+
1041
def test_atomic_xor_global_2(self):
    """Run atomic_xor_global_2 on a 4x8 uint32 array and compare with a host XOR."""
    operand = np.random.randint(500)
    data = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
    snapshot = data.copy()
    kernel = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor_global_2)
    kernel[1, (4, 8)](data, operand)
    expected = snapshot ^ operand
    np.testing.assert_equal(data, expected)
1048
+
1049
def inc_dec_1dim_setup(self, dtype):
    """Build random 1-D inputs for the inc/dec tests.

    Returns a tuple ``(limit, values, index)``: a random scalar in
    ``[0, 32)``, 32 random values in ``[0, 32)`` and ``arange(32)``, all
    of the requested ``dtype``.
    """
    count = 32
    limit = np.random.randint(count, dtype=dtype)
    values = np.random.randint(0, count, size=count).astype(dtype)
    return limit, values, np.arange(count, dtype=dtype)
1054
+
1055
def inc_dec_2dim_setup(self, dtype):
    """Build random 2-D inputs for the inc/dec tests.

    Returns a tuple ``(limit, values)``: a random scalar in ``[0, 32)``
    and a 4x8 array of random values in ``[0, 32)``, both of ``dtype``.
    """
    limit = np.random.randint(32, dtype=dtype)
    flat = np.random.randint(0, 32, size=32).astype(dtype)
    return limit, flat.reshape(4, 8)
1059
+
1060
def check_inc_index(self, ary, idx, rconst, sig, nblocks, blksize, func):
    """Compile ``func`` with ``sig``, launch it as ``(ary, idx, rconst)`` and
    verify the wrapping increment.

    Each element is expected to become 0 where it was ``>= rconst`` and
    its previous value plus one otherwise.
    """
    before = ary.copy()
    kernel = cuda.jit(sig)(func)
    kernel[nblocks, blksize](ary, idx, rconst)
    expected = np.where(before >= rconst, 0, before + 1)
    np.testing.assert_equal(ary, expected)
1065
+
1066
def check_inc_index2(self, ary, idx, rconst, sig, nblocks, blksize, func):
    """Compile ``func`` with ``sig``, launch it as ``(idx, ary, rconst)`` and
    verify the wrapping increment.

    Same expectation as ``check_inc_index``; only the kernel argument
    order differs (index array first).
    """
    before = ary.copy()
    kernel = cuda.jit(sig)(func)
    kernel[nblocks, blksize](idx, ary, rconst)
    expected = np.where(before >= rconst, 0, before + 1)
    np.testing.assert_equal(ary, expected)
1071
+
1072
def check_inc(self, ary, rconst, sig, nblocks, blksize, func):
    """Compile ``func`` with ``sig``, launch it as ``(ary, rconst)`` and
    verify the wrapping increment.

    Each element is expected to become 0 where it was ``>= rconst`` and
    its previous value plus one otherwise.
    """
    before = ary.copy()
    kernel = cuda.jit(sig)(func)
    kernel[nblocks, blksize](ary, rconst)
    expected = np.where(before >= rconst, 0, before + 1)
    np.testing.assert_equal(ary, expected)
1077
+
1078
def test_atomic_inc_32(self):
    """Check atomic_inc32 on uint32 data with launch configuration [1, 32]."""
    limit, data, index = self.inc_dec_1dim_setup(np.uint32)
    self.check_inc_index(
        ary=data,
        idx=index,
        rconst=limit,
        sig="void(uint32[:], uint32[:], uint32)",
        nblocks=1,
        blksize=32,
        func=atomic_inc32,
    )
1082
+
1083
def test_atomic_inc_64(self):
    """Check atomic_inc64 on uint64 data with launch configuration [1, 32]."""
    limit, data, index = self.inc_dec_1dim_setup(np.uint64)
    self.check_inc_index(
        ary=data,
        idx=index,
        rconst=limit,
        sig="void(uint64[:], uint64[:], uint64)",
        nblocks=1,
        blksize=32,
        func=atomic_inc64,
    )
1087
+
1088
def test_atomic_inc2_32(self):
    """Check atomic_inc2_32 on a 4x8 uint32 array."""
    limit, data = self.inc_dec_2dim_setup(dtype=np.uint32)
    self.check_inc(
        ary=data,
        rconst=limit,
        sig="void(uint32[:,:], uint32)",
        nblocks=1,
        blksize=(4, 8),
        func=atomic_inc2_32,
    )
1092
+
1093
def test_atomic_inc2_64(self):
    """Check atomic_inc2_64 on a 4x8 uint64 array."""
    limit, data = self.inc_dec_2dim_setup(dtype=np.uint64)
    self.check_inc(
        ary=data,
        rconst=limit,
        sig="void(uint64[:,:], uint64)",
        nblocks=1,
        blksize=(4, 8),
        func=atomic_inc2_64,
    )
1097
+
1098
def test_atomic_inc3(self):
    """Check atomic_inc3 on a 4x8 uint32 array."""
    limit, data = self.inc_dec_2dim_setup(dtype=np.uint32)
    self.check_inc(
        ary=data,
        rconst=limit,
        sig="void(uint32[:,:], uint32)",
        nblocks=1,
        blksize=(4, 8),
        func=atomic_inc3,
    )
1102
+
1103
def test_atomic_inc_global_32(self):
    """Check atomic_inc_global on uint32 data (index array passed first)."""
    limit, data, index = self.inc_dec_1dim_setup(np.uint32)
    signature = "void(uint32[:], uint32[:], uint32)"
    self.check_inc_index2(data, index, limit, signature, 1, 32, atomic_inc_global)
1109
+
1110
def test_atomic_inc_global_64(self):
    """Check atomic_inc_global on uint64 data (index array passed first)."""
    limit, data, index = self.inc_dec_1dim_setup(np.uint64)
    signature = "void(uint64[:], uint64[:], uint64)"
    self.check_inc_index2(data, index, limit, signature, 1, 32, atomic_inc_global)
1116
+
1117
def test_atomic_inc_global_2_32(self):
    """Check atomic_inc_global_2 on a 4x8 uint32 array."""
    limit, data = self.inc_dec_2dim_setup(dtype=np.uint32)
    self.check_inc(
        ary=data,
        rconst=limit,
        sig="void(uint32[:,:], uint32)",
        nblocks=1,
        blksize=(4, 8),
        func=atomic_inc_global_2,
    )
1121
+
1122
def test_atomic_inc_global_2_64(self):
    """Check atomic_inc_global_2 on a 4x8 uint64 array."""
    limit, data = self.inc_dec_2dim_setup(dtype=np.uint64)
    self.check_inc(
        ary=data,
        rconst=limit,
        sig="void(uint64[:,:], uint64)",
        nblocks=1,
        blksize=(4, 8),
        func=atomic_inc_global_2,
    )
1126
+
1127
def check_dec_index(self, ary, idx, rconst, sig, nblocks, blksize, func):
    """Compile ``func`` with ``sig``, launch it as ``(ary, idx, rconst)`` and
    verify the wrapping decrement.

    Each element is expected to become ``rconst`` where it was 0 or
    greater than ``rconst``, and its previous value minus one otherwise.
    """
    before = ary.copy()
    kernel = cuda.jit(sig)(func)
    kernel[nblocks, blksize](ary, idx, rconst)
    stepped = np.where(before > rconst, rconst, before - 1)
    expected = np.where(before == 0, rconst, stepped)
    np.testing.assert_equal(ary, expected)
1137
+
1138
def check_dec_index2(self, ary, idx, rconst, sig, nblocks, blksize, func):
    """Compile ``func`` with ``sig``, launch it as ``(idx, ary, rconst)`` and
    verify the wrapping decrement.

    Each element is expected to become ``rconst`` where it was 0 or
    greater than ``rconst``, and its previous value minus one otherwise.
    """
    before = ary.copy()
    kernel = cuda.jit(sig)(func)
    kernel[nblocks, blksize](idx, ary, rconst)
    stepped = np.where(before > rconst, rconst, before - 1)
    expected = np.where(before == 0, rconst, stepped)
    np.testing.assert_equal(ary, expected)
1148
+
1149
def check_dec(self, ary, rconst, sig, nblocks, blksize, func):
    """Compile ``func`` with ``sig``, launch it as ``(ary, rconst)`` and
    verify the wrapping decrement.

    Each element is expected to become ``rconst`` where it was 0 or
    greater than ``rconst``, and its previous value minus one otherwise.
    """
    before = ary.copy()
    kernel = cuda.jit(sig)(func)
    kernel[nblocks, blksize](ary, rconst)
    stepped = np.where(before > rconst, rconst, before - 1)
    expected = np.where(before == 0, rconst, stepped)
    np.testing.assert_equal(ary, expected)
1159
+
1160
def test_atomic_dec_32(self):
    """Check atomic_dec32 on uint32 data with launch configuration [1, 32]."""
    limit, data, index = self.inc_dec_1dim_setup(np.uint32)
    self.check_dec_index(
        ary=data,
        idx=index,
        rconst=limit,
        sig="void(uint32[:], uint32[:], uint32)",
        nblocks=1,
        blksize=32,
        func=atomic_dec32,
    )
1164
+
1165
def test_atomic_dec_64(self):
    """Check atomic_dec64 on uint64 data with launch configuration [1, 32]."""
    limit, data, index = self.inc_dec_1dim_setup(np.uint64)
    self.check_dec_index(
        ary=data,
        idx=index,
        rconst=limit,
        sig="void(uint64[:], uint64[:], uint64)",
        nblocks=1,
        blksize=32,
        func=atomic_dec64,
    )
1169
+
1170
def test_atomic_dec2_32(self):
    """Check atomic_dec2_32 on a 4x8 uint32 array."""
    limit, data = self.inc_dec_2dim_setup(dtype=np.uint32)
    self.check_dec(
        ary=data,
        rconst=limit,
        sig="void(uint32[:,:], uint32)",
        nblocks=1,
        blksize=(4, 8),
        func=atomic_dec2_32,
    )
1174
+
1175
def test_atomic_dec2_64(self):
    """Check atomic_dec2_64 on a 4x8 uint64 array."""
    limit, data = self.inc_dec_2dim_setup(dtype=np.uint64)
    self.check_dec(
        ary=data,
        rconst=limit,
        sig="void(uint64[:,:], uint64)",
        nblocks=1,
        blksize=(4, 8),
        func=atomic_dec2_64,
    )
1179
+
1180
def test_atomic_dec3_new(self):
    """Check atomic_dec3 on a 4x8 uint32 array."""
    limit, data = self.inc_dec_2dim_setup(dtype=np.uint32)
    self.check_dec(
        ary=data,
        rconst=limit,
        sig="void(uint32[:,:], uint32)",
        nblocks=1,
        blksize=(4, 8),
        func=atomic_dec3,
    )
1184
+
1185
def test_atomic_dec_global_32(self):
    """Check atomic_dec_global on uint32 data (index array passed first)."""
    limit, data, index = self.inc_dec_1dim_setup(np.uint32)
    signature = "void(uint32[:], uint32[:], uint32)"
    self.check_dec_index2(data, index, limit, signature, 1, 32, atomic_dec_global)
1191
+
1192
def test_atomic_dec_global_64(self):
    """Check atomic_dec_global on uint64 data (index array passed first)."""
    limit, data, index = self.inc_dec_1dim_setup(np.uint64)
    signature = "void(uint64[:], uint64[:], uint64)"
    self.check_dec_index2(data, index, limit, signature, 1, 32, atomic_dec_global)
1198
+
1199
def test_atomic_dec_global2_32(self):
    """Check atomic_dec_global_2 on a 4x8 uint32 array."""
    limit, data = self.inc_dec_2dim_setup(dtype=np.uint32)
    self.check_dec(
        ary=data,
        rconst=limit,
        sig="void(uint32[:,:], uint32)",
        nblocks=1,
        blksize=(4, 8),
        func=atomic_dec_global_2,
    )
1203
+
1204
def test_atomic_dec_global2_64(self):
    """Check atomic_dec_global_2 on a 4x8 uint64 array."""
    limit, data = self.inc_dec_2dim_setup(dtype=np.uint64)
    self.check_dec(
        ary=data,
        rconst=limit,
        sig="void(uint64[:,:], uint64)",
        nblocks=1,
        blksize=(4, 8),
        func=atomic_dec_global_2,
    )
1208
+
1209
def test_atomic_exch(self):
    """After atomic_exch, every element of the array must equal the constant."""
    replacement = np.random.randint(50, 100, dtype=np.uint32)
    data = np.random.randint(0, 32, size=32).astype(np.uint32)
    index = np.arange(32, dtype=np.uint32)

    signature = "void(uint32[:], uint32[:], uint32)"
    kernel = cuda.jit(signature)(atomic_exch)
    kernel[1, 32](data, index, replacement)

    np.testing.assert_equal(data, replacement)
1218
+
1219
def test_atomic_exch2(self):
    """After atomic_exch2, every element of the 4x8 uint32 array must equal
    the constant."""
    replacement = np.random.randint(50, 100, dtype=np.uint32)
    flat = np.random.randint(0, 32, size=32).astype(np.uint32)
    data = flat.reshape(4, 8)

    kernel = cuda.jit("void(uint32[:,:], uint32)")(atomic_exch2)
    kernel[1, (4, 8)](data, replacement)
    np.testing.assert_equal(data, replacement)
1226
+
1227
def test_atomic_exch3(self):
    """After atomic_exch3, every element of the 4x8 uint64 array must equal
    the constant."""
    replacement = np.random.randint(50, 100, dtype=np.uint64)
    flat = np.random.randint(0, 32, size=32).astype(np.uint64)
    data = flat.reshape(4, 8)

    kernel = cuda.jit("void(uint64[:,:], uint64)")(atomic_exch3)
    kernel[1, (4, 8)](data, replacement)
    np.testing.assert_equal(data, replacement)
1234
+
1235
def test_atomic_exch_global(self):
    """After atomic_exch_global, every element of the array must equal the
    constant."""
    replacement = np.random.randint(50, 100, dtype=np.uint32)
    index = np.arange(32, dtype=np.uint32)
    data = np.random.randint(0, 32, size=32, dtype=np.uint32)

    kernel = cuda.jit("void(uint32[:], uint32[:], uint32)")(atomic_exch_global)
    kernel[1, 32](index, data, replacement)
    np.testing.assert_equal(data, replacement)
1244
+
1245
def check_atomic_max(self, dtype, lo, hi):
    """Run atomic_max over a random 32x32 array of ``dtype`` and compare the
    single-element result with ``np.max``.

    Values are drawn as integers from ``[lo, hi)`` and cast to ``dtype``;
    the result slot starts at zero.
    """
    samples = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
    result = np.zeros(1, dtype=samples.dtype)
    kernel = cuda.jit(atomic_max)
    kernel[32, 32](result, samples)
    np.testing.assert_equal(result, samples.max())
1252
+
1253
def test_atomic_max_int32(self):
    """Atomic max over int32 values drawn from [-65535, 65535)."""
    self.check_atomic_max(np.int32, -65535, 65535)
1255
+
1256
def test_atomic_max_uint32(self):
    """Atomic max over uint32 values drawn from [0, 65535)."""
    self.check_atomic_max(np.uint32, 0, 65535)
1258
+
1259
def test_atomic_max_int64(self):
    """Atomic max over int64 values drawn from [-65535, 65535)."""
    self.check_atomic_max(np.int64, -65535, 65535)
1261
+
1262
def test_atomic_max_uint64(self):
    """Atomic max over uint64 values drawn from [0, 65535)."""
    self.check_atomic_max(np.uint64, 0, 65535)
1264
+
1265
def test_atomic_max_float32(self):
    """Atomic max over float32 values drawn from [-65535, 65535)."""
    self.check_atomic_max(np.float32, -65535, 65535)
1267
+
1268
def test_atomic_max_double(self):
    """Atomic max over float64 values drawn from [-65535, 65535)."""
    self.check_atomic_max(np.float64, -65535, 65535)
1270
+
1271
def test_atomic_max_double_normalizedindex(self):
    """Run atomic_max_double_normalizedindex on a 32x32 float64 array and
    compare the result with ``np.max``."""
    samples = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
    result = np.zeros(1, np.float64)
    signature = "void(float64[:], float64[:,:])"
    kernel = cuda.jit(signature)(atomic_max_double_normalizedindex)
    kernel[32, 32](result, samples)

    np.testing.assert_equal(result, samples.max())
1281
+
1282
def test_atomic_max_double_oneindex(self):
    """Run atomic_max_double_oneindex on 32 float64 values and compare the
    result with ``np.max``."""
    samples = np.random.randint(0, 128, size=32).astype(np.float64)
    result = np.zeros(1, np.float64)
    signature = "void(float64[:], float64[:])"
    kernel = cuda.jit(signature)(atomic_max_double_oneindex)
    kernel[1, 32](result, samples)

    np.testing.assert_equal(result, samples.max())
1292
+
1293
def check_atomic_min(self, dtype, lo, hi):
    """Run atomic_min over a random 32x32 array of ``dtype`` and compare the
    single-element result with ``np.min``.

    Values are drawn as integers from ``[lo, hi)`` and cast to ``dtype``;
    the result slot starts at 65535.
    """
    samples = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
    result = np.full(1, 65535, dtype=samples.dtype)
    kernel = cuda.jit(atomic_min)
    kernel[32, 32](result, samples)

    np.testing.assert_equal(result, samples.min())
1301
+
1302
def test_atomic_min_int32(self):
    """Atomic min over int32 values drawn from [-65535, 65535)."""
    self.check_atomic_min(np.int32, -65535, 65535)
1304
+
1305
def test_atomic_min_uint32(self):
    """Atomic min over uint32 values drawn from [0, 65535)."""
    self.check_atomic_min(np.uint32, 0, 65535)
1307
+
1308
def test_atomic_min_int64(self):
    """Atomic min over int64 values drawn from [-65535, 65535)."""
    self.check_atomic_min(np.int64, -65535, 65535)
1310
+
1311
def test_atomic_min_uint64(self):
    """Atomic min over uint64 values drawn from [0, 65535)."""
    self.check_atomic_min(np.uint64, 0, 65535)
1313
+
1314
def test_atomic_min_float(self):
    """Atomic min over float32 values drawn from [-65535, 65535)."""
    self.check_atomic_min(np.float32, -65535, 65535)
1316
+
1317
def test_atomic_min_double(self):
    """Atomic min over float64 values drawn from [-65535, 65535)."""
    self.check_atomic_min(np.float64, -65535, 65535)
1319
+
1320
def test_atomic_min_double_normalizedindex(self):
    """Run atomic_min_double_normalizedindex on a 32x32 float64 array and
    compare the result with ``np.min``."""
    samples = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
    # Start above every possible sample (samples are < 65535).
    result = np.full(1, 65535, dtype=np.float64)
    signature = "void(float64[:], float64[:,:])"
    kernel = cuda.jit(signature)(atomic_min_double_normalizedindex)
    kernel[32, 32](result, samples)

    np.testing.assert_equal(result, samples.min())
1330
+
1331
def test_atomic_min_double_oneindex(self):
    """Run atomic_min_double_oneindex on 32 float64 values and compare the
    result with ``np.min``."""
    samples = np.random.randint(0, 128, size=32).astype(np.float64)
    # Start above every possible sample (samples are < 128).
    result = np.full(1, 128, dtype=np.float64)
    signature = "void(float64[:], float64[:])"
    kernel = cuda.jit(signature)(atomic_min_double_oneindex)
    kernel[1, 32](result, samples)

    np.testing.assert_equal(result, samples.min())
1341
+
1342
+ # Taken together, _test_atomic_minmax_nan_location and
1343
+ # _test_atomic_minmax_nan_val check that NaNs are treated similarly to the
1344
+ # way they are in Python / NumPy - that is, {min,max}(a, b) == a if either
1345
+ # a or b is a NaN. For the atomics, this means that the max is taken as the
1346
+ # value stored in the memory location rather than the value supplied - i.e.
1347
+ # for:
1348
+ #
1349
+ # cuda.atomic.{min,max}(ary, idx, val)
1350
+ #
1351
+ # the result will be ary[idx] for either of ary[idx] or val being NaN.
1352
+
1353
def _test_atomic_minmax_nan_location(self, func):
    """Verify that a NaN already stored in the result slot is left in place
    when ``func`` is applied with a non-NaN value."""
    kernel = cuda.jit("void(float64[:], float64[:,:])")(func)

    samples = np.random.randint(0, 128, size=(1, 1)).astype(np.float64)
    zero = np.zeros(1, np.float64)
    result = zero + np.nan
    kernel[1, 1](result, samples)
    np.testing.assert_equal(result, [np.nan])
1360
+
1361
def _test_atomic_minmax_nan_val(self, func):
    """Verify that applying ``func`` with a NaN value leaves the stored
    result unchanged."""
    kernel = cuda.jit("void(float64[:], float64[:,:])")(func)

    result = np.random.randint(0, 128, size=1).astype(np.float64)
    expected = result.copy()
    zero = np.zeros((1, 1), np.float64)
    samples = zero + np.nan
    kernel[1, 1](result, samples)

    np.testing.assert_equal(result, expected)
1370
+
1371
def test_atomic_min_nan_location(self):
    """atomic_min with a NaN already stored in the result slot."""
    self._test_atomic_minmax_nan_location(func=atomic_min)
1373
+
1374
def test_atomic_max_nan_location(self):
    """atomic_max with a NaN already stored in the result slot."""
    self._test_atomic_minmax_nan_location(func=atomic_max)
1376
+
1377
def test_atomic_min_nan_val(self):
    """atomic_min with a NaN supplied as the value."""
    self._test_atomic_minmax_nan_val(func=atomic_min)
1379
+
1380
def test_atomic_max_nan_val(self):
    """atomic_max with a NaN supplied as the value."""
    self._test_atomic_minmax_nan_val(func=atomic_max)
1382
+
1383
def test_atomic_max_double_shared(self):
    """Run atomic_max_double_shared on 32 float64 values and compare the
    result with ``np.max``."""
    samples = np.random.randint(0, 32, size=32).astype(np.float64)
    result = np.zeros(1, np.float64)
    kernel = cuda.jit("void(float64[:], float64[:])")(atomic_max_double_shared)
    kernel[1, 32](result, samples)

    np.testing.assert_equal(result, samples.max())
1392
+
1393
def test_atomic_min_double_shared(self):
    """Run atomic_min_double_shared on 32 float64 values and compare the
    result with ``np.min``."""
    samples = np.random.randint(0, 32, size=32).astype(np.float64)
    # Start above every possible sample (samples are < 32).
    result = np.full(1, 32, dtype=np.float64)
    kernel = cuda.jit("void(float64[:], float64[:])")(atomic_min_double_shared)
    kernel[1, 32](result, samples)

    np.testing.assert_equal(result, samples.min())
1402
+
1403
def check_cas(self, n, fill, unfill, dtype, cas_func, ndim=1):
    """Run a compare-and-swap kernel and verify both of its outputs.

    ``res`` is a shuffled array holding ``n // 2`` copies of ``fill`` and
    ``n // 2`` copies of ``unfill``. The kernel is launched as
    ``cas_func(res, out, ary, fill)`` and afterwards:

    * ``res`` must hold the matching ``ary`` element wherever it originally
      held ``fill``, and still hold ``unfill`` elsewhere;
    * ``out`` must equal the original contents of ``res``.

    :param n: total number of elements (the launch is sized for 100).
    :param fill: value the kernel compares against.
    :param unfill: value that must be left untouched.
    :param dtype: NumPy dtype of all arrays.
    :param cas_func: Python function to compile with ``cuda.jit``.
    :param ndim: 1 for a flat array, 2 for an array with 10 rows.
    """
    half = n // 2
    res = [fill] * half + [unfill] * half
    np.random.shuffle(res)
    res = np.asarray(res, dtype=dtype)
    if ndim == 2:
        # Use reshape() rather than assigning to ``.shape``: in-place
        # shape assignment is discouraged by NumPy.
        res = res.reshape(10, -1)
    out = np.zeros_like(res)
    ary = np.random.randint(1, 10, size=res.shape).astype(res.dtype)

    fill_mask = res == fill
    unfill_mask = res == unfill

    # Expected state of ``res`` after the kernel has run.
    expect_res = np.zeros_like(res)
    expect_res[fill_mask] = ary[fill_mask]
    expect_res[unfill_mask] = unfill

    # ``out`` receives the values seen before the swap.
    expect_out = res.copy()

    cuda_func = cuda.jit(cas_func)
    if ndim == 1:
        cuda_func[10, 10](res, out, ary, fill)
    else:
        cuda_func[(10, 10), (10, 10)](res, out, ary, fill)

    np.testing.assert_array_equal(expect_res, res)
    np.testing.assert_array_equal(expect_out, out)
1429
+
1430
def test_atomic_compare_and_swap(self):
    """atomic_compare_and_swap on int32 with fill=-99, unfill=-1."""
    self.check_cas(100, -99, -1, np.int32, atomic_compare_and_swap)
1438
+
1439
def test_atomic_compare_and_swap2(self):
    """atomic_compare_and_swap on int64 with fill=-45, unfill=-1."""
    self.check_cas(100, -45, -1, np.int64, atomic_compare_and_swap)
1447
+
1448
def test_atomic_compare_and_swap3(self):
    """atomic_compare_and_swap on uint32 with random fill/unfill values."""
    fill_value = np.random.randint(50, 500, dtype=np.uint32)
    unfill_value = np.random.randint(1, 25, dtype=np.uint32)
    self.check_cas(
        100, fill_value, unfill_value, np.uint32, atomic_compare_and_swap
    )
1458
+
1459
def test_atomic_compare_and_swap4(self):
    """atomic_compare_and_swap on uint64 with random fill/unfill values."""
    fill_value = np.random.randint(50, 500, dtype=np.uint64)
    unfill_value = np.random.randint(1, 25, dtype=np.uint64)
    self.check_cas(
        100, fill_value, unfill_value, np.uint64, atomic_compare_and_swap
    )
1469
+
1470
def test_atomic_cas_1dim(self):
    """atomic_cas_1dim on int32 with fill=-99, unfill=-1."""
    self.check_cas(100, -99, -1, np.int32, atomic_cas_1dim)
1474
+
1475
def test_atomic_cas_2dim(self):
    """atomic_cas_2dim on a 2-D int32 array with fill=-99, unfill=-1."""
    self.check_cas(100, -99, -1, np.int32, atomic_cas_2dim, ndim=2)
1484
+
1485
def test_atomic_cas2_1dim(self):
    """atomic_cas_1dim on int64 with fill=-45, unfill=-1."""
    self.check_cas(100, -45, -1, np.int64, atomic_cas_1dim)
1489
+
1490
def test_atomic_cas2_2dim(self):
    """atomic_cas_2dim on a 2-D int64 array with fill=-45, unfill=-1."""
    self.check_cas(100, -45, -1, np.int64, atomic_cas_2dim, ndim=2)
1499
+
1500
def test_atomic_cas3_1dim(self):
    """atomic_cas_1dim on uint32 with random fill/unfill values."""
    fill_value = np.random.randint(50, 500, dtype=np.uint32)
    unfill_value = np.random.randint(1, 25, dtype=np.uint32)
    self.check_cas(100, fill_value, unfill_value, np.uint32, atomic_cas_1dim)
1510
+
1511
def test_atomic_cas3_2dim(self):
    """atomic_cas_2dim on a 2-D uint32 array with random fill/unfill values."""
    fill_value = np.random.randint(50, 500, dtype=np.uint32)
    unfill_value = np.random.randint(1, 25, dtype=np.uint32)
    self.check_cas(
        100, fill_value, unfill_value, np.uint32, atomic_cas_2dim, ndim=2
    )
1522
+
1523
def test_atomic_cas4_1dim(self):
    """atomic_cas_1dim on uint64 with random fill/unfill values."""
    fill_value = np.random.randint(50, 500, dtype=np.uint64)
    unfill_value = np.random.randint(1, 25, dtype=np.uint64)
    self.check_cas(100, fill_value, unfill_value, np.uint64, atomic_cas_1dim)
1533
+
1534
def test_atomic_cas4_2dim(self):
    """atomic_cas_2dim on a 2-D uint64 array with random fill/unfill values."""
    fill_value = np.random.randint(50, 500, dtype=np.uint64)
    unfill_value = np.random.randint(1, 25, dtype=np.uint64)
    self.check_cas(
        100, fill_value, unfill_value, np.uint64, atomic_cas_2dim, ndim=2
    )
1545
+
1546
+ # Tests that the atomic add, min, and max operations return the old value -
1547
+ # in the simulator, they did not (see Issue #5458). The max and min have
1548
+ # special handling for NaN values, so we explicitly test with a NaN in the
1549
+ # array being modified and the value provided.
1550
+
1551
def _test_atomic_returns_old(self, kernel, initial):
    """Launch ``kernel`` on a two-element float32 buffer whose first element
    is ``initial`` and check that the second element ends up holding it.

    A NaN ``initial`` is checked with ``np.isnan`` because NaN never
    compares equal to itself.
    """
    buf = np.array([initial, 0], dtype=np.float32)
    kernel[1, 1](buf)
    returned = buf[1]
    if not np.isnan(initial):
        self.assertEqual(returned, initial)
    else:
        self.assertTrue(np.isnan(returned))
1559
+
1560
def test_atomic_add_returns_old(self):
    """cuda.atomic.add(x, 0, 1) on an initial 10 must return 10."""

    @cuda.jit
    def add_kernel(buf):
        buf[1] = cuda.atomic.add(buf, 0, 1)

    self._test_atomic_returns_old(kernel=add_kernel, initial=10)
1566
+
1567
def test_atomic_max_returns_no_replace(self):
    """cuda.atomic.max(x, 0, 1) on an initial 10 must return 10."""

    @cuda.jit
    def max_kernel(buf):
        buf[1] = cuda.atomic.max(buf, 0, 1)

    self._test_atomic_returns_old(kernel=max_kernel, initial=10)
1573
+
1574
def test_atomic_max_returns_old_replace(self):
    """cuda.atomic.max(x, 0, 10) on an initial 1 must return 1."""

    @cuda.jit
    def max_kernel(buf):
        buf[1] = cuda.atomic.max(buf, 0, 10)

    self._test_atomic_returns_old(kernel=max_kernel, initial=1)
1580
+
1581
def test_atomic_max_returns_old_nan_in_array(self):
    """cuda.atomic.max(x, 0, 1) on an initial NaN must return NaN."""

    @cuda.jit
    def max_kernel(buf):
        buf[1] = cuda.atomic.max(buf, 0, 1)

    self._test_atomic_returns_old(kernel=max_kernel, initial=np.nan)
1587
+
1588
def test_atomic_max_returns_old_nan_val(self):
    """cuda.atomic.max(x, 0, NaN) on an initial 10 must return 10."""

    @cuda.jit
    def max_kernel(buf):
        buf[1] = cuda.atomic.max(buf, 0, np.nan)

    self._test_atomic_returns_old(kernel=max_kernel, initial=10)
1594
+
1595
def test_atomic_min_returns_old_no_replace(self):
    """cuda.atomic.min(x, 0, 11) on an initial 10 must return 10."""

    @cuda.jit
    def min_kernel(buf):
        buf[1] = cuda.atomic.min(buf, 0, 11)

    self._test_atomic_returns_old(kernel=min_kernel, initial=10)
1601
+
1602
def test_atomic_min_returns_old_replace(self):
    """cuda.atomic.min(x, 0, 10) on an initial 11 must return 11."""

    @cuda.jit
    def min_kernel(buf):
        buf[1] = cuda.atomic.min(buf, 0, 10)

    self._test_atomic_returns_old(kernel=min_kernel, initial=11)
1608
+
1609
def test_atomic_min_returns_old_nan_in_array(self):
    """cuda.atomic.min(x, 0, 11) on an initial NaN must return NaN."""

    @cuda.jit
    def min_kernel(buf):
        buf[1] = cuda.atomic.min(buf, 0, 11)

    self._test_atomic_returns_old(kernel=min_kernel, initial=np.nan)
1615
+
1616
def test_atomic_min_returns_old_nan_val(self):
    """cuda.atomic.min(x, 0, NaN) on an initial 11 must return 11."""

    @cuda.jit
    def min_kernel(buf):
        buf[1] = cuda.atomic.min(buf, 0, np.nan)

    self._test_atomic_returns_old(kernel=min_kernel, initial=11)
1622
+
1623
+ # Tests for atomic nanmin/nanmax
1624
+
1625
+ # nanmax tests
1626
def check_atomic_nanmax(self, dtype, lo, hi, init_val):
    """Run atomic_nanmax over a random 32x32 array of ``dtype`` and compare
    the single-element result with ``np.nanmax``.

    Every second row (starting at row 1) is overwritten with ``init_val``
    before the launch; the result slot starts at zero.
    """
    samples = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
    samples[1::2] = init_val
    result = np.zeros(1, dtype=samples.dtype)
    kernel = cuda.jit(atomic_nanmax)
    kernel[32, 32](result, samples)
    np.testing.assert_equal(result, np.nanmax(samples))
1634
+
1635
def test_atomic_nanmax_int32(self):
    """Atomic nanmax over int32 values; alternate rows are set to 0."""
    self.check_atomic_nanmax(np.int32, -65535, 65535, 0)
1639
+
1640
def test_atomic_nanmax_uint32(self):
    """Atomic nanmax over uint32 values; alternate rows are set to 0."""
    self.check_atomic_nanmax(np.uint32, 0, 65535, 0)
1642
+
1643
def test_atomic_nanmax_int64(self):
    """Atomic nanmax over int64 values; alternate rows are set to 0."""
    self.check_atomic_nanmax(np.int64, -65535, 65535, 0)
1647
+
1648
def test_atomic_nanmax_uint64(self):
    """Atomic nanmax over uint64 values; alternate rows are set to 0."""
    self.check_atomic_nanmax(np.uint64, 0, 65535, 0)
1650
+
1651
def test_atomic_nanmax_float32(self):
    """Atomic nanmax over float32 values; alternate rows are set to NaN."""
    self.check_atomic_nanmax(np.float32, -65535, 65535, np.nan)
1655
+
1656
def test_atomic_nanmax_double(self):
    """Atomic nanmax over float64 values; alternate rows are set to NaN."""
    self.check_atomic_nanmax(np.float64, -65535, 65535, np.nan)
1660
+
1661
def test_atomic_nanmax_double_shared(self):
    """Run atomic_nanmax_double_shared on 32 float64 values, half of them
    NaN, and compare the result with ``np.nanmax``."""
    samples = np.random.randint(0, 32, size=32).astype(np.float64)
    samples[1::2] = np.nan
    result = np.zeros(1, dtype=samples.dtype)
    kernel = cuda.jit("void(float64[:], float64[:])")(
        atomic_nanmax_double_shared
    )
    kernel[1, 32](result, samples)

    np.testing.assert_equal(result, np.nanmax(samples))
1671
+
1672
def test_atomic_nanmax_double_oneindex(self):
    """Run atomic_max_double_oneindex on 32 float64 values, half of them
    NaN, and compare the result with ``np.nanmax``."""
    # NOTE(review): this compiles atomic_max_double_oneindex rather than a
    # nanmax-specific kernel - confirm that is intentional.
    samples = np.random.randint(0, 128, size=32).astype(np.float64)
    samples[1::2] = np.nan
    result = np.zeros(1, np.float64)
    signature = "void(float64[:], float64[:])"
    kernel = cuda.jit(signature)(atomic_max_double_oneindex)
    kernel[1, 32](result, samples)

    np.testing.assert_equal(result, np.nanmax(samples))
1683
+
1684
+ # nanmin tests
1685
def check_atomic_nanmin(self, dtype, lo, hi, init_val):
    """Run atomic_nanmin over a random 32x32 array of ``dtype`` and compare
    the single-element result with ``np.nanmin``.

    Every second row (starting at row 1) is overwritten with ``init_val``
    before the launch; the result slot starts at 65535.
    """
    samples = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
    samples[1::2] = init_val
    result = np.full(1, 65535, dtype=samples.dtype)
    kernel = cuda.jit(atomic_nanmin)
    kernel[32, 32](result, samples)

    np.testing.assert_equal(result, np.nanmin(samples))
1694
+
1695
def test_atomic_nanmin_int32(self):
    """Atomic nanmin over int32 values; alternate rows are set to 0."""
    self.check_atomic_nanmin(np.int32, -65535, 65535, 0)
1699
+
1700
def test_atomic_nanmin_uint32(self):
    """Atomic nanmin over uint32 values; alternate rows are set to 0."""
    self.check_atomic_nanmin(np.uint32, 0, 65535, 0)
1702
+
1703
def test_atomic_nanmin_int64(self):
    """Atomic nanmin over int64 values; alternate rows are set to 0."""
    self.check_atomic_nanmin(np.int64, -65535, 65535, 0)
1707
+
1708
def test_atomic_nanmin_uint64(self):
    """Atomic nanmin over uint64 values; alternate rows are set to 0."""
    self.check_atomic_nanmin(np.uint64, 0, 65535, 0)
1710
+
1711
def test_atomic_nanmin_float(self):
    """Atomic nanmin over float32 values; alternate rows are set to NaN."""
    self.check_atomic_nanmin(np.float32, -65535, 65535, np.nan)
1715
+
1716
def test_atomic_nanmin_double(self):
    """Atomic nanmin over float64 values; alternate rows are set to NaN."""
    self.check_atomic_nanmin(np.float64, -65535, 65535, np.nan)
1720
+
1721
def test_atomic_nanmin_double_shared(self):
    """Run atomic_nanmin_double_shared on 32 float64 values, half of them
    NaN, and compare the result with ``np.nanmin``."""
    samples = np.random.randint(0, 32, size=32).astype(np.float64)
    samples[1::2] = np.nan
    # Start above every possible sample (samples are < 32).
    result = np.full(1, 32, dtype=samples.dtype)
    kernel = cuda.jit("void(float64[:], float64[:])")(
        atomic_nanmin_double_shared
    )
    kernel[1, 32](result, samples)

    np.testing.assert_equal(result, np.nanmin(samples))
1731
+
1732
def test_atomic_nanmin_double_oneindex(self):
    """Run atomic_min_double_oneindex on 32 float64 values, half of them
    NaN, and compare the result with ``np.nanmin``."""
    # NOTE(review): this compiles atomic_min_double_oneindex rather than a
    # nanmin-specific kernel - confirm that is intentional.
    samples = np.random.randint(0, 128, size=32).astype(np.float64)
    samples[1::2] = np.nan
    result = np.full(1, 128, dtype=np.float64)
    signature = "void(float64[:], float64[:])"
    kernel = cuda.jit(signature)(atomic_min_double_oneindex)
    kernel[1, 32](result, samples)

    np.testing.assert_equal(result, np.nanmin(samples))
1743
+
1744
+ # Returning old value tests
1745
+
1746
def _test_atomic_nan_returns_old(self, kernel, initial):
    """Launch ``kernel`` on the float32 buffer ``[initial, NaN]`` and check
    the value written to the second element.

    For a non-NaN ``initial`` the second element must equal it. For a NaN
    ``initial`` the first element must no longer be NaN and the second
    element must be NaN.
    """
    buf = np.array([initial, np.nan], dtype=np.float32)
    kernel[1, 1](buf)
    if not np.isnan(initial):
        self.assertEqual(buf[1], initial)
        return
    self.assertFalse(np.isnan(buf[0]))
    self.assertTrue(np.isnan(buf[1]))
1756
+
1757
def test_atomic_nanmax_returns_old_no_replace(self):
    """cuda.atomic.nanmax(x, 0, 1) on an initial 10 must return 10."""

    @cuda.jit
    def nanmax_kernel(buf):
        buf[1] = cuda.atomic.nanmax(buf, 0, 1)

    self._test_atomic_nan_returns_old(kernel=nanmax_kernel, initial=10)
1763
+
1764
def test_atomic_nanmax_returns_old_replace(self):
    """cuda.atomic.nanmax(x, 0, 10) on an initial 1 must return 1."""

    @cuda.jit
    def nanmax_kernel(buf):
        buf[1] = cuda.atomic.nanmax(buf, 0, 10)

    self._test_atomic_nan_returns_old(kernel=nanmax_kernel, initial=1)
1770
+
1771
def test_atomic_nanmax_returns_old_nan_in_array(self):
    """cuda.atomic.nanmax(x, 0, 1) on an initial NaN: the NaN is replaced
    and returned."""

    @cuda.jit
    def nanmax_kernel(buf):
        buf[1] = cuda.atomic.nanmax(buf, 0, 1)

    self._test_atomic_nan_returns_old(kernel=nanmax_kernel, initial=np.nan)
1777
+
1778
def test_atomic_nanmax_returns_old_nan_val(self):
    """cuda.atomic.nanmax(x, 0, NaN) on an initial 10 must return 10."""

    @cuda.jit
    def nanmax_kernel(buf):
        buf[1] = cuda.atomic.nanmax(buf, 0, np.nan)

    self._test_atomic_nan_returns_old(kernel=nanmax_kernel, initial=10)
1784
+
1785
def test_atomic_nanmin_returns_old_no_replace(self):
    """cuda.atomic.nanmin(x, 0, 11) on an initial 10 must return 10."""

    @cuda.jit
    def nanmin_kernel(buf):
        buf[1] = cuda.atomic.nanmin(buf, 0, 11)

    self._test_atomic_nan_returns_old(kernel=nanmin_kernel, initial=10)
1791
+
1792
def test_atomic_nanmin_returns_old_replace(self):
    """cuda.atomic.nanmin(x, 0, 10) on an initial 11 must return 11."""

    @cuda.jit
    def nanmin_kernel(buf):
        buf[1] = cuda.atomic.nanmin(buf, 0, 10)

    self._test_atomic_nan_returns_old(kernel=nanmin_kernel, initial=11)
1798
+
1799
def test_atomic_nanmin_returns_old_nan_in_array(self):
    """cuda.atomic.nanmin(x, 0, 11) on an initial NaN: the NaN is replaced
    and returned."""

    @cuda.jit
    def nanmin_kernel(buf):
        buf[1] = cuda.atomic.nanmin(buf, 0, 11)

    self._test_atomic_nan_returns_old(kernel=nanmin_kernel, initial=np.nan)
1805
+
1806
def test_atomic_nanmin_returns_old_nan_val(self):
    """cuda.atomic.nanmin(x, 0, NaN) on an initial 11 must return 11."""

    @cuda.jit
    def nanmin_kernel(buf):
        buf[1] = cuda.atomic.nanmin(buf, 0, np.nan)

    self._test_atomic_nan_returns_old(kernel=nanmin_kernel, initial=11)
1812
+
1813
+
1814
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()