numba-cuda 0.21.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (488)
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +577 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +556 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +951 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3222 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +558 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +995 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +903 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +158 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsic_wrapper.py +41 -0
  161. numba_cuda/numba/cuda/intrinsics.py +382 -0
  162. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  163. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  164. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  165. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  166. numba_cuda/numba/cuda/libdevice.py +3386 -0
  167. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  168. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  169. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  170. numba_cuda/numba/cuda/locks.py +19 -0
  171. numba_cuda/numba/cuda/lowering.py +1951 -0
  172. numba_cuda/numba/cuda/mathimpl.py +374 -0
  173. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  175. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  178. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  179. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  180. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  181. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  182. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  183. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  184. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  185. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  186. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  187. numba_cuda/numba/cuda/misc/literal.py +28 -0
  188. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  189. numba_cuda/numba/cuda/misc/special.py +94 -0
  190. numba_cuda/numba/cuda/models.py +56 -0
  191. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  192. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  193. numba_cuda/numba/cuda/np/extensions.py +11 -0
  194. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  195. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  196. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  197. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  198. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  199. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  200. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  201. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  202. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  203. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  204. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  206. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  207. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  208. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  209. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  210. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  211. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  212. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  213. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  214. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  215. numba_cuda/numba/cuda/printimpl.py +126 -0
  216. numba_cuda/numba/cuda/random.py +308 -0
  217. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  218. numba_cuda/numba/cuda/serialize.py +267 -0
  219. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  220. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  221. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  222. numba_cuda/numba/cuda/simulator/api.py +179 -0
  223. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  224. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  236. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  237. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  238. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  239. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  241. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  242. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  243. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  244. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  245. numba_cuda/numba/cuda/simulator_init.py +18 -0
  246. numba_cuda/numba/cuda/stubs.py +635 -0
  247. numba_cuda/numba/cuda/target.py +505 -0
  248. numba_cuda/numba/cuda/testing.py +347 -0
  249. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  251. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  252. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  253. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  254. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  255. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +187 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +198 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  285. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  286. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  289. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  290. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  291. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  292. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  293. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  294. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  295. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +889 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  396. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +331 -0
  397. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  399. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  400. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  401. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  402. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  403. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  404. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  406. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  407. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  424. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  425. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +391 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  430. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  431. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  433. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  434. numba_cuda/numba/cuda/tests/support.py +900 -0
  435. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  436. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  437. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  438. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  439. numba_cuda/numba/cuda/types/__init__.py +233 -0
  440. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  441. numba_cuda/numba/cuda/types/abstract.py +9 -0
  442. numba_cuda/numba/cuda/types/common.py +9 -0
  443. numba_cuda/numba/cuda/types/containers.py +9 -0
  444. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  445. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  446. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  447. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  448. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  449. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  450. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  451. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  452. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  453. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  454. numba_cuda/numba/cuda/types/function_type.py +11 -0
  455. numba_cuda/numba/cuda/types/functions.py +9 -0
  456. numba_cuda/numba/cuda/types/iterators.py +9 -0
  457. numba_cuda/numba/cuda/types/misc.py +9 -0
  458. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  459. numba_cuda/numba/cuda/types/scalars.py +9 -0
  460. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  461. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  462. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  463. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  464. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  465. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  466. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  467. numba_cuda/numba/cuda/typing/collections.py +138 -0
  468. numba_cuda/numba/cuda/typing/context.py +782 -0
  469. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  470. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  471. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  472. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  473. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  474. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  475. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  476. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  477. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  478. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  479. numba_cuda/numba/cuda/ufuncs.py +746 -0
  480. numba_cuda/numba/cuda/utils.py +724 -0
  481. numba_cuda/numba/cuda/vector_types.py +214 -0
  482. numba_cuda/numba/cuda/vectorizers.py +260 -0
  483. numba_cuda-0.21.1.dist-info/METADATA +109 -0
  484. numba_cuda-0.21.1.dist-info/RECORD +488 -0
  485. numba_cuda-0.21.1.dist-info/WHEEL +5 -0
  486. numba_cuda-0.21.1.dist-info/licenses/LICENSE +26 -0
  487. numba_cuda-0.21.1.dist-info/licenses/LICENSE.numba +24 -0
  488. numba_cuda-0.21.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,820 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ from numba.cuda.cudadrv.driver import CudaAPIError
5
+ import numpy as np
6
+ import threading
7
+
8
+ from numba.cuda.types import (
9
+ boolean,
10
+ float32,
11
+ float64,
12
+ int32,
13
+ int64,
14
+ uint32,
15
+ void,
16
+ )
17
+ from numba import cuda
18
+ from numba.cuda import config, types
19
+ from numba.cuda.core.errors import TypingError
20
+ from numba.cuda.testing import (
21
+ cc_X_or_above,
22
+ skip_on_cudasim,
23
+ unittest,
24
+ CUDATestCase,
25
+ )
26
+ import math
27
+
28
+
29
def add(x, y):
    """Return the sum of ``x`` and ``y``.

    Used both as the host-side reference for expected results and as the
    Python function compiled into a device function by the tests.
    """
    total = x + y
    return total
31
+
32
+
33
def add_kernel(r, x, y):
    """Write the sum of ``x`` and ``y`` into the first element of ``r``."""
    total = x + y
    r[0] = total
35
+
36
+
37
@skip_on_cudasim("Specialization not implemented in the simulator")
class TestDispatcherSpecialization(CUDATestCase):
    """Exercise ``specialize()`` on CUDA dispatchers and its caching."""

    def _test_no_double_specialize(self, dispatcher, ty):
        """Assert that ``dispatcher.specialize(ty)`` raises RuntimeError
        with the "already specialized" message."""
        with self.assertRaises(RuntimeError) as raised:
            dispatcher.specialize(ty)

        error_text = str(raised.exception)
        self.assertIn("Dispatcher already specialized", error_text)

    def test_no_double_specialize_sig_same_types(self):
        # A kernel jitted with an explicit signature must reject
        # specialization, even when asked for the very types it was
        # compiled for.
        @cuda.jit("void(float32[::1])")
        def kernel(x):
            pass

        self._test_no_double_specialize(kernel, float32[::1])

    def test_no_double_specialize_no_sig_same_types(self):
        # A dispatcher obtained from specialize() must reject further
        # specialization, even for the types it is already specialized for.
        @cuda.jit
        def kernel(x):
            pass

        specialized = kernel.specialize(float32[::1])
        self._test_no_double_specialize(specialized, float32[::1])

    def test_no_double_specialize_sig_diff_types(self):
        # A kernel jitted with an explicit signature must reject
        # specialization for a different type.
        @cuda.jit("void(int32[::1])")
        def kernel(x):
            pass

        self._test_no_double_specialize(kernel, float32[::1])

    def test_no_double_specialize_no_sig_diff_types(self):
        # A dispatcher obtained from specialize() must reject specialization
        # for a different type.
        @cuda.jit
        def kernel(x):
            pass

        specialized = kernel.specialize(int32[::1])
        self._test_no_double_specialize(specialized, float32[::1])

    def test_specialize_cache_same(self):
        # Specializing twice for the same argument types must hand back the
        # same dispatcher; different argument types must yield different
        # dispatchers.
        @cuda.jit
        def kernel(x):
            pass

        self.assertEqual(len(kernel.specializations), 0)

        first_f32 = kernel.specialize(float32[::1])
        self.assertEqual(len(kernel.specializations), 1)

        second_f32 = kernel.specialize(float32[::1])
        self.assertEqual(len(kernel.specializations), 1)
        self.assertIs(first_f32, second_f32)

        only_i32 = kernel.specialize(int32[::1])
        self.assertEqual(len(kernel.specializations), 2)
        self.assertIsNot(only_i32, first_f32)

    def test_specialize_cache_same_with_ordering(self):
        # Same property as test_specialize_cache_same, but the cache key
        # must also account for array ordering and for multiple arguments.
        @cuda.jit
        def kernel(x, y):
            pass

        any_order = float32[:]
        c_order = float32[::1]

        self.assertEqual(len(kernel.specializations), 0)

        # 'A' order specialization
        spec_any = kernel.specialize(any_order, any_order)
        self.assertEqual(len(kernel.specializations), 1)

        # 'C' order specialization
        spec_c = kernel.specialize(c_order, c_order)
        self.assertEqual(len(kernel.specializations), 2)
        self.assertIsNot(spec_any, spec_c)

        # Requesting 'C' order again must reuse the cached specialization
        spec_c_again = kernel.specialize(c_order, c_order)
        self.assertEqual(len(kernel.specializations), 2)
        self.assertIs(spec_c, spec_c_again)
126
+
127
+
128
class TestDispatcher(CUDATestCase):
    """Most tests based on those in numba.tests.test_dispatcher."""

    def test_coerce_input_types(self):
        # Do not allow unsafe conversions if we can still compile other
        # specializations.
        c_add = cuda.jit(add_kernel)

        # Using a complex128 allows us to represent any result produced by the
        # test
        r = np.zeros(1, dtype=np.complex128)

        c_add[1, 1](r, 123, 456)
        self.assertEqual(r[0], add(123, 456))

        c_add[1, 1](r, 12.3, 45.6)
        self.assertEqual(r[0], add(12.3, 45.6))

        c_add[1, 1](r, 12.3, 45.6j)
        self.assertEqual(r[0], add(12.3, 45.6j))

        c_add[1, 1](r, 12300000000, 456)
        self.assertEqual(r[0], add(12300000000, 456))

        # Now force compilation of only a single specialization
        c_add = cuda.jit("(i4[::1], i4, i4)")(add_kernel)
        r = np.zeros(1, dtype=np.int32)

        c_add[1, 1](r, 123, 456)
        self.assertPreciseEqual(r[0], add(123, 456))

    @skip_on_cudasim("Simulator ignores signature")
    @unittest.expectedFailure
    def test_coerce_input_types_unsafe(self):
        # Implicit (unsafe) conversion of float to int, originally from
        # test_coerce_input_types. This test presently fails with the CUDA
        # Dispatcher because argument preparation is done by
        # _Kernel._prepare_args, which is currently inflexible with respect to
        # the types it can accept when preparing.
        #
        # This test is marked as xfail until future changes enable this
        # behavior.
        c_add = cuda.jit("(i4[::1], i4, i4)")(add_kernel)
        r = np.zeros(1, dtype=np.int32)

        c_add[1, 1](r, 12.3, 45.6)
        self.assertPreciseEqual(r[0], add(12, 45))

    @skip_on_cudasim("Simulator ignores signature")
    def test_coerce_input_types_unsafe_complex(self):
        # Implicit conversion of complex to int disallowed
        c_add = cuda.jit("(i4[::1], i4, i4)")(add_kernel)
        r = np.zeros(1, dtype=np.int32)

        with self.assertRaises(TypeError):
            c_add[1, 1](r, 12.3, 45.6j)

    @skip_on_cudasim("Simulator does not track overloads")
    def test_ambiguous_new_version(self):
        """Test compiling new version in an ambiguous case"""
        c_add = cuda.jit(add_kernel)

        r = np.zeros(1, dtype=np.float64)
        INT = 1
        FLT = 1.5

        c_add[1, 1](r, INT, FLT)
        self.assertAlmostEqual(r[0], INT + FLT)
        self.assertEqual(len(c_add.overloads), 1)

        c_add[1, 1](r, FLT, INT)
        self.assertAlmostEqual(r[0], FLT + INT)
        self.assertEqual(len(c_add.overloads), 2)

        c_add[1, 1](r, FLT, FLT)
        self.assertAlmostEqual(r[0], FLT + FLT)
        self.assertEqual(len(c_add.overloads), 3)

        # The following call is ambiguous because (int, int) can resolve
        # to (float, int) or (int, float) with equal weight.
        c_add[1, 1](r, 1, 1)
        self.assertAlmostEqual(r[0], INT + INT)
        self.assertEqual(
            len(c_add.overloads), 4, "didn't compile a new version"
        )

    @skip_on_cudasim("Simulator doesn't support concurrent kernels")
    def test_lock(self):
        """
        Test that (lazy) compiling from several threads at once doesn't
        produce errors (see issue #908).
        """
        errors = []

        @cuda.jit
        def foo(r, x):
            r[0] = x + 1

        def wrapper():
            # Any failure (including a failed assertion) is recorded rather
            # than raised, so that it can be reported from the main thread.
            try:
                r = np.zeros(1, dtype=np.int64)
                foo[1, 1](r, 1)
                self.assertEqual(r[0], 2)
            except Exception as e:
                errors.append(e)

        threads = [threading.Thread(target=wrapper) for _ in range(16)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        self.assertFalse(errors)

    def _test_explicit_signatures(self, sigs):
        """Check dispatch of ``add_kernel`` compiled with ``sigs``.

        ``sigs`` is expected to describe exactly two overloads: one taking
        int64 arguments and one taking float64 arguments. Exact matches must
        be dispatched correctly, and (outside the simulator) complex
        arguments must be rejected without compiling a new overload.
        """
        f = cuda.jit(sigs)(add_kernel)

        # Exact signature matches
        r = np.zeros(1, dtype=np.int64)
        f[1, 1](r, 1, 2)
        self.assertPreciseEqual(r[0], 3)

        r = np.zeros(1, dtype=np.float64)
        f[1, 1](r, 1.5, 2.5)
        self.assertPreciseEqual(r[0], 4.0)

        if config.ENABLE_CUDASIM:
            # Pass - we can't check for no conversion on the simulator.
            return

        # No conversion
        with self.assertRaises(TypeError) as cm:
            r = np.zeros(1, dtype=np.complex128)
            f[1, 1](r, 1j, 1j)
        self.assertIn("No matching definition", str(cm.exception))
        self.assertEqual(len(f.overloads), 2, f.overloads)

    def test_explicit_signatures_strings(self):
        # Check with a list of strings for signatures
        sigs = [
            "(int64[::1], int64, int64)",
            "(float64[::1], float64, float64)",
        ]
        self._test_explicit_signatures(sigs)

    def test_explicit_signatures_tuples(self):
        # Check with a list of tuples of argument types for signatures
        sigs = [(int64[::1], int64, int64), (float64[::1], float64, float64)]
        self._test_explicit_signatures(sigs)

    def test_explicit_signatures_signatures(self):
        # Check with a list of Signature objects for signatures
        sigs = [
            void(int64[::1], int64, int64),
            void(float64[::1], float64, float64),
        ]
        self._test_explicit_signatures(sigs)

    def test_explicit_signatures_mixed(self):
        # Check when we mix types of signature objects in a list of signatures

        # Tuple and string
        sigs = [(int64[::1], int64, int64), "(float64[::1], float64, float64)"]
        self._test_explicit_signatures(sigs)

        # Tuple and Signature object
        sigs = [
            (int64[::1], int64, int64),
            void(float64[::1], float64, float64),
        ]
        self._test_explicit_signatures(sigs)

        # Signature object and string
        sigs = [
            void(int64[::1], int64, int64),
            "(float64[::1], float64, float64)",
        ]
        self._test_explicit_signatures(sigs)

    def test_explicit_signatures_same_type_class(self):
        # A more interesting one...
        # (Note that the type of r is deliberately float64 in both cases so
        # that dispatch is differentiated on the types of x and y only, to
        # closely preserve the intent of the original test from
        # numba.tests.test_dispatcher)
        sigs = [
            "(float64[::1], float32, float32)",
            "(float64[::1], float64, float64)",
        ]
        f = cuda.jit(sigs)(add_kernel)

        # With float32 arguments the addend 2**-25 is lost to rounding
        r = np.zeros(1, dtype=np.float64)
        f[1, 1](r, np.float32(1), np.float32(2**-25))
        self.assertPreciseEqual(r[0], 1.0)

        # With float64 arguments the addend is preserved
        r = np.zeros(1, dtype=np.float64)
        f[1, 1](r, 1, 2**-25)
        self.assertPreciseEqual(r[0], 1.0000000298023224)

    @skip_on_cudasim("No overload resolution in the simulator")
    def test_explicit_signatures_ambiguous_resolution(self):
        # Fail to resolve ambiguity between the two best overloads
        # (Also deliberate float64[::1] for the first argument in all cases)
        f = cuda.jit(
            [
                "(float64[::1], float32, float64)",
                "(float64[::1], float64, float32)",
                "(float64[::1], int64, int64)",
            ]
        )(add_kernel)
        with self.assertRaises(TypeError) as cm:
            r = np.zeros(1, dtype=np.float64)
            f[1, 1](r, 1.0, 2.0)

        # The two best matches are output in the error message, as well
        # as the actual argument types.
        self.assertRegex(
            str(cm.exception),
            r"Ambiguous overloading for <function add_kernel [^>]*> "
            r"\(Array\(float64, 1, 'C', False, aligned=True\), float64,"
            r" float64\):\n"
            r"\(Array\(float64, 1, 'C', False, aligned=True\), float32,"
            r" float64\) -> none\n"
            r"\(Array\(float64, 1, 'C', False, aligned=True\), float64,"
            r" float32\) -> none",
        )
        # The integer signature is not part of the best matches
        self.assertNotIn("int64", str(cm.exception))

    @skip_on_cudasim("Simulator does not use _prepare_args")
    @unittest.expectedFailure
    def test_explicit_signatures_unsafe(self):
        # These tests are from test_explicit_signatures, but have to be xfail
        # at present because _prepare_args in the CUDA target cannot handle
        # unsafe conversions of arguments.
        f = cuda.jit("(int64[::1], int64, int64)")(add_kernel)
        r = np.zeros(1, dtype=np.int64)

        # Approximate match (unsafe conversion)
        f[1, 1](r, 1.5, 2.5)
        self.assertPreciseEqual(r[0], 3)
        self.assertEqual(len(f.overloads), 1, f.overloads)

        sigs = [
            "(int64[::1], int64, int64)",
            "(float64[::1], float64, float64)",
        ]
        f = cuda.jit(sigs)(add_kernel)
        r = np.zeros(1, dtype=np.float64)
        # Approximate match (int32 -> float64 is a safe conversion)
        f[1, 1](r, np.int32(1), 2.5)
        self.assertPreciseEqual(r[0], 3.5)

    def add_device_usecase(self, sigs):
        """Return a kernel ``f(r, x, y)`` that stores ``add(x, y)`` in
        ``r[0]``, where ``add`` is compiled as a device function with the
        given signatures ``sigs``."""
        # Generate a kernel that calls the add device function compiled with a
        # given set of signatures
        add_device = cuda.jit(sigs, device=True)(add)

        @cuda.jit
        def f(r, x, y):
            r[0] = add_device(x, y)

        return f

    def test_explicit_signatures_device(self):
        # Tests similar to test_explicit_signatures, but on a device function
        # instead of a kernel
        sigs = ["(int64, int64)", "(float64, float64)"]
        f = self.add_device_usecase(sigs)

        # Exact signature matches
        r = np.zeros(1, dtype=np.int64)
        f[1, 1](r, 1, 2)
        self.assertPreciseEqual(r[0], 3)

        r = np.zeros(1, dtype=np.float64)
        f[1, 1](r, 1.5, 2.5)
        self.assertPreciseEqual(r[0], 4.0)

        if config.ENABLE_CUDASIM:
            # Pass - we can't check for no conversion on the simulator.
            return

        # No conversion
        with self.assertRaises(TypingError) as cm:
            r = np.zeros(1, dtype=np.complex128)
            f[1, 1](r, 1j, 1j)

        msg = str(cm.exception)
        self.assertIn("Invalid use of type", msg)
        self.assertIn("with parameters (complex128, complex128)", msg)
        self.assertEqual(len(f.overloads), 2, f.overloads)

    def test_explicit_signatures_device_same_type_class(self):
        # A more interesting one...
        # (Note that the type of r is deliberately float64 in both cases so
        # that dispatch is differentiated on the types of x and y only, to
        # closely preserve the intent of the original test from
        # numba.tests.test_dispatcher)
        sigs = ["(float32, float32)", "(float64, float64)"]
        f = self.add_device_usecase(sigs)

        r = np.zeros(1, dtype=np.float64)
        f[1, 1](r, np.float32(1), np.float32(2**-25))
        self.assertPreciseEqual(r[0], 1.0)

        r = np.zeros(1, dtype=np.float64)
        f[1, 1](r, 1, 2**-25)
        self.assertPreciseEqual(r[0], 1.0000000298023224)

    def test_explicit_signatures_device_ambiguous(self):
        # Ambiguity between the two best overloads resolves. This is somewhat
        # surprising given that ambiguity is not permitted for dispatching
        # overloads when launching a kernel, but seems to be the general
        # behaviour of Numba (See Issue #8307:
        # https://github.com/numba/numba/issues/8307).
        sigs = ["(float32, float64)", "(float64, float32)", "(int64, int64)"]
        f = self.add_device_usecase(sigs)

        r = np.zeros(1, dtype=np.float64)
        f[1, 1](r, 1.5, 2.5)
        self.assertPreciseEqual(r[0], 4.0)

    @skip_on_cudasim("CUDA Simulator does not force casting")
    def test_explicit_signatures_device_unsafe(self):
        # These tests are from test_explicit_signatures. The device function
        # variant of these tests can succeed on CUDA because the compilation
        # can handle unsafe casting (c.f. test_explicit_signatures_unsafe which
        # has to xfail due to _prepare_args not supporting unsafe casting).
        sigs = ["(int64, int64)"]
        f = self.add_device_usecase(sigs)

        # Approximate match (unsafe conversion)
        r = np.zeros(1, dtype=np.int64)
        f[1, 1](r, 1.5, 2.5)
        self.assertPreciseEqual(r[0], 3)
        self.assertEqual(len(f.overloads), 1, f.overloads)

        sigs = ["(int64, int64)", "(float64, float64)"]
        f = self.add_device_usecase(sigs)

        # Approximate match (int32 -> float64 is a safe conversion)
        r = np.zeros(1, dtype=np.float64)
        f[1, 1](r, np.int32(1), 2.5)
        self.assertPreciseEqual(r[0], 3.5)

    def test_dispatcher_docstring(self):
        # Ensure that CUDA-jitting a function preserves its docstring. See
        # Issue #5902: https://github.com/numba/numba/issues/5902

        @cuda.jit
        def add_kernel(a, b):
            """Add two integers, kernel version"""

        @cuda.jit(device=True)
        def add_device(a, b):
            """Add two integers, device version"""

        self.assertEqual("Add two integers, kernel version", add_kernel.__doc__)
        self.assertEqual("Add two integers, device version", add_device.__doc__)

    @skip_on_cudasim("Cudasim does not have device pointers")
    def test_dispatcher_cpointer_arguments(self):
        # Launch a kernel whose array arguments are passed as raw device
        # pointers (CPointer) rather than as array objects.
        ptr = types.CPointer(types.int32)
        sig = void(ptr, int32, ptr, ptr, uint32)

        @cuda.jit(sig)
        def axpy(r, a, x, y, n):
            i = cuda.grid(1)
            if i < n:
                r[i] = a * x[i] + y[i]

        N = 16
        a = 5
        # The buffers must hold N elements: the kernel accesses indices
        # 0..N-1 through raw pointers, so a shorter buffer would be read and
        # written out of bounds.
        hx = np.arange(N, dtype=np.int32)
        hy = np.arange(N, dtype=np.int32) * 2
        dx = cuda.to_device(hx)
        dy = cuda.to_device(hy)
        dr = cuda.device_array_like(dx)

        # The first entry of the "data" item is passed as the pointer value
        r_ptr = dr.__cuda_array_interface__["data"][0]
        x_ptr = dx.__cuda_array_interface__["data"][0]
        y_ptr = dy.__cuda_array_interface__["data"][0]

        axpy[1, 32](r_ptr, a, x_ptr, y_ptr, N)

        expected = a * hx + hy
        actual = dr.copy_to_host()
        np.testing.assert_equal(expected, actual)
516
+
517
+
518
@skip_on_cudasim("CUDA simulator doesn't implement kernel properties")
class TestDispatcherKernelProperties(CUDATestCase):
    """Tests for the per-kernel resource queries exposed by dispatchers.

    The "unspecialized" tests query a lazily-compiled dispatcher, both for
    one signature and for all signatures at once (a mapping indexed by the
    signature's argument types). The "specialized" tests query a dispatcher
    compiled for a single explicit signature.
    """

    def test_get_regs_per_thread_unspecialized(self):
        # A kernel where the register usage per thread is likely to differ
        # between different specializations
        @cuda.jit
        def pi_sin_array(x, n):
            i = cuda.grid(1)
            if i < n:
                x[i] = 3.14 * math.sin(x[i])

        # Call the kernel with different arguments to create two different
        # definitions within the Dispatcher object
        N = 10
        arr_f32 = np.zeros(N, dtype=np.float32)
        arr_f64 = np.zeros(N, dtype=np.float64)

        pi_sin_array[1, N](arr_f32, N)
        pi_sin_array[1, N](arr_f64, N)

        # Check we get a positive integer for the two different variations
        sig_f32 = void(float32[::1], int64)
        sig_f64 = void(float64[::1], int64)
        regs_per_thread_f32 = pi_sin_array.get_regs_per_thread(sig_f32)
        regs_per_thread_f64 = pi_sin_array.get_regs_per_thread(sig_f64)

        self.assertIsInstance(regs_per_thread_f32, int)
        self.assertIsInstance(regs_per_thread_f64, int)

        self.assertGreater(regs_per_thread_f32, 0)
        self.assertGreater(regs_per_thread_f64, 0)

        # Check that getting the registers per thread for all signatures
        # provides the same values as getting the registers per thread for
        # individual signatures.
        regs_per_thread_all = pi_sin_array.get_regs_per_thread()
        self.assertEqual(regs_per_thread_all[sig_f32.args], regs_per_thread_f32)
        self.assertEqual(regs_per_thread_all[sig_f64.args], regs_per_thread_f64)

        if regs_per_thread_f32 == regs_per_thread_f64:
            # If the register usage is the same for both variants, there may be
            # a bug, but this may also be an artifact of the compiler / driver
            # / device combination, so produce an informational message only.
            print("f32 and f64 variant thread usages are equal.")
            print("This may warrant some investigation. Devices:")
            cuda.detect()

    def test_get_regs_per_thread_specialized(self):
        @cuda.jit(void(float32[::1], int64))
        def pi_sin_array(x, n):
            i = cuda.grid(1)
            if i < n:
                x[i] = 3.14 * math.sin(x[i])

        # Check we get a positive integer for the specialized variation
        regs_per_thread = pi_sin_array.get_regs_per_thread()
        self.assertIsInstance(regs_per_thread, int)
        self.assertGreater(regs_per_thread, 0)

    def test_get_const_mem_unspecialized(self):
        @cuda.jit
        def const_fmt_string(val, to_print):
            # We guard the print with a conditional to prevent noise from the
            # test suite
            if to_print:
                print(val)

        # Call the kernel with different arguments to create two different
        # definitions within the Dispatcher object
        const_fmt_string[1, 1](1, False)
        const_fmt_string[1, 1](1.0, False)

        # Check we get a positive integer for the two different variations
        sig_i64 = void(int64, boolean)
        sig_f64 = void(float64, boolean)
        const_mem_size_i64 = const_fmt_string.get_const_mem_size(sig_i64)
        const_mem_size_f64 = const_fmt_string.get_const_mem_size(sig_f64)

        self.assertIsInstance(const_mem_size_i64, int)
        self.assertIsInstance(const_mem_size_f64, int)

        # 6 bytes for the equivalent of b'%lld\n\0'
        self.assertGreaterEqual(const_mem_size_i64, 6)
        # 4 bytes for the equivalent of b'%f\n\0'
        self.assertGreaterEqual(const_mem_size_f64, 4)

        # Check that getting the const memory size for all signatures
        # provides the same values as getting the const memory size for
        # individual signatures.
        const_mem_size_all = const_fmt_string.get_const_mem_size()
        self.assertEqual(const_mem_size_all[sig_i64.args], const_mem_size_i64)
        self.assertEqual(const_mem_size_all[sig_f64.args], const_mem_size_f64)

    def test_get_const_mem_specialized(self):
        arr = np.arange(32, dtype=np.int64)
        sig = void(int64[::1])

        @cuda.jit(sig)
        def const_array_use(x):
            C = cuda.const.array_like(arr)
            i = cuda.grid(1)
            x[i] = C[i]

        # The constant array must be accounted for in the reported size
        const_mem_size = const_array_use.get_const_mem_size(sig)
        self.assertIsInstance(const_mem_size, int)
        self.assertGreaterEqual(const_mem_size, arr.nbytes)

    def test_get_shared_mem_per_block_unspecialized(self):
        N = 10

        # A kernel where the shared memory per block is likely to differ
        # between different specializations
        @cuda.jit
        def simple_smem(ary):
            sm = cuda.shared.array(N, dtype=ary.dtype)
            for j in range(N):
                sm[j] = j
            for j in range(N):
                ary[j] = sm[j]

        # Call the kernel with different arguments to create two different
        # definitions within the Dispatcher object
        arr_f32 = np.zeros(N, dtype=np.float32)
        arr_f64 = np.zeros(N, dtype=np.float64)

        simple_smem[1, 1](arr_f32)
        simple_smem[1, 1](arr_f64)

        sig_f32 = void(float32[::1])
        sig_f64 = void(float64[::1])

        sh_mem_f32 = simple_smem.get_shared_mem_per_block(sig_f32)
        sh_mem_f64 = simple_smem.get_shared_mem_per_block(sig_f64)

        self.assertIsInstance(sh_mem_f32, int)
        self.assertIsInstance(sh_mem_f64, int)

        # N elements of 4 and 8 bytes respectively
        self.assertEqual(sh_mem_f32, N * 4)
        self.assertEqual(sh_mem_f64, N * 8)

        # Check that getting the shared memory per block for all signatures
        # provides the same values as getting the shared mem per block for
        # individual signatures. A single query returns the values for every
        # signature.
        sh_mem_all = simple_smem.get_shared_mem_per_block()
        self.assertEqual(sh_mem_all[sig_f32.args], sh_mem_f32)
        self.assertEqual(sh_mem_all[sig_f64.args], sh_mem_f64)

    def test_get_shared_mem_per_block_specialized(self):
        @cuda.jit(void(float32[::1]))
        def simple_smem(ary):
            sm = cuda.shared.array(100, dtype=float32)
            i = cuda.grid(1)
            if i == 0:
                for j in range(100):
                    sm[j] = j
            cuda.syncthreads()
            ary[i] = sm[i]

        # 100 float32 elements occupy 400 bytes
        shared_mem_per_block = simple_smem.get_shared_mem_per_block()
        self.assertIsInstance(shared_mem_per_block, int)
        self.assertEqual(shared_mem_per_block, 400)

    def test_get_max_threads_per_block_unspecialized(self):
        N = 10

        @cuda.jit
        def simple_maxthreads(ary):
            i = cuda.grid(1)
            ary[i] = i

        arr_f32 = np.zeros(N, dtype=np.float32)
        simple_maxthreads[1, 1](arr_f32)
        sig_f32 = void(float32[::1])
        max_threads_f32 = simple_maxthreads.get_max_threads_per_block(sig_f32)

        self.assertIsInstance(max_threads_f32, int)
        self.assertGreater(max_threads_f32, 0)

        # The all-signatures query must agree with the per-signature query
        max_threads_f32_all = simple_maxthreads.get_max_threads_per_block()
        self.assertEqual(max_threads_f32_all[sig_f32.args], max_threads_f32)

    def test_get_local_mem_per_thread_unspecialized(self):
        # NOTE: A large amount of local memory must be allocated
        # otherwise the compiler will optimize out the call to
        # cuda.local.array and use local registers instead
        N = 1000

        @cuda.jit
        def simple_lmem(ary):
            lm = cuda.local.array(N, dtype=ary.dtype)
            for j in range(N):
                lm[j] = j
            for j in range(N):
                ary[j] = lm[j]

        # Call the kernel with different arguments to create two different
        # definitions within the Dispatcher object
        arr_f32 = np.zeros(N, dtype=np.float32)
        arr_f64 = np.zeros(N, dtype=np.float64)

        simple_lmem[1, 1](arr_f32)
        simple_lmem[1, 1](arr_f64)

        sig_f32 = void(float32[::1])
        sig_f64 = void(float64[::1])
        local_mem_f32 = simple_lmem.get_local_mem_per_thread(sig_f32)
        local_mem_f64 = simple_lmem.get_local_mem_per_thread(sig_f64)
        self.assertIsInstance(local_mem_f32, int)
        self.assertIsInstance(local_mem_f64, int)

        self.assertGreaterEqual(local_mem_f32, N * 4)
        self.assertGreaterEqual(local_mem_f64, N * 8)

        # Check that getting the local memory per thread for all signatures
        # provides the same values as getting the local memory per thread for
        # individual signatures.
        local_mem_all = simple_lmem.get_local_mem_per_thread()
        self.assertEqual(local_mem_all[sig_f32.args], local_mem_f32)
        self.assertEqual(local_mem_all[sig_f64.args], local_mem_f64)

    def test_get_local_mem_per_thread_specialized(self):
        # NOTE: A large amount of local memory must be allocated
        # otherwise the compiler will optimize out the call to
        # cuda.local.array and use local registers instead
        N = 1000

        @cuda.jit(void(float32[::1]))
        def simple_lmem(ary):
            lm = cuda.local.array(N, dtype=ary.dtype)
            for j in range(N):
                lm[j] = j
            for j in range(N):
                ary[j] = lm[j]

        local_mem_per_thread = simple_lmem.get_local_mem_per_thread()
        self.assertIsInstance(local_mem_per_thread, int)
        self.assertGreaterEqual(local_mem_per_thread, N * 4)
757
+
758
+
759
@skip_on_cudasim("Simulator does not support launch bounds")
class TestLaunchBounds(CUDATestCase):
    """Tests for the ``launch_bounds`` option of ``cuda.jit``."""

    def _test_launch_bounds_common(self, launch_bounds):
        """Compile and launch an empty kernel with ``launch_bounds``.

        ``launch_bounds`` is expected to specify a maximum of 128 threads
        per block. Checks that a launch within the bound succeeds, that a
        launch exceeding it fails, and that the generated PTX carries the
        matching ``.maxntid`` directive.

        Returns the PTX of the compiled kernel so that callers can make
        further assertions on it.
        """

        @cuda.jit(launch_bounds=launch_bounds)
        def f():
            pass

        # Test successful launch
        f[1, 128]()

        # Test launch bound exceeded
        msg = "Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE"
        with self.assertRaisesRegex(CudaAPIError, msg):
            f[1, 256]()

        sig = f.signatures[0]
        ptx = f.inspect_asm(sig)
        # Match either `.maxntid 128, 1, 1` or `.maxntid 128` on a line by
        # itself. The leading dot is escaped so that it only matches a
        # literal "." rather than any character.
        self.assertRegex(ptx, r"\.maxntid\s+128(?:,\s+1,\s+1)?\s*\n")

        return ptx

    def test_launch_bounds_scalar(self):
        # A bare integer only sets the maximum number of threads per block
        launch_bounds = 128
        ptx = self._test_launch_bounds_common(launch_bounds)

        self.assertNotIn(".minnctapersm", ptx)
        self.assertNotIn(".maxclusterrank", ptx)

    def test_launch_bounds_tuple(self):
        # A 1-tuple behaves like the scalar form
        launch_bounds = (128,)
        ptx = self._test_launch_bounds_common(launch_bounds)

        self.assertNotIn(".minnctapersm", ptx)
        self.assertNotIn(".maxclusterrank", ptx)

    def test_launch_bounds_with_min_cta(self):
        # The second element is emitted as the .minnctapersm directive
        launch_bounds = (128, 2)
        ptx = self._test_launch_bounds_common(launch_bounds)

        self.assertRegex(ptx, r"\.minnctapersm\s+2")
        self.assertNotIn(".maxclusterrank", ptx)

    @unittest.skipUnless(
        cc_X_or_above(9, 0), "CC 9.0 needed for max cluster rank"
    )
    def test_launch_bounds_with_max_cluster_rank(self):
        # The third element is emitted as the .maxclusterrank directive
        launch_bounds = (128, 2, 4)
        ptx = self._test_launch_bounds_common(launch_bounds)

        self.assertRegex(ptx, r"\.minnctapersm\s+2")
        self.assertRegex(ptx, r"\.maxclusterrank\s+4")

    def test_too_many_launch_bounds(self):
        # More than three launch bounds must be rejected at jit time
        launch_bounds = (128, 2, 4, 8)
        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
            cuda.jit("void()", launch_bounds=launch_bounds)(lambda: None)
817
+
818
+
819
# Allow this test module to be run directly as a script.
if __name__ == "__main__":
    unittest.main()