numba-cuda 0.22.0__cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of numba-cuda might be problematic. Click here for more details.

Files changed (487)
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +580 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpython-312-aarch64-linux-gnu.so +0 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpython-312-aarch64-linux-gnu.so +0 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cpython-312-aarch64-linux-gnu.so +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpython-312-aarch64-linux-gnu.so +0 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cpython-312-aarch64-linux-gnu.so +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +543 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +983 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +997 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +155 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsics.py +531 -0
  161. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  162. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  163. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  164. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  165. numba_cuda/numba/cuda/libdevice.py +3386 -0
  166. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  167. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  168. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  169. numba_cuda/numba/cuda/locks.py +19 -0
  170. numba_cuda/numba/cuda/lowering.py +1980 -0
  171. numba_cuda/numba/cuda/mathimpl.py +374 -0
  172. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  173. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  175. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  178. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  179. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  180. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  181. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  182. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  183. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  184. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  185. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  186. numba_cuda/numba/cuda/misc/literal.py +28 -0
  187. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  188. numba_cuda/numba/cuda/misc/special.py +94 -0
  189. numba_cuda/numba/cuda/models.py +56 -0
  190. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  191. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  192. numba_cuda/numba/cuda/np/extensions.py +11 -0
  193. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  194. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  195. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  196. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  197. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  198. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  199. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  200. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  201. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  202. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  203. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  204. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  206. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  207. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  208. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  209. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  210. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  211. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  212. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  213. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  214. numba_cuda/numba/cuda/printimpl.py +126 -0
  215. numba_cuda/numba/cuda/random.py +308 -0
  216. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  217. numba_cuda/numba/cuda/serialize.py +267 -0
  218. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  219. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  220. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  221. numba_cuda/numba/cuda/simulator/api.py +179 -0
  222. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  223. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  224. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  236. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  237. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  238. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  239. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  241. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  242. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  243. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  244. numba_cuda/numba/cuda/simulator_init.py +18 -0
  245. numba_cuda/numba/cuda/stubs.py +624 -0
  246. numba_cuda/numba/cuda/target.py +505 -0
  247. numba_cuda/numba/cuda/testing.py +347 -0
  248. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  249. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  251. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  252. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  253. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  254. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  255. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  285. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  286. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  289. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  290. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  291. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  292. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  293. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  294. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  295. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
  396. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  397. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  399. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  400. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  401. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  402. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  403. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  404. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  406. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  407. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  424. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  425. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  430. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  431. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  433. numba_cuda/numba/cuda/tests/support.py +900 -0
  434. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  435. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  436. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  437. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  438. numba_cuda/numba/cuda/types/__init__.py +233 -0
  439. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  440. numba_cuda/numba/cuda/types/abstract.py +9 -0
  441. numba_cuda/numba/cuda/types/common.py +9 -0
  442. numba_cuda/numba/cuda/types/containers.py +9 -0
  443. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  444. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  445. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  446. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  447. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  448. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  449. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  450. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  451. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  452. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  453. numba_cuda/numba/cuda/types/function_type.py +11 -0
  454. numba_cuda/numba/cuda/types/functions.py +9 -0
  455. numba_cuda/numba/cuda/types/iterators.py +9 -0
  456. numba_cuda/numba/cuda/types/misc.py +9 -0
  457. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  458. numba_cuda/numba/cuda/types/scalars.py +9 -0
  459. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  460. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  461. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  462. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  463. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  464. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  465. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  466. numba_cuda/numba/cuda/typing/collections.py +138 -0
  467. numba_cuda/numba/cuda/typing/context.py +782 -0
  468. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  469. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  470. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  471. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  472. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  473. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  474. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  475. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  476. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  477. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  478. numba_cuda/numba/cuda/ufuncs.py +746 -0
  479. numba_cuda/numba/cuda/utils.py +724 -0
  480. numba_cuda/numba/cuda/vector_types.py +214 -0
  481. numba_cuda/numba/cuda/vectorizers.py +260 -0
  482. numba_cuda-0.22.0.dist-info/METADATA +109 -0
  483. numba_cuda-0.22.0.dist-info/RECORD +487 -0
  484. numba_cuda-0.22.0.dist-info/WHEEL +6 -0
  485. numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
  486. numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
  487. numba_cuda-0.22.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1396 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ from llvmlite import ir
5
+ from collections import namedtuple
6
+ from warnings import warn, catch_warnings, simplefilter
7
+ import copy
8
+
9
+ from numba.cuda.core import ir as numba_ir
10
+ from numba.cuda.core import bytecode
11
+ from numba.cuda import types
12
+ from numba.cuda.core.options import ParallelOptions
13
+ from numba.cuda.core.compiler_lock import global_compiler_lock
14
+ from numba.cuda.core.errors import NumbaWarning, NumbaInvalidConfigWarning
15
+ from numba.cuda.core.interpreter import Interpreter
16
+
17
+ from numba.cuda import cgutils, typing, lowering, nvvmutils, utils
18
+ from numba.cuda.api import get_current_device
19
+ from numba.cuda.codegen import ExternalCodeLibrary
20
+
21
+ from numba.cuda.core import (
22
+ inline_closurecall,
23
+ sigutils,
24
+ postproc,
25
+ config,
26
+ funcdesc,
27
+ )
28
+ from numba.cuda.cudadrv import nvvm, nvrtc
29
+ from numba.cuda.cudadrv.linkable_code import LinkableCode
30
+ from numba.cuda.descriptor import cuda_target
31
+ from numba.cuda.flags import CUDAFlags
32
+ from numba.cuda.target import CUDACABICallConv
33
+ from numba.cuda.core.compiler import CompilerBase
34
+ from numba.cuda.core.compiler_machinery import (
35
+ FunctionPass,
36
+ LoweringPass,
37
+ PassManager,
38
+ register_pass,
39
+ )
40
+ from numba.cuda.core.untyped_passes import (
41
+ TranslateByteCode,
42
+ FixupArgs,
43
+ IRProcessing,
44
+ DeadBranchPrune,
45
+ RewriteSemanticConstants,
46
+ InlineClosureLikes,
47
+ GenericRewrites,
48
+ WithLifting,
49
+ InlineInlinables,
50
+ FindLiterallyCalls,
51
+ MakeFunctionToJitFunction,
52
+ LiteralUnroll,
53
+ ReconstructSSA,
54
+ RewriteDynamicRaises,
55
+ LiteralPropagationSubPipelinePass,
56
+ )
57
+ from numba.cuda.core.typed_passes import (
58
+ BaseNativeLowering,
59
+ NativeLowering,
60
+ AnnotateTypes,
61
+ IRLegalization,
62
+ NopythonTypeInference,
63
+ NopythonRewrites,
64
+ InlineOverloads,
65
+ PreLowerStripPhis,
66
+ NoPythonSupportedFeatureValidation,
67
+ )
68
+
69
+
70
# Record of the artefacts handed from the lowering pass to the backend pass.
_LowerResult = namedtuple(
    "_LowerResult",
    ("fndesc", "call_helper", "cfunc", "env"),
)
79
+
80
+
81
def sanitize_compile_result_entries(entries):
    """
    Normalize a dict of compile-result fields in place and return it.

    Keys that are not listed in CR_FIELDS cause a NameError (raised with the
    offending names as its arguments). Fields listed in CR_FIELDS but absent
    from ``entries`` are filled in with None. A non-None "typing_error" is
    replaced by the same exception with its traceback removed.
    """
    known = set(CR_FIELDS)
    supplied = set(entries.keys())

    unknown = supplied - known
    if unknown:
        raise NameError(*unknown)

    for field in known - supplied:
        entries[field] = None

    # Drop the traceback so that it does not keep frame locals alive
    typing_error = entries["typing_error"]
    if typing_error is not None:
        entries["typing_error"] = typing_error.with_traceback(None)
    return entries
95
+
96
+
97
def run_frontend(func, inline_closures=False, emit_dels=False):
    """
    Translate a Python function into canonical Numba IR using only the
    compiler frontend, and return that IR.

    func: the Python function to translate.
    inline_closures: if truthy, the closure inlining pass is run on the IR.
    emit_dels: forwarded to the post-processor; if truthy, ir.Del nodes are
        emitted.
    """
    # XXX make this a dedicated Pipeline?
    identity = bytecode.FunctionIdentity.from_function(func)
    interpreter = Interpreter(identity)
    frontend_ir = interpreter.interpret(bytecode.ByteCode(func_id=identity))

    if inline_closures:
        inline_closurecall.InlineClosureCallPass(
            frontend_ir, ParallelOptions(False), {}, False
        ).run()

    postproc.PostProcessor(frontend_ir).run(emit_dels)
    return frontend_ir
118
+
119
+
120
class DefaultPassBuilder(object):
    """
    Factory for the default pipelines, each returned as a finalized
    PassManager instance. The pipelines defined here are:
      - nopython (untyped + typed + nopython lowering)
      - nopython lowering
      - parfor gufunc nopython lowering
      - typed
      - untyped
    """

    @staticmethod
    def define_nopython_pipeline(state, name="nopython"):
        """Return the complete nopython pipeline as a PassManager."""
        builder = DefaultPassBuilder
        pipeline = PassManager(name)

        # The pipeline is the concatenation of the three stage pipelines,
        # in this order.
        stage_factories = (
            builder.define_untyped_pipeline,
            builder.define_typed_pipeline,
            builder.define_nopython_lowering_pipeline,
        )
        for make_stage in stage_factories:
            pipeline.passes.extend(make_stage(state).passes)

        pipeline.finalize()
        return pipeline

    @staticmethod
    def define_nopython_lowering_pipeline(state, name="nopython_lowering"):
        """Return the legalize / annotate / lower / backend pipeline."""
        pipeline = PassManager(name)
        stages = (
            # legalise
            (
                NoPythonSupportedFeatureValidation,
                "ensure features that are in use are in a valid form",
            ),
            (IRLegalization, "ensure IR is legal prior to lowering"),
            # annotation happens only once the IR has been legalized
            (AnnotateTypes, "annotate types"),
            # lower
            (NativeLowering, "native lowering"),
            (CUDABackend, "nopython mode backend"),
        )
        for pass_class, description in stages:
            pipeline.add_pass(pass_class, description)
        pipeline.finalize()
        return pipeline

    @staticmethod
    def define_parfor_gufunc_nopython_lowering_pipeline(
        state, name="parfor_gufunc_nopython_lowering"
    ):
        """
        Return the lowering pipeline for parfor gufuncs. It registers the
        same passes as the nopython lowering pipeline, under a different
        default name.
        """
        pipeline = PassManager(name)
        stages = (
            # legalise
            (
                NoPythonSupportedFeatureValidation,
                "ensure features that are in use are in a valid form",
            ),
            (IRLegalization, "ensure IR is legal prior to lowering"),
            # annotation happens only once the IR has been legalized
            (AnnotateTypes, "annotate types"),
            # lower
            (NativeLowering, "native lowering"),
            (CUDABackend, "nopython mode backend"),
        )
        for pass_class, description in stages:
            pipeline.add_pass(pass_class, description)
        pipeline.finalize()
        return pipeline

    @staticmethod
    def define_typed_pipeline(state, name="typed"):
        """Return the typed part of the nopython pipeline."""
        pipeline = PassManager(name)
        register = pipeline.add_pass

        # typing
        register(NopythonTypeInference, "nopython frontend")
        # strip phis
        register(PreLowerStripPhis, "remove phis nodes")
        # optimisation
        register(InlineOverloads, "inline overloaded functions")
        if not state.flags.no_rewrites:
            register(NopythonRewrites, "nopython rewrites")

        pipeline.finalize()
        return pipeline

    @staticmethod
    def define_untyped_pipeline(state, name="untyped"):
        """Return the untyped part of the nopython pipeline."""
        pipeline = PassManager(name)
        register = pipeline.add_pass
        rewrites_enabled = not state.flags.no_rewrites

        # Bytecode translation is only needed when no IR was supplied
        if state.func_ir is None:
            register(TranslateByteCode, "analyzing bytecode")
            register(FixupArgs, "fix up args")
        register(IRProcessing, "processing IR")
        register(WithLifting, "Handle with contexts")

        # Closures are inlined early in case they use nonlocals,
        # see issue #6585.
        register(InlineClosureLikes, "inline calls to locally defined closures")

        # pre typing
        if rewrites_enabled:
            register(RewriteSemanticConstants, "rewrite semantic constants")
            register(DeadBranchPrune, "dead branch pruning")
            register(GenericRewrites, "nopython rewrites")

        register(RewriteDynamicRaises, "rewrite dynamic raises")

        # Any closures that remain are turned into functions
        register(
            MakeFunctionToJitFunction,
            "convert make_function into JIT functions",
        )

        # Inlinable functions are inlined only after closures have been
        # inlined, because the IR repr of a closure masks call sites when an
        # inlinable is called inside a closure. Branch pruning is then rerun.
        register(InlineInlinables, "inline inlinable functions")
        if rewrites_enabled:
            register(DeadBranchPrune, "dead branch pruning")

        register(FindLiterallyCalls, "find literally calls")
        register(LiteralUnroll, "handles literal_unroll")

        if state.flags.enable_ssa:
            register(ReconstructSSA, "ssa")

        if rewrites_enabled:
            register(DeadBranchPrune, "dead branch pruning")

        register(LiteralPropagationSubPipelinePass, "Literal propagation")

        pipeline.finalize()
        return pipeline
254
+
255
+
256
+ # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
257
+ # id. This is because the entry point is used as a key into a dict of
258
+ # overloads by the base dispatcher. The id of the CCR is the only small and
259
+ # unique property of a CUDACompileResult in the CUDA target (cf. the CPU target,
260
+ # which uses its entry_point, which is a pointer value).
261
+ #
262
+ # This does feel a little hackish, and there are two ways in which this could
263
+ # be improved:
264
+ #
265
+ # 1. We could change the CUDACompileResult so that each instance has its own
266
+ # unique ID that can be used as a key - e.g. a count, similar to the way in
267
+ # which types have unique counts.
268
+ # 2. At some future time when kernel launch uses a compiled function, the entry
269
+ # point will no longer need to be a synthetic value, but will instead be a
270
+ # pointer to the compiled function as in the CPU target.
271
+
272
# Field names of a compile result, in namedtuple order.
CR_FIELDS = [
    # contexts
    "typing_context",
    "target_context",
    # outcome of compilation
    "entry_point",
    "typing_error",
    "type_annotation",
    "signature",
    "objectmode",
    "lifted",
    # generated code and its supporting objects
    "fndesc",
    "library",
    "call_helper",
    "environment",
    "metadata",
    # functions to call to initialize on unserialization (i.e. cache load)
    "reload_init",
    "referenced_envs",
]
291
+
292
+
293
class CUDACompileResult(namedtuple("_CompileResult", CR_FIELDS)):
    """
    Record of everything produced by compiling one function; its fields are
    those named in CR_FIELDS.
    """

    __slots__ = ()

    @property
    def entry_point(self):
        # Takes the place of the namedtuple field of the same name: the
        # value stored at construction time is not what is returned here.
        return id(self)

    def _reduce(self):
        """
        Break this result down into a tuple of picklable components, in the
        order expected by ``_rebuild`` (after its ``target_context``).
        """
        serialized_library = self.library.serialize_using_object_code()
        # A string (un)pickles efficiently
        annotation_text = str(self.type_annotation)
        # The typing maps don't need to be pickled and may fail. Note that
        # this clears them on the descriptor held by this result.
        descriptor = self.fndesc
        descriptor.typemap = None
        descriptor.calltypes = None
        return (
            serialized_library,
            self.fndesc,
            self.environment,
            self.signature,
            self.objectmode,
            self.lifted,
            annotation_text,
            self.reload_init,
            # The CUDA target does not reference environments
            (),
        )

    @classmethod
    def _rebuild(
        cls,
        target_context,
        libdata,
        fndesc,
        env,
        signature,
        objectmode,
        lifted,
        typeann,
        reload_init,
        referenced_envs,
    ):
        """
        Reconstruct a compile result from the components produced by
        ``_reduce`` plus the target context to attach it to.
        """
        # Re-run every registered initializer (none if reload_init is falsy)
        for initializer in reload_init or ():
            initializer()

        library = target_context.codegen().unserialize_library(libdata)
        executable = target_context.get_executable(library, fndesc, env)
        fields = dict(
            target_context=target_context,
            typing_context=target_context.typing_context,
            library=library,
            environment=env,
            entry_point=executable,
            fndesc=fndesc,
            type_annotation=typeann,
            signature=signature,
            objectmode=objectmode,
            lifted=lifted,
            typing_error=None,
            call_helper=None,
            metadata=None,  # Do not store, arbitrary & potentially large!
            reload_init=reload_init,
            referenced_envs=referenced_envs,
        )
        rebuilt = cls(**fields)

        # Load Environments
        for referenced in referenced_envs:
            library.codegen.set_env(referenced.env_name, referenced)

        return rebuilt

    @property
    def codegen(self):
        """The codegen object of this result's target context."""
        context = self.target_context
        return context.codegen()

    def dump(self, tab=""):
        """Print a short description of this result and its signature."""
        header = f"{tab}DUMP {type(self).__name__} {self.entry_point}"
        print(header)
        self.signature.dump(tab=tab + " ")
        footer = f"{tab}END DUMP"
        print(footer)
381
+
382
+
383
def cuda_compile_result(**entries):
    """
    Build a CUDACompileResult from keyword arguments, after passing them
    through sanitize_compile_result_entries.
    """
    return CUDACompileResult(**sanitize_compile_result_entries(entries))
386
+
387
+
388
@register_pass(mutates_CFG=True, analysis_only=False)
class CUDABackend(LoweringPass):
    """Pass that wraps the output of lowering in a CUDACompileResult."""

    _name = "cuda_backend"

    def __init__(self):
        LoweringPass.__init__(self)

    def run_pass(self, state):
        """
        Back-end: replace ``state.cr`` (the lowering output) with a compile
        result assembled from it and from the rest of the pipeline state.
        Always returns True.
        """
        lowering_output = state["cr"]
        func_signature = typing.signature(state.return_type, *state.args)

        result_fields = dict(
            typing_context=state.typingctx,
            target_context=state.targetctx,
            typing_error=state.status.fail_reason,
            type_annotation=state.type_annotation,
            library=state.library,
            call_helper=lowering_output.call_helper,
            signature=func_signature,
            fndesc=lowering_output.fndesc,
        )
        state.cr = cuda_compile_result(**result_fields)
        return True
413
+
414
+
415
@register_pass(mutates_CFG=False, analysis_only=False)
class CreateLibrary(LoweringPass):
    """
    Create the code library that the NativeLowering pass will populate.
    NativeLowering would create a library itself if none existed, but the
    library needs to be set up here with the nvvm_options (and the other
    library-related settings) carried by the flags.
    """

    _name = "create_library"

    def __init__(self):
        LoweringPass.__init__(self)

    def run_pass(self, state):
        """Create ``state.library`` from the flags; always returns True."""
        flags = state.flags
        code_generator = state.targetctx.codegen()
        library_name = state.func_id.func_qualname
        state.library = code_generator.create_library(
            library_name,
            nvvm_options=flags.nvvm_options,
            max_registers=flags.max_registers,
            lto=flags.lto,
        )
        # Object caching is switched on upfront so that the library can be
        # serialized.
        state.library.enable_object_caching()

        return True
444
+
445
+
446
@register_pass(mutates_CFG=True, analysis_only=False)
class CUDANativeLowering(BaseNativeLowering):
    """
    Native lowering pass for CUDA, for function IR expressed purely with
    Numba's standard `numba.cuda.core.ir` nodes.
    """

    _name = "cuda_native_lowering"

    @property
    def lowering_class(self):
        # The lowering implementation used by this pass
        return lowering.CUDALower
456
+
457
+
458
class CUDABytecodeInterpreter(Interpreter):
    # Follows the superclass implementation, except that the variable holding
    # the ``bool`` global is named "$bool<N>" rather than "bool<N>" - see
    # Numba PR #9888:
    # https://github.com/numba/numba/pull/9888
    #
    # Once that PR is available in an upstream Numba release this override
    # can be removed.
    def _op_JUMP_IF(self, inst, pred, iftrue):
        # The jump target is taken when the predicate matches ``iftrue``;
        # otherwise control falls through to the next instruction.
        targets = {True: inst.get_jump_target(), False: inst.next}
        taken = targets[iftrue]
        not_taken = targets[not iftrue]

        # Store the ``bool`` builtin in a temporary ...
        bool_name = "$bool%s" % (inst.offset)
        self.store(
            value=numba_ir.Global("bool", bool, loc=self.loc), name=bool_name
        )

        # ... call it on the predicate ...
        bool_call = numba_ir.Expr.call(
            self.get(bool_name), (self.get(pred),), (), loc=self.loc
        )

        # ... and branch on the stored result.
        pred_name = "$%spred" % (inst.offset)
        condition = self.store(value=bool_call, name=pred_name)
        self.current_block.append(
            numba_ir.Branch(
                cond=condition, truebr=taken, falsebr=not_taken, loc=self.loc
            )
        )
487
+
488
+
489
@register_pass(mutates_CFG=True, analysis_only=False)
class CUDATranslateBytecode(FunctionPass):
    """Pass that turns bytecode into IR with CUDABytecodeInterpreter."""

    _name = "cuda_translate_bytecode"

    def __init__(self):
        FunctionPass.__init__(self)

    def run_pass(self, state):
        """Interpret ``state["bc"]`` into ``state["func_ir"]``; returns True."""
        function_identity = state["func_id"]
        code = state["bc"]
        interpreter = CUDABytecodeInterpreter(function_identity)
        state["func_ir"] = interpreter.interpret(code)
        return True
503
+
504
+
505
class CUDACompiler(CompilerBase):
    """Compiler whose single pipeline targets CUDA."""

    def define_pipelines(self):
        """Return a one-element list holding the finalized "cuda" pipeline."""
        builder = DefaultPassBuilder
        pipeline = PassManager("cuda")

        # Instead of duplicating the definition of the untyped passes, the
        # default pass list is reused with the TranslateByteCode pass swapped
        # for the CUDA-specific one.
        default_untyped = builder.define_untyped_pipeline(self.state)
        untyped_for_cuda = [
            (
                CUDATranslateBytecode
                if pass_class is TranslateByteCode
                else pass_class,
                description,
            )
            for pass_class, description in default_untyped.passes
        ]
        pipeline.passes.extend(untyped_for_cuda)

        pipeline.passes.extend(builder.define_typed_pipeline(self.state).passes)
        pipeline.passes.extend(
            self.define_cuda_lowering_pipeline(self.state).passes
        )

        pipeline.finalize()
        return [pipeline]

    def define_cuda_lowering_pipeline(self, state):
        """Return the finalized CUDA lowering pipeline."""
        pipeline = PassManager("cuda_lowering")
        stages = (
            # legalise
            (IRLegalization, "ensure IR is legal prior to lowering"),
            (AnnotateTypes, "annotate types"),
            # lower
            (CreateLibrary, "create library"),
            (CUDANativeLowering, "cuda native lowering"),
            (CUDABackend, "cuda backend"),
        )
        for pass_class, description in stages:
            pipeline.add_pass(pass_class, description)

        pipeline.finalize()
        return pipeline
551
+
552
+
553
def compile_extra(
    typingctx,
    targetctx,
    func,
    args,
    return_type,
    flags,
    locals,
    library=None,
    pipeline_class=CUDACompiler,
):
    """Compiler entry point

    Parameters
    ----------
    typingctx :
        typing context
    targetctx :
        target context
    func : function
        the python function to be compiled
    args : tuple, list
        argument types
    return_type :
        Use ``None`` to indicate void return
    flags : numba.compiler.Flags
        compiler flags
    locals :
        forwarded unchanged to the pipeline constructor
    library : numba.codegen.CodeLibrary
        Used to store the compiled code.
        If it is ``None``, a new CodeLibrary is used.
    pipeline_class : type like numba.compiler.CompilerBase
        compiler pipeline

    Returns whatever the pipeline's ``compile_extra(func)`` returns.
    """
    compiler = pipeline_class(
        typingctx, targetctx, library, args, return_type, flags, locals
    )
    result = compiler.compile_extra(func)
    return result
590
+
591
+
592
def compile_ir(
    typingctx,
    targetctx,
    func_ir,
    args,
    return_type,
    flags,
    locals,
    lifted=(),
    lifted_from=None,
    is_lifted_loop=False,
    library=None,
    pipeline_class=CUDACompiler,
):
    """
    Compile a function with the given IR.

    For internal use only.

    When ``is_lifted_loop`` is false the IR is compiled once, as given, with
    ``flags``. When it is true the IR is compiled from copies: first with
    rewrites disabled, then (unless ``flags.no_rewrites`` is already set)
    with the original flags; the second result is returned if that
    compilation succeeded, otherwise the first.
    """

    def run_pipeline(the_ir, the_flags):
        compiler = pipeline_class(
            typingctx,
            targetctx,
            library,
            args,
            return_type,
            the_flags,
            locals,
        )
        return compiler.compile_ir(
            func_ir=the_ir, lifted=lifted, lifted_from=lifted_from
        )

    if not is_lifted_loop:
        return run_pipeline(func_ir, flags)

    # Everything below is a special path that should only run on IR from a
    # lifted loop. It is pessimistic and costly, but it is rarely taken and
    # will go away once IR is made immutable. The problem is that the rewrite
    # passes can mutate the IR into a state in which invalid tokens reach
    # lowering, trickle through into LLVM IR and cause RuntimeErrors because
    # LLVM cannot compile it. So:
    # 1. Copy the flags, switching rewrites off.
    # 2. Compile with the flags from 1. to get a usable compile result.
    # 3. Try to compile again with the original flags (IR gets rewritten).
    # 4. Use the result of 3. if it succeeded, else the result of 2.

    flags_without_rewrites = copy.deepcopy(flags)
    flags_without_rewrites.no_rewrites = True

    # With rewrites off the IR shouldn't be mutated irreparably
    fallback_result = run_pipeline(func_ir.copy(), flags_without_rewrites)

    # Only retry with rewrites if the original flags allow them. The IR might
    # get broken, but the fallback result above remains usable.
    rewritten_result = None
    if not flags.no_rewrites:
        # Suppress warnings in compilation retry
        with catch_warnings():
            simplefilter("ignore", NumbaWarning)
            try:
                rewritten_result = run_pipeline(func_ir.copy(), flags)
            except Exception:
                # Deliberate best effort: the fallback result is used
                pass

    if rewritten_result is None:
        return fallback_result
    return rewritten_result
675
+
676
+
677
def compile_internal(
    typingctx, targetctx, library, func, args, return_type, flags, locals
):
    """
    Compile ``func`` with a CUDACompiler pipeline and return the result of
    its ``compile_extra``.

    For internal use only.
    """
    compiler = CUDACompiler(
        typingctx, targetctx, library, args, return_type, flags, locals
    )
    result = compiler.compile_extra(func)
    return result
687
+
688
+
689
@global_compiler_lock
def compile_cuda(
    pyfunc,
    return_type,
    args,
    debug=False,
    lineinfo=False,
    forceinline=False,
    fastmath=False,
    nvvm_options=None,
    cc=None,
    max_registers=None,
    lto=False,
):
    """
    Compile ``pyfunc`` for the CUDA target with the CUDACompiler pipeline,
    finalize the resulting library and return the compile result.

    The keyword arguments are translated into CUDAFlags settings. ``cc``
    (the compute capability) is mandatory: a ValueError is raised when it is
    None.
    """
    if cc is None:
        raise ValueError("Compute Capability must be supplied")

    from .descriptor import cuda_target

    typing_context = cuda_target.typing_context
    target_context = cuda_target.target_context

    cuda_flags = CUDAFlags()
    # Do not compile (generate native code), just lower (to LLVM)
    cuda_flags.no_compile = True
    cuda_flags.no_cpython_wrapper = True
    cuda_flags.no_cfunc_wrapper = True

    # debug and lineinfo both switch on debug information in the compiled
    # code. They stay separate arguments in case other behavior is later
    # attached to the debug flag; in particular, -opt=3 is not supported with
    # debug enabled, and lineinfo on its own should not affect the error
    # model.
    if debug or lineinfo:
        cuda_flags.debuginfo = True

    if lineinfo:
        cuda_flags.dbg_directives_only = True

    cuda_flags.error_model = "python" if debug else "numpy"
    if debug:
        cuda_flags.dbg_extend_lifetimes = True

    if forceinline:
        cuda_flags.forceinline = True
    if fastmath:
        cuda_flags.fastmath = True
    if nvvm_options:
        cuda_flags.nvvm_options = nvvm_options
    cuda_flags.compute_capability = cc
    cuda_flags.max_registers = max_registers
    cuda_flags.lto = lto

    with utils.numba_target_override():
        compile_result = compile_extra(
            typingctx=typing_context,
            targetctx=target_context,
            func=pyfunc,
            args=args,
            return_type=return_type,
            flags=cuda_flags,
            locals={},
            pipeline_class=CUDACompiler,
        )

    compile_result.library.finalize()

    return compile_result
760
+
761
+
762
def cabi_wrap_function(
    context, lib, fndesc, wrapper_function_name, nvvm_options
):
    """
    Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.

    The wrapper is named ``wrapper_function_name`` and lives in a new
    library that links to ``lib``, the library of the wrapped function.
    That new library is finalized and returned.
    """
    wrapper_library = lib.codegen.create_library(
        f"{lib.name}_function_",
        entry_name=wrapper_function_name,
        nvvm_options=nvvm_options,
    )
    wrapper_library.add_linking_library(lib)

    # Function types of the wrapper (C ABI) and of the wrapped function
    # (Numba ABI)
    arg_types = fndesc.argtypes
    ret_type = fndesc.restype
    c_abi = CUDACABICallConv(context)
    c_abi_fnty = c_abi.get_function_type(ret_type, arg_types)
    numba_abi_fnty = context.call_conv.get_function_type(
        fndesc.restype, arg_types
    )

    # A fresh module holds a declaration of the wrapped function ...
    module = context.create_module("cuda.cabi.wrapper")
    wrapped = ir.Function(module, numba_abi_fnty, fndesc.llvm_func_name)

    # ... and the definition of the wrapper, which calls the wrapped
    # function and returns its return value.
    wrapper = ir.Function(module, c_abi_fnty, wrapper_function_name)
    builder = ir.IRBuilder(wrapper.append_basic_block(""))

    packer = context.get_arg_packer(arg_types)
    call_args = packer.from_arguments(builder, wrapper.args)
    # The call yields (status, return_value). The status is dropped because
    # it can't be propagated through the C ABI anyway.
    _status, result = context.call_conv.call_function(
        builder, wrapped, ret_type, arg_types, call_args
    )
    builder.ret(result)

    if config.DUMP_LLVM:
        utils.dump_llvm(fndesc, module)

    wrapper_library.add_ir_module(module)
    wrapper_library.finalize()
    return wrapper_library
811
+
812
+
813
def kernel_fixup(kernel, debug):
    """
    Rewrite the llvmlite function ``kernel`` in place so that it has the
    form of a CUDA kernel.

    The steps, in order:
      1. Every ``ret <value>`` becomes ``ret void``. When ``debug`` is
         truthy, a call to the exception store helper (created by
         add_exception_store_helper) with the returned value is inserted
         just before it.
      2. Store instructions whose destination operand is the function's
         first argument are removed.
      3. The function type gets a void return and loses its first argument;
         ``kernel.args`` loses its first entry.
      4. If "dbg" metadata is attached, the first entry of its "types"
         operand is dropped.
      5. The function is marked as a kernel with ``nvvm.set_cuda_kernel``.

    Nothing is returned. When ``config.DUMP_LLVM`` is set the resulting
    module is printed.
    """
    if debug:
        # Only needed (and only referenced below) when debug is truthy
        exc_helper = add_exception_store_helper(kernel)

    # Pass 1 - replace:
    #
    #    ret <value>
    #
    # with:
    #
    #    exc_helper(<value>)
    #    ret void

    for block in kernel.blocks:
        for i, inst in enumerate(block.instructions):
            if isinstance(inst, ir.Ret):
                # pop() removes the LAST instruction of the block.
                # NOTE(review): this presumably relies on the return being
                # the final instruction of its block - confirm.
                old_ret = block.instructions.pop()
                block.terminator = None

                # The original return's metadata will be set on the new
                # instructions in order to preserve debug info
                metadata = old_ret.metadata

                builder = ir.IRBuilder(block)
                if debug:
                    # The value that was being returned is handed to the
                    # helper as its status code
                    status_code = old_ret.operands[0]
                    exc_helper_call = builder.call(exc_helper, (status_code,))
                    exc_helper_call.metadata = metadata

                new_ret = builder.ret_void()
                new_ret.metadata = old_ret.metadata

                # Need to break out so we don't carry on modifying what we are
                # iterating over. There can only be one return in a block
                # anyway.
                break

    # Pass 2: remove stores of null pointer to return value argument pointer
    # (the check below matches any store whose destination operand is the
    # first argument; the stored value is not inspected)

    return_value = kernel.args[0]

    for block in kernel.blocks:
        remove_list = []

        # Find all stores first
        for inst in block.instructions:
            if (
                isinstance(inst, ir.StoreInstr)
                and inst.operands[1] == return_value
            ):
                remove_list.append(inst)

        # Remove all stores
        for to_remove in remove_list:
            block.instructions.remove(to_remove)

    # Replace non-void return type with void return type and remove return
    # value

    # kernel.type may be either a pointer to a function type or a function
    # type; both forms are handled.
    if isinstance(kernel.type, ir.PointerType):
        new_type = ir.PointerType(
            ir.FunctionType(ir.VoidType(), kernel.type.pointee.args[1:])
        )
    else:
        new_type = ir.FunctionType(ir.VoidType(), kernel.type.args[1:])

    kernel.type = new_type
    kernel.return_value = ir.ReturnValue(kernel, ir.VoidType())
    kernel.args = kernel.args[1:]

    # If debug metadata is present, remove the return value from it

    if kernel_metadata := getattr(kernel, "metadata", None):
        if dbg_metadata := kernel_metadata.get("dbg", None):
            for name, value in dbg_metadata.operands:
                if name == "type":
                    type_metadata = value
                    for tm_name, tm_value in type_metadata.operands:
                        if tm_name == "types":
                            # Note: this local shadows the ``types`` module
                            # imported at the top of the file, within this
                            # function only.
                            types = tm_value
                            types.operands = types.operands[1:]
                            if config.DUMP_LLVM:
                                types._clear_string_cache()

    # Mark as a kernel for NVVM

    nvvm.set_cuda_kernel(kernel)

    if config.DUMP_LLVM:
        print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, "-"))
        print(kernel.module)
        print("=" * 80)
905
+
906
+
907
def add_exception_store_helper(kernel):
    """
    Add an exception store helper function to ``kernel``'s module and
    return it.

    Seven 32-bit integer global variables are created in the module, all
    named after the kernel: one error code plus thread index and block
    index ("tid" / "ctaid") globals for each of x, y and z. The helper,
    named ``<kernel name>__exc_helper__``, takes one 32-bit integer status
    code and returns void. It returns immediately when the status is OK.
    Otherwise, unless the status is flagged ``is_python_exc``, it attempts
    to record the status code in the error code global with a
    compare-and-exchange and, if that exchange succeeded, stores the
    tid / ctaid values into their globals.
    """
    # Create global variables for exception state

    def define_error_gv(postfix):
        # One 32-bit integer global named <kernel name><postfix>, initialized
        # with a ``None`` constant of its pointee type
        name = kernel.name + postfix
        gv = cgutils.add_global_variable(kernel.module, ir.IntType(32), name)
        gv.initializer = ir.Constant(gv.type.pointee, None)
        return gv

    gv_exc = define_error_gv("__errcode__")
    gv_tid = []
    gv_ctaid = []
    for i in "xyz":
        gv_tid.append(define_error_gv("__tid%s__" % i))
        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))

    # Create exception store helper function

    helper_name = kernel.name + "__exc_helper__"
    helper_type = ir.FunctionType(ir.VoidType(), (ir.IntType(32),))
    helper_func = ir.Function(kernel.module, helper_type, helper_name)

    block = helper_func.append_basic_block(name="entry")
    builder = ir.IRBuilder(block)

    # Implement status check / exception store logic

    status_code = helper_func.args[0]
    call_conv = cuda_target.target_context.call_conv
    status = call_conv._get_return_status(builder, status_code)

    # Check error status
    with cgutils.if_likely(builder, status.is_ok):
        builder.ret_void()

    with builder.if_then(builder.not_(status.is_python_exc)):
        # User exception raised
        # Expected current value of the error code global for the cmpxchg
        old = ir.Constant(gv_exc.type.pointee, None)

        # Use atomic cmpxchg to prevent rewriting the error status
        # Only the first error is recorded

        xchg = builder.cmpxchg(
            gv_exc, old, status.code, "monotonic", "monotonic"
        )
        # Element 1 of the cmpxchg result is used as the success flag
        changed = builder.extract_value(xchg, 1)

        # If the xchange is successful, save the thread ID.
        sreg = nvvmutils.SRegBuilder(builder)
        with builder.if_then(changed):
            for (
                dim,
                ptr,
            ) in zip("xyz", gv_tid):
                val = sreg.tid(dim)
                builder.store(val, ptr)

            for (
                dim,
                ptr,
            ) in zip("xyz", gv_ctaid):
                val = sreg.ctaid(dim)
                builder.store(val, ptr)

    # Return for every path that did not return in the OK branch above
    builder.ret_void()

    return helper_func
974
+
975
+
976
def compile_all(
    pyfunc,
    sig,
    debug=None,
    lineinfo=False,
    device=True,
    fastmath=False,
    cc=None,
    opt=None,
    abi="c",
    abi_info=None,
    output="ltoir",
    forceinline=False,
    launch_bounds=None,
):
    """Similar to ``compile()``, but returns a list of PTX codes/LTO-IRs for
    the compiled function and the external functions it depends on.

    External functions given as CUDA C++ source are compiled with NVRTC;
    any other kind of external code (e.g. cubins, fatbins, etc.) is appended
    to the list as-is. ``output`` selects the kind of code produced, either
    ``"ptx"`` or ``"ltoir"`` (the default).

    Returns a ``(codes, return_type)`` tuple. Raises NotImplementedError for
    an unsupported ``output`` and ValueError when ``forceinline`` is
    requested for anything other than LTO-IR.
    """

    if output not in ("ptx", "ltoir"):
        raise NotImplementedError(f"Unsupported output type: {output}")

    if forceinline and output != "ltoir":
        raise ValueError("Can only designate forced inlining in LTO-IR")

    lto = output == "ltoir"

    cc = _default_cc(cc)

    lib, resty = _compile_pyfunc_with_fixup(
        pyfunc,
        sig,
        debug=debug,
        lineinfo=lineinfo,
        device=device,
        fastmath=fastmath,
        cc=cc,
        opt=opt,
        abi=abi,
        abi_info=abi_info,
        forceinline=forceinline,
        launch_bounds=launch_bounds,
        lto=lto,
    )

    # The code of the compiled function itself always comes first
    codes = [lib.get_ltoir(cc=cc) if lto else lib.get_asm_str(cc=cc)]

    # Followed by one entry per file that the library links against
    for linking_file in lib._linking_files:
        linkable = LinkableCode.from_path_or_obj(linking_file)
        if linkable.kind != "cu":
            codes.append(linkable)
            continue
        # The compile log is not used
        compiled, _log = nvrtc.compile(
            linkable.data,
            linkable.name,
            cc,
            ltoir=lto,
            lineinfo=lineinfo,
            debug=debug,
        )
        codes.append(compiled)

    return codes, resty
1049
+
1050
+
1051
+ def _compile_pyfunc_with_fixup(
1052
+ pyfunc,
1053
+ sig,
1054
+ debug=None,
1055
+ lineinfo=False,
1056
+ device=True,
1057
+ fastmath=False,
1058
+ cc=None,
1059
+ opt=None,
1060
+ abi="c",
1061
+ abi_info=None,
1062
+ forceinline=False,
1063
+ launch_bounds=None,
1064
+ lto=False,
1065
+ ):
1066
+ """Internal method to compile a python function and perform post-processing
1067
+
1068
+ - If pyfunc is a kernel, post-processing includes kernel fixup and setting
1069
+ launch bounds.
1070
+ - If pyfunc is a device function, post-processing includes ABI wrapper.
1071
+
1072
+ `lto` means that all internal pipeline options use LTO.
1073
+
1074
+ Returns the code library and return type.
1075
+ """
1076
+ if abi not in ("numba", "c"):
1077
+ raise NotImplementedError(f"Unsupported ABI: {abi}")
1078
+
1079
+ if abi == "c" and not device:
1080
+ raise NotImplementedError("The C ABI is not supported for kernels")
1081
+
1082
+ if forceinline and not device:
1083
+ raise ValueError("Cannot force-inline kernels")
1084
+
1085
+ debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
1086
+ opt = (config.OPT != 0) if opt is None else opt
1087
+
1088
+ if debug and opt:
1089
+ msg = (
1090
+ "debug=True with opt=True "
1091
+ "is not supported by CUDA. This may result in a crash"
1092
+ " - set debug=False or opt=False."
1093
+ )
1094
+ warn(NumbaInvalidConfigWarning(msg))
1095
+
1096
+ abi_info = abi_info or dict()
1097
+
1098
+ nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}
1099
+
1100
+ if debug:
1101
+ nvvm_options["g"] = None
1102
+
1103
+ if lto:
1104
+ nvvm_options["gen-lto"] = None
1105
+
1106
+ args, return_type = sigutils.normalize_signature(sig)
1107
+
1108
+ cc = _default_cc(cc)
1109
+
1110
+ cres = compile_cuda(
1111
+ pyfunc,
1112
+ return_type,
1113
+ args,
1114
+ debug=debug,
1115
+ lineinfo=lineinfo,
1116
+ fastmath=fastmath,
1117
+ nvvm_options=nvvm_options,
1118
+ cc=cc,
1119
+ forceinline=forceinline,
1120
+ )
1121
+ resty = cres.signature.return_type
1122
+
1123
+ if resty and not device and resty != types.void:
1124
+ raise TypeError("CUDA kernel must have void return type.")
1125
+
1126
+ tgt = cres.target_context
1127
+
1128
+ if device:
1129
+ lib = cres.library
1130
+ if abi == "c":
1131
+ wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
1132
+ lib = cabi_wrap_function(
1133
+ tgt, lib, cres.fndesc, wrapper_name, nvvm_options
1134
+ )
1135
+ else:
1136
+ lib = cres.library
1137
+ kernel = lib.get_function(cres.fndesc.llvm_func_name)
1138
+ lib._entry_name = cres.fndesc.llvm_func_name
1139
+ kernel_fixup(kernel, debug)
1140
+ nvvm.set_launch_bounds(kernel, launch_bounds)
1141
+
1142
+ return lib, resty
1143
+
1144
+
1145
@global_compiler_lock
def compile(
    pyfunc,
    sig,
    debug=None,
    lineinfo=False,
    device=True,
    fastmath=False,
    cc=None,
    opt=None,
    abi="c",
    abi_info=None,
    output="ptx",
    forceinline=False,
    launch_bounds=None,
):
    """Compile a Python function to PTX or LTO-IR for a given set of argument
    types.

    :param pyfunc: The Python function to compile.
    :param sig: The signature representing the function's input and output
                types. If this is a tuple of argument types without a return
                type, the inferred return type is returned by this function. If
                a signature including a return type is passed, the compiled code
                will include a cast from the inferred return type to the
                specified return type, and this function will return the
                specified return type.
    :param debug: Whether to include debug info in the compiled code.
    :type debug: bool
    :param lineinfo: Whether to include a line mapping from the compiled code
                     to the source code. Usually this is used with optimized
                     code (since debug mode would automatically include this),
                     so we want debug info in the LLVM IR but only the line
                     mapping in the final output.
    :type lineinfo: bool
    :param device: Whether to compile a device function.
    :type device: bool
    :param fastmath: Whether to enable fast math flags (ftz=1, prec_sqrt=0,
                     prec_div=0, and fma=1)
    :type fastmath: bool
    :param cc: Compute capability to compile for, as a tuple
               ``(MAJOR, MINOR)``. Defaults to the larger of
               ``config.CUDA_DEFAULT_PTX_CC`` and the lowest compute
               capability supported by NVRTC.
    :type cc: tuple
    :param opt: Whether to enable optimizations in the compiled code.
    :type opt: bool
    :param abi: The ABI for a compiled function - either ``"numba"`` or
                ``"c"``. Note that the Numba ABI is not considered stable.
                The C ABI is only supported for device functions at present.
    :type abi: str
    :param abi_info: A dict of ABI-specific options. The ``"c"`` ABI supports
                     one option, ``"abi_name"``, for providing the wrapper
                     function's name. The ``"numba"`` ABI has no options.
    :type abi_info: dict
    :param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
    :type output: str
    :param forceinline: Enables inlining at the NVVM IR level when set to
                        ``True``. This is accomplished by adding the
                        ``alwaysinline`` function attribute to the function
                        definition. This is only valid when the output is
                        ``"ltoir"``.
    :type forceinline: bool
    :param launch_bounds: Kernel launch bounds, specified as a scalar or a tuple
                          of between one and three items. Tuple items provide:

                          - The maximum number of threads per block,
                          - The minimum number of blocks per SM,
                          - The maximum number of blocks per cluster.

                          If a scalar is provided, it is used as the maximum
                          number of threads per block.
    :type launch_bounds: int | tuple[int]
    :return: (code, resty): The compiled code and inferred return type
    :rtype: tuple
    :raises NotImplementedError: if ``output`` is not ``"ptx"`` or
                                 ``"ltoir"``.
    :raises ValueError: if ``forceinline`` is requested for non-LTO-IR
                        output.
    """
    if output not in ("ptx", "ltoir"):
        raise NotImplementedError(f"Unsupported output type: {output}")

    if forceinline and output != "ltoir":
        raise ValueError("Can only designate forced inlining in LTO-IR")

    lto = output == "ltoir"

    # Resolve the default compute capability up front (as compile_all does)
    # so that the code generation below targets the same compute capability
    # that the function was compiled for, rather than receiving cc=None.
    cc = _default_cc(cc)

    lib, resty = _compile_pyfunc_with_fixup(
        pyfunc,
        sig,
        debug=debug,
        lineinfo=lineinfo,
        device=device,
        fastmath=fastmath,
        cc=cc,
        opt=opt,
        abi=abi,
        abi_info=abi_info,
        forceinline=forceinline,
        launch_bounds=launch_bounds,
        lto=lto,
    )

    if lto:
        code = lib.get_ltoir(cc=cc)
    else:
        code = lib.get_asm_str(cc=cc)
    return code, resty
1246
+
1247
+
1248
def compile_for_current_device(
    pyfunc,
    sig,
    debug=None,
    lineinfo=False,
    device=True,
    fastmath=False,
    opt=None,
    abi="c",
    abi_info=None,
    output="ptx",
    forceinline=False,
    launch_bounds=None,
):
    """Compile a Python function to PTX or LTO-IR for the compute capability
    of the current device.

    All arguments are forwarded unchanged to :func:`compile`; only ``cc`` is
    filled in from the current device.
    """
    options = dict(
        debug=debug,
        lineinfo=lineinfo,
        device=device,
        fastmath=fastmath,
        cc=get_current_device().compute_capability,
        opt=opt,
        abi=abi,
        abi_info=abi_info,
        output=output,
        forceinline=forceinline,
        launch_bounds=launch_bounds,
    )
    return compile(pyfunc, sig, **options)
1281
+
1282
+
1283
def compile_ptx(
    pyfunc,
    sig,
    debug=None,
    lineinfo=False,
    device=False,
    fastmath=False,
    cc=None,
    opt=None,
    abi="numba",
    abi_info=None,
    forceinline=False,
    launch_bounds=None,
):
    """Compile a Python function to PTX for a given signature.

    Thin wrapper around :func:`compile` that always requests ``"ptx"``
    output. Its defaults differ from :func:`compile`: here a kernel is
    compiled with the Numba ABI, whereas :func:`compile` defaults to a
    device function with the C ABI.
    """
    forwarded = {
        "debug": debug,
        "lineinfo": lineinfo,
        "device": device,
        "fastmath": fastmath,
        "cc": cc,
        "opt": opt,
        "abi": abi,
        "abi_info": abi_info,
        "output": "ptx",
        "forceinline": forceinline,
        "launch_bounds": launch_bounds,
    }
    return compile(pyfunc, sig, **forwarded)
1316
+
1317
+
1318
def compile_ptx_for_current_device(
    pyfunc,
    sig,
    debug=None,
    lineinfo=False,
    device=False,
    fastmath=False,
    opt=None,
    abi="numba",
    abi_info=None,
    forceinline=False,
    launch_bounds=None,
):
    """Compile a Python function to PTX for the compute capability of the
    current device.

    All arguments are forwarded unchanged to :func:`compile_ptx`; only ``cc``
    is filled in from the current device.
    """
    device_cc = get_current_device().compute_capability
    forwarded = dict(
        debug=debug,
        lineinfo=lineinfo,
        device=device,
        fastmath=fastmath,
        cc=device_cc,
        opt=opt,
        abi=abi,
        abi_info=abi_info,
        forceinline=forceinline,
        launch_bounds=launch_bounds,
    )
    return compile_ptx(pyfunc, sig, **forwarded)
1348
+
1349
+
1350
def declare_device_function(name, restype, argtypes, link, use_cooperative):
    """Register an external device function with the CUDA target.

    :param name: Name of the external function.
    :param restype: Return type of the function.
    :param argtypes: Iterable of argument types.
    :param link: Iterable of files to attach to the function's external code
                 library as linking files.
    :param use_cooperative: Value stored on the external code library's
                            ``use_cooperative`` attribute.
    :return: The concrete typing template created for the function.
    """
    from .descriptor import cuda_target

    typingctx = cuda_target.typing_context
    targetctx = cuda_target.target_context
    signature = typing.signature(restype, *argtypes)

    # The ExternFunction instance is what Python code calls; it also serves
    # as the lookup key for both the typing and the lowering registrations.
    extfn = ExternFunction(name, signature)

    # --- Typing ---
    template = typing.make_concrete_template(name, extfn, [signature])
    typingctx.insert_user_function(extfn, template)

    # --- Lowering ---
    externals = ExternalCodeLibrary(f"{name}_externals", targetctx.codegen())
    for linked_file in link:
        externals.add_linking_file(linked_file)
    externals.use_cooperative = use_cooperative

    # The ExternalFunctionDescriptor supplies the lowering implementation
    # used when calling an external function.
    descriptor = funcdesc.ExternalFunctionDescriptor(name, restype, argtypes)
    targetctx.insert_user_function(extfn, descriptor, libs=(externals,))

    return template
1377
+
1378
+
1379
class ExternFunction:
    """Handle through which an external function is called from inside a
    Python kernel.

    Stores the function's ``name`` and its signature ``sig``.
    """

    def __init__(self, name, sig):
        self.name, self.sig = name, sig
1386
+
1387
+
1388
+ def _default_cc(cc):
1389
+ """
1390
+ Return default compute capability based on config and nvrtc lowest supported cc.
1391
+
1392
+ If user specifies a cc, return that.
1393
+ """
1394
+ if cc:
1395
+ return cc
1396
+ return max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())