numba-cuda 0.22.0__cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of numba-cuda might be problematic. Click here for more details.

Files changed (487) hide show
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +580 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpython-312-aarch64-linux-gnu.so +0 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpython-312-aarch64-linux-gnu.so +0 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cpython-312-aarch64-linux-gnu.so +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpython-312-aarch64-linux-gnu.so +0 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cpython-312-aarch64-linux-gnu.so +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +543 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +983 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +997 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +155 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsics.py +531 -0
  161. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  162. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  163. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  164. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  165. numba_cuda/numba/cuda/libdevice.py +3386 -0
  166. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  167. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  168. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  169. numba_cuda/numba/cuda/locks.py +19 -0
  170. numba_cuda/numba/cuda/lowering.py +1980 -0
  171. numba_cuda/numba/cuda/mathimpl.py +374 -0
  172. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  173. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  175. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  178. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  179. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  180. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  181. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  182. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  183. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  184. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  185. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  186. numba_cuda/numba/cuda/misc/literal.py +28 -0
  187. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  188. numba_cuda/numba/cuda/misc/special.py +94 -0
  189. numba_cuda/numba/cuda/models.py +56 -0
  190. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  191. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  192. numba_cuda/numba/cuda/np/extensions.py +11 -0
  193. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  194. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  195. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  196. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  197. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  198. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  199. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  200. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  201. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  202. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  203. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  204. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  206. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  207. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  208. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  209. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  210. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  211. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  212. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  213. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  214. numba_cuda/numba/cuda/printimpl.py +126 -0
  215. numba_cuda/numba/cuda/random.py +308 -0
  216. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  217. numba_cuda/numba/cuda/serialize.py +267 -0
  218. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  219. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  220. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  221. numba_cuda/numba/cuda/simulator/api.py +179 -0
  222. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  223. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  224. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  236. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  237. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  238. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  239. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  241. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  242. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  243. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  244. numba_cuda/numba/cuda/simulator_init.py +18 -0
  245. numba_cuda/numba/cuda/stubs.py +624 -0
  246. numba_cuda/numba/cuda/target.py +505 -0
  247. numba_cuda/numba/cuda/testing.py +347 -0
  248. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  249. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  251. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  252. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  253. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  254. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  255. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  285. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  286. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  289. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  290. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  291. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  292. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  293. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  294. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  295. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
  396. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  397. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  399. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  400. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  401. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  402. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  403. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  404. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  406. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  407. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  424. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  425. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  430. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  431. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  433. numba_cuda/numba/cuda/tests/support.py +900 -0
  434. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  435. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  436. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  437. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  438. numba_cuda/numba/cuda/types/__init__.py +233 -0
  439. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  440. numba_cuda/numba/cuda/types/abstract.py +9 -0
  441. numba_cuda/numba/cuda/types/common.py +9 -0
  442. numba_cuda/numba/cuda/types/containers.py +9 -0
  443. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  444. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  445. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  446. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  447. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  448. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  449. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  450. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  451. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  452. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  453. numba_cuda/numba/cuda/types/function_type.py +11 -0
  454. numba_cuda/numba/cuda/types/functions.py +9 -0
  455. numba_cuda/numba/cuda/types/iterators.py +9 -0
  456. numba_cuda/numba/cuda/types/misc.py +9 -0
  457. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  458. numba_cuda/numba/cuda/types/scalars.py +9 -0
  459. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  460. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  461. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  462. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  463. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  464. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  465. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  466. numba_cuda/numba/cuda/typing/collections.py +138 -0
  467. numba_cuda/numba/cuda/typing/context.py +782 -0
  468. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  469. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  470. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  471. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  472. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  473. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  474. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  475. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  476. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  477. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  478. numba_cuda/numba/cuda/ufuncs.py +746 -0
  479. numba_cuda/numba/cuda/utils.py +724 -0
  480. numba_cuda/numba/cuda/vector_types.py +214 -0
  481. numba_cuda/numba/cuda/vectorizers.py +260 -0
  482. numba_cuda-0.22.0.dist-info/METADATA +109 -0
  483. numba_cuda-0.22.0.dist-info/RECORD +487 -0
  484. numba_cuda-0.22.0.dist-info/WHEEL +6 -0
  485. numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
  486. numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
  487. numba_cuda-0.22.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3238 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ """
5
+ CUDA driver bridge implementation
6
+
7
+ NOTE:
8
+ The new driver implementation uses a *_PendingDeallocs* that helps prevent
9
+ crashing the system (particularly OSX) when the CUDA context is corrupted at
10
+ resource deallocation. The old approach ties resource management directly
11
+ into the object destructor; thus, at corruption of the CUDA context,
12
+ subsequent deallocation could further corrupt the CUDA context and cause the
13
+ system to freeze in some cases.
14
+
15
+ """
16
+
17
+ import sys
18
+ import os
19
+ import ctypes
20
+ import weakref
21
+ import functools
22
+ import warnings
23
+ import logging
24
+ import threading
25
+ import asyncio
26
+ import pathlib
27
+ import subprocess
28
+ import tempfile
29
+ import re
30
+ from itertools import product
31
+ from abc import ABCMeta, abstractmethod
32
+ from ctypes import (
33
+ c_int,
34
+ byref,
35
+ c_size_t,
36
+ c_char,
37
+ c_char_p,
38
+ addressof,
39
+ c_void_p,
40
+ c_uint8,
41
+ )
42
+ import contextlib
43
+ import importlib
44
+ import numpy as np
45
+ from collections import namedtuple, deque
46
+
47
+
48
+ from numba.cuda.cext import mviewbuf
49
+ from numba.cuda.core import config
50
+ from numba.cuda import utils, serialize
51
+ from .error import CudaSupportError, CudaDriverError
52
+ from .drvapi import API_PROTOTYPES
53
+ from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj
54
+ from .mappings import FILE_EXTENSION_MAP
55
+ from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
56
+ from numba.cuda.utils import cached_file_read
57
+ from numba.cuda.cudadrv import enums, drvapi, nvrtc
58
+
59
+ from cuda.bindings import driver as binding
60
+ from cuda.core.experimental import (
61
+ Linker,
62
+ LinkerOptions,
63
+ ObjectCode,
64
+ )
65
+
66
+ from cuda.bindings.utils import get_cuda_native_handle
67
+ from cuda.core.experimental import (
68
+ Stream as ExperimentalStream,
69
+ Device as ExperimentalDevice,
70
+ )
71
+
72
+
73
+ # There is no definition of the default stream in the Nvidia bindings (nor
74
+ # is there at the C/C++ level), so we define it here so we don't need to
75
+ # use a magic number 0 in places where we want the default stream.
76
+ CU_STREAM_DEFAULT = 0
77
+
78
+
79
+ MIN_REQUIRED_CC = (3, 5)
80
+ SUPPORTS_IPC = sys.platform.startswith("linux")
81
+
82
+
83
+ _py_decref = ctypes.pythonapi.Py_DecRef
84
+ _py_incref = ctypes.pythonapi.Py_IncRef
85
+ _py_decref.argtypes = [ctypes.py_object]
86
+ _py_incref.argtypes = [ctypes.py_object]
87
+
88
+
89
+ def make_logger():
90
+ logger = logging.getLogger(__name__)
91
+ # is logging configured?
92
+ if not logger.hasHandlers():
93
+ # read user config
94
+ lvl = str(config.CUDA_LOG_LEVEL).upper()
95
+ lvl = getattr(logging, lvl, None)
96
+ if not isinstance(lvl, int):
97
+ # default to critical level
98
+ lvl = logging.CRITICAL
99
+ logger.setLevel(lvl)
100
+ # did user specify a level?
101
+ if config.CUDA_LOG_LEVEL:
102
+ # create a simple handler that prints to stderr
103
+ handler = logging.StreamHandler(sys.stderr)
104
+ fmt = "== CUDA [%(relativeCreated)d] %(levelname)5s -- %(message)s"
105
+ handler.setFormatter(logging.Formatter(fmt=fmt))
106
+ logger.addHandler(handler)
107
+ else:
108
+ # otherwise, put a null handler
109
+ logger.addHandler(logging.NullHandler())
110
+ return logger
111
+
112
+
113
+ @functools.cache
114
+ def _have_nvjitlink():
115
+ try:
116
+ from cuda.bindings._internal import nvjitlink as nvjitlink_internal
117
+ from cuda.bindings._internal.utils import NotSupportedError
118
+ except ImportError:
119
+ return False
120
+
121
+ try:
122
+ if (
123
+ nvjitlink_internal._inspect_function_pointer("__nvJitLinkVersion")
124
+ == 0
125
+ ):
126
+ return False
127
+ try:
128
+ from cuda.bindings import nvjitlink
129
+
130
+ if nvjitlink.version() < (12, 3):
131
+ return False
132
+ except Exception:
133
+ return False
134
+ return True
135
+ except (RuntimeError, NotSupportedError):
136
+ return False
137
+
138
+
139
class DeadMemoryError(RuntimeError):
    """Raised when an operation touches device memory that is no longer
    alive."""
141
+
142
+
143
class LinkerError(RuntimeError):
    """Raised when linking of device code fails."""
145
+
146
+
147
class CudaAPIError(CudaDriverError):
    """Error raised when a CUDA driver API call fails.

    ``code`` is the driver's error code and ``msg`` describes the failed
    call.
    """

    def __init__(self, code, msg):
        self.code = code
        self.msg = msg
        super().__init__(code, msg)

    def __str__(self):
        return "[%s] %s" % (self.code, self.msg)
155
+
156
+
157
def locate_driver_and_loader():
    """Pick the ctypes loader class and the candidate paths for the CUDA
    driver shared library on this platform.

    Returns a ``(dlloader, candidates)`` pair where *candidates* lists bare
    library names first (resolved through the default search path) followed
    by fully-qualified paths in common install locations.
    """
    if sys.platform == "win32":
        loader = ctypes.WinDLL
        search_dirs = ["\\windows\\system32"]
        names = ["nvcuda.dll"]
    elif sys.platform == "darwin":
        loader = ctypes.CDLL
        search_dirs = ["/usr/local/cuda/lib"]
        names = ["libcuda.dylib"]
    else:
        # Assume a *nix-like platform.
        loader = ctypes.CDLL
        search_dirs = ["/usr/lib", "/usr/lib64"]
        names = ["libcuda.so", "libcuda.so.1"]

    # Bare names first, then explicit locations for each (dir, name) pair.
    candidates = list(names)
    candidates.extend(
        os.path.join(d, n) for d, n in product(search_dirs, names)
    )
    return loader, candidates
180
+
181
+
182
def load_driver(dlloader, candidates):
    """Attempt to load the CUDA driver from each path in *candidates*.

    Returns ``(dll, path)`` for the first candidate that loads.  When none
    loads, raises via ``_raise_driver_not_found`` if no candidate existed
    on disk, otherwise via ``_raise_driver_error`` with all collected load
    errors.
    """
    missing = []
    load_errors = []

    for candidate in candidates:
        try:
            handle = dlloader(candidate)
        except OSError as exc:
            # Distinguish "file absent" from "present but failed to load".
            missing.append(not os.path.isfile(candidate))
            load_errors.append(exc)
        else:
            return handle, candidate

    # Every candidate failed; report appropriately.
    if all(missing):
        _raise_driver_not_found()
    else:
        _raise_driver_error("\n".join(str(exc) for exc in load_errors))
203
+
204
+
205
def find_driver():
    """Locate and load the CUDA driver library, returning its handle."""
    loader, candidates = locate_driver_and_loader()
    handle, _path = load_driver(loader, candidates)
    return handle
209
+
210
+
211
+ DRIVER_NOT_FOUND_MSG = """
212
+ CUDA driver library cannot be found.
213
+ Ensure that a compatible NVIDIA driver is installed and available on your system path.
214
+ """
215
+
216
+ DRIVER_LOAD_ERROR_MSG = """
217
+ Possible CUDA driver libraries are found but error occurred during load:
218
+ %s
219
+ """
220
+
221
+
222
def _raise_driver_not_found():
    # Raised when no candidate driver library existed on disk at all.
    raise CudaSupportError(DRIVER_NOT_FOUND_MSG)
224
+
225
+
226
def _raise_driver_error(e):
    # Raised when driver libraries were found but failed to load; *e* is the
    # collected load-error text interpolated into the message.
    raise CudaSupportError(DRIVER_LOAD_ERROR_MSG % e)
228
+
229
+
230
def _build_reverse_error_map():
    """Build a mapping from CUDA error codes to their ``CUDA_ERROR*`` names.

    Scans the ``enums`` module for constants whose name begins with
    ``CUDA_ERROR``.  A ``UniqueDict`` is used so that an accidentally
    duplicated error code is detected at build time.
    """
    prefix = "CUDA_ERROR"
    # Named ``error_map`` (not ``map``) to avoid shadowing the builtin.
    error_map = utils.UniqueDict()
    for name in dir(enums):
        if name.startswith(prefix):
            code = getattr(enums, name)
            error_map[code] = name
    return error_map
238
+
239
+
240
+ def _getpid():
241
+ return os.getpid()
242
+
243
+
244
+ ERROR_MAP = _build_reverse_error_map()
245
+
246
+
247
class Driver(object):
    """
    Driver API functions are lazily bound.

    Process-wide singleton.  Unknown attribute access resolves a driver
    function from ``cuda.bindings``, wraps it with error checking, and
    caches the wrapper on the instance so subsequent calls bypass
    ``__getattr__``.
    """

    _singleton = None

    def __new__(cls):
        # Singleton: reuse the existing instance when one exists.
        obj = cls._singleton
        if obj is not None:
            return obj
        else:
            obj = object.__new__(cls)
            cls._singleton = obj
        return obj

    # NOTE(review): __init__ runs on every Driver() call even though
    # __new__ returns the singleton; the module only instantiates once
    # (``driver = Driver()`` below) — confirm no other call sites exist.
    def __init__(self):
        self.devices = utils.UniqueDict()
        self.is_initialized = False
        # Human-readable description of an init failure, or None.
        self.initialization_error = None
        # pid recorded at cuInit time; used to detect forks.
        self.pid = None
        try:
            if config.DISABLE_CUDA:
                msg = (
                    "CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 "
                    "in the environment, or because CUDA is unsupported on "
                    "32-bit systems."
                )
                raise CudaSupportError(msg)
            self.lib = find_driver()
        except CudaSupportError as e:
            # Record the failure; it is re-raised on first API use.
            self.is_initialized = True
            self.initialization_error = e.msg

    def ensure_initialized(self):
        """Call cuInit exactly once; record and re-raise any failure."""
        if self.is_initialized:
            return

        # lazily initialize logger
        global _logger
        _logger = make_logger()

        # Set before calling cuInit so a failed init is never retried.
        self.is_initialized = True
        try:
            _logger.info("init")
            self.cuInit(0)
        except CudaAPIError as e:
            description = f"{e.msg} ({e.code})"
            self.initialization_error = description
            raise CudaSupportError(f"Error at driver init: {description}")
        else:
            # Remember the initializing pid to detect later forks.
            self.pid = _getpid()

    @property
    def is_available(self):
        # True when the driver initialized without error.
        self.ensure_initialized()
        return self.initialization_error is None

    def __getattr__(self, fname):
        # First request of a driver API function
        self.ensure_initialized()

        if self.initialization_error is not None:
            raise CudaSupportError(
                "Error at driver init: \n%s:" % self.initialization_error
            )

        return self._cuda_python_wrap_fn(fname)

    def _ctypes_wrap_fn(self, fname, libfn=None):
        """Wrap a ctypes driver function with logging + error checking and
        cache the wrapper on the instance under *fname*."""
        # Wrap a CUDA driver function by default
        if libfn is None:
            try:
                proto = API_PROTOTYPES[fname]
            except KeyError:
                raise AttributeError(fname)
            restype = proto[0]
            argtypes = proto[1:]

            # Find function in driver library
            libfn = self._find_api(fname)
            libfn.restype = restype
            libfn.argtypes = argtypes

        def verbose_cuda_api_call(*args):
            argstr = ", ".join([str(arg) for arg in args])
            _logger.debug("call driver api: %s(%s)", libfn.__name__, argstr)
            retcode = libfn(*args)
            self._check_ctypes_error(fname, retcode)

        def safe_cuda_api_call(*args):
            _logger.debug("call driver api: %s", libfn.__name__)
            retcode = libfn(*args)
            self._check_ctypes_error(fname, retcode)

        if config.CUDA_LOG_API_ARGS:
            wrapper = verbose_cuda_api_call
        else:
            wrapper = safe_cuda_api_call

        # Cache on the instance so __getattr__ is not hit again.
        safe_call = functools.wraps(libfn)(wrapper)
        setattr(self, fname, safe_call)
        return safe_call

    def _cuda_python_wrap_fn(self, fname):
        """Wrap a cuda.bindings driver function with logging + error
        checking and cache the wrapper on the instance under *fname*."""
        libfn = getattr(binding, fname)

        def verbose_cuda_api_call(*args):
            argstr = ", ".join([str(arg) for arg in args])
            _logger.debug("call driver api: %s(%s)", libfn.__name__, argstr)
            return self._check_cuda_python_error(fname, libfn(*args))

        def safe_cuda_api_call(*args):
            _logger.debug("call driver api: %s", libfn.__name__)
            return self._check_cuda_python_error(fname, libfn(*args))

        if config.CUDA_LOG_API_ARGS:
            wrapper = verbose_cuda_api_call
        else:
            wrapper = safe_cuda_api_call

        # Cache on the instance so __getattr__ is not hit again.
        safe_call = functools.wraps(libfn)(wrapper)
        setattr(self, fname, safe_call)
        return safe_call

    def _find_api(self, fname):
        """Locate *fname* in the ctypes driver library, preferring the
        ``_v2`` variant; return a delayed-error stub when absent."""
        # We use alternatively-named functions for PTDS with the Numba ctypes
        # binding. It handles linking to the correct variant.
        variants = ("_v2", "")

        if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
            return getattr(self.lib, fname)

        for variant in variants:
            try:
                return getattr(self.lib, f"{fname}{variant}")
            except AttributeError:
                pass

        # Not found.
        # Delay missing function error to use
        def absent_function(*args, **kws):
            raise CudaDriverError(f"Driver missing function: {fname}")

        setattr(self, fname, absent_function)
        return absent_function

    def _detect_fork(self):
        # The CUDA context is not usable in a child process; fail loudly
        # when the current pid differs from the one recorded at init.
        if self.pid is not None and _getpid() != self.pid:
            msg = "pid %s forked from pid %s after CUDA driver init"
            _logger.critical(msg, _getpid(), self.pid)
            raise CudaDriverError("CUDA initialized before forking")

    def _check_ctypes_error(self, fname, retcode):
        """Raise CudaAPIError when a ctypes driver call returned failure."""
        if retcode != enums.CUDA_SUCCESS:
            errname = ERROR_MAP.get(retcode, "UNKNOWN_CUDA_ERROR")
            msg = "Call to %s results in %s" % (fname, errname)
            _logger.error(msg)
            if retcode == enums.CUDA_ERROR_NOT_INITIALIZED:
                # NOT_INITIALIZED after a fork is the fork signature.
                self._detect_fork()
            raise CudaAPIError(retcode, msg)

    def _check_cuda_python_error(self, fname, returned):
        """Split a cuda.bindings ``(retcode, *values)`` result; raise on
        failure, otherwise return the value(s) (unwrapped if single)."""
        retcode = returned[0]
        retval = returned[1:]
        if len(retval) == 1:
            retval = retval[0]

        if retcode != binding.CUresult.CUDA_SUCCESS:
            msg = "Call to %s results in %s" % (fname, retcode.name)
            _logger.error(msg)
            if retcode == binding.CUresult.CUDA_ERROR_NOT_INITIALIZED:
                self._detect_fork()
            raise CudaAPIError(retcode, msg)

        return retval

    def get_device(self, devnum=0):
        """Return (a weak proxy to) the Device for *devnum*, creating and
        caching it on first request."""
        dev = self.devices.get(devnum)
        if dev is None:
            dev = Device(devnum)
            self.devices[devnum] = dev
        return weakref.proxy(dev)

    def get_device_count(self):
        """Return the number of CUDA devices visible to this process."""
        return self.cuDeviceGetCount()

    def list_devices(self):
        """Returns a list of active devices"""
        return list(self.devices.values())

    def reset(self):
        """Reset all devices"""
        for dev in self.devices.values():
            dev.reset()

    def pop_active_context(self):
        """Pop the active CUDA context and return the handle.
        If no CUDA context is active, return None.
        """
        with self.get_active_context() as ac:
            if ac.devnum is not None:
                popped = drvapi.cu_context(int(driver.cuCtxPopCurrent()))
                return popped

    def get_active_context(self):
        """Returns an instance of ``_ActiveContext``."""
        return _ActiveContext()

    def get_version(self):
        """
        Returns the CUDA Driver version as a tuple (major, minor).
        """
        # The version is encoded as (1000 * major) + (10 * minor)
        version = driver.cuDriverGetVersion()
        major = version // 1000
        minor = (version - (major * 1000)) // 10
        return (major, minor)
465
+
466
+
467
class _ActiveContext(object):
    """A contextmanager object to cache the active context to reduce
    dependency on querying the CUDA driver API.

    Once entering the context, it is assumed that the active CUDA context is
    not changed until the context is exited.
    """

    # Per-thread cache holding a (context handle, device number) pair.
    _tls_cache = threading.local()

    def __enter__(self):
        is_top = False
        # check TLS cache
        cache = self._tls_cache
        try:
            hctx, devnum = cache.ctx_devnum
        except AttributeError:
            # Not cached. Query the driver API.
            hctx = driver.cuCtxGetCurrent()
            if int(hctx) == 0:
                # A NULL handle means no context is current.
                hctx = None
            else:
                hctx = drvapi.cu_context(int(hctx))

            if hctx is None:
                devnum = None
            else:
                devnum = int(driver.cuCtxGetDevice())

            self._tls_cache.ctx_devnum = (hctx, devnum)
            # The outermost use on this thread owns (and clears) the cache.
            is_top = True

        self._is_top = is_top
        self.context_handle = hctx
        self.devnum = devnum
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Only the outermost (cache-populating) instance clears the cache.
        if self._is_top:
            del self._tls_cache.ctx_devnum

    def __bool__(self):
        """Returns True if there's a valid and active CUDA context."""
        return self.context_handle is not None

    # Python 2 spelling kept for compatibility with existing callers.
    __nonzero__ = __bool__
513
+
514
+
515
+ driver = Driver()
516
+
517
+
518
def _build_reverse_device_attrs():
    """Map device-attribute names (``CU_DEVICE_ATTRIBUTE_`` prefix removed)
    to their enum values.

    A ``UniqueDict`` is used so duplicated attribute names are detected at
    build time.
    """
    prefix = "CU_DEVICE_ATTRIBUTE_"
    # Named ``attrs`` (not ``map``) to avoid shadowing the builtin.
    attrs = utils.UniqueDict()
    for name in dir(enums):
        if name.startswith(prefix):
            attrs[name[len(prefix):]] = getattr(enums, name)
    return attrs
525
+
526
+
527
+ DEVICE_ATTRIBUTES = _build_reverse_device_attrs()
528
+
529
+
530
class Device:
    """
    The device object owns the CUDA contexts. This is owned by the driver
    object. User should not construct devices directly.
    """

    @classmethod
    def from_identity(cls, identity):
        """Create Device object from device identity created by
        ``Device.get_device_identity()``.

        :raises RuntimeError: when no visible device matches *identity*.
        """
        # Fix: the first parameter of a classmethod is conventionally
        # ``cls``, not ``self`` (it receives the class, not an instance).
        for devid in range(driver.get_device_count()):
            d = driver.get_device(devid)
            if d.get_device_identity() == identity:
                return d
        else:
            raise RuntimeError(
                f"No device of {identity} is found. "
                "Target device may not be visible in this process."
            )

    def __init__(self, devnum: int) -> None:
        self._dev = ExperimentalDevice(devnum)
        self.id = self._dev.device_id
        self.compute_capability = self._dev.compute_capability
        self.name = self._dev.name
        self.uuid = f"GPU-{self._dev.uuid}"
        # Created lazily by get_primary_context().
        self.primary_context = None

    def get_device_identity(self):
        """Return a PCI-based identity dict for this device."""
        return {
            "pci_domain_id": self.PCI_DOMAIN_ID,
            "pci_bus_id": self.PCI_BUS_ID,
            "pci_device_id": self.PCI_DEVICE_ID,
        }

    def __repr__(self):
        return f"<CUDA device {self.id:d} '{self.name}'>"

    def __getattr__(self, attr):
        """Read attributes lazily"""
        # Resolve the CUdevice_attribute enum member for *attr* and query
        # the driver for its value.
        code = getattr(
            binding.CUdevice_attribute, f"CU_DEVICE_ATTRIBUTE_{attr}"
        )
        value = driver.cuDeviceGetAttribute(code, self.id)

        # Cache on the instance so later reads bypass __getattr__.
        setattr(self, attr, value)
        return value

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        return isinstance(other, Device) and self.id == other.id

    def __ne__(self, other):
        return not (self == other)

    def get_primary_context(self):
        """
        Returns the primary context for the device.
        Note: it is not pushed to the CPU thread.
        """
        if (ctx := self.primary_context) is not None:
            return ctx

        if self.compute_capability < MIN_REQUIRED_CC:
            raise CudaSupportError(
                f"{self} has compute capability < {MIN_REQUIRED_CC}"
            )

        self._dev.set_current()
        self.primary_context = ctx = Context(
            weakref.proxy(self),
            ctypes.c_void_p(int(self._dev.context._handle)),
        )
        return ctx

    def release_primary_context(self):
        """
        Release reference to primary context if it has been retained.
        """
        if self.primary_context:
            driver.cuDevicePrimaryCtxRelease(self.id)
            self.primary_context = None

    def reset(self):
        """Reset the device: tear down the cached primary context (if any)
        and then reset the primary context at the driver level."""
        try:
            if (ctx := self.primary_context) is not None:
                ctx.reset()
                self.release_primary_context()
        finally:
            # reset at the driver level
            driver.cuDevicePrimaryCtxReset(self.id)

    @property
    def supports_float16(self):
        # float16 requires compute capability 5.3 or later.
        return self.compute_capability >= (5, 3)

    @property
    def supports_bfloat16(self):
        # bfloat16 requires compute capability 8.0 or later.
        return self.compute_capability >= (8, 0)
632
+
633
+
634
class BaseCUDAMemoryManager(object, metaclass=ABCMeta):
    """Abstract base class for External Memory Management (EMM) Plugins."""

    def __init__(self, *args, **kwargs):
        # Plugins must be constructed with a ``context`` keyword argument;
        # it is popped from kwargs and stored on the instance.
        if "context" not in kwargs:
            raise RuntimeError("Memory manager requires a context")
        self.context = kwargs.pop("context")

    @abstractmethod
    def memalloc(self, size):
        """
        Allocate on-device memory in the current context.

        :param size: Size of allocation in bytes
        :type size: int
        :return: A memory pointer instance that owns the allocated memory
        :rtype: :class:`MemoryPointer`
        """

    @abstractmethod
    def memhostalloc(self, size, mapped, portable, wc):
        """
        Allocate pinned host memory.

        :param size: Size of the allocation in bytes
        :type size: int
        :param mapped: Whether the allocated memory should be mapped into the
                       CUDA address space.
        :type mapped: bool
        :param portable: Whether the memory will be considered pinned by all
                         contexts, and not just the calling context.
        :type portable: bool
        :param wc: Whether to allocate the memory as write-combined.
        :type wc: bool
        :return: A memory pointer instance that owns the allocated memory. The
                 return type depends on whether the region was mapped into
                 device memory.
        :rtype: :class:`MappedMemory` or :class:`PinnedMemory`
        """

    @abstractmethod
    def mempin(self, owner, pointer, size, mapped):
        """
        Pin a region of host memory that is already allocated.

        :param owner: The object that owns the memory.
        :param pointer: The pointer to the beginning of the region to pin.
        :type pointer: int
        :param size: The size of the region in bytes.
        :type size: int
        :param mapped: Whether the region should also be mapped into device
                       memory.
        :type mapped: bool
        :return: A memory pointer instance that refers to the allocated
                 memory.
        :rtype: :class:`MappedMemory` or :class:`PinnedMemory`
        """

    @abstractmethod
    def initialize(self):
        """
        Perform any initialization required for the EMM plugin instance to be
        ready to use.

        :return: None
        """

    @abstractmethod
    def get_ipc_handle(self, memory):
        """
        Return an IPC handle from a GPU allocation.

        :param memory: Memory for which the IPC handle should be created.
        :type memory: :class:`MemoryPointer`
        :return: IPC handle for the allocation
        :rtype: :class:`IpcHandle`
        """

    @abstractmethod
    def get_memory_info(self):
        """
        Returns ``(free, total)`` memory in bytes in the context. May raise
        :class:`NotImplementedError`, if returning such information is not
        practical (e.g. for a pool allocator).

        :return: Memory info
        :rtype: :class:`MemoryInfo`
        """

    @abstractmethod
    def reset(self):
        """
        Clears up all memory allocated in this context.

        :return: None
        """

    @abstractmethod
    def defer_cleanup(self):
        """
        Returns a context manager that ensures the implementation of deferred
        cleanup whilst it is active.

        :return: Context manager
        """

    @property
    @abstractmethod
    def interface_version(self):
        """
        Returns an integer specifying the version of the EMM Plugin interface
        supported by the plugin implementation. Should always return 1 for
        implementations of this version of the specification.
        """
748
+
749
+
750
class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
    """Base class for External Memory Management (EMM) Plugins that only
    implement on-device allocation. A subclass need not implement the
    ``memhostalloc`` and ``mempin`` methods.

    This class also implements ``reset`` and ``defer_cleanup`` (see
    :class:`numba.cuda.BaseCUDAMemoryManager`) for its own internal state
    management. If an EMM Plugin based on this class also implements these
    methods, then its implementations of these must also call the method from
    ``super()`` to give ``HostOnlyCUDAMemoryManager`` an opportunity to do the
    necessary work for the host allocations it is managing.

    This class does not implement ``interface_version``, as it will always be
    consistent with the version of Numba in which it is implemented. An EMM
    Plugin subclassing this class should implement ``interface_version``
    instead.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Live allocations keyed by pointer value.
        self.allocations = utils.UniqueDict()
        # Queue of deferred frees, flushed when its thresholds trip.
        self.deallocations = _PendingDeallocs()

    def _attempt_allocation(self, allocator):
        """
        Attempt allocation by calling *allocator*. If an out-of-memory error
        is raised, the pending deallocations are flushed and the allocation
        is retried. If it fails in the second attempt, the error is reraised.
        """
        try:
            return allocator()
        except CudaAPIError as e:
            # is out-of-memory?
            oom_code = binding.CUresult.CUDA_ERROR_OUT_OF_MEMORY
            if e.code == oom_code:
                # clear pending deallocations
                self.deallocations.clear()
                # try again
                return allocator()
            else:
                raise

    def memhostalloc(self, size, mapped=False, portable=False, wc=False):
        """Implements the allocation of pinned host memory.

        It is recommended that this method is not overridden by EMM Plugin
        implementations - instead, use the :class:`BaseCUDAMemoryManager`.
        """
        # Compose cuMemHostAlloc flags from the requested properties.
        flags = 0
        if mapped:
            flags |= enums.CU_MEMHOSTALLOC_DEVICEMAP
        if portable:
            flags |= enums.CU_MEMHOSTALLOC_PORTABLE
        if wc:
            flags |= enums.CU_MEMHOSTALLOC_WRITECOMBINED

        def allocator():
            return driver.cuMemHostAlloc(size, flags)

        # Only mapped allocations go through the flush-and-retry path.
        if mapped:
            pointer = self._attempt_allocation(allocator)
        else:
            pointer = allocator()

        alloc_key = pointer

        finalizer = _hostalloc_finalizer(self, pointer, alloc_key, size, mapped)
        ctx = weakref.proxy(self.context)

        if mapped:
            # Mapped memory is tracked in ``allocations``; ownership is
            # transferred to the caller via .own().
            mem = MappedMemory(ctx, pointer, size, finalizer=finalizer)
            self.allocations[alloc_key] = mem
            return mem.own()
        else:
            return PinnedMemory(ctx, pointer, size, finalizer=finalizer)

    def mempin(self, owner, pointer, size, mapped=False):
        """Implements the pinning of host memory.

        It is recommended that this method is not overridden by EMM Plugin
        implementations - instead, use the :class:`BaseCUDAMemoryManager`.
        """
        alloc_key = pointer

        # possible flags are "portable" (between context)
        # and "device-map" (map host memory to device thus no need
        # for memory transfer).
        flags = 0

        if mapped:
            flags |= enums.CU_MEMHOSTREGISTER_DEVICEMAP

        def allocator():
            driver.cuMemHostRegister(pointer, size, flags)

        if mapped:
            self._attempt_allocation(allocator)
        else:
            allocator()

        finalizer = _pin_finalizer(self, pointer, alloc_key, mapped)
        ctx = weakref.proxy(self.context)

        if mapped:
            mem = MappedMemory(
                ctx, pointer, size, owner=owner, finalizer=finalizer
            )
            self.allocations[alloc_key] = mem
            return mem.own()
        else:
            return PinnedMemory(
                ctx, pointer, size, owner=owner, finalizer=finalizer
            )

    def memallocmanaged(self, size, attach_global):
        """Allocate managed (unified) memory of *size* bytes; the attach
        flag selects global vs host visibility."""
        def allocator():
            ma_flags = binding.CUmemAttach_flags

            if attach_global:
                flags = ma_flags.CU_MEM_ATTACH_GLOBAL.value
            else:
                flags = ma_flags.CU_MEM_ATTACH_HOST.value

            return driver.cuMemAllocManaged(size, flags)

        ptr = self._attempt_allocation(allocator)

        alloc_key = ptr

        finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
        ctx = weakref.proxy(self.context)
        mem = ManagedMemory(ctx, ptr, size, finalizer=finalizer)
        self.allocations[alloc_key] = mem
        return mem.own()

    def reset(self):
        """Clears up all host memory (mapped and/or pinned) in the current
        context.

        EMM Plugins that override this method must call ``super().reset()`` to
        ensure that host allocations are also cleaned up."""
        self.allocations.clear()
        self.deallocations.clear()

    @contextlib.contextmanager
    def defer_cleanup(self):
        """Returns a context manager that disables cleanup of mapped or pinned
        host memory in the current context whilst it is active.

        EMM Plugins that override this method must obtain the context manager
        from this method before yielding to ensure that cleanup of host
        allocations is also deferred."""
        with self.deallocations.disable():
            yield
904
+
905
+
906
class GetIpcHandleMixin:
    """A class that provides a default implementation of ``get_ipc_handle()``."""

    def get_ipc_handle(self, memory):
        """Create an :class:`numba.cuda.IpcHandle` for *memory*.

        The base pointer of the owning allocation is determined with
        ``cuMemGetAddressRange`` (via ``device_extents``), an IPC handle is
        obtained for it with ``cuIpcGetMemHandle``, and the offset of
        *memory* within the allocation is recorded on the returned handle.
        """
        base_ptr, _end = device_extents(memory)
        handle = driver.cuIpcGetMemHandle(base_ptr)
        delta = int(memory.handle) - int(base_ptr)
        identity = self.context.device.get_device_identity()

        return IpcHandle(memory, handle, memory.size, identity, offset=delta)
924
+
925
+
926
class NumbaCUDAMemoryManager(GetIpcHandleMixin, HostOnlyCUDAMemoryManager):
    """Internal on-device memory management for Numba. This is implemented using
    the EMM Plugin interface, but is not part of the public API."""

    def initialize(self):
        # Set the memory capacity of *deallocations* as the memory manager
        # becomes active for the first time
        if self.deallocations.memory_capacity == _SizeNotSet:
            self.deallocations.memory_capacity = self.get_memory_info().total

    def memalloc(self, size):
        """Allocate *size* bytes of device memory, retrying once after
        flushing pending deallocations on out-of-memory."""
        def allocator():
            return driver.cuMemAlloc(size)

        ptr = self._attempt_allocation(allocator)
        alloc_key = ptr

        # The finalizer queues the free into ``deallocations`` rather than
        # freeing immediately; ownership passes to the caller via .own().
        finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
        ctx = weakref.proxy(self.context)
        mem = AutoFreePointer(ctx, ptr, size, finalizer=finalizer)
        self.allocations[alloc_key] = mem
        return mem.own()

    def get_memory_info(self):
        """Return free and total device memory as a MemoryInfo pair."""
        free, total = driver.cuMemGetInfo()
        return MemoryInfo(free=free, total=total)

    @property
    def interface_version(self):
        # Tracks the EMM interface version implemented by this module.
        return _SUPPORTED_EMM_INTERFACE_VERSION
956
+
957
+
958
+ _SUPPORTED_EMM_INTERFACE_VERSION = 1
959
+
960
+ _memory_manager = None
961
+
962
+
963
def _ensure_memory_manager():
    """Select the memory manager implementation on first use.

    Uses the internal ``NumbaCUDAMemoryManager`` unless the
    ``CUDA_MEMORY_MANAGER`` config names a module that provides a
    ``_numba_memory_manager`` EMM plugin.

    :raises RuntimeError: when the configured module cannot be imported or
        does not provide a valid plugin.
    """
    global _memory_manager

    if _memory_manager:
        return

    if config.CUDA_MEMORY_MANAGER == "default":
        _memory_manager = NumbaCUDAMemoryManager
        return

    try:
        mgr_module = importlib.import_module(config.CUDA_MEMORY_MANAGER)
        set_memory_manager(mgr_module._numba_memory_manager)
    except Exception as e:
        # Chain the original failure so its cause is visible in tracebacks.
        raise RuntimeError(
            "Failed to use memory manager from %s" % config.CUDA_MEMORY_MANAGER
        ) from e
980
+
981
+
982
def set_memory_manager(mm_plugin):
    """Configure Numba to use an External Memory Management (EMM) Plugin. If
    the EMM Plugin version does not match one supported by this version of
    Numba, a RuntimeError will be raised.

    :param mm_plugin: The class implementing the EMM Plugin.
    :type mm_plugin: BaseCUDAMemoryManager
    :return: None
    """
    global _memory_manager

    # Instantiate once (without a real context) purely to probe the
    # advertised interface version.
    probe = mm_plugin(context=None)
    version = probe.interface_version
    if version != _SUPPORTED_EMM_INTERFACE_VERSION:
        raise RuntimeError(
            "EMM Plugin interface has version %d - version %d required"
            % (version, _SUPPORTED_EMM_INTERFACE_VERSION)
        )

    _memory_manager = mm_plugin
1003
+
1004
+
1005
+ class _SizeNotSet(int):
1006
+ """
1007
+ Dummy object for _PendingDeallocs when *size* is not set.
1008
+ """
1009
+
1010
+ def __new__(cls, *args, **kwargs):
1011
+ return super().__new__(cls, 0)
1012
+
1013
+ def __str__(self):
1014
+ return "?"
1015
+
1016
+
1017
+ _SizeNotSet = _SizeNotSet()
1018
+
1019
+
1020
class _PendingDeallocs(object):
    """
    Pending deallocations of a context (or device since we are using the primary
    context). The capacity defaults to being unset (_SizeNotSet) but can be
    modified later once the driver is initialized and the total memory capacity
    known.
    """

    def __init__(self, capacity=_SizeNotSet):
        # FIFO of (dtor, handle, size) triples awaiting deallocation.
        self._cons = deque()
        # Nesting depth of disable(); flushing is suppressed while > 0.
        self._disable_count = 0
        # Running total of the byte sizes of queued items.
        self._size = 0
        self.memory_capacity = capacity

    @property
    def _max_pending_bytes(self):
        # Flush threshold: a configurable fraction of total device memory.
        return int(self.memory_capacity * config.CUDA_DEALLOCS_RATIO)

    def add_item(self, dtor, handle, size=_SizeNotSet):
        """
        Add a pending deallocation.

        The *dtor* arg is the destructor function that takes an argument,
        *handle*. It is used as ``dtor(handle)``. The *size* arg is the
        byte size of the resource added. It is an optional argument. Some
        resources (e.g. CUModule) has an unknown memory footprint on the device.
        """
        _logger.info("add pending dealloc: %s %s bytes", dtor.__name__, size)
        self._cons.append((dtor, handle, size))
        self._size += int(size)
        # Flush when either the item count or the byte total exceeds its
        # configured threshold.
        if (
            len(self._cons) > config.CUDA_DEALLOCS_COUNT
            or self._size > self._max_pending_bytes
        ):
            self.clear()

    def clear(self):
        """
        Flush any pending deallocations unless it is disabled.
        Do nothing if disabled.
        """
        if not self.is_disabled:
            while self._cons:
                [dtor, handle, size] = self._cons.popleft()
                _logger.info("dealloc: %s %s bytes", dtor.__name__, size)
                dtor(handle)

            self._size = 0

    @contextlib.contextmanager
    def disable(self):
        """
        Context manager to temporarily disable flushing pending deallocation.
        This can be nested.
        """
        self._disable_count += 1
        try:
            yield
        finally:
            self._disable_count -= 1
            assert self._disable_count >= 0

    @property
    def is_disabled(self):
        # True while at least one disable() context is active.
        return self._disable_count > 0

    def __len__(self):
        """
        Returns number of pending deallocations.
        """
        return len(self._cons)
1091
+
1092
+
1093
+ MemoryInfo = namedtuple("MemoryInfo", "free,total")
1094
+ """Free and total memory for a device.
1095
+
1096
+ .. py:attribute:: free
1097
+
1098
+ Free device memory in bytes.
1099
+
1100
+ .. py:attribute:: total
1101
+
1102
+ Total device memory in bytes.
1103
+ """
1104
+
1105
+
1106
class Context(object):
    """
    This object wraps a CUDA Context resource.

    Contexts should not be constructed directly by user code.
    """

    def __init__(self, device, handle):
        self.device = device
        self.handle = handle
        self.allocations = utils.UniqueDict()
        self.deallocations = _PendingDeallocs()
        _ensure_memory_manager()
        self.memory_manager = _memory_manager(context=self)
        self.modules = utils.UniqueDict()
        # For storing context specific data
        self.extras = {}

    def reset(self):
        """
        Clean up all owned resources in this context.
        """
        # Free owned resources
        _logger.info("reset context of device %s", self.device.id)
        self.memory_manager.reset()
        self.modules.clear()
        # Clear trash
        self.deallocations.clear()

    def get_memory_info(self):
        """Returns (free, total) memory in bytes in the context."""
        return self.memory_manager.get_memory_info()

    def get_active_blocks_per_multiprocessor(
        self, func, blocksize, memsize, flags=None
    ):
        """Return occupancy of a function.
        :param func: kernel for which occupancy is calculated
        :param blocksize: block size the kernel is intended to be launched with
        :param memsize: per-block dynamic shared memory usage intended, in bytes
        """
        args = (func, blocksize, memsize, flags)
        return self._cuda_python_active_blocks_per_multiprocessor(*args)

    def _cuda_python_active_blocks_per_multiprocessor(
        self, func, blocksize, memsize, flags
    ):
        # cuda-python binding: the occupancy value is returned directly.
        ps = [func.handle, blocksize, memsize]

        if not flags:
            return driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(*ps)

        ps.append(flags)
        return driver.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(*ps)

    def _ctypes_active_blocks_per_multiprocessor(
        self, func, blocksize, memsize, flags
    ):
        # ctypes binding: the occupancy value is written to an out-parameter.
        retval = c_int()
        args = (byref(retval), func.handle, blocksize, memsize)

        if not flags:
            driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(*args)
        else:
            driver.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(*args)

        return retval.value

    def get_max_potential_block_size(
        self, func, b2d_func, memsize, blocksizelimit, flags=None
    ):
        """Suggest a launch configuration with reasonable occupancy.
        :param func: kernel for which occupancy is calculated
        :param b2d_func: function that calculates how much per-block dynamic
                         shared memory 'func' uses based on the block size.
                         Can also be the address of a C function.
                         Use `0` to pass `NULL` to the underlying CUDA API.
        :param memsize: per-block dynamic shared memory usage intended, in bytes
        :param blocksizelimit: maximum block size the kernel is designed to
                               handle
        """
        args = (func, b2d_func, memsize, blocksizelimit, flags)
        return self._cuda_python_max_potential_block_size(*args)

    def _ctypes_max_potential_block_size(
        self, func, b2d_func, memsize, blocksizelimit, flags
    ):
        gridsize = c_int()
        blocksize = c_int()
        b2d_cb = cu_occupancy_b2d_size(b2d_func)
        args = [
            byref(gridsize),
            byref(blocksize),
            func.handle,
            b2d_cb,
            memsize,
            blocksizelimit,
        ]

        if not flags:
            driver.cuOccupancyMaxPotentialBlockSize(*args)
        else:
            args.append(flags)
            driver.cuOccupancyMaxPotentialBlockSizeWithFlags(*args)

        return (gridsize.value, blocksize.value)

    def _cuda_python_max_potential_block_size(
        self, func, b2d_func, memsize, blocksizelimit, flags
    ):
        # Wrap the Python callback as a C function pointer, then re-wrap its
        # address as the binding's CUoccupancyB2DSize type.
        b2d_cb = ctypes.CFUNCTYPE(c_size_t, c_int)(b2d_func)
        ptr = int.from_bytes(b2d_cb, byteorder="little")
        driver_b2d_cb = binding.CUoccupancyB2DSize(ptr)
        args = [func.handle, driver_b2d_cb, memsize, blocksizelimit]

        if not flags:
            return driver.cuOccupancyMaxPotentialBlockSize(*args)
        else:
            args.append(flags)
            return driver.cuOccupancyMaxPotentialBlockSizeWithFlags(*args)

    def prepare_for_use(self):
        """Initialize the context for use.
        It's safe to be called multiple times.
        """
        self.memory_manager.initialize()

    def push(self):
        """
        Pushes this context on the current CPU Thread.
        """
        driver.cuCtxPushCurrent(self.handle.value)
        self.prepare_for_use()

    def pop(self):
        """
        Pops this context off the current CPU thread. Note that this context
        must be at the top of the context stack, otherwise an error will occur.
        """
        popped = driver.pop_active_context()
        assert popped.value == self.handle.value

    def memalloc(self, bytesize):
        return self.memory_manager.memalloc(bytesize)

    def memallocmanaged(self, bytesize, attach_global=True):
        return self.memory_manager.memallocmanaged(bytesize, attach_global)

    def memhostalloc(self, bytesize, mapped=False, portable=False, wc=False):
        return self.memory_manager.memhostalloc(bytesize, mapped, portable, wc)

    def mempin(self, owner, pointer, size, mapped=False):
        if mapped and not self.device.CAN_MAP_HOST_MEMORY:
            raise CudaDriverError("%s cannot map host memory" % self.device)
        return self.memory_manager.mempin(owner, pointer, size, mapped)

    def get_ipc_handle(self, memory):
        """
        Returns an *IpcHandle* from a GPU allocation.
        """
        if not SUPPORTS_IPC:
            raise OSError("OS does not support CUDA IPC")
        return self.memory_manager.get_ipc_handle(memory)

    def open_ipc_handle(self, handle, size):
        # open the IPC handle to get the device pointer
        flags = 1  # CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
        dptr = driver.cuIpcOpenMemHandle(handle, flags)

        # wrap it
        return MemoryPointer(
            context=weakref.proxy(self), pointer=dptr, size=size
        )

    def enable_peer_access(self, peer_context, flags=0):
        """Enable peer access between the current context and the peer context"""
        assert flags == 0, "*flags* is reserved and MUST be zero"
        driver.cuCtxEnablePeerAccess(peer_context, flags)

    def can_access_peer(self, peer_device):
        """Returns a bool indicating whether the peer access between the
        current and peer device is possible.
        """
        peer_device = binding.CUdevice(peer_device)
        can_access_peer = driver.cuDeviceCanAccessPeer(
            self.device.id, peer_device
        )

        return bool(can_access_peer)

    def create_module_ptx(self, ptx):
        if isinstance(ptx, str):
            ptx = ptx.encode("utf8")
        image = ObjectCode.from_ptx(ptx)
        return self.create_module_image(image)

    def create_module_image(
        self, image, setup_callbacks=None, teardown_callbacks=None
    ):
        module = load_module_image(
            self, image, setup_callbacks, teardown_callbacks
        )
        key = module.handle
        self.modules[key] = module
        # Hand out a proxy so module lifetime stays owned by this context.
        return weakref.proxy(module)

    def unload_module(self, module):
        key = module.handle
        del self.modules[key]

    def get_default_stream(self):
        handle = drvapi.cu_stream(int(binding.CUstream(CU_STREAM_DEFAULT)))
        return Stream(handle)

    def get_legacy_default_stream(self):
        handle = drvapi.cu_stream(
            int(binding.CUstream(binding.CU_STREAM_LEGACY))
        )
        return Stream(handle)

    def get_per_thread_default_stream(self):
        handle = drvapi.cu_stream(
            int(binding.CUstream(binding.CU_STREAM_PER_THREAD))
        )
        return Stream(handle)

    def create_stream(self):
        # The default stream creation flag, specifying that the created
        # stream synchronizes with stream 0 (this is different from the
        # default stream, which we define also as CU_STREAM_DEFAULT when
        # the NV binding is in use).
        flags = binding.CUstream_flags.CU_STREAM_DEFAULT.value
        handle = drvapi.cu_stream(int(driver.cuStreamCreate(flags)))
        return Stream(
            handle, finalizer=_stream_finalizer(self.deallocations, handle)
        )

    def create_external_stream(self, ptr):
        if not isinstance(ptr, int):
            raise TypeError("ptr for external stream must be an int")
        handle = drvapi.cu_stream(int(binding.CUstream(ptr)))
        return Stream(handle, external=True)

    def create_event(self, timing=True):
        flags = 0
        if not timing:
            flags |= enums.CU_EVENT_DISABLE_TIMING
        handle = drvapi.cu_event(int(driver.cuEventCreate(flags)))
        return Event(
            handle, finalizer=_event_finalizer(self.deallocations, handle)
        )

    def synchronize(self):
        driver.cuCtxSynchronize()

    @contextlib.contextmanager
    def defer_cleanup(self):
        with self.memory_manager.defer_cleanup():
            with self.deallocations.disable():
                yield

    def __repr__(self):
        return f"<CUDA context {self.handle} of device {self.device.id:d}>"

    def __eq__(self, other):
        if isinstance(other, Context):
            return self.handle == other.handle
        else:
            return NotImplemented

    def __ne__(self, other):
        # BUGFIX: the previous implementation returned
        # ``not self.__eq__(other)``, which evaluates ``not NotImplemented``
        # when *other* is not a Context -- False on Python < 3.12 (wrong
        # answer) and a TypeError on 3.12+. Propagate NotImplemented so the
        # interpreter can try the reflected operation instead.
        eq = self.__eq__(other)
        if eq is NotImplemented:
            return NotImplemented
        return not eq
1378
+
1379
+
1380
def load_module_image(
    context, image, setup_callbacks=None, teardown_callbacks=None
):
    """
    image must be a pointer
    """
    # Only the cuda-python (NVIDIA binding) loader is used here.
    loader = load_module_image_cuda_python
    return loader(context, image, setup_callbacks, teardown_callbacks)
1389
+
1390
+
1391
def load_module_image_ctypes(
    context, image, setup_callbacks, teardown_callbacks
):
    """Load *image* via the ctypes binding, capturing the JIT linker's
    info/error logs, and wrap the result in a :class:`CtypesModule`.
    """
    logsz = config.CUDA_LOG_SIZE

    # Buffers the JIT linker fills with informational and error output.
    info_buf = (c_char * logsz)()
    error_buf = (c_char * logsz)()

    options = {
        enums.CU_JIT_INFO_LOG_BUFFER: addressof(info_buf),
        enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
        enums.CU_JIT_ERROR_LOG_BUFFER: addressof(error_buf),
        enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
        enums.CU_JIT_LOG_VERBOSE: c_void_p(config.CUDA_VERBOSE_JIT_LOG),
    }

    option_keys = (drvapi.cu_jit_option * len(options))(*options.keys())
    option_vals = (c_void_p * len(options))(*options.values())
    handle = drvapi.cu_module()
    try:
        driver.cuModuleLoadDataEx(
            byref(handle), image, len(options), option_keys, option_vals
        )
    except CudaAPIError as e:
        # Surface the driver's error log in the raised exception.
        msg = "cuModuleLoadDataEx error:\n%s" % error_buf.value.decode("utf8")
        raise CudaAPIError(e.code, msg)

    info_log = info_buf.value

    return CtypesModule(
        weakref.proxy(context),
        handle,
        info_log,
        _module_finalizer(context, handle),
        setup_callbacks,
        teardown_callbacks,
    )
1428
+
1429
+
1430
def load_module_image_cuda_python(
    context, image, setup_callbacks, teardown_callbacks
):
    """
    image must be a pointer
    """
    logsz = config.CUDA_LOG_SIZE

    # Mutable buffers the driver fills with JIT info / error output.
    info_buf = bytearray(logsz)
    error_buf = bytearray(logsz)

    jit_option = binding.CUjit_option
    options = {
        jit_option.CU_JIT_INFO_LOG_BUFFER: info_buf,
        jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz,
        jit_option.CU_JIT_ERROR_LOG_BUFFER: error_buf,
        jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz,
        jit_option.CU_JIT_LOG_VERBOSE: config.CUDA_VERBOSE_JIT_LOG,
    }

    option_keys = list(options.keys())
    option_vals = list(options.values())

    try:
        handle = driver.cuModuleLoadDataEx(
            image.code, len(options), option_keys, option_vals
        )
    except CudaAPIError as e:
        # Surface the driver's error log in the raised exception.
        msg = "cuModuleLoadDataEx error:\n%s" % error_buf.decode("utf-8")
        raise CudaAPIError(e.code, msg)

    info_log = info_buf.decode("utf-8")

    return CudaPythonModule(
        weakref.proxy(context),
        handle,
        info_log,
        _module_finalizer(context, handle),
        setup_callbacks,
        teardown_callbacks,
    )
1472
+
1473
+
1474
+ def _alloc_finalizer(memory_manager, ptr, alloc_key, size):
1475
+ allocations = memory_manager.allocations
1476
+ deallocations = memory_manager.deallocations
1477
+
1478
+ def core():
1479
+ if allocations:
1480
+ allocations.pop(alloc_key, None)
1481
+ deallocations.add_item(driver.cuMemFree, ptr, size)
1482
+
1483
+ return core
1484
+
1485
+
1486
+ def _hostalloc_finalizer(memory_manager, ptr, alloc_key, size, mapped):
1487
+ """
1488
+ Finalize page-locked host memory allocated by `context.memhostalloc`.
1489
+
1490
+ This memory is managed by CUDA, and finalization entails deallocation. The
1491
+ issues noted in `_pin_finalizer` are not relevant in this case, and the
1492
+ finalization is placed in the `context.deallocations` queue along with
1493
+ finalization of device objects.
1494
+
1495
+ """
1496
+ allocations = memory_manager.allocations
1497
+ deallocations = memory_manager.deallocations
1498
+ if not mapped:
1499
+ size = _SizeNotSet
1500
+
1501
+ def core():
1502
+ if mapped and allocations:
1503
+ del allocations[alloc_key]
1504
+ deallocations.add_item(driver.cuMemFreeHost, ptr, size)
1505
+
1506
+ return core
1507
+
1508
+
1509
+ def _pin_finalizer(memory_manager, ptr, alloc_key, mapped):
1510
+ """
1511
+ Finalize temporary page-locking of host memory by `context.mempin`.
1512
+
1513
+ This applies to memory not otherwise managed by CUDA. Page-locking can
1514
+ be requested multiple times on the same memory, and must therefore be
1515
+ lifted as soon as finalization is requested, otherwise subsequent calls to
1516
+ `mempin` may fail with `CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED`, leading
1517
+ to unexpected behavior for the context managers `cuda.{pinned,mapped}`.
1518
+ This function therefore carries out finalization immediately, bypassing the
1519
+ `context.deallocations` queue.
1520
+
1521
+ """
1522
+ allocations = memory_manager.allocations
1523
+
1524
+ def core():
1525
+ if mapped and allocations:
1526
+ del allocations[alloc_key]
1527
+ driver.cuMemHostUnregister(ptr)
1528
+
1529
+ return core
1530
+
1531
+
1532
+ def _event_finalizer(deallocs, handle):
1533
+ def core():
1534
+ deallocs.add_item(driver.cuEventDestroy, handle.value)
1535
+
1536
+ return core
1537
+
1538
+
1539
+ def _stream_finalizer(deallocs, handle):
1540
+ def core():
1541
+ deallocs.add_item(driver.cuStreamDestroy, handle.value)
1542
+
1543
+ return core
1544
+
1545
+
1546
+ def _module_finalizer(context, handle):
1547
+ dealloc = context.deallocations
1548
+ modules = context.modules
1549
+ key = handle
1550
+
1551
+ def core():
1552
+ shutting_down = utils.shutting_down # early bind
1553
+
1554
+ def module_unload(handle):
1555
+ # If we are not shutting down, we must be called due to
1556
+ # Context.reset() of Context.unload_module(). Both must have
1557
+ # cleared the module reference from the context.
1558
+ assert shutting_down() or key not in modules
1559
+ driver.cuModuleUnload(handle)
1560
+
1561
+ dealloc.add_item(module_unload, handle)
1562
+
1563
+ return core
1564
+
1565
+
1566
+ class _CudaIpcImpl(object):
1567
+ """Implementation of GPU IPC using CUDA driver API.
1568
+ This requires the devices to be peer accessible.
1569
+ """
1570
+
1571
+ def __init__(self, parent):
1572
+ self.base = parent.base
1573
+ self.handle = parent.handle
1574
+ self.size = parent.size
1575
+ self.offset = parent.offset
1576
+ # remember if the handle is already opened
1577
+ self._opened_mem = None
1578
+
1579
+ def open(self, context):
1580
+ """
1581
+ Import the IPC memory and returns a raw CUDA memory pointer object
1582
+ """
1583
+ if self.base is not None:
1584
+ raise ValueError("opening IpcHandle from original process")
1585
+
1586
+ if self._opened_mem is not None:
1587
+ raise ValueError("IpcHandle is already opened")
1588
+
1589
+ mem = context.open_ipc_handle(self.handle, self.offset + self.size)
1590
+ # this object owns the opened allocation
1591
+ # note: it is required the memory be freed after the ipc handle is
1592
+ # closed by the importing context.
1593
+ self._opened_mem = mem
1594
+ return mem.own().view(self.offset)
1595
+
1596
+ def close(self):
1597
+ if self._opened_mem is None:
1598
+ raise ValueError("IpcHandle not opened")
1599
+ driver.cuIpcCloseMemHandle(self._opened_mem.handle)
1600
+ self._opened_mem = None
1601
+
1602
+
1603
+ class _StagedIpcImpl(object):
1604
+ """Implementation of GPU IPC using custom staging logic to workaround
1605
+ CUDA IPC limitation on peer accessibility between devices.
1606
+ """
1607
+
1608
+ def __init__(self, parent, source_info):
1609
+ self.parent = parent
1610
+ self.base = parent.base
1611
+ self.handle = parent.handle
1612
+ self.size = parent.size
1613
+ self.source_info = source_info
1614
+
1615
+ def open(self, context):
1616
+ from numba import cuda
1617
+
1618
+ srcdev = Device.from_identity(self.source_info)
1619
+ srcdev_id = int(srcdev.id)
1620
+
1621
+ impl = _CudaIpcImpl(parent=self.parent)
1622
+ # Open context on the source device.
1623
+ with cuda.gpus[srcdev_id]:
1624
+ source_ptr = impl.open(cuda.devices.get_context())
1625
+
1626
+ # Allocate GPU buffer.
1627
+ newmem = context.memalloc(self.size)
1628
+ # Do D->D from the source peer-context
1629
+ # This performs automatic host staging
1630
+ device_to_device(newmem, source_ptr, self.size)
1631
+
1632
+ # Cleanup source context
1633
+ with cuda.gpus[srcdev_id]:
1634
+ impl.close()
1635
+
1636
+ return newmem
1637
+
1638
+ def close(self):
1639
+ # Nothing has to be done here
1640
+ pass
1641
+
1642
+
1643
class IpcHandle(object):
    """
    CUDA IPC handle. Serialization of the CUDA IPC handle object is implemented
    here.

    :param base: A reference to the original allocation to keep it alive
    :type base: MemoryPointer
    :param handle: The CUDA IPC handle, as a ctypes array of bytes.
    :param size: Size of the original allocation
    :type size: int
    :param source_info: The identity of the device on which the IPC handle was
                        opened.
    :type source_info: dict
    :param offset: The offset into the underlying allocation of the memory
                   referred to by this IPC handle.
    :type offset: int
    """

    def __init__(self, base, handle, size, source_info=None, offset=0):
        self.base = base
        self.handle = handle
        self.size = size
        self.source_info = source_info
        self._impl = None
        self.offset = offset

    def _sentry_source_info(self):
        if self.source_info is None:
            raise RuntimeError("IPC handle doesn't have source info")

    def can_access_peer(self, context):
        """Returns a bool indicating whether the active context can peer
        access the IPC handle
        """
        self._sentry_source_info()
        # Same device as the source is trivially accessible.
        if self.source_info == context.device.get_device_identity():
            return True
        source_device = Device.from_identity(self.source_info)
        return context.can_access_peer(source_device.id)

    def open_staged(self, context):
        """Open the IPC by allowing staging on the host memory first."""
        self._sentry_source_info()
        if self._impl is not None:
            raise ValueError("IpcHandle is already opened")
        self._impl = _StagedIpcImpl(self, self.source_info)
        return self._impl.open(context)

    def open_direct(self, context):
        """
        Import the IPC memory and returns a raw CUDA memory pointer object
        """
        if self._impl is not None:
            raise ValueError("IpcHandle is already opened")
        self._impl = _CudaIpcImpl(self)
        return self._impl.open(context)

    def open(self, context):
        """Open the IPC handle and import the memory for usage in the given
        context. Returns a raw CUDA memory pointer object.

        This is enhanced over CUDA IPC that it will work regardless of whether
        the source device is peer-accessible by the destination device.
        If the devices are peer-accessible, it uses .open_direct().
        If the devices are not peer-accessible, it uses .open_staged().
        """
        direct = self.source_info is None or self.can_access_peer(context)
        opener = self.open_direct if direct else self.open_staged
        return opener(context)

    def open_array(self, context, shape, dtype, strides=None):
        """
        Similar to `.open()` but returns an device array.
        """
        from . import devicearray

        # by default, set strides to itemsize
        if strides is None:
            strides = dtype.itemsize
        dptr = self.open(context)
        # Wrap the imported device pointer as an array.
        return devicearray.DeviceNDArray(
            shape=shape, strides=strides, dtype=dtype, gpu_data=dptr
        )

    def close(self):
        if self._impl is None:
            raise ValueError("IpcHandle not opened")
        self._impl.close()
        self._impl = None

    def __reduce__(self):
        # Preprocess the IPC handle, which is defined as a byte array.
        preprocessed_handle = self.handle.reserved
        args = (
            self.__class__,
            preprocessed_handle,
            self.size,
            self.source_info,
            self.offset,
        )
        return (serialize._rebuild_reduction, args)

    @classmethod
    def _rebuild(cls, handle_ary, size, source_info, offset):
        handle = binding.CUipcMemHandle()
        handle.reserved = handle_ary
        return cls(
            base=None,
            handle=handle,
            size=size,
            source_info=source_info,
            offset=offset,
        )
1762
+
1763
+
1764
class MemoryPointer(object):
    """A memory pointer that owns a buffer, with an optional finalizer. Memory
    pointers provide reference counting, and instances are initialized with a
    reference count of 1.

    The base ``MemoryPointer`` class does not use the
    reference count for managing the buffer lifetime. Instead, the buffer
    lifetime is tied to the memory pointer instance's lifetime:

    - When the instance is deleted, the finalizer will be called.
    - When the reference count drops to 0, no action is taken.

    Subclasses of ``MemoryPointer`` may modify these semantics, for example to
    tie the buffer lifetime to the reference count, so that the buffer is freed
    when there are no more references.

    :param context: The context in which the pointer was allocated.
    :type context: Context
    :param pointer: The address of the buffer.
    :type pointer: ctypes.c_void_p
    :param size: The size of the allocation in bytes.
    :type size: int
    :param owner: The owner is sometimes set by the internals of this class, or
                  used for Numba's internal memory management. It should not be
                  provided by an external user of the ``MemoryPointer`` class
                  (e.g. from within an EMM Plugin); the default of `None`
                  should always suffice.
    :type owner: NoneType
    :param finalizer: A function that is called when the buffer is to be freed.
    :type finalizer: function
    """

    __cuda_memory__ = True

    def __init__(self, context, pointer, size, owner=None, finalizer=None):
        # Normalize a ctypes pointer to the NVIDIA binding's CUdeviceptr.
        if isinstance(pointer, ctypes.c_void_p):
            pointer = binding.CUdeviceptr(pointer.value)

        self.context = context
        self.device_pointer = pointer
        self.size = size
        self._cuda_memsize_ = size
        self.is_managed = finalizer is not None
        self.refct = 1
        self.handle = self.device_pointer
        self._owner = owner

        if finalizer is not None:
            self._finalizer = weakref.finalize(self, finalizer)

    @property
    def owner(self):
        if self._owner is None:
            return self
        return self._owner

    def own(self):
        return OwnedPointer(weakref.proxy(self))

    def free(self):
        """
        Forces the device memory to the trash.
        """
        if not self.is_managed:
            return
        if not self._finalizer.alive:
            raise RuntimeError("Freeing dead memory")
        self._finalizer()
        assert not self._finalizer.alive

    def memset(self, byte, count=None, stream=0):
        if count is None:
            count = self.size
        if stream:
            driver.cuMemsetD8Async(
                self.device_pointer, byte, count, stream.handle.value
            )
        else:
            driver.cuMemsetD8(self.device_pointer, byte, count)

    def view(self, start, stop=None):
        end = self.size if stop is None else stop
        size = end - start

        # Handle NULL/empty memory buffer
        if not self.device_pointer_value:
            if size != 0:
                raise RuntimeError("non-empty slice into empty slice")
            view = self  # new view is just a reference to self
        # Handle normal case
        else:
            base = self.device_pointer_value + start
            if size < 0:
                raise RuntimeError("size cannot be negative")
            pointer = binding.CUdeviceptr()
            ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr())
            ctypes_ptr.value = base
            view = MemoryPointer(self.context, pointer, size, owner=self.owner)

        if isinstance(self.owner, (MemoryPointer, OwnedPointer)):
            # Owned by a numba-managed memory segment, take an owned reference
            return OwnedPointer(weakref.proxy(self.owner), view)
        # Owned by external alloc, return view with same external owner
        return view

    @property
    def device_ctypes_pointer(self):
        return drvapi.cu_device_ptr(int(self.device_pointer))

    @property
    def device_pointer_value(self):
        # NULL pointers are reported as None rather than 0.
        return int(self.device_pointer) or None
1874
+
1875
+
1876
class AutoFreePointer(MemoryPointer):
    """Modifies the ownership semantic of the MemoryPointer so that the
    instance lifetime is directly tied to the number of references.

    When the reference count reaches zero, the finalizer is invoked.

    Constructor arguments are the same as for :class:`MemoryPointer`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Release the self-reference to the buffer, so that the finalizer is
        # invoked once all the derived pointers are gone.
        self.refct -= 1
1890
+
1891
+
1892
class MappedMemory(AutoFreePointer):
    """A memory pointer that refers to a buffer on the host that is mapped into
    device memory.

    :param context: The context in which the pointer was mapped.
    :type context: Context
    :param pointer: The address of the buffer.
    :type pointer: ctypes.c_void_p
    :param size: The size of the buffer in bytes.
    :type size: int
    :param owner: The owner is sometimes set by the internals of this class, or
                  used for Numba's internal memory management. It should not be
                  provided by an external user of the ``MappedMemory`` class
                  (e.g. from within an EMM Plugin); the default of `None`
                  should always suffice.
    :type owner: NoneType
    :param finalizer: A function that is called when the buffer is to be freed.
    :type finalizer: function
    """

    __cuda_memory__ = True

    def __init__(self, context, pointer, size, owner=None, finalizer=None):
        self.owned = owner
        self.host_pointer = pointer

        # Resolve the device-side alias of the mapped host allocation.
        devptr = driver.cuMemHostGetDevicePointer(pointer, 0)
        self._bufptr_ = self.host_pointer

        self.device_pointer = devptr
        super().__init__(context, devptr, size, finalizer=finalizer)
        # The public handle is the host-side pointer, not the device alias.
        self.handle = self.host_pointer

        # For buffer interface
        self._buflen_ = self.size

    def own(self):
        return MappedOwnedPointer(weakref.proxy(self))
1932
+
1933
+
1934
class PinnedMemory(mviewbuf.MemAlloc):
    """A pointer to a pinned buffer on the host.

    :param context: The context in which the pointer was mapped.
    :type context: Context
    :param pointer: The address of the buffer.
    :type pointer: ctypes.c_void_p
    :param size: The size of the buffer in bytes.
    :type size: int
    :param owner: An object owning the buffer that has been pinned. For EMM
                  plugin implementation, the default of ``None`` suffices for
                  memory allocated in ``memhostalloc`` - for ``mempin``, it
                  should be the owner passed in to the ``mempin`` method.
    :param finalizer: A function that is called when the buffer is to be freed.
    :type finalizer: function
    """

    def __init__(self, context, pointer, size, owner=None, finalizer=None):
        self.context = context
        self.owned = owner
        self.size = size
        self.host_pointer = pointer
        self.is_managed = finalizer is not None
        self.handle = self.host_pointer

        # For buffer interface
        self._buflen_ = self.size
        self._bufptr_ = self.host_pointer

        if finalizer is not None:
            # No refcounting is used for pinned memory; register the
            # finalizer directly on this instance.
            weakref.finalize(self, finalizer)

    def own(self):
        # Pinned memory acts as its own owner; no wrapper object is needed.
        return self
1970
+
1971
+
1972
class ManagedMemory(AutoFreePointer):
    """A memory pointer that refers to a managed memory buffer (can be accessed
    on both host and device).

    :param context: The context in which the pointer was mapped.
    :type context: Context
    :param pointer: The address of the buffer.
    :type pointer: ctypes.c_void_p
    :param size: The size of the buffer in bytes.
    :type size: int
    :param owner: The owner is sometimes set by the internals of this class, or
                  used for Numba's internal memory management. It should not be
                  provided by an external user of the ``ManagedMemory`` class
                  (e.g. from within an EMM Plugin); the default of `None`
                  should always suffice.
    :type owner: NoneType
    :param finalizer: A function that is called when the buffer is to be freed.
    :type finalizer: function
    """

    __cuda_memory__ = True

    def __init__(self, context, pointer, size, owner=None, finalizer=None):
        self.owned = owner
        # Managed memory: the same pointer serves as the device pointer.
        super().__init__(context, pointer, size, finalizer=finalizer)

        # For buffer interface
        self._buflen_ = self.size
        self._bufptr_ = self.device_pointer

    def own(self):
        return ManagedOwnedPointer(weakref.proxy(self))
2005
+
2006
+
2007
class OwnedPointer(object):
    """A reference-counting proxy over a memory pointer.

    Construction increments the underlying allocation's refcount; a weakref
    finalizer decrements it again when this proxy is collected, freeing the
    allocation once the count reaches zero. Attribute access is forwarded to
    the (possibly distinct) view pointer.
    """

    def __init__(self, memptr, view=None):
        self._mem = memptr

        if view is not None:
            # Views of externally-managed memory are not supported here.
            assert not view.is_managed
        self._view = self._mem if view is None else view

        underlying = self._mem

        def _release():
            # Drop one reference; free the allocation at zero. The underlying
            # object may itself be a weak proxy that is already dead, in which
            # case there is nothing left to release.
            try:
                underlying.refct -= 1
                assert underlying.refct >= 0
                if underlying.refct == 0:
                    underlying.free()
            except ReferenceError:
                pass

        self._mem.refct += 1
        weakref.finalize(self, _release)

        # Cache this frequently-used attribute so hot paths bypass the
        # __getattr__ forwarding below.
        self.device_ctypes_pointer = self._view.device_ctypes_pointer

    def __getattr__(self, fname):
        """Proxy MemoryPointer methods"""
        return getattr(self._view, fname)
2039
+
2040
+
2041
class MappedOwnedPointer(OwnedPointer, mviewbuf.MemAlloc):
    # Owned pointer over host-mapped memory; the MemAlloc mixin supplies the
    # buffer protocol using the proxied _buflen_/_bufptr_ attributes.
    pass
2043
+
2044
+
2045
class ManagedOwnedPointer(OwnedPointer, mviewbuf.MemAlloc):
    # Owned pointer over managed memory; the MemAlloc mixin supplies the
    # buffer protocol using the proxied _buflen_/_bufptr_ attributes.
    pass
2047
+
2048
+
2049
class Stream:
    """Wraps a CUDA stream handle.

    :param handle: The stream handle (an object with a ``.value`` attribute
                   holding the raw stream address).
    :param finalizer: Optional function invoked when this object is collected
                      (e.g. to destroy the stream).
    :param external: True when the stream was created outside Numba and is
                     merely wrapped here.
    """

    def __init__(self, handle, finalizer=None, external=False):
        self.handle = handle
        self.external = external
        if finalizer is not None:
            weakref.finalize(self, finalizer)

    def __int__(self):
        # The default stream's handle.value is 0, which gives `None`
        return self.handle.value or drvapi.CU_STREAM_DEFAULT

    def __cuda_stream__(self):
        # Stream interchange protocol: a (version, stream address) pair.
        if not self.handle.value:
            return (0, drvapi.CU_STREAM_DEFAULT)
        return (0, self.handle.value)

    def __repr__(self):
        default_streams = {
            drvapi.CU_STREAM_DEFAULT: "<Default CUDA stream>",
            drvapi.CU_STREAM_LEGACY: "<Legacy default CUDA stream>",
            drvapi.CU_STREAM_PER_THREAD: "<Per-thread default CUDA stream>",
        }
        ptr = self.handle.value or drvapi.CU_STREAM_DEFAULT

        if ptr in default_streams:
            return default_streams[ptr]
        elif self.external:
            return f"<External CUDA stream {ptr:d}>"
        else:
            return f"<CUDA stream {ptr:d}>"

    def synchronize(self):
        """
        Wait for all commands in this stream to execute. This will commit any
        pending memory transfers.
        """
        handle = self.handle.value
        driver.cuStreamSynchronize(handle)

    @contextlib.contextmanager
    def auto_synchronize(self):
        """
        A context manager that waits for all commands in this stream to execute
        and commits any pending memory transfers upon exiting the context.
        """
        yield self
        self.synchronize()

    def add_callback(self, callback, arg=None):
        """
        Add a callback to a compute stream.
        The user provided function is called from a driver thread once all
        preceding stream operations are complete.

        Callback functions are called from a CUDA driver thread, not from
        the thread that invoked `add_callback`. No CUDA API functions may
        be called from within the callback function.

        The duration of a callback function should be kept short, as the
        callback will block later work in the stream and may block other
        callbacks from being executed.

        .. warning::
            There is a potential for deadlock due to a lock ordering issue
            between the GIL and the CUDA driver lock when using libraries
            that call CUDA functions without releasing the GIL. This can
            occur when the callback function, which holds the CUDA driver lock,
            attempts to acquire the GIL while another thread that holds the GIL
            is waiting for the CUDA driver lock. Consider using libraries that
            properly release the GIL around CUDA operations or restructure
            your code to avoid this situation.

        Note: The driver function underlying this method is marked for
        eventual deprecation and may be replaced in a future CUDA release.

        :param callback: Callback function with arguments (stream, status, arg).
        :param arg: Optional user data to be passed to the callback function.
        """
        data = (self, callback, arg)
        # Keep the tuple alive until the driver invokes the callback; the
        # matching decref happens in _stream_callback's finally clause.
        _py_incref(data)
        # NOTE(review): this reads the raw function-pointer bytes of the
        # ctypes callback object via the buffer protocol and reassembles the
        # address as an int (little-endian assumed) — confirm against
        # cu_stream_callback_pyobj.
        ptr = int.from_bytes(self._stream_callback, byteorder="little")
        stream_callback = binding.CUstreamCallback(ptr)
        # The callback needs to receive a pointer to the data PyObject
        data = id(data)
        handle = self.handle.value
        driver.cuStreamAddCallback(handle, stream_callback, data, 0)

    @staticmethod
    @cu_stream_callback_pyobj
    def _stream_callback(handle, status, data):
        # `data` is the (stream, callback, arg) tuple recovered from the raw
        # pointer by the cu_stream_callback_pyobj decorator.
        try:
            stream, callback, arg = data
            callback(stream, status, arg)
        except Exception as e:
            warnings.warn(f"Exception in stream callback: {e}")
        finally:
            # Balance the _py_incref performed in add_callback.
            _py_decref(data)

    def async_done(self) -> asyncio.futures.Future:
        """
        Return an awaitable that resolves once all preceding stream operations
        are complete. The result of the awaitable is the current stream.

        .. warning::
            There is a potential for deadlock due to a lock ordering issue
            between the GIL and the CUDA driver lock when using libraries
            that call CUDA functions without releasing the GIL. This can
            occur when the callback function (internally used by this method),
            which holds the CUDA driver lock, attempts to acquire the GIL
            while another thread that holds the GIL is waiting for the CUDA driver lock.
            Consider using libraries that properly release the GIL around
            CUDA operations or restructure your code to avoid this situation.
        """
        loop = asyncio.get_running_loop()
        future = loop.create_future()

        def resolver(future, status):
            # The future may already be done (e.g. cancelled) by the time the
            # driver callback fires.
            if future.done():
                return
            elif status == 0:
                future.set_result(self)
            else:
                future.set_exception(Exception(f"Stream error {status}"))

        def callback(stream, status, future):
            # Runs on a CUDA driver thread; hop back to the event loop.
            loop.call_soon_threadsafe(resolver, future, status)

        self.add_callback(callback, future)
        return future
2178
+
2179
+
2180
class Event:
    """Wraps a CUDA event handle.

    :param handle: The event handle.
    :param finalizer: Optional function invoked when this object is collected
                      (e.g. to destroy the event).
    """

    def __init__(self, handle, finalizer=None):
        self.handle = handle
        if finalizer is not None:
            weakref.finalize(self, finalizer)

    def query(self):
        """
        Returns True if all work before the most recent record has completed;
        otherwise, returns False.
        """
        try:
            driver.cuEventQuery(self.handle)
        except CudaAPIError as e:
            # CUDA_ERROR_NOT_READY is the expected "still running" signal,
            # not a failure.
            if e.code == enums.CUDA_ERROR_NOT_READY:
                return False
            else:
                raise
        else:
            return True

    def record(self, stream=0):
        """
        Set the record point of the event to the current point in the given
        stream.

        The event will be considered to have occurred when all work that was
        queued in the stream at the time of the call to ``record()`` has been
        completed.
        """
        hstream = _stream_handle(stream)
        handle = self.handle.value
        driver.cuEventRecord(handle, hstream)

    def synchronize(self):
        """
        Synchronize the host thread for the completion of the event.
        """
        handle = self.handle.value
        driver.cuEventSynchronize(handle)

    def wait(self, stream=0):
        """
        All future works submitted to stream will wait util the event completes.
        """
        hstream = _stream_handle(stream)
        handle = self.handle.value
        flags = 0
        driver.cuStreamWaitEvent(hstream, handle, flags)

    def elapsed_time(self, evtend):
        # Milliseconds between this event and `evtend`; see
        # event_elapsed_time.
        return event_elapsed_time(self, evtend)
2232
+
2233
+
2234
def event_elapsed_time(evtstart, evtend):
    """
    Compute the elapsed time between two events in milliseconds.

    :param evtstart: The earlier ``Event``.
    :param evtend: The later ``Event``.
    :return: Elapsed time in milliseconds, as reported by the driver.
    """
    return driver.cuEventElapsedTime(evtstart.handle.value, evtend.handle.value)
2239
+
2240
+
2241
class Module(metaclass=ABCMeta):
    """Abstract base class for modules

    :param context: The context the module is loaded in.
    :param handle: The module handle.
    :param info_log: The info log produced when loading the module.
    :param finalizer: Optional function invoked when this object is collected.
    :param setup_callbacks: Optional iterable of functions called with the
                            module handle by ``setup()``.
    :param teardown_callbacks: Optional iterable of functions called with the
                               module handle when this object is collected.
    """

    def __init__(
        self,
        context,
        handle,
        info_log,
        finalizer=None,
        setup_callbacks=None,
        teardown_callbacks=None,
    ):
        self.context = context
        self.handle = handle
        self.info_log = info_log
        if finalizer is not None:
            self._finalizer = weakref.finalize(self, finalizer)

        self.initialized = False
        self.setup_functions = setup_callbacks
        self.teardown_functions = teardown_callbacks

        # Register teardown callbacks now so they run even if setup() is
        # never called.
        self._set_finalizers()

    def unload(self):
        """Unload this module from the context"""
        self.context.unload_module(self)

    @abstractmethod
    def get_function(self, name):
        """Returns a Function object encapsulating the named function"""

    @abstractmethod
    def get_global_symbol(self, name):
        """Return a MemoryPointer referring to the named symbol"""

    def setup(self):
        """Call the setup functions for the module"""
        if self.initialized:
            raise RuntimeError("The module has already been initialized.")

        if self.setup_functions is None:
            return

        for f in self.setup_functions:
            f(self.handle)

        self.initialized = True

    def _set_finalizers(self):
        """Create finalizers that tear down the module."""
        if self.teardown_functions is None:
            return

        # The teardown closure deliberately captures only the callbacks and
        # the handle (not `self`) so the finalizer does not keep this object
        # alive.
        def _teardown(teardowns, handle):
            for f in teardowns:
                f(handle)

        weakref.finalize(
            self,
            _teardown,
            self.teardown_functions,
            self.handle,
        )
2305
+
2306
+
2307
class CtypesModule(Module):
    # Module implementation backed by the ctypes driver bindings: results are
    # returned through output parameters passed with byref().

    def get_function(self, name):
        handle = drvapi.cu_function()
        driver.cuModuleGetFunction(
            byref(handle), self.handle, name.encode("utf8")
        )
        # A weak proxy avoids a strong reference cycle module <-> function.
        return CtypesFunction(weakref.proxy(self), handle, name)

    def get_global_symbol(self, name):
        ptr = drvapi.cu_device_ptr()
        size = drvapi.c_size_t()
        driver.cuModuleGetGlobal(
            byref(ptr), byref(size), self.handle, name.encode("utf8")
        )
        return MemoryPointer(self.context, ptr, size), size.value
2322
+
2323
+
2324
class CudaPythonModule(Module):
    # Module implementation backed by the NVIDIA cuda-python bindings: results
    # are returned directly rather than via output parameters.

    def get_function(self, name):
        handle = driver.cuModuleGetFunction(self.handle, name.encode("utf8"))
        # A weak proxy avoids a strong reference cycle module <-> function.
        return CudaPythonFunction(weakref.proxy(self), handle, name)

    def get_global_symbol(self, name):
        ptr, size = driver.cuModuleGetGlobal(self.handle, name.encode("utf8"))
        return MemoryPointer(self.context, ptr, size), size
2332
+
2333
+
2334
# Per-kernel attributes reported by the driver: register count, shared /
# local / constant memory sizes, and the maximum threads per block.
FuncAttr = namedtuple(
    "FuncAttr",
    "regs shared local const maxthreads",
)
2337
+
2338
+
2339
class Function(metaclass=ABCMeta):
    """Abstract base class wrapping a CUDA function (kernel) handle."""

    # Default launch configuration; callers may override per launch.
    griddim = 1, 1, 1
    blockdim = 1, 1, 1
    stream = 0
    sharedmem = 0

    def __init__(self, module, handle, name):
        self.module = module
        self.handle = handle
        self.name = name
        # Eagerly query and cache the function attributes.
        self.attrs = self.read_func_attr_all()

    def __repr__(self):
        return "<CUDA function %s>" % self.name

    @property
    def device(self):
        # The device is reached through the owning module's context.
        return self.module.context.device

    @abstractmethod
    def cache_config(
        self, prefer_equal=False, prefer_cache=False, prefer_shared=False
    ):
        """Set the cache configuration for this function."""

    @abstractmethod
    def read_func_attr(self, attrid):
        """Return the value of the attribute with given ID."""

    @abstractmethod
    def read_func_attr_all(self):
        """Return a FuncAttr object with the values of various function
        attributes."""
2372
+
2373
+
2374
class CtypesFunction(Function):
    # Function implementation backed by the ctypes driver bindings.

    def cache_config(
        self, prefer_equal=False, prefer_cache=False, prefer_shared=False
    ):
        # Requesting both L1 and shared preference is treated as "equal".
        prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
        if prefer_equal:
            flag = enums.CU_FUNC_CACHE_PREFER_EQUAL
        elif prefer_cache:
            flag = enums.CU_FUNC_CACHE_PREFER_L1
        elif prefer_shared:
            flag = enums.CU_FUNC_CACHE_PREFER_SHARED
        else:
            flag = enums.CU_FUNC_CACHE_PREFER_NONE
        driver.cuFuncSetCacheConfig(self.handle, flag)

    def read_func_attr(self, attrid):
        retval = c_int()
        driver.cuFuncGetAttribute(byref(retval), attrid, self.handle)
        return retval.value

    def read_func_attr_all(self):
        nregs = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_NUM_REGS)
        cmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
        lmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
        smem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
        maxtpb = self.read_func_attr(
            enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
        )
        return FuncAttr(
            regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
        )
2405
+
2406
+
2407
class CudaPythonFunction(Function):
    # Function implementation backed by the NVIDIA cuda-python bindings.

    def cache_config(
        self, prefer_equal=False, prefer_cache=False, prefer_shared=False
    ):
        # Requesting both L1 and shared preference is treated as "equal".
        prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
        attr = binding.CUfunction_attribute
        if prefer_equal:
            flag = attr.CU_FUNC_CACHE_PREFER_EQUAL
        elif prefer_cache:
            flag = attr.CU_FUNC_CACHE_PREFER_L1
        elif prefer_shared:
            flag = attr.CU_FUNC_CACHE_PREFER_SHARED
        else:
            flag = attr.CU_FUNC_CACHE_PREFER_NONE
        driver.cuFuncSetCacheConfig(self.handle, flag)

    def read_func_attr(self, attrid):
        return driver.cuFuncGetAttribute(attrid, self.handle)

    def read_func_attr_all(self):
        attr = binding.CUfunction_attribute
        nregs = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_NUM_REGS)
        cmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
        lmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
        smem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
        maxtpb = self.read_func_attr(
            attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
        )
        return FuncAttr(
            regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
        )
2438
+
2439
+
2440
def launch_kernel(
    cufunc_handle,
    gx,
    gy,
    gz,
    bx,
    by,
    bz,
    sharedmem,
    hstream,
    args,
    cooperative=False,
):
    """Launch a kernel on the given stream.

    :param cufunc_handle: Handle of the function to launch.
    :param gx, gy, gz: Grid dimensions.
    :param bx, by, bz: Block dimensions.
    :param sharedmem: Dynamic shared memory size in bytes.
    :param hstream: Handle of the stream to launch on.
    :param args: Sequence of ctypes kernel arguments.
    :param cooperative: If True, launch via cuLaunchCooperativeKernel.

    NOTE: the addresses in `params` point into the ctypes objects in `args`;
    those objects must stay alive until the launch call returns.
    """
    # Build the kernelParams array: an array of pointers to each argument.
    param_ptrs = [addressof(arg) for arg in args]
    params = (c_void_p * len(param_ptrs))(*param_ptrs)

    params_for_launch = addressof(params)
    # No "extra" launch options are used.
    extra = 0

    if cooperative:
        driver.cuLaunchCooperativeKernel(
            cufunc_handle,
            gx,
            gy,
            gz,
            bx,
            by,
            bz,
            sharedmem,
            hstream,
            params_for_launch,
        )
    else:
        driver.cuLaunchKernel(
            cufunc_handle,
            gx,
            gy,
            gz,
            bx,
            by,
            bz,
            sharedmem,
            hstream,
            params_for_launch,
            extra,
        )
2486
+
2487
+
2488
class _LinkerBase(metaclass=ABCMeta):
    """Abstract base class for linkers"""

    @classmethod
    def new(
        cls,
        max_registers=0,
        lineinfo=False,
        cc=None,
        lto=None,
        additional_flags=None,
    ):
        # Factory for a concrete linker instance. Currently the nvjitlink-based
        # _Linker is always selected; the else branch guards alternative
        # implementations that do not support LTO or extra flags.
        linker = _Linker

        params = (max_registers, lineinfo, cc)
        if linker is _Linker:
            params = (*params, lto, additional_flags)
        else:
            if lto or additional_flags:
                raise ValueError("LTO and additional flags require nvjitlink")

        return linker(*params)

    @abstractmethod
    def __init__(self, max_registers, lineinfo, cc):
        # LTO unsupported in Numba at present, but the pynvjitlink linker
        # (https://github.com/rapidsai/pynvjitlink) supports it,
        self.lto = False

    @property
    @abstractmethod
    def info_log(self):
        """Return the info log from the linker invocation"""

    @property
    @abstractmethod
    def error_log(self):
        """Return the error log from the linker invocation"""

    @abstractmethod
    def add_ptx(self, ptx, name):
        """Add PTX source in a string to the link"""

    def add_cu(self, cu, name):
        """Add CUDA source in a string to the link. The name of the source
        file should be specified in `name`."""
        # Compile the CUDA C++ source to PTX with NVRTC first.
        ptx, log = nvrtc.compile(cu, name, self.cc)

        if config.DUMP_ASSEMBLY:
            print(("ASSEMBLY %s" % name).center(80, "-"))
            print(ptx)
            print("=" * 80)

        # Link the program's PTX using the normal linker mechanism
        ptx_name = os.path.splitext(name)[0] + ".ptx"
        self.add_ptx(ptx.encode(), ptx_name)

    @abstractmethod
    def add_data(self, data, kind, name):
        """Add in-memory data to the link"""

    @abstractmethod
    def add_file(self, path, kind):
        """Add code from a file to the link"""

    def add_cu_file(self, path):
        # Read the .cu source and add it under its basename.
        cu = cached_file_read(path, how="rb")
        self.add_cu(cu, os.path.basename(path))

    def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
        """
        Add a file or LinkableCode object to the link. If a file is
        passed, the type will be inferred from the extension. A LinkableCode
        object represents a file already in memory.

        When `ignore_nonlto` is set to true, do not add code that will not
        be LTO-ed in the linking process. This is useful in inspecting the
        LTO-ed portion of the PTX when linker is added with objects that can be
        both LTO-ed and not LTO-ed.
        """
        if isinstance(path_or_code, str):
            ext = pathlib.Path(path_or_code).suffix
            if ext == "":
                raise RuntimeError(
                    "Don't know how to link file with no extension"
                )
            elif ext == ".cu":
                self.add_cu_file(path_or_code)
            else:
                kind = FILE_EXTENSION_MAP.get(ext.lstrip("."), None)
                if kind is None:
                    raise RuntimeError(
                        f"Don't know how to link file with extension {ext}"
                    )

                if ignore_nonlto:
                    warn_and_return = False
                    # Fatbins and objects may or may not carry NVVM IR;
                    # inspect their contents to decide.
                    if kind in (
                        FILE_EXTENSION_MAP["fatbin"],
                        FILE_EXTENSION_MAP["o"],
                    ):
                        entry_types = inspect_obj_content(path_or_code)
                        if "nvvm" not in entry_types:
                            warn_and_return = True
                    elif kind != FILE_EXTENSION_MAP["ltoir"]:
                        # Anything other than LTO-IR is not link-time
                        # optimizable.
                        warn_and_return = True

                    if warn_and_return:
                        warnings.warn(
                            f"Not adding {path_or_code} as it is not "
                            "optimizable at link time, and `ignore_nonlto == "
                            "True`."
                        )
                        return

                self.add_file(path_or_code, kind)
            return
        else:
            # Otherwise, we should have been given a LinkableCode object
            if not isinstance(path_or_code, LinkableCode):
                raise TypeError(
                    "Expected path to file or a LinkableCode object"
                )

            if path_or_code.kind == "cu":
                self.add_cu(path_or_code.data, path_or_code.name)
            else:
                if ignore_nonlto:
                    warn_and_return = False
                    if isinstance(path_or_code, (Fatbin, Object)):
                        # Write to a temporary file so the on-disk inspection
                        # helper can examine the in-memory data.
                        with tempfile.NamedTemporaryFile("w") as fp:
                            fp.write(path_or_code.data)
                            entry_types = inspect_obj_content(fp.name)
                        if "nvvm" not in entry_types:
                            warn_and_return = True
                    elif not isinstance(path_or_code, LTOIR):
                        warn_and_return = True

                    if warn_and_return:
                        warnings.warn(
                            f"Not adding {path_or_code.name} as it is not "
                            "optimizable at link time, and `ignore_nonlto == "
                            "True`."
                        )
                        return

                self.add_data(
                    path_or_code.data, path_or_code.kind, path_or_code.name
                )

    @abstractmethod
    def complete(self):
        """Complete the link. Returns (cubin, size)

        cubin is a pointer to a internal buffer of cubin owned by the linker;
        thus, it should be loaded before the linker is destroyed.
        """
2645
+
2646
+
2647
class _Linker(_LinkerBase):
    """Linker implementation built on the cuda.core Linker / ObjectCode API.

    Inputs are accumulated as ObjectCode instances and linked in one shot by
    ``complete()`` (cubin) or ``get_linked_ptx()`` (LTO to PTX).
    """

    def __init__(
        self,
        max_registers=None,
        lineinfo=False,
        cc=None,
        lto=None,
        additional_flags=None,
    ):
        arch = f"sm_{cc[0]}{cc[1]}"
        # Normalize 0 / falsy to None so the option is omitted entirely.
        self.max_registers = max_registers if max_registers else None
        self.lineinfo = lineinfo
        self.cc = cc
        self.arch = arch
        if lto is False:
            # WAR for apparent nvjitlink issue
            lto = None
        self.lto = lto
        self.additional_flags = additional_flags

        self.options = LinkerOptions(
            max_register_count=self.max_registers,
            lineinfo=lineinfo,
            arch=arch,
            link_time_optimization=lto,
        )
        self._complete = False
        self._object_codes = []
        self.linker = None  # need at least one program

    @property
    def info_log(self):
        if not self.linker:
            raise ValueError("Not Initialized")
        if self._complete:
            return self._info_log
        raise RuntimeError("Link not yet complete.")

    @property
    def error_log(self):
        if not self.linker:
            raise ValueError("Not Initialized")
        if self._complete:
            return self._error_log
        raise RuntimeError("Link not yet complete.")

    def add_ptx(self, ptx, name="<cudapy-ptx>"):
        obj = ObjectCode.from_ptx(ptx, name=name)
        self._object_codes.append(obj)

    def add_cu(self, cu, name="<cudapy-cu>"):
        # Compile to LTO-IR when LTO is enabled, otherwise to PTX.
        obj, log = nvrtc.compile(cu, name, self.cc, ltoir=self.lto)

        # LTO-IR is not human-readable, so only dump non-LTO output.
        if not self.lto and config.DUMP_ASSEMBLY:
            print(("ASSEMBLY %s" % name).center(80, "-"))
            print(obj.code)

        self._object_codes.append(obj)

    def add_cubin(self, cubin, name="<cudapy-cubin>"):
        obj = ObjectCode.from_cubin(cubin, name=name)
        self._object_codes.append(obj)

    def add_ltoir(self, ltoir, name="<cudapy-ltoir>"):
        obj = ObjectCode.from_ltoir(ltoir, name=name)
        self._object_codes.append(obj)

    def add_fatbin(self, fatbin, name="<cudapy-fatbin>"):
        obj = ObjectCode.from_fatbin(fatbin, name=name)
        self._object_codes.append(obj)

    def add_object(self, obj, name="<cudapy-object>"):
        obj = ObjectCode.from_object(obj, name=name)
        self._object_codes.append(obj)

    def add_library(self, lib, name="<cudapy-lib>"):
        obj = ObjectCode.from_library(lib, name=name)
        self._object_codes.append(obj)

    def add_file(self, path, kind):
        try:
            data = cached_file_read(path, how="rb")
        except FileNotFoundError:
            raise LinkerError(f"{path} not found")
        name = pathlib.Path(path).name
        self.add_data(data, kind, name)

    def add_data(self, data, kind, name):
        # Dispatch to the type-specific add_* method based on `kind`.
        if kind == FILE_EXTENSION_MAP["ptx"]:
            fn = self.add_ptx
        elif kind == FILE_EXTENSION_MAP["cubin"]:
            fn = self.add_cubin
        elif kind == "cu":
            fn = self.add_cu
        elif (
            kind == FILE_EXTENSION_MAP["lib"] or kind == FILE_EXTENSION_MAP["a"]
        ):
            fn = self.add_library
        elif kind == FILE_EXTENSION_MAP["fatbin"]:
            fn = self.add_fatbin
        elif kind == FILE_EXTENSION_MAP["o"]:
            fn = self.add_object
        elif kind == FILE_EXTENSION_MAP["ltoir"]:
            fn = self.add_ltoir
        else:
            raise LinkerError(f"Don't know how to link {kind}")

        fn(data, name)

    def get_linked_ptx(self):
        # Perform an LTO link with PTX output instead of a cubin.
        options = LinkerOptions(
            max_register_count=self.max_registers,
            lineinfo=self.lineinfo,
            arch=self.arch,
            link_time_optimization=True,
            ptx=True,
        )

        self.linker = Linker(*self._object_codes, options=options)

        result = self.linker.link("ptx")
        # Capture the logs before the linker is closed.
        self.close()
        self._complete = True
        return result.code

    def close(self):
        # Snapshot the logs so the info_log/error_log properties keep working
        # after the underlying linker is destroyed.
        self._info_log = self.linker.get_info_log()
        self._error_log = self.linker.get_error_log()
        self.linker.close()

    def complete(self):
        self.linker = Linker(*self._object_codes, options=self.options)
        result = self.linker.link("cubin")
        self.close()
        self._complete = True
        return result
2783
+
2784
+
2785
class CtypesLinker(_LinkerBase):
    """
    Links for current device if no CC given
    """

    def __init__(self, max_registers=0, lineinfo=False, cc=None):
        super().__init__(max_registers, lineinfo, cc)

        logsz = config.CUDA_LOG_SIZE
        # Buffers the driver writes info/error logs into; they must outlive
        # the link state (kept in self._keep_alive below).
        linkerinfo = (c_char * logsz)()
        linkererrors = (c_char * logsz)()

        options = {
            enums.CU_JIT_INFO_LOG_BUFFER: addressof(linkerinfo),
            enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
            enums.CU_JIT_ERROR_LOG_BUFFER: addressof(linkererrors),
            enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
            enums.CU_JIT_LOG_VERBOSE: c_void_p(1),
        }
        if max_registers:
            options[enums.CU_JIT_MAX_REGISTERS] = c_void_p(max_registers)
        if lineinfo:
            options[enums.CU_JIT_GENERATE_LINE_INFO] = c_void_p(1)

        self.cc = cc
        if cc is None:
            # No option value is needed, but we need something as a placeholder
            options[enums.CU_JIT_TARGET_FROM_CUCONTEXT] = 1
        else:
            # Encode (major, minor) as e.g. 75 for compute capability 7.5.
            cc_val = cc[0] * 10 + cc[1]
            options[enums.CU_JIT_TARGET] = c_void_p(cc_val)

        raw_keys = list(options.keys())
        raw_values = list(options.values())

        option_keys = (drvapi.cu_jit_option * len(raw_keys))(*raw_keys)
        option_vals = (c_void_p * len(raw_values))(*raw_values)

        self.handle = handle = drvapi.cu_link_state()
        driver.cuLinkCreate(
            len(raw_keys), option_keys, option_vals, byref(self.handle)
        )

        weakref.finalize(self, driver.cuLinkDestroy, handle)

        self.linker_info_buf = linkerinfo
        self.linker_errors_buf = linkererrors

        # Keep ctypes buffers referenced until the link completes; the driver
        # holds raw pointers into them.
        self._keep_alive = [linkerinfo, linkererrors, option_keys, option_vals]

    @property
    def info_log(self):
        return self.linker_info_buf.value.decode("utf8")

    @property
    def error_log(self):
        return self.linker_errors_buf.value.decode("utf8")

    def add_cubin(self, cubin, name="<unnamed-cubin>"):
        return self._add_data(enums.CU_JIT_INPUT_CUBIN, cubin, name)

    def add_ptx(self, ptx, name="<unnamed-ptx>"):
        return self._add_data(enums.CU_JIT_INPUT_PTX, ptx, name)

    def add_object(self, object_, name="<unnamed-object>"):
        return self._add_data(enums.CU_JIT_INPUT_OBJECT, object_, name)

    def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
        return self._add_data(enums.CU_JIT_INPUT_FATBINARY, fatbin, name)

    def add_library(self, library, name="<unnamed-library>"):
        return self._add_data(enums.CU_JIT_INPUT_LIBRARY, library, name)

    def _add_data(self, input_type, data, name):
        data_buffer = c_char_p(data)
        name_buffer = c_char_p(name.encode("utf8"))
        # The driver may read these buffers until the link completes.
        self._keep_alive += [data_buffer, name_buffer]
        try:
            driver.cuLinkAddData(
                self.handle,
                input_type,
                data_buffer,
                len(data),
                name_buffer,
                0,
                None,
                None,
            )
        except CudaAPIError as e:
            # Surface the driver's error log alongside the API error.
            raise LinkerError("%s\n%s" % (e, self.error_log))

    def add_data(self, data, kind, name=None):
        # We pass the name as **kwargs to ensure the default name for the input
        # type is used if none is supplied
        kws = {}
        if name is not None:
            kws["name"] = name

        if kind == FILE_EXTENSION_MAP["cubin"]:
            self.add_cubin(data, **kws)
        elif kind == FILE_EXTENSION_MAP["fatbin"]:
            self.add_fatbin(data, **kws)
        elif kind == FILE_EXTENSION_MAP["a"]:
            self.add_library(data, **kws)
        elif kind == FILE_EXTENSION_MAP["ptx"]:
            self.add_ptx(data, **kws)
        elif kind == FILE_EXTENSION_MAP["o"]:
            self.add_object(data, **kws)
        elif kind == FILE_EXTENSION_MAP["ltoir"]:
            raise LinkerError("Ctypes linker cannot link LTO-IR")
        else:
            raise LinkerError(f"Don't know how to link {kind}")

    def add_file(self, path, kind):
        pathbuf = c_char_p(path.encode("utf8"))
        self._keep_alive.append(pathbuf)

        try:
            driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, None, None)
        except CudaAPIError as e:
            if e.code == enums.CUDA_ERROR_FILE_NOT_FOUND:
                msg = f"{path} not found"
            else:
                msg = "%s\n%s" % (e, self.error_log)
            raise LinkerError(msg)

    def complete(self):
        cubin_buf = c_void_p(0)
        size = c_size_t(0)

        try:
            driver.cuLinkComplete(self.handle, byref(cubin_buf), byref(size))
        except CudaAPIError as e:
            raise LinkerError("%s\n%s" % (e, self.error_log))

        size = size.value
        assert size > 0, "linker returned a zero sized cubin"
        # Input buffers are no longer needed once the link has completed.
        del self._keep_alive[:]

        # We return a copy of the cubin because it's owned by the linker
        cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char))
        return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,)))
2927
+
2928
+
2929
+ # -----------------------------------------------------------------------------
2930
+
2931
+
2932
def get_devptr_for_active_ctx(ptr):
    """Query the device pointer usable in the current context from an arbitrary
    pointer.
    """
    # A null input maps to a default-constructed (null) device pointer
    # without consulting the driver.
    if ptr == 0:
        return binding.CUdeviceptr()

    attr = binding.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER
    return driver.cuPointerGetAttribute(attr, binding.CUdeviceptr(ptr))
2943
+
2944
+
2945
def device_extents(devmem):
    """Find the extents (half open begin and end pointer) of the underlying
    device memory allocation.

    NOTE: it always returns the extents of the allocation but the extents
    of the device memory view that can be a subsection of the entire allocation.
    """
    devptr = device_ctypes_pointer(devmem)
    # The driver returns the base address and total size of the allocation
    # containing the pointer.
    s, n = driver.cuMemGetAddressRange(devptr.value)
    return int(s), int(binding.CUdeviceptr(int(s) + n))
2955
+
2956
+
2957
def device_memory_size(devmem):
    """Check the memory size of the device memory.

    The result is cached on the device memory object as ``_cuda_memsize_``;
    the driver is only queried on the first call.
    """
    nbytes = getattr(devmem, "_cuda_memsize_", None)
    if nbytes is None:
        begin, end = device_extents(devmem)
        nbytes = end - begin
        devmem._cuda_memsize_ = nbytes
    assert nbytes >= 0, "{} length array".format(nbytes)
    return nbytes
2969
+
2970
+
2971
+ def _is_datetime_dtype(obj):
2972
+ """Returns True if the obj.dtype is datetime64 or timedelta64"""
2973
+ dtype = getattr(obj, "dtype", None)
2974
+ return dtype is not None and dtype.char in "Mm"
2975
+
2976
+
2977
def _workaround_for_datetime(obj):
    """Workaround for numpy#4983: the buffer protocol does not support
    datetime64 or timedelta64, so view such arrays as int64 instead.
    """
    return obj.view(np.int64) if _is_datetime_dtype(obj) else obj
2984
+
2985
+
2986
def host_pointer(obj, readonly=False):
    """Get a host pointer from ``obj``.

    If ``readonly`` is False, the buffer must be writable.

    NOTE: The underlying data pointer from the host data buffer is used and
    it should not be changed until the operation, which can be asynchronous,
    completes.
    """
    if isinstance(obj, int):
        # Already a raw address.
        return obj

    # Unless read-only access was requested, force a writable view for
    # np.void scalars and datetime/timedelta arrays.
    forcewritable = (not readonly) and (
        isinstance(obj, np.void) or _is_datetime_dtype(obj)
    )

    return mviewbuf.memoryview_get_buffer(
        _workaround_for_datetime(obj), forcewritable, readonly
    )
3004
+
3005
+
3006
def host_memory_extents(obj):
    """Return ``(start, end)``: the half-open pointer range of the array."""
    return mviewbuf.memoryview_get_extents(_workaround_for_datetime(obj))
3010
+
3011
+
3012
@functools.cache
def memory_size_from_info(shape, strides, itemsize):
    """Get the byte size of a contiguous memory buffer given its shape,
    strides and itemsize.
    """
    assert len(shape) == len(strides), "# dim mismatch"
    start, end = mviewbuf.memoryview_get_extents_info(
        shape, strides, len(shape), itemsize
    )
    return end - start
3021
+
3022
+
3023
def host_memory_size(obj):
    """Return the size in bytes of the host memory backing ``obj``."""
    start, end = host_memory_extents(obj)
    # Fixed typo in the assertion message ("extend" -> "extent").
    assert end >= start, "memory extent of negative size"
    return end - start
3028
+
3029
+
3030
def device_pointer(obj):
    """Get the device pointer as an integer."""
    cptr = device_ctypes_pointer(obj)
    return cptr.value
3033
+
3034
+
3035
def device_ctypes_pointer(obj):
    """Get the ctypes object for the device pointer."""
    if obj is None:
        # None stands in for the null device pointer.
        return c_void_p(0)
    require_device_memory(obj)
    return obj.device_ctypes_pointer
3041
+
3042
+
3043
def is_device_memory(obj):
    """Return whether ``obj`` looks like a CUDA memory object.

    A CUDA memory object is recognized by an attribute ``__cuda_memory__``
    whose value evaluates to True. Such objects are also expected to expose
    ``device_pointer`` (an int carrying the device address), but that is not
    checked here.
    """
    try:
        # Direct attribute access beats getattr in the common case where
        # the attribute exists.
        return obj.__cuda_memory__
    except AttributeError:
        return False
3056
+
3057
+
3058
def require_device_memory(obj):
    """A sentry for methods that accept CUDA memory objects."""
    if is_device_memory(obj):
        return
    raise Exception("Not a CUDA memory object.")
3062
+
3063
+
3064
def device_memory_depends(devmem, *objs):
    """Add dependencies to the device memory.

    Mainly used for creating structures that point to other device memory,
    so that the referenced objects are not garbage collected and released.

    Fix: previously, when ``devmem`` had no ``_depends_`` attribute, the
    fallback list created by ``getattr`` was extended and then discarded, so
    the dependencies were never actually retained. The list is now attached
    to ``devmem`` before being extended.
    """
    depset = getattr(devmem, "_depends_", None)
    if depset is None:
        depset = []
        devmem._depends_ = depset
    depset.extend(objs)
3072
+
3073
+
3074
def host_to_device(dst, src, size, stream=0):
    """Copy ``size`` bytes from host buffer ``src`` to device memory ``dst``.

    NOTE: The underlying data pointer from the host data buffer is used and
    it should not be changed until the operation, which can be asynchronous,
    completes.
    """
    dst_ptr = device_pointer(dst)
    src_ptr = host_pointer(src, readonly=True)

    if stream:
        driver.cuMemcpyHtoDAsync(dst_ptr, src_ptr, size, _stream_handle(stream))
    else:
        driver.cuMemcpyHtoD(dst_ptr, src_ptr, size)
3088
+
3089
+
3090
def device_to_host(dst, src, size, stream=0):
    """Copy ``size`` bytes from device memory ``src`` to host buffer ``dst``.

    NOTE: The underlying data pointer from the host data buffer is used and
    it should not be changed until the operation, which can be asynchronous,
    completes.
    """
    dst_ptr = host_pointer(dst)
    src_ptr = device_pointer(src)

    if stream:
        driver.cuMemcpyDtoHAsync(dst_ptr, src_ptr, size, _stream_handle(stream))
    else:
        driver.cuMemcpyDtoH(dst_ptr, src_ptr, size)
3104
+
3105
+
3106
def device_to_device(dst, src, size, stream=0):
    """Copy ``size`` bytes between two device memory regions.

    NOTE: The underlying data pointer from the device buffer is used and
    it should not be changed until the operation, which can be asynchronous,
    completes.
    """
    dst_ptr = device_pointer(dst)
    src_ptr = device_pointer(src)

    if stream:
        driver.cuMemcpyDtoDAsync(dst_ptr, src_ptr, size, _stream_handle(stream))
    else:
        driver.cuMemcpyDtoD(dst_ptr, src_ptr, size)
3120
+
3121
+
3122
def device_memset(dst, val, size, stream=0):
    """Memset on the device.

    If stream is 0, the call is synchronous.
    If stream is a Stream object, asynchronous mode is used.

    dst: device memory
    val: byte value to be written
    size: number of bytes to be written
    stream: 0 (synchronous) or a CUDA stream
    """
    dst_ptr = device_pointer(dst)
    try:
        if stream:
            driver.cuMemsetD8Async(dst_ptr, val, size, _stream_handle(stream))
        else:
            driver.cuMemsetD8(dst_ptr, val, size)
    except CudaAPIError as e:
        is_managed = getattr(dst, "__cuda_memory__", False) and getattr(
            dst, "is_managed", False
        )
        if e.code == binding.CUresult.CUDA_ERROR_INVALID_VALUE and is_managed:
            # Managed memory may reject the device-side memset; fall back to
            # filling the buffer from the host side instead.
            hostbuf = (c_uint8 * size).from_address(host_pointer(dst))
            hostbuf[:] = [val & 0xFF] * size
            return
        raise
3154
+
3155
+
3156
def profile_start():
    """Enable profile collection in the current context."""
    driver.cuProfilerStart()
3161
+
3162
+
3163
def profile_stop():
    """Disable profile collection in the current context."""
    driver.cuProfilerStop()
3168
+
3169
+
3170
@contextlib.contextmanager
def profiling():
    """
    Context manager that enables profiling on entry and disables profiling on
    exit.

    Fix: the previous version skipped ``profile_stop()`` when the managed
    block raised; the try/finally guarantees profiling is always disabled.
    """
    profile_start()
    try:
        yield
    finally:
        profile_stop()
3179
+
3180
+
3181
def get_version():
    """Return the driver version as a ``(major, minor)`` tuple."""
    return driver.get_version()
3186
+
3187
+
3188
def inspect_obj_content(objpath: str):
    """
    Given a path to a fatbin or object file, use ``cuobjdump`` to examine its
    content. Return the set of entries found in the object.

    Raises RuntimeError when ``cuobjdump`` is not on PATH.
    """
    try:
        proc = subprocess.run(
            ["cuobjdump", objpath], check=True, capture_output=True
        )
    except FileNotFoundError as e:
        msg = (
            "cuobjdump has not been found. You may need "
            "to install the CUDA toolkit and ensure that "
            "it is available on your PATH.\n"
        )
        raise RuntimeError(msg) from e

    code_types: set[str] = set()
    entry_pattern = r"Fatbin (.*) code"
    for line in proc.stdout.decode("utf-8").split("\n"):
        m = re.match(entry_pattern, line)
        if m is not None:
            code_types.add(m.group(1))

    return code_types
3214
+
3215
+
3216
+ def _stream_handle(stream):
3217
+ """
3218
+ Obtain the appropriate handle for various types of
3219
+ acceptable stream objects. Acceptable types are
3220
+ int (0 for default stream), Stream, ExperimentalStream
3221
+ """
3222
+
3223
+ if stream == 0:
3224
+ return stream
3225
+ allowed = (Stream, ExperimentalStream)
3226
+ if not isinstance(stream, allowed):
3227
+ raise TypeError(
3228
+ "Expected a Stream object or 0, got %s" % type(stream).__name__
3229
+ )
3230
+ elif hasattr(stream, "__cuda_stream__"):
3231
+ ver, ptr = stream.__cuda_stream__()
3232
+ assert ver == 0
3233
+ if isinstance(ptr, binding.CUstream):
3234
+ return get_cuda_native_handle(ptr)
3235
+ else:
3236
+ return ptr
3237
+ else:
3238
+ raise TypeError("Invalid Stream")