numba-cuda 0.21.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (488) hide show
  1. _numba_cuda_redirector.pth +4 -0
  2. _numba_cuda_redirector.py +89 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +6 -0
  5. numba_cuda/_version.py +11 -0
  6. numba_cuda/numba/cuda/__init__.py +70 -0
  7. numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
  8. numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
  9. numba_cuda/numba/cuda/api.py +577 -0
  10. numba_cuda/numba/cuda/api_util.py +76 -0
  11. numba_cuda/numba/cuda/args.py +72 -0
  12. numba_cuda/numba/cuda/bf16.py +397 -0
  13. numba_cuda/numba/cuda/cache_hints.py +287 -0
  14. numba_cuda/numba/cuda/cext/__init__.py +2 -0
  15. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  16. numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
  17. numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
  18. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  19. numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
  20. numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
  21. numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
  22. numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
  23. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  24. numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
  25. numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
  26. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  27. numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
  28. numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
  29. numba_cuda/numba/cuda/cext/_typeof.h +19 -0
  30. numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
  31. numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
  32. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  33. numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
  34. numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
  35. numba_cuda/numba/cuda/cg.py +67 -0
  36. numba_cuda/numba/cuda/cgutils.py +1294 -0
  37. numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
  38. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
  39. numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
  40. numba_cuda/numba/cuda/codegen.py +541 -0
  41. numba_cuda/numba/cuda/compiler.py +1396 -0
  42. numba_cuda/numba/cuda/core/analysis.py +758 -0
  43. numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
  44. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
  45. numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
  46. numba_cuda/numba/cuda/core/base.py +1332 -0
  47. numba_cuda/numba/cuda/core/boxing.py +1411 -0
  48. numba_cuda/numba/cuda/core/bytecode.py +728 -0
  49. numba_cuda/numba/cuda/core/byteflow.py +2346 -0
  50. numba_cuda/numba/cuda/core/caching.py +744 -0
  51. numba_cuda/numba/cuda/core/callconv.py +392 -0
  52. numba_cuda/numba/cuda/core/codegen.py +171 -0
  53. numba_cuda/numba/cuda/core/compiler.py +199 -0
  54. numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
  55. numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
  56. numba_cuda/numba/cuda/core/config.py +650 -0
  57. numba_cuda/numba/cuda/core/consts.py +124 -0
  58. numba_cuda/numba/cuda/core/controlflow.py +989 -0
  59. numba_cuda/numba/cuda/core/entrypoints.py +57 -0
  60. numba_cuda/numba/cuda/core/environment.py +66 -0
  61. numba_cuda/numba/cuda/core/errors.py +917 -0
  62. numba_cuda/numba/cuda/core/event.py +511 -0
  63. numba_cuda/numba/cuda/core/funcdesc.py +330 -0
  64. numba_cuda/numba/cuda/core/generators.py +387 -0
  65. numba_cuda/numba/cuda/core/imputils.py +509 -0
  66. numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
  67. numba_cuda/numba/cuda/core/interpreter.py +3617 -0
  68. numba_cuda/numba/cuda/core/ir.py +1812 -0
  69. numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
  70. numba_cuda/numba/cuda/core/optional.py +129 -0
  71. numba_cuda/numba/cuda/core/options.py +262 -0
  72. numba_cuda/numba/cuda/core/postproc.py +249 -0
  73. numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
  74. numba_cuda/numba/cuda/core/registry.py +46 -0
  75. numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
  76. numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
  77. numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
  78. numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
  79. numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
  80. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
  81. numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
  82. numba_cuda/numba/cuda/core/sigutils.py +68 -0
  83. numba_cuda/numba/cuda/core/ssa.py +498 -0
  84. numba_cuda/numba/cuda/core/targetconfig.py +330 -0
  85. numba_cuda/numba/cuda/core/tracing.py +231 -0
  86. numba_cuda/numba/cuda/core/transforms.py +956 -0
  87. numba_cuda/numba/cuda/core/typed_passes.py +867 -0
  88. numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
  89. numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
  90. numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
  91. numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
  92. numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
  93. numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
  94. numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
  95. numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
  96. numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
  97. numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
  98. numba_cuda/numba/cuda/cpython/iterators.py +167 -0
  99. numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
  100. numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
  101. numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
  102. numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
  103. numba_cuda/numba/cuda/cpython/slicing.py +322 -0
  104. numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
  105. numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
  106. numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
  107. numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
  108. numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
  109. numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
  110. numba_cuda/numba/cuda/cuda_paths.py +691 -0
  111. numba_cuda/numba/cuda/cudadecl.py +556 -0
  112. numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
  113. numba_cuda/numba/cuda/cudadrv/devicearray.py +951 -0
  114. numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
  115. numba_cuda/numba/cuda/cudadrv/driver.py +3222 -0
  116. numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
  117. numba_cuda/numba/cuda/cudadrv/dummyarray.py +558 -0
  118. numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
  119. numba_cuda/numba/cuda/cudadrv/error.py +48 -0
  120. numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
  121. numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
  122. numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
  123. numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
  124. numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
  125. numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
  126. numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
  127. numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
  128. numba_cuda/numba/cuda/cudaimpl.py +995 -0
  129. numba_cuda/numba/cuda/cudamath.py +149 -0
  130. numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
  131. numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
  132. numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
  133. numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
  134. numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
  135. numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
  136. numba_cuda/numba/cuda/datamodel/manager.py +11 -0
  137. numba_cuda/numba/cuda/datamodel/models.py +9 -0
  138. numba_cuda/numba/cuda/datamodel/packer.py +9 -0
  139. numba_cuda/numba/cuda/datamodel/registry.py +11 -0
  140. numba_cuda/numba/cuda/datamodel/testing.py +11 -0
  141. numba_cuda/numba/cuda/debuginfo.py +903 -0
  142. numba_cuda/numba/cuda/decorators.py +294 -0
  143. numba_cuda/numba/cuda/descriptor.py +35 -0
  144. numba_cuda/numba/cuda/device_init.py +158 -0
  145. numba_cuda/numba/cuda/deviceufunc.py +1021 -0
  146. numba_cuda/numba/cuda/dispatcher.py +2463 -0
  147. numba_cuda/numba/cuda/errors.py +72 -0
  148. numba_cuda/numba/cuda/extending.py +697 -0
  149. numba_cuda/numba/cuda/flags.py +178 -0
  150. numba_cuda/numba/cuda/fp16.py +357 -0
  151. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  152. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  153. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  154. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  155. numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
  156. numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
  157. numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
  158. numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
  159. numba_cuda/numba/cuda/initialize.py +24 -0
  160. numba_cuda/numba/cuda/intrinsic_wrapper.py +41 -0
  161. numba_cuda/numba/cuda/intrinsics.py +382 -0
  162. numba_cuda/numba/cuda/itanium_mangler.py +214 -0
  163. numba_cuda/numba/cuda/kernels/__init__.py +2 -0
  164. numba_cuda/numba/cuda/kernels/reduction.py +265 -0
  165. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  166. numba_cuda/numba/cuda/libdevice.py +3386 -0
  167. numba_cuda/numba/cuda/libdevicedecl.py +20 -0
  168. numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
  169. numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
  170. numba_cuda/numba/cuda/locks.py +19 -0
  171. numba_cuda/numba/cuda/lowering.py +1951 -0
  172. numba_cuda/numba/cuda/mathimpl.py +374 -0
  173. numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
  174. numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
  175. numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
  176. numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
  177. numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
  178. numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
  179. numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
  180. numba_cuda/numba/cuda/misc/appdirs.py +594 -0
  181. numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
  182. numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
  183. numba_cuda/numba/cuda/misc/dump_style.py +41 -0
  184. numba_cuda/numba/cuda/misc/findlib.py +75 -0
  185. numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
  186. numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
  187. numba_cuda/numba/cuda/misc/literal.py +28 -0
  188. numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
  189. numba_cuda/numba/cuda/misc/special.py +94 -0
  190. numba_cuda/numba/cuda/models.py +56 -0
  191. numba_cuda/numba/cuda/np/arraymath.py +5130 -0
  192. numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
  193. numba_cuda/numba/cuda/np/extensions.py +11 -0
  194. numba_cuda/numba/cuda/np/linalg.py +3087 -0
  195. numba_cuda/numba/cuda/np/math/__init__.py +0 -0
  196. numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
  197. numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
  198. numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
  199. numba_cuda/numba/cuda/np/npdatetime.py +969 -0
  200. numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
  201. numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
  202. numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
  203. numba_cuda/numba/cuda/np/numpy_support.py +798 -0
  204. numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
  205. numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
  206. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
  207. numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
  208. numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
  209. numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
  210. numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
  211. numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
  212. numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
  213. numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
  214. numba_cuda/numba/cuda/nvvmutils.py +254 -0
  215. numba_cuda/numba/cuda/printimpl.py +126 -0
  216. numba_cuda/numba/cuda/random.py +308 -0
  217. numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
  218. numba_cuda/numba/cuda/serialize.py +267 -0
  219. numba_cuda/numba/cuda/simulator/__init__.py +63 -0
  220. numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
  221. numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
  222. numba_cuda/numba/cuda/simulator/api.py +179 -0
  223. numba_cuda/numba/cuda/simulator/bf16.py +4 -0
  224. numba_cuda/numba/cuda/simulator/compiler.py +38 -0
  225. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
  226. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
  227. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
  228. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
  229. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
  230. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
  231. numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
  232. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
  233. numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
  234. numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
  235. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
  236. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
  237. numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
  238. numba_cuda/numba/cuda/simulator/kernel.py +320 -0
  239. numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
  240. numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
  241. numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
  242. numba_cuda/numba/cuda/simulator/reduction.py +19 -0
  243. numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
  244. numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
  245. numba_cuda/numba/cuda/simulator_init.py +18 -0
  246. numba_cuda/numba/cuda/stubs.py +635 -0
  247. numba_cuda/numba/cuda/target.py +505 -0
  248. numba_cuda/numba/cuda/testing.py +347 -0
  249. numba_cuda/numba/cuda/tests/__init__.py +62 -0
  250. numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
  251. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
  252. numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
  253. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
  254. numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
  255. numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
  256. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
  257. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
  258. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
  259. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
  260. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
  261. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
  262. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
  263. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +187 -0
  264. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
  265. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
  266. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
  267. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +198 -0
  268. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
  269. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
  270. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
  271. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
  272. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
  273. numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
  274. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
  275. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
  276. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
  277. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
  278. numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
  279. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
  280. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
  281. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
  282. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
  283. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
  284. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
  285. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
  286. numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
  287. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
  288. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
  289. numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
  290. numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
  291. numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
  292. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
  293. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
  294. numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
  295. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
  296. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
  297. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
  298. numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
  299. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
  300. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
  301. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
  302. numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
  303. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
  304. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
  305. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
  306. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
  307. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
  308. numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
  309. numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
  310. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
  311. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
  312. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
  313. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
  314. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
  315. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
  316. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
  317. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
  318. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
  319. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
  320. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
  321. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
  322. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
  323. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
  324. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +889 -0
  325. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
  326. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
  327. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
  328. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
  329. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
  330. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
  331. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
  332. numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
  333. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
  334. numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
  335. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
  336. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
  337. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
  338. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
  339. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
  340. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
  341. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
  342. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
  343. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
  344. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
  345. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
  346. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
  347. numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
  348. numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
  349. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
  350. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
  351. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
  352. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
  353. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
  354. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
  355. numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
  356. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
  357. numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
  358. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
  359. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
  360. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
  361. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
  362. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
  363. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
  364. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
  365. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
  366. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
  367. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
  368. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
  369. numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
  370. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
  371. numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
  372. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
  373. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
  374. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
  375. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  376. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
  377. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
  378. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
  379. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
  380. numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
  381. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
  382. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
  383. numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
  384. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
  385. numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
  386. numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
  387. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
  388. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
  389. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
  390. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
  391. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
  392. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
  393. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
  394. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
  395. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
  396. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +331 -0
  397. numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
  398. numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
  399. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
  400. numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
  401. numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
  402. numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
  403. numba_cuda/numba/cuda/tests/data/error.cu +12 -0
  404. numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
  405. numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
  406. numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
  407. numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
  408. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
  409. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
  410. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
  411. numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
  412. numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
  413. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
  414. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
  415. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
  416. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
  417. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
  418. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
  419. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
  420. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
  421. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
  422. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
  423. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
  424. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
  425. numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
  426. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +391 -0
  427. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
  428. numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
  429. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
  430. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
  431. numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
  432. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
  433. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
  434. numba_cuda/numba/cuda/tests/support.py +900 -0
  435. numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
  436. numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
  437. numba_cuda/numba/cuda/typeconv/rules.py +63 -0
  438. numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
  439. numba_cuda/numba/cuda/types/__init__.py +233 -0
  440. numba_cuda/numba/cuda/types/__init__.pyi +167 -0
  441. numba_cuda/numba/cuda/types/abstract.py +9 -0
  442. numba_cuda/numba/cuda/types/common.py +9 -0
  443. numba_cuda/numba/cuda/types/containers.py +9 -0
  444. numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
  445. numba_cuda/numba/cuda/types/cuda_common.py +110 -0
  446. numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
  447. numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
  448. numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
  449. numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
  450. numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
  451. numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
  452. numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
  453. numba_cuda/numba/cuda/types/ext_types.py +101 -0
  454. numba_cuda/numba/cuda/types/function_type.py +11 -0
  455. numba_cuda/numba/cuda/types/functions.py +9 -0
  456. numba_cuda/numba/cuda/types/iterators.py +9 -0
  457. numba_cuda/numba/cuda/types/misc.py +9 -0
  458. numba_cuda/numba/cuda/types/npytypes.py +9 -0
  459. numba_cuda/numba/cuda/types/scalars.py +9 -0
  460. numba_cuda/numba/cuda/typing/__init__.py +19 -0
  461. numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
  462. numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
  463. numba_cuda/numba/cuda/typing/bufproto.py +70 -0
  464. numba_cuda/numba/cuda/typing/builtins.py +1209 -0
  465. numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
  466. numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
  467. numba_cuda/numba/cuda/typing/collections.py +138 -0
  468. numba_cuda/numba/cuda/typing/context.py +782 -0
  469. numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
  470. numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
  471. numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
  472. numba_cuda/numba/cuda/typing/listdecl.py +147 -0
  473. numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
  474. numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
  475. numba_cuda/numba/cuda/typing/npydecl.py +749 -0
  476. numba_cuda/numba/cuda/typing/setdecl.py +115 -0
  477. numba_cuda/numba/cuda/typing/templates.py +1446 -0
  478. numba_cuda/numba/cuda/typing/typeof.py +301 -0
  479. numba_cuda/numba/cuda/ufuncs.py +746 -0
  480. numba_cuda/numba/cuda/utils.py +724 -0
  481. numba_cuda/numba/cuda/vector_types.py +214 -0
  482. numba_cuda/numba/cuda/vectorizers.py +260 -0
  483. numba_cuda-0.21.1.dist-info/METADATA +109 -0
  484. numba_cuda-0.21.1.dist-info/RECORD +488 -0
  485. numba_cuda-0.21.1.dist-info/WHEEL +5 -0
  486. numba_cuda-0.21.1.dist-info/licenses/LICENSE +26 -0
  487. numba_cuda-0.21.1.dist-info/licenses/LICENSE.numba +24 -0
  488. numba_cuda-0.21.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3222 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: BSD-2-Clause
3
+
4
+ """
5
+ CUDA driver bridge implementation
6
+
7
+ NOTE:
8
+ The new driver implementation uses a *_PendingDeallocs* that helps prevent
9
+ crashing the system (particularly OSX) when the CUDA context is corrupted at
10
+ resource deallocation. The old approach ties resource management directly
11
+ into the object destructor; thus, at corruption of the CUDA context,
12
+ subsequent deallocation could further corrupt the CUDA context and cause the
13
+ system to freeze in some cases.
14
+
15
+ """
16
+
17
+ import sys
18
+ import os
19
+ import ctypes
20
+ import weakref
21
+ import functools
22
+ import warnings
23
+ import logging
24
+ import threading
25
+ import asyncio
26
+ import pathlib
27
+ import subprocess
28
+ import tempfile
29
+ import re
30
+ from itertools import product
31
+ from abc import ABCMeta, abstractmethod
32
+ from ctypes import (
33
+ c_int,
34
+ byref,
35
+ c_size_t,
36
+ c_char,
37
+ c_char_p,
38
+ addressof,
39
+ c_void_p,
40
+ c_uint8,
41
+ )
42
+ import contextlib
43
+ import importlib
44
+ import numpy as np
45
+ from collections import namedtuple, deque
46
+
47
+
48
+ from numba.cuda.cext import mviewbuf
49
+ from numba.cuda.core import config
50
+ from numba.cuda import utils, serialize
51
+ from .error import CudaSupportError, CudaDriverError
52
+ from .drvapi import API_PROTOTYPES
53
+ from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj
54
+ from .mappings import FILE_EXTENSION_MAP
55
+ from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
56
+ from numba.cuda.utils import cached_file_read
57
+ from numba.cuda.cudadrv import enums, drvapi, nvrtc
58
+
59
+ from cuda.bindings import driver as binding
60
+ from cuda.core.experimental import (
61
+ Linker,
62
+ LinkerOptions,
63
+ ObjectCode,
64
+ )
65
+
66
+ from cuda.bindings.utils import get_cuda_native_handle
67
+ from cuda.core.experimental import (
68
+ Stream as ExperimentalStream,
69
+ Device as ExperimentalDevice,
70
+ )
71
+
72
+
73
+ # There is no definition of the default stream in the Nvidia bindings (nor
74
+ # is there at the C/C++ level), so we define it here so we don't need to
75
+ # use a magic number 0 in places where we want the default stream.
76
+ CU_STREAM_DEFAULT = 0
77
+
78
+
79
+ MIN_REQUIRED_CC = (3, 5)
80
+ SUPPORTS_IPC = sys.platform.startswith("linux")
81
+
82
+
83
+ _py_decref = ctypes.pythonapi.Py_DecRef
84
+ _py_incref = ctypes.pythonapi.Py_IncRef
85
+ _py_decref.argtypes = [ctypes.py_object]
86
+ _py_incref.argtypes = [ctypes.py_object]
87
+
88
+
89
def make_logger():
    """
    Return this module's logger, configuring it on first use.

    If the logger (or one of its ancestors) already has handlers, it is
    returned untouched. Otherwise the level is taken from
    ``config.CUDA_LOG_LEVEL`` (falling back to ``CRITICAL`` when the value
    does not name an integer logging level), and a handler is attached: a
    stderr stream handler when a level was configured, a null handler
    otherwise.
    """
    log = logging.getLogger(__name__)
    if log.hasHandlers():
        # Logging is already configured; leave it alone.
        return log

    # Translate the configured level name into a ``logging`` constant.
    level = getattr(logging, str(config.CUDA_LOG_LEVEL).upper(), None)
    log.setLevel(level if isinstance(level, int) else logging.CRITICAL)

    if config.CUDA_LOG_LEVEL:
        # The user asked for logging: print to stderr with a simple format.
        stderr_handler = logging.StreamHandler(sys.stderr)
        stderr_handler.setFormatter(
            logging.Formatter(
                fmt="== CUDA [%(relativeCreated)d] %(levelname)5s -- %(message)s"
            )
        )
        log.addHandler(stderr_handler)
    else:
        # No level requested: install a handler that discards records.
        log.addHandler(logging.NullHandler())
    return log
111
+
112
+
113
+ @functools.cache
114
+ def _have_nvjitlink():
115
+ try:
116
+ from cuda.bindings._internal import nvjitlink as nvjitlink_internal
117
+ from cuda.bindings._internal.utils import NotSupportedError
118
+ except ImportError:
119
+ return False
120
+
121
+ try:
122
+ if (
123
+ nvjitlink_internal._inspect_function_pointer("__nvJitLinkVersion")
124
+ == 0
125
+ ):
126
+ return False
127
+ try:
128
+ from cuda.bindings import nvjitlink
129
+
130
+ if nvjitlink.version() < (12, 3):
131
+ return False
132
+ except Exception:
133
+ return False
134
+ return True
135
+ except (RuntimeError, NotSupportedError):
136
+ return False
137
+
138
+
139
class DeadMemoryError(RuntimeError):
    """RuntimeError subclass named for errors involving dead memory."""
141
+
142
+
143
class LinkerError(RuntimeError):
    """RuntimeError subclass named for linker failures."""
145
+
146
+
147
class CudaAPIError(CudaDriverError):
    """
    Error raised when a CUDA driver API call reports a failure.

    :param code: The result code returned by the driver call.
    :param msg: Human-readable description of the failure.
    """

    def __init__(self, code, msg):
        self.code = code
        self.msg = msg
        super().__init__(code, msg)

    def __str__(self):
        # ``!s`` keeps the str() rendering that %-formatting with %s uses.
        return f"[{self.code!s}] {self.msg!s}"
155
+
156
+
157
def locate_driver_and_loader():
    """
    Work out how and where to look for the CUDA driver library.

    :return: A ``(loader, candidates)`` pair: the ctypes loader class for
             the current platform and a list of library names/paths to try,
             bare names first, followed by every directory/name
             combination.
    """
    platform = sys.platform
    if platform == "win32":
        loader = ctypes.WinDLL
        search_dirs = ["\\windows\\system32"]
        lib_names = ["nvcuda.dll"]
    elif platform == "darwin":
        loader = ctypes.CDLL
        search_dirs = ["/usr/local/cuda/lib"]
        lib_names = ["libcuda.dylib"]
    else:
        # Anything else is treated as a *nix-like system.
        loader = ctypes.CDLL
        search_dirs = ["/usr/lib", "/usr/lib64"]
        lib_names = ["libcuda.so", "libcuda.so.1"]

    # Bare names let the default library search path resolve the file;
    # the explicit directory/name combinations are the fallback.
    explicit_paths = [
        os.path.join(directory, lib_name)
        for directory in search_dirs
        for lib_name in lib_names
    ]
    return loader, lib_names + explicit_paths
180
+
181
+
182
def load_driver(dlloader, candidates):
    """
    Try each candidate in turn and return the first library that loads.

    :param dlloader: Callable that opens a library given its name or path
                     and raises ``OSError`` on failure.
    :param candidates: Iterable of library names/paths to try, in order.
    :return: ``(library, path)`` for the first candidate that loads.

    If nothing loads, the "driver not found" error is raised when none of
    the candidates is an existing file; otherwise the collected load errors
    are reported through the "driver load error" path.
    """
    missing_flags = []
    load_failures = []

    for candidate in candidates:
        try:
            library = dlloader(candidate)
        except OSError as failure:
            # Remember whether the file was absent and why the load failed.
            missing_flags.append(not os.path.isfile(candidate))
            load_failures.append(failure)
            continue
        return library, candidate

    # No candidate could be loaded.
    if not all(missing_flags):
        _raise_driver_error("\n".join(str(f) for f in load_failures))
    else:
        _raise_driver_not_found()
203
+
204
+
205
def find_driver():
    """Locate and load the CUDA driver library, returning the loaded handle."""
    loader, candidate_paths = locate_driver_and_loader()
    # load_driver also reports which path succeeded; only the handle is used.
    library, _ = load_driver(loader, candidate_paths)
    return library
209
+
210
+
211
+ DRIVER_NOT_FOUND_MSG = """
212
+ CUDA driver library cannot be found.
213
+ Ensure that a compatible NVIDIA driver is installed and available on your system path.
214
+ """
215
+
216
+ DRIVER_LOAD_ERROR_MSG = """
217
+ Possible CUDA driver libraries are found but error occurred during load:
218
+ %s
219
+ """
220
+
221
+
222
def _raise_driver_not_found():
    """Raise ``CudaSupportError`` carrying the "driver not found" message."""
    raise CudaSupportError(DRIVER_NOT_FOUND_MSG)
224
+
225
+
226
def _raise_driver_error(e):
    """
    Raise ``CudaSupportError`` for a driver that was found but failed to load.

    :param e: Details of the load failure, interpolated into the message.
    """
    message = DRIVER_LOAD_ERROR_MSG % e
    raise CudaSupportError(message)
228
+
229
+
230
def _build_reverse_error_map():
    """
    Build a mapping from CUDA error code to its symbolic name.

    Every attribute of ``enums`` whose name starts with ``CUDA_ERROR`` is
    included; the mapping is a ``utils.UniqueDict`` keyed by the attribute
    value.
    """
    code_to_name = utils.UniqueDict()
    error_names = (n for n in dir(enums) if n.startswith("CUDA_ERROR"))
    for error_name in error_names:
        code_to_name[getattr(enums, error_name)] = error_name
    return code_to_name
238
+
239
+
240
+ def _getpid():
241
+ return os.getpid()
242
+
243
+
244
+ ERROR_MAP = _build_reverse_error_map()
245
+
246
+
247
class Driver(object):
    """
    Singleton gateway to the CUDA driver API.

    Driver API functions are bound lazily: the first access to an attribute
    that does not exist yet goes through ``__getattr__``, which initializes
    the driver if necessary, wraps the matching function with logging and
    error checking, and caches the wrapper on the instance.
    """

    _singleton = None

    def __new__(cls):
        # Create the instance once; afterwards always hand back the same one.
        if cls._singleton is None:
            cls._singleton = object.__new__(cls)
        return cls._singleton

    def __init__(self):
        self.devices = utils.UniqueDict()
        self.is_initialized = False
        self.initialization_error = None
        self.pid = None
        try:
            if config.DISABLE_CUDA:
                raise CudaSupportError(
                    "CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 "
                    "in the environment, or because CUDA is unsupported on "
                    "32-bit systems."
                )
            self.lib = find_driver()
        except CudaSupportError as err:
            # Record the failure; it is reported when the driver is used.
            self.is_initialized = True
            self.initialization_error = err.msg

    def ensure_initialized(self):
        """
        Initialize the driver with ``cuInit(0)`` on the first call.

        Later calls return immediately. On failure the description is stored
        in ``initialization_error`` and ``CudaSupportError`` is raised; on
        success the current process id is recorded.
        """
        if self.is_initialized:
            return

        # lazily initialize logger
        global _logger
        _logger = make_logger()

        # Marked before the call so that the attribute lookup of ``cuInit``
        # below does not re-enter this method.
        self.is_initialized = True
        try:
            _logger.info("init")
            self.cuInit(0)
        except CudaAPIError as e:
            description = f"{e.msg} ({e.code})"
            self.initialization_error = description
            raise CudaSupportError(f"Error at driver init: {description}")
        self.pid = _getpid()

    @property
    def is_available(self):
        """Whether driver initialization completed without an error."""
        self.ensure_initialized()
        return self.initialization_error is None

    def __getattr__(self, fname):
        # Only reached for attributes not found normally, i.e. the first
        # request of a driver API function.
        self.ensure_initialized()

        failure = self.initialization_error
        if failure is None:
            return self._cuda_python_wrap_fn(fname)
        raise CudaSupportError("Error at driver init: \n%s:" % failure)

    def _ctypes_wrap_fn(self, fname, libfn=None):
        """
        Wrap a ctypes driver function with logging and return-code checking.

        When *libfn* is not given, the function is looked up in the driver
        library and typed from ``API_PROTOTYPES``; an unknown name raises
        ``AttributeError``. The wrapper is cached on the instance under
        *fname* and returned.
        """
        if libfn is None:
            try:
                proto = API_PROTOTYPES[fname]
            except KeyError:
                raise AttributeError(fname)

            # Find function in driver library and apply its prototype.
            libfn = self._find_api(fname)
            libfn.restype, libfn.argtypes = proto[0], proto[1:]

        if config.CUDA_LOG_API_ARGS:

            @functools.wraps(libfn)
            def verbose_cuda_api_call(*args):
                argstr = ", ".join([str(arg) for arg in args])
                _logger.debug(
                    "call driver api: %s(%s)", libfn.__name__, argstr
                )
                retcode = libfn(*args)
                self._check_ctypes_error(fname, retcode)

            safe_call = verbose_cuda_api_call
        else:

            @functools.wraps(libfn)
            def safe_cuda_api_call(*args):
                _logger.debug("call driver api: %s", libfn.__name__)
                retcode = libfn(*args)
                self._check_ctypes_error(fname, retcode)

            safe_call = safe_cuda_api_call

        setattr(self, fname, safe_call)
        return safe_call

    def _cuda_python_wrap_fn(self, fname):
        """
        Wrap the ``cuda.bindings.driver`` function called *fname*.

        The wrapper logs the call, checks the returned status and returns
        the remaining values. It is cached on the instance under *fname*.
        """
        libfn = getattr(binding, fname)

        if config.CUDA_LOG_API_ARGS:

            @functools.wraps(libfn)
            def verbose_cuda_api_call(*args):
                argstr = ", ".join([str(arg) for arg in args])
                _logger.debug(
                    "call driver api: %s(%s)", libfn.__name__, argstr
                )
                return self._check_cuda_python_error(fname, libfn(*args))

            safe_call = verbose_cuda_api_call
        else:

            @functools.wraps(libfn)
            def safe_cuda_api_call(*args):
                _logger.debug("call driver api: %s", libfn.__name__)
                return self._check_cuda_python_error(fname, libfn(*args))

            safe_call = safe_cuda_api_call

        setattr(self, fname, safe_call)
        return safe_call

    def _find_api(self, fname):
        """
        Look up *fname* in the loaded driver library.

        The ``_v2`` variant is preferred over the plain name, except for
        ``cuCtxGetDevice`` and ``cuCtxSynchronize`` which are looked up
        directly. If neither exists, a stand-in that raises
        ``CudaDriverError`` when called is cached and returned.
        """
        # We use alternatively-named functions for PTDS with the Numba ctypes
        # binding. It handles linking to the correct variant.
        if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
            return getattr(self.lib, fname)

        for symbol in (f"{fname}_v2", f"{fname}"):
            try:
                return getattr(self.lib, symbol)
            except AttributeError:
                continue

        # Not found: defer the missing-function error until it is called.
        def absent_function(*args, **kws):
            raise CudaDriverError(f"Driver missing function: {fname}")

        setattr(self, fname, absent_function)
        return absent_function

    def _detect_fork(self):
        """Raise ``CudaDriverError`` if the pid changed since driver init."""
        if self.pid is not None and _getpid() != self.pid:
            _logger.critical(
                "pid %s forked from pid %s after CUDA driver init",
                _getpid(),
                self.pid,
            )
            raise CudaDriverError("CUDA initialized before forking")

    def _check_ctypes_error(self, fname, retcode):
        """Raise ``CudaAPIError`` unless *retcode* is ``CUDA_SUCCESS``."""
        if retcode == enums.CUDA_SUCCESS:
            return

        errname = ERROR_MAP.get(retcode, "UNKNOWN_CUDA_ERROR")
        msg = f"Call to {fname!s} results in {errname!s}"
        _logger.error(msg)
        if retcode == enums.CUDA_ERROR_NOT_INITIALIZED:
            # "Not initialized" may be the symptom of a fork after init.
            self._detect_fork()
        raise CudaAPIError(retcode, msg)

    def _check_cuda_python_error(self, fname, returned):
        """
        Split a binding result into status and values, raising on failure.

        :param returned: Sequence whose first item is the status code.
        :return: The remaining items; a single remaining item is returned
                 on its own rather than in a one-element sequence.
        """
        retcode, retval = returned[0], returned[1:]
        if len(retval) == 1:
            retval = retval[0]

        if retcode == binding.CUresult.CUDA_SUCCESS:
            return retval

        msg = f"Call to {fname!s} results in {retcode.name!s}"
        _logger.error(msg)
        if retcode == binding.CUresult.CUDA_ERROR_NOT_INITIALIZED:
            self._detect_fork()
        raise CudaAPIError(retcode, msg)

    def get_device(self, devnum=0):
        """Return a weak proxy to the device *devnum*, creating it if new."""
        if self.devices.get(devnum) is None:
            self.devices[devnum] = Device(devnum)
        return weakref.proxy(self.devices[devnum])

    def get_device_count(self):
        """Return the result of ``cuDeviceGetCount``."""
        return self.cuDeviceGetCount()

    def list_devices(self):
        """Returns a list of active devices"""
        return [*self.devices.values()]

    def reset(self):
        """Reset all devices"""
        for device in self.devices.values():
            device.reset()

    def pop_active_context(self):
        """Pop the active CUDA context and return the handle.
        If no CUDA context is active, return None.
        """
        with self.get_active_context() as active:
            if active.devnum is None:
                return None
            return drvapi.cu_context(int(driver.cuCtxPopCurrent()))

    def get_active_context(self):
        """Returns an instance of ``_ActiveContext``."""
        return _ActiveContext()

    def get_version(self):
        """
        Returns the CUDA Driver version as a tuple (major, minor).
        """
        # The version is encoded as (1000 * major) + (10 * minor)
        major, remainder = divmod(driver.cuDriverGetVersion(), 1000)
        return (major, remainder // 10)
465
+
466
+
467
+ class _ActiveContext(object):
468
+ """An contextmanager object to cache active context to reduce dependency
469
+ on querying the CUDA driver API.
470
+
471
+ Once entering the context, it is assumed that the active CUDA context is
472
+ not changed until the context is exited.
473
+ """
474
+
475
+ _tls_cache = threading.local()
476
+
477
+ def __enter__(self):
478
+ is_top = False
479
+ # check TLS cache
480
+ cache = self._tls_cache
481
+ try:
482
+ hctx, devnum = cache.ctx_devnum
483
+ except AttributeError:
484
+ # Not cached. Query the driver API.
485
+ hctx = driver.cuCtxGetCurrent()
486
+ if int(hctx) == 0:
487
+ hctx = None
488
+ else:
489
+ hctx = drvapi.cu_context(int(hctx))
490
+
491
+ if hctx is None:
492
+ devnum = None
493
+ else:
494
+ devnum = int(driver.cuCtxGetDevice())
495
+
496
+ self._tls_cache.ctx_devnum = (hctx, devnum)
497
+ is_top = True
498
+
499
+ self._is_top = is_top
500
+ self.context_handle = hctx
501
+ self.devnum = devnum
502
+ return self
503
+
504
+ def __exit__(self, exc_type, exc_val, exc_tb):
505
+ if self._is_top:
506
+ del self._tls_cache.ctx_devnum
507
+
508
+ def __bool__(self):
509
+ """Returns True is there's a valid and active CUDA context."""
510
+ return self.context_handle is not None
511
+
512
+ __nonzero__ = __bool__
513
+
514
+
515
+ driver = Driver()
516
+
517
+
518
def _build_reverse_device_attrs():
    """
    Build a mapping from short device-attribute name to its enum value.

    Every attribute of ``enums`` named ``CU_DEVICE_ATTRIBUTE_<X>`` is stored
    in a ``utils.UniqueDict`` under the key ``<X>``.
    """
    prefix = "CU_DEVICE_ATTRIBUTE_"
    skip = len(prefix)
    attrs = utils.UniqueDict()
    for full_name in dir(enums):
        if not full_name.startswith(prefix):
            continue
        attrs[full_name[skip:]] = getattr(enums, full_name)
    return attrs
525
+
526
+
527
+ DEVICE_ATTRIBUTES = _build_reverse_device_attrs()
528
+
529
+
530
class Device:
    """
    The device object owns the CUDA contexts. This is owned by the driver
    object. User should not construct devices directly.
    """

    @classmethod
    def from_identity(cls, identity):
        """Create Device object from device identity created by
        ``Device.get_device_identity()``.

        Raises ``RuntimeError`` when no visible device has that identity.
        """
        for devid in range(driver.get_device_count()):
            candidate = driver.get_device(devid)
            if candidate.get_device_identity() == identity:
                return candidate

        raise RuntimeError(
            f"No device of {identity} is found. "
            "Target device may not be visible in this process."
        )

    def __init__(self, devnum: int) -> None:
        self._dev = ExperimentalDevice(devnum)
        backing = self._dev
        self.id = backing.device_id
        self.compute_capability = backing.compute_capability
        self.name = backing.name
        self.uuid = f"GPU-{backing.uuid}"
        self.primary_context = None

    def get_device_identity(self):
        """Return a dict with the PCI domain, bus and device ids."""
        return dict(
            pci_domain_id=self.PCI_DOMAIN_ID,
            pci_bus_id=self.PCI_BUS_ID,
            pci_device_id=self.PCI_DEVICE_ID,
        )

    def __repr__(self):
        return f"<CUDA device {self.id:d} '{self.name}'>"

    def __getattr__(self, attr):
        """Read attributes lazily"""
        # Only reached for names not set on the instance: resolve the name
        # as a device attribute, query it, and cache the result.
        enum_name = f"CU_DEVICE_ATTRIBUTE_{attr}"
        code = getattr(binding.CUdevice_attribute, enum_name)
        value = driver.cuDeviceGetAttribute(code, self.id)

        setattr(self, attr, value)
        return value

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        if not isinstance(other, Device):
            return False
        return self.id == other.id

    def __ne__(self, other):
        return not self == other

    def get_primary_context(self):
        """
        Returns the primary context for the device.
        Note: it is not pushed to the CPU thread.
        """
        existing = self.primary_context
        if existing is not None:
            return existing

        if self.compute_capability < MIN_REQUIRED_CC:
            raise CudaSupportError(
                f"{self} has compute capability < {MIN_REQUIRED_CC}"
            )

        self._dev.set_current()
        device_proxy = weakref.proxy(self)
        handle = ctypes.c_void_p(int(self._dev.context._handle))
        created = Context(device_proxy, handle)
        self.primary_context = created
        return created

    def release_primary_context(self):
        """
        Release reference to primary context if it has been retained.
        """
        if self.primary_context:
            driver.cuDevicePrimaryCtxRelease(self.id)
            self.primary_context = None

    def reset(self):
        """Reset the primary context (if any), then reset at driver level."""
        try:
            ctx = self.primary_context
            if ctx is not None:
                ctx.reset()
            self.release_primary_context()
        finally:
            # reset at the driver level
            driver.cuDevicePrimaryCtxReset(self.id)

    @property
    def supports_float16(self):
        """True when the compute capability is at least 5.3."""
        return self.compute_capability >= (5, 3)

    @property
    def supports_bfloat16(self):
        """True when the compute capability is at least 8.0."""
        return self.compute_capability >= (8, 0)
632
+
633
+
634
class BaseCUDAMemoryManager(metaclass=ABCMeta):
    """Abstract base class for External Memory Management (EMM) Plugins."""

    def __init__(self, *args, **kwargs):
        # The context must be supplied by keyword; positional arguments
        # are accepted but not used here.
        if "context" in kwargs:
            self.context = kwargs.pop("context")
        else:
            raise RuntimeError("Memory manager requires a context")

    @abstractmethod
    def memalloc(self, size):
        """
        Allocate device memory in the current context.

        :param size: Number of bytes to allocate
        :type size: int
        :return: A memory pointer instance owning the new allocation
        :rtype: :class:`MemoryPointer`
        """

    @abstractmethod
    def memhostalloc(self, size, mapped, portable, wc):
        """
        Allocate pinned memory on the host.

        :param size: Number of bytes to allocate
        :type size: int
        :param mapped: Whether the allocation should be mapped into the CUDA
                       address space.
        :type mapped: bool
        :param portable: Whether all contexts, rather than only the calling
                         one, should consider the memory pinned.
        :type portable: bool
        :param wc: Whether the memory should be allocated as write-combined.
        :type wc: bool
        :return: A memory pointer instance owning the new allocation. Its
                 type depends on whether the region was mapped into device
                 memory.
        :rtype: :class:`MappedMemory` or :class:`PinnedMemory`
        """

    @abstractmethod
    def mempin(self, owner, pointer, size, mapped):
        """
        Pin an existing region of host memory.

        :param owner: The object owning the memory.
        :param pointer: Pointer to the start of the region to pin.
        :type pointer: int
        :param size: Size of the region in bytes.
        :type size: int
        :param mapped: Whether the region should additionally be mapped into
                       device memory.
        :type mapped: bool
        :return: A memory pointer instance referring to the pinned memory.
        :rtype: :class:`MappedMemory` or :class:`PinnedMemory`
        """

    @abstractmethod
    def initialize(self):
        """
        Do whatever setup the EMM plugin instance needs before it can be
        used.

        :return: None
        """

    @abstractmethod
    def get_ipc_handle(self, memory):
        """
        Produce an IPC handle for a GPU allocation.

        :param memory: The allocation to create the IPC handle for.
        :type memory: :class:`MemoryPointer`
        :return: IPC handle for the allocation
        :rtype: :class:`IpcHandle`
        """

    @abstractmethod
    def get_memory_info(self):
        """
        Return the ``(free, total)`` memory of the context, in bytes.
        Implementations may raise :class:`NotImplementedError` when this
        information is impractical to provide (e.g. for a pool allocator).

        :return: Memory info
        :rtype: :class:`MemoryInfo`
        """

    @abstractmethod
    def reset(self):
        """
        Release all memory allocated in this context.

        :return: None
        """

    @abstractmethod
    def defer_cleanup(self):
        """
        Return a context manager that keeps cleanup deferred for as long as
        it is active.

        :return: Context manager
        """

    @property
    @abstractmethod
    def interface_version(self):
        """
        Integer version of the EMM Plugin interface that the implementation
        supports. Implementations of this version of the specification
        should always return 1.
        """
748
+
749
+
750
class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
    """Base class for External Memory Management (EMM) Plugins that only
    implement on-device allocation. Subclasses do not have to provide
    ``memhostalloc`` or ``mempin``.

    ``reset`` and ``defer_cleanup`` (see
    :class:`numba.cuda.BaseCUDAMemoryManager`) are implemented here for this
    class's own bookkeeping. An EMM Plugin built on this class that defines
    these methods itself must call the ``super()`` versions too, so that
    ``HostOnlyCUDAMemoryManager`` can do the work required for the host
    allocations it manages.

    ``interface_version`` is deliberately not implemented here, because it
    would always match the version of Numba providing this class. An EMM
    Plugin subclassing this class should implement ``interface_version``
    itself.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.allocations = utils.UniqueDict()
        self.deallocations = _PendingDeallocs()

    def _attempt_allocation(self, allocator):
        """
        Call *allocator* and return its result. When it fails with an
        out-of-memory error, flush the pending deallocations and call it
        once more; a failure of that second call propagates. Any other
        ``CudaAPIError`` is re-raised unchanged.
        """
        try:
            return allocator()
        except CudaAPIError as e:
            if e.code != binding.CUresult.CUDA_ERROR_OUT_OF_MEMORY:
                raise
            # Out of memory: free what is pending, then retry once.
            self.deallocations.clear()
            return allocator()

    def memhostalloc(self, size, mapped=False, portable=False, wc=False):
        """Implements the allocation of pinned host memory.

        It is recommended that this method is not overridden by EMM Plugin
        implementations - instead, use the :class:`BaseCUDAMemoryManager`.
        """
        flags = 0
        if mapped:
            flags |= enums.CU_MEMHOSTALLOC_DEVICEMAP
        if portable:
            flags |= enums.CU_MEMHOSTALLOC_PORTABLE
        if wc:
            flags |= enums.CU_MEMHOSTALLOC_WRITECOMBINED

        def allocator():
            return driver.cuMemHostAlloc(size, flags)

        # Only mapped allocations get the flush-and-retry treatment.
        pointer = self._attempt_allocation(allocator) if mapped else allocator()

        # The pointer itself is the key in the allocations table.
        finalizer = _hostalloc_finalizer(self, pointer, pointer, size, mapped)

        if not mapped:
            return PinnedMemory(pointer, size, finalizer=finalizer)

        mem = MappedMemory(pointer, size, finalizer=finalizer)
        self.allocations[pointer] = mem
        return mem.own()

    def mempin(self, owner, pointer, size, mapped=False):
        """Implements the pinning of host memory.

        It is recommended that this method is not overridden by EMM Plugin
        implementations - instead, use the :class:`BaseCUDAMemoryManager`.
        """
        # possible flags are "portable" (between context)
        # and "device-map" (map host memory to device thus no need
        # for memory transfer).
        flags = enums.CU_MEMHOSTREGISTER_DEVICEMAP if mapped else 0

        def allocator():
            driver.cuMemHostRegister(pointer, size, flags)

        if mapped:
            self._attempt_allocation(allocator)
        else:
            allocator()

        # The pointer itself is the key in the allocations table.
        finalizer = _pin_finalizer(self, pointer, pointer, mapped)

        if not mapped:
            return PinnedMemory(pointer, size, owner=owner, finalizer=finalizer)

        mem = MappedMemory(pointer, size, owner=owner, finalizer=finalizer)
        self.allocations[pointer] = mem
        return mem.own()

    def memallocmanaged(self, size, attach_global):
        """
        Allocate managed memory of *size* bytes, attached globally when
        *attach_global* is true and to the host otherwise, and return an
        owned reference to the tracked allocation.
        """

        def allocator():
            attach = binding.CUmemAttach_flags
            if attach_global:
                flags = attach.CU_MEM_ATTACH_GLOBAL.value
            else:
                flags = attach.CU_MEM_ATTACH_HOST.value
            return driver.cuMemAllocManaged(size, flags)

        ptr = self._attempt_allocation(allocator)

        # The pointer itself is the key in the allocations table.
        finalizer = _alloc_finalizer(self, ptr, ptr, size)
        mem = ManagedMemory(ptr, size, finalizer=finalizer)
        self.allocations[ptr] = mem
        return mem.own()

    def reset(self):
        """Clears up all host memory (mapped and/or pinned) in the current
        context.

        EMM Plugins that override this method must call ``super().reset()`` to
        ensure that host allocations are also cleaned up."""
        self.allocations.clear()
        self.deallocations.clear()

    @contextlib.contextmanager
    def defer_cleanup(self):
        """Returns a context manager that disables cleanup of mapped or pinned
        host memory in the current context whilst it is active.

        EMM Plugins that override this method must obtain the context manager
        from this method before yielding to ensure that cleanup of host
        allocations is also deferred."""
        with self.deallocations.disable():
            yield
897
+
898
+
899
class GetIpcHandleMixin:
    """A class that provides a default implementation of ``get_ipc_handle()``."""

    def get_ipc_handle(self, memory):
        """Create an IPC handle for *memory*.

        The base pointer of the allocation is taken from
        ``device_extents(memory)`` and passed to ``cuIpcGetMemHandle``. The
        returned :class:`numba.cuda.IpcHandle` carries that handle, the size
        of *memory*, the identity of the context's device, and the offset of
        *memory* from the base pointer.
        """
        base, _ = device_extents(memory)
        ipc_mem_handle = driver.cuIpcGetMemHandle(base)
        # Distance of this allocation from the start of its base allocation.
        offset = int(memory.handle) - int(base)
        source_info = self.context.device.get_device_identity()

        return IpcHandle(
            memory,
            ipc_mem_handle,
            memory.size,
            source_info,
            offset=offset,
        )
917
+
918
+
919
class NumbaCUDAMemoryManager(GetIpcHandleMixin, HostOnlyCUDAMemoryManager):
    """Internal on-device memory management for Numba. This is implemented using
    the EMM Plugin interface, but is not part of the public API."""

    def initialize(self):
        """Give the pending-deallocation queue its capacity on first use."""
        # Set the memory capacity of *deallocations* as the memory manager
        # becomes active for the first time
        pending = self.deallocations
        if pending.memory_capacity == _SizeNotSet:
            pending.memory_capacity = self.get_memory_info().total

    def memalloc(self, size):
        """
        Allocate *size* bytes of device memory with ``cuMemAlloc`` and
        return an owned reference to the tracked allocation.
        """

        def allocator():
            return driver.cuMemAlloc(size)

        ptr = self._attempt_allocation(allocator)

        # The pointer itself is the key in the allocations table.
        finalizer = _alloc_finalizer(self, ptr, ptr, size)
        mem = AutoFreePointer(ptr, size, finalizer=finalizer)
        self.allocations[ptr] = mem
        return mem.own()

    def get_memory_info(self):
        """Return ``cuMemGetInfo``'s result as a :class:`MemoryInfo`."""
        free, total = driver.cuMemGetInfo()
        return MemoryInfo(free, total)

    @property
    def interface_version(self):
        """The EMM Plugin interface version supported by this module."""
        return _SUPPORTED_EMM_INTERFACE_VERSION
948
+
949
+
950
+ _SUPPORTED_EMM_INTERFACE_VERSION = 1
951
+
952
+ _memory_manager = None
953
+
954
+
955
def _ensure_memory_manager():
    """
    Make sure a memory manager class has been selected.

    Does nothing when one is already set. With
    ``config.CUDA_MEMORY_MANAGER == "default"`` the built-in
    ``NumbaCUDAMemoryManager`` is used. Otherwise the setting is treated as
    a module name: the module is imported and its ``_numba_memory_manager``
    attribute is installed through ``set_memory_manager``.

    :raises RuntimeError: If the module cannot be imported or its memory
                          manager cannot be installed. The original
                          exception is attached as ``__cause__``.
    """
    global _memory_manager

    if _memory_manager:
        return

    if config.CUDA_MEMORY_MANAGER == "default":
        _memory_manager = NumbaCUDAMemoryManager
        return

    try:
        mgr_module = importlib.import_module(config.CUDA_MEMORY_MANAGER)
        set_memory_manager(mgr_module._numba_memory_manager)
    except Exception as exc:
        # Chain explicitly so the real reason (import failure, missing
        # attribute, interface version mismatch) is reported as the cause.
        raise RuntimeError(
            "Failed to use memory manager from %s" % config.CUDA_MEMORY_MANAGER
        ) from exc
972
+
973
+
974
def set_memory_manager(mm_plugin):
    """Make Numba use the given External Memory Management (EMM) Plugin.

    The plugin is instantiated once with ``context=None`` to read its
    ``interface_version``; a RuntimeError is raised when that version is not
    the one supported by this version of Numba.

    :param mm_plugin: The class implementing the EMM Plugin.
    :type mm_plugin: BaseCUDAMemoryManager
    :return: None
    """
    global _memory_manager

    # A throwaway instance is only needed to query the interface version.
    plugin_version = mm_plugin(context=None).interface_version
    if plugin_version == _SUPPORTED_EMM_INTERFACE_VERSION:
        _memory_manager = mm_plugin
        return

    raise RuntimeError(
        "EMM Plugin interface has version %d - version %d required"
        % (plugin_version, _SUPPORTED_EMM_INTERFACE_VERSION)
    )
995
+
996
+
997
+ class _SizeNotSet(int):
998
+ """
999
+ Dummy object for _PendingDeallocs when *size* is not set.
1000
+ """
1001
+
1002
+ def __new__(cls, *args, **kwargs):
1003
+ return super().__new__(cls, 0)
1004
+
1005
+ def __str__(self):
1006
+ return "?"
1007
+
1008
+
1009
+ _SizeNotSet = _SizeNotSet()
1010
+
1011
+
1012
class _PendingDeallocs(object):
    """
    Queue of deallocations waiting to run for a context (or device, since
    the primary context is used). The capacity starts out unset
    (_SizeNotSet) and can be assigned later, once the driver is initialized
    and the total memory capacity is known.
    """

    def __init__(self, capacity=_SizeNotSet):
        self._cons = deque()
        self._disable_count = 0
        self._size = 0
        self.memory_capacity = capacity

    @property
    def _max_pending_bytes(self):
        """Byte threshold above which the queue is flushed."""
        return int(self.memory_capacity * config.CUDA_DEALLOCS_RATIO)

    def add_item(self, dtor, handle, size=_SizeNotSet):
        """
        Queue a deallocation.

        *dtor* is the destructor; it will be invoked as ``dtor(handle)``.
        *size* is the byte size of the resource and is optional, because
        some resources (e.g. CUModule) have an unknown memory footprint on
        the device. The queue is flushed when it holds more than
        ``config.CUDA_DEALLOCS_COUNT`` items or more than
        ``_max_pending_bytes`` bytes.
        """
        _logger.info("add pending dealloc: %s %s bytes", dtor.__name__, size)
        self._cons.append((dtor, handle, size))
        self._size += int(size)

        too_many_items = len(self._cons) > config.CUDA_DEALLOCS_COUNT
        if too_many_items or self._size > self._max_pending_bytes:
            self.clear()

    def clear(self):
        """
        Run and drop every pending deallocation, unless flushing is
        currently disabled, in which case nothing happens.
        """
        if self.is_disabled:
            return

        queue = self._cons
        while queue:
            dtor, handle, size = queue.popleft()
            _logger.info("dealloc: %s %s bytes", dtor.__name__, size)
            dtor(handle)

        self._size = 0

    @contextlib.contextmanager
    def disable(self):
        """
        Context manager that temporarily prevents pending deallocations
        from being flushed. Uses may be nested.
        """
        self._disable_count += 1
        try:
            yield
        finally:
            self._disable_count -= 1
            assert self._disable_count >= 0

    @property
    def is_disabled(self):
        """True while at least one ``disable()`` block is active."""
        return self._disable_count > 0

    def __len__(self):
        """
        Returns number of pending deallocations.
        """
        return len(self._cons)
1083
+
1084
+
1085
MemoryInfo = namedtuple("MemoryInfo", ["free", "total"])
"""Amount of free and total memory for a device.

.. py:attribute:: free

    Free device memory in bytes.

.. py:attribute:: total

    Total device memory in bytes.
"""
1096
+
1097
+
1098
class Context(object):
    """
    This object wraps a CUDA Context resource.

    Besides the raw ``handle`` and the owning ``device``, an instance holds
    the per-context bookkeeping: the memory manager instance, the loaded
    modules, the queue of pending deallocations and a free-form ``extras``
    dict for context-specific data.

    Contexts should not be constructed directly by user code.
    """

    def __init__(self, device, handle):
        self.device = device
        self.handle = handle
        self.allocations = utils.UniqueDict()
        self.deallocations = _PendingDeallocs()
        _ensure_memory_manager()
        self.memory_manager = _memory_manager(context=self)
        self.modules = utils.UniqueDict()
        # For storing context specific data
        self.extras = {}

    def reset(self):
        """
        Clean up all owned resources in this context.
        """
        # Free owned resources
        _logger.info("reset context of device %s", self.device.id)
        self.memory_manager.reset()
        self.modules.clear()
        # Clear trash
        self.deallocations.clear()

    def get_memory_info(self):
        """Returns (free, total) memory in bytes in the context."""
        return self.memory_manager.get_memory_info()

    def get_active_blocks_per_multiprocessor(
        self, func, blocksize, memsize, flags=None
    ):
        """Return occupancy of a function.

        :param func: kernel for which occupancy is calculated
        :param blocksize: block size the kernel is intended to be launched with
        :param memsize: per-block dynamic shared memory usage intended, in bytes
        :param flags: optional flags; when falsy the driver call without
                      flags is used
        """
        args = (func, blocksize, memsize, flags)
        return self._cuda_python_active_blocks_per_multiprocessor(*args)

    def _cuda_python_active_blocks_per_multiprocessor(
        self, func, blocksize, memsize, flags
    ):
        # Occupancy query through the binding-style driver wrappers, which
        # return the result directly.
        ps = [func.handle, blocksize, memsize]

        if not flags:
            return driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(*ps)

        ps.append(flags)
        return driver.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(*ps)

    def _ctypes_active_blocks_per_multiprocessor(
        self, func, blocksize, memsize, flags
    ):
        # ctypes-style occupancy query: the result is written through an
        # output argument.
        retval = c_int()
        args = [byref(retval), func.handle, blocksize, memsize]

        if not flags:
            driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(*args)
        else:
            # The flags must be forwarded as the trailing argument, exactly
            # as _ctypes_max_potential_block_size does for its *WithFlags
            # variant; previously they were silently dropped.
            args.append(flags)
            driver.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(*args)

        return retval.value

    def get_max_potential_block_size(
        self, func, b2d_func, memsize, blocksizelimit, flags=None
    ):
        """Suggest a launch configuration with reasonable occupancy.

        :param func: kernel for which occupancy is calculated
        :param b2d_func: function that calculates how much per-block dynamic
                         shared memory 'func' uses based on the block size.
                         Can also be the address of a C function.
                         Use `0` to pass `NULL` to the underlying CUDA API.
        :param memsize: per-block dynamic shared memory usage intended, in bytes
        :param blocksizelimit: maximum block size the kernel is designed to
                               handle
        :param flags: optional flags; when falsy the driver call without
                      flags is used
        """
        args = (func, b2d_func, memsize, blocksizelimit, flags)
        return self._cuda_python_max_potential_block_size(*args)

    def _ctypes_max_potential_block_size(
        self, func, b2d_func, memsize, blocksizelimit, flags
    ):
        # ctypes-style query: grid and block size come back through output
        # arguments and are returned as a ``(gridsize, blocksize)`` tuple.
        gridsize = c_int()
        blocksize = c_int()
        b2d_cb = cu_occupancy_b2d_size(b2d_func)
        args = [
            byref(gridsize),
            byref(blocksize),
            func.handle,
            b2d_cb,
            memsize,
            blocksizelimit,
        ]

        if not flags:
            driver.cuOccupancyMaxPotentialBlockSize(*args)
        else:
            args.append(flags)
            driver.cuOccupancyMaxPotentialBlockSizeWithFlags(*args)

        return (gridsize.value, blocksize.value)

    def _cuda_python_max_potential_block_size(
        self, func, b2d_func, memsize, blocksizelimit, flags
    ):
        # Wrap the callback (or address) in a ctypes function pointer, then
        # hand its raw address to the binding's callback type.
        b2d_cb = ctypes.CFUNCTYPE(c_size_t, c_int)(b2d_func)
        ptr = int.from_bytes(b2d_cb, byteorder="little")
        driver_b2d_cb = binding.CUoccupancyB2DSize(ptr)
        args = [func.handle, driver_b2d_cb, memsize, blocksizelimit]

        if not flags:
            return driver.cuOccupancyMaxPotentialBlockSize(*args)
        else:
            args.append(flags)
            return driver.cuOccupancyMaxPotentialBlockSizeWithFlags(*args)

    def prepare_for_use(self):
        """Initialize the context for use.
        It's safe to be called multiple times.
        """
        self.memory_manager.initialize()

    def push(self):
        """
        Pushes this context on the current CPU Thread.
        """
        driver.cuCtxPushCurrent(self.handle.value)
        self.prepare_for_use()

    def pop(self):
        """
        Pops this context off the current CPU thread. Note that this context
        must be at the top of the context stack, otherwise an error will occur.
        """
        popped = driver.pop_active_context()
        assert popped.value == self.handle.value

    def memalloc(self, bytesize):
        """Delegate to the memory manager's ``memalloc``."""
        return self.memory_manager.memalloc(bytesize)

    def memallocmanaged(self, bytesize, attach_global=True):
        """Delegate to the memory manager's ``memallocmanaged``."""
        return self.memory_manager.memallocmanaged(bytesize, attach_global)

    def memhostalloc(self, bytesize, mapped=False, portable=False, wc=False):
        """Delegate to the memory manager's ``memhostalloc``."""
        return self.memory_manager.memhostalloc(bytesize, mapped, portable, wc)

    def mempin(self, owner, pointer, size, mapped=False):
        """Delegate to the memory manager's ``mempin``.

        :raises CudaDriverError: if *mapped* is requested but the device
                                 reports it cannot map host memory.
        """
        if mapped and not self.device.CAN_MAP_HOST_MEMORY:
            raise CudaDriverError("%s cannot map host memory" % self.device)
        return self.memory_manager.mempin(owner, pointer, size, mapped)

    def get_ipc_handle(self, memory):
        """
        Returns an *IpcHandle* from a GPU allocation.

        :raises OSError: if ``SUPPORTS_IPC`` is false.
        """
        if not SUPPORTS_IPC:
            raise OSError("OS does not support CUDA IPC")
        return self.memory_manager.get_ipc_handle(memory)

    def open_ipc_handle(self, handle, size):
        """Open *handle* and wrap the resulting device pointer in a
        ``MemoryPointer`` of the given *size*.
        """
        # open the IPC handle to get the device pointer
        flags = 1  # CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
        dptr = driver.cuIpcOpenMemHandle(handle, flags)

        # wrap it
        return MemoryPointer(pointer=dptr, size=size)

    def enable_peer_access(self, peer_context, flags=0):
        """Enable peer access between the current context and the peer context"""
        assert flags == 0, "*flags* is reserved and MUST be zero"
        driver.cuCtxEnablePeerAccess(peer_context, flags)

    def can_access_peer(self, peer_device):
        """Returns a bool indicating whether the peer access between the
        current and peer device is possible.
        """
        peer_device = binding.CUdevice(peer_device)
        can_access_peer = driver.cuDeviceCanAccessPeer(
            self.device.id, peer_device
        )

        return bool(can_access_peer)

    def create_module_ptx(self, ptx):
        """Create a module from PTX given as ``str`` or bytes; a ``str`` is
        encoded as UTF-8 first.
        """
        if isinstance(ptx, str):
            ptx = ptx.encode("utf8")
        image = ObjectCode.from_ptx(ptx)
        return self.create_module_image(image)

    def create_module_image(
        self, image, setup_callbacks=None, teardown_callbacks=None
    ):
        """Load *image*, register the module in ``self.modules`` keyed by its
        handle, and return a weak proxy to it.
        """
        module = load_module_image(
            self, image, setup_callbacks, teardown_callbacks
        )
        key = module.handle
        self.modules[key] = module
        return weakref.proxy(module)

    def unload_module(self, module):
        """Remove *module* from ``self.modules``."""
        key = module.handle
        del self.modules[key]

    def get_default_stream(self):
        """Return a ``Stream`` wrapping ``CU_STREAM_DEFAULT``."""
        handle = drvapi.cu_stream(int(binding.CUstream(CU_STREAM_DEFAULT)))
        return Stream(handle)

    def get_legacy_default_stream(self):
        """Return a ``Stream`` wrapping ``CU_STREAM_LEGACY``."""
        handle = drvapi.cu_stream(
            int(binding.CUstream(binding.CU_STREAM_LEGACY))
        )
        return Stream(handle)

    def get_per_thread_default_stream(self):
        """Return a ``Stream`` wrapping ``CU_STREAM_PER_THREAD``."""
        handle = drvapi.cu_stream(
            int(binding.CUstream(binding.CU_STREAM_PER_THREAD))
        )
        return Stream(handle)

    def create_stream(self):
        """Create a new stream whose destruction is queued on this context's
        pending deallocations.
        """
        # The default stream creation flag, specifying that the created
        # stream synchronizes with stream 0 (this is different from the
        # default stream, which we define also as CU_STREAM_DEFAULT when
        # the NV binding is in use).
        flags = binding.CUstream_flags.CU_STREAM_DEFAULT.value
        handle = drvapi.cu_stream(int(driver.cuStreamCreate(flags)))
        return Stream(
            handle, finalizer=_stream_finalizer(self.deallocations, handle)
        )

    def create_external_stream(self, ptr):
        """Wrap an existing stream, given as an integer pointer, in a
        ``Stream`` marked as external.

        :raises TypeError: if *ptr* is not an ``int``.
        """
        if not isinstance(ptr, int):
            raise TypeError("ptr for external stream must be an int")
        handle = drvapi.cu_stream(int(binding.CUstream(ptr)))
        return Stream(handle, external=True)

    def create_event(self, timing=True):
        """Create an event; timing is disabled when *timing* is false. Its
        destruction is queued on this context's pending deallocations.
        """
        flags = 0
        if not timing:
            flags |= enums.CU_EVENT_DISABLE_TIMING
        handle = drvapi.cu_event(int(driver.cuEventCreate(flags)))
        return Event(
            handle, finalizer=_event_finalizer(self.deallocations, handle)
        )

    def synchronize(self):
        """Call ``cuCtxSynchronize``."""
        driver.cuCtxSynchronize()

    @contextlib.contextmanager
    def defer_cleanup(self):
        """Context manager that enters the memory manager's
        ``defer_cleanup()`` and disables flushing of this context's pending
        deallocations for the duration of the block.
        """
        with self.memory_manager.defer_cleanup():
            with self.deallocations.disable():
                yield

    def __repr__(self):
        return f"<CUDA context {self.handle} of device {self.device.id:d}>"

    def __eq__(self, other):
        # Two contexts are equal when they wrap the same handle; comparison
        # with any other type is left to the other operand.
        if isinstance(other, Context):
            return self.handle == other.handle
        else:
            return NotImplemented

    def __ne__(self, other):
        # Propagate NotImplemented instead of negating it: negating it made
        # ``ctx != <non-Context>`` evaluate to False (and is deprecated, then
        # an error, on newer Python versions).
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
1368
+
1369
+
1370
def load_module_image(
    context, image, setup_callbacks=None, teardown_callbacks=None
):
    """
    Load *image* into *context* and return the resulting module object.

    This is a thin entry point that forwards all four arguments, unchanged,
    to ``load_module_image_cuda_python``.
    """
    module = load_module_image_cuda_python(
        context, image, setup_callbacks, teardown_callbacks
    )
    return module
1379
+
1380
+
1381
def load_module_image_ctypes(
    context, image, setup_callbacks, teardown_callbacks
):
    """
    Load *image* with ``cuModuleLoadDataEx`` using ctypes-style arguments and
    return a ``CtypesModule``.

    Two log buffers of ``config.CUDA_LOG_SIZE`` bytes are handed to the JIT;
    the info log is stored on the returned module and the error log is
    embedded in the ``CudaAPIError`` raised when loading fails.
    """
    log_size = config.CUDA_LOG_SIZE

    info_buf = (c_char * log_size)()
    error_buf = (c_char * log_size)()

    # JIT option -> value; keys and values are split into two parallel
    # ctypes arrays below.
    jit_options = {
        enums.CU_JIT_INFO_LOG_BUFFER: addressof(info_buf),
        enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(log_size),
        enums.CU_JIT_ERROR_LOG_BUFFER: addressof(error_buf),
        enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(log_size),
        enums.CU_JIT_LOG_VERBOSE: c_void_p(config.CUDA_VERBOSE_JIT_LOG),
    }
    n_options = len(jit_options)

    option_keys = (drvapi.cu_jit_option * n_options)(*jit_options.keys())
    option_vals = (c_void_p * n_options)(*jit_options.values())
    handle = drvapi.cu_module()
    try:
        driver.cuModuleLoadDataEx(
            byref(handle), image, n_options, option_keys, option_vals
        )
    except CudaAPIError as e:
        # Re-raise with the JIT error log attached to the message.
        msg = "cuModuleLoadDataEx error:\n%s" % error_buf.value.decode("utf8")
        raise CudaAPIError(e.code, msg)

    info_log = info_buf.value

    return CtypesModule(
        weakref.proxy(context),
        handle,
        info_log,
        _module_finalizer(context, handle),
        setup_callbacks,
        teardown_callbacks,
    )
1418
+
1419
+
1420
def load_module_image_cuda_python(
    context, image, setup_callbacks, teardown_callbacks
):
    """
    Load ``image.code`` with ``cuModuleLoadDataEx`` using binding-style
    arguments and return a ``CudaPythonModule``.

    Two ``bytearray`` log buffers of ``config.CUDA_LOG_SIZE`` bytes are
    handed to the JIT; the decoded info log is stored on the returned module
    and the decoded error log is embedded in the ``CudaAPIError`` raised when
    loading fails.
    """
    log_size = config.CUDA_LOG_SIZE

    info_buf = bytearray(log_size)
    error_buf = bytearray(log_size)

    opt = binding.CUjit_option
    jit_options = {
        opt.CU_JIT_INFO_LOG_BUFFER: info_buf,
        opt.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: log_size,
        opt.CU_JIT_ERROR_LOG_BUFFER: error_buf,
        opt.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: log_size,
        opt.CU_JIT_LOG_VERBOSE: config.CUDA_VERBOSE_JIT_LOG,
    }

    # Parallel lists of option identifiers and their values.
    option_keys = list(jit_options)
    option_vals = list(jit_options.values())

    try:
        handle = driver.cuModuleLoadDataEx(
            image.code, len(jit_options), option_keys, option_vals
        )
    except CudaAPIError as e:
        # Re-raise with the JIT error log attached to the message.
        err_string = error_buf.decode("utf-8")
        msg = "cuModuleLoadDataEx error:\n%s" % err_string
        raise CudaAPIError(e.code, msg)

    info_log = info_buf.decode("utf-8")

    return CudaPythonModule(
        weakref.proxy(context),
        handle,
        info_log,
        _module_finalizer(context, handle),
        setup_callbacks,
        teardown_callbacks,
    )
1462
+
1463
+
1464
def _alloc_finalizer(memory_manager, ptr, alloc_key, size):
    """
    Build the finalizer for an allocation tracked by *memory_manager*.

    The returned callable drops *alloc_key* from the manager's allocations
    (when that mapping is non-empty) and queues ``cuMemFree(ptr)`` with the
    given *size* on the manager's pending deallocations.
    """
    live = memory_manager.allocations
    trash = memory_manager.deallocations

    def core():
        if live:
            # A missing key is tolerated.
            live.pop(alloc_key, None)
        trash.add_item(driver.cuMemFree, ptr, size)

    return core
1474
+
1475
+
1476
def _hostalloc_finalizer(memory_manager, ptr, alloc_key, size, mapped):
    """
    Finalize page-locked host memory allocated by `context.memhostalloc`.

    This memory is managed by CUDA, so finalizing it means deallocating it.
    The concerns described for `_pin_finalizer` do not apply here; the
    ``cuMemFreeHost`` call is placed in the pending-deallocations queue
    together with the finalization of device objects.

    For memory that is not *mapped*, the entry is queued with an unset size
    and the allocations mapping is left untouched.
    """
    live = memory_manager.allocations
    trash = memory_manager.deallocations
    queued_size = size if mapped else _SizeNotSet

    def core():
        if mapped and live:
            del live[alloc_key]
        trash.add_item(driver.cuMemFreeHost, ptr, queued_size)

    return core
1497
+
1498
+
1499
def _pin_finalizer(memory_manager, ptr, alloc_key, mapped):
    """
    Finalize temporary page-locking of host memory done by `context.mempin`.

    This covers memory that CUDA does not otherwise manage. The same memory
    may be page-locked more than once, so the lock has to be lifted as soon
    as finalization is requested; otherwise later `mempin` calls may fail
    with `CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED`, which leads to
    unexpected behavior for the context managers `cuda.{pinned,mapped}`.
    The returned callable therefore unregisters the memory immediately,
    bypassing the `context.deallocations` queue.
    """
    live = memory_manager.allocations

    def core():
        if mapped and live:
            del live[alloc_key]
        driver.cuMemHostUnregister(ptr)

    return core
1520
+
1521
+
1522
def _event_finalizer(deallocs, handle):
    """Build a finalizer that queues ``cuEventDestroy`` for *handle* on
    *deallocs*; ``handle.value`` is read when the finalizer runs.
    """

    def core():
        event = handle.value
        deallocs.add_item(driver.cuEventDestroy, event)

    return core
1527
+
1528
+
1529
def _stream_finalizer(deallocs, handle):
    """Build a finalizer that queues ``cuStreamDestroy`` for *handle* on
    *deallocs*; ``handle.value`` is read when the finalizer runs.
    """

    def core():
        stream = handle.value
        deallocs.add_item(driver.cuStreamDestroy, stream)

    return core
1534
+
1535
+
1536
def _module_finalizer(context, handle):
    """
    Build the finalizer for a module loaded into *context*.

    The returned callable queues an unload of *handle* on the context's
    pending deallocations rather than unloading immediately.
    """
    pending = context.deallocations
    loaded = context.modules
    key = handle

    def core():
        is_shutting_down = utils.shutting_down  # early bind

        def module_unload(handle):
            # Outside of interpreter shutdown, this can only run as a result
            # of Context.reset() or Context.unload_module(), and both of
            # those remove the module reference from the context first.
            assert is_shutting_down() or key not in loaded
            driver.cuModuleUnload(handle)

        pending.add_item(module_unload, handle)

    return core
1554
+
1555
+
1556
+ class _CudaIpcImpl(object):
1557
+ """Implementation of GPU IPC using CUDA driver API.
1558
+ This requires the devices to be peer accessible.
1559
+ """
1560
+
1561
+ def __init__(self, parent):
1562
+ self.base = parent.base
1563
+ self.handle = parent.handle
1564
+ self.size = parent.size
1565
+ self.offset = parent.offset
1566
+ # remember if the handle is already opened
1567
+ self._opened_mem = None
1568
+
1569
+ def open(self, context):
1570
+ """
1571
+ Import the IPC memory and returns a raw CUDA memory pointer object
1572
+ """
1573
+ if self.base is not None:
1574
+ raise ValueError("opening IpcHandle from original process")
1575
+
1576
+ if self._opened_mem is not None:
1577
+ raise ValueError("IpcHandle is already opened")
1578
+
1579
+ mem = context.open_ipc_handle(self.handle, self.offset + self.size)
1580
+ # this object owns the opened allocation
1581
+ # note: it is required the memory be freed after the ipc handle is
1582
+ # closed by the importing context.
1583
+ self._opened_mem = mem
1584
+ return mem.own().view(self.offset)
1585
+
1586
+ def close(self):
1587
+ if self._opened_mem is None:
1588
+ raise ValueError("IpcHandle not opened")
1589
+ driver.cuIpcCloseMemHandle(self._opened_mem.handle)
1590
+ self._opened_mem = None
1591
+
1592
+
1593
class _StagedIpcImpl(object):
    """Implementation of GPU IPC using custom staging logic to workaround
    CUDA IPC limitation on peer accessibility between devices.
    """

    def __init__(self, parent, source_info):
        self.parent = parent
        self.base = parent.base
        self.handle = parent.handle
        self.size = parent.size
        self.source_info = source_info

    def open(self, context):
        """
        Open the IPC memory on its source device, copy it into a fresh
        allocation made through *context*, and return that allocation.

        The IPC mapping on the source device is only needed for the copy and
        is closed again before returning, including when the allocation or
        the copy raises.
        """
        from numba import cuda

        srcdev = Device.from_identity(self.source_info)
        srcdev_id = int(srcdev.id)

        impl = _CudaIpcImpl(parent=self.parent)
        # Open context on the source device.
        with cuda.gpus[srcdev_id]:
            source_ptr = impl.open(cuda.devices.get_context())

        try:
            # Allocate GPU buffer.
            newmem = context.memalloc(self.size)
            # Do D->D from the source peer-context
            # This performs automatic host staging
            device_to_device(newmem, source_ptr, self.size)
        finally:
            # Cleanup source context. Doing this in ``finally`` keeps the
            # opened IPC mapping from leaking when the allocation or the
            # copy fails.
            with cuda.gpus[srcdev_id]:
                impl.close()

        return newmem

    def close(self):
        # Nothing has to be done here: open() already closed the source
        # mapping.
        pass
1631
+
1632
+
1633
class IpcHandle(object):
    """
    CUDA IPC handle. This class also implements serialization of the handle.

    :param base: A reference to the original allocation to keep it alive
    :type base: MemoryPointer
    :param handle: The CUDA IPC handle, as a ctypes array of bytes.
    :param size: Size of the original allocation
    :type size: int
    :param source_info: The identity of the device on which the IPC handle was
                        opened.
    :type source_info: dict
    :param offset: The offset into the underlying allocation of the memory
                   referred to by this IPC handle.
    :type offset: int
    """

    def __init__(self, base, handle, size, source_info=None, offset=0):
        self.base = base
        self.handle = handle
        self.size = size
        self.source_info = source_info
        self.offset = offset
        # Implementation object in use while the handle is open, else None.
        self._impl = None

    def _sentry_source_info(self):
        # Guard for operations that need to know the source device.
        if self.source_info is None:
            raise RuntimeError("IPC handle doesn't have source info")

    def can_access_peer(self, context):
        """Returns a bool indicating whether the active context can peer
        access the IPC handle
        """
        self._sentry_source_info()
        same_device = self.source_info == context.device.get_device_identity()
        if same_device:
            return True
        source = Device.from_identity(self.source_info)
        return context.can_access_peer(source.id)

    def _sentry_not_opened(self):
        # Guard shared by both open variants.
        if self._impl is not None:
            raise ValueError("IpcHandle is already opened")

    def open_staged(self, context):
        """Open the IPC by allowing staging on the host memory first."""
        self._sentry_source_info()
        self._sentry_not_opened()

        impl = _StagedIpcImpl(self, self.source_info)
        self._impl = impl
        return impl.open(context)

    def open_direct(self, context):
        """
        Import the IPC memory and returns a raw CUDA memory pointer object
        """
        self._sentry_not_opened()

        impl = _CudaIpcImpl(self)
        self._impl = impl
        return impl.open(context)

    def open(self, context):
        """Open the IPC handle and import the memory for usage in the given
        context.  Returns a raw CUDA memory pointer object.

        Unlike plain CUDA IPC, this works whether or not the source device is
        peer-accessible from the destination device:
        ``.open_direct()`` is used when there is no source info or the
        devices are peer-accessible, ``.open_staged()`` otherwise.
        """
        direct = self.source_info is None or self.can_access_peer(context)
        opener = self.open_direct if direct else self.open_staged
        return opener(context)

    def open_array(self, context, shape, dtype, strides=None):
        """
        Similar to `.open()` but returns an device array.
        """
        from . import devicearray

        # When no strides are given, default to the dtype's item size.
        if strides is None:
            strides = dtype.itemsize
        gpu_data = self.open(context)
        # Interpret the opened device memory as an array.
        return devicearray.DeviceNDArray(
            shape=shape, strides=strides, dtype=dtype, gpu_data=gpu_data
        )

    def close(self):
        """
        Close the opened handle.

        :raises ValueError: if the handle is not currently opened.
        """
        impl = self._impl
        if impl is None:
            raise ValueError("IpcHandle not opened")
        impl.close()
        self._impl = None

    def __reduce__(self):
        # Only the handle's ``reserved`` payload is serialized, alongside the
        # size, source info and offset; ``base`` is not carried over.
        state = (
            self.__class__,
            self.handle.reserved,
            self.size,
            self.source_info,
            self.offset,
        )
        return (serialize._rebuild_reduction, state)

    @classmethod
    def _rebuild(cls, handle_ary, size, source_info, offset):
        # Counterpart of __reduce__: recreate the binding handle from the
        # serialized payload. The rebuilt object has no ``base``.
        rebuilt = binding.CUipcMemHandle()
        rebuilt.reserved = handle_ary
        return cls(
            base=None,
            handle=rebuilt,
            size=size,
            source_info=source_info,
            offset=offset,
        )
1752
+
1753
+
1754
class MemoryPointer:
    """A memory pointer that owns a buffer, with an optional finalizer. Memory
    pointers provide reference counting, and instances are initialized with a
    reference count of 1.

    The base ``MemoryPointer`` class does not use the reference count to
    manage the buffer lifetime. The buffer lifetime follows the lifetime of
    the memory pointer instance instead:

    - When the instance is deleted, the finalizer will be called.
    - When the reference count drops to 0, no action is taken.

    Subclasses of ``MemoryPointer`` may modify these semantics, for example to
    tie the buffer lifetime to the reference count, so that the buffer is freed
    when there are no more references.

    :param pointer: The address of the buffer.
    :type pointer: ctypes.c_void_p
    :param size: The size of the allocation in bytes.
    :type size: int
    :param owner: The owner is sometimes set by the internals of this class, or
                  used for Numba's internal memory management. It should not be
                  provided by an external user of the ``MemoryPointer`` class
                  (e.g. from within an EMM Plugin); the default of `None`
                  should always suffice.
    :type owner: NoneType
    :param finalizer: A function that is called when the buffer is to be freed.
    :type finalizer: function
    """

    __cuda_memory__ = True

    def __init__(self, pointer, size, owner=None, finalizer=None):
        # A ctypes void pointer is converted to the binding's pointer type.
        if isinstance(pointer, ctypes.c_void_p):
            pointer = binding.CUdeviceptr(pointer.value)

        has_finalizer = finalizer is not None

        self.device_pointer = pointer
        self.size = size
        self._cuda_memsize_ = size
        self.is_managed = has_finalizer
        self.refct = 1
        self.handle = self.device_pointer
        self._owner = owner

        if has_finalizer:
            self._finalizer = weakref.finalize(self, finalizer)

    @property
    def owner(self):
        # Without an explicit owner, the pointer owns itself.
        if self._owner is None:
            return self
        return self._owner

    def own(self):
        """Return an ``OwnedPointer`` holding a weak proxy to this pointer."""
        return OwnedPointer(weakref.proxy(self))

    def free(self):
        """
        Forces the device memory to the trash.

        Has no effect when no finalizer was given.

        :raises RuntimeError: if the finalizer has already run.
        """
        if not self.is_managed:
            return
        if not self._finalizer.alive:
            raise RuntimeError("Freeing dead memory")
        self._finalizer()
        assert not self._finalizer.alive

    def memset(self, byte, count=None, stream=0):
        """Set *count* bytes (the whole buffer by default) to *byte*, on
        *stream* when one is given.
        """
        if count is None:
            count = self.size
        if not stream:
            driver.cuMemsetD8(self.device_pointer, byte, count)
            return
        stream_handle = stream.handle.value
        driver.cuMemsetD8Async(self.device_pointer, byte, count, stream_handle)

    def view(self, start, stop=None):
        """Return a view of the bytes from *start* up to *stop* (the end of
        the buffer by default).

        :raises RuntimeError: for a non-empty view into a NULL/empty buffer,
                              or when the resulting size is negative.
        """
        end = self.size if stop is None else stop
        size = end - start

        address = self.device_pointer_value
        if not address:
            # NULL/empty memory buffer: only an empty view is possible, and
            # it is simply a reference to self.
            if size != 0:
                raise RuntimeError("non-empty slice into empty slice")
            view = self
        else:
            # Normal case: build a new pointer at the offset address.
            base = address + start
            if size < 0:
                raise RuntimeError("size cannot be negative")
            pointer = binding.CUdeviceptr()
            ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr())
            ctypes_ptr.value = base
            view = MemoryPointer(pointer, size, owner=self.owner)

        owner = self.owner
        if isinstance(owner, (MemoryPointer, OwnedPointer)):
            # Owned by a numba-managed memory segment, take an owned reference
            return OwnedPointer(weakref.proxy(owner), view)
        # Owned by external alloc, return view with same external owner
        return view

    @property
    def device_ctypes_pointer(self):
        address = int(self.device_pointer)
        return drvapi.cu_device_ptr(address)

    @property
    def device_pointer_value(self):
        # The address as an int, or None for a NULL pointer.
        return int(self.device_pointer) or None
1861
+
1862
+
1863
class AutoFreePointer(MemoryPointer):
    """Variant of MemoryPointer whose lifetime is tied directly to the number
    of references.

    The finalizer is invoked once the reference count reaches zero.

    Constructor arguments are the same as for :class:`MemoryPointer`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Drop the initial self-reference so that the count reflects only
        # derived pointers; once they are all gone the finalizer is invoked.
        self.refct -= 1
1877
+
1878
+
1879
class MappedMemory(AutoFreePointer):
    """A memory pointer that refers to a buffer on the host that is mapped into
    device memory.

    :param pointer: The address of the buffer.
    :type pointer: ctypes.c_void_p
    :param size: The size of the buffer in bytes.
    :type size: int
    :param owner: The owner is sometimes set by the internals of this class, or
                  used for Numba's internal memory management. It should not be
                  provided by an external user of the ``MappedMemory`` class
                  (e.g. from within an EMM Plugin); the default of `None`
                  should always suffice.
    :type owner: NoneType
    :param finalizer: A function that is called when the buffer is to be freed.
    :type finalizer: function
    """

    __cuda_memory__ = True

    def __init__(self, pointer, size, owner=None, finalizer=None):
        self.owned = owner
        self.host_pointer = pointer

        # Look up the device-side address of the mapped host buffer.
        device_side = driver.cuMemHostGetDevicePointer(pointer, 0)
        self._bufptr_ = self.host_pointer

        self.device_pointer = device_side
        super().__init__(device_side, size, finalizer=finalizer)
        # The base initializer stores the device pointer as the handle; for
        # mapped memory the handle is the host pointer.
        self.handle = self.host_pointer

        # For buffer interface
        self._buflen_ = self.size

    def own(self):
        """Return a ``MappedOwnedPointer`` holding a weak proxy to this
        pointer.
        """
        return MappedOwnedPointer(weakref.proxy(self))
1917
+
1918
+
1919
class PinnedMemory(mviewbuf.MemAlloc):
    """A pointer to a pinned buffer on the host.

    :param pointer: The address of the buffer.
    :type pointer: ctypes.c_void_p
    :param size: The size of the buffer in bytes.
    :type size: int
    :param owner: An object owning the buffer that has been pinned. For EMM
                  plugin implementation, the default of ``None`` suffices for
                  memory allocated in ``memhostalloc`` - for ``mempin``, it
                  should be the owner passed in to the ``mempin`` method.
    :param finalizer: A function that is called when the buffer is to be freed.
    :type finalizer: function
    """

    def __init__(self, pointer, size, owner=None, finalizer=None):
        has_finalizer = finalizer is not None

        self.owned = owner
        self.size = size
        self.host_pointer = pointer
        self.is_managed = has_finalizer
        self.handle = self.host_pointer

        # For buffer interface
        self._buflen_ = self.size
        self._bufptr_ = self.host_pointer

        if has_finalizer:
            # Run the finalizer when this object is garbage collected.
            weakref.finalize(self, finalizer)

    def own(self):
        """Return this object itself."""
        return self
1954
+
1955
+
1956
class ManagedMemory(AutoFreePointer):
    """A memory pointer that refers to a managed memory buffer (can be accessed
    on both host and device).

    :param pointer: The address of the buffer.
    :type pointer: ctypes.c_void_p
    :param size: The size of the buffer in bytes.
    :type size: int
    :param owner: The owner is sometimes set by the internals of this class, or
                  used for Numba's internal memory management. It should not be
                  provided by an external user of the ``ManagedMemory`` class
                  (e.g. from within an EMM Plugin); the default of `None`
                  should always suffice.
    :type owner: NoneType
    :param finalizer: A function that is called when the buffer is to be freed.
    :type finalizer: function
    """

    __cuda_memory__ = True

    def __init__(self, pointer, size, owner=None, finalizer=None):
        self.owned = owner
        # The given pointer is used directly as the device pointer.
        super().__init__(pointer, size, finalizer=finalizer)

        # For buffer interface
        self._buflen_ = self.size
        self._bufptr_ = self.device_pointer

    def own(self):
        """Return a ``ManagedOwnedPointer`` holding a weak proxy to this
        pointer.
        """
        return ManagedOwnedPointer(weakref.proxy(self))
1989
+
1990
+
1991
class OwnedPointer:
    """Reference-counting handle onto a memory pointer.

    Creating an instance increments ``memptr.refct``. When the instance is
    finalized the count is decremented again, and ``memptr.free()`` is called
    once it reaches zero. Attributes not found on the instance itself are
    looked up on the view (``view`` if given, otherwise ``memptr``).

    :param memptr: The memory pointer whose ``refct`` is tracked.
    :param view: Optional object that attribute access is forwarded to; its
                 ``is_managed`` attribute must be false.
    """

    def __init__(self, memptr, view=None):
        self._mem = memptr

        if view is not None:
            assert not view.is_managed
            self._view = view
        else:
            self._view = memptr

        def release():
            # Drop the reference taken below; free on the last release.
            try:
                memptr.refct -= 1
                assert memptr.refct >= 0
                if memptr.refct == 0:
                    memptr.free()
            except ReferenceError:
                # memptr may be a weak proxy whose referent is already gone;
                # there is nothing left to release in that case.
                pass

        memptr.refct += 1
        weakref.finalize(self, release)

        # Cache this attribute on the instance: it is used often, and going
        # through `__getattr__` for every access adds overhead.
        self.device_ctypes_pointer = self._view.device_ctypes_pointer

    def __getattr__(self, fname):
        """Proxy MemoryPointer methods"""
        target = self._view
        return getattr(target, fname)
2023
+
2024
+
2025
class MappedOwnedPointer(OwnedPointer, mviewbuf.MemAlloc):
    """An ``OwnedPointer`` that also derives from ``mviewbuf.MemAlloc``.

    Adds no behaviour of its own.
    """
2027
+
2028
+
2029
class ManagedOwnedPointer(OwnedPointer, mviewbuf.MemAlloc):
    """An ``OwnedPointer`` that also derives from ``mviewbuf.MemAlloc``.

    Adds no behaviour of its own; ``ManagedMemory.own()`` returns instances
    of this class.
    """
2031
+
2032
+
2033
class Stream:
    """Wrapper around a CUDA stream handle.

    :param handle: The stream handle; the methods of this class read its
                   ``value`` attribute.
    :param finalizer: Optional callable invoked when this object is
                      finalized.
    :param external: Flag recorded on the instance; within this class it only
                     affects the text produced by ``__repr__``.
    """

    def __init__(self, handle, finalizer=None, external=False):
        self.handle = handle
        self.external = external
        if finalizer is not None:
            weakref.finalize(self, finalizer)

    def __int__(self):
        # A falsy handle value is replaced by the default-stream constant.
        return self.handle.value or drvapi.CU_STREAM_DEFAULT

    def __cuda_stream__(self):
        """Return a 2-tuple ``(0, stream_value)``, substituting the
        default-stream constant when the handle value is falsy."""
        return (0, self.handle.value or drvapi.CU_STREAM_DEFAULT)

    def __repr__(self):
        ptr = self.handle.value or drvapi.CU_STREAM_DEFAULT

        # The well-known streams get fixed descriptions.
        well_known = {
            drvapi.CU_STREAM_DEFAULT: "<Default CUDA stream>",
            drvapi.CU_STREAM_LEGACY: "<Legacy default CUDA stream>",
            drvapi.CU_STREAM_PER_THREAD: "<Per-thread default CUDA stream>",
        }
        description = well_known.get(ptr)
        if description is not None:
            return description

        if self.external:
            return f"<External CUDA stream {ptr:d}>"
        return f"<CUDA stream {ptr:d}>"

    def synchronize(self):
        """
        Wait for all commands in this stream to execute. This will commit any
        pending memory transfers.
        """
        driver.cuStreamSynchronize(self.handle.value)

    @contextlib.contextmanager
    def auto_synchronize(self):
        """
        A context manager that waits for all commands in this stream to execute
        and commits any pending memory transfers upon exiting the context.
        """
        # The synchronize call is only reached when the managed block exits
        # without raising.
        yield self
        self.synchronize()

    def add_callback(self, callback, arg=None):
        """
        Add a callback to a compute stream.
        The user provided function is called from a driver thread once all
        preceding stream operations are complete.

        Callback functions are called from a CUDA driver thread, not from
        the thread that invoked `add_callback`. No CUDA API functions may
        be called from within the callback function.

        The duration of a callback function should be kept short, as the
        callback will block later work in the stream and may block other
        callbacks from being executed.

        .. warning::
            There is a potential for deadlock due to a lock ordering issue
            between the GIL and the CUDA driver lock when using libraries
            that call CUDA functions without releasing the GIL. This can
            occur when the callback function, which holds the CUDA driver lock,
            attempts to acquire the GIL while another thread that holds the GIL
            is waiting for the CUDA driver lock. Consider using libraries that
            properly release the GIL around CUDA operations or restructure
            your code to avoid this situation.

        Note: The driver function underlying this method is marked for
        eventual deprecation and may be replaced in a future CUDA release.

        :param callback: Callback function with arguments (stream, status, arg).
        :param arg: Optional user data to be passed to the callback function.
        """
        payload = (self, callback, arg)
        # Take an extra reference on the payload; `_stream_callback` drops it
        # again after the user callback has run.
        _py_incref(payload)
        # Read the raw function address out of the callback object's bytes.
        fn_address = int.from_bytes(self._stream_callback, byteorder="little")
        stream_callback = binding.CUstreamCallback(fn_address)
        # The callback needs to receive a pointer to the payload PyObject
        payload_address = id(payload)
        driver.cuStreamAddCallback(
            self.handle.value, stream_callback, payload_address, 0
        )

    @staticmethod
    @cu_stream_callback_pyobj
    def _stream_callback(handle, status, data):
        # `data` is the (stream, callback, arg) tuple built by add_callback.
        try:
            stream, callback, arg = data
            callback(stream, status, arg)
        except Exception as e:
            # Report the failure as a warning instead of propagating it.
            warnings.warn(f"Exception in stream callback: {e}")
        finally:
            # Release the reference taken in add_callback.
            _py_decref(data)

    def async_done(self) -> asyncio.futures.Future:
        """
        Return an awaitable that resolves once all preceding stream operations
        are complete. The result of the awaitable is the current stream.

        .. warning::
            There is a potential for deadlock due to a lock ordering issue
            between the GIL and the CUDA driver lock when using libraries
            that call CUDA functions without releasing the GIL. This can
            occur when the callback function (internally used by this method),
            which holds the CUDA driver lock, attempts to acquire the GIL
            while another thread that holds the GIL is waiting for the CUDA driver lock.
            Consider using libraries that properly release the GIL around
            CUDA operations or restructure your code to avoid this situation.
        """
        loop = asyncio.get_running_loop()
        future = loop.create_future()

        def resolver(future, status):
            # Runs on the event loop; a future that is already done is left
            # untouched.
            if future.done():
                return
            if status == 0:
                future.set_result(self)
            else:
                future.set_exception(Exception(f"Stream error {status}"))

        def callback(stream, status, future):
            # Runs off the event loop thread, so hand over to the loop.
            loop.call_soon_threadsafe(resolver, future, status)

        self.add_callback(callback, future)
        return future
2162
+
2163
+
2164
class Event:
    """Wrapper around a CUDA event handle.

    :param handle: The event handle.
    :param finalizer: Optional callable invoked when this object is
                      finalized.
    """

    def __init__(self, handle, finalizer=None):
        self.handle = handle
        if finalizer is not None:
            weakref.finalize(self, finalizer)

    def query(self):
        """
        Returns True if all work before the most recent record has completed;
        otherwise, returns False.
        """
        # NOTE(review): this passes the handle object itself, whereas the
        # other methods pass ``self.handle.value`` - confirm this is intended.
        try:
            driver.cuEventQuery(self.handle)
        except CudaAPIError as e:
            # "Not ready" is the expected answer while work is pending; any
            # other driver error is propagated.
            if e.code != enums.CUDA_ERROR_NOT_READY:
                raise
            return False
        return True

    def record(self, stream=0):
        """
        Set the record point of the event to the current point in the given
        stream.

        The event will be considered to have occurred when all work that was
        queued in the stream at the time of the call to ``record()`` has been
        completed.
        """
        hstream = _stream_handle(stream)
        driver.cuEventRecord(self.handle.value, hstream)

    def synchronize(self):
        """
        Synchronize the host thread for the completion of the event.
        """
        driver.cuEventSynchronize(self.handle.value)

    def wait(self, stream=0):
        """
        All future works submitted to stream will wait until the event
        completes.
        """
        hstream = _stream_handle(stream)
        event_handle = self.handle.value
        no_flags = 0
        driver.cuStreamWaitEvent(hstream, event_handle, no_flags)

    def elapsed_time(self, evtend):
        """Return ``event_elapsed_time(self, evtend)``."""
        return event_elapsed_time(self, evtend)
2216
+
2217
+
2218
def event_elapsed_time(evtstart, evtend):
    """
    Compute the elapsed time between two events in milliseconds.

    :param evtstart: The starting event; ``evtstart.handle.value`` is used.
    :param evtend: The ending event; ``evtend.handle.value`` is used.
    """
    start_handle = evtstart.handle.value
    end_handle = evtend.handle.value
    return driver.cuEventElapsedTime(start_handle, end_handle)
2223
+
2224
+
2225
class Module(metaclass=ABCMeta):
    """Abstract base class for modules

    :param context: The context object; ``unload`` calls its
                    ``unload_module`` method.
    :param handle: The module handle; passed to every setup and teardown
                   callback.
    :param info_log: Stored on the instance as ``info_log``.
    :param finalizer: Optional callable invoked when this object is
                      finalized.
    :param setup_callbacks: Optional iterable of callables run by ``setup``.
    :param teardown_callbacks: Optional iterable of callables run when this
                               object is finalized.
    """

    def __init__(
        self,
        context,
        handle,
        info_log,
        finalizer=None,
        setup_callbacks=None,
        teardown_callbacks=None,
    ):
        self.context = context
        self.handle = handle
        self.info_log = info_log
        if finalizer is not None:
            self._finalizer = weakref.finalize(self, finalizer)

        self.initialized = False
        self.setup_functions = setup_callbacks
        self.teardown_functions = teardown_callbacks

        # Registered after the main finalizer above.
        self._set_finalizers()

    def unload(self):
        """Unload this module from the context"""
        self.context.unload_module(self)

    @abstractmethod
    def get_function(self, name):
        """Returns a Function object encapsulating the named function"""

    @abstractmethod
    def get_global_symbol(self, name):
        """Return a MemoryPointer referring to the named symbol"""

    def setup(self):
        """Call the setup functions for the module"""
        if self.initialized:
            raise RuntimeError("The module has already been initialized.")

        callbacks = self.setup_functions
        if callbacks is None:
            # Nothing to run; the module is not marked initialized here.
            return

        for callback in callbacks:
            callback(self.handle)

        self.initialized = True

    def _set_finalizers(self):
        """Create finalizers that tear down the module."""
        teardowns = self.teardown_functions
        if teardowns is None:
            return

        def run_teardowns(callbacks, handle):
            # Receives the callbacks and handle as arguments, so it keeps no
            # reference to the module itself.
            for callback in callbacks:
                callback(handle)

        weakref.finalize(self, run_teardowns, teardowns, self.handle)
2289
+
2290
+
2291
class CtypesModule(Module):
    """Module implementation that calls the driver through ctypes
    out-parameters."""

    def get_function(self, name):
        """Returns a Function object encapsulating the named function"""
        handle = drvapi.cu_function()
        driver.cuModuleGetFunction(
            byref(handle), self.handle, name.encode("utf8")
        )
        # The function only holds a weak proxy to this module.
        return CtypesFunction(weakref.proxy(self), handle, name)

    def get_global_symbol(self, name):
        """Return a MemoryPointer referring to the named symbol, together
        with the symbol's size as an int."""
        ptr = drvapi.cu_device_ptr()
        size = drvapi.c_size_t()
        driver.cuModuleGetGlobal(
            byref(ptr), byref(size), self.handle, name.encode("utf8")
        )
        # Hand the plain integer byte count to MemoryPointer rather than the
        # ctypes wrapper object, matching what CudaPythonModule passes.
        nbytes = size.value
        return MemoryPointer(ptr, nbytes), nbytes
2306
+
2307
+
2308
class CudaPythonModule(Module):
    """Module implementation for driver calls that return their results
    directly."""

    def get_function(self, name):
        """Returns a Function object encapsulating the named function"""
        encoded_name = name.encode("utf8")
        handle = driver.cuModuleGetFunction(self.handle, encoded_name)
        # The function only holds a weak proxy to this module.
        return CudaPythonFunction(weakref.proxy(self), handle, name)

    def get_global_symbol(self, name):
        """Return a MemoryPointer referring to the named symbol, together
        with the size reported by the driver."""
        encoded_name = name.encode("utf8")
        ptr, size = driver.cuModuleGetGlobal(self.handle, encoded_name)
        return MemoryPointer(ptr, size), size
2316
+
2317
+
2318
# Record of the per-function attributes gathered by `read_func_attr_all`.
FuncAttr = namedtuple(
    "FuncAttr",
    ("regs", "shared", "local", "const", "maxthreads"),
)
2321
+
2322
+
2323
class Function(metaclass=ABCMeta):
    """Abstract base class for functions obtained from a module.

    :param module: The module the function belongs to; the ``device``
                   property reads ``module.context.device``.
    :param handle: The function handle.
    :param name: The function's name, used in ``__repr__``.
    """

    # Class-level launch defaults
    griddim = 1, 1, 1
    blockdim = 1, 1, 1
    stream = 0
    sharedmem = 0

    def __init__(self, module, handle, name):
        self.module = module
        self.handle = handle
        self.name = name
        # Read all attributes once at construction; the handle must already
        # be set because the subclass implementation uses it.
        self.attrs = self.read_func_attr_all()

    def __repr__(self):
        return "<CUDA function %s>" % self.name

    @property
    def device(self):
        """The device of the owning module's context."""
        context = self.module.context
        return context.device

    @abstractmethod
    def cache_config(
        self, prefer_equal=False, prefer_cache=False, prefer_shared=False
    ):
        """Set the cache configuration for this function."""

    @abstractmethod
    def read_func_attr(self, attrid):
        """Return the value of the attribute with given ID."""

    @abstractmethod
    def read_func_attr_all(self):
        """Return a FuncAttr object with the values of various function
        attributes."""
2356
+
2357
+
2358
class CtypesFunction(Function):
    """Function implementation that calls the driver through ctypes
    out-parameters."""

    def cache_config(
        self, prefer_equal=False, prefer_cache=False, prefer_shared=False
    ):
        """Set the cache configuration for this function.

        Asking for both cache and shared preference is treated as
        ``prefer_equal``; with no preference the "none" setting is used.
        """
        if prefer_equal or (prefer_cache and prefer_shared):
            flag = enums.CU_FUNC_CACHE_PREFER_EQUAL
        elif prefer_cache:
            flag = enums.CU_FUNC_CACHE_PREFER_L1
        elif prefer_shared:
            flag = enums.CU_FUNC_CACHE_PREFER_SHARED
        else:
            flag = enums.CU_FUNC_CACHE_PREFER_NONE
        driver.cuFuncSetCacheConfig(self.handle, flag)

    def read_func_attr(self, attrid):
        """Return the value of the attribute with given ID."""
        result = c_int()
        driver.cuFuncGetAttribute(byref(result), attrid, self.handle)
        return result.value

    def read_func_attr_all(self):
        """Return a FuncAttr object with the values of various function
        attributes."""
        read = self.read_func_attr
        # Keyword arguments are evaluated left to right, so the driver is
        # queried in this order: regs, const, local, shared, maxthreads.
        return FuncAttr(
            regs=read(enums.CU_FUNC_ATTRIBUTE_NUM_REGS),
            const=read(enums.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES),
            local=read(enums.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES),
            shared=read(enums.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES),
            maxthreads=read(enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK),
        )
2389
+
2390
+
2391
class CudaPythonFunction(Function):
    """Function implementation for driver calls that return their results
    directly and take binding enum members."""

    def cache_config(
        self, prefer_equal=False, prefer_cache=False, prefer_shared=False
    ):
        """Set the cache configuration for this function.

        Asking for both cache and shared preference is treated as
        ``prefer_equal``; with no preference the "none" setting is used.
        """
        prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
        # The CU_FUNC_CACHE_PREFER_* members belong to the CUfunc_cache enum;
        # CUfunction_attribute only holds the CU_FUNC_ATTRIBUTE_* members, so
        # looking them up there fails.
        cache = binding.CUfunc_cache
        if prefer_equal:
            flag = cache.CU_FUNC_CACHE_PREFER_EQUAL
        elif prefer_cache:
            flag = cache.CU_FUNC_CACHE_PREFER_L1
        elif prefer_shared:
            flag = cache.CU_FUNC_CACHE_PREFER_SHARED
        else:
            flag = cache.CU_FUNC_CACHE_PREFER_NONE
        driver.cuFuncSetCacheConfig(self.handle, flag)

    def read_func_attr(self, attrid):
        """Return the value of the attribute with given ID."""
        return driver.cuFuncGetAttribute(attrid, self.handle)

    def read_func_attr_all(self):
        """Return a FuncAttr object with the values of various function
        attributes."""
        attr = binding.CUfunction_attribute
        nregs = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_NUM_REGS)
        cmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
        lmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
        smem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
        maxtpb = self.read_func_attr(
            attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
        )
        return FuncAttr(
            regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
        )
2422
+
2423
+
2424
def launch_kernel(
    cufunc_handle,
    gx,
    gy,
    gz,
    bx,
    by,
    bz,
    sharedmem,
    hstream,
    args,
    cooperative=False,
):
    """Launch a kernel through the driver.

    :param cufunc_handle: Handle of the function to launch.
    :param gx, gy, gz: Grid dimensions.
    :param bx, by, bz: Block dimensions.
    :param sharedmem: Shared memory size forwarded to the driver.
    :param hstream: Stream handle forwarded to the driver.
    :param args: Iterable of ctypes objects; the address of each one is
                 passed as a kernel parameter.
    :param cooperative: If true, ``cuLaunchCooperativeKernel`` is used
                        instead of ``cuLaunchKernel``.
    """
    arg_addresses = list(map(addressof, args))
    # Array of pointers to the individual arguments. It is a local, so it
    # stays alive for the duration of the driver call below.
    params = (c_void_p * len(arg_addresses))(*arg_addresses)

    launch_args = (
        cufunc_handle,
        gx,
        gy,
        gz,
        bx,
        by,
        bz,
        sharedmem,
        hstream,
        addressof(params),
    )

    if cooperative:
        driver.cuLaunchCooperativeKernel(*launch_args)
    else:
        # The plain launch takes one more trailing argument ("extra"),
        # always passed as 0 here.
        driver.cuLaunchKernel(*launch_args, 0)
2470
+
2471
+
2472
+ class _LinkerBase(metaclass=ABCMeta):
2473
+ """Abstract base class for linkers"""
2474
+
2475
+ @classmethod
2476
+ def new(
2477
+ cls,
2478
+ max_registers=0,
2479
+ lineinfo=False,
2480
+ cc=None,
2481
+ lto=None,
2482
+ additional_flags=None,
2483
+ ):
2484
+ linker = _Linker
2485
+
2486
+ params = (max_registers, lineinfo, cc)
2487
+ if linker is _Linker:
2488
+ params = (*params, lto, additional_flags)
2489
+ else:
2490
+ if lto or additional_flags:
2491
+ raise ValueError("LTO and additional flags require nvjitlink")
2492
+
2493
+ return linker(*params)
2494
+
2495
+ @abstractmethod
2496
+ def __init__(self, max_registers, lineinfo, cc):
2497
+ # LTO unsupported in Numba at present, but the pynvjitlink linker
2498
+ # (https://github.com/rapidsai/pynvjitlink) supports it,
2499
+ self.lto = False
2500
+
2501
+ @property
2502
+ @abstractmethod
2503
+ def info_log(self):
2504
+ """Return the info log from the linker invocation"""
2505
+
2506
+ @property
2507
+ @abstractmethod
2508
+ def error_log(self):
2509
+ """Return the error log from the linker invocation"""
2510
+
2511
+ @abstractmethod
2512
+ def add_ptx(self, ptx, name):
2513
+ """Add PTX source in a string to the link"""
2514
+
2515
+ def add_cu(self, cu, name):
2516
+ """Add CUDA source in a string to the link. The name of the source
2517
+ file should be specified in `name`."""
2518
+ ptx, log = nvrtc.compile(cu, name, self.cc)
2519
+
2520
+ if config.DUMP_ASSEMBLY:
2521
+ print(("ASSEMBLY %s" % name).center(80, "-"))
2522
+ print(ptx)
2523
+ print("=" * 80)
2524
+
2525
+ # Link the program's PTX using the normal linker mechanism
2526
+ ptx_name = os.path.splitext(name)[0] + ".ptx"
2527
+ self.add_ptx(ptx.encode(), ptx_name)
2528
+
2529
+ @abstractmethod
2530
+ def add_data(self, data, kind, name):
2531
+ """Add in-memory data to the link"""
2532
+
2533
+ @abstractmethod
2534
+ def add_file(self, path, kind):
2535
+ """Add code from a file to the link"""
2536
+
2537
+ def add_cu_file(self, path):
2538
+ cu = cached_file_read(path, how="rb")
2539
+ self.add_cu(cu, os.path.basename(path))
2540
+
2541
+ def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
2542
+ """
2543
+ Add a file or LinkableCode object to the link. If a file is
2544
+ passed, the type will be inferred from the extension. A LinkableCode
2545
+ object represents a file already in memory.
2546
+
2547
+ When `ignore_nonlto` is set to true, do not add code that will not
2548
+ be LTO-ed in the linking process. This is useful in inspecting the
2549
+ LTO-ed portion of the PTX when linker is added with objects that can be
2550
+ both LTO-ed and not LTO-ed.
2551
+ """
2552
+ if isinstance(path_or_code, str):
2553
+ ext = pathlib.Path(path_or_code).suffix
2554
+ if ext == "":
2555
+ raise RuntimeError(
2556
+ "Don't know how to link file with no extension"
2557
+ )
2558
+ elif ext == ".cu":
2559
+ self.add_cu_file(path_or_code)
2560
+ else:
2561
+ kind = FILE_EXTENSION_MAP.get(ext.lstrip("."), None)
2562
+ if kind is None:
2563
+ raise RuntimeError(
2564
+ f"Don't know how to link file with extension {ext}"
2565
+ )
2566
+
2567
+ if ignore_nonlto:
2568
+ warn_and_return = False
2569
+ if kind in (
2570
+ FILE_EXTENSION_MAP["fatbin"],
2571
+ FILE_EXTENSION_MAP["o"],
2572
+ ):
2573
+ entry_types = inspect_obj_content(path_or_code)
2574
+ if "nvvm" not in entry_types:
2575
+ warn_and_return = True
2576
+ elif kind != FILE_EXTENSION_MAP["ltoir"]:
2577
+ warn_and_return = True
2578
+
2579
+ if warn_and_return:
2580
+ warnings.warn(
2581
+ f"Not adding {path_or_code} as it is not "
2582
+ "optimizable at link time, and `ignore_nonlto == "
2583
+ "True`."
2584
+ )
2585
+ return
2586
+
2587
+ self.add_file(path_or_code, kind)
2588
+ return
2589
+ else:
2590
+ # Otherwise, we should have been given a LinkableCode object
2591
+ if not isinstance(path_or_code, LinkableCode):
2592
+ raise TypeError(
2593
+ "Expected path to file or a LinkableCode object"
2594
+ )
2595
+
2596
+ if path_or_code.kind == "cu":
2597
+ self.add_cu(path_or_code.data, path_or_code.name)
2598
+ else:
2599
+ if ignore_nonlto:
2600
+ warn_and_return = False
2601
+ if isinstance(path_or_code, (Fatbin, Object)):
2602
+ with tempfile.NamedTemporaryFile("w") as fp:
2603
+ fp.write(path_or_code.data)
2604
+ entry_types = inspect_obj_content(fp.name)
2605
+ if "nvvm" not in entry_types:
2606
+ warn_and_return = True
2607
+ elif not isinstance(path_or_code, LTOIR):
2608
+ warn_and_return = True
2609
+
2610
+ if warn_and_return:
2611
+ warnings.warn(
2612
+ f"Not adding {path_or_code.name} as it is not "
2613
+ "optimizable at link time, and `ignore_nonlto == "
2614
+ "True`."
2615
+ )
2616
+ return
2617
+
2618
+ self.add_data(
2619
+ path_or_code.data, path_or_code.kind, path_or_code.name
2620
+ )
2621
+
2622
+ @abstractmethod
2623
+ def complete(self):
2624
+ """Complete the link. Returns (cubin, size)
2625
+
2626
+ cubin is a pointer to a internal buffer of cubin owned by the linker;
2627
+ thus, it should be loaded before the linker is destroyed.
2628
+ """
2629
+
2630
+
2631
class _Linker(_LinkerBase):
    """Linker that collects ``ObjectCode`` inputs and links them with
    ``Linker`` when ``complete`` or ``get_linked_ptx`` is called."""

    def __init__(
        self,
        max_registers=None,
        lineinfo=False,
        cc=None,
        lto=None,
        additional_flags=None,
    ):
        # `cc` is indexed right away, so it must be a (major, minor) pair.
        arch = f"sm_{cc[0]}{cc[1]}"
        # Any falsy register limit (0 included) is normalised to None.
        self.max_registers = max_registers or None
        self.lineinfo = lineinfo
        self.cc = cc
        self.arch = arch
        # WAR for apparent nvjitlink issue: pass None rather than False
        lto = None if lto is False else lto
        self.lto = lto
        # NOTE(review): stored, but not used anywhere else in this class.
        self.additional_flags = additional_flags

        self.options = LinkerOptions(
            max_register_count=self.max_registers,
            lineinfo=lineinfo,
            arch=arch,
            link_time_optimization=lto,
        )
        self._complete = False
        self._object_codes = []
        self.linker = None  # need at least one program

    @property
    def info_log(self):
        """The info log captured when the link finished."""
        if not self.linker:
            raise ValueError("Not Initialized")
        if not self._complete:
            raise RuntimeError("Link not yet complete.")
        return self._info_log

    @property
    def error_log(self):
        """The error log captured when the link finished."""
        if not self.linker:
            raise ValueError("Not Initialized")
        if not self._complete:
            raise RuntimeError("Link not yet complete.")
        return self._error_log

    def add_ptx(self, ptx, name="<cudapy-ptx>"):
        """Queue PTX for linking."""
        self._object_codes.append(ObjectCode.from_ptx(ptx, name=name))

    def add_cu(self, cu, name="<cudapy-cu>"):
        """Compile CUDA source with NVRTC and queue the result for linking."""
        compiled, log = nvrtc.compile(cu, name, self.cc, ltoir=self.lto)

        # The compiled code is only dumped when LTO is off.
        if not self.lto and config.DUMP_ASSEMBLY:
            print(("ASSEMBLY %s" % name).center(80, "-"))
            print(compiled.code)

        self._object_codes.append(compiled)

    def add_cubin(self, cubin, name="<cudapy-cubin>"):
        """Queue a cubin for linking."""
        self._object_codes.append(ObjectCode.from_cubin(cubin, name=name))

    def add_ltoir(self, ltoir, name="<cudapy-ltoir>"):
        """Queue LTO-IR for linking."""
        self._object_codes.append(ObjectCode.from_ltoir(ltoir, name=name))

    def add_fatbin(self, fatbin, name="<cudapy-fatbin>"):
        """Queue a fatbin for linking."""
        self._object_codes.append(ObjectCode.from_fatbin(fatbin, name=name))

    def add_object(self, obj, name="<cudapy-object>"):
        """Queue an object file's contents for linking."""
        self._object_codes.append(ObjectCode.from_object(obj, name=name))

    def add_library(self, lib, name="<cudapy-lib>"):
        """Queue a library for linking."""
        self._object_codes.append(ObjectCode.from_library(lib, name=name))

    def add_file(self, path, kind):
        """Add code from a file to the link.

        :raises LinkerError: If the file does not exist.
        """
        try:
            contents = cached_file_read(path, how="rb")
        except FileNotFoundError:
            raise LinkerError(f"{path} not found")
        self.add_data(contents, kind, pathlib.Path(path).name)

    def add_data(self, data, kind, name):
        """Add in-memory data to the link, choosing the ``add_*`` method from
        `kind`.

        :raises LinkerError: If `kind` is not recognised.
        """
        kinds = FILE_EXTENSION_MAP
        if kind == kinds["ptx"]:
            handler = self.add_ptx
        elif kind == kinds["cubin"]:
            handler = self.add_cubin
        elif kind == "cu":
            handler = self.add_cu
        elif kind == kinds["lib"] or kind == kinds["a"]:
            handler = self.add_library
        elif kind == kinds["fatbin"]:
            handler = self.add_fatbin
        elif kind == kinds["o"]:
            handler = self.add_object
        elif kind == kinds["ltoir"]:
            handler = self.add_ltoir
        else:
            raise LinkerError(f"Don't know how to link {kind}")

        handler(data, name)

    def get_linked_ptx(self):
        """Link the queued inputs to PTX with link-time optimization enabled
        and return the resulting code."""
        ptx_options = LinkerOptions(
            max_register_count=self.max_registers,
            lineinfo=self.lineinfo,
            arch=self.arch,
            link_time_optimization=True,
            ptx=True,
        )

        self.linker = Linker(*self._object_codes, options=ptx_options)

        linked = self.linker.link("ptx")
        self.close()
        self._complete = True
        return linked.code

    def close(self):
        """Capture the info and error logs, then close the linker."""
        active = self.linker
        self._info_log = active.get_info_log()
        self._error_log = active.get_error_log()
        active.close()

    def complete(self):
        """Link the queued inputs to a cubin and return the link result."""
        self.linker = Linker(*self._object_codes, options=self.options)
        linked = self.linker.link("cubin")
        self.close()
        self._complete = True
        return linked
2767
+
2768
+
2769
class CtypesLinker(_LinkerBase):
    """
    Links for current device if no CC given
    """

    def __init__(self, max_registers=0, lineinfo=False, cc=None):
        super().__init__(max_registers, lineinfo, cc)

        log_size = config.CUDA_LOG_SIZE
        info_buf = (c_char * log_size)()
        error_buf = (c_char * log_size)()

        # Insertion order fixes the order of the option arrays built below.
        jit_options = {
            enums.CU_JIT_INFO_LOG_BUFFER: addressof(info_buf),
            enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(log_size),
            enums.CU_JIT_ERROR_LOG_BUFFER: addressof(error_buf),
            enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(log_size),
            enums.CU_JIT_LOG_VERBOSE: c_void_p(1),
        }
        if max_registers:
            jit_options[enums.CU_JIT_MAX_REGISTERS] = c_void_p(max_registers)
        if lineinfo:
            jit_options[enums.CU_JIT_GENERATE_LINE_INFO] = c_void_p(1)

        self.cc = cc
        if cc is not None:
            # (major, minor) is encoded as major * 10 + minor.
            jit_options[enums.CU_JIT_TARGET] = c_void_p(cc[0] * 10 + cc[1])
        else:
            # No option value is needed, but we need something as a placeholder
            jit_options[enums.CU_JIT_TARGET_FROM_CUCONTEXT] = 1

        raw_keys = list(jit_options)
        raw_values = list(jit_options.values())

        option_keys = (drvapi.cu_jit_option * len(raw_keys))(*raw_keys)
        option_vals = (c_void_p * len(raw_values))(*raw_values)

        handle = drvapi.cu_link_state()
        self.handle = handle
        driver.cuLinkCreate(
            len(raw_keys), option_keys, option_vals, byref(handle)
        )

        # Destroy the link state when this object is finalized.
        weakref.finalize(self, driver.cuLinkDestroy, handle)

        self.linker_info_buf = info_buf
        self.linker_errors_buf = error_buf

        # Hold references to the ctypes objects handed to the driver.
        self._keep_alive = [info_buf, error_buf, option_keys, option_vals]

    @property
    def info_log(self):
        """Return the info log from the linker invocation"""
        raw = self.linker_info_buf.value
        return raw.decode("utf8")

    @property
    def error_log(self):
        """Return the error log from the linker invocation"""
        raw = self.linker_errors_buf.value
        return raw.decode("utf8")

    def add_cubin(self, cubin, name="<unnamed-cubin>"):
        """Add a cubin to the link."""
        return self._add_data(enums.CU_JIT_INPUT_CUBIN, cubin, name)

    def add_ptx(self, ptx, name="<unnamed-ptx>"):
        """Add PTX to the link."""
        return self._add_data(enums.CU_JIT_INPUT_PTX, ptx, name)

    def add_object(self, object_, name="<unnamed-object>"):
        """Add an object to the link."""
        return self._add_data(enums.CU_JIT_INPUT_OBJECT, object_, name)

    def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
        """Add a fatbin to the link."""
        return self._add_data(enums.CU_JIT_INPUT_FATBINARY, fatbin, name)

    def add_library(self, library, name="<unnamed-library>"):
        """Add a library to the link."""
        return self._add_data(enums.CU_JIT_INPUT_LIBRARY, library, name)

    def _add_data(self, input_type, data, name):
        """Pass `data` to ``cuLinkAddData`` as input of the given type.

        :raises LinkerError: If the driver call fails; the message carries
                             the driver error followed by the error log.
        """
        data_buffer = c_char_p(data)
        name_buffer = c_char_p(name.encode("utf8"))
        # Retain the buffers until `complete` clears the keep-alive list.
        self._keep_alive.extend((data_buffer, name_buffer))
        try:
            driver.cuLinkAddData(
                self.handle,
                input_type,
                data_buffer,
                len(data),
                name_buffer,
                0,
                None,
                None,
            )
        except CudaAPIError as e:
            raise LinkerError("%s\n%s" % (e, self.error_log))

    def add_data(self, data, kind, name=None):
        """Add in-memory data to the link, choosing the ``add_*`` method from
        `kind`.

        :raises LinkerError: For LTO-IR input or an unrecognised `kind`.
        """
        # The name is only forwarded when one was supplied, so that each
        # add_* method's own default name applies otherwise.
        kws = {} if name is None else {"name": name}

        if kind == FILE_EXTENSION_MAP["cubin"]:
            self.add_cubin(data, **kws)
        elif kind == FILE_EXTENSION_MAP["fatbin"]:
            self.add_fatbin(data, **kws)
        elif kind == FILE_EXTENSION_MAP["a"]:
            self.add_library(data, **kws)
        elif kind == FILE_EXTENSION_MAP["ptx"]:
            self.add_ptx(data, **kws)
        elif kind == FILE_EXTENSION_MAP["o"]:
            self.add_object(data, **kws)
        elif kind == FILE_EXTENSION_MAP["ltoir"]:
            raise LinkerError("Ctypes linker cannot link LTO-IR")
        else:
            raise LinkerError(f"Don't know how to link {kind}")

    def add_file(self, path, kind):
        """Add code from a file to the link.

        :raises LinkerError: If the driver call fails; a file-not-found
                             error code gets a dedicated message.
        """
        path_buffer = c_char_p(path.encode("utf8"))
        self._keep_alive.append(path_buffer)

        try:
            driver.cuLinkAddFile(
                self.handle, kind, path_buffer, 0, None, None
            )
        except CudaAPIError as e:
            if e.code == enums.CUDA_ERROR_FILE_NOT_FOUND:
                msg = f"{path} not found"
            else:
                msg = "%s\n%s" % (e, self.error_log)
            raise LinkerError(msg)

    def complete(self):
        """Complete the link and return the cubin as a ``bytes`` copy.

        :raises LinkerError: If the driver call fails.
        """
        cubin_buf = c_void_p(0)
        cubin_size = c_size_t(0)

        try:
            driver.cuLinkComplete(
                self.handle, byref(cubin_buf), byref(cubin_size)
            )
        except CudaAPIError as e:
            raise LinkerError("%s\n%s" % (e, self.error_log))

        nbytes = cubin_size.value
        assert nbytes > 0, "linker returned a zero sized cubin"
        self._keep_alive.clear()

        # We return a copy of the cubin because it's owned by the linker
        char_ptr_type = ctypes.POINTER(ctypes.c_char)
        cubin_ptr = ctypes.cast(cubin_buf, char_ptr_type)
        return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(nbytes,)))
2911
+
2912
+
2913
+ # -----------------------------------------------------------------------------
2914
+
2915
+
2916
def get_devptr_for_active_ctx(ptr):
    """Query the device pointer usable in the current context from an arbitrary
    pointer.

    A `ptr` of 0 is not queried; a default-constructed ``CUdeviceptr`` is
    returned for it instead.
    """
    if ptr == 0:
        return binding.CUdeviceptr()

    attribute = binding.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER
    wrapped = binding.CUdeviceptr(ptr)
    return driver.cuPointerGetAttribute(attribute, wrapped)
2927
+
2928
+
2929
def device_extents(devmem):
    """Find the extents (half open begin and end pointer) of the underlying
    device memory allocation.

    NOTE: the result comes from ``cuMemGetAddressRange``, so it describes the
    allocation reported by the driver for this pointer, not the extents of a
    device memory view, which can be a subsection of the entire allocation.
    """
    devptr = device_ctypes_pointer(devmem)
    base, nbytes = driver.cuMemGetAddressRange(devptr.value)
    start = int(base)
    # The end address goes through CUdeviceptr before conversion to int.
    end = int(binding.CUdeviceptr(start + nbytes))
    return start, end
2939
+
2940
+
2941
def device_memory_size(devmem):
    """Check the memory size of the device memory.
    The result is cached in the device memory object.
    It may query the driver for the memory size of the device memory allocation.
    """
    cached = getattr(devmem, "_cuda_memsize_", None)
    if cached is not None:
        return cached

    # Not cached yet: derive the size from the allocation's extents and
    # remember it on the object.
    begin, end = device_extents(devmem)
    sz = end - begin
    devmem._cuda_memsize_ = sz
    assert sz >= 0, "{} length array".format(sz)
    return sz
2953
+
2954
+
2955
+ def _is_datetime_dtype(obj):
2956
+ """Returns True if the obj.dtype is datetime64 or timedelta64"""
2957
+ dtype = getattr(obj, "dtype", None)
2958
+ return dtype is not None and dtype.char in "Mm"
2959
+
2960
+
2961
def _workaround_for_datetime(obj):
    """Return *obj* in a form the buffer protocol can export.

    Workaround for numpy#4983: the buffer protocol doesn't support
    datetime64 or timedelta64, so such objects are reinterpreted as
    ``np.int64`` via ``view``. Everything else is returned unchanged.
    """
    return obj.view(np.int64) if _is_datetime_dtype(obj) else obj
2968
+
2969
+
2970
def host_pointer(obj, readonly=False):
    """Return the host data pointer of *obj*.

    An ``int`` is taken to already be a pointer and is returned unchanged.
    For any other object the pointer is obtained from
    ``mviewbuf.memoryview_get_buffer``.

    If `readonly` is False, the buffer must be writable.

    NOTE: The underlying data pointer from the host data buffer is used and
    it should not be changed until the operation which can be asynchronous
    completes.
    """
    if isinstance(obj, int):
        return obj

    # Only a writable request forces writability, and only for NumPy void
    # scalars and datetime64/timedelta64 data.
    forcewritable = (not readonly) and (
        isinstance(obj, np.void) or _is_datetime_dtype(obj)
    )

    exportable = _workaround_for_datetime(obj)
    return mviewbuf.memoryview_get_buffer(exportable, forcewritable, readonly)
2988
+
2989
+
2990
def host_memory_extents(obj):
    """Return the half-open ``(start, end)`` pointer pair of the array.

    datetime64/timedelta64 data is passed through
    ``_workaround_for_datetime`` first, then the extents are read with
    ``mviewbuf.memoryview_get_extents``.
    """
    return mviewbuf.memoryview_get_extents(_workaround_for_datetime(obj))
2994
+
2995
+
2996
@functools.cache
def memory_size_from_info(shape, strides, itemsize):
    """Compute the byte size of a contiguous memory buffer described by its
    shape, strides and itemsize.

    *shape* and *strides* must have the same length. Results are memoized by
    ``functools.cache``, so all three arguments must be hashable.
    """
    ndim = len(shape)
    assert ndim == len(strides), "# dim mismatch"
    begin, end = mviewbuf.memoryview_get_extents_info(
        shape, strides, ndim, itemsize
    )
    return end - begin
3005
+
3006
+
3007
def host_memory_size(obj):
    """Return the number of bytes spanned by the host memory of *obj*.

    The size is the distance between the two pointers reported by
    ``host_memory_extents``.
    """
    begin, end = host_memory_extents(obj)
    assert end >= begin, "memory extend of negative size"
    nbytes = end - begin
    return nbytes
3012
+
3013
+
3014
def device_pointer(obj):
    """Return the device pointer of *obj* as an integer.

    This is the ``value`` of the ctypes object produced by
    ``device_ctypes_pointer``.
    """
    ctypes_ptr = device_ctypes_pointer(obj)
    return ctypes_ptr.value
3017
+
3018
+
3019
def device_ctypes_pointer(obj):
    """Return the ctypes object holding the device pointer of *obj*.

    ``None`` maps to a null ``c_void_p``. Any other object must pass
    ``require_device_memory`` and its ``device_ctypes_pointer`` attribute is
    returned.
    """
    if obj is not None:
        require_device_memory(obj)
        return obj.device_ctypes_pointer
    return c_void_p(0)
3025
+
3026
+
3027
def is_device_memory(obj):
    """Report whether *obj* is flagged as a CUDA memory object.

    A CUDA memory object is recognized by having the attribute
    "__cuda_memory__" defined with a value that evaluates to True. The
    attribute's value is returned as-is; objects without the attribute give
    ``False``.

    All CUDA memory objects should also define an attribute named
    "device_pointer" whose value is an int carrying the device memory
    address. This is not tested in this method.
    """
    # Direct attribute access is cheaper than getattr in the
    # non-exceptional case.
    try:
        flag = obj.__cuda_memory__
    except AttributeError:
        return False
    return flag
3040
+
3041
+
3042
def require_device_memory(obj):
    """Guard for functions that only accept CUDA memory objects.

    Raises ``Exception`` when ``is_device_memory(obj)`` is falsy; otherwise
    returns ``None``.
    """
    if is_device_memory(obj):
        return
    raise Exception("Not a CUDA memory object.")
3046
+
3047
+
3048
def device_memory_depends(devmem, *objs):
    """Add dependencies to the device memory.

    Mainly used for creating structures that point to other device memory,
    so that the referenced objects are not garbage collected and released
    while *devmem* is alive.

    The references are appended to the list stored in ``devmem._depends_``.
    If *devmem* has no such list yet, one is created and attached to it.
    Previously the references were appended to a temporary list that was
    discarded on return, so nothing was actually kept alive.
    """
    depends = getattr(devmem, "_depends_", None)
    if depends is None:
        depends = []
        # Attach the list to devmem so that it, not this call frame, owns
        # the references.
        devmem._depends_ = depends
    depends.extend(objs)
3056
+
3057
+
3058
def host_to_device(dst, src, size, stream=0):
    """Copy ``size`` bytes from host buffer *src* to device memory *dst*.

    With a falsy *stream* the copy goes through ``cuMemcpyHtoD``; otherwise
    ``cuMemcpyHtoDAsync`` is called with the handle obtained from
    ``_stream_handle(stream)``.

    NOTE: The underlying data pointer from the host data buffer is used and
    it should not be changed until the operation which can be asynchronous
    completes.
    """
    blocking_copy = driver.cuMemcpyHtoD
    dst_ptr = device_pointer(dst)
    src_ptr = host_pointer(src, readonly=True)

    if not stream:
        blocking_copy(dst_ptr, src_ptr, size)
        return

    driver.cuMemcpyHtoDAsync(dst_ptr, src_ptr, size, _stream_handle(stream))
3072
+
3073
+
3074
def device_to_host(dst, src, size, stream=0):
    """Copy ``size`` bytes from device memory *src* to host buffer *dst*.

    With a falsy *stream* the copy goes through ``cuMemcpyDtoH``; otherwise
    ``cuMemcpyDtoHAsync`` is called with the handle obtained from
    ``_stream_handle(stream)``.

    NOTE: The underlying data pointer from the host data buffer is used and
    it should not be changed until the operation which can be asynchronous
    completes.
    """
    blocking_copy = driver.cuMemcpyDtoH
    dst_ptr = host_pointer(dst)
    src_ptr = device_pointer(src)

    if not stream:
        blocking_copy(dst_ptr, src_ptr, size)
        return

    driver.cuMemcpyDtoHAsync(dst_ptr, src_ptr, size, _stream_handle(stream))
3088
+
3089
+
3090
def device_to_device(dst, src, size, stream=0):
    """Copy ``size`` bytes from device memory *src* to device memory *dst*.

    With a falsy *stream* the copy goes through ``cuMemcpyDtoD``; otherwise
    ``cuMemcpyDtoDAsync`` is called with the handle obtained from
    ``_stream_handle(stream)``.

    NOTE: The underlying data pointer from the device buffer is used and
    it should not be changed until the operation which can be asynchronous
    completes.
    """
    blocking_copy = driver.cuMemcpyDtoD
    dst_ptr = device_pointer(dst)
    src_ptr = device_pointer(src)

    if not stream:
        blocking_copy(dst_ptr, src_ptr, size)
        return

    driver.cuMemcpyDtoDAsync(dst_ptr, src_ptr, size, _stream_handle(stream))
3104
+
3105
+
3106
def device_memset(dst, val, size, stream=0):
    """
    Memset on the device.
    If stream is 0, the call is synchronous (``cuMemsetD8``).
    If stream is a Stream object, asynchronous mode is used
    (``cuMemsetD8Async`` with the handle from ``_stream_handle``).

    dst: device memory
    val: byte value to be written
    size: number of bytes to be written
    stream: 0 (synchronous) or a CUDA stream

    If the driver call fails with ``CUDA_ERROR_INVALID_VALUE`` and *dst* is
    flagged as managed CUDA memory (truthy ``__cuda_memory__`` and
    ``is_managed`` attributes), the fill is performed on the host through
    the pointer returned by ``host_pointer(dst)``. Any other
    ``CudaAPIError`` is re-raised unchanged.
    """
    fn = driver.cuMemsetD8
    args = (device_pointer(dst), val, size)

    if stream:
        fn = driver.cuMemsetD8Async
        args += (_stream_handle(stream),)

    try:
        fn(*args)
    except CudaAPIError as e:
        invalid = binding.CUresult.CUDA_ERROR_INVALID_VALUE
        host_fallback = (
            e.code == invalid
            and getattr(dst, "__cuda_memory__", False)
            and getattr(dst, "is_managed", False)
        )
        if not host_fallback:
            raise
        # Fill from the host with a single C-level memset instead of
        # materializing a ``size``-element Python list. The value is masked
        # to one byte, matching what the 8-bit device memset writes.
        ctypes.memset(host_pointer(dst), val & 0xFF, size)
3138
+
3139
+
3140
def profile_start():
    """Turn on profile collection in the current context.

    Thin wrapper around the driver's ``cuProfilerStart``.
    """
    driver.cuProfilerStart()
3145
+
3146
+
3147
def profile_stop():
    """Turn off profile collection in the current context.

    Thin wrapper around the driver's ``cuProfilerStop``.
    """
    driver.cuProfilerStop()
3152
+
3153
+
3154
@contextlib.contextmanager
def profiling():
    """
    Context manager that enables profiling on entry and disables profiling on
    exit.

    Profiling is stopped even when the body of the ``with`` block raises;
    the exception then propagates to the caller after ``profile_stop()`` has
    run.
    """
    profile_start()
    try:
        yield
    finally:
        # Without the finally clause an exception thrown into the generator
        # at the yield would skip profile_stop() and leave profiling on.
        profile_stop()
3163
+
3164
+
3165
def get_version():
    """Return the driver version as a tuple of (major, minor).

    The value is whatever ``driver.get_version()`` reports.
    """
    version = driver.get_version()
    return version
3170
+
3171
+
3172
def inspect_obj_content(objpath: str):
    """
    Examine a fatbin or object file with `cuobjdump` and return the set of
    entry kinds it contains.

    Each line of the tool's output that starts with ``Fatbin <kind> code``
    contributes ``<kind>`` to the result.

    Raises ``RuntimeError`` when the `cuobjdump` executable cannot be found.
    A non-zero exit status propagates as ``subprocess.CalledProcessError``
    because the tool is run with ``check=True``.
    """
    try:
        completed = subprocess.run(
            ["cuobjdump", objpath], check=True, capture_output=True
        )
    except FileNotFoundError as e:
        raise RuntimeError(
            "cuobjdump has not been found. You may need "
            "to install the CUDA toolkit and ensure that "
            "it is available on your PATH.\n"
        ) from e

    fatbin_entry = re.compile(r"Fatbin (.*) code")
    hits = (
        fatbin_entry.match(row)
        for row in completed.stdout.decode("utf-8").split("\n")
    )
    entries: set[str] = {hit.group(1) for hit in hits if hit}
    return entries
3198
+
3199
+
3200
+ def _stream_handle(stream):
3201
+ """
3202
+ Obtain the appropriate handle for various types of
3203
+ acceptable stream objects. Acceptable types are
3204
+ int (0 for default stream), Stream, ExperimentalStream
3205
+ """
3206
+
3207
+ if stream == 0:
3208
+ return stream
3209
+ allowed = (Stream, ExperimentalStream)
3210
+ if not isinstance(stream, allowed):
3211
+ raise TypeError(
3212
+ "Expected a Stream object or 0, got %s" % type(stream).__name__
3213
+ )
3214
+ elif hasattr(stream, "__cuda_stream__"):
3215
+ ver, ptr = stream.__cuda_stream__()
3216
+ assert ver == 0
3217
+ if isinstance(ptr, binding.CUstream):
3218
+ return get_cuda_native_handle(ptr)
3219
+ else:
3220
+ return ptr
3221
+ else:
3222
+ raise TypeError("Invalid Stream")