numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,3201 @@
1
+ """
2
+ CUDA driver bridge implementation
3
+
4
+ NOTE:
5
+ The new driver implementation uses a *_PendingDeallocs* queue that helps prevent
6
+ crashing the system (particularly on OSX) when the CUDA context is corrupted at
7
+ resource deallocation. The old approach ties resource management directly
8
+ into the object destructor; thus, when the CUDA context is corrupted,
9
+ subsequent deallocations could further corrupt the CUDA context and cause the
10
+ system to freeze in some cases.
11
+
12
+ """
13
+
14
+ import sys
15
+ import os
16
+ import ctypes
17
+ import weakref
18
+ import functools
19
+ import warnings
20
+ import logging
21
+ import threading
22
+ import asyncio
23
+ import pathlib
24
+ from itertools import product
25
+ from abc import ABCMeta, abstractmethod
26
+ from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof,
27
+ c_void_p, c_float, c_uint)
28
+ import contextlib
29
+ import importlib
30
+ import numpy as np
31
+ from collections import namedtuple, deque
32
+
33
+ from numba import mviewbuf
34
+ from numba.core import utils, serialize, config
35
+ from .error import CudaSupportError, CudaDriverError
36
+ from .drvapi import API_PROTOTYPES
37
+ from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
38
+ from numba.cuda.cudadrv import enums, drvapi, nvrtc
39
+
40
+ USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING
41
+
42
+ if USE_NV_BINDING:
43
+ from cuda import cuda as binding
44
+ # There is no definition of the default stream in the Nvidia bindings (nor
45
+ # is there at the C/C++ level), so we define it here so we don't need to
46
+ # use a magic number 0 in places where we want the default stream.
47
+ CU_STREAM_DEFAULT = 0
48
+
49
+ MIN_REQUIRED_CC = (3, 5)
50
+ SUPPORTS_IPC = sys.platform.startswith('linux')
51
+
52
+
53
+ _py_decref = ctypes.pythonapi.Py_DecRef
54
+ _py_incref = ctypes.pythonapi.Py_IncRef
55
+ _py_decref.argtypes = [ctypes.py_object]
56
+ _py_incref.argtypes = [ctypes.py_object]
57
+
58
+
59
+ def make_logger():
60
+ logger = logging.getLogger(__name__)
61
+ # is logging configured?
62
+ if not logger.hasHandlers():
63
+ # read user config
64
+ lvl = str(config.CUDA_LOG_LEVEL).upper()
65
+ lvl = getattr(logging, lvl, None)
66
+ if not isinstance(lvl, int):
67
+ # default to critical level
68
+ lvl = logging.CRITICAL
69
+ logger.setLevel(lvl)
70
+ # did user specify a level?
71
+ if config.CUDA_LOG_LEVEL:
72
+ # create a simple handler that prints to stderr
73
+ handler = logging.StreamHandler(sys.stderr)
74
+ fmt = '== CUDA [%(relativeCreated)d] %(levelname)5s -- %(message)s'
75
+ handler.setFormatter(logging.Formatter(fmt=fmt))
76
+ logger.addHandler(handler)
77
+ else:
78
+ # otherwise, put a null handler
79
+ logger.addHandler(logging.NullHandler())
80
+ return logger
81
+
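make_logger() wires this module's logger to Numba's CUDA logging settings: if a log level is configured, driver activity is echoed to stderr, otherwise a NullHandler keeps it silent. A minimal usage sketch, assuming the NUMBA_-prefixed environment variables that back config.CUDA_LOG_LEVEL and config.CUDA_LOG_API_ARGS, and a working GPU:

    import os

    # Must be set before numba reads its configuration (i.e. before importing numba).
    os.environ["NUMBA_CUDA_LOG_LEVEL"] = "DEBUG"
    os.environ["NUMBA_CUDA_LOG_API_ARGS"] = "1"   # also log driver call arguments

    import numpy as np
    from numba import cuda

    cuda.to_device(np.arange(4))   # cuInit, cuMemAlloc, ... echoed to stderr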
82
+
83
+ class DeadMemoryError(RuntimeError):
84
+ pass
85
+
86
+
87
+ class LinkerError(RuntimeError):
88
+ pass
89
+
90
+
91
+ class CudaAPIError(CudaDriverError):
92
+ def __init__(self, code, msg):
93
+ self.code = code
94
+ self.msg = msg
95
+ super(CudaAPIError, self).__init__(code, msg)
96
+
97
+ def __str__(self):
98
+ return "[%s] %s" % (self.code, self.msg)
99
+
100
+
101
+ def locate_driver_and_loader():
102
+
103
+ envpath = config.CUDA_DRIVER
104
+
105
+ if envpath == '0':
106
+ # Force fail
107
+ _raise_driver_not_found()
108
+
109
+ # Determine DLL type
110
+ if sys.platform == 'win32':
111
+ dlloader = ctypes.WinDLL
112
+ dldir = ['\\windows\\system32']
113
+ dlnames = ['nvcuda.dll']
114
+ elif sys.platform == 'darwin':
115
+ dlloader = ctypes.CDLL
116
+ dldir = ['/usr/local/cuda/lib']
117
+ dlnames = ['libcuda.dylib']
118
+ else:
119
+ # Assume to be *nix like
120
+ dlloader = ctypes.CDLL
121
+ dldir = ['/usr/lib', '/usr/lib64']
122
+ dlnames = ['libcuda.so', 'libcuda.so.1']
123
+
124
+ if envpath:
125
+ try:
126
+ envpath = os.path.abspath(envpath)
127
+ except ValueError:
128
+ raise ValueError("NUMBA_CUDA_DRIVER %s is not a valid path" %
129
+ envpath)
130
+ if not os.path.isfile(envpath):
131
+ raise ValueError("NUMBA_CUDA_DRIVER %s is not a valid file "
132
+ "path. Note it must be a filepath of the .so/"
133
+ ".dll/.dylib or the driver" % envpath)
134
+ candidates = [envpath]
135
+ else:
136
+ # First search for the name in the default library path.
137
+ # If that is not found, try the specific path.
138
+ candidates = dlnames + [os.path.join(x, y)
139
+ for x, y in product(dldir, dlnames)]
140
+
141
+ return dlloader, candidates
142
+
143
+
144
+ def load_driver(dlloader, candidates):
145
+
146
+ # Load the driver; Collect driver error information
147
+ path_not_exist = []
148
+ driver_load_error = []
149
+
150
+ for path in candidates:
151
+ try:
152
+ dll = dlloader(path)
153
+ except OSError as e:
154
+ # Problem opening the DLL
155
+ path_not_exist.append(not os.path.isfile(path))
156
+ driver_load_error.append(e)
157
+ else:
158
+ return dll, path
159
+
160
+ # Problem loading driver
161
+ if all(path_not_exist):
162
+ _raise_driver_not_found()
163
+ else:
164
+ errmsg = '\n'.join(str(e) for e in driver_load_error)
165
+ _raise_driver_error(errmsg)
166
+
167
+
168
+ def find_driver():
169
+ dlloader, candidates = locate_driver_and_loader()
170
+ dll, path = load_driver(dlloader, candidates)
171
+ return dll
172
+
173
+
174
+ DRIVER_NOT_FOUND_MSG = """
175
+ CUDA driver library cannot be found.
176
+ If you are sure that a CUDA driver is installed,
177
+ try setting environment variable NUMBA_CUDA_DRIVER
178
+ with the file path of the CUDA driver shared library.
179
+ """
180
+
181
+ DRIVER_LOAD_ERROR_MSG = """
182
+ Possible CUDA driver libraries were found, but an error occurred during load:
183
+ %s
184
+ """
185
+
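DRIVER_NOT_FOUND_MSG above points at the NUMBA_CUDA_DRIVER environment variable, which locate_driver_and_loader() reads through config.CUDA_DRIVER before falling back to the per-platform search paths. A short sketch (the library path is illustrative and depends on the system):

    import os

    # Illustrative path; point it at the real libcuda/nvcuda on your machine.
    os.environ["NUMBA_CUDA_DRIVER"] = "/usr/lib/x86_64-linux-gnu/libcuda.so.1"

    from numba import cuda
    print(cuda.is_available())   # True once the driver at that path loads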
186
+
187
+ def _raise_driver_not_found():
188
+ raise CudaSupportError(DRIVER_NOT_FOUND_MSG)
189
+
190
+
191
+ def _raise_driver_error(e):
192
+ raise CudaSupportError(DRIVER_LOAD_ERROR_MSG % e)
193
+
194
+
195
+ def _build_reverse_error_map():
196
+ prefix = 'CUDA_ERROR'
197
+ map = utils.UniqueDict()
198
+ for name in dir(enums):
199
+ if name.startswith(prefix):
200
+ code = getattr(enums, name)
201
+ map[code] = name
202
+ return map
203
+
204
+
205
+ def _getpid():
206
+ return os.getpid()
207
+
208
+
209
+ ERROR_MAP = _build_reverse_error_map()
210
+
211
+
212
+ class Driver(object):
213
+ """
214
+ Driver API functions are lazily bound.
215
+ """
216
+ _singleton = None
217
+
218
+ def __new__(cls):
219
+ obj = cls._singleton
220
+ if obj is not None:
221
+ return obj
222
+ else:
223
+ obj = object.__new__(cls)
224
+ cls._singleton = obj
225
+ return obj
226
+
227
+ def __init__(self):
228
+ self.devices = utils.UniqueDict()
229
+ self.is_initialized = False
230
+ self.initialization_error = None
231
+ self.pid = None
232
+ try:
233
+ if config.DISABLE_CUDA:
234
+ msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 "
235
+ "in the environment, or because CUDA is unsupported on "
236
+ "32-bit systems.")
237
+ raise CudaSupportError(msg)
238
+ self.lib = find_driver()
239
+ except CudaSupportError as e:
240
+ self.is_initialized = True
241
+ self.initialization_error = e.msg
242
+
243
+ def ensure_initialized(self):
244
+ if self.is_initialized:
245
+ return
246
+
247
+ # lazily initialize logger
248
+ global _logger
249
+ _logger = make_logger()
250
+
251
+ self.is_initialized = True
252
+ try:
253
+ _logger.info('init')
254
+ self.cuInit(0)
255
+ except CudaAPIError as e:
256
+ description = f"{e.msg} ({e.code})"
257
+ self.initialization_error = description
258
+ raise CudaSupportError(f"Error at driver init: {description}")
259
+ else:
260
+ self.pid = _getpid()
261
+
262
+ @property
263
+ def is_available(self):
264
+ self.ensure_initialized()
265
+ return self.initialization_error is None
266
+
267
+ def __getattr__(self, fname):
268
+ # First request of a driver API function
269
+ self.ensure_initialized()
270
+
271
+ if self.initialization_error is not None:
272
+ raise CudaSupportError("Error at driver init: \n%s:" %
273
+ self.initialization_error)
274
+
275
+ if USE_NV_BINDING:
276
+ return self._cuda_python_wrap_fn(fname)
277
+ else:
278
+ return self._ctypes_wrap_fn(fname)
279
+
280
+ def _ctypes_wrap_fn(self, fname, libfn=None):
281
+ # Wrap a CUDA driver function by default
282
+ if libfn is None:
283
+ try:
284
+ proto = API_PROTOTYPES[fname]
285
+ except KeyError:
286
+ raise AttributeError(fname)
287
+ restype = proto[0]
288
+ argtypes = proto[1:]
289
+
290
+ # Find function in driver library
291
+ libfn = self._find_api(fname)
292
+ libfn.restype = restype
293
+ libfn.argtypes = argtypes
294
+
295
+ def verbose_cuda_api_call(*args):
296
+ argstr = ", ".join([str(arg) for arg in args])
297
+ _logger.debug('call driver api: %s(%s)', libfn.__name__, argstr)
298
+ retcode = libfn(*args)
299
+ self._check_ctypes_error(fname, retcode)
300
+
301
+ def safe_cuda_api_call(*args):
302
+ _logger.debug('call driver api: %s', libfn.__name__)
303
+ retcode = libfn(*args)
304
+ self._check_ctypes_error(fname, retcode)
305
+
306
+ if config.CUDA_LOG_API_ARGS:
307
+ wrapper = verbose_cuda_api_call
308
+ else:
309
+ wrapper = safe_cuda_api_call
310
+
311
+ safe_call = functools.wraps(libfn)(wrapper)
312
+ setattr(self, fname, safe_call)
313
+ return safe_call
314
+
315
+ def _cuda_python_wrap_fn(self, fname):
316
+ libfn = getattr(binding, fname)
317
+
318
+ def verbose_cuda_api_call(*args):
319
+ argstr = ", ".join([str(arg) for arg in args])
320
+ _logger.debug('call driver api: %s(%s)', libfn.__name__, argstr)
321
+ return self._check_cuda_python_error(fname, libfn(*args))
322
+
323
+ def safe_cuda_api_call(*args):
324
+ _logger.debug('call driver api: %s', libfn.__name__)
325
+ return self._check_cuda_python_error(fname, libfn(*args))
326
+
327
+ if config.CUDA_LOG_API_ARGS:
328
+ wrapper = verbose_cuda_api_call
329
+ else:
330
+ wrapper = safe_cuda_api_call
331
+
332
+ safe_call = functools.wraps(libfn)(wrapper)
333
+ setattr(self, fname, safe_call)
334
+ return safe_call
335
+
336
+ def _find_api(self, fname):
337
+ # We use alternatively-named functions for PTDS with the Numba ctypes
338
+ # binding. The NVIDIA binding handles linking to the correct
339
+ # variant.
340
+ if config.CUDA_PER_THREAD_DEFAULT_STREAM and not USE_NV_BINDING:
341
+ variants = ('_v2_ptds', '_v2_ptsz', '_ptds', '_ptsz', '_v2', '')
342
+ else:
343
+ variants = ('_v2', '')
344
+
345
+ for variant in variants:
346
+ try:
347
+ return getattr(self.lib, f'{fname}{variant}')
348
+ except AttributeError:
349
+ pass
350
+
351
+ # Not found.
352
+ # Delay the missing-function error until the function is actually used
353
+ def absent_function(*args, **kws):
354
+ raise CudaDriverError(f'Driver missing function: {fname}')
355
+
356
+ setattr(self, fname, absent_function)
357
+ return absent_function
358
+
359
+ def _detect_fork(self):
360
+ if self.pid is not None and _getpid() != self.pid:
361
+ msg = 'pid %s forked from pid %s after CUDA driver init'
362
+ _logger.critical(msg, _getpid(), self.pid)
363
+ raise CudaDriverError("CUDA initialized before forking")
364
+
365
+ def _check_ctypes_error(self, fname, retcode):
366
+ if retcode != enums.CUDA_SUCCESS:
367
+ errname = ERROR_MAP.get(retcode, "UNKNOWN_CUDA_ERROR")
368
+ msg = "Call to %s results in %s" % (fname, errname)
369
+ _logger.error(msg)
370
+ if retcode == enums.CUDA_ERROR_NOT_INITIALIZED:
371
+ self._detect_fork()
372
+ raise CudaAPIError(retcode, msg)
373
+
374
+ def _check_cuda_python_error(self, fname, returned):
375
+ retcode = returned[0]
376
+ retval = returned[1:]
377
+ if len(retval) == 1:
378
+ retval = retval[0]
379
+
380
+ if retcode != binding.CUresult.CUDA_SUCCESS:
381
+ msg = "Call to %s results in %s" % (fname, retcode.name)
382
+ _logger.error(msg)
383
+ if retcode == binding.CUresult.CUDA_ERROR_NOT_INITIALIZED:
384
+ self._detect_fork()
385
+ raise CudaAPIError(retcode, msg)
386
+
387
+ return retval
388
+
389
+ def get_device(self, devnum=0):
390
+ dev = self.devices.get(devnum)
391
+ if dev is None:
392
+ dev = Device(devnum)
393
+ self.devices[devnum] = dev
394
+ return weakref.proxy(dev)
395
+
396
+ def get_device_count(self):
397
+ if USE_NV_BINDING:
398
+ return self.cuDeviceGetCount()
399
+
400
+ count = c_int()
401
+ self.cuDeviceGetCount(byref(count))
402
+ return count.value
403
+
404
+ def list_devices(self):
405
+ """Returns a list of active devices
406
+ """
407
+ return list(self.devices.values())
408
+
409
+ def reset(self):
410
+ """Reset all devices
411
+ """
412
+ for dev in self.devices.values():
413
+ dev.reset()
414
+
415
+ def pop_active_context(self):
416
+ """Pop the active CUDA context and return the handle.
417
+ If no CUDA context is active, return None.
418
+ """
419
+ with self.get_active_context() as ac:
420
+ if ac.devnum is not None:
421
+ if USE_NV_BINDING:
422
+ return driver.cuCtxPopCurrent()
423
+ else:
424
+ popped = drvapi.cu_context()
425
+ driver.cuCtxPopCurrent(byref(popped))
426
+ return popped
427
+
428
+ def get_active_context(self):
429
+ """Returns an instance of ``_ActiveContext``.
430
+ """
431
+ return _ActiveContext()
432
+
433
+ def get_version(self):
434
+ """
435
+ Returns the CUDA Runtime version as a tuple (major, minor).
436
+ """
437
+ if USE_NV_BINDING:
438
+ version = driver.cuDriverGetVersion()
439
+ else:
440
+ dv = ctypes.c_int(0)
441
+ driver.cuDriverGetVersion(ctypes.byref(dv))
442
+ version = dv.value
443
+
444
+ # The version is encoded as (1000 * major) + (10 * minor)
445
+ major = version // 1000
446
+ minor = (version - (major * 1000)) // 10
447
+ return (major, minor)
448
+
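get_version() decodes the integer reported by cuDriverGetVersion using the (1000 * major) + (10 * minor) scheme noted in the comment above; for example, 12040 decodes to (12, 4). The same arithmetic as a standalone sketch:

    def decode_driver_version(version: int) -> tuple:
        # CUDA encodes versions as (1000 * major) + (10 * minor).
        major = version // 1000
        minor = (version - major * 1000) // 10
        return (major, minor)

    assert decode_driver_version(12040) == (12, 4)
    assert decode_driver_version(11080) == (11, 8)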
449
+
450
+ class _ActiveContext(object):
451
+ """An contextmanager object to cache active context to reduce dependency
452
+ on querying the CUDA driver API.
453
+
454
+ Once entering the context, it is assumed that the active CUDA context is
455
+ not changed until the context is exited.
456
+ """
457
+ _tls_cache = threading.local()
458
+
459
+ def __enter__(self):
460
+ is_top = False
461
+ # check TLS cache
462
+ if hasattr(self._tls_cache, 'ctx_devnum'):
463
+ hctx, devnum = self._tls_cache.ctx_devnum
464
+ # Not cached. Query the driver API.
465
+ else:
466
+ if USE_NV_BINDING:
467
+ hctx = driver.cuCtxGetCurrent()
468
+ if int(hctx) == 0:
469
+ hctx = None
470
+ else:
471
+ hctx = drvapi.cu_context(0)
472
+ driver.cuCtxGetCurrent(byref(hctx))
473
+ hctx = hctx if hctx.value else None
474
+
475
+ if hctx is None:
476
+ devnum = None
477
+ else:
478
+ if USE_NV_BINDING:
479
+ devnum = int(driver.cuCtxGetDevice())
480
+ else:
481
+ hdevice = drvapi.cu_device()
482
+ driver.cuCtxGetDevice(byref(hdevice))
483
+ devnum = hdevice.value
484
+
485
+ self._tls_cache.ctx_devnum = (hctx, devnum)
486
+ is_top = True
487
+
488
+ self._is_top = is_top
489
+ self.context_handle = hctx
490
+ self.devnum = devnum
491
+ return self
492
+
493
+ def __exit__(self, exc_type, exc_val, exc_tb):
494
+ if self._is_top:
495
+ delattr(self._tls_cache, 'ctx_devnum')
496
+
497
+ def __bool__(self):
498
+ """Returns True is there's a valid and active CUDA context.
499
+ """
500
+ return self.context_handle is not None
501
+
502
+ __nonzero__ = __bool__
503
+
504
+
505
+ driver = Driver()
506
+
507
+
508
+ def _build_reverse_device_attrs():
509
+ prefix = "CU_DEVICE_ATTRIBUTE_"
510
+ map = utils.UniqueDict()
511
+ for name in dir(enums):
512
+ if name.startswith(prefix):
513
+ map[name[len(prefix):]] = getattr(enums, name)
514
+ return map
515
+
516
+
517
+ DEVICE_ATTRIBUTES = _build_reverse_device_attrs()
518
+
519
+
520
+ class Device(object):
521
+ """
522
+ The device object owns the CUDA contexts. This is owned by the driver
523
+ object. Users should not construct devices directly.
524
+ """
525
+ @classmethod
526
+ def from_identity(self, identity):
527
+ """Create Device object from device identity created by
528
+ ``Device.get_device_identity()``.
529
+ """
530
+ for devid in range(driver.get_device_count()):
531
+ d = driver.get_device(devid)
532
+ if d.get_device_identity() == identity:
533
+ return d
534
+ else:
535
+ errmsg = (
536
+ "No device of {} is found. "
537
+ "Target device may not be visible in this process."
538
+ ).format(identity)
539
+ raise RuntimeError(errmsg)
540
+
541
+ def __init__(self, devnum):
542
+ if USE_NV_BINDING:
543
+ result = driver.cuDeviceGet(devnum)
544
+ self.id = result
545
+ got_devnum = int(result)
546
+ else:
547
+ result = c_int()
548
+ driver.cuDeviceGet(byref(result), devnum)
549
+ got_devnum = result.value
550
+ self.id = got_devnum
551
+
552
+ msg = f"Driver returned device {got_devnum} instead of {devnum}"
553
+ if devnum != got_devnum:
554
+ raise RuntimeError(msg)
555
+
556
+ self.attributes = {}
557
+
558
+ # Read compute capability
559
+ self.compute_capability = (self.COMPUTE_CAPABILITY_MAJOR,
560
+ self.COMPUTE_CAPABILITY_MINOR)
561
+
562
+ # Read name
563
+ bufsz = 128
564
+
565
+ if USE_NV_BINDING:
566
+ buf = driver.cuDeviceGetName(bufsz, self.id)
567
+ name = buf.decode('utf-8').rstrip('\0')
568
+ else:
569
+ buf = (c_char * bufsz)()
570
+ driver.cuDeviceGetName(buf, bufsz, self.id)
571
+ name = buf.value
572
+
573
+ self.name = name
574
+
575
+ # Read UUID
576
+ if USE_NV_BINDING:
577
+ uuid = driver.cuDeviceGetUuid(self.id)
578
+ uuid_vals = tuple(uuid.bytes)
579
+ else:
580
+ uuid = cu_uuid()
581
+ driver.cuDeviceGetUuid(byref(uuid), self.id)
582
+ uuid_vals = tuple(bytes(uuid))
583
+
584
+ b = '%02x'
585
+ b2 = b * 2
586
+ b4 = b * 4
587
+ b6 = b * 6
588
+ fmt = f'GPU-{b4}-{b2}-{b2}-{b2}-{b6}'
589
+ self.uuid = fmt % uuid_vals
590
+
591
+ self.primary_context = None
592
+
593
+ def get_device_identity(self):
594
+ return {
595
+ 'pci_domain_id': self.PCI_DOMAIN_ID,
596
+ 'pci_bus_id': self.PCI_BUS_ID,
597
+ 'pci_device_id': self.PCI_DEVICE_ID,
598
+ }
599
+
600
+ def __repr__(self):
601
+ return "<CUDA device %d '%s'>" % (self.id, self.name)
602
+
603
+ def __getattr__(self, attr):
604
+ """Read attributes lazily
605
+ """
606
+ if USE_NV_BINDING:
607
+ code = getattr(binding.CUdevice_attribute,
608
+ f'CU_DEVICE_ATTRIBUTE_{attr}')
609
+ value = driver.cuDeviceGetAttribute(code, self.id)
610
+ else:
611
+ try:
612
+ code = DEVICE_ATTRIBUTES[attr]
613
+ except KeyError:
614
+ raise AttributeError(attr)
615
+
616
+ result = c_int()
617
+ driver.cuDeviceGetAttribute(byref(result), code, self.id)
618
+ value = result.value
619
+
620
+ setattr(self, attr, value)
621
+ return value
622
+
623
+ def __hash__(self):
624
+ return hash(self.id)
625
+
626
+ def __eq__(self, other):
627
+ if isinstance(other, Device):
628
+ return self.id == other.id
629
+ return False
630
+
631
+ def __ne__(self, other):
632
+ return not (self == other)
633
+
634
+ def get_primary_context(self):
635
+ """
636
+ Returns the primary context for the device.
637
+ Note: it is not pushed to the CPU thread.
638
+ """
639
+ if self.primary_context is not None:
640
+ return self.primary_context
641
+
642
+ met_requirement_for_device(self)
643
+ # create primary context
644
+ if USE_NV_BINDING:
645
+ hctx = driver.cuDevicePrimaryCtxRetain(self.id)
646
+ else:
647
+ hctx = drvapi.cu_context()
648
+ driver.cuDevicePrimaryCtxRetain(byref(hctx), self.id)
649
+
650
+ ctx = Context(weakref.proxy(self), hctx)
651
+ self.primary_context = ctx
652
+ return ctx
653
+
654
+ def release_primary_context(self):
655
+ """
656
+ Release reference to primary context if it has been retained.
657
+ """
658
+ if self.primary_context:
659
+ driver.cuDevicePrimaryCtxRelease(self.id)
660
+ self.primary_context = None
661
+
662
+ def reset(self):
663
+ try:
664
+ if self.primary_context is not None:
665
+ self.primary_context.reset()
666
+ self.release_primary_context()
667
+ finally:
668
+ # reset at the driver level
669
+ driver.cuDevicePrimaryCtxReset(self.id)
670
+
671
+ @property
672
+ def supports_float16(self):
673
+ return self.compute_capability >= (5, 3)
674
+
675
+
676
+ def met_requirement_for_device(device):
677
+ if device.compute_capability < MIN_REQUIRED_CC:
678
+ raise CudaSupportError("%s has compute capability < %s" %
679
+ (device, MIN_REQUIRED_CC))
680
+
681
+
682
+ class BaseCUDAMemoryManager(object, metaclass=ABCMeta):
683
+ """Abstract base class for External Memory Management (EMM) Plugins."""
684
+
685
+ def __init__(self, *args, **kwargs):
686
+ if 'context' not in kwargs:
687
+ raise RuntimeError("Memory manager requires a context")
688
+ self.context = kwargs.pop('context')
689
+
690
+ @abstractmethod
691
+ def memalloc(self, size):
692
+ """
693
+ Allocate on-device memory in the current context.
694
+
695
+ :param size: Size of allocation in bytes
696
+ :type size: int
697
+ :return: A memory pointer instance that owns the allocated memory
698
+ :rtype: :class:`MemoryPointer`
699
+ """
700
+
701
+ @abstractmethod
702
+ def memhostalloc(self, size, mapped, portable, wc):
703
+ """
704
+ Allocate pinned host memory.
705
+
706
+ :param size: Size of the allocation in bytes
707
+ :type size: int
708
+ :param mapped: Whether the allocated memory should be mapped into the
709
+ CUDA address space.
710
+ :type mapped: bool
711
+ :param portable: Whether the memory will be considered pinned by all
712
+ contexts, and not just the calling context.
713
+ :type portable: bool
714
+ :param wc: Whether to allocate the memory as write-combined.
715
+ :type wc: bool
716
+ :return: A memory pointer instance that owns the allocated memory. The
717
+ return type depends on whether the region was mapped into
718
+ device memory.
719
+ :rtype: :class:`MappedMemory` or :class:`PinnedMemory`
720
+ """
721
+
722
+ @abstractmethod
723
+ def mempin(self, owner, pointer, size, mapped):
724
+ """
725
+ Pin a region of host memory that is already allocated.
726
+
727
+ :param owner: The object that owns the memory.
728
+ :param pointer: The pointer to the beginning of the region to pin.
729
+ :type pointer: int
730
+ :param size: The size of the region in bytes.
731
+ :type size: int
732
+ :param mapped: Whether the region should also be mapped into device
733
+ memory.
734
+ :type mapped: bool
735
+ :return: A memory pointer instance that refers to the allocated
736
+ memory.
737
+ :rtype: :class:`MappedMemory` or :class:`PinnedMemory`
738
+ """
739
+
740
+ @abstractmethod
741
+ def initialize(self):
742
+ """
743
+ Perform any initialization required for the EMM plugin instance to be
744
+ ready to use.
745
+
746
+ :return: None
747
+ """
748
+
749
+ @abstractmethod
750
+ def get_ipc_handle(self, memory):
751
+ """
752
+ Return an IPC handle from a GPU allocation.
753
+
754
+ :param memory: Memory for which the IPC handle should be created.
755
+ :type memory: :class:`MemoryPointer`
756
+ :return: IPC handle for the allocation
757
+ :rtype: :class:`IpcHandle`
758
+ """
759
+
760
+ @abstractmethod
761
+ def get_memory_info(self):
762
+ """
763
+ Returns ``(free, total)`` memory in bytes in the context. May raise
764
+ :class:`NotImplementedError` if returning such information is not
765
+ practical (e.g. for a pool allocator).
766
+
767
+ :return: Memory info
768
+ :rtype: :class:`MemoryInfo`
769
+ """
770
+
771
+ @abstractmethod
772
+ def reset(self):
773
+ """
774
+ Clears up all memory allocated in this context.
775
+
776
+ :return: None
777
+ """
778
+
779
+ @abstractmethod
780
+ def defer_cleanup(self):
781
+ """
782
+ Returns a context manager that ensures the implementation of deferred
783
+ cleanup whilst it is active.
784
+
785
+ :return: Context manager
786
+ """
787
+
788
+ @property
789
+ @abstractmethod
790
+ def interface_version(self):
791
+ """
792
+ Returns an integer specifying the version of the EMM Plugin interface
793
+ supported by the plugin implementation. Should always return 1 for
794
+ implementations of this version of the specification.
795
+ """
796
+
797
+
798
+ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
799
+ """Base class for External Memory Management (EMM) Plugins that only
800
+ implement on-device allocation. A subclass need not implement the
801
+ ``memhostalloc`` and ``mempin`` methods.
802
+
803
+ This class also implements ``reset`` and ``defer_cleanup`` (see
804
+ :class:`numba.cuda.BaseCUDAMemoryManager`) for its own internal state
805
+ management. If an EMM Plugin based on this class also implements these
806
+ methods, then its implementations of these must also call the method from
807
+ ``super()`` to give ``HostOnlyCUDAMemoryManager`` an opportunity to do the
808
+ necessary work for the host allocations it is managing.
809
+
810
+ This class does not implement ``interface_version``, as it will always be
811
+ consistent with the version of Numba in which it is implemented. An EMM
812
+ Plugin subclassing this class should implement ``interface_version``
813
+ instead.
814
+ """
815
+
816
+ def __init__(self, *args, **kwargs):
817
+ super().__init__(*args, **kwargs)
818
+ self.allocations = utils.UniqueDict()
819
+ self.deallocations = _PendingDeallocs()
820
+
821
+ def _attempt_allocation(self, allocator):
822
+ """
823
+ Attempt allocation by calling *allocator*. If an out-of-memory error
824
+ is raised, the pending deallocations are flushed and the allocation
825
+ is retried. If it fails in the second attempt, the error is reraised.
826
+ """
827
+ try:
828
+ return allocator()
829
+ except CudaAPIError as e:
830
+ # is out-of-memory?
831
+ if USE_NV_BINDING:
832
+ oom_code = binding.CUresult.CUDA_ERROR_OUT_OF_MEMORY
833
+ else:
834
+ oom_code = enums.CUDA_ERROR_OUT_OF_MEMORY
835
+
836
+ if e.code == oom_code:
837
+ # clear pending deallocations
838
+ self.deallocations.clear()
839
+ # try again
840
+ return allocator()
841
+ else:
842
+ raise
843
+
844
+ def memhostalloc(self, size, mapped=False, portable=False,
845
+ wc=False):
846
+ """Implements the allocation of pinned host memory.
847
+
848
+ It is recommended that this method is not overridden by EMM Plugin
849
+ implementations - instead, use the :class:`BaseCUDAMemoryManager`.
850
+ """
851
+ flags = 0
852
+ if mapped:
853
+ flags |= enums.CU_MEMHOSTALLOC_DEVICEMAP
854
+ if portable:
855
+ flags |= enums.CU_MEMHOSTALLOC_PORTABLE
856
+ if wc:
857
+ flags |= enums.CU_MEMHOSTALLOC_WRITECOMBINED
858
+
859
+ if USE_NV_BINDING:
860
+ def allocator():
861
+ return driver.cuMemHostAlloc(size, flags)
862
+
863
+ if mapped:
864
+ pointer = self._attempt_allocation(allocator)
865
+ else:
866
+ pointer = allocator()
867
+
868
+ alloc_key = pointer
869
+ else:
870
+ pointer = c_void_p()
871
+
872
+ def allocator():
873
+ driver.cuMemHostAlloc(byref(pointer), size, flags)
874
+
875
+ if mapped:
876
+ self._attempt_allocation(allocator)
877
+ else:
878
+ allocator()
879
+
880
+ alloc_key = pointer.value
881
+
882
+ finalizer = _hostalloc_finalizer(self, pointer, alloc_key, size, mapped)
883
+ ctx = weakref.proxy(self.context)
884
+
885
+ if mapped:
886
+ mem = MappedMemory(ctx, pointer, size, finalizer=finalizer)
887
+ self.allocations[alloc_key] = mem
888
+ return mem.own()
889
+ else:
890
+ return PinnedMemory(ctx, pointer, size, finalizer=finalizer)
891
+
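memhostalloc() is what ultimately backs the pinned- and mapped-host-memory helpers exposed by numba.cuda; the mapped/portable/wc arguments translate into the CU_MEMHOSTALLOC_* flags assembled above. A brief sketch through the public API, assuming a working GPU:

    import numpy as np
    from numba import cuda

    # Pinned (page-locked) host memory: faster host<->device copies.
    pinned = cuda.pinned_array(1024, dtype=np.float32)
    pinned[:] = 1.0
    d_arr = cuda.to_device(pinned)

    # Mapped host memory: accessible from device code without an explicit copy.
    mapped = cuda.mapped_array(1024, dtype=np.float32)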
892
+ def mempin(self, owner, pointer, size, mapped=False):
893
+ """Implements the pinning of host memory.
894
+
895
+ It is recommended that this method is not overridden by EMM Plugin
896
+ implementations - instead, use the :class:`BaseCUDAMemoryManager`.
897
+ """
898
+ if isinstance(pointer, int) and not USE_NV_BINDING:
899
+ pointer = c_void_p(pointer)
900
+
901
+ if USE_NV_BINDING:
902
+ alloc_key = pointer
903
+ else:
904
+ alloc_key = pointer.value
905
+
906
+ # possible flags are "portable" (between context)
907
+ # and "device-map" (map host memory to device thus no need
908
+ # for memory transfer).
909
+ flags = 0
910
+
911
+ if mapped:
912
+ flags |= enums.CU_MEMHOSTREGISTER_DEVICEMAP
913
+
914
+ def allocator():
915
+ driver.cuMemHostRegister(pointer, size, flags)
916
+
917
+ if mapped:
918
+ self._attempt_allocation(allocator)
919
+ else:
920
+ allocator()
921
+
922
+ finalizer = _pin_finalizer(self, pointer, alloc_key, mapped)
923
+ ctx = weakref.proxy(self.context)
924
+
925
+ if mapped:
926
+ mem = MappedMemory(ctx, pointer, size, owner=owner,
927
+ finalizer=finalizer)
928
+ self.allocations[alloc_key] = mem
929
+ return mem.own()
930
+ else:
931
+ return PinnedMemory(ctx, pointer, size, owner=owner,
932
+ finalizer=finalizer)
933
+
934
+ def memallocmanaged(self, size, attach_global):
935
+ if USE_NV_BINDING:
936
+ def allocator():
937
+ ma_flags = binding.CUmemAttach_flags
938
+
939
+ if attach_global:
940
+ flags = ma_flags.CU_MEM_ATTACH_GLOBAL.value
941
+ else:
942
+ flags = ma_flags.CU_MEM_ATTACH_HOST.value
943
+
944
+ return driver.cuMemAllocManaged(size, flags)
945
+
946
+ ptr = self._attempt_allocation(allocator)
947
+
948
+ alloc_key = ptr
949
+
950
+ else:
951
+ ptr = drvapi.cu_device_ptr()
952
+
953
+ def allocator():
954
+ flags = c_uint()
955
+ if attach_global:
956
+ flags = enums.CU_MEM_ATTACH_GLOBAL
957
+ else:
958
+ flags = enums.CU_MEM_ATTACH_HOST
959
+
960
+ driver.cuMemAllocManaged(byref(ptr), size, flags)
961
+
962
+ self._attempt_allocation(allocator)
963
+
964
+ alloc_key = ptr.value
965
+
966
+ finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
967
+ ctx = weakref.proxy(self.context)
968
+ mem = ManagedMemory(ctx, ptr, size, finalizer=finalizer)
969
+ self.allocations[alloc_key] = mem
970
+ return mem.own()
971
+
972
+ def reset(self):
973
+ """Clears up all host memory (mapped and/or pinned) in the current
974
+ context.
975
+
976
+ EMM Plugins that override this method must call ``super().reset()`` to
977
+ ensure that host allocations are also cleaned up."""
978
+ self.allocations.clear()
979
+ self.deallocations.clear()
980
+
981
+ @contextlib.contextmanager
982
+ def defer_cleanup(self):
983
+ """Returns a context manager that disables cleanup of mapped or pinned
984
+ host memory in the current context whilst it is active.
985
+
986
+ EMM Plugins that override this method must obtain the context manager
987
+ from this method before yielding to ensure that cleanup of host
988
+ allocations is also deferred."""
989
+ with self.deallocations.disable():
990
+ yield
991
+
992
+
993
+ class GetIpcHandleMixin:
994
+ """A class that provides a default implementation of ``get_ipc_handle()``.
995
+ """
996
+
997
+ def get_ipc_handle(self, memory):
998
+ """Open an IPC memory handle by using ``cuMemGetAddressRange`` to
999
+ determine the base pointer of the allocation. An IPC handle of type
1000
+ ``cu_ipc_mem_handle`` is constructed and initialized with
1001
+ ``cuIpcGetMemHandle``. A :class:`numba.cuda.IpcHandle` is returned,
1002
+ populated with the underlying ``ipc_mem_handle``.
1003
+ """
1004
+ base, end = device_extents(memory)
1005
+ if USE_NV_BINDING:
1006
+ ipchandle = driver.cuIpcGetMemHandle(base)
1007
+ offset = int(memory.handle) - int(base)
1008
+ else:
1009
+ ipchandle = drvapi.cu_ipc_mem_handle()
1010
+ driver.cuIpcGetMemHandle(byref(ipchandle), base)
1011
+ offset = memory.handle.value - base
1012
+ source_info = self.context.device.get_device_identity()
1013
+
1014
+ return IpcHandle(memory, ipchandle, memory.size, source_info,
1015
+ offset=offset)
1016
+
1017
+
1018
+ class NumbaCUDAMemoryManager(GetIpcHandleMixin, HostOnlyCUDAMemoryManager):
1019
+ """Internal on-device memory management for Numba. This is implemented using
1020
+ the EMM Plugin interface, but is not part of the public API."""
1021
+
1022
+ def initialize(self):
1023
+ # Set the memory capacity of *deallocations* as the memory manager
1024
+ # becomes active for the first time
1025
+ if self.deallocations.memory_capacity == _SizeNotSet:
1026
+ self.deallocations.memory_capacity = self.get_memory_info().total
1027
+
1028
+ def memalloc(self, size):
1029
+ if USE_NV_BINDING:
1030
+ def allocator():
1031
+ return driver.cuMemAlloc(size)
1032
+
1033
+ ptr = self._attempt_allocation(allocator)
1034
+ alloc_key = ptr
1035
+ else:
1036
+ ptr = drvapi.cu_device_ptr()
1037
+
1038
+ def allocator():
1039
+ driver.cuMemAlloc(byref(ptr), size)
1040
+
1041
+ self._attempt_allocation(allocator)
1042
+ alloc_key = ptr.value
1043
+
1044
+ finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
1045
+ ctx = weakref.proxy(self.context)
1046
+ mem = AutoFreePointer(ctx, ptr, size, finalizer=finalizer)
1047
+ self.allocations[alloc_key] = mem
1048
+ return mem.own()
1049
+
1050
+ def get_memory_info(self):
1051
+ if USE_NV_BINDING:
1052
+ free, total = driver.cuMemGetInfo()
1053
+ else:
1054
+ free = c_size_t()
1055
+ total = c_size_t()
1056
+ driver.cuMemGetInfo(byref(free), byref(total))
1057
+ free = free.value
1058
+ total = total.value
1059
+
1060
+ return MemoryInfo(free=free, total=total)
1061
+
1062
+ @property
1063
+ def interface_version(self):
1064
+ return _SUPPORTED_EMM_INTERFACE_VERSION
1065
+
1066
+
1067
+ _SUPPORTED_EMM_INTERFACE_VERSION = 1
1068
+
1069
+ _memory_manager = None
1070
+
1071
+
1072
+ def _ensure_memory_manager():
1073
+ global _memory_manager
1074
+
1075
+ if _memory_manager:
1076
+ return
1077
+
1078
+ if config.CUDA_MEMORY_MANAGER == 'default':
1079
+ _memory_manager = NumbaCUDAMemoryManager
1080
+ return
1081
+
1082
+ try:
1083
+ mgr_module = importlib.import_module(config.CUDA_MEMORY_MANAGER)
1084
+ set_memory_manager(mgr_module._numba_memory_manager)
1085
+ except Exception:
1086
+ raise RuntimeError("Failed to use memory manager from %s" %
1087
+ config.CUDA_MEMORY_MANAGER)
1088
+
1089
+
1090
+ def set_memory_manager(mm_plugin):
1091
+ """Configure Numba to use an External Memory Management (EMM) Plugin. If
1092
+ the EMM Plugin version does not match one supported by this version of
1093
+ Numba, a RuntimeError will be raised.
1094
+
1095
+ :param mm_plugin: The class implementing the EMM Plugin.
1096
+ :type mm_plugin: BaseCUDAMemoryManager
1097
+ :return: None
1098
+ """
1099
+ global _memory_manager
1100
+
1101
+ dummy = mm_plugin(context=None)
1102
+ iv = dummy.interface_version
1103
+ if iv != _SUPPORTED_EMM_INTERFACE_VERSION:
1104
+ err = "EMM Plugin interface has version %d - version %d required" \
1105
+ % (iv, _SUPPORTED_EMM_INTERFACE_VERSION)
1106
+ raise RuntimeError(err)
1107
+
1108
+ _memory_manager = mm_plugin
1109
+
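set_memory_manager() is also exposed as numba.cuda.set_memory_manager. A minimal sketch of an EMM plugin that simply wraps the default NumbaCUDAMemoryManager defined above and logs device allocations; it should be installed before any CUDA context is created:

    from numba import cuda
    from numba.cuda.cudadrv.driver import NumbaCUDAMemoryManager

    class LoggingMemoryManager(NumbaCUDAMemoryManager):
        # Delegates all real work to Numba's default manager; only adds a print.
        def memalloc(self, size):
            print(f"device alloc of {size} bytes")
            return super().memalloc(size)

    cuda.set_memory_manager(LoggingMemoryManager)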
1110
+
1111
+ class _SizeNotSet(int):
1112
+ """
1113
+ Dummy object for _PendingDeallocs when *size* is not set.
1114
+ """
1115
+
1116
+ def __new__(cls, *args, **kwargs):
1117
+ return super().__new__(cls, 0)
1118
+
1119
+ def __str__(self):
1120
+ return '?'
1121
+
1122
+
1123
+ _SizeNotSet = _SizeNotSet()
1124
+
1125
+
1126
+ class _PendingDeallocs(object):
1127
+ """
1128
+ Pending deallocations of a context (or device since we are using the primary
1129
+ context). The capacity defaults to being unset (_SizeNotSet) but can be
1130
+ modified later once the driver is initialized and the total memory capacity
1131
+ is known.
1132
+ """
1133
+ def __init__(self, capacity=_SizeNotSet):
1134
+ self._cons = deque()
1135
+ self._disable_count = 0
1136
+ self._size = 0
1137
+ self.memory_capacity = capacity
1138
+
1139
+ @property
1140
+ def _max_pending_bytes(self):
1141
+ return int(self.memory_capacity * config.CUDA_DEALLOCS_RATIO)
1142
+
1143
+ def add_item(self, dtor, handle, size=_SizeNotSet):
1144
+ """
1145
+ Add a pending deallocation.
1146
+
1147
+ The *dtor* arg is the destructor function that takes an argument,
1148
+ *handle*. It is used as ``dtor(handle)``. The *size* arg is the
1149
+ byte size of the resource added. It is an optional argument. Some
1150
+ resources (e.g. CUModule) have an unknown memory footprint on the device.
1151
+ """
1152
+ _logger.info('add pending dealloc: %s %s bytes', dtor.__name__, size)
1153
+ self._cons.append((dtor, handle, size))
1154
+ self._size += int(size)
1155
+ if (len(self._cons) > config.CUDA_DEALLOCS_COUNT or
1156
+ self._size > self._max_pending_bytes):
1157
+ self.clear()
1158
+
1159
+ def clear(self):
1160
+ """
1161
+ Flush any pending deallocations unless it is disabled.
1162
+ Do nothing if disabled.
1163
+ """
1164
+ if not self.is_disabled:
1165
+ while self._cons:
1166
+ [dtor, handle, size] = self._cons.popleft()
1167
+ _logger.info('dealloc: %s %s bytes', dtor.__name__, size)
1168
+ dtor(handle)
1169
+ self._size = 0
1170
+
1171
+ @contextlib.contextmanager
1172
+ def disable(self):
1173
+ """
1174
+ Context manager to temporarily disable flushing pending deallocation.
1175
+ This can be nested.
1176
+ """
1177
+ self._disable_count += 1
1178
+ try:
1179
+ yield
1180
+ finally:
1181
+ self._disable_count -= 1
1182
+ assert self._disable_count >= 0
1183
+
1184
+ @property
1185
+ def is_disabled(self):
1186
+ return self._disable_count > 0
1187
+
1188
+ def __len__(self):
1189
+ """
1190
+ Returns number of pending deallocations.
1191
+ """
1192
+ return len(self._cons)
1193
+
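_PendingDeallocs is what backs the public numba.cuda.defer_cleanup() context manager: its disable() method suppresses flushing while the block is active, and normal (threshold-driven) flushing resumes when the outermost block exits. A usage sketch of the public entry point, assuming a working GPU:

    import numpy as np
    from numba import cuda

    arr = np.arange(1 << 20, dtype=np.float32)

    with cuda.defer_cleanup():
        for _ in range(10):
            d_arr = cuda.to_device(arr)
            del d_arr   # frees are queued, not issued, while cleanup is deferred
    # normal deallocation behaviour resumes after the block exits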
1194
+
1195
+ MemoryInfo = namedtuple("MemoryInfo", "free,total")
1196
+ """Free and total memory for a device.
1197
+
1198
+ .. py:attribute:: free
1199
+
1200
+ Free device memory in bytes.
1201
+
1202
+ .. py:attribute:: total
1203
+
1204
+ Total device memory in bytes.
1205
+ """
1206
+
1207
+
1208
+ class Context(object):
1209
+ """
1210
+ This object wraps a CUDA Context resource.
1211
+
1212
+ Contexts should not be constructed directly by user code.
1213
+ """
1214
+
1215
+ def __init__(self, device, handle):
1216
+ self.device = device
1217
+ self.handle = handle
1218
+ self.allocations = utils.UniqueDict()
1219
+ self.deallocations = _PendingDeallocs()
1220
+ _ensure_memory_manager()
1221
+ self.memory_manager = _memory_manager(context=self)
1222
+ self.modules = utils.UniqueDict()
1223
+ # For storing context specific data
1224
+ self.extras = {}
1225
+
1226
+ def reset(self):
1227
+ """
1228
+ Clean up all owned resources in this context.
1229
+ """
1230
+ # Free owned resources
1231
+ _logger.info('reset context of device %s', self.device.id)
1232
+ self.memory_manager.reset()
1233
+ self.modules.clear()
1234
+ # Clear trash
1235
+ self.deallocations.clear()
1236
+
1237
+ def get_memory_info(self):
1238
+ """Returns (free, total) memory in bytes in the context.
1239
+ """
1240
+ return self.memory_manager.get_memory_info()
1241
+
1242
+ def get_active_blocks_per_multiprocessor(self, func, blocksize, memsize,
1243
+ flags=None):
1244
+ """Return occupancy of a function.
1245
+ :param func: kernel for which occupancy is calculated
1246
+ :param blocksize: block size the kernel is intended to be launched with
1247
+ :param memsize: per-block dynamic shared memory usage intended, in bytes
1248
+ """
1249
+ args = (func, blocksize, memsize, flags)
1250
+ if USE_NV_BINDING:
1251
+ return self._cuda_python_active_blocks_per_multiprocessor(*args)
1252
+ else:
1253
+ return self._ctypes_active_blocks_per_multiprocessor(*args)
1254
+
1255
+ def _cuda_python_active_blocks_per_multiprocessor(self, func, blocksize,
1256
+ memsize, flags):
1257
+ ps = [func.handle, blocksize, memsize]
1258
+
1259
+ if not flags:
1260
+ return driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(*ps)
1261
+
1262
+ ps.append(flags)
1263
+ return driver.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(*ps)
1264
+
1265
+ def _ctypes_active_blocks_per_multiprocessor(self, func, blocksize,
1266
+ memsize, flags):
1267
+ retval = c_int()
1268
+ args = (byref(retval), func.handle, blocksize, memsize)
1269
+
1270
+ if not flags:
1271
+ driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(*args)
1272
+ else:
1273
+ driver.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(*args)
1274
+
1275
+ return retval.value
1276
+
1277
+ def get_max_potential_block_size(self, func, b2d_func, memsize,
1278
+ blocksizelimit, flags=None):
1279
+ """Suggest a launch configuration with reasonable occupancy.
1280
+ :param func: kernel for which occupancy is calculated
1281
+ :param b2d_func: function that calculates how much per-block dynamic
1282
+ shared memory 'func' uses based on the block size.
1283
+ Can also be the address of a C function.
1284
+ Use `0` to pass `NULL` to the underlying CUDA API.
1285
+ :param memsize: per-block dynamic shared memory usage intended, in bytes
1286
+ :param blocksizelimit: maximum block size the kernel is designed to
1287
+ handle
1288
+ """
1289
+ args = (func, b2d_func, memsize, blocksizelimit, flags)
1290
+ if USE_NV_BINDING:
1291
+ return self._cuda_python_max_potential_block_size(*args)
1292
+ else:
1293
+ return self._ctypes_max_potential_block_size(*args)
1294
+
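get_max_potential_block_size() wraps cuOccupancyMaxPotentialBlockSize to suggest a (gridsize, blocksize) pair; in Numba it is what a kernel's .forall() helper uses to pick a launch configuration for a given number of elements. A brief sketch of that public entry point, assuming a working GPU:

    import numpy as np
    from numba import cuda

    @cuda.jit
    def scale(out, x, factor):
        i = cuda.grid(1)
        if i < x.size:
            out[i] = x[i] * factor

    n = 1_000_000
    x = np.arange(n, dtype=np.float32)
    out = np.zeros_like(x)

    # forall() asks the driver for an occupancy-based launch configuration
    # instead of an explicit [blocks, threads] launch.
    scale.forall(n)(out, x, 2.0)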
1295
+ def _ctypes_max_potential_block_size(self, func, b2d_func, memsize,
1296
+ blocksizelimit, flags):
1297
+ gridsize = c_int()
1298
+ blocksize = c_int()
1299
+ b2d_cb = cu_occupancy_b2d_size(b2d_func)
1300
+ args = [byref(gridsize), byref(blocksize), func.handle, b2d_cb,
1301
+ memsize, blocksizelimit]
1302
+
1303
+ if not flags:
1304
+ driver.cuOccupancyMaxPotentialBlockSize(*args)
1305
+ else:
1306
+ args.append(flags)
1307
+ driver.cuOccupancyMaxPotentialBlockSizeWithFlags(*args)
1308
+
1309
+ return (gridsize.value, blocksize.value)
1310
+
1311
+ def _cuda_python_max_potential_block_size(self, func, b2d_func, memsize,
1312
+ blocksizelimit, flags):
1313
+ b2d_cb = ctypes.CFUNCTYPE(c_size_t, c_int)(b2d_func)
1314
+ ptr = int.from_bytes(b2d_cb, byteorder='little')
1315
+ driver_b2d_cb = binding.CUoccupancyB2DSize(ptr)
1316
+ args = [func.handle, driver_b2d_cb, memsize, blocksizelimit]
1317
+
1318
+ if not flags:
1319
+ return driver.cuOccupancyMaxPotentialBlockSize(*args)
1320
+ else:
1321
+ args.append(flags)
1322
+ return driver.cuOccupancyMaxPotentialBlockSizeWithFlags(*args)
1323
+
1324
+ def prepare_for_use(self):
1325
+ """Initialize the context for use.
1326
+ It is safe to call this multiple times.
1327
+ """
1328
+ self.memory_manager.initialize()
1329
+
1330
+ def push(self):
1331
+ """
1332
+ Pushes this context onto the current CPU thread.
1333
+ """
1334
+ driver.cuCtxPushCurrent(self.handle)
1335
+ self.prepare_for_use()
1336
+
1337
+ def pop(self):
1338
+ """
1339
+ Pops this context off the current CPU thread. Note that this context
1340
+ must be at the top of the context stack, otherwise an error will occur.
1341
+ """
1342
+ popped = driver.pop_active_context()
1343
+ if USE_NV_BINDING:
1344
+ assert int(popped) == int(self.handle)
1345
+ else:
1346
+ assert popped.value == self.handle.value
1347
+
1348
+ def memalloc(self, bytesize):
1349
+ return self.memory_manager.memalloc(bytesize)
1350
+
1351
+ def memallocmanaged(self, bytesize, attach_global=True):
1352
+ return self.memory_manager.memallocmanaged(bytesize, attach_global)
1353
+
1354
+ def memhostalloc(self, bytesize, mapped=False, portable=False, wc=False):
1355
+ return self.memory_manager.memhostalloc(bytesize, mapped, portable, wc)
1356
+
1357
+ def mempin(self, owner, pointer, size, mapped=False):
1358
+ if mapped and not self.device.CAN_MAP_HOST_MEMORY:
1359
+ raise CudaDriverError("%s cannot map host memory" % self.device)
1360
+ return self.memory_manager.mempin(owner, pointer, size, mapped)
1361
+
1362
+ def get_ipc_handle(self, memory):
1363
+ """
1364
+ Returns an *IpcHandle* from a GPU allocation.
1365
+ """
1366
+ if not SUPPORTS_IPC:
1367
+ raise OSError('OS does not support CUDA IPC')
1368
+ return self.memory_manager.get_ipc_handle(memory)
1369
+
1370
+ def open_ipc_handle(self, handle, size):
1371
+ # open the IPC handle to get the device pointer
1372
+ flags = 1 # CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
1373
+ if USE_NV_BINDING:
1374
+ dptr = driver.cuIpcOpenMemHandle(handle, flags)
1375
+ else:
1376
+ dptr = drvapi.cu_device_ptr()
1377
+ driver.cuIpcOpenMemHandle(byref(dptr), handle, flags)
1378
+
1379
+ # wrap it
1380
+ return MemoryPointer(context=weakref.proxy(self), pointer=dptr,
1381
+ size=size)
1382
+
1383
+ def enable_peer_access(self, peer_context, flags=0):
1384
+ """Enable peer access between the current context and the peer context
1385
+ """
1386
+ assert flags == 0, '*flags* is reserved and MUST be zero'
1387
+ driver.cuCtxEnablePeerAccess(peer_context, flags)
1388
+
1389
+ def can_access_peer(self, peer_device):
1390
+ """Returns a bool indicating whether the peer access between the
1391
+ current and peer device is possible.
1392
+ """
1393
+ if USE_NV_BINDING:
1394
+ peer_device = binding.CUdevice(peer_device)
1395
+ can_access_peer = driver.cuDeviceCanAccessPeer(self.device.id,
1396
+ peer_device)
1397
+ else:
1398
+ can_access_peer = c_int()
1399
+ driver.cuDeviceCanAccessPeer(byref(can_access_peer),
1400
+ self.device.id, peer_device,)
1401
+
1402
+ return bool(can_access_peer)
1403
+
1404
+ def create_module_ptx(self, ptx):
1405
+ if isinstance(ptx, str):
1406
+ ptx = ptx.encode('utf8')
1407
+ if USE_NV_BINDING:
1408
+ image = ptx
1409
+ else:
1410
+ image = c_char_p(ptx)
1411
+ return self.create_module_image(image)
1412
+
1413
+ def create_module_image(self, image):
1414
+ module = load_module_image(self, image)
1415
+ if USE_NV_BINDING:
1416
+ key = module.handle
1417
+ else:
1418
+ key = module.handle.value
1419
+ self.modules[key] = module
1420
+ return weakref.proxy(module)
1421
+
1422
+ def unload_module(self, module):
1423
+ if USE_NV_BINDING:
1424
+ key = module.handle
1425
+ else:
1426
+ key = module.handle.value
1427
+ del self.modules[key]
1428
+
1429
+ def get_default_stream(self):
1430
+ if USE_NV_BINDING:
1431
+ handle = binding.CUstream(CU_STREAM_DEFAULT)
1432
+ else:
1433
+ handle = drvapi.cu_stream(drvapi.CU_STREAM_DEFAULT)
1434
+ return Stream(weakref.proxy(self), handle, None)
1435
+
1436
+ def get_legacy_default_stream(self):
1437
+ if USE_NV_BINDING:
1438
+ handle = binding.CUstream(binding.CU_STREAM_LEGACY)
1439
+ else:
1440
+ handle = drvapi.cu_stream(drvapi.CU_STREAM_LEGACY)
1441
+ return Stream(weakref.proxy(self), handle, None)
1442
+
1443
+ def get_per_thread_default_stream(self):
1444
+ if USE_NV_BINDING:
1445
+ handle = binding.CUstream(binding.CU_STREAM_PER_THREAD)
1446
+ else:
1447
+ handle = drvapi.cu_stream(drvapi.CU_STREAM_PER_THREAD)
1448
+ return Stream(weakref.proxy(self), handle, None)
1449
+
1450
+ def create_stream(self):
1451
+ if USE_NV_BINDING:
1452
+ # The default stream creation flag, specifying that the created
1453
+ # stream synchronizes with stream 0 (this is different from the
1454
+ # default stream, which we define also as CU_STREAM_DEFAULT when
1455
+ # the NV binding is in use).
1456
+ flags = binding.CUstream_flags.CU_STREAM_DEFAULT.value
1457
+ handle = driver.cuStreamCreate(flags)
1458
+ else:
1459
+ handle = drvapi.cu_stream()
1460
+ driver.cuStreamCreate(byref(handle), 0)
1461
+ return Stream(weakref.proxy(self), handle,
1462
+ _stream_finalizer(self.deallocations, handle))
1463
+
1464
+ def create_external_stream(self, ptr):
1465
+ if not isinstance(ptr, int):
1466
+ raise TypeError("ptr for external stream must be an int")
1467
+ if USE_NV_BINDING:
1468
+ handle = binding.CUstream(ptr)
1469
+ else:
1470
+ handle = drvapi.cu_stream(ptr)
1471
+ return Stream(weakref.proxy(self), handle, None,
1472
+ external=True)
1473
+
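# Illustrative sketch (not part of the diff): the different ways a Context
# hands out Stream objects. cuda.current_context() is public API; the raw
# address for create_external_stream is a placeholder.
from numba import cuda

ctx = cuda.current_context()
s0 = ctx.get_default_stream()                 # stream 0 semantics
legacy = ctx.get_legacy_default_stream()      # CU_STREAM_LEGACY
per_thread = ctx.get_per_thread_default_stream()  # CU_STREAM_PER_THREAD
s_new = ctx.create_stream()                   # owned; destroyed via its finalizer
# s_ext = ctx.create_external_stream(raw_cu_stream_address)  # wrap a foreign CUstream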
1474
+ def create_event(self, timing=True):
1475
+ flags = 0
1476
+ if not timing:
1477
+ flags |= enums.CU_EVENT_DISABLE_TIMING
1478
+ if USE_NV_BINDING:
1479
+ handle = driver.cuEventCreate(flags)
1480
+ else:
1481
+ handle = drvapi.cu_event()
1482
+ driver.cuEventCreate(byref(handle), flags)
1483
+ return Event(weakref.proxy(self), handle,
1484
+ finalizer=_event_finalizer(self.deallocations, handle))
1485
+
1486
+ def synchronize(self):
1487
+ driver.cuCtxSynchronize()
1488
+
1489
+ @contextlib.contextmanager
1490
+ def defer_cleanup(self):
1491
+ with self.memory_manager.defer_cleanup():
1492
+ with self.deallocations.disable():
1493
+ yield
1494
+
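# Illustrative sketch (not part of the diff): defer_cleanup() suspends both the
# memory manager's cleanup and the deallocation queue, so frees requested inside
# the block are only serviced after it exits.
from numba import cuda

ctx = cuda.current_context()
with ctx.defer_cleanup():
    buf = ctx.memalloc(1 << 20)   # 1 MiB device allocation
    del buf                       # finalizer enqueues the free, but it is deferred
# on exit, pending deallocations may be processed again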
1495
+ def __repr__(self):
1496
+ return "<CUDA context %s of device %d>" % (self.handle, self.device.id)
1497
+
1498
+ def __eq__(self, other):
1499
+ if isinstance(other, Context):
1500
+ return self.handle == other.handle
1501
+ else:
1502
+ return NotImplemented
1503
+
1504
+ def __ne__(self, other):
1505
+ return not self.__eq__(other)
1506
+
1507
+
1508
+ def load_module_image(context, image):
1509
+ """
1510
+ image must be a pointer
1511
+ """
1512
+ if USE_NV_BINDING:
1513
+ return load_module_image_cuda_python(context, image)
1514
+ else:
1515
+ return load_module_image_ctypes(context, image)
1516
+
1517
+
1518
+ def load_module_image_ctypes(context, image):
1519
+ logsz = config.CUDA_LOG_SIZE
1520
+
1521
+ jitinfo = (c_char * logsz)()
1522
+ jiterrors = (c_char * logsz)()
1523
+
1524
+ options = {
1525
+ enums.CU_JIT_INFO_LOG_BUFFER: addressof(jitinfo),
1526
+ enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
1527
+ enums.CU_JIT_ERROR_LOG_BUFFER: addressof(jiterrors),
1528
+ enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
1529
+ enums.CU_JIT_LOG_VERBOSE: c_void_p(config.CUDA_VERBOSE_JIT_LOG),
1530
+ }
1531
+
1532
+ option_keys = (drvapi.cu_jit_option * len(options))(*options.keys())
1533
+ option_vals = (c_void_p * len(options))(*options.values())
1534
+
1535
+ handle = drvapi.cu_module()
1536
+ try:
1537
+ driver.cuModuleLoadDataEx(byref(handle), image, len(options),
1538
+ option_keys, option_vals)
1539
+ except CudaAPIError as e:
1540
+ msg = "cuModuleLoadDataEx error:\n%s" % jiterrors.value.decode("utf8")
1541
+ raise CudaAPIError(e.code, msg)
1542
+
1543
+ info_log = jitinfo.value
1544
+
1545
+ return CtypesModule(weakref.proxy(context), handle, info_log,
1546
+ _module_finalizer(context, handle))
1547
+
1548
+
1549
+ def load_module_image_cuda_python(context, image):
1550
+ """
1551
+ image must be a pointer
1552
+ """
1553
+ logsz = config.CUDA_LOG_SIZE
1554
+
1555
+ jitinfo = bytearray(logsz)
1556
+ jiterrors = bytearray(logsz)
1557
+
1558
+ jit_option = binding.CUjit_option
1559
+ options = {
1560
+ jit_option.CU_JIT_INFO_LOG_BUFFER: jitinfo,
1561
+ jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz,
1562
+ jit_option.CU_JIT_ERROR_LOG_BUFFER: jiterrors,
1563
+ jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz,
1564
+ jit_option.CU_JIT_LOG_VERBOSE: config.CUDA_VERBOSE_JIT_LOG,
1565
+ }
1566
+
1567
+ option_keys = [k for k in options.keys()]
1568
+ option_vals = [v for v in options.values()]
1569
+
1570
+ try:
1571
+ handle = driver.cuModuleLoadDataEx(image, len(options), option_keys,
1572
+ option_vals)
1573
+ except CudaAPIError as e:
1574
+ err_string = jiterrors.decode('utf-8')
1575
+ msg = "cuModuleLoadDataEx error:\n%s" % err_string
1576
+ raise CudaAPIError(e.code, msg)
1577
+
1578
+ info_log = jitinfo.decode('utf-8')
1579
+
1580
+ return CudaPythonModule(weakref.proxy(context), handle, info_log,
1581
+ _module_finalizer(context, handle))
1582
+
1583
+
1584
+ def _alloc_finalizer(memory_manager, ptr, alloc_key, size):
1585
+ allocations = memory_manager.allocations
1586
+ deallocations = memory_manager.deallocations
1587
+
1588
+ def core():
1589
+ if allocations:
1590
+ del allocations[alloc_key]
1591
+ deallocations.add_item(driver.cuMemFree, ptr, size)
1592
+
1593
+ return core
1594
+
1595
+
1596
+ def _hostalloc_finalizer(memory_manager, ptr, alloc_key, size, mapped):
1597
+ """
1598
+ Finalize page-locked host memory allocated by `context.memhostalloc`.
1599
+
1600
+ This memory is managed by CUDA, and finalization entails deallocation. The
1601
+ issues noted in `_pin_finalizer` are not relevant in this case, and the
1602
+ finalization is placed in the `context.deallocations` queue along with
1603
+ finalization of device objects.
1604
+
1605
+ """
1606
+ allocations = memory_manager.allocations
1607
+ deallocations = memory_manager.deallocations
1608
+ if not mapped:
1609
+ size = _SizeNotSet
1610
+
1611
+ def core():
1612
+ if mapped and allocations:
1613
+ del allocations[alloc_key]
1614
+ deallocations.add_item(driver.cuMemFreeHost, ptr, size)
1615
+
1616
+ return core
1617
+
1618
+
1619
+ def _pin_finalizer(memory_manager, ptr, alloc_key, mapped):
1620
+ """
1621
+ Finalize temporary page-locking of host memory by `context.mempin`.
1622
+
1623
+ This applies to memory not otherwise managed by CUDA. Page-locking can
1624
+ be requested multiple times on the same memory, and must therefore be
1625
+ lifted as soon as finalization is requested, otherwise subsequent calls to
1626
+ `mempin` may fail with `CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED`, leading
1627
+ to unexpected behavior for the context managers `cuda.{pinned,mapped}`.
1628
+ This function therefore carries out finalization immediately, bypassing the
1629
+ `context.deallocations` queue.
1630
+
1631
+ """
1632
+ allocations = memory_manager.allocations
1633
+
1634
+ def core():
1635
+ if mapped and allocations:
1636
+ del allocations[alloc_key]
1637
+ driver.cuMemHostUnregister(ptr)
1638
+
1639
+ return core
1640
+
1641
+
1642
+ def _event_finalizer(deallocs, handle):
1643
+ def core():
1644
+ deallocs.add_item(driver.cuEventDestroy, handle)
1645
+
1646
+ return core
1647
+
1648
+
1649
+ def _stream_finalizer(deallocs, handle):
1650
+ def core():
1651
+ deallocs.add_item(driver.cuStreamDestroy, handle)
1652
+
1653
+ return core
1654
+
1655
+
1656
+ def _module_finalizer(context, handle):
1657
+ dealloc = context.deallocations
1658
+ modules = context.modules
1659
+
1660
+ if USE_NV_BINDING:
1661
+ key = handle
1662
+ else:
1663
+ key = handle.value
1664
+
1665
+ def core():
1666
+ shutting_down = utils.shutting_down # early bind
1667
+
1668
+ def module_unload(handle):
1669
+ # If we are not shutting down, we must be called due to
1670
+ # Context.reset() or Context.unload_module(). Both must have
1671
+ # cleared the module reference from the context.
1672
+ assert shutting_down() or key not in modules
1673
+ driver.cuModuleUnload(handle)
1674
+
1675
+ dealloc.add_item(module_unload, handle)
1676
+
1677
+ return core
1678
+
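# Illustrative sketch (not part of the diff): the _*_finalizer helpers above
# share one pattern - capture the resources in a closure, return a zero-argument
# core() function, and register it with weakref.finalize so the teardown runs
# when the owning object is garbage collected. The queue class here is a
# stand-in (assumption), not the real deallocations container.
import weakref


class _DemoQueue:
    def add_item(self, dtor, handle):
        dtor(handle)              # a real queue would defer this call


def _demo_finalizer(deallocs, handle):
    def core():
        deallocs.add_item(lambda h: print("releasing", h), handle)
    return core


class _Owner:
    pass


obj = _Owner()
weakref.finalize(obj, _demo_finalizer(_DemoQueue(), handle=42))
del obj                           # core() runs here (or at interpreter shutdown)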
1679
+
1680
+ class _CudaIpcImpl(object):
1681
+ """Implementation of GPU IPC using CUDA driver API.
1682
+ This requires the devices to be peer accessible.
1683
+ """
1684
+ def __init__(self, parent):
1685
+ self.base = parent.base
1686
+ self.handle = parent.handle
1687
+ self.size = parent.size
1688
+ self.offset = parent.offset
1689
+ # remember if the handle is already opened
1690
+ self._opened_mem = None
1691
+
1692
+ def open(self, context):
1693
+ """
1694
+ Import the IPC memory and return a raw CUDA memory pointer object.
1695
+ """
1696
+ if self.base is not None:
1697
+ raise ValueError('opening IpcHandle from original process')
1698
+
1699
+ if self._opened_mem is not None:
1700
+ raise ValueError('IpcHandle is already opened')
1701
+
1702
+ mem = context.open_ipc_handle(self.handle, self.offset + self.size)
1703
+ # this object owns the opened allocation
1704
+ # note: the memory is required to be freed only after the IPC handle is
1705
+ # closed by the importing context.
1706
+ self._opened_mem = mem
1707
+ return mem.own().view(self.offset)
1708
+
1709
+ def close(self):
1710
+ if self._opened_mem is None:
1711
+ raise ValueError('IpcHandle not opened')
1712
+ driver.cuIpcCloseMemHandle(self._opened_mem.handle)
1713
+ self._opened_mem = None
1714
+
1715
+
1716
+ class _StagedIpcImpl(object):
1717
+ """Implementation of GPU IPC using custom staging logic to workaround
1718
+ CUDA IPC limitation on peer accessibility between devices.
1719
+ """
1720
+ def __init__(self, parent, source_info):
1721
+ self.parent = parent
1722
+ self.base = parent.base
1723
+ self.handle = parent.handle
1724
+ self.size = parent.size
1725
+ self.source_info = source_info
1726
+
1727
+ def open(self, context):
1728
+ from numba import cuda
1729
+
1730
+ srcdev = Device.from_identity(self.source_info)
1731
+ if USE_NV_BINDING:
1732
+ srcdev_id = int(srcdev.id)
1733
+ else:
1734
+ srcdev_id = srcdev.id
1735
+
1736
+ impl = _CudaIpcImpl(parent=self.parent)
1737
+ # Open context on the source device.
1738
+ with cuda.gpus[srcdev_id]:
1739
+ source_ptr = impl.open(cuda.devices.get_context())
1740
+
1741
+ # Allocate GPU buffer.
1742
+ newmem = context.memalloc(self.size)
1743
+ # Do D->D from the source peer-context
1744
+ # This performs automatic host staging
1745
+ device_to_device(newmem, source_ptr, self.size)
1746
+
1747
+ # Cleanup source context
1748
+ with cuda.gpus[srcdev_id]:
1749
+ impl.close()
1750
+
1751
+ return newmem
1752
+
1753
+ def close(self):
1754
+ # Nothing has to be done here
1755
+ pass
1756
+
1757
+
1758
+ class IpcHandle(object):
1759
+ """
1760
+ CUDA IPC handle. Serialization of the CUDA IPC handle object is implemented
1761
+ here.
1762
+
1763
+ :param base: A reference to the original allocation to keep it alive
1764
+ :type base: MemoryPointer
1765
+ :param handle: The CUDA IPC handle, as a ctypes array of bytes.
1766
+ :param size: Size of the original allocation
1767
+ :type size: int
1768
+ :param source_info: The identity of the device on which the IPC handle was
1769
+ opened.
1770
+ :type source_info: dict
1771
+ :param offset: The offset into the underlying allocation of the memory
1772
+ referred to by this IPC handle.
1773
+ :type offset: int
1774
+ """
1775
+ def __init__(self, base, handle, size, source_info=None, offset=0):
1776
+ self.base = base
1777
+ self.handle = handle
1778
+ self.size = size
1779
+ self.source_info = source_info
1780
+ self._impl = None
1781
+ self.offset = offset
1782
+
1783
+ def _sentry_source_info(self):
1784
+ if self.source_info is None:
1785
+ raise RuntimeError("IPC handle doesn't have source info")
1786
+
1787
+ def can_access_peer(self, context):
1788
+ """Returns a bool indicating whether the active context can peer
1789
+ access the IPC handle
1790
+ """
1791
+ self._sentry_source_info()
1792
+ if self.source_info == context.device.get_device_identity():
1793
+ return True
1794
+ source_device = Device.from_identity(self.source_info)
1795
+ return context.can_access_peer(source_device.id)
1796
+
1797
+ def open_staged(self, context):
1798
+ """Open the IPC by allowing staging on the host memory first.
1799
+ """
1800
+ self._sentry_source_info()
1801
+
1802
+ if self._impl is not None:
1803
+ raise ValueError('IpcHandle is already opened')
1804
+
1805
+ self._impl = _StagedIpcImpl(self, self.source_info)
1806
+ return self._impl.open(context)
1807
+
1808
+ def open_direct(self, context):
1809
+ """
1810
+ Import the IPC memory and return a raw CUDA memory pointer object.
1811
+ """
1812
+ if self._impl is not None:
1813
+ raise ValueError('IpcHandle is already opened')
1814
+
1815
+ self._impl = _CudaIpcImpl(self)
1816
+ return self._impl.open(context)
1817
+
1818
+ def open(self, context):
1819
+ """Open the IPC handle and import the memory for usage in the given
1820
+ context. Returns a raw CUDA memory pointer object.
1821
+
1822
+ This is enhanced over CUDA IPC in that it will work regardless of whether
1823
+ the source device is peer-accessible by the destination device.
1824
+ If the devices are peer-accessible, it uses .open_direct().
1825
+ If the devices are not peer-accessible, it uses .open_staged().
1826
+ """
1827
+ if self.source_info is None or self.can_access_peer(context):
1828
+ fn = self.open_direct
1829
+ else:
1830
+ fn = self.open_staged
1831
+ return fn(context)
1832
+
1833
+ def open_array(self, context, shape, dtype, strides=None):
1834
+ """
1835
+ Similar to `.open()` but returns a device array.
1836
+ """
1837
+ from . import devicearray
1838
+
1839
+ # by default, set strides to itemsize
1840
+ if strides is None:
1841
+ strides = dtype.itemsize
1842
+ dptr = self.open(context)
1843
+ # read the device pointer as an array
1844
+ return devicearray.DeviceNDArray(shape=shape, strides=strides,
1845
+ dtype=dtype, gpu_data=dptr)
1846
+
1847
+ def close(self):
1848
+ if self._impl is None:
1849
+ raise ValueError('IpcHandle not opened')
1850
+ self._impl.close()
1851
+ self._impl = None
1852
+
1853
+ def __reduce__(self):
1854
+ # Preprocess the IPC handle, which is defined as a byte array.
1855
+ if USE_NV_BINDING:
1856
+ preprocessed_handle = self.handle.reserved
1857
+ else:
1858
+ preprocessed_handle = tuple(self.handle.reserved)
1859
+ args = (
1860
+ self.__class__,
1861
+ preprocessed_handle,
1862
+ self.size,
1863
+ self.source_info,
1864
+ self.offset,
1865
+ )
1866
+ return (serialize._rebuild_reduction, args)
1867
+
1868
+ @classmethod
1869
+ def _rebuild(cls, handle_ary, size, source_info, offset):
1870
+ if USE_NV_BINDING:
1871
+ handle = binding.CUipcMemHandle()
1872
+ else:
1873
+ handle = drvapi.cu_ipc_mem_handle()
1874
+ handle.reserved = handle_ary
1875
+ return cls(base=None, handle=handle, size=size,
1876
+ source_info=source_info, offset=offset)
1877
+
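# Illustrative sketch (not part of the diff): the intended IpcHandle flow.
# The producing process pickles the handle (via __reduce__ above); the
# consuming process unpickles it and maps the memory as a device array.
# The transport between the two processes is left out (assumption).
import pickle
import numpy as np
from numba import cuda

# producer process
ctx = cuda.current_context()
mem = ctx.memalloc(16 * np.dtype(np.float32).itemsize)
ipch = ctx.get_ipc_handle(mem)
payload = pickle.dumps(ipch)

# consumer (conceptually a different process)
ipch2 = pickle.loads(payload)
darr = ipch2.open_array(cuda.current_context(),
                        shape=(16,), dtype=np.dtype(np.float32))
# ... use darr on the device ...
ipch2.close()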
1878
+
1879
+ class MemoryPointer(object):
1880
+ """A memory pointer that owns a buffer, with an optional finalizer. Memory
1881
+ pointers provide reference counting, and instances are initialized with a
1882
+ reference count of 1.
1883
+
1884
+ The base ``MemoryPointer`` class does not use the
1885
+ reference count for managing the buffer lifetime. Instead, the buffer
1886
+ lifetime is tied to the memory pointer instance's lifetime:
1887
+
1888
+ - When the instance is deleted, the finalizer will be called.
1889
+ - When the reference count drops to 0, no action is taken.
1890
+
1891
+ Subclasses of ``MemoryPointer`` may modify these semantics, for example to
1892
+ tie the buffer lifetime to the reference count, so that the buffer is freed
1893
+ when there are no more references.
1894
+
1895
+ :param context: The context in which the pointer was allocated.
1896
+ :type context: Context
1897
+ :param pointer: The address of the buffer.
1898
+ :type pointer: ctypes.c_void_p
1899
+ :param size: The size of the allocation in bytes.
1900
+ :type size: int
1901
+ :param owner: The owner is sometimes set by the internals of this class, or
1902
+ used for Numba's internal memory management. It should not be
1903
+ provided by an external user of the ``MemoryPointer`` class
1904
+ (e.g. from within an EMM Plugin); the default of `None`
1905
+ should always suffice.
1906
+ :type owner: NoneType
1907
+ :param finalizer: A function that is called when the buffer is to be freed.
1908
+ :type finalizer: function
1909
+ """
1910
+ __cuda_memory__ = True
1911
+
1912
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
1913
+ self.context = context
1914
+ self.device_pointer = pointer
1915
+ self.size = size
1916
+ self._cuda_memsize_ = size
1917
+ self.is_managed = finalizer is not None
1918
+ self.refct = 1
1919
+ self.handle = self.device_pointer
1920
+ self._owner = owner
1921
+
1922
+ if finalizer is not None:
1923
+ self._finalizer = weakref.finalize(self, finalizer)
1924
+
1925
+ @property
1926
+ def owner(self):
1927
+ return self if self._owner is None else self._owner
1928
+
1929
+ def own(self):
1930
+ return OwnedPointer(weakref.proxy(self))
1931
+
1932
+ def free(self):
1933
+ """
1934
+ Forces the device memory to the trash.
1935
+ """
1936
+ if self.is_managed:
1937
+ if not self._finalizer.alive:
1938
+ raise RuntimeError("Freeing dead memory")
1939
+ self._finalizer()
1940
+ assert not self._finalizer.alive
1941
+
1942
+ def memset(self, byte, count=None, stream=0):
1943
+ count = self.size if count is None else count
1944
+ if stream:
1945
+ driver.cuMemsetD8Async(self.device_pointer, byte, count,
1946
+ stream.handle)
1947
+ else:
1948
+ driver.cuMemsetD8(self.device_pointer, byte, count)
1949
+
1950
+ def view(self, start, stop=None):
1951
+ if stop is None:
1952
+ size = self.size - start
1953
+ else:
1954
+ size = stop - start
1955
+
1956
+ # Handle NULL/empty memory buffer
1957
+ if not self.device_pointer_value:
1958
+ if size != 0:
1959
+ raise RuntimeError("non-empty slice into empty slice")
1960
+ view = self # new view is just a reference to self
1961
+ # Handle normal case
1962
+ else:
1963
+ base = self.device_pointer_value + start
1964
+ if size < 0:
1965
+ raise RuntimeError('size cannot be negative')
1966
+ if USE_NV_BINDING:
1967
+ pointer = binding.CUdeviceptr()
1968
+ ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr())
1969
+ ctypes_ptr.value = base
1970
+ else:
1971
+ pointer = drvapi.cu_device_ptr(base)
1972
+ view = MemoryPointer(self.context, pointer, size, owner=self.owner)
1973
+
1974
+ if isinstance(self.owner, (MemoryPointer, OwnedPointer)):
1975
+ # Owned by a numba-managed memory segment, take an owned reference
1976
+ return OwnedPointer(weakref.proxy(self.owner), view)
1977
+ else:
1978
+ # Owned by external alloc, return view with same external owner
1979
+ return view
1980
+
1981
+ @property
1982
+ def device_ctypes_pointer(self):
1983
+ return self.device_pointer
1984
+
1985
+ @property
1986
+ def device_pointer_value(self):
1987
+ if USE_NV_BINDING:
1988
+ return int(self.device_pointer) or None
1989
+ else:
1990
+ return self.device_pointer.value
1991
+
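# Illustrative sketch (not part of the diff): how own() and view() relate to
# the refct field described in the MemoryPointer docstring. The exact type
# returned by memalloc() depends on the active memory manager (assumption);
# the own()/view() calls are proxied through to the underlying pointer.
from numba import cuda

ctx = cuda.current_context()
mem = ctx.memalloc(1024)      # reference-counted device buffer
owned = mem.own()             # extra owner: bumps refct, derefs when collected
tail = mem.view(512)          # view of the last 512 bytes, same owner
del owned, tail               # finalizers decrement refct on the base allocation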
1992
+
1993
+ class AutoFreePointer(MemoryPointer):
1994
+ """Modifies the ownership semantic of the MemoryPointer so that the
1995
+ instance lifetime is directly tied to the number of references.
1996
+
1997
+ When the reference count reaches zero, the finalizer is invoked.
1998
+
1999
+ Constructor arguments are the same as for :class:`MemoryPointer`.
2000
+ """
2001
+ def __init__(self, *args, **kwargs):
2002
+ super(AutoFreePointer, self).__init__(*args, **kwargs)
2003
+ # Release the self reference to the buffer, so that the finalizer
2004
+ # is invoked if all the derived pointers are gone.
2005
+ self.refct -= 1
2006
+
2007
+
2008
+ class MappedMemory(AutoFreePointer):
2009
+ """A memory pointer that refers to a buffer on the host that is mapped into
2010
+ device memory.
2011
+
2012
+ :param context: The context in which the pointer was mapped.
2013
+ :type context: Context
2014
+ :param pointer: The address of the buffer.
2015
+ :type pointer: ctypes.c_void_p
2016
+ :param size: The size of the buffer in bytes.
2017
+ :type size: int
2018
+ :param owner: The owner is sometimes set by the internals of this class, or
2019
+ used for Numba's internal memory management. It should not be
2020
+ provided by an external user of the ``MappedMemory`` class
2021
+ (e.g. from within an EMM Plugin); the default of `None`
2022
+ should always suffice.
2023
+ :type owner: NoneType
2024
+ :param finalizer: A function that is called when the buffer is to be freed.
2025
+ :type finalizer: function
2026
+ """
2027
+
2028
+ __cuda_memory__ = True
2029
+
2030
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
2031
+ self.owned = owner
2032
+ self.host_pointer = pointer
2033
+
2034
+ if USE_NV_BINDING:
2035
+ devptr = driver.cuMemHostGetDevicePointer(pointer, 0)
2036
+ self._bufptr_ = self.host_pointer
2037
+ else:
2038
+ devptr = drvapi.cu_device_ptr()
2039
+ driver.cuMemHostGetDevicePointer(byref(devptr), pointer, 0)
2040
+ self._bufptr_ = self.host_pointer.value
2041
+
2042
+ self.device_pointer = devptr
2043
+ super(MappedMemory, self).__init__(context, devptr, size,
2044
+ finalizer=finalizer)
2045
+ self.handle = self.host_pointer
2046
+
2047
+ # For buffer interface
2048
+ self._buflen_ = self.size
2049
+
2050
+ def own(self):
2051
+ return MappedOwnedPointer(weakref.proxy(self))
2052
+
2053
+
2054
+ class PinnedMemory(mviewbuf.MemAlloc):
2055
+ """A pointer to a pinned buffer on the host.
2056
+
2057
+ :param context: The context in which the pointer was mapped.
2058
+ :type context: Context
2059
+ :param owner: The object owning the memory. For EMM plugin implementation,
2060
+ this can be ``None``; see the full ``owner`` description below.
2061
+ :param pointer: The address of the buffer.
2062
+ :type pointer: ctypes.c_void_p
2063
+ :param size: The size of the buffer in bytes.
2064
+ :type size: int
2065
+ :param owner: An object owning the buffer that has been pinned. For EMM
2066
+ plugin implementation, the default of ``None`` suffices for
2067
+ memory allocated in ``memhostalloc`` - for ``mempin``, it
2068
+ should be the owner passed in to the ``mempin`` method.
2069
+ :param finalizer: A function that is called when the buffer is to be freed.
2070
+ :type finalizer: function
2071
+ """
2072
+
2073
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
2074
+ self.context = context
2075
+ self.owned = owner
2076
+ self.size = size
2077
+ self.host_pointer = pointer
2078
+ self.is_managed = finalizer is not None
2079
+ self.handle = self.host_pointer
2080
+
2081
+ # For buffer interface
2082
+ self._buflen_ = self.size
2083
+ if USE_NV_BINDING:
2084
+ self._bufptr_ = self.host_pointer
2085
+ else:
2086
+ self._bufptr_ = self.host_pointer.value
2087
+
2088
+ if finalizer is not None:
2089
+ weakref.finalize(self, finalizer)
2090
+
2091
+ def own(self):
2092
+ return self
2093
+
2094
+
2095
+ class ManagedMemory(AutoFreePointer):
2096
+ """A memory pointer that refers to a managed memory buffer (can be accessed
2097
+ on both host and device).
2098
+
2099
+ :param context: The context in which the pointer was mapped.
2100
+ :type context: Context
2101
+ :param pointer: The address of the buffer.
2102
+ :type pointer: ctypes.c_void_p
2103
+ :param size: The size of the buffer in bytes.
2104
+ :type size: int
2105
+ :param owner: The owner is sometimes set by the internals of this class, or
2106
+ used for Numba's internal memory management. It should not be
2107
+ provided by an external user of the ``ManagedMemory`` class
2108
+ (e.g. from within an EMM Plugin); the default of `None`
2109
+ should always suffice.
2110
+ :type owner: NoneType
2111
+ :param finalizer: A function that is called when the buffer is to be freed.
2112
+ :type finalizer: function
2113
+ """
2114
+
2115
+ __cuda_memory__ = True
2116
+
2117
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
2118
+ self.owned = owner
2119
+ devptr = pointer
2120
+ super().__init__(context, devptr, size, finalizer=finalizer)
2121
+
2122
+ # For buffer interface
2123
+ self._buflen_ = self.size
2124
+ if USE_NV_BINDING:
2125
+ self._bufptr_ = self.device_pointer
2126
+ else:
2127
+ self._bufptr_ = self.device_pointer.value
2128
+
2129
+ def own(self):
2130
+ return ManagedOwnedPointer(weakref.proxy(self))
2131
+
2132
+
2133
+ class OwnedPointer(object):
2134
+ def __init__(self, memptr, view=None):
2135
+ self._mem = memptr
2136
+
2137
+ if view is None:
2138
+ self._view = self._mem
2139
+ else:
2140
+ assert not view.is_managed
2141
+ self._view = view
2142
+
2143
+ mem = self._mem
2144
+
2145
+ def deref():
2146
+ try:
2147
+ mem.refct -= 1
2148
+ assert mem.refct >= 0
2149
+ if mem.refct == 0:
2150
+ mem.free()
2151
+ except ReferenceError:
2152
+ # ignore reference error here
2153
+ pass
2154
+
2155
+ self._mem.refct += 1
2156
+ weakref.finalize(self, deref)
2157
+
2158
+ def __getattr__(self, fname):
2159
+ """Proxy MemoryPointer methods
2160
+ """
2161
+ return getattr(self._view, fname)
2162
+
2163
+
2164
+ class MappedOwnedPointer(OwnedPointer, mviewbuf.MemAlloc):
2165
+ pass
2166
+
2167
+
2168
+ class ManagedOwnedPointer(OwnedPointer, mviewbuf.MemAlloc):
2169
+ pass
2170
+
2171
+
2172
+ class Stream(object):
2173
+ def __init__(self, context, handle, finalizer, external=False):
2174
+ self.context = context
2175
+ self.handle = handle
2176
+ self.external = external
2177
+ if finalizer is not None:
2178
+ weakref.finalize(self, finalizer)
2179
+
2180
+ def __int__(self):
2181
+ if USE_NV_BINDING:
2182
+ return int(self.handle)
2183
+ else:
2184
+ # The default stream's handle.value is 0, which gives `None`
2185
+ return self.handle.value or drvapi.CU_STREAM_DEFAULT
2186
+
2187
+ def __repr__(self):
2188
+ if USE_NV_BINDING:
2189
+ default_streams = {
2190
+ CU_STREAM_DEFAULT: "<Default CUDA stream on %s>",
2191
+ binding.CU_STREAM_LEGACY:
2192
+ "<Legacy default CUDA stream on %s>",
2193
+ binding.CU_STREAM_PER_THREAD:
2194
+ "<Per-thread default CUDA stream on %s>",
2195
+ }
2196
+ ptr = int(self.handle) or 0
2197
+ else:
2198
+ default_streams = {
2199
+ drvapi.CU_STREAM_DEFAULT: "<Default CUDA stream on %s>",
2200
+ drvapi.CU_STREAM_LEGACY: "<Legacy default CUDA stream on %s>",
2201
+ drvapi.CU_STREAM_PER_THREAD:
2202
+ "<Per-thread default CUDA stream on %s>",
2203
+ }
2204
+ ptr = self.handle.value or drvapi.CU_STREAM_DEFAULT
2205
+
2206
+ if ptr in default_streams:
2207
+ return default_streams[ptr] % self.context
2208
+ elif self.external:
2209
+ return "<External CUDA stream %d on %s>" % (ptr, self.context)
2210
+ else:
2211
+ return "<CUDA stream %d on %s>" % (ptr, self.context)
2212
+
2213
+ def synchronize(self):
2214
+ '''
2215
+ Wait for all commands in this stream to execute. This will commit any
2216
+ pending memory transfers.
2217
+ '''
2218
+ driver.cuStreamSynchronize(self.handle)
2219
+
2220
+ @contextlib.contextmanager
2221
+ def auto_synchronize(self):
2222
+ '''
2223
+ A context manager that waits for all commands in this stream to execute
2224
+ and commits any pending memory transfers upon exiting the context.
2225
+ '''
2226
+ yield self
2227
+ self.synchronize()
2228
+
2229
+ def add_callback(self, callback, arg=None):
2230
+ """
2231
+ Add a callback to a compute stream.
2232
+ The user provided function is called from a driver thread once all
2233
+ preceding stream operations are complete.
2234
+
2235
+ Callback functions are called from a CUDA driver thread, not from
2236
+ the thread that invoked `add_callback`. No CUDA API functions may
2237
+ be called from within the callback function.
2238
+
2239
+ The duration of a callback function should be kept short, as the
2240
+ callback will block later work in the stream and may block other
2241
+ callbacks from being executed.
2242
+
2243
+ Note: The driver function underlying this method is marked for
2244
+ eventual deprecation and may be replaced in a future CUDA release.
2245
+
2246
+ :param callback: Callback function with arguments (stream, status, arg).
2247
+ :param arg: Optional user data to be passed to the callback function.
2248
+ """
2249
+ data = (self, callback, arg)
2250
+ _py_incref(data)
2251
+ if USE_NV_BINDING:
2252
+ ptr = int.from_bytes(self._stream_callback, byteorder='little')
2253
+ stream_callback = binding.CUstreamCallback(ptr)
2254
+ # The callback needs to receive a pointer to the data PyObject
2255
+ data = id(data)
2256
+ else:
2257
+ stream_callback = self._stream_callback
2258
+ driver.cuStreamAddCallback(self.handle, stream_callback, data, 0)
2259
+
2260
+ @staticmethod
2261
+ @cu_stream_callback_pyobj
2262
+ def _stream_callback(handle, status, data):
2263
+ try:
2264
+ stream, callback, arg = data
2265
+ callback(stream, status, arg)
2266
+ except Exception as e:
2267
+ warnings.warn(f"Exception in stream callback: {e}")
2268
+ finally:
2269
+ _py_decref(data)
2270
+
2271
+ def async_done(self) -> asyncio.futures.Future:
2272
+ """
2273
+ Return an awaitable that resolves once all preceding stream operations
2274
+ are complete. The result of the awaitable is the current stream.
2275
+ """
2276
+ loop = asyncio.get_running_loop()
2277
+ future = loop.create_future()
2278
+
2279
+ def resolver(future, status):
2280
+ if future.done():
2281
+ return
2282
+ elif status == 0:
2283
+ future.set_result(self)
2284
+ else:
2285
+ future.set_exception(Exception(f"Stream error {status}"))
2286
+
2287
+ def callback(stream, status, future):
2288
+ loop.call_soon_threadsafe(resolver, future, status)
2289
+
2290
+ self.add_callback(callback, future)
2291
+ return future
2292
+
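# Illustrative sketch (not part of the diff): reacting to stream completion.
# cuda.stream() returns a Stream as defined above; the callback runs on a
# CUDA driver thread and must not call CUDA APIs.
import asyncio
from numba import cuda

stream = cuda.stream()

def on_done(stream, status, arg):
    print("stream done, status:", status, "arg:", arg)

stream.add_callback(on_done, arg="demo")

async def main():
    await stream.async_done()     # resolves once preceding stream work finishes

asyncio.run(main())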
2293
+
2294
+ class Event(object):
2295
+ def __init__(self, context, handle, finalizer=None):
2296
+ self.context = context
2297
+ self.handle = handle
2298
+ if finalizer is not None:
2299
+ weakref.finalize(self, finalizer)
2300
+
2301
+ def query(self):
2302
+ """
2303
+ Returns True if all work before the most recent record has completed;
2304
+ otherwise, returns False.
2305
+ """
2306
+ try:
2307
+ driver.cuEventQuery(self.handle)
2308
+ except CudaAPIError as e:
2309
+ if e.code == enums.CUDA_ERROR_NOT_READY:
2310
+ return False
2311
+ else:
2312
+ raise
2313
+ else:
2314
+ return True
2315
+
2316
+ def record(self, stream=0):
2317
+ """
2318
+ Set the record point of the event to the current point in the given
2319
+ stream.
2320
+
2321
+ The event will be considered to have occurred when all work that was
2322
+ queued in the stream at the time of the call to ``record()`` has been
2323
+ completed.
2324
+ """
2325
+ if USE_NV_BINDING:
2326
+ hstream = stream.handle if stream else binding.CUstream(0)
2327
+ else:
2328
+ hstream = stream.handle if stream else 0
2329
+ driver.cuEventRecord(self.handle, hstream)
2330
+
2331
+ def synchronize(self):
2332
+ """
2333
+ Synchronize the host thread for the completion of the event.
2334
+ """
2335
+ driver.cuEventSynchronize(self.handle)
2336
+
2337
+ def wait(self, stream=0):
2338
+ """
2339
+ All future work submitted to the stream will wait until the event completes.
2340
+ """
2341
+ if USE_NV_BINDING:
2342
+ hstream = stream.handle if stream else binding.CUstream(0)
2343
+ else:
2344
+ hstream = stream.handle if stream else 0
2345
+ flags = 0
2346
+ driver.cuStreamWaitEvent(hstream, self.handle, flags)
2347
+
2348
+ def elapsed_time(self, evtend):
2349
+ return event_elapsed_time(self, evtend)
2350
+
2351
+
2352
+ def event_elapsed_time(evtstart, evtend):
2353
+ '''
2354
+ Compute the elapsed time between two events in milliseconds.
2355
+ '''
2356
+ if USE_NV_BINDING:
2357
+ return driver.cuEventElapsedTime(evtstart.handle, evtend.handle)
2358
+ else:
2359
+ msec = c_float()
2360
+ driver.cuEventElapsedTime(byref(msec), evtstart.handle, evtend.handle)
2361
+ return msec.value
2362
+
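# Illustrative sketch (not part of the diff): timing stream work with a pair
# of events. The kernel launch is a placeholder for any asynchronous work.
from numba import cuda

stream = cuda.stream()
start = cuda.event(timing=True)
stop = cuda.event(timing=True)

start.record(stream=stream)
# kernel[griddim, blockdim, stream](args)   # placeholder asynchronous work
stop.record(stream=stream)
stop.synchronize()
print("elapsed ms:", start.elapsed_time(stop))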
2363
+
2364
+ class Module(metaclass=ABCMeta):
2365
+ """Abstract base class for modules"""
2366
+
2367
+ def __init__(self, context, handle, info_log, finalizer=None):
2368
+ self.context = context
2369
+ self.handle = handle
2370
+ self.info_log = info_log
2371
+ if finalizer is not None:
2372
+ self._finalizer = weakref.finalize(self, finalizer)
2373
+
2374
+ def unload(self):
2375
+ """Unload this module from the context"""
2376
+ self.context.unload_module(self)
2377
+
2378
+ @abstractmethod
2379
+ def get_function(self, name):
2380
+ """Returns a Function object encapsulating the named function"""
2381
+
2382
+ @abstractmethod
2383
+ def get_global_symbol(self, name):
2384
+ """Return a MemoryPointer referring to the named symbol"""
2385
+
2386
+
2387
+ class CtypesModule(Module):
2388
+
2389
+ def get_function(self, name):
2390
+ handle = drvapi.cu_function()
2391
+ driver.cuModuleGetFunction(byref(handle), self.handle,
2392
+ name.encode('utf8'))
2393
+ return CtypesFunction(weakref.proxy(self), handle, name)
2394
+
2395
+ def get_global_symbol(self, name):
2396
+ ptr = drvapi.cu_device_ptr()
2397
+ size = drvapi.c_size_t()
2398
+ driver.cuModuleGetGlobal(byref(ptr), byref(size), self.handle,
2399
+ name.encode('utf8'))
2400
+ return MemoryPointer(self.context, ptr, size), size.value
2401
+
2402
+
2403
+ class CudaPythonModule(Module):
2404
+
2405
+ def get_function(self, name):
2406
+ handle = driver.cuModuleGetFunction(self.handle, name.encode('utf8'))
2407
+ return CudaPythonFunction(weakref.proxy(self), handle, name)
2408
+
2409
+ def get_global_symbol(self, name):
2410
+ ptr, size = driver.cuModuleGetGlobal(self.handle, name.encode('utf8'))
2411
+ return MemoryPointer(self.context, ptr, size), size
2412
+
2413
+
2414
+ FuncAttr = namedtuple("FuncAttr", ["regs", "shared", "local", "const",
2415
+ "maxthreads"])
2416
+
2417
+
2418
+ class Function(metaclass=ABCMeta):
2419
+ griddim = 1, 1, 1
2420
+ blockdim = 1, 1, 1
2421
+ stream = 0
2422
+ sharedmem = 0
2423
+
2424
+ def __init__(self, module, handle, name):
2425
+ self.module = module
2426
+ self.handle = handle
2427
+ self.name = name
2428
+ self.attrs = self.read_func_attr_all()
2429
+
2430
+ def __repr__(self):
2431
+ return "<CUDA function %s>" % self.name
2432
+
2433
+ @property
2434
+ def device(self):
2435
+ return self.module.context.device
2436
+
2437
+ @abstractmethod
2438
+ def cache_config(self, prefer_equal=False, prefer_cache=False,
2439
+ prefer_shared=False):
2440
+ """Set the cache configuration for this function."""
2441
+
2442
+ @abstractmethod
2443
+ def read_func_attr(self, attrid):
2444
+ """Return the value of the attribute with given ID."""
2445
+
2446
+ @abstractmethod
2447
+ def read_func_attr_all(self):
2448
+ """Return a FuncAttr object with the values of various function
2449
+ attributes."""
2450
+
2451
+
2452
+ class CtypesFunction(Function):
2453
+
2454
+ def cache_config(self, prefer_equal=False, prefer_cache=False,
2455
+ prefer_shared=False):
2456
+ prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
2457
+ if prefer_equal:
2458
+ flag = enums.CU_FUNC_CACHE_PREFER_EQUAL
2459
+ elif prefer_cache:
2460
+ flag = enums.CU_FUNC_CACHE_PREFER_L1
2461
+ elif prefer_shared:
2462
+ flag = enums.CU_FUNC_CACHE_PREFER_SHARED
2463
+ else:
2464
+ flag = enums.CU_FUNC_CACHE_PREFER_NONE
2465
+ driver.cuFuncSetCacheConfig(self.handle, flag)
2466
+
2467
+ def read_func_attr(self, attrid):
2468
+ retval = c_int()
2469
+ driver.cuFuncGetAttribute(byref(retval), attrid, self.handle)
2470
+ return retval.value
2471
+
2472
+ def read_func_attr_all(self):
2473
+ nregs = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_NUM_REGS)
2474
+ cmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
2475
+ lmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
2476
+ smem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
2477
+ maxtpb = self.read_func_attr(
2478
+ enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
2479
+ return FuncAttr(regs=nregs, const=cmem, local=lmem, shared=smem,
2480
+ maxthreads=maxtpb)
2481
+
2482
+
2483
+ class CudaPythonFunction(Function):
2484
+
2485
+ def cache_config(self, prefer_equal=False, prefer_cache=False,
2486
+ prefer_shared=False):
2487
+ prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
2488
+ attr = binding.CUfunction_attribute
2489
+ if prefer_equal:
2490
+ flag = attr.CU_FUNC_CACHE_PREFER_EQUAL
2491
+ elif prefer_cache:
2492
+ flag = attr.CU_FUNC_CACHE_PREFER_L1
2493
+ elif prefer_shared:
2494
+ flag = attr.CU_FUNC_CACHE_PREFER_SHARED
2495
+ else:
2496
+ flag = attr.CU_FUNC_CACHE_PREFER_NONE
2497
+ driver.cuFuncSetCacheConfig(self.handle, flag)
2498
+
2499
+ def read_func_attr(self, attrid):
2500
+ return driver.cuFuncGetAttribute(attrid, self.handle)
2501
+
2502
+ def read_func_attr_all(self):
2503
+ attr = binding.CUfunction_attribute
2504
+ nregs = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_NUM_REGS)
2505
+ cmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
2506
+ lmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
2507
+ smem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
2508
+ maxtpb = self.read_func_attr(
2509
+ attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
2510
+ return FuncAttr(regs=nregs, const=cmem, local=lmem, shared=smem,
2511
+ maxthreads=maxtpb)
2512
+
2513
+
2514
+ def launch_kernel(cufunc_handle,
2515
+ gx, gy, gz,
2516
+ bx, by, bz,
2517
+ sharedmem,
2518
+ hstream,
2519
+ args,
2520
+ cooperative=False):
2521
+
2522
+ param_ptrs = [addressof(arg) for arg in args]
2523
+ params = (c_void_p * len(param_ptrs))(*param_ptrs)
2524
+
2525
+ if USE_NV_BINDING:
2526
+ params_for_launch = addressof(params)
2527
+ extra = 0
2528
+ else:
2529
+ params_for_launch = params
2530
+ extra = None
2531
+
2532
+ if cooperative:
2533
+ driver.cuLaunchCooperativeKernel(cufunc_handle,
2534
+ gx, gy, gz,
2535
+ bx, by, bz,
2536
+ sharedmem,
2537
+ hstream,
2538
+ params_for_launch)
2539
+ else:
2540
+ driver.cuLaunchKernel(cufunc_handle,
2541
+ gx, gy, gz,
2542
+ bx, by, bz,
2543
+ sharedmem,
2544
+ hstream,
2545
+ params_for_launch,
2546
+ extra)
2547
+
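# Illustrative sketch (not part of the diff): the shape of the `args` sequence
# expected by launch_kernel() - one ctypes value per kernel parameter, whose
# addresses become the kernel parameter pointers. The function handle and
# stream handle are placeholders (assumptions), so the call is shown commented.
from ctypes import c_int, c_float

kernel_args = [c_int(1024), c_float(0.5)]
# launch_kernel(cufunc_handle,
#               256, 1, 1,          # grid dimensions
#               128, 1, 1,          # block dimensions
#               0,                  # dynamic shared memory in bytes
#               stream_handle,
#               kernel_args)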
2548
+
2549
+ if USE_NV_BINDING:
2550
+ jitty = binding.CUjitInputType
2551
+ FILE_EXTENSION_MAP = {
2552
+ 'o': jitty.CU_JIT_INPUT_OBJECT,
2553
+ 'ptx': jitty.CU_JIT_INPUT_PTX,
2554
+ 'a': jitty.CU_JIT_INPUT_LIBRARY,
2555
+ 'lib': jitty.CU_JIT_INPUT_LIBRARY,
2556
+ 'cubin': jitty.CU_JIT_INPUT_CUBIN,
2557
+ 'fatbin': jitty.CU_JIT_INPUT_FATBINARY,
2558
+ }
2559
+ else:
2560
+ FILE_EXTENSION_MAP = {
2561
+ 'o': enums.CU_JIT_INPUT_OBJECT,
2562
+ 'ptx': enums.CU_JIT_INPUT_PTX,
2563
+ 'a': enums.CU_JIT_INPUT_LIBRARY,
2564
+ 'lib': enums.CU_JIT_INPUT_LIBRARY,
2565
+ 'cubin': enums.CU_JIT_INPUT_CUBIN,
2566
+ 'fatbin': enums.CU_JIT_INPUT_FATBINARY,
2567
+ }
2568
+
2569
+
2570
+ class Linker(metaclass=ABCMeta):
2571
+ """Abstract base class for linkers"""
2572
+
2573
+ @classmethod
2574
+ def new(cls, max_registers=0, lineinfo=False, cc=None):
2575
+ if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY:
2576
+ return MVCLinker(max_registers, lineinfo, cc)
2577
+ elif USE_NV_BINDING:
2578
+ return CudaPythonLinker(max_registers, lineinfo, cc)
2579
+ else:
2580
+ return CtypesLinker(max_registers, lineinfo, cc)
2581
+
2582
+ @abstractmethod
2583
+ def __init__(self, max_registers, lineinfo, cc):
2584
+ # LTO unsupported in Numba at present, but the pynvjitlink linker
2585
+ # (https://github.com/rapidsai/pynvjitlink) supports it.
2586
+ self.lto = False
2587
+
2588
+ @property
2589
+ @abstractmethod
2590
+ def info_log(self):
2591
+ """Return the info log from the linker invocation"""
2592
+
2593
+ @property
2594
+ @abstractmethod
2595
+ def error_log(self):
2596
+ """Return the error log from the linker invocation"""
2597
+
2598
+ @abstractmethod
2599
+ def add_ptx(self, ptx, name):
2600
+ """Add PTX source in a string to the link"""
2601
+
2602
+ def add_cu(self, cu, name):
2603
+ """Add CUDA source in a string to the link. The name of the source
2604
+ file should be specified in `name`."""
2605
+ with driver.get_active_context() as ac:
2606
+ dev = driver.get_device(ac.devnum)
2607
+ cc = dev.compute_capability
2608
+
2609
+ ptx, log = nvrtc.compile(cu, name, cc)
2610
+
2611
+ if config.DUMP_ASSEMBLY:
2612
+ print(("ASSEMBLY %s" % name).center(80, '-'))
2613
+ print(ptx)
2614
+ print('=' * 80)
2615
+
2616
+ # Link the program's PTX using the normal linker mechanism
2617
+ ptx_name = os.path.splitext(name)[0] + ".ptx"
2618
+ self.add_ptx(ptx.encode(), ptx_name)
2619
+
2620
+ @abstractmethod
2621
+ def add_file(self, path, kind):
2622
+ """Add code from a file to the link"""
2623
+
2624
+ def add_cu_file(self, path):
2625
+ with open(path, 'rb') as f:
2626
+ cu = f.read()
2627
+ self.add_cu(cu, os.path.basename(path))
2628
+
2629
+ def add_file_guess_ext(self, path):
2630
+ """Add a file to the link, guessing its type from its extension."""
2631
+ ext = os.path.splitext(path)[1][1:]
2632
+ if ext == '':
2633
+ raise RuntimeError("Don't know how to link file with no extension")
2634
+ elif ext == 'cu':
2635
+ self.add_cu_file(path)
2636
+ else:
2637
+ kind = FILE_EXTENSION_MAP.get(ext, None)
2638
+ if kind is None:
2639
+ raise RuntimeError("Don't know how to link file with extension "
2640
+ f".{ext}")
2641
+ self.add_file(path, kind)
2642
+
2643
+ @abstractmethod
2644
+ def complete(self):
2645
+ """Complete the link. Returns (cubin, size)
2646
+
2647
+ cubin is a pointer to an internal buffer of cubin owned by the linker;
2648
+ thus, it should be loaded before the linker is destroyed.
2649
+ """
2650
+
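# Illustrative sketch (not part of the diff): driving the Linker factory
# defined above. The PTX bytes and file path are placeholders, so the add_*
# and complete() calls are shown commented for shape only.
linker = Linker.new(max_registers=0, lineinfo=False, cc=(7, 5))
# linker.add_ptx(ptx_bytes, name='kernel.ptx')
# linker.add_file_guess_ext('helpers.cubin')
# cubin = linker.complete()        # load the result before the linker is destroyed
print(type(linker).__name__)       # MVCLinker, CudaPythonLinker or CtypesLinker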
2651
+
2652
+ _MVC_ERROR_MESSAGE = (
2653
+ "Minor version compatibility requires ptxcompiler and cubinlinker packages "
2654
+ "to be available"
2655
+ )
2656
+
2657
+
2658
+ class MVCLinker(Linker):
2659
+ """
2660
+ Linker supporting Minor Version Compatibility, backed by the cubinlinker
2661
+ package.
2662
+ """
2663
+ def __init__(self, max_registers=None, lineinfo=False, cc=None):
2664
+ try:
2665
+ from cubinlinker import CubinLinker
2666
+ except ImportError as err:
2667
+ raise ImportError(_MVC_ERROR_MESSAGE) from err
2668
+
2669
+ if cc is None:
2670
+ raise RuntimeError("MVCLinker requires Compute Capability to be "
2671
+ "specified, but cc is None")
2672
+
2673
+ super().__init__(max_registers, lineinfo, cc)
2674
+
2675
+ arch = f"sm_{cc[0] * 10 + cc[1]}"
2676
+ ptx_compile_opts = ['--gpu-name', arch, '-c']
2677
+ if max_registers:
2678
+ arg = f"--maxrregcount={max_registers}"
2679
+ ptx_compile_opts.append(arg)
2680
+ if lineinfo:
2681
+ ptx_compile_opts.append('--generate-line-info')
2682
+ self.ptx_compile_options = tuple(ptx_compile_opts)
2683
+
2684
+ self._linker = CubinLinker(f"--arch={arch}")
2685
+
2686
+ @property
2687
+ def info_log(self):
2688
+ return self._linker.info_log
2689
+
2690
+ @property
2691
+ def error_log(self):
2692
+ return self._linker.error_log
2693
+
2694
+ def add_ptx(self, ptx, name='<cudapy-ptx>'):
2695
+ try:
2696
+ from ptxcompiler import compile_ptx
2697
+ from cubinlinker import CubinLinkerError
2698
+ except ImportError as err:
2699
+ raise ImportError(_MVC_ERROR_MESSAGE) from err
2700
+ compile_result = compile_ptx(ptx.decode(), self.ptx_compile_options)
2701
+ try:
2702
+ self._linker.add_cubin(compile_result.compiled_program, name)
2703
+ except CubinLinkerError as e:
2704
+ raise LinkerError from e
2705
+
2706
+ def add_file(self, path, kind):
2707
+ try:
2708
+ from cubinlinker import CubinLinkerError
2709
+ except ImportError as err:
2710
+ raise ImportError(_MVC_ERROR_MESSAGE) from err
2711
+
2712
+ try:
2713
+ with open(path, 'rb') as f:
2714
+ data = f.read()
2715
+ except FileNotFoundError:
2716
+ raise LinkerError(f'{path} not found')
2717
+
2718
+ name = pathlib.Path(path).name
2719
+ if kind == FILE_EXTENSION_MAP['cubin']:
2720
+ fn = self._linker.add_cubin
2721
+ elif kind == FILE_EXTENSION_MAP['fatbin']:
2722
+ fn = self._linker.add_fatbin
2723
+ elif kind == FILE_EXTENSION_MAP['a']:
2724
+ raise LinkerError(f"Don't know how to link {kind}")
2725
+ elif kind == FILE_EXTENSION_MAP['ptx']:
2726
+ return self.add_ptx(data, name)
2727
+ else:
2728
+ raise LinkerError(f"Don't know how to link {kind}")
2729
+
2730
+ try:
2731
+ fn(data, name)
2732
+ except CubinLinkerError as e:
2733
+ raise LinkerError from e
2734
+
2735
+ def complete(self):
2736
+ try:
2737
+ from cubinlinker import CubinLinkerError
2738
+ except ImportError as err:
2739
+ raise ImportError(_MVC_ERROR_MESSAGE) from err
2740
+
2741
+ try:
2742
+ return self._linker.complete()
2743
+ except CubinLinkerError as e:
2744
+ raise LinkerError from e
2745
+
2746
+
2747
+ class CtypesLinker(Linker):
2748
+ """
2749
+ Links for the current device if no CC is given.
2750
+ """
2751
+ def __init__(self, max_registers=0, lineinfo=False, cc=None):
2752
+ super().__init__(max_registers, lineinfo, cc)
2753
+
2754
+ logsz = config.CUDA_LOG_SIZE
2755
+ linkerinfo = (c_char * logsz)()
2756
+ linkererrors = (c_char * logsz)()
2757
+
2758
+ options = {
2759
+ enums.CU_JIT_INFO_LOG_BUFFER: addressof(linkerinfo),
2760
+ enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
2761
+ enums.CU_JIT_ERROR_LOG_BUFFER: addressof(linkererrors),
2762
+ enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
2763
+ enums.CU_JIT_LOG_VERBOSE: c_void_p(1),
2764
+ }
2765
+ if max_registers:
2766
+ options[enums.CU_JIT_MAX_REGISTERS] = c_void_p(max_registers)
2767
+ if lineinfo:
2768
+ options[enums.CU_JIT_GENERATE_LINE_INFO] = c_void_p(1)
2769
+
2770
+ if cc is None:
2771
+ # No option value is needed, but we need something as a placeholder
2772
+ options[enums.CU_JIT_TARGET_FROM_CUCONTEXT] = 1
2773
+ else:
2774
+ cc_val = cc[0] * 10 + cc[1]
2775
+ options[enums.CU_JIT_TARGET] = c_void_p(cc_val)
2776
+
2777
+ raw_keys = list(options.keys())
2778
+ raw_values = list(options.values())
2779
+
2780
+ option_keys = (drvapi.cu_jit_option * len(raw_keys))(*raw_keys)
2781
+ option_vals = (c_void_p * len(raw_values))(*raw_values)
2782
+
2783
+ self.handle = handle = drvapi.cu_link_state()
2784
+ driver.cuLinkCreate(len(raw_keys), option_keys, option_vals,
2785
+ byref(self.handle))
2786
+
2787
+ weakref.finalize(self, driver.cuLinkDestroy, handle)
2788
+
2789
+ self.linker_info_buf = linkerinfo
2790
+ self.linker_errors_buf = linkererrors
2791
+
2792
+ self._keep_alive = [linkerinfo, linkererrors, option_keys, option_vals]
2793
+
2794
+ @property
2795
+ def info_log(self):
2796
+ return self.linker_info_buf.value.decode('utf8')
2797
+
2798
+ @property
2799
+ def error_log(self):
2800
+ return self.linker_errors_buf.value.decode('utf8')
2801
+
2802
+ def add_ptx(self, ptx, name='<cudapy-ptx>'):
2803
+ ptxbuf = c_char_p(ptx)
2804
+ namebuf = c_char_p(name.encode('utf8'))
2805
+ self._keep_alive += [ptxbuf, namebuf]
2806
+ try:
2807
+ driver.cuLinkAddData(self.handle, enums.CU_JIT_INPUT_PTX,
2808
+ ptxbuf, len(ptx), namebuf, 0, None, None)
2809
+ except CudaAPIError as e:
2810
+ raise LinkerError("%s\n%s" % (e, self.error_log))
2811
+
2812
+ def add_file(self, path, kind):
2813
+ pathbuf = c_char_p(path.encode("utf8"))
2814
+ self._keep_alive.append(pathbuf)
2815
+
2816
+ try:
2817
+ driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, None, None)
2818
+ except CudaAPIError as e:
2819
+ if e.code == enums.CUDA_ERROR_FILE_NOT_FOUND:
2820
+ msg = f'{path} not found'
2821
+ else:
2822
+ msg = "%s\n%s" % (e, self.error_log)
2823
+ raise LinkerError(msg)
2824
+
2825
+ def complete(self):
2826
+ cubin_buf = c_void_p(0)
2827
+ size = c_size_t(0)
2828
+
2829
+ try:
2830
+ driver.cuLinkComplete(self.handle, byref(cubin_buf), byref(size))
2831
+ except CudaAPIError as e:
2832
+ raise LinkerError("%s\n%s" % (e, self.error_log))
2833
+
2834
+ size = size.value
2835
+ assert size > 0, 'linker returned a zero sized cubin'
2836
+ del self._keep_alive[:]
2837
+
2838
+ # We return a copy of the cubin because it's owned by the linker
2839
+ cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char))
2840
+ return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,)))
2841
+
2842
+
2843
+ class CudaPythonLinker(Linker):
2844
+ """
2845
+ Links for the current device if no CC is given.
2846
+ """
2847
+ def __init__(self, max_registers=0, lineinfo=False, cc=None):
2848
+ super().__init__(max_registers, lineinfo, cc)
2849
+
2850
+ logsz = config.CUDA_LOG_SIZE
2851
+ linkerinfo = bytearray(logsz)
2852
+ linkererrors = bytearray(logsz)
2853
+
2854
+ jit_option = binding.CUjit_option
2855
+
2856
+ options = {
2857
+ jit_option.CU_JIT_INFO_LOG_BUFFER: linkerinfo,
2858
+ jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz,
2859
+ jit_option.CU_JIT_ERROR_LOG_BUFFER: linkererrors,
2860
+ jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz,
2861
+ jit_option.CU_JIT_LOG_VERBOSE: 1,
2862
+ }
2863
+ if max_registers:
2864
+ options[jit_option.CU_JIT_MAX_REGISTERS] = max_registers
2865
+ if lineinfo:
2866
+ options[jit_option.CU_JIT_GENERATE_LINE_INFO] = 1
2867
+
2868
+ if cc is None:
2869
+ # No option value is needed, but we need something as a placeholder
2870
+ options[jit_option.CU_JIT_TARGET_FROM_CUCONTEXT] = 1
2871
+ else:
2872
+ cc_val = cc[0] * 10 + cc[1]
2873
+ cc_enum = getattr(binding.CUjit_target,
2874
+ f'CU_TARGET_COMPUTE_{cc_val}')
2875
+ options[jit_option.CU_JIT_TARGET] = cc_enum
2876
+
2877
+ raw_keys = list(options.keys())
2878
+ raw_values = list(options.values())
2879
+
2880
+ self.handle = driver.cuLinkCreate(len(raw_keys), raw_keys, raw_values)
2881
+
2882
+ weakref.finalize(self, driver.cuLinkDestroy, self.handle)
2883
+
2884
+ self.linker_info_buf = linkerinfo
2885
+ self.linker_errors_buf = linkererrors
2886
+
2887
+ self._keep_alive = [linkerinfo, linkererrors, raw_keys, raw_values]
2888
+
2889
+ @property
2890
+ def info_log(self):
2891
+ return self.linker_info_buf.decode('utf8')
2892
+
2893
+ @property
2894
+ def error_log(self):
2895
+ return self.linker_errors_buf.decode('utf8')
2896
+
2897
+ def add_ptx(self, ptx, name='<cudapy-ptx>'):
2898
+ namebuf = name.encode('utf8')
2899
+ self._keep_alive += [ptx, namebuf]
2900
+ try:
2901
+ input_ptx = binding.CUjitInputType.CU_JIT_INPUT_PTX
2902
+ driver.cuLinkAddData(self.handle, input_ptx, ptx, len(ptx),
2903
+ namebuf, 0, [], [])
2904
+ except CudaAPIError as e:
2905
+ raise LinkerError("%s\n%s" % (e, self.error_log))
2906
+
2907
+ def add_file(self, path, kind):
2908
+ pathbuf = path.encode("utf8")
2909
+ self._keep_alive.append(pathbuf)
2910
+
2911
+ try:
2912
+ driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, [], [])
2913
+ except CudaAPIError as e:
2914
+ if e.code == binding.CUresult.CUDA_ERROR_FILE_NOT_FOUND:
2915
+ msg = f'{path} not found'
2916
+ else:
2917
+ msg = "%s\n%s" % (e, self.error_log)
2918
+ raise LinkerError(msg)
2919
+
2920
+ def complete(self):
2921
+ try:
2922
+ cubin_buf, size = driver.cuLinkComplete(self.handle)
2923
+ except CudaAPIError as e:
2924
+ raise LinkerError("%s\n%s" % (e, self.error_log))
2925
+
2926
+ assert size > 0, 'linker returned a zero sized cubin'
2927
+ del self._keep_alive[:]
2928
+ # We return a copy of the cubin because it's owned by the linker
2929
+ cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char))
2930
+ return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,)))
2931
+
2932
+
2933
+ # -----------------------------------------------------------------------------
2934
+
2935
+
2936
+ def get_devptr_for_active_ctx(ptr):
2937
+ """Query the device pointer usable in the current context from an arbitrary
2938
+ pointer.
2939
+ """
2940
+ if ptr != 0:
2941
+ if USE_NV_BINDING:
2942
+ ptr_attrs = binding.CUpointer_attribute
2943
+ attr = ptr_attrs.CU_POINTER_ATTRIBUTE_DEVICE_POINTER
2944
+ ptrobj = binding.CUdeviceptr(ptr)
2945
+ return driver.cuPointerGetAttribute(attr, ptrobj)
2946
+ else:
2947
+ devptr = drvapi.cu_device_ptr()
2948
+ attr = enums.CU_POINTER_ATTRIBUTE_DEVICE_POINTER
2949
+ driver.cuPointerGetAttribute(byref(devptr), attr, ptr)
2950
+ return devptr
2951
+ else:
2952
+ if USE_NV_BINDING:
2953
+ return binding.CUdeviceptr()
2954
+ else:
2955
+ return drvapi.cu_device_ptr()
2956
+
2957
+
2958
+ def device_extents(devmem):
2959
+ """Find the extents (half open begin and end pointer) of the underlying
2960
+ device memory allocation.
2961
+
2962
+ NOTE: it always returns the extents of the allocation, but not the extents
2963
+ of the device memory view, which can be a subsection of the entire allocation.
2964
+ """
2965
+ devptr = device_ctypes_pointer(devmem)
2966
+ if USE_NV_BINDING:
2967
+ s, n = driver.cuMemGetAddressRange(devptr)
2968
+ return s, binding.CUdeviceptr(int(s) + n)
2969
+ else:
2970
+ s = drvapi.cu_device_ptr()
2971
+ n = c_size_t()
2972
+ driver.cuMemGetAddressRange(byref(s), byref(n), devptr)
2973
+ s, n = s.value, n.value
2974
+ return s, s + n
2975
+
2976
+
2977
+ def device_memory_size(devmem):
2978
+ """Check the memory size of the device memory.
2979
+ The result is cached in the device memory object.
2980
+ It may query the driver for the memory size of the device memory allocation.
2981
+ """
2982
+ sz = getattr(devmem, '_cuda_memsize_', None)
2983
+ if sz is None:
2984
+ s, e = device_extents(devmem)
2985
+ if USE_NV_BINDING:
2986
+ sz = int(e) - int(s)
2987
+ else:
2988
+ sz = e - s
2989
+ devmem._cuda_memsize_ = sz
2990
+ assert sz >= 0, "{} length array".format(sz)
2991
+ return sz
2992
+
2993
+
2994
+ def _is_datetime_dtype(obj):
+     """Returns True if the obj.dtype is datetime64 or timedelta64
+     """
+     dtype = getattr(obj, 'dtype', None)
+     return dtype is not None and dtype.char in 'Mm'
+
+
+ def _workaround_for_datetime(obj):
+     """Workaround for numpy#4983: buffer protocol doesn't support
+     datetime64 or timedelta64.
+     """
+     if _is_datetime_dtype(obj):
+         obj = obj.view(np.int64)
+     return obj
+
+
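# Editor's note: illustrative sketch, not part of the packaged source. The
# workaround above reinterprets datetime64/timedelta64 data as int64 so the
# buffer protocol (numpy#4983) can expose the same bytes without copying.
import numpy as np
from numba.cuda.cudadrv import driver as _driver

stamps = np.array(['2024-01-01', '2024-01-02'], dtype='datetime64[D]')
assert _driver._is_datetime_dtype(stamps)
as_ints = _driver._workaround_for_datetime(stamps)
assert as_ints.dtype == np.int64 and as_ints.base is stamps   # a view, not a copy
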
+ def host_pointer(obj, readonly=False):
+     """Get the host pointer from an obj.
+
+     If `readonly` is False, the buffer must be writable.
+
+     NOTE: The underlying data pointer of the host buffer is used directly and
+     the buffer should not be modified until the operation, which can be
+     asynchronous, completes.
+     """
+     if isinstance(obj, int):
+         return obj
+
+     forcewritable = False
+     if not readonly:
+         forcewritable = isinstance(obj, np.void) or _is_datetime_dtype(obj)
+
+     obj = _workaround_for_datetime(obj)
+     return mviewbuf.memoryview_get_buffer(obj, forcewritable, readonly)
+
+
+ def host_memory_extents(obj):
+     "Return (start, end), the half-open start and end pointers of the array."
+     obj = _workaround_for_datetime(obj)
+     return mviewbuf.memoryview_get_extents(obj)
+
+
+ def memory_size_from_info(shape, strides, itemsize):
+     """Get the byte size of a contiguous memory buffer given the shape,
+     strides and itemsize.
+     """
+     assert len(shape) == len(strides), "# dim mismatch"
+     ndim = len(shape)
+     s, e = mviewbuf.memoryview_get_extents_info(shape, strides, ndim, itemsize)
+     return e - s
+
+
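# Editor's note: illustrative sketch, not part of the packaged source. For a
# C-contiguous buffer the extent-based computation above reduces to
# prod(shape) * itemsize.
import numpy as np
from numba.cuda.cudadrv import driver as _driver

arr = np.zeros((3, 4), dtype=np.float64)       # 3 * 4 * 8 = 96 bytes
nbytes = _driver.memory_size_from_info(arr.shape, arr.strides,
                                       arr.dtype.itemsize)
assert nbytes == arr.nbytes == 96
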
+ def host_memory_size(obj):
+     "Get the size of the memory in bytes."
+     s, e = host_memory_extents(obj)
+     assert e >= s, "memory extent of negative size"
+     return e - s
+
+
+ def device_pointer(obj):
+     "Get the device pointer as an integer"
+     if USE_NV_BINDING:
+         return obj.device_ctypes_pointer
+     else:
+         return device_ctypes_pointer(obj).value
+
+
+ def device_ctypes_pointer(obj):
+     "Get the ctypes object for the device pointer"
+     if obj is None:
+         return c_void_p(0)
+     require_device_memory(obj)
+     return obj.device_ctypes_pointer
+
+
+ def is_device_memory(obj):
+     """All CUDA memory objects are recognized as instances with the attribute
+     "__cuda_memory__" defined and evaluating to True.
+
+     All CUDA memory objects should also define a "device_ctypes_pointer"
+     attribute that carries the pointer value of the device memory address.
+     This is not tested in this method.
+     """
+     return getattr(obj, '__cuda_memory__', False)
+
+
+ def require_device_memory(obj):
+     """A sentry for methods that accept a CUDA memory object.
+     """
+     if not is_device_memory(obj):
+         raise Exception("Not a CUDA memory object.")
+
+
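# Editor's note: illustrative sketch, not part of the packaged source. A minimal
# object satisfying the protocol described above: `__cuda_memory__` marks it as
# device memory and `device_ctypes_pointer` carries the raw address (the
# address below is a made-up value).
import ctypes
from numba.cuda.cudadrv import driver as _driver

class FakeDeviceMemory:
    __cuda_memory__ = True

    def __init__(self, addr):
        self.device_ctypes_pointer = ctypes.c_void_p(addr)

mem = FakeDeviceMemory(0xdeadbeef)
assert _driver.is_device_memory(mem)
_driver.require_device_memory(mem)             # passes; raises for other objects
assert _driver.device_ctypes_pointer(mem).value == 0xdeadbeef
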
+ def device_memory_depends(devmem, *objs):
+     """Add dependencies to the device memory.
+
+     Mainly used for creating structures that point to other device memory,
+     so that the referents are not garbage collected and released.
+     """
+     depset = getattr(devmem, "_depends_", [])
+     depset.extend(objs)
+
+
+ def host_to_device(dst, src, size, stream=0):
+     """
+     NOTE: The underlying data pointer of the host buffer is used directly and
+     the buffer should not be modified until the operation, which can be
+     asynchronous, completes.
+     """
+     varargs = []
+
+     if stream:
+         assert isinstance(stream, Stream)
+         fn = driver.cuMemcpyHtoDAsync
+         varargs.append(stream.handle)
+     else:
+         fn = driver.cuMemcpyHtoD
+
+     fn(device_pointer(dst), host_pointer(src, readonly=True), size, *varargs)
+
+
+ def device_to_host(dst, src, size, stream=0):
+     """
+     NOTE: The underlying data pointer of the host buffer is used directly and
+     the buffer should not be modified until the operation, which can be
+     asynchronous, completes.
+     """
+     varargs = []
+
+     if stream:
+         assert isinstance(stream, Stream)
+         fn = driver.cuMemcpyDtoHAsync
+         varargs.append(stream.handle)
+     else:
+         fn = driver.cuMemcpyDtoH
+
+     fn(host_pointer(dst), device_pointer(src), size, *varargs)
+
+
+ def device_to_device(dst, src, size, stream=0):
+     """
+     NOTE: The underlying data pointers are used directly and the buffers
+     should not be modified until the operation, which can be asynchronous,
+     completes.
+     """
+     varargs = []
+
+     if stream:
+         assert isinstance(stream, Stream)
+         fn = driver.cuMemcpyDtoDAsync
+         varargs.append(stream.handle)
+     else:
+         fn = driver.cuMemcpyDtoD
+
+     fn(device_pointer(dst), device_pointer(src), size, *varargs)
+
+
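# Editor's note: illustrative sketch, not part of the packaged source. A round
# trip through the copy helpers above with a real device allocation; passing
# stream=0 selects the synchronous cuMemcpy* entry points.
import numpy as np
from numba import cuda
from numba.cuda.cudadrv import driver as _driver

src = np.arange(16, dtype=np.float32)
out = np.empty_like(src)
dbuf = cuda.current_context().memalloc(src.nbytes)
_driver.host_to_device(dbuf, src, src.nbytes)  # host -> device, synchronous
_driver.device_to_host(out, dbuf, src.nbytes)  # device -> host, synchronous
assert (out == src).all()
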
+ def device_memset(dst, val, size, stream=0):
+     """Memset on the device.
+     If stream is not zero, asynchronous mode is used.
+
+     dst: device memory
+     val: byte value to be written
+     size: number of bytes to be written
+     stream: a CUDA stream
+     """
+     varargs = []
+
+     if stream:
+         assert isinstance(stream, Stream)
+         fn = driver.cuMemsetD8Async
+         varargs.append(stream.handle)
+     else:
+         fn = driver.cuMemsetD8
+
+     fn(device_pointer(dst), val, size, *varargs)
+
+
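# Editor's note: illustrative sketch, not part of the packaged source. Zeroing a
# device buffer with the byte-wise memset above, then reading it back to check.
import numpy as np
from numba import cuda
from numba.cuda.cudadrv import driver as _driver

nbytes = 256
dbuf = cuda.current_context().memalloc(nbytes)
_driver.device_memset(dbuf, 0, nbytes)         # write 0x00 to every byte
host = np.zeros(nbytes, dtype=np.uint8)
_driver.device_to_host(host, dbuf, nbytes)
assert not host.any()
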
+ def profile_start():
+     '''
+     Enable profile collection in the current context.
+     '''
+     driver.cuProfilerStart()
+
+
+ def profile_stop():
+     '''
+     Disable profile collection in the current context.
+     '''
+     driver.cuProfilerStop()
+
+
+ @contextlib.contextmanager
+ def profiling():
+     """
+     Context manager that enables profiling on entry and disables profiling on
+     exit.
+     """
+     profile_start()
+     yield
+     profile_stop()
+
+
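# Editor's note: illustrative sketch, not part of the packaged source. The
# profiling() context manager brackets a region so that an externally launched
# profiler records only this span; the kernel below is a made-up example.
import numpy as np
from numba import cuda
from numba.cuda.cudadrv import driver as _driver

@cuda.jit
def add_one(x):
    i = cuda.grid(1)
    if i < x.size:
        x[i] += 1

d_x = cuda.to_device(np.zeros(32, dtype=np.float32))
with _driver.profiling():                      # cuProfilerStart ... cuProfilerStop
    add_one[1, 32](d_x)
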
+ def get_version():
+     """
+     Return the driver version as a tuple of (major, minor)
+     """
+     return driver.get_version()
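# Editor's note: illustrative sketch, not part of the packaged source. The tuple
# form makes driver-version gating a simple comparison; the threshold below is
# an arbitrary example.
from numba.cuda.cudadrv import driver as _driver

major, minor = _driver.get_version()
if (major, minor) < (11, 2):
    print("CUDA driver is older than 11.2")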