xoscar 0.9.0__cp312-cp312-macosx_10_13_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xoscar/__init__.py +61 -0
- xoscar/_utils.cpython-312-darwin.so +0 -0
- xoscar/_utils.pxd +36 -0
- xoscar/_utils.pyx +246 -0
- xoscar/_version.py +693 -0
- xoscar/aio/__init__.py +16 -0
- xoscar/aio/base.py +86 -0
- xoscar/aio/file.py +59 -0
- xoscar/aio/lru.py +228 -0
- xoscar/aio/parallelism.py +39 -0
- xoscar/api.py +527 -0
- xoscar/backend.py +67 -0
- xoscar/backends/__init__.py +14 -0
- xoscar/backends/allocate_strategy.py +160 -0
- xoscar/backends/communication/__init__.py +30 -0
- xoscar/backends/communication/base.py +315 -0
- xoscar/backends/communication/core.py +69 -0
- xoscar/backends/communication/dummy.py +253 -0
- xoscar/backends/communication/errors.py +20 -0
- xoscar/backends/communication/socket.py +444 -0
- xoscar/backends/communication/ucx.py +538 -0
- xoscar/backends/communication/utils.py +97 -0
- xoscar/backends/config.py +157 -0
- xoscar/backends/context.py +437 -0
- xoscar/backends/core.py +352 -0
- xoscar/backends/indigen/__init__.py +16 -0
- xoscar/backends/indigen/__main__.py +19 -0
- xoscar/backends/indigen/backend.py +51 -0
- xoscar/backends/indigen/driver.py +26 -0
- xoscar/backends/indigen/fate_sharing.py +221 -0
- xoscar/backends/indigen/pool.py +515 -0
- xoscar/backends/indigen/shared_memory.py +548 -0
- xoscar/backends/message.cpython-312-darwin.so +0 -0
- xoscar/backends/message.pyi +255 -0
- xoscar/backends/message.pyx +646 -0
- xoscar/backends/pool.py +1630 -0
- xoscar/backends/router.py +285 -0
- xoscar/backends/test/__init__.py +16 -0
- xoscar/backends/test/backend.py +38 -0
- xoscar/backends/test/pool.py +233 -0
- xoscar/batch.py +256 -0
- xoscar/collective/__init__.py +27 -0
- xoscar/collective/backend/__init__.py +13 -0
- xoscar/collective/backend/nccl_backend.py +160 -0
- xoscar/collective/common.py +102 -0
- xoscar/collective/core.py +737 -0
- xoscar/collective/process_group.py +687 -0
- xoscar/collective/utils.py +41 -0
- xoscar/collective/xoscar_pygloo.cpython-312-darwin.so +0 -0
- xoscar/collective/xoscar_pygloo.pyi +239 -0
- xoscar/constants.py +23 -0
- xoscar/context.cpython-312-darwin.so +0 -0
- xoscar/context.pxd +21 -0
- xoscar/context.pyx +368 -0
- xoscar/core.cpython-312-darwin.so +0 -0
- xoscar/core.pxd +51 -0
- xoscar/core.pyx +664 -0
- xoscar/debug.py +188 -0
- xoscar/driver.py +42 -0
- xoscar/errors.py +63 -0
- xoscar/libcpp.pxd +31 -0
- xoscar/metrics/__init__.py +21 -0
- xoscar/metrics/api.py +288 -0
- xoscar/metrics/backends/__init__.py +13 -0
- xoscar/metrics/backends/console/__init__.py +13 -0
- xoscar/metrics/backends/console/console_metric.py +82 -0
- xoscar/metrics/backends/metric.py +149 -0
- xoscar/metrics/backends/prometheus/__init__.py +13 -0
- xoscar/metrics/backends/prometheus/prometheus_metric.py +70 -0
- xoscar/nvutils.py +717 -0
- xoscar/profiling.py +260 -0
- xoscar/serialization/__init__.py +20 -0
- xoscar/serialization/aio.py +141 -0
- xoscar/serialization/core.cpython-312-darwin.so +0 -0
- xoscar/serialization/core.pxd +28 -0
- xoscar/serialization/core.pyi +57 -0
- xoscar/serialization/core.pyx +944 -0
- xoscar/serialization/cuda.py +111 -0
- xoscar/serialization/exception.py +48 -0
- xoscar/serialization/mlx.py +67 -0
- xoscar/serialization/numpy.py +82 -0
- xoscar/serialization/pyfury.py +37 -0
- xoscar/serialization/scipy.py +72 -0
- xoscar/serialization/torch.py +180 -0
- xoscar/utils.py +522 -0
- xoscar/virtualenv/__init__.py +34 -0
- xoscar/virtualenv/core.py +268 -0
- xoscar/virtualenv/platform.py +56 -0
- xoscar/virtualenv/utils.py +100 -0
- xoscar/virtualenv/uv.py +321 -0
- xoscar-0.9.0.dist-info/METADATA +230 -0
- xoscar-0.9.0.dist-info/RECORD +94 -0
- xoscar-0.9.0.dist-info/WHEEL +6 -0
- xoscar-0.9.0.dist-info/top_level.txt +2 -0
xoscar/nvutils.py
ADDED
|
@@ -0,0 +1,717 @@
|
|
|
1
|
+
# Copyright 2022-2023 XProbe Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import dataclasses
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import subprocess
|
|
21
|
+
import sys
|
|
22
|
+
import uuid
|
|
23
|
+
from collections import namedtuple
|
|
24
|
+
from ctypes import (
|
|
25
|
+
CDLL,
|
|
26
|
+
POINTER,
|
|
27
|
+
Structure,
|
|
28
|
+
byref,
|
|
29
|
+
c_char,
|
|
30
|
+
c_char_p,
|
|
31
|
+
c_int,
|
|
32
|
+
c_uint,
|
|
33
|
+
c_ulonglong,
|
|
34
|
+
create_string_buffer,
|
|
35
|
+
)
|
|
36
|
+
from typing import List, Tuple, Union
|
|
37
|
+
|
|
38
|
+
from .utils import parse_readable_size
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
# Some constants taken from cuda.h
CUDA_SUCCESS = 0
# cuDeviceGetAttribute attribute ids (CUdevice_attribute enum values).
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36

# CUDA_ERROR_NO_DEVICE: cuInit succeeded but no CUDA-capable device exists.
CU_NO_CUDA_CAPABLE_DEVICE_DETECTED = 100

# nvml constants (nvmlReturn_t enum values from nvml.h)
NVML_SUCCESS = 0
NVML_ERROR_UNINITIALIZED = 1
NVML_ERROR_INVALID_ARGUMENT = 2
NVML_ERROR_NOT_SUPPORTED = 3
NVML_ERROR_NO_PERMISSION = 4
NVML_ERROR_ALREADY_INITIALIZED = 5
NVML_ERROR_NOT_FOUND = 6
NVML_ERROR_INSUFFICIENT_SIZE = 7
NVML_ERROR_INSUFFICIENT_POWER = 8
NVML_ERROR_DRIVER_NOT_LOADED = 9
NVML_ERROR_TIMEOUT = 10
NVML_ERROR_IRQ_ISSUE = 11
NVML_ERROR_LIBRARY_NOT_FOUND = 12
NVML_ERROR_FUNCTION_NOT_FOUND = 13
NVML_ERROR_CORRUPTED_INFOROM = 14
NVML_ERROR_GPU_IS_LOST = 15
NVML_ERROR_RESET_REQUIRED = 16
NVML_ERROR_OPERATING_SYSTEM = 17
NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18
NVML_ERROR_IN_USE = 19
NVML_ERROR_MEMORY = 20
NVML_ERROR_NO_DATA = 21
NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22
NVML_ERROR_INSUFFICIENT_RESOURCES = 23
NVML_ERROR_FREQ_NOT_SUPPORTED = 24
NVML_ERROR_UNKNOWN = 999
# Sensor selector for nvmlDeviceGetTemperature.
NVML_TEMPERATURE_GPU = 0
# Same numeric value as NVML_ERROR_DRIVER_NOT_LOADED; kept for readability at call sites.
NVML_DRIVER_NOT_LOADED = 9
# Buffer length required by nvmlDeviceGetUUID (v2).
NVML_DEVICE_UUID_V2_BUFFER_SIZE = 96
# Sentinel NVML reports when per-process GPU memory usage is unavailable (e.g. WDDM).
NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1)
# MIG (Multi-Instance GPU) mode flags returned by nvmlDeviceGetMigMode.
NVML_DEVICE_MIG_DISABLE = 0x0
NVML_DEVICE_MIG_ENABLE = 0x1
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class _CUuuid_t(Structure):
    # Mirrors CUuuid from cuda.h: an opaque 16-byte device identifier.
    _fields_ = [("bytes", c_char * 16)]


class _nvmlUtilization_t(Structure):
    # Mirrors nvmlUtilization_t: percent of time the GPU / memory was busy.
    _fields_ = [
        ("gpu", c_uint),
        ("memory", c_uint),
    ]


class _struct_nvmlDevice_t(Structure):
    pass  # opaque handle


# NVML device handles are passed around as opaque pointers.
_nvmlDevice_t = POINTER(_struct_nvmlDevice_t)


class _nvmlBAR1Memory_t(Structure):
    # Memory info triple (bytes); layout matches nvmlBAR1Memory_t / nvmlMemory_t.
    _fields_ = [
        ("total", c_ulonglong),
        ("free", c_ulonglong),
        ("used", c_ulonglong),
    ]


class _nvmlProcessInfo_t(Structure):
    # Mirrors nvmlProcessInfo_t (v2+): one running compute process on a device.
    _fields_ = [
        ("pid", c_uint),
        ("usedGpuMemory", c_ulonglong),
        ("gpuInstanceId", c_uint),
        ("computeInstanceId", c_uint),
    ]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
## Alternative object
|
|
124
|
+
# Allows the object to be printed
|
|
125
|
+
# Allows mismatched types to be assigned
|
|
126
|
+
# - like None when the Structure variant requires c_uint
|
|
127
|
+
class nvmlFriendlyObject:
    """Plain attribute bag built from a mapping.

    Unlike a ctypes Structure it prints nicely and tolerates mismatched
    value types (e.g. ``None`` where the struct field is a ``c_uint``).
    """

    def __init__(self, dictionary):
        for key, value in dictionary.items():
            setattr(self, key, value)

    def __str__(self):
        return str(self.__dict__)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def nvmlStructToFriendlyObject(struct):
    """Convert a ctypes Structure into an :class:`nvmlFriendlyObject`.

    Bytes fields are decoded to ``str``; everything else is copied as-is.
    """
    fields = {}
    for field_name, _field_type in struct._fields_:
        raw = getattr(struct, field_name)
        # only need to convert from bytes if bytes, no need to check python version.
        fields[field_name] = raw.decode() if isinstance(raw, bytes) else raw
    return nvmlFriendlyObject(fields)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
@dataclasses.dataclass
class CudaDeviceInfo:
    """Identity of a CUDA device, optionally including a MIG instance index."""

    # NVML UUID of the device (e.g. b"GPU-...").
    uuid: bytes | None = None
    # Index of the physical device as seen by NVML.
    device_index: int | None = None
    # Index of the MIG instance on the device, if MIG mode is enabled.
    mig_index: int | None = None
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@dataclasses.dataclass
class CudaContext:
    """Result of :func:`get_cuda_context`: whether this process holds a CUDA context."""

    # True when the current process has a CUDA context on some device.
    has_context: bool
    # The device (and MIG instance, if any) the context lives on; None otherwise.
    device_info: CudaDeviceInfo | None = None
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# True when running on native Windows.
_is_windows: bool = sys.platform.startswith("win")
# True when running under Windows Subsystem for Linux (WSL sets this env var).
_is_wsl: bool = "WSL_DISTRO_NAME" in os.environ
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _load_nv_library(*libnames):
|
|
165
|
+
for lib in libnames:
|
|
166
|
+
try:
|
|
167
|
+
return CDLL(lib)
|
|
168
|
+
except OSError:
|
|
169
|
+
continue
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# CDLL handles for libcuda / libnvidia-ml; populated lazily by _init_cp/_init_nvml.
_cuda_lib = _nvml_lib = None

# Static per-device facts gathered via the CUDA driver API.
_cu_device_info = namedtuple(
    "_cu_device_info", "index uuid name multiprocessors cuda_cores threads"
)
# Driver / CUDA version strings reported by NVML.
_nvml_driver_info = namedtuple("_nvml_driver_info", "driver_version cuda_version")
# Dynamic device status: utilization, temperature and framebuffer memory (bytes).
_nvml_device_status = namedtuple(
    "_nvml_device_status",
    "gpu_util mem_util temperature fb_total_mem fb_used_mem fb_free_mem",
)

# PID that completed initialization; a forked child re-initializes on first use.
_init_pid = None
# Cached visible-device count (see get_device_count).
_gpu_count = None
# Cached _nvml_driver_info (see get_driver_info).
_driver_info = None
# Cache of get_device_info results keyed by logical device index.
_device_infos: dict[int, _cu_device_info] = dict()

# Ensures the "no CUDA device" warning is only logged once per process.
_no_device_warned = False
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class NVError(Exception):
    """Base error for CUDA / NVML API failures, carrying the native error code."""

    def __init__(self, msg, *args, errno=None):
        self._errno = errno
        super().__init__(msg if msg else "Unknown error", *args)

    def __str__(self):
        return "(%s) %s" % (self._errno, super().__str__())

    @property
    def errno(self):
        """Native error code returned by the CUDA / NVML call, or None."""
        return self._errno

    @property
    def message(self):
        """The error message without the '(errno)' prefix."""
        return super().__str__()
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class NVDeviceAPIError(NVError):
    """Raised for failures from the CUDA driver API (libcuda)."""

    pass
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
class NVMLAPIError(NVError):
    """Raised for failures from the NVML library (libnvidia-ml)."""

    pass
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _cu_check_error(result):
    """Raise :class:`NVDeviceAPIError` when a CUDA driver call fails.

    ``result`` is the CUresult of a libcuda call; CUDA_SUCCESS passes through.
    """
    if result == CUDA_SUCCESS:
        return
    error_str = c_char_p()
    _cuda_lib.cuGetErrorString(result, byref(error_str))
    raw = error_str.value
    message = raw.decode() if raw is not None else None
    raise NVDeviceAPIError(message, errno=result)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# Cached pointer to nvmlErrorString with its restype declared (set on first use).
_nvmlErrorString = None


def _nvml_check_error(result):
    """Raise :class:`NVMLAPIError` when an NVML call fails.

    ``result`` is the nvmlReturn_t of a libnvidia-ml call; NVML_SUCCESS passes.
    """
    global _nvmlErrorString
    if _nvmlErrorString is None:
        # Resolve the symbol once and declare its return type so ctypes
        # gives us bytes back instead of an int.
        _nvmlErrorString = _nvml_lib.nvmlErrorString
        _nvmlErrorString.restype = c_char_p

    if result == NVML_SUCCESS:
        return
    raise NVMLAPIError(_nvmlErrorString(result).decode(), errno=result)
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
_cu_process_var_to_cores = {
|
|
239
|
+
(1, 0): 8,
|
|
240
|
+
(1, 1): 8,
|
|
241
|
+
(1, 2): 8,
|
|
242
|
+
(1, 3): 8,
|
|
243
|
+
(2, 0): 32,
|
|
244
|
+
(2, 1): 48,
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _cu_get_processor_cores(major, minor):
|
|
249
|
+
return _cu_process_var_to_cores.get((major, minor), 192)
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _init_cp():
    """Load and initialize the CUDA driver library (libcuda).

    On success the module-level ``_cuda_lib`` holds the CDLL handle; on any
    failure it is left as / reset to None.  When no CUDA-capable device is
    present a warning is logged once per process.
    """
    global _cuda_lib, _no_device_warned
    # Already fully initialized in this process (see _init()).
    if _init_pid == os.getpid():
        return

    libcuda_paths = ["libcuda.so", "libcuda.dylib", "cuda.dll", "nvcuda.dll"]
    if _is_wsl:
        # WSL exposes the host GPU driver under /usr/lib/wsl/lib; prefer it.
        libcuda_paths = ["/usr/lib/wsl/lib/libcuda.so"] + libcuda_paths
    _cuda_lib = _load_nv_library(*libcuda_paths)

    if _cuda_lib is None:
        return
    try:
        _cu_check_error(_cuda_lib.cuInit(0))
    except NVDeviceAPIError as ex:
        if ex.errno == CU_NO_CUDA_CAPABLE_DEVICE_DETECTED:
            # Driver loaded but no GPU present: treat as "no CUDA", warn once.
            _cuda_lib = None
            if not _no_device_warned:
                logger.warning("No CUDA device detected")
                _no_device_warned = True
        else:
            logger.exception("Failed to initialize libcuda.")
        return
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _init_nvml():
    """Load and initialize the NVML library (libnvidia-ml).

    On success the module-level ``_nvml_lib`` holds the CDLL handle; on any
    failure it is left as / reset to None.  When the NVIDIA driver is not
    loaded a warning is logged once per process.
    """
    global _nvml_lib, _no_device_warned
    # Already fully initialized in this process (see _init()).
    if _init_pid == os.getpid():
        return

    nvml_paths = [
        "libnvidia-ml.so",
        "libnvidia-ml.so.1",
        "libnvidia-ml.dylib",
        "nvml.dll",
    ]
    if _is_windows:
        # Older NVIDIA installs keep nvml.dll under Program Files/NVSMI.
        nvml_paths.append(
            os.path.join(
                os.getenv("ProgramFiles", "C:/Program Files"),
                "NVIDIA Corporation/NVSMI/nvml.dll",
            )
        )
    if _is_wsl:
        # WSL exposes the host GPU driver under /usr/lib/wsl/lib; prefer it.
        nvml_paths = ["/usr/lib/wsl/lib/libnvidia-ml.so.1"] + nvml_paths
    _nvml_lib = _load_nv_library(*nvml_paths)

    if _nvml_lib is None:
        return
    try:
        _nvml_check_error(_nvml_lib.nvmlInit_v2())
    except NVMLAPIError as ex:
        if ex.errno == NVML_DRIVER_NOT_LOADED:
            # Kernel driver not loaded: treat as "no CUDA", warn once.
            _nvml_lib = None
            if not _no_device_warned:
                logger.warning(
                    "Failed to load libnvidia-ml: %s, no CUDA device will be enabled",
                    ex.message,
                )
                _no_device_warned = True
        else:
            logger.exception("Failed to initialize libnvidia-ml.")
        return
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _init():
    """Initialize both libcuda and NVML, recording the owning PID on success.

    ``_init_pid`` stays None unless both libraries are available, so callers
    can use it as the "is CUDA usable" flag; a forked child re-initializes.
    """
    global _init_pid

    _init_cp()
    _init_nvml()

    # Only mark initialization complete when both libraries loaded.
    if _cuda_lib is not None and _nvml_lib is not None:
        _init_pid = os.getpid()
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def get_device_count() -> int | None:
    """Return the number of CUDA devices visible to this process.

    Honors ``CUDA_VISIBLE_DEVICES`` when set (empty or "-1" means zero
    devices); otherwise asks NVML.  The result is cached in ``_gpu_count``.
    Returns None when NVML cannot be loaded.
    """
    global _gpu_count

    if _gpu_count is not None:
        return _gpu_count

    _init_nvml()
    if _nvml_lib is None:
        return None

    if "CUDA_VISIBLE_DEVICES" in os.environ:
        devices = os.environ["CUDA_VISIBLE_DEVICES"].strip()
        if not devices or devices == "-1":
            _gpu_count = 0
        else:
            _gpu_count = len(devices.split(","))
    else:
        n_gpus = c_uint()
        # BUGFIX: nvmlDeviceGetCount is an NVML call, so its status must be
        # checked with _nvml_check_error.  It was previously passed to
        # _cu_check_error, which decodes the code via libcuda and raises
        # NVDeviceAPIError (the wrong exception type) with a bogus message.
        _nvml_check_error(_nvml_lib.nvmlDeviceGetCount(byref(n_gpus)))
        _gpu_count = n_gpus.value
    return _gpu_count
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _get_all_device_count() -> int | None:
    """Return the total number of physical CUDA devices, ignoring
    ``CUDA_VISIBLE_DEVICES``.  Returns None when NVML cannot be loaded.
    """
    _init_nvml()
    if _nvml_lib is None:
        return None

    n_gpus = c_uint()
    # BUGFIX: nvmlDeviceGetCount is an NVML call; check it with
    # _nvml_check_error (was _cu_check_error, which raises the wrong
    # exception type and decodes the code via libcuda).
    _nvml_check_error(_nvml_lib.nvmlDeviceGetCount(byref(n_gpus)))
    return n_gpus.value
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def get_driver_info() -> _nvml_driver_info | None:
    """Return the NVIDIA driver version and supported CUDA driver version.

    The result is cached in ``_driver_info``.  Returns None when NVML
    cannot be loaded.
    """
    global _driver_info

    _init_nvml()
    if _nvml_lib is None:
        return None
    if _driver_info is not None:
        return _driver_info

    version_buf = create_string_buffer(100)
    cuda_version = c_uint()

    _nvml_check_error(
        _nvml_lib.nvmlSystemGetDriverVersion(version_buf, len(version_buf))
    )
    _nvml_check_error(_nvml_lib.nvmlSystemGetCudaDriverVersion(byref(cuda_version)))

    _driver_info = _nvml_driver_info(
        driver_version=version_buf.value.decode(),
        # NOTE(review): NVML encodes this as major * 1000 + minor * 10, so
        # divmod(..., 1000) renders e.g. 12040 as "12.40" rather than "12.4"
        # — confirm whether consumers expect the raw remainder.
        cuda_version=".".join(str(v) for v in divmod(cuda_version.value, 1000)),
    )
    return _driver_info
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def get_device_info(dev_index: int) -> _cu_device_info | None:
    """Return static facts about the device at logical index *dev_index*.

    Queries the CUDA driver API for name, UUID, compute capability and
    multiprocessor counts; results are cached per process in
    ``_device_infos``.  Returns None when CUDA/NVML initialization failed.
    """
    try:
        return _device_infos[dev_index]
    except KeyError:
        pass

    _init()
    if _init_pid is None:
        return None

    device = c_int()
    name_buf = create_string_buffer(100)
    uuid_t = _CUuuid_t()
    cc_major = c_int()
    cc_minor = c_int()
    cores = c_int()
    threads_per_core = c_int()

    _cu_check_error(_cuda_lib.cuDeviceGet(byref(device), c_int(dev_index)))  # type: ignore
    _cu_check_error(_cuda_lib.cuDeviceGetName(name_buf, len(name_buf), device))  # type: ignore
    _cu_check_error(_cuda_lib.cuDeviceGetUuid(byref(uuid_t), device))  # type: ignore
    _cu_check_error(
        _cuda_lib.cuDeviceComputeCapability(byref(cc_major), byref(cc_minor), device)  # type: ignore
    )
    _cu_check_error(
        _cuda_lib.cuDeviceGetAttribute(  # type: ignore
            byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device
        )
    )
    _cu_check_error(
        _cuda_lib.cuDeviceGetAttribute(  # type: ignore
            byref(threads_per_core),
            CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
            device,
        )
    )

    if "CUDA_VISIBLE_DEVICES" in os.environ:
        # Map the logical index back to the physical device index it refers to.
        real_dev_index = [
            int(s) for s in os.environ["CUDA_VISIBLE_DEVICES"].split(",")
        ][dev_index]
    else:
        real_dev_index = dev_index

    info = _device_infos[dev_index] = _cu_device_info(
        index=real_dev_index,
        uuid=uuid.UUID(bytes=uuid_t.bytes),
        name=name_buf.value.decode(),
        multiprocessors=cores.value,
        # Total CUDA cores = SM count * cores-per-SM for this architecture.
        cuda_cores=cores.value
        * _cu_get_processor_cores(cc_major.value, cc_minor.value),
        threads=cores.value * threads_per_core.value,
    )
    return info
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def get_device_status(dev_index: int) -> _nvml_device_status | None:
    """Return dynamic status (utilization, temperature, framebuffer memory)
    for the device at logical index *dev_index*.

    On regular platforms the data comes from NVML; under WSL (where NVML
    device queries are unreliable) it is parsed from ``nvidia-smi -q -x``
    XML output instead.  Returns None when CUDA/NVML initialization failed.
    """
    _init()
    if _init_pid is None:
        return None

    c_device = _nvmlDevice_t()
    c_utils = _nvmlUtilization_t()
    c_temperature = c_uint()
    c_memory_info = _nvmlBAR1Memory_t()

    dev_uuid = get_device_info(dev_index).uuid  # type: ignore

    uuid_str = ("GPU-" + str(dev_uuid)).encode()

    if not _is_wsl:
        _nvml_check_error(
            _nvml_lib.nvmlDeviceGetHandleByUUID(uuid_str, byref(c_device))  # type: ignore
        )

        _nvml_check_error(
            _nvml_lib.nvmlDeviceGetUtilizationRates(c_device, byref(c_utils))  # type: ignore
        )
        gpu_util = c_utils.gpu
        mem_util = c_utils.memory

        _nvml_check_error(
            _nvml_lib.nvmlDeviceGetTemperature(  # type: ignore
                c_device, NVML_TEMPERATURE_GPU, byref(c_temperature)
            )
        )
        temperature = c_temperature.value

        _nvml_check_error(
            _nvml_lib.nvmlDeviceGetMemoryInfo(c_device, byref(c_memory_info))  # type: ignore
        )
        fb_total_mem = c_memory_info.total
        fb_free_mem = c_memory_info.free
        fb_used_mem = c_memory_info.used
    else:
        # BUGFIX: import the submodule explicitly — ``import defusedxml``
        # alone does not guarantee ``defusedxml.ElementTree`` is loaded.
        import defusedxml.ElementTree

        proc = subprocess.Popen(
            ["/usr/lib/wsl/lib/nvidia-smi", "-q", f"--id={dev_index}", "-x"],
            stdout=subprocess.PIPE,
        )
        proc.wait()
        xml_result = defusedxml.ElementTree.fromstring(proc.stdout.read())  # type: ignore
        gpu_node = xml_result.find("gpu")

        fb_node = gpu_node.find("fb_memory_usage")
        fb_total_mem = int(parse_readable_size(fb_node.find("total").text)[0])
        fb_free_mem = int(parse_readable_size(fb_node.find("free").text)[0])
        fb_used_mem = int(parse_readable_size(fb_node.find("used").text)[0])

        util_node = gpu_node.find("utilization")
        # BUGFIX: int() was previously applied to the XML Element object
        # itself, which raises TypeError.  Parse the element text (rendered
        # by nvidia-smi as e.g. "42 %" — strip the unit before converting).
        gpu_util_text = util_node.find("gpu_util").text
        if gpu_util_text == "N/A":
            gpu_util = 0
        else:
            gpu_util = int(gpu_util_text.rstrip("% "))
        mem_util_text = util_node.find("memory_util").text
        if mem_util_text == "N/A":
            mem_util = 0
        else:
            mem_util = int(mem_util_text.rstrip("% "))

        # Temperature text looks like "45 C"; drop the trailing unit letter.
        temperature = int(gpu_node.find("temperature").find("gpu_temp").text[:-1])

    return _nvml_device_status(
        gpu_util=gpu_util,
        mem_util=mem_util,
        temperature=temperature,
        fb_total_mem=fb_total_mem,
        fb_free_mem=fb_free_mem,
        fb_used_mem=fb_used_mem,
    )
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
def get_handle_by_index(index: int) -> _nvmlDevice_t:  # type: ignore
    """Return the NVML handle for the device at *index*, or None if NVML is unavailable."""
    _init_nvml()
    if _nvml_lib is None:
        return None

    handle = _nvmlDevice_t()
    _nvml_check_error(
        _nvml_lib.nvmlDeviceGetHandleByIndex_v2(c_int(index), byref(handle))
    )
    return handle
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def get_handle_by_uuid(uuid: bytes) -> _nvmlDevice_t:  # type: ignore
    """Return the NVML handle for the device with *uuid*, or None if NVML is unavailable."""
    _init_nvml()
    if _nvml_lib is None:
        return None

    handle = _nvmlDevice_t()
    _nvml_check_error(
        _nvml_lib.nvmlDeviceGetHandleByUUID(c_char_p(uuid), byref(handle))
    )
    return handle
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
def get_mig_mode(device: _nvmlDevice_t) -> Tuple[int, int] | None:  # type: ignore
    """Return the (current, pending) MIG mode of *device*, or None if NVML is unavailable."""
    _init_nvml()
    if _nvml_lib is None:
        return None

    current_mode = c_uint()
    pending_mode = c_uint()
    _nvml_check_error(
        _nvml_lib.nvmlDeviceGetMigMode(
            device, byref(current_mode), byref(pending_mode)
        )
    )
    return current_mode.value, pending_mode.value
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def get_max_mig_device_count(device: _nvmlDevice_t) -> int | None:  # type: ignore
    """Return the maximum number of MIG instances *device* supports, or None if NVML is unavailable."""
    _init_nvml()
    if _nvml_lib is None:
        return None

    count = c_uint()
    _nvml_check_error(_nvml_lib.nvmlDeviceGetMaxMigDeviceCount(device, byref(count)))
    return count.value
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def get_mig_device_handle_by_index(device: _nvmlDevice_t, index: int) -> _nvmlDevice_t:  # type: ignore
    """Return the handle of MIG instance *index* on *device*, or None if NVML is unavailable."""
    _init_nvml()
    if _nvml_lib is None:
        return None

    mig_handle = _nvmlDevice_t()
    _nvml_check_error(
        _nvml_lib.nvmlDeviceGetMigDeviceHandleByIndex(
            device, c_uint(index), byref(mig_handle)
        )
    )
    return mig_handle
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
def get_index(handle: _nvmlDevice_t) -> int | None:  # type: ignore
    """Return the NVML index of the device behind *handle*, or None if NVML is unavailable."""
    _init_nvml()
    if _nvml_lib is None:
        return None

    index = c_uint()
    _nvml_check_error(_nvml_lib.nvmlDeviceGetIndex(handle, byref(index)))
    return index.value
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def get_uuid(handle: _nvmlDevice_t) -> bytes | None:  # type: ignore
    """Return the UUID bytes of the device behind *handle*, or None if NVML is unavailable."""
    _init_nvml()
    if _nvml_lib is None:
        return None

    uuid_buf = create_string_buffer(NVML_DEVICE_UUID_V2_BUFFER_SIZE)
    _nvml_check_error(
        _nvml_lib.nvmlDeviceGetUUID(
            handle, uuid_buf, c_uint(NVML_DEVICE_UUID_V2_BUFFER_SIZE)
        )
    )
    return uuid_buf.value
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def get_index_and_uuid(device: Union[int, bytes, str]) -> CudaDeviceInfo | None:
    """Resolve *device* — an index, or a UUID given as bytes/str — to a
    :class:`CudaDeviceInfo`.  Returns None if NVML is unavailable.
    """
    _init_nvml()
    if _nvml_lib is None:
        return None

    try:
        device_index = int(device)
    except ValueError:
        # Not an integer: treat the argument as a device UUID.
        uuid = device if isinstance(device, bytes) else device.encode()  # type: ignore
        uuid_handle = get_handle_by_uuid(uuid)
        device_index = get_index(uuid_handle)  # type: ignore
        uuid = get_uuid(uuid_handle)
    else:
        device_handle = get_handle_by_index(device_index)
        uuid = get_uuid(device_handle)

    return CudaDeviceInfo(uuid=uuid, device_index=device_index)
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
def get_compute_running_processes(  # type: ignore
    handle: _nvmlDevice_t,  # type: ignore
) -> List[nvmlFriendlyObject] | None:
    """Return the compute processes currently running on *handle*.

    Uses nvmlDeviceGetComputeRunningProcesses (v3 when available, else v2):
    a first call with a NULL array probes the required count, then a second
    call fills an oversized array.  Returns None if NVML is unavailable.
    """
    _init_nvml()
    if _nvml_lib is None:
        return None

    c_count = c_uint(0)
    # Prefer the v3 entry point; fall back to v2 on older drivers.
    func = getattr(_nvml_lib, "nvmlDeviceGetComputeRunningProcesses_v3", None)
    if func is None:
        func = getattr(_nvml_lib, "nvmlDeviceGetComputeRunningProcesses_v2")
    # Probe call: NULL array, count receives the number of processes.
    ret = func(handle, byref(c_count), None)

    if ret == NVML_SUCCESS:
        # special case, no running processes
        return []
    elif ret == NVML_ERROR_INSUFFICIENT_SIZE:
        # typical case
        # oversize the array in case more processes are created
        c_count.value = c_count.value * 2 + 5
        proc_array = _nvmlProcessInfo_t * c_count.value
        c_procs = proc_array()

        _nvml_check_error(func(handle, byref(c_count), c_procs))

        procs = []
        for i in range(c_count.value):
            # use an alternative struct for this object
            obj = nvmlStructToFriendlyObject(c_procs[i])
            if obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value:
                # special case for WDDM on Windows, see comment above
                obj.usedGpuMemory = None
            procs.append(obj)

        return procs
    else:
        # error case: raises NVMLAPIError with the decoded message
        _nvml_check_error(ret)
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
def _running_process_matches(handle: _nvmlDevice_t) -> bool:  # type: ignore
    """Check whether the current process is same as that of handle

    Parameters
    ----------
    handle : _nvmlDevice_t
        NVML handle to CUDA device

    Returns
    -------
    out : bool
        Whether the device handle has a CUDA context on the running process.
    """
    current_pid = os.getpid()
    return any(
        proc.pid == current_pid for proc in get_compute_running_processes(handle)  # type: ignore
    )
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
def get_cuda_context() -> CudaContext:
    """Check whether the current process already has a CUDA context created.

    Iterates over all physical devices (and each MIG instance when MIG is
    enabled) and checks whether this PID appears among the device's running
    compute processes.  Returns a CudaContext with has_context=False when
    CUDA/NVML is unavailable or no context is found.
    """

    _init()
    if _init_pid is None:
        return CudaContext(has_context=False)

    for index in range(_get_all_device_count()):  # type: ignore
        handle = get_handle_by_index(index)
        try:
            mig_current_mode, mig_pending_mode = get_mig_mode(handle)  # type: ignore
        except NVMLAPIError as e:
            if e.errno == NVML_ERROR_NOT_SUPPORTED:
                # Pre-Ampere devices don't support MIG at all.
                mig_current_mode = NVML_DEVICE_MIG_DISABLE
            else:
                raise
        if mig_current_mode == NVML_DEVICE_MIG_ENABLE:
            # MIG enabled: contexts live on MIG instances, not the parent GPU.
            for mig_index in range(get_max_mig_device_count(handle)):  # type: ignore
                try:
                    mig_handle = get_mig_device_handle_by_index(handle, mig_index)
                except NVMLAPIError as e:
                    if e.errno == NVML_ERROR_NOT_FOUND:
                        # No MIG device with that index
                        continue
                    else:
                        raise
                if _running_process_matches(mig_handle):
                    return CudaContext(
                        has_context=True,
                        device_info=CudaDeviceInfo(
                            uuid=get_uuid(handle),
                            device_index=index,
                            mig_index=mig_index,
                        ),
                    )
        else:
            if _running_process_matches(handle):
                return CudaContext(
                    has_context=True,
                    device_info=CudaDeviceInfo(
                        uuid=get_uuid(handle), device_index=index
                    ),
                )

    return CudaContext(has_context=False)
|