xoscar-0.9.0-cp312-cp312-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. xoscar/__init__.py +61 -0
  2. xoscar/_utils.cpython-312-darwin.so +0 -0
  3. xoscar/_utils.pxd +36 -0
  4. xoscar/_utils.pyx +246 -0
  5. xoscar/_version.py +693 -0
  6. xoscar/aio/__init__.py +16 -0
  7. xoscar/aio/base.py +86 -0
  8. xoscar/aio/file.py +59 -0
  9. xoscar/aio/lru.py +228 -0
  10. xoscar/aio/parallelism.py +39 -0
  11. xoscar/api.py +527 -0
  12. xoscar/backend.py +67 -0
  13. xoscar/backends/__init__.py +14 -0
  14. xoscar/backends/allocate_strategy.py +160 -0
  15. xoscar/backends/communication/__init__.py +30 -0
  16. xoscar/backends/communication/base.py +315 -0
  17. xoscar/backends/communication/core.py +69 -0
  18. xoscar/backends/communication/dummy.py +253 -0
  19. xoscar/backends/communication/errors.py +20 -0
  20. xoscar/backends/communication/socket.py +444 -0
  21. xoscar/backends/communication/ucx.py +538 -0
  22. xoscar/backends/communication/utils.py +97 -0
  23. xoscar/backends/config.py +157 -0
  24. xoscar/backends/context.py +437 -0
  25. xoscar/backends/core.py +352 -0
  26. xoscar/backends/indigen/__init__.py +16 -0
  27. xoscar/backends/indigen/__main__.py +19 -0
  28. xoscar/backends/indigen/backend.py +51 -0
  29. xoscar/backends/indigen/driver.py +26 -0
  30. xoscar/backends/indigen/fate_sharing.py +221 -0
  31. xoscar/backends/indigen/pool.py +515 -0
  32. xoscar/backends/indigen/shared_memory.py +548 -0
  33. xoscar/backends/message.cpython-312-darwin.so +0 -0
  34. xoscar/backends/message.pyi +255 -0
  35. xoscar/backends/message.pyx +646 -0
  36. xoscar/backends/pool.py +1630 -0
  37. xoscar/backends/router.py +285 -0
  38. xoscar/backends/test/__init__.py +16 -0
  39. xoscar/backends/test/backend.py +38 -0
  40. xoscar/backends/test/pool.py +233 -0
  41. xoscar/batch.py +256 -0
  42. xoscar/collective/__init__.py +27 -0
  43. xoscar/collective/backend/__init__.py +13 -0
  44. xoscar/collective/backend/nccl_backend.py +160 -0
  45. xoscar/collective/common.py +102 -0
  46. xoscar/collective/core.py +737 -0
  47. xoscar/collective/process_group.py +687 -0
  48. xoscar/collective/utils.py +41 -0
  49. xoscar/collective/xoscar_pygloo.cpython-312-darwin.so +0 -0
  50. xoscar/collective/xoscar_pygloo.pyi +239 -0
  51. xoscar/constants.py +23 -0
  52. xoscar/context.cpython-312-darwin.so +0 -0
  53. xoscar/context.pxd +21 -0
  54. xoscar/context.pyx +368 -0
  55. xoscar/core.cpython-312-darwin.so +0 -0
  56. xoscar/core.pxd +51 -0
  57. xoscar/core.pyx +664 -0
  58. xoscar/debug.py +188 -0
  59. xoscar/driver.py +42 -0
  60. xoscar/errors.py +63 -0
  61. xoscar/libcpp.pxd +31 -0
  62. xoscar/metrics/__init__.py +21 -0
  63. xoscar/metrics/api.py +288 -0
  64. xoscar/metrics/backends/__init__.py +13 -0
  65. xoscar/metrics/backends/console/__init__.py +13 -0
  66. xoscar/metrics/backends/console/console_metric.py +82 -0
  67. xoscar/metrics/backends/metric.py +149 -0
  68. xoscar/metrics/backends/prometheus/__init__.py +13 -0
  69. xoscar/metrics/backends/prometheus/prometheus_metric.py +70 -0
  70. xoscar/nvutils.py +717 -0
  71. xoscar/profiling.py +260 -0
  72. xoscar/serialization/__init__.py +20 -0
  73. xoscar/serialization/aio.py +141 -0
  74. xoscar/serialization/core.cpython-312-darwin.so +0 -0
  75. xoscar/serialization/core.pxd +28 -0
  76. xoscar/serialization/core.pyi +57 -0
  77. xoscar/serialization/core.pyx +944 -0
  78. xoscar/serialization/cuda.py +111 -0
  79. xoscar/serialization/exception.py +48 -0
  80. xoscar/serialization/mlx.py +67 -0
  81. xoscar/serialization/numpy.py +82 -0
  82. xoscar/serialization/pyfury.py +37 -0
  83. xoscar/serialization/scipy.py +72 -0
  84. xoscar/serialization/torch.py +180 -0
  85. xoscar/utils.py +522 -0
  86. xoscar/virtualenv/__init__.py +34 -0
  87. xoscar/virtualenv/core.py +268 -0
  88. xoscar/virtualenv/platform.py +56 -0
  89. xoscar/virtualenv/utils.py +100 -0
  90. xoscar/virtualenv/uv.py +321 -0
  91. xoscar-0.9.0.dist-info/METADATA +230 -0
  92. xoscar-0.9.0.dist-info/RECORD +94 -0
  93. xoscar-0.9.0.dist-info/WHEEL +6 -0
  94. xoscar-0.9.0.dist-info/top_level.txt +2 -0
xoscar/nvutils.py ADDED
@@ -0,0 +1,717 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import dataclasses
+import logging
+import os
+import subprocess
+import sys
+import uuid
+from collections import namedtuple
+from ctypes import (
+    CDLL,
+    POINTER,
+    Structure,
+    byref,
+    c_char,
+    c_char_p,
+    c_int,
+    c_uint,
+    c_ulonglong,
+    create_string_buffer,
+)
+from typing import List, Tuple, Union
+
+from .utils import parse_readable_size
+
+logger = logging.getLogger(__name__)
+
+# Some constants taken from cuda.h
+CUDA_SUCCESS = 0
+CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
+CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
+CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
+CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33
+CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34
+CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
+
+CU_NO_CUDA_CAPABLE_DEVICE_DETECTED = 100
+
+# nvml constants
+NVML_SUCCESS = 0
+NVML_ERROR_UNINITIALIZED = 1
+NVML_ERROR_INVALID_ARGUMENT = 2
+NVML_ERROR_NOT_SUPPORTED = 3
+NVML_ERROR_NO_PERMISSION = 4
+NVML_ERROR_ALREADY_INITIALIZED = 5
+NVML_ERROR_NOT_FOUND = 6
+NVML_ERROR_INSUFFICIENT_SIZE = 7
+NVML_ERROR_INSUFFICIENT_POWER = 8
+NVML_ERROR_DRIVER_NOT_LOADED = 9
+NVML_ERROR_TIMEOUT = 10
+NVML_ERROR_IRQ_ISSUE = 11
+NVML_ERROR_LIBRARY_NOT_FOUND = 12
+NVML_ERROR_FUNCTION_NOT_FOUND = 13
+NVML_ERROR_CORRUPTED_INFOROM = 14
+NVML_ERROR_GPU_IS_LOST = 15
+NVML_ERROR_RESET_REQUIRED = 16
+NVML_ERROR_OPERATING_SYSTEM = 17
+NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18
+NVML_ERROR_IN_USE = 19
+NVML_ERROR_MEMORY = 20
+NVML_ERROR_NO_DATA = 21
+NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22
+NVML_ERROR_INSUFFICIENT_RESOURCES = 23
+NVML_ERROR_FREQ_NOT_SUPPORTED = 24
+NVML_ERROR_UNKNOWN = 999
+NVML_TEMPERATURE_GPU = 0
+NVML_DRIVER_NOT_LOADED = 9
+NVML_DEVICE_UUID_V2_BUFFER_SIZE = 96
+NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1)
+NVML_DEVICE_MIG_DISABLE = 0x0
+NVML_DEVICE_MIG_ENABLE = 0x1
+
+
+class _CUuuid_t(Structure):
+    _fields_ = [("bytes", c_char * 16)]
+
+
+class _nvmlUtilization_t(Structure):
+    _fields_ = [
+        ("gpu", c_uint),
+        ("memory", c_uint),
+    ]
+
+
+class _struct_nvmlDevice_t(Structure):
+    pass  # opaque handle
+
+
+_nvmlDevice_t = POINTER(_struct_nvmlDevice_t)
+
+
+class _nvmlBAR1Memory_t(Structure):
+    _fields_ = [
+        ("total", c_ulonglong),
+        ("free", c_ulonglong),
+        ("used", c_ulonglong),
+    ]
+
+
+class _nvmlProcessInfo_t(Structure):
+    _fields_ = [
+        ("pid", c_uint),
+        ("usedGpuMemory", c_ulonglong),
+        ("gpuInstanceId", c_uint),
+        ("computeInstanceId", c_uint),
+    ]
+
+
+## Alternative object
+# Allows the object to be printed
+# Allows mismatched types to be assigned
+#  - like None when the Structure variant requires c_uint
+class nvmlFriendlyObject:
+    def __init__(self, dictionary):
+        for x in dictionary:
+            setattr(self, x, dictionary[x])
+
+    def __str__(self):
+        return self.__dict__.__str__()
+
+
+def nvmlStructToFriendlyObject(struct):
+    d = {}
+    for x in struct._fields_:
+        key = x[0]
+        value = getattr(struct, key)
+        # only need to convert from bytes if bytes, no need to check python version.
+        d[key] = value.decode() if isinstance(value, bytes) else value
+    obj = nvmlFriendlyObject(d)
+    return obj
+
+
+@dataclasses.dataclass
+class CudaDeviceInfo:
+    uuid: bytes | None = None
+    device_index: int | None = None
+    mig_index: int | None = None
+
+
+@dataclasses.dataclass
+class CudaContext:
+    has_context: bool
+    device_info: CudaDeviceInfo | None = None
+
+
+_is_windows: bool = sys.platform.startswith("win")
+_is_wsl: bool = "WSL_DISTRO_NAME" in os.environ
+
+
+def _load_nv_library(*libnames):
+    for lib in libnames:
+        try:
+            return CDLL(lib)
+        except OSError:
+            continue
+
+
+_cuda_lib = _nvml_lib = None
+
+_cu_device_info = namedtuple(
+    "_cu_device_info", "index uuid name multiprocessors cuda_cores threads"
+)
+_nvml_driver_info = namedtuple("_nvml_driver_info", "driver_version cuda_version")
+_nvml_device_status = namedtuple(
+    "_nvml_device_status",
+    "gpu_util mem_util temperature fb_total_mem fb_used_mem fb_free_mem",
+)
+
+_init_pid = None
+_gpu_count = None
+_driver_info = None
+_device_infos: dict[int, _cu_device_info] = dict()
+
+_no_device_warned = False
+
+
+class NVError(Exception):
+    def __init__(self, msg, *args, errno=None):
+        self._errno = errno
+        super().__init__(msg or "Unknown error", *args)
+
+    def __str__(self):
+        return f"({self._errno}) {super().__str__()}"
+
+    @property
+    def errno(self):
+        return self._errno
+
+    @property
+    def message(self):
+        return super().__str__()
+
+
+class NVDeviceAPIError(NVError):
+    pass
+
+
+class NVMLAPIError(NVError):
+    pass
+
+
+def _cu_check_error(result):
+    if result != CUDA_SUCCESS:
+        _error_str = c_char_p()
+        _cuda_lib.cuGetErrorString(result, byref(_error_str))
+        err_value = _error_str.value.decode() if _error_str.value is not None else None
+        raise NVDeviceAPIError(err_value, errno=result)
+
+
+_nvmlErrorString = None
+
+
+def _nvml_check_error(result):
+    global _nvmlErrorString
+    if _nvmlErrorString is None:
+        _nvmlErrorString = _nvml_lib.nvmlErrorString
+        _nvmlErrorString.restype = c_char_p
+
+    if result != NVML_SUCCESS:
+        _error_str = _nvmlErrorString(result)
+        raise NVMLAPIError(_error_str.decode(), errno=result)
+
+
+_cu_process_var_to_cores = {
+    (1, 0): 8,
+    (1, 1): 8,
+    (1, 2): 8,
+    (1, 3): 8,
+    (2, 0): 32,
+    (2, 1): 48,
+}
+
+
+def _cu_get_processor_cores(major, minor):
+    return _cu_process_var_to_cores.get((major, minor), 192)
+
+
+def _init_cp():
+    global _cuda_lib, _no_device_warned
+    if _init_pid == os.getpid():
+        return
+
+    libcuda_paths = ["libcuda.so", "libcuda.dylib", "cuda.dll", "nvcuda.dll"]
+    if _is_wsl:
+        libcuda_paths = ["/usr/lib/wsl/lib/libcuda.so"] + libcuda_paths
+    _cuda_lib = _load_nv_library(*libcuda_paths)
+
+    if _cuda_lib is None:
+        return
+    try:
+        _cu_check_error(_cuda_lib.cuInit(0))
+    except NVDeviceAPIError as ex:
+        if ex.errno == CU_NO_CUDA_CAPABLE_DEVICE_DETECTED:
+            _cuda_lib = None
+            if not _no_device_warned:
+                logger.warning("No CUDA device detected")
+                _no_device_warned = True
+        else:
+            logger.exception("Failed to initialize libcuda.")
+        return
+
+
+def _init_nvml():
+    global _nvml_lib, _no_device_warned
+    if _init_pid == os.getpid():
+        return
+
+    nvml_paths = [
+        "libnvidia-ml.so",
+        "libnvidia-ml.so.1",
+        "libnvidia-ml.dylib",
+        "nvml.dll",
+    ]
+    if _is_windows:
+        nvml_paths.append(
+            os.path.join(
+                os.getenv("ProgramFiles", "C:/Program Files"),
+                "NVIDIA Corporation/NVSMI/nvml.dll",
+            )
+        )
+    if _is_wsl:
+        nvml_paths = ["/usr/lib/wsl/lib/libnvidia-ml.so.1"] + nvml_paths
+    _nvml_lib = _load_nv_library(*nvml_paths)
+
+    if _nvml_lib is None:
+        return
+    try:
+        _nvml_check_error(_nvml_lib.nvmlInit_v2())
+    except NVMLAPIError as ex:
+        if ex.errno == NVML_DRIVER_NOT_LOADED:
+            _nvml_lib = None
+            if not _no_device_warned:
+                logger.warning(
+                    "Failed to load libnvidia-ml: %s, no CUDA device will be enabled",
+                    ex.message,
+                )
+                _no_device_warned = True
+        else:
+            logger.exception("Failed to initialize libnvidia-ml.")
+        return
+
+
+def _init():
+    global _init_pid
+
+    _init_cp()
+    _init_nvml()
+
+    if _nvml_lib is not None and _cuda_lib is not None:
+        _init_pid = os.getpid()
+
+
+def get_device_count() -> int | None:
+    global _gpu_count
+
+    if _gpu_count is not None:
+        return _gpu_count
+
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        devices = os.environ["CUDA_VISIBLE_DEVICES"].strip()
+        if not devices or devices == "-1":
+            _gpu_count = 0
+        else:
+            _gpu_count = len(devices.split(","))
+    else:
+        n_gpus = c_uint()
+        _cu_check_error(_nvml_lib.nvmlDeviceGetCount(byref(n_gpus)))
+        _gpu_count = n_gpus.value
+    return _gpu_count
+
+
+def _get_all_device_count() -> int | None:
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    n_gpus = c_uint()
+    _cu_check_error(_nvml_lib.nvmlDeviceGetCount(byref(n_gpus)))
+    return n_gpus.value
+
+
+def get_driver_info() -> _nvml_driver_info | None:
+    global _driver_info
+
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+    if _driver_info is not None:
+        return _driver_info
+
+    version_buf = create_string_buffer(100)
+    cuda_version = c_uint()
+
+    _nvml_check_error(
+        _nvml_lib.nvmlSystemGetDriverVersion(version_buf, len(version_buf))
+    )
+    _nvml_check_error(_nvml_lib.nvmlSystemGetCudaDriverVersion(byref(cuda_version)))
+
+    _driver_info = _nvml_driver_info(
+        driver_version=version_buf.value.decode(),
+        cuda_version=".".join(str(v) for v in divmod(cuda_version.value, 1000)),
+    )
+    return _driver_info
+
+
+def get_device_info(dev_index: int) -> _cu_device_info | None:
+    try:
+        return _device_infos[dev_index]
+    except KeyError:
+        pass
+
+    _init()
+    if _init_pid is None:
+        return None
+
+    device = c_int()
+    name_buf = create_string_buffer(100)
+    uuid_t = _CUuuid_t()
+    cc_major = c_int()
+    cc_minor = c_int()
+    cores = c_int()
+    threads_per_core = c_int()
+
+    _cu_check_error(_cuda_lib.cuDeviceGet(byref(device), c_int(dev_index)))  # type: ignore
+    _cu_check_error(_cuda_lib.cuDeviceGetName(name_buf, len(name_buf), device))  # type: ignore
+    _cu_check_error(_cuda_lib.cuDeviceGetUuid(byref(uuid_t), device))  # type: ignore
+    _cu_check_error(
+        _cuda_lib.cuDeviceComputeCapability(byref(cc_major), byref(cc_minor), device)  # type: ignore
+    )
+    _cu_check_error(
+        _cuda_lib.cuDeviceGetAttribute(  # type: ignore
+            byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device
+        )
+    )
+    _cu_check_error(
+        _cuda_lib.cuDeviceGetAttribute(  # type: ignore
+            byref(threads_per_core),
+            CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
+            device,
+        )
+    )
+
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        real_dev_index = [
+            int(s) for s in os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+        ][dev_index]
+    else:
+        real_dev_index = dev_index
+
+    info = _device_infos[dev_index] = _cu_device_info(
+        index=real_dev_index,
+        uuid=uuid.UUID(bytes=uuid_t.bytes),
+        name=name_buf.value.decode(),
+        multiprocessors=cores.value,
+        cuda_cores=cores.value
+        * _cu_get_processor_cores(cc_major.value, cc_minor.value),
+        threads=cores.value * threads_per_core.value,
+    )
+    return info
+
+
+def get_device_status(dev_index: int) -> _nvml_device_status | None:
+    _init()
+    if _init_pid is None:
+        return None
+
+    c_device = _nvmlDevice_t()
+    c_utils = _nvmlUtilization_t()
+    c_temperature = c_uint()
+    c_memory_info = _nvmlBAR1Memory_t()
+
+    dev_uuid = get_device_info(dev_index).uuid  # type: ignore
+
+    uuid_str = ("GPU-" + str(dev_uuid)).encode()
+
+    if not _is_wsl:
+        _nvml_check_error(
+            _nvml_lib.nvmlDeviceGetHandleByUUID(uuid_str, byref(c_device))  # type: ignore
+        )
+
+        _nvml_check_error(
+            _nvml_lib.nvmlDeviceGetUtilizationRates(c_device, byref(c_utils))  # type: ignore
+        )
+        gpu_util = c_utils.gpu
+        mem_util = c_utils.memory
+
+        _nvml_check_error(
+            _nvml_lib.nvmlDeviceGetTemperature(  # type: ignore
+                c_device, NVML_TEMPERATURE_GPU, byref(c_temperature)
+            )
+        )
+        temperature = c_temperature.value
+
+        _nvml_check_error(
+            _nvml_lib.nvmlDeviceGetMemoryInfo(c_device, byref(c_memory_info))  # type: ignore
+        )
+        fb_total_mem = c_memory_info.total
+        fb_free_mem = c_memory_info.free
+        fb_used_mem = c_memory_info.used
+    else:
+        import defusedxml
+
+        proc = subprocess.Popen(
+            ["/usr/lib/wsl/lib/nvidia-smi", "-q", f"--id={dev_index}", "-x"],
+            stdout=subprocess.PIPE,
+        )
+        proc.wait()
+        xml_result = defusedxml.ElementTree.fromstring(proc.stdout.read())  # type: ignore
+        gpu_node = xml_result.find("gpu")
+
+        fb_node = gpu_node.find("fb_memory_usage")
+        fb_total_mem = int(parse_readable_size(fb_node.find("total").text)[0])
+        fb_free_mem = int(parse_readable_size(fb_node.find("free").text)[0])
+        fb_used_mem = int(parse_readable_size(fb_node.find("used").text)[0])
+
+        util_node = gpu_node.find("utilization")
+        if util_node.find("gpu_util").text == "N/A":
+            gpu_util = 0
+        else:
+            gpu_util = int(util_node.find("gpu_util"))
+        if util_node.find("memory_util").text == "N/A":
+            mem_util = 0
+        else:
+            mem_util = int(util_node.find("memory_util"))
+
+        temperature = int(gpu_node.find("temperature").find("gpu_temp").text[:-1])
+
+    return _nvml_device_status(
+        gpu_util=gpu_util,
+        mem_util=mem_util,
+        temperature=temperature,
+        fb_total_mem=fb_total_mem,
+        fb_free_mem=fb_free_mem,
+        fb_used_mem=fb_used_mem,
+    )
+
+
+def get_handle_by_index(index: int) -> _nvmlDevice_t:  # type: ignore
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    c_index = c_int(index)
+    device = _nvmlDevice_t()
+    _nvml_check_error(_nvml_lib.nvmlDeviceGetHandleByIndex_v2(c_index, byref(device)))
+    return device
+
+
+def get_handle_by_uuid(uuid: bytes) -> _nvmlDevice_t:  # type: ignore
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    c_uuid = c_char_p(uuid)
+    device = _nvmlDevice_t()
+    _nvml_check_error(_nvml_lib.nvmlDeviceGetHandleByUUID(c_uuid, byref(device)))
+    return device
+
+
+def get_mig_mode(device: _nvmlDevice_t) -> Tuple[int, int] | None:  # type: ignore
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    c_current_mode, c_pending_mode = c_uint(), c_uint()
+    _nvml_check_error(
+        _nvml_lib.nvmlDeviceGetMigMode(
+            device, byref(c_current_mode), byref(c_pending_mode)
+        )
+    )
+    return c_current_mode.value, c_pending_mode.value
+
+
+def get_max_mig_device_count(device: _nvmlDevice_t) -> int | None:  # type: ignore
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    c_count = c_uint()
+    _nvml_check_error(_nvml_lib.nvmlDeviceGetMaxMigDeviceCount(device, byref(c_count)))
+    return c_count.value
+
+
+def get_mig_device_handle_by_index(device: _nvmlDevice_t, index: int) -> _nvmlDevice_t:  # type: ignore
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    c_index = c_uint(index)
+    mig_device = _nvmlDevice_t()
+    _nvml_check_error(
+        _nvml_lib.nvmlDeviceGetMigDeviceHandleByIndex(
+            device, c_index, byref(mig_device)
+        )
+    )
+    return mig_device
+
+
+def get_index(handle: _nvmlDevice_t) -> int | None:  # type: ignore
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    c_index = c_uint()
+    _nvml_check_error(_nvml_lib.nvmlDeviceGetIndex(handle, byref(c_index)))
+    return c_index.value
+
+
+def get_uuid(handle: _nvmlDevice_t) -> bytes | None:  # type: ignore
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    c_uuid = create_string_buffer(NVML_DEVICE_UUID_V2_BUFFER_SIZE)
+    _nvml_check_error(
+        _nvml_lib.nvmlDeviceGetUUID(
+            handle, c_uuid, c_uint(NVML_DEVICE_UUID_V2_BUFFER_SIZE)
+        )
+    )
+    return c_uuid.value
+
+
+def get_index_and_uuid(device: Union[int, bytes, str]) -> CudaDeviceInfo | None:
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    try:
+        device_index = int(device)
+        device_handle = get_handle_by_index(device_index)
+        uuid = get_uuid(device_handle)
+    except ValueError:
+        uuid = device if isinstance(device, bytes) else device.encode()  # type: ignore
+        uuid_handle = get_handle_by_uuid(uuid)
+        device_index = get_index(uuid_handle)  # type: ignore
+        uuid = get_uuid(uuid_handle)
+
+    return CudaDeviceInfo(uuid=uuid, device_index=device_index)
+
+
+def get_compute_running_processes(  # type: ignore
+    handle: _nvmlDevice_t,  # type: ignore
+) -> List[nvmlFriendlyObject] | None:
+    _init_nvml()
+    if _nvml_lib is None:
+        return None
+
+    c_count = c_uint(0)
+    func = getattr(_nvml_lib, "nvmlDeviceGetComputeRunningProcesses_v3", None)
+    if func is None:
+        func = getattr(_nvml_lib, "nvmlDeviceGetComputeRunningProcesses_v2")
+    ret = func(handle, byref(c_count), None)
+
+    if ret == NVML_SUCCESS:
+        # special case, no running processes
+        return []
+    elif ret == NVML_ERROR_INSUFFICIENT_SIZE:
+        # typical case
+        # oversize the array in case more processes are created
+        c_count.value = c_count.value * 2 + 5
+        proc_array = _nvmlProcessInfo_t * c_count.value
+        c_procs = proc_array()
+
+        _nvml_check_error(func(handle, byref(c_count), c_procs))
+
+        procs = []
+        for i in range(c_count.value):
+            # use an alternative struct for this object
+            obj = nvmlStructToFriendlyObject(c_procs[i])
+            if obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value:
+                # special case for WDDM on Windows, see comment above
+                obj.usedGpuMemory = None
+            procs.append(obj)
+
+        return procs
+    else:
+        # error case
+        _nvml_check_error(ret)
+
+
+def _running_process_matches(handle: _nvmlDevice_t) -> bool:  # type: ignore
+    """Check whether the current process is same as that of handle
+    Parameters
+    ----------
+    handle : _nvmlDevice_t
+        NVML handle to CUDA device
+    Returns
+    -------
+    out : bool
+        Whether the device handle has a CUDA context on the running process.
+    """
+    return any(os.getpid() == o.pid for o in get_compute_running_processes(handle))  # type: ignore
+
+
+def get_cuda_context() -> CudaContext:
+    """Check whether the current process already has a CUDA context created."""
+
+    _init()
+    if _init_pid is None:
+        return CudaContext(has_context=False)
+
+    for index in range(_get_all_device_count()):  # type: ignore
+        handle = get_handle_by_index(index)
+        try:
+            mig_current_mode, mig_pending_mode = get_mig_mode(handle)  # type: ignore
+        except NVMLAPIError as e:
+            if e.errno == NVML_ERROR_NOT_SUPPORTED:
+                mig_current_mode = NVML_DEVICE_MIG_DISABLE
+            else:
+                raise
+        if mig_current_mode == NVML_DEVICE_MIG_ENABLE:
+            for mig_index in range(get_max_mig_device_count(handle)):  # type: ignore
+                try:
+                    mig_handle = get_mig_device_handle_by_index(handle, mig_index)
+                except NVMLAPIError as e:
+                    if e.errno == NVML_ERROR_NOT_FOUND:
+                        # No MIG device with that index
+                        continue
+                    else:
+                        raise
+                if _running_process_matches(mig_handle):
+                    return CudaContext(
+                        has_context=True,
+                        device_info=CudaDeviceInfo(
+                            uuid=get_uuid(handle),
+                            device_index=index,
+                            mig_index=mig_index,
+                        ),
+                    )
+        else:
+            if _running_process_matches(handle):
+                return CudaContext(
+                    has_context=True,
+                    device_info=CudaDeviceInfo(
+                        uuid=get_uuid(handle), device_index=index
+                    ),
+                )
+
+    return CudaContext(has_context=False)
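
For orientation, the sketch below shows how the helpers added in xoscar/nvutils.py above might be exercised. It is illustrative only and not part of the released wheel; it assumes a host where the NVIDIA driver and libnvidia-ml are available, since every helper in this module returns None when NVML cannot be loaded.

# Illustrative sketch only -- not shipped in xoscar 0.9.0.
# Uses the public helpers defined in xoscar/nvutils.py shown above.
from xoscar import nvutils

count = nvutils.get_device_count()  # honors CUDA_VISIBLE_DEVICES; None if NVML is missing
if not count:
    print("no CUDA devices visible")
else:
    driver = nvutils.get_driver_info()
    print(f"driver {driver.driver_version}, CUDA {driver.cuda_version}")
    for i in range(count):
        info = nvutils.get_device_info(i)      # name, SMs, CUDA core/thread counts
        status = nvutils.get_device_status(i)  # utilization, temperature, framebuffer memory
        print(
            f"[{info.index}] {info.name}: {status.gpu_util}% util, "
            f"{status.fb_used_mem}/{status.fb_total_mem} bytes framebuffer used"
        )

    # Check whether this process already holds a CUDA context (MIG-aware).
    ctx = nvutils.get_cuda_context()
    if ctx.has_context:
        print("CUDA context on device", ctx.device_info.device_index)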