gpustack-runtime 0.1.40.post1-py3-none-any.whl → 0.1.41.post1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. gpustack_runtime/__init__.py +1 -1
  2. gpustack_runtime/__main__.py +5 -3
  3. gpustack_runtime/_version.py +2 -2
  4. gpustack_runtime/_version_appendix.py +1 -1
  5. gpustack_runtime/cmds/__init__.py +5 -3
  6. gpustack_runtime/cmds/__types__.py +1 -1
  7. gpustack_runtime/cmds/deployer.py +140 -18
  8. gpustack_runtime/cmds/detector.py +1 -1
  9. gpustack_runtime/cmds/images.py +1 -1
  10. gpustack_runtime/deployer/__init__.py +28 -2
  11. gpustack_runtime/deployer/__patches__.py +1 -1
  12. gpustack_runtime/deployer/__types__.py +2 -1
  13. gpustack_runtime/deployer/__utils__.py +2 -2
  14. gpustack_runtime/deployer/cdi/__init__.py +86 -5
  15. gpustack_runtime/deployer/cdi/__types__.py +92 -29
  16. gpustack_runtime/deployer/cdi/__utils__.py +180 -0
  17. gpustack_runtime/deployer/cdi/amd.py +146 -0
  18. gpustack_runtime/deployer/cdi/ascend.py +164 -0
  19. gpustack_runtime/deployer/cdi/hygon.py +147 -0
  20. gpustack_runtime/deployer/cdi/iluvatar.py +136 -0
  21. gpustack_runtime/deployer/cdi/metax.py +148 -0
  22. gpustack_runtime/deployer/cdi/thead.py +57 -23
  23. gpustack_runtime/deployer/docker.py +9 -8
  24. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +325 -0
  25. gpustack_runtime/deployer/k8s/deviceplugin/__types__.py +131 -0
  26. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +590 -0
  27. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py +3 -0
  28. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto +212 -0
  29. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py +86 -0
  30. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi +168 -0
  31. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py +358 -0
  32. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py +34 -0
  33. gpustack_runtime/deployer/kuberentes.py +50 -4
  34. gpustack_runtime/deployer/podman.py +9 -8
  35. gpustack_runtime/detector/__init__.py +42 -5
  36. gpustack_runtime/detector/__types__.py +8 -24
  37. gpustack_runtime/detector/__utils__.py +46 -39
  38. gpustack_runtime/detector/amd.py +55 -66
  39. gpustack_runtime/detector/ascend.py +29 -41
  40. gpustack_runtime/detector/cambricon.py +3 -3
  41. gpustack_runtime/detector/hygon.py +21 -49
  42. gpustack_runtime/detector/iluvatar.py +44 -60
  43. gpustack_runtime/detector/metax.py +54 -37
  44. gpustack_runtime/detector/mthreads.py +74 -36
  45. gpustack_runtime/detector/nvidia.py +130 -93
  46. gpustack_runtime/detector/pyacl/__init__.py +1 -1
  47. gpustack_runtime/detector/pyamdgpu/__init__.py +1 -1
  48. gpustack_runtime/detector/pyamdsmi/__init__.py +1 -1
  49. gpustack_runtime/detector/pycuda/__init__.py +1 -1
  50. gpustack_runtime/detector/pydcmi/__init__.py +1 -1
  51. gpustack_runtime/detector/pyhsa/__init__.py +1 -1
  52. gpustack_runtime/detector/pymxsml/__init__.py +1553 -1
  53. gpustack_runtime/detector/pyrocmcore/__init__.py +1 -1
  54. gpustack_runtime/detector/pyrocmsmi/__init__.py +1 -1
  55. gpustack_runtime/detector/thead.py +41 -60
  56. gpustack_runtime/envs.py +106 -12
  57. gpustack_runtime/logging.py +6 -2
  58. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/METADATA +6 -1
  59. gpustack_runtime-0.1.41.post1.dist-info/RECORD +67 -0
  60. gpustack_runtime/detector/pymxsml/mxsml.py +0 -1580
  61. gpustack_runtime/detector/pymxsml/mxsml_extension.py +0 -816
  62. gpustack_runtime/detector/pymxsml/mxsml_mcm.py +0 -476
  63. gpustack_runtime-0.1.40.post1.dist-info/RECORD +0 -55
  64. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/WHEEL +0 -0
  65. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/entry_points.txt +0 -0
  66. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/licenses/LICENSE +0 -0
@@ -1,4 +1,4 @@
1
- from __future__ import annotations
1
+ from __future__ import annotations as __future_annotations__
2
2
 
3
3
  import contextlib
4
4
  import logging
@@ -6,6 +6,8 @@ import math
6
6
  import time
7
7
  from _ctypes import byref
8
8
  from functools import lru_cache
9
+ from pathlib import Path
10
+ from typing import re
9
11
 
10
12
  import pynvml
11
13
 
@@ -18,7 +20,6 @@ from .__utils__ import (
18
20
  bitmask_to_str,
19
21
  byte_to_mebibyte,
20
22
  get_brief_version,
21
- get_device_files,
22
23
  get_memory,
23
24
  get_numa_node_by_bdf,
24
25
  get_numa_nodeset_size,
@@ -37,7 +38,7 @@ class NVIDIADetector(Detector):
37
38
  """
38
39
 
39
40
  @staticmethod
40
- @lru_cache
41
+ @lru_cache(maxsize=1)
41
42
  def is_supported() -> bool:
42
43
  """
43
44
  Check if NVIDIA detection is supported.
@@ -66,7 +67,7 @@ class NVIDIADetector(Detector):
66
67
  return supported
67
68
 
68
69
  @staticmethod
69
- @lru_cache
70
+ @lru_cache(maxsize=1)
70
71
  def detect_pci_devices() -> dict[str, PCIDevice]:
71
72
  # See https://pcisig.com/membership/member-companies?combine=NVIDIA.
72
73
  pci_devs = get_pci_devices(vendor="0x10de")
@@ -122,36 +123,35 @@ class NVIDIADetector(Detector):
122
123
  )
123
124
 
124
125
  dev_count = pynvml.nvmlDeviceGetCount()
125
- dev_files = None
126
126
  for dev_idx in range(dev_count):
127
127
  dev = pynvml.nvmlDeviceGetHandleByIndex(dev_idx)
128
128
 
129
129
  dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
130
130
  dev_cc = ".".join(map(str, dev_cc_t))
131
131
 
132
- dev_bdf = None
133
- with contextlib.suppress(pynvml.NVMLError):
134
- dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
135
- dev_bdf = str(dev_pci_info.busIdLegacy).lower()
132
+ dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
133
+ dev_bdf = str(dev_pci_info.busIdLegacy).lower()
134
+
135
+ dev_numa = get_numa_node_by_bdf(dev_bdf)
136
+ if not dev_numa:
137
+ dev_node_affinity = pynvml.nvmlDeviceGetMemoryAffinity(
138
+ dev,
139
+ get_numa_nodeset_size(),
140
+ pynvml.NVML_AFFINITY_SCOPE_NODE,
141
+ )
142
+ dev_numa = bitmask_to_str(list(dev_node_affinity))
136
143
 
137
144
  dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
138
145
  with contextlib.suppress(pynvml.NVMLError):
139
146
  dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
140
147
 
148
+ dev_index = dev_idx
149
+ if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
150
+ dev_index = pynvml.nvmlDeviceGetMinorNumber(dev)
151
+
141
152
  # With MIG disabled, treat as a single device.
142
153
 
143
154
  if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
144
- dev_index = dev_idx
145
- if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
146
- if dev_files is None:
147
- dev_files = get_device_files(
148
- pattern=r"nvidia(?P<number>\d+)",
149
- )
150
- if len(dev_files) >= dev_count:
151
- dev_file = dev_files[dev_idx]
152
- if dev_file.number is not None:
153
- dev_index = dev_file.number
154
-
155
155
  dev_name = pynvml.nvmlDeviceGetName(dev)
156
156
 
157
157
  dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
@@ -208,18 +208,15 @@ class NVIDIADetector(Detector):
208
208
  ) # mW to W
209
209
 
210
210
  dev_is_vgpu = False
211
- if dev_bdf and dev_bdf in pci_devs:
211
+ if dev_bdf in pci_devs:
212
212
  dev_is_vgpu = _is_vgpu(pci_devs[dev_bdf].config)
213
213
 
214
214
  dev_appendix = {
215
215
  "arch_family": _get_arch_family(dev_cc_t),
216
216
  "vgpu": dev_is_vgpu,
217
+ "bdf": dev_bdf,
218
+ "numa": dev_numa,
217
219
  }
218
- if dev_bdf:
219
- dev_appendix["bdf"] = dev_bdf
220
-
221
- if dev_links_state := _get_links_state(dev):
222
- dev_appendix.update(dev_links_state)
223
220
 
224
221
  if dev_fabric_info := _get_fabric_info(dev):
225
222
  dev_appendix.update(dev_fabric_info)
@@ -251,6 +248,8 @@ class NVIDIADetector(Detector):
251
248
  # Otherwise, get MIG devices,
252
249
  # inspired by https://github.com/NVIDIA/go-nvlib/blob/fdfe25d0ffc9d7a8c166f4639ef236da81116262/pkg/nvlib/device/mig_device.go#L61-L154.
253
250
 
251
+ dev_mig_minors = _get_mig_minors()
252
+
254
253
  mdev_name = ""
255
254
  mdev_cores = None
256
255
  mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
@@ -288,14 +287,21 @@ class NVIDIADetector(Detector):
288
287
  mdev_appendix = {
289
288
  "arch_family": _get_arch_family(dev_cc_t),
290
289
  "vgpu": True,
290
+ "bdf": dev_bdf,
291
+ "numa": dev_numa,
291
292
  }
292
- if dev_bdf:
293
- mdev_appendix["bdf"] = dev_bdf
294
293
 
295
294
  mdev_gi_id = pynvml.nvmlDeviceGetGpuInstanceId(mdev)
296
295
  mdev_appendix["gpu_instance_id"] = mdev_gi_id
297
296
  mdev_ci_id = pynvml.nvmlDeviceGetComputeInstanceId(mdev)
298
297
  mdev_appendix["compute_instance_id"] = mdev_ci_id
298
+ if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
299
+ mdev_appendix["gpu_instance_index"] = dev_mig_minors.get(
300
+ (dev_index, mdev_gi_id, None),
301
+ )
302
+ mdev_appendix["compute_instance_index"] = dev_mig_minors.get(
303
+ (dev_index, mdev_gi_id, mdev_ci_id),
304
+ )
299
305
 
300
306
  mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
301
307
 
@@ -426,36 +432,24 @@ class NVIDIADetector(Detector):
426
432
  for i, dev_i in enumerate(devices):
427
433
  dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
428
434
 
429
- # Get affinity with PCIe BDF if possible.
430
- if dev_i_bdf := dev_i.appendix.get("bdf", ""):
431
- ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
432
- dev_i_bdf,
433
- )
434
- ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
435
- ret.devices_numa_affinities[i],
436
- )
437
- # Otherwise, get affinity via NVML.
438
- if not ret.devices_cpu_affinities[i]:
439
- # Get NUMA affinity.
440
- try:
441
- dev_i_memset = pynvml.nvmlDeviceGetMemoryAffinity(
442
- dev_i_handle,
443
- get_numa_nodeset_size(),
444
- pynvml.NVML_AFFINITY_SCOPE_NODE,
445
- )
446
- ret.devices_numa_affinities[i] = bitmask_to_str(
447
- list(dev_i_memset),
448
- )
449
- except pynvml.NVMLError:
450
- debug_log_exception(
451
- logger,
452
- "Failed to get NUMA affinity for device %d",
453
- dev_i.index,
454
- )
455
- # Get CPU affinity.
456
- ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
457
- ret.devices_numa_affinities[i],
458
- )
435
+ # Get NUMA and CPU affinities.
436
+ ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
437
+ ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
438
+ ret.devices_numa_affinities[i],
439
+ )
440
+
441
+ # Get links state if applicable.
442
+ if dev_i_links_state := _get_links_state(dev_i_handle):
443
+ ret.appendices[i].update(dev_i_links_state)
444
+ # In practice, if a card has an active *Link,
445
+ # then other cards in the same machine should be interconnected with it through the *Link.
446
+ if dev_i_links_state.get("links_active_count", 0) > 0:
447
+ for j, dev_j in enumerate(devices):
448
+ if dev_i.index == dev_j.index:
449
+ continue
450
+ ret.devices_distances[i][j] = TopologyDistanceEnum.LINK
451
+ ret.devices_distances[j][i] = TopologyDistanceEnum.LINK
452
+ continue
459
453
 
460
454
  # Get distances to other devices.
461
455
  for j, dev_j in enumerate(devices):
@@ -470,8 +464,6 @@ class NVIDIADetector(Detector):
470
464
  dev_i_handle,
471
465
  dev_j_handle,
472
466
  )
473
- if dev_i.appendix.get("links_state", 0) > 0:
474
- distance = TopologyDistanceEnum.LINK
475
467
  except pynvml.NVMLError:
476
468
  debug_log_exception(
477
469
  logger,
@@ -482,9 +474,6 @@ class NVIDIADetector(Detector):
482
474
 
483
475
  ret.devices_distances[i][j] = distance
484
476
  ret.devices_distances[j][i] = distance
485
- except pynvml.NVMLError:
486
- debug_log_exception(logger, "Failed to fetch topology")
487
- raise
488
477
  except Exception:
489
478
  debug_log_exception(logger, "Failed to process topology fetching")
490
479
  raise
@@ -619,6 +608,37 @@ def _extract_field_value(
619
608
  return None
620
609
 
621
610
 
611
+ def _get_fabric_info(
612
+ dev: pynvml.c_nvmlDevice_t,
613
+ ) -> dict | None:
614
+ """
615
+ Get the NVSwitch fabric information for a device.
616
+
617
+ Args:
618
+ dev:
619
+ The NVML device handle.
620
+
621
+ Returns:
622
+ A dict includes fabric info or None if failed.
623
+
624
+ """
625
+ try:
626
+ dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
627
+ ret = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
628
+ if ret != pynvml.NVML_SUCCESS:
629
+ return None
630
+ if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
631
+ return None
632
+ return {
633
+ "fabric_cluster_uuid": stringify_uuid(bytes(dev_fabric.clusterUuid)),
634
+ "fabric_clique_id": dev_fabric.cliqueId,
635
+ }
636
+ except pynvml.NVMLError:
637
+ debug_log_warning(logger, "Failed to get NVSwitch fabric info")
638
+
639
+ return None
640
+
641
+
622
642
  def _get_links_state(
623
643
  dev: pynvml.c_nvmlDevice_t,
624
644
  ) -> dict | None:
@@ -646,49 +666,23 @@ def _get_links_state(
646
666
  return None
647
667
 
648
668
  dev_links_state = 0
669
+ dev_links_active_count = 0
649
670
  try:
650
671
  for link_idx in range(int(dev_links_count)):
651
672
  dev_link_state = pynvml.nvmlDeviceGetNvLinkState(dev, link_idx)
652
673
  if dev_link_state:
653
- dev_links_state |= 1 << (link_idx + 1)
674
+ dev_links_state |= 1 << link_idx
675
+ dev_links_active_count += 1
654
676
  except pynvml.NVMLError:
655
677
  debug_log_warning(logger, "Failed to get NVLink link state")
656
678
 
657
679
  return {
658
680
  "links_count": dev_links_count,
659
681
  "links_state": dev_links_state,
682
+ "links_active_count": dev_links_active_count,
660
683
  }
661
684
 
662
685
 
663
- def _get_fabric_info(
664
- dev: pynvml.c_nvmlDevice_t,
665
- ) -> dict | None:
666
- """
667
- Get the NVSwitch fabric information for a device.
668
-
669
- Args:
670
- dev:
671
- The NVML device handle.
672
-
673
- Returns:
674
- A dict includes fabric info or None if failed.
675
-
676
- """
677
- try:
678
- dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
679
- ret = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
680
- if ret != pynvml.NVML_SUCCESS:
681
- return None
682
- if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
683
- return None
684
- return {
685
- "fabric_cluster_uuid": stringify_uuid(bytes(dev_fabric.clusterUuid)),
686
- "fabric_clique_id": dev_fabric.cliqueId,
687
- }
688
- except pynvml.NVMLError:
689
- debug_log_warning(logger, "Failed to get NVSwitch fabric info")
690
-
691
-
692
686
  def _get_arch_family(dev_cc_t: list[int]) -> str:
693
687
  """
694
688
  Get the architecture family based on the CUDA compute capability.
@@ -917,3 +911,46 @@ def _is_vgpu(dev_config: bytes) -> bool:
917
911
  # Check for vGPU signature,
918
912
  # which is either 0x56 (NVIDIA vGPU) or 0x46 (NVIDIA GRID).
919
913
  return dev_cap[3] == 0x56 or dev_cap[4] == 0x46
914
+
915
+
916
+ def _get_mig_minors() -> dict[tuple, int] | None:
917
+ """
918
+ Get the minor mapping for MIG capability devices.
919
+
920
+ Returns:
921
+ A dict mapping (gpu_id, gi_id, ci_id) to minor number,
922
+ or None if not supported.
923
+
924
+ """
925
+ mig_minors_path = Path("/proc/driver/nvidia-caps/mig-minors")
926
+ if not mig_minors_path.exists():
927
+ return None
928
+
929
+ ret = {}
930
+ for _line in mig_minors_path.read_text(encoding="utf-8").splitlines():
931
+ line = _line.strip()
932
+ if not line:
933
+ continue
934
+
935
+ # Scan lines like:
936
+ # gpu%d/gi%d/ci%d/access %d
937
+ m = re.match(r"gpu(\d+)/gi(\d+)/ci(\d+)/access (\d+)", line)
938
+ if m:
939
+ gpu_id = int(m.group(1))
940
+ gi_id = int(m.group(2))
941
+ ci_id = int(m.group(3))
942
+ minor = int(m.group(4))
943
+ ret[(gpu_id, gi_id, ci_id)] = minor
944
+ continue
945
+
946
+ # Scan lines like:
947
+ # gpu%d/gi%d/access %d
948
+ m = re.match(r"gpu(\d+)/gi(\d+)/access (\d+)", line)
949
+ if m:
950
+ gpu_id = int(m.group(1))
951
+ gi_id = int(m.group(2))
952
+ minor = int(m.group(3))
953
+ ret[(gpu_id, gi_id, None)] = minor
954
+ continue
955
+
956
+ return ret
@@ -1,7 +1,7 @@
1
1
  ##
2
2
  # Python bindings for the ACL library
3
3
  ##
4
- from __future__ import annotations
4
+ from __future__ import annotations as __future_annotations__
5
5
 
6
6
  import contextlib
7
7
  import os
@@ -1,7 +1,7 @@
1
1
  ##
2
2
  # Python bindings for the DCMI library
3
3
  ##
4
- from __future__ import annotations
4
+ from __future__ import annotations as __future_annotations__
5
5
 
6
6
  import errno
7
7
  import os
@@ -1,7 +1,7 @@
1
1
  # Bridge amdsmi module to avoid import errors when amdsmi is not installed
2
2
  # This module raises an exception when amdsmi_init is called
3
3
  # and does nothing when amdsmi_shut_down is called.
4
- from __future__ import annotations
4
+ from __future__ import annotations as __future_annotations__
5
5
 
6
6
  import contextlib
7
7
  import os
@@ -1,4 +1,4 @@
1
- from __future__ import annotations
1
+ from __future__ import annotations as __future_annotations__
2
2
 
3
3
  import string
4
4
  import sys
@@ -1,7 +1,7 @@
1
1
  ##
2
2
  # Python bindings for the DCMI library
3
3
  ##
4
- from __future__ import annotations
4
+ from __future__ import annotations as __future_annotations__
5
5
 
6
6
  import string
7
7
  import sys
@@ -1,7 +1,7 @@
1
1
  ##
2
2
  # Python bindings for the HSA library
3
3
  ##
4
- from __future__ import annotations
4
+ from __future__ import annotations as __future_annotations__
5
5
 
6
6
  import contextlib
7
7
  import os