gpustack-runtime 0.1.40.post1-py3-none-any.whl → 0.1.41.post1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. gpustack_runtime/__init__.py +1 -1
  2. gpustack_runtime/__main__.py +5 -3
  3. gpustack_runtime/_version.py +2 -2
  4. gpustack_runtime/_version_appendix.py +1 -1
  5. gpustack_runtime/cmds/__init__.py +5 -3
  6. gpustack_runtime/cmds/__types__.py +1 -1
  7. gpustack_runtime/cmds/deployer.py +140 -18
  8. gpustack_runtime/cmds/detector.py +1 -1
  9. gpustack_runtime/cmds/images.py +1 -1
  10. gpustack_runtime/deployer/__init__.py +28 -2
  11. gpustack_runtime/deployer/__patches__.py +1 -1
  12. gpustack_runtime/deployer/__types__.py +2 -1
  13. gpustack_runtime/deployer/__utils__.py +2 -2
  14. gpustack_runtime/deployer/cdi/__init__.py +86 -5
  15. gpustack_runtime/deployer/cdi/__types__.py +92 -29
  16. gpustack_runtime/deployer/cdi/__utils__.py +180 -0
  17. gpustack_runtime/deployer/cdi/amd.py +146 -0
  18. gpustack_runtime/deployer/cdi/ascend.py +164 -0
  19. gpustack_runtime/deployer/cdi/hygon.py +147 -0
  20. gpustack_runtime/deployer/cdi/iluvatar.py +136 -0
  21. gpustack_runtime/deployer/cdi/metax.py +148 -0
  22. gpustack_runtime/deployer/cdi/thead.py +57 -23
  23. gpustack_runtime/deployer/docker.py +9 -8
  24. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +325 -0
  25. gpustack_runtime/deployer/k8s/deviceplugin/__types__.py +131 -0
  26. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +590 -0
  27. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py +3 -0
  28. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto +212 -0
  29. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py +86 -0
  30. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi +168 -0
  31. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py +358 -0
  32. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py +34 -0
  33. gpustack_runtime/deployer/kuberentes.py +50 -4
  34. gpustack_runtime/deployer/podman.py +9 -8
  35. gpustack_runtime/detector/__init__.py +42 -5
  36. gpustack_runtime/detector/__types__.py +8 -24
  37. gpustack_runtime/detector/__utils__.py +46 -39
  38. gpustack_runtime/detector/amd.py +55 -66
  39. gpustack_runtime/detector/ascend.py +29 -41
  40. gpustack_runtime/detector/cambricon.py +3 -3
  41. gpustack_runtime/detector/hygon.py +21 -49
  42. gpustack_runtime/detector/iluvatar.py +44 -60
  43. gpustack_runtime/detector/metax.py +54 -37
  44. gpustack_runtime/detector/mthreads.py +74 -36
  45. gpustack_runtime/detector/nvidia.py +130 -93
  46. gpustack_runtime/detector/pyacl/__init__.py +1 -1
  47. gpustack_runtime/detector/pyamdgpu/__init__.py +1 -1
  48. gpustack_runtime/detector/pyamdsmi/__init__.py +1 -1
  49. gpustack_runtime/detector/pycuda/__init__.py +1 -1
  50. gpustack_runtime/detector/pydcmi/__init__.py +1 -1
  51. gpustack_runtime/detector/pyhsa/__init__.py +1 -1
  52. gpustack_runtime/detector/pymxsml/__init__.py +1553 -1
  53. gpustack_runtime/detector/pyrocmcore/__init__.py +1 -1
  54. gpustack_runtime/detector/pyrocmsmi/__init__.py +1 -1
  55. gpustack_runtime/detector/thead.py +41 -60
  56. gpustack_runtime/envs.py +106 -12
  57. gpustack_runtime/logging.py +6 -2
  58. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/METADATA +6 -1
  59. gpustack_runtime-0.1.41.post1.dist-info/RECORD +67 -0
  60. gpustack_runtime/detector/pymxsml/mxsml.py +0 -1580
  61. gpustack_runtime/detector/pymxsml/mxsml_extension.py +0 -816
  62. gpustack_runtime/detector/pymxsml/mxsml_mcm.py +0 -476
  63. gpustack_runtime-0.1.40.post1.dist-info/RECORD +0 -55
  64. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/WHEEL +0 -0
  65. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/entry_points.txt +0 -0
  66. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/licenses/LICENSE +0 -0
@@ -1,4 +1,4 @@
1
- from __future__ import annotations
1
+ from __future__ import annotations as __future_annotations__
2
2
 
3
3
  import contextlib
4
4
  import logging
@@ -6,6 +6,8 @@ import math
6
6
  import time
7
7
  from _ctypes import byref
8
8
  from functools import lru_cache
9
+ from pathlib import Path
10
+ from typing import re
9
11
 
10
12
  import pynvml
11
13
 
@@ -18,7 +20,6 @@ from .__utils__ import (
18
20
  bitmask_to_str,
19
21
  byte_to_mebibyte,
20
22
  get_brief_version,
21
- get_device_files,
22
23
  get_memory,
23
24
  get_numa_node_by_bdf,
24
25
  get_numa_nodeset_size,
@@ -37,7 +38,7 @@ class NVIDIADetector(Detector):
37
38
  """
38
39
 
39
40
  @staticmethod
40
- @lru_cache
41
+ @lru_cache(maxsize=1)
41
42
  def is_supported() -> bool:
42
43
  """
43
44
  Check if NVIDIA detection is supported.
@@ -66,7 +67,7 @@ class NVIDIADetector(Detector):
66
67
  return supported
67
68
 
68
69
  @staticmethod
69
- @lru_cache
70
+ @lru_cache(maxsize=1)
70
71
  def detect_pci_devices() -> dict[str, PCIDevice]:
71
72
  # See https://pcisig.com/membership/member-companies?combine=NVIDIA.
72
73
  pci_devs = get_pci_devices(vendor="0x10de")
@@ -122,36 +123,35 @@ class NVIDIADetector(Detector):
122
123
  )
123
124
 
124
125
  dev_count = pynvml.nvmlDeviceGetCount()
125
- dev_files = None
126
126
  for dev_idx in range(dev_count):
127
127
  dev = pynvml.nvmlDeviceGetHandleByIndex(dev_idx)
128
128
 
129
129
  dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
130
130
  dev_cc = ".".join(map(str, dev_cc_t))
131
131
 
132
- dev_bdf = None
133
- with contextlib.suppress(pynvml.NVMLError):
134
- dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
135
- dev_bdf = str(dev_pci_info.busIdLegacy).lower()
132
+ dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
133
+ dev_bdf = str(dev_pci_info.busIdLegacy).lower()
134
+
135
+ dev_numa = get_numa_node_by_bdf(dev_bdf)
136
+ if not dev_numa:
137
+ dev_node_affinity = pynvml.nvmlDeviceGetMemoryAffinity(
138
+ dev,
139
+ get_numa_nodeset_size(),
140
+ pynvml.NVML_AFFINITY_SCOPE_NODE,
141
+ )
142
+ dev_numa = bitmask_to_str(list(dev_node_affinity))
136
143
 
137
144
  dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
138
145
  with contextlib.suppress(pynvml.NVMLError):
139
146
  dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
140
147
 
148
+ dev_index = dev_idx
149
+ if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
150
+ dev_index = pynvml.nvmlDeviceGetMinorNumber(dev)
151
+
141
152
  # With MIG disabled, treat as a single device.
142
153
 
143
154
  if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
144
- dev_index = dev_idx
145
- if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
146
- if dev_files is None:
147
- dev_files = get_device_files(
148
- pattern=r"nvidia(?P<number>\d+)",
149
- )
150
- if len(dev_files) >= dev_count:
151
- dev_file = dev_files[dev_idx]
152
- if dev_file.number is not None:
153
- dev_index = dev_file.number
154
-
155
155
  dev_name = pynvml.nvmlDeviceGetName(dev)
156
156
 
157
157
  dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
@@ -208,18 +208,15 @@ class NVIDIADetector(Detector):
208
208
  ) # mW to W
209
209
 
210
210
  dev_is_vgpu = False
211
- if dev_bdf and dev_bdf in pci_devs:
211
+ if dev_bdf in pci_devs:
212
212
  dev_is_vgpu = _is_vgpu(pci_devs[dev_bdf].config)
213
213
 
214
214
  dev_appendix = {
215
215
  "arch_family": _get_arch_family(dev_cc_t),
216
216
  "vgpu": dev_is_vgpu,
217
+ "bdf": dev_bdf,
218
+ "numa": dev_numa,
217
219
  }
218
- if dev_bdf:
219
- dev_appendix["bdf"] = dev_bdf
220
-
221
- if dev_links_state := _get_links_state(dev):
222
- dev_appendix.update(dev_links_state)
223
220
 
224
221
  if dev_fabric_info := _get_fabric_info(dev):
225
222
  dev_appendix.update(dev_fabric_info)
@@ -251,6 +248,8 @@ class NVIDIADetector(Detector):
251
248
  # Otherwise, get MIG devices,
252
249
  # inspired by https://github.com/NVIDIA/go-nvlib/blob/fdfe25d0ffc9d7a8c166f4639ef236da81116262/pkg/nvlib/device/mig_device.go#L61-L154.
253
250
 
251
+ dev_mig_minors = _get_mig_minors()
252
+
254
253
  mdev_name = ""
255
254
  mdev_cores = None
256
255
  mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
@@ -288,14 +287,21 @@ class NVIDIADetector(Detector):
288
287
  mdev_appendix = {
289
288
  "arch_family": _get_arch_family(dev_cc_t),
290
289
  "vgpu": True,
290
+ "bdf": dev_bdf,
291
+ "numa": dev_numa,
291
292
  }
292
- if dev_bdf:
293
- mdev_appendix["bdf"] = dev_bdf
294
293
 
295
294
  mdev_gi_id = pynvml.nvmlDeviceGetGpuInstanceId(mdev)
296
295
  mdev_appendix["gpu_instance_id"] = mdev_gi_id
297
296
  mdev_ci_id = pynvml.nvmlDeviceGetComputeInstanceId(mdev)
298
297
  mdev_appendix["compute_instance_id"] = mdev_ci_id
298
+ if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
299
+ mdev_appendix["gpu_instance_index"] = dev_mig_minors.get(
300
+ (dev_index, mdev_gi_id, None),
301
+ )
302
+ mdev_appendix["compute_instance_index"] = dev_mig_minors.get(
303
+ (dev_index, mdev_gi_id, mdev_ci_id),
304
+ )
299
305
 
300
306
  mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
301
307
 
@@ -426,36 +432,24 @@ class NVIDIADetector(Detector):
426
432
  for i, dev_i in enumerate(devices):
427
433
  dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
428
434
 
429
- # Get affinity with PCIe BDF if possible.
430
- if dev_i_bdf := dev_i.appendix.get("bdf", ""):
431
- ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
432
- dev_i_bdf,
433
- )
434
- ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
435
- ret.devices_numa_affinities[i],
436
- )
437
- # Otherwise, get affinity via NVML.
438
- if not ret.devices_cpu_affinities[i]:
439
- # Get NUMA affinity.
440
- try:
441
- dev_i_memset = pynvml.nvmlDeviceGetMemoryAffinity(
442
- dev_i_handle,
443
- get_numa_nodeset_size(),
444
- pynvml.NVML_AFFINITY_SCOPE_NODE,
445
- )
446
- ret.devices_numa_affinities[i] = bitmask_to_str(
447
- list(dev_i_memset),
448
- )
449
- except pynvml.NVMLError:
450
- debug_log_exception(
451
- logger,
452
- "Failed to get NUMA affinity for device %d",
453
- dev_i.index,
454
- )
455
- # Get CPU affinity.
456
- ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
457
- ret.devices_numa_affinities[i],
458
- )
435
+ # Get NUMA and CPU affinities.
436
+ ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
437
+ ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
438
+ ret.devices_numa_affinities[i],
439
+ )
440
+
441
+ # Get links state if applicable.
442
+ if dev_i_links_state := _get_links_state(dev_i_handle):
443
+ ret.appendices[i].update(dev_i_links_state)
444
+ # In practice, if a card has an active *Link,
445
+ # then other cards in the same machine should be interconnected with it through the *Link.
446
+ if dev_i_links_state.get("links_active_count", 0) > 0:
447
+ for j, dev_j in enumerate(devices):
448
+ if dev_i.index == dev_j.index:
449
+ continue
450
+ ret.devices_distances[i][j] = TopologyDistanceEnum.LINK
451
+ ret.devices_distances[j][i] = TopologyDistanceEnum.LINK
452
+ continue
459
453
 
460
454
  # Get distances to other devices.
461
455
  for j, dev_j in enumerate(devices):
@@ -470,8 +464,6 @@ class NVIDIADetector(Detector):
470
464
  dev_i_handle,
471
465
  dev_j_handle,
472
466
  )
473
- if dev_i.appendix.get("links_state", 0) > 0:
474
- distance = TopologyDistanceEnum.LINK
475
467
  except pynvml.NVMLError:
476
468
  debug_log_exception(
477
469
  logger,
@@ -482,9 +474,6 @@ class NVIDIADetector(Detector):
482
474
 
483
475
  ret.devices_distances[i][j] = distance
484
476
  ret.devices_distances[j][i] = distance
485
- except pynvml.NVMLError:
486
- debug_log_exception(logger, "Failed to fetch topology")
487
- raise
488
477
  except Exception:
489
478
  debug_log_exception(logger, "Failed to process topology fetching")
490
479
  raise
@@ -619,6 +608,37 @@ def _extract_field_value(
619
608
  return None
620
609
 
621
610
 
611
+ def _get_fabric_info(
612
+ dev: pynvml.c_nvmlDevice_t,
613
+ ) -> dict | None:
614
+ """
615
+ Get the NVSwitch fabric information for a device.
616
+
617
+ Args:
618
+ dev:
619
+ The NVML device handle.
620
+
621
+ Returns:
622
+ A dict includes fabric info or None if failed.
623
+
624
+ """
625
+ try:
626
+ dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
627
+ ret = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
628
+ if ret != pynvml.NVML_SUCCESS:
629
+ return None
630
+ if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
631
+ return None
632
+ return {
633
+ "fabric_cluster_uuid": stringify_uuid(bytes(dev_fabric.clusterUuid)),
634
+ "fabric_clique_id": dev_fabric.cliqueId,
635
+ }
636
+ except pynvml.NVMLError:
637
+ debug_log_warning(logger, "Failed to get NVSwitch fabric info")
638
+
639
+ return None
640
+
641
+
622
642
  def _get_links_state(
623
643
  dev: pynvml.c_nvmlDevice_t,
624
644
  ) -> dict | None:
@@ -646,49 +666,23 @@ def _get_links_state(
646
666
  return None
647
667
 
648
668
  dev_links_state = 0
669
+ dev_links_active_count = 0
649
670
  try:
650
671
  for link_idx in range(int(dev_links_count)):
651
672
  dev_link_state = pynvml.nvmlDeviceGetNvLinkState(dev, link_idx)
652
673
  if dev_link_state:
653
- dev_links_state |= 1 << (link_idx + 1)
674
+ dev_links_state |= 1 << link_idx
675
+ dev_links_active_count += 1
654
676
  except pynvml.NVMLError:
655
677
  debug_log_warning(logger, "Failed to get NVLink link state")
656
678
 
657
679
  return {
658
680
  "links_count": dev_links_count,
659
681
  "links_state": dev_links_state,
682
+ "links_active_count": dev_links_active_count,
660
683
  }
661
684
 
662
685
 
663
- def _get_fabric_info(
664
- dev: pynvml.c_nvmlDevice_t,
665
- ) -> dict | None:
666
- """
667
- Get the NVSwitch fabric information for a device.
668
-
669
- Args:
670
- dev:
671
- The NVML device handle.
672
-
673
- Returns:
674
- A dict includes fabric info or None if failed.
675
-
676
- """
677
- try:
678
- dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
679
- ret = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
680
- if ret != pynvml.NVML_SUCCESS:
681
- return None
682
- if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
683
- return None
684
- return {
685
- "fabric_cluster_uuid": stringify_uuid(bytes(dev_fabric.clusterUuid)),
686
- "fabric_clique_id": dev_fabric.cliqueId,
687
- }
688
- except pynvml.NVMLError:
689
- debug_log_warning(logger, "Failed to get NVSwitch fabric info")
690
-
691
-
692
686
  def _get_arch_family(dev_cc_t: list[int]) -> str:
693
687
  """
694
688
  Get the architecture family based on the CUDA compute capability.
@@ -917,3 +911,46 @@ def _is_vgpu(dev_config: bytes) -> bool:
917
911
  # Check for vGPU signature,
918
912
  # which is either 0x56 (NVIDIA vGPU) or 0x46 (NVIDIA GRID).
919
913
  return dev_cap[3] == 0x56 or dev_cap[4] == 0x46
914
+
915
+
916
+ def _get_mig_minors() -> dict[tuple, int] | None:
917
+ """
918
+ Get the minor mapping for MIG capability devices.
919
+
920
+ Returns:
921
+ A dict mapping (gpu_id, gi_id, ci_id) to minor number,
922
+ or None if not supported.
923
+
924
+ """
925
+ mig_minors_path = Path("/proc/driver/nvidia-caps/mig-minors")
926
+ if not mig_minors_path.exists():
927
+ return None
928
+
929
+ ret = {}
930
+ for _line in mig_minors_path.read_text(encoding="utf-8").splitlines():
931
+ line = _line.strip()
932
+ if not line:
933
+ continue
934
+
935
+ # Scan lines like:
936
+ # gpu%d/gi%d/ci%d/access %d
937
+ m = re.match(r"gpu(\d+)/gi(\d+)/ci(\d+)/access (\d+)", line)
938
+ if m:
939
+ gpu_id = int(m.group(1))
940
+ gi_id = int(m.group(2))
941
+ ci_id = int(m.group(3))
942
+ minor = int(m.group(4))
943
+ ret[(gpu_id, gi_id, ci_id)] = minor
944
+ continue
945
+
946
+ # Scan lines like:
947
+ # gpu%d/gi%d/access %d
948
+ m = re.match(r"gpu(\d+)/gi(\d+)/access (\d+)", line)
949
+ if m:
950
+ gpu_id = int(m.group(1))
951
+ gi_id = int(m.group(2))
952
+ minor = int(m.group(3))
953
+ ret[(gpu_id, gi_id, None)] = minor
954
+ continue
955
+
956
+ return ret
@@ -1,7 +1,7 @@
1
1
  ##
2
2
  # Python bindings for the ACL library
3
3
  ##
4
- from __future__ import annotations
4
+ from __future__ import annotations as __future_annotations__
5
5
 
6
6
  import contextlib
7
7
  import os
@@ -1,7 +1,7 @@
1
1
  ##
2
2
  # Python bindings for the DCMI library
3
3
  ##
4
- from __future__ import annotations
4
+ from __future__ import annotations as __future_annotations__
5
5
 
6
6
  import errno
7
7
  import os
@@ -1,7 +1,7 @@
1
1
  # Bridge amdsmi module to avoid import errors when amdsmi is not installed
2
2
  # This module raises an exception when amdsmi_init is called
3
3
  # and does nothing when amdsmi_shut_down is called.
4
- from __future__ import annotations
4
+ from __future__ import annotations as __future_annotations__
5
5
 
6
6
  import contextlib
7
7
  import os
@@ -1,4 +1,4 @@
1
- from __future__ import annotations
1
+ from __future__ import annotations as __future_annotations__
2
2
 
3
3
  import string
4
4
  import sys
@@ -1,7 +1,7 @@
1
1
  ##
2
2
  # Python bindings for the DCMI library
3
3
  ##
4
- from __future__ import annotations
4
+ from __future__ import annotations as __future_annotations__
5
5
 
6
6
  import string
7
7
  import sys
@@ -1,7 +1,7 @@
1
1
  ##
2
2
  # Python bindings for the HSA library
3
3
  ##
4
- from __future__ import annotations
4
+ from __future__ import annotations as __future_annotations__
5
5
 
6
6
  import contextlib
7
7
  import os