gpustack-runtime 0.1.40.post1__py3-none-any.whl → 0.1.41.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. gpustack_runtime/__init__.py +1 -1
  2. gpustack_runtime/__main__.py +5 -3
  3. gpustack_runtime/_version.py +2 -2
  4. gpustack_runtime/_version_appendix.py +1 -1
  5. gpustack_runtime/cmds/__init__.py +5 -3
  6. gpustack_runtime/cmds/__types__.py +1 -1
  7. gpustack_runtime/cmds/deployer.py +140 -18
  8. gpustack_runtime/cmds/detector.py +1 -1
  9. gpustack_runtime/cmds/images.py +1 -1
  10. gpustack_runtime/deployer/__init__.py +28 -2
  11. gpustack_runtime/deployer/__patches__.py +1 -1
  12. gpustack_runtime/deployer/__types__.py +2 -1
  13. gpustack_runtime/deployer/__utils__.py +2 -2
  14. gpustack_runtime/deployer/cdi/__init__.py +86 -5
  15. gpustack_runtime/deployer/cdi/__types__.py +92 -29
  16. gpustack_runtime/deployer/cdi/__utils__.py +180 -0
  17. gpustack_runtime/deployer/cdi/amd.py +146 -0
  18. gpustack_runtime/deployer/cdi/ascend.py +164 -0
  19. gpustack_runtime/deployer/cdi/hygon.py +147 -0
  20. gpustack_runtime/deployer/cdi/iluvatar.py +136 -0
  21. gpustack_runtime/deployer/cdi/metax.py +148 -0
  22. gpustack_runtime/deployer/cdi/thead.py +57 -23
  23. gpustack_runtime/deployer/docker.py +9 -8
  24. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +325 -0
  25. gpustack_runtime/deployer/k8s/deviceplugin/__types__.py +131 -0
  26. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +590 -0
  27. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py +3 -0
  28. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto +212 -0
  29. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py +86 -0
  30. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi +168 -0
  31. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py +358 -0
  32. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py +34 -0
  33. gpustack_runtime/deployer/kuberentes.py +50 -4
  34. gpustack_runtime/deployer/podman.py +9 -8
  35. gpustack_runtime/detector/__init__.py +42 -5
  36. gpustack_runtime/detector/__types__.py +8 -24
  37. gpustack_runtime/detector/__utils__.py +46 -39
  38. gpustack_runtime/detector/amd.py +55 -66
  39. gpustack_runtime/detector/ascend.py +29 -41
  40. gpustack_runtime/detector/cambricon.py +3 -3
  41. gpustack_runtime/detector/hygon.py +21 -49
  42. gpustack_runtime/detector/iluvatar.py +44 -60
  43. gpustack_runtime/detector/metax.py +54 -37
  44. gpustack_runtime/detector/mthreads.py +74 -36
  45. gpustack_runtime/detector/nvidia.py +130 -93
  46. gpustack_runtime/detector/pyacl/__init__.py +1 -1
  47. gpustack_runtime/detector/pyamdgpu/__init__.py +1 -1
  48. gpustack_runtime/detector/pyamdsmi/__init__.py +1 -1
  49. gpustack_runtime/detector/pycuda/__init__.py +1 -1
  50. gpustack_runtime/detector/pydcmi/__init__.py +1 -1
  51. gpustack_runtime/detector/pyhsa/__init__.py +1 -1
  52. gpustack_runtime/detector/pymxsml/__init__.py +1553 -1
  53. gpustack_runtime/detector/pyrocmcore/__init__.py +1 -1
  54. gpustack_runtime/detector/pyrocmsmi/__init__.py +1 -1
  55. gpustack_runtime/detector/thead.py +41 -60
  56. gpustack_runtime/envs.py +106 -12
  57. gpustack_runtime/logging.py +6 -2
  58. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/METADATA +6 -1
  59. gpustack_runtime-0.1.41.post1.dist-info/RECORD +67 -0
  60. gpustack_runtime/detector/pymxsml/mxsml.py +0 -1580
  61. gpustack_runtime/detector/pymxsml/mxsml_extension.py +0 -816
  62. gpustack_runtime/detector/pymxsml/mxsml_mcm.py +0 -476
  63. gpustack_runtime-0.1.40.post1.dist-info/RECORD +0 -55
  64. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/WHEEL +0 -0
  65. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/entry_points.txt +0 -0
  66. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/hygon.py
@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import contextlib
 import logging
@@ -30,7 +30,7 @@ class HygonDetector(Detector):
     """
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def is_supported() -> bool:
         """
         Check if the Hygon detector is supported.
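
The bare @lru_cache form keeps a default 128-entry cache, which is needless bookkeeping for a zero-argument staticmethod that can only ever produce one result; @lru_cache(maxsize=1) states the intent directly. A minimal standalone illustration (not package code):

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def is_supported() -> bool:
        print("probing hardware once")  # expensive probe runs a single time
        return True

    is_supported()  # prints, computes, caches
    is_supported()  # served from the one-slot cache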
@@ -58,7 +58,7 @@ class HygonDetector(Detector):
         return supported
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Higon.
         pci_devs = get_pci_devices(vendor="0x1d94")
@@ -120,12 +120,8 @@ class HygonDetector(Detector):
             with contextlib.suppress(pyrocmsmi.ROCMSMIError):
                 dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
 
-            dev_bdf = None
-            dev_card_id = None
-            dev_renderd_id = None
-            with contextlib.suppress(Exception):
-                dev_bdf = pyrocmsmi.rsmi_dev_pci_id_get(dev_idx)
-                dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
+            dev_bdf = pyrocmsmi.rsmi_dev_pci_id_get(dev_idx)
+            dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
 
             dev_cores = dev_hsa_agent.compute_units
             if not dev_cores and dev_card_id is not None:
@@ -157,15 +153,17 @@ class HygonDetector(Detector):
                 dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
                 dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
 
-            dev_is_vgpu = False
-            if dev_bdf:
-                dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+            dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_numa = str(pyrocmsmi.rsmi_topo_get_numa_node_number(dev_idx))
 
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
+                "bdf": dev_bdf,
+                "numa": dev_numa,
             }
-            if dev_bdf is not None:
-                dev_appendix["bdf"] = dev_bdf
             if dev_card_id is not None:
                 dev_appendix["card_id"] = dev_card_id
             if dev_renderd_id is not None:
                 dev_appendix["renderd_id"] = dev_renderd_id
@@ -253,37 +251,14 @@ class HygonDetector(Detector):
 
             pyrocmsmi.rsmi_init()
 
-            # Get NUMA and CPU affinities.
             for i, dev_i in enumerate(devices):
-                # Get affinity with PCIe BDF if possible.
-                if dev_i_bdf := dev_i.appendix.get("bdf", ""):
-                    ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
-                        dev_i_bdf,
-                    )
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
-                # Otherwise, get affinity via ROCM SMI.
-                if not ret.devices_numa_affinities[i]:
-                    # Get NUMA affinity.
-                    try:
-                        dev_i_numa_node = pyrocmsmi.rsmi_topo_get_numa_node_number(
-                            dev_i.index,
-                        )
-                        ret.devices_numa_affinities[i] = str(dev_i_numa_node)
-                    except pyrocmsmi.ROCMSMIError:
-                        debug_log_exception(
-                            logger,
-                            "Failed to get NUMA affinity for device %d",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
 
-            # Get distances to other devices.
-            for i, dev_i in enumerate(devices):
+                # Get distances to other devices.
                 for j, dev_j in enumerate(devices):
                     if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                         continue
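
With the NUMA node precomputed into appendix["numa"], topology fetching no longer needs the BDF/SMI fallback chain and shrinks to a lookup plus a NUMA-to-CPU mapping. map_numa_node_to_cpu_affinity is also a package utility not shown here; a plausible sysfs-based sketch:

    from pathlib import Path

    def cpu_affinity_for_numa_node(numa_node: str) -> str:
        # Illustrative only: node0's CPU list lives at
        # /sys/devices/system/node/node0/cpulist, e.g. "0-31,64-95".
        if not numa_node:
            return ""
        cpulist = Path(f"/sys/devices/system/node/node{numa_node}/cpulist")
        try:
            return cpulist.read_text().strip()
        except OSError:
            return ""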
@@ -326,9 +301,6 @@ class HygonDetector(Detector):
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
-        except pyrocmsmi.ROCMSMIError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
@@ -351,12 +323,12 @@ def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
     card_id = None
     renderd_id = None
 
-    for path in [
+    for drm_path in [
         Path(f"/sys/module/hycu/drivers/pci:hycu/{dev_bdf}/drm"),
         Path(f"/sys/module/hydcu/drivers/pci:hydcu/{dev_bdf}/drm"),
     ]:
-        if path.exists():
-            for dir_path in path.iterdir():
+        if drm_path.exists():
+            for dir_path in drm_path.iterdir():
                 if dir_path.name.startswith("card"):
                     card_id = int(dir_path.name[4:])
                 elif dir_path.name.startswith("renderD"):
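
The rename from path to drm_path names what the loop actually iterates. The slicing relies on the Linux DRM naming convention, where each GPU exposes directory entries such as card0 and renderD128:

    # Worked example of the prefix slicing above.
    for name in ("card0", "renderD128"):
        if name.startswith("card"):
            print("card_id:", int(name[4:]))     # "card0" -> 0
        elif name.startswith("renderD"):
            print("renderd_id:", int(name[7:]))  # "renderD128" -> 128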
gpustack_runtime/detector/iluvatar.py
@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import contextlib
 import logging
@@ -37,7 +37,7 @@ class IluvatarDetector(Detector):
     """
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def is_supported() -> bool:
         """
         Check if the Iluvatar detector is supported.
@@ -66,7 +66,7 @@ class IluvatarDetector(Detector):
         return supported
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Iluvatar.
         pci_devs = get_pci_devices(vendor="0x1e3e")
@@ -99,29 +99,36 @@ class IluvatarDetector(Detector):
 
         sys_driver_ver = pyixml.nvmlSystemGetDriverVersion()
 
-        sys_runtime_ver_original = pyixml.nvmlSystemGetCudaDriverVersion()
-        sys_runtime_ver_original = ".".join(
-            map(
-                str,
-                [
-                    sys_runtime_ver_original // 1000,
-                    (sys_runtime_ver_original % 1000) // 10,
-                    (sys_runtime_ver_original % 10),
-                ],
-            ),
-        )
-        sys_runtime_ver = get_brief_version(
-            sys_runtime_ver_original,
-        )
+        sys_runtime_ver_original = None
+        sys_runtime_ver = None
+        with contextlib.suppress(pyixml.NVMLError):
+            sys_runtime_ver_original = pyixml.nvmlSystemGetCudaDriverVersion()
+            sys_runtime_ver_original = ".".join(
+                map(
+                    str,
+                    [
+                        sys_runtime_ver_original // 1000,
+                        (sys_runtime_ver_original % 1000) // 10,
+                        (sys_runtime_ver_original % 10),
+                    ],
+                ),
+            )
+            sys_runtime_ver = get_brief_version(
+                sys_runtime_ver_original,
+            )
 
         dev_count = pyixml.nvmlDeviceGetCount()
         for dev_idx in range(dev_count):
             dev = pyixml.nvmlDeviceGetHandleByIndex(dev_idx)
 
             dev_index = dev_idx
-            dev_uuid = pyixml.nvmlDeviceGetUUID(dev)
+            if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+                dev_index = pyixml.nvmlDeviceGetMinorNumber(dev)
+
             dev_name = pyixml.nvmlDeviceGetName(dev)
 
+            dev_uuid = pyixml.nvmlDeviceGetUUID(dev)
+
             dev_cores = None
             with contextlib.suppress(pyixml.NVMLError):
                 dev_cores = pyixml.nvmlDeviceGetNumGpuCores(dev)
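
Besides reordering the UUID lookup, this hunk wraps the CUDA driver version query in contextlib.suppress, so a failing nvmlSystemGetCudaDriverVersion no longer aborts detection; the version fields simply stay None. The decoding arithmetic follows NVML's packed-integer convention (major*1000 + minor*10 + patch), for example:

    raw = 12040                   # packed CUDA driver version
    major = raw // 1000           # 12
    minor = (raw % 1000) // 10    # 4
    patch = raw % 10              # 0
    print(".".join(map(str, (major, minor, patch))))  # 12.4.0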
@@ -171,20 +178,25 @@ class IluvatarDetector(Detector):
             if dev_cc_t:
                 dev_cc = ".".join(map(str, dev_cc_t))
 
-            dev_bdf = None
-            with contextlib.suppress(pyixml.NVMLError):
-                dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
-                dev_bdf = str(dev_pci_info.busIdLegacy).lower()
+            dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
+            dev_bdf = str(dev_pci_info.busIdLegacy).lower()
+
+            dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
 
-            dev_is_vgpu = False
-            if dev_bdf:
-                dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_node_affinity = pyixml.nvmlDeviceGetMemoryAffinity(
+                    dev,
+                    get_numa_nodeset_size(),
+                    pyixml.NVML_AFFINITY_SCOPE_NODE,
+                )
+                dev_numa = bitmask_to_str(list(dev_node_affinity))
 
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
+                "bdf": dev_bdf,
+                "numa": dev_numa,
             }
-            if dev_bdf:
-                dev_appendix["bdf"] = dev_bdf
 
             ret.append(
                 Device(
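
The vGPU check becomes unconditional here because dev_bdf is no longer optional. get_physical_function_by_bdf is a package utility not shown in this diff; on Linux, the standard way to resolve a virtual function to its physical function is the sysfs physfn symlink, roughly:

    from pathlib import Path

    def physical_function_by_bdf(bdf: str) -> str:
        # Illustrative only: SR-IOV virtual functions carry a "physfn"
        # symlink to their physical function; PFs map to themselves.
        physfn = Path(f"/sys/bus/pci/devices/{bdf}/physfn")
        if physfn.exists():
            return physfn.resolve().name  # e.g. "0000:3b:00.0"
        return bdf

    # A device is then treated as a vGPU exactly when
    # physical_function_by_bdf(bdf) != bdf.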
@@ -247,36 +259,11 @@ class IluvatarDetector(Detector):
             for i, dev_i in enumerate(devices):
                 dev_i_handle = pyixml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
 
-                # Get affinity with PCIe BDF if possible.
-                if dev_i_bdf := dev_i.appendix.get("bdf", ""):
-                    ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
-                        dev_i_bdf,
-                    )
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
-                # Otherwise, get affinity via IXML.
-                if not ret.devices_cpu_affinities[i]:
-                    # Get NUMA affinity.
-                    try:
-                        dev_i_memset = pyixml.nvmlDeviceGetMemoryAffinity(
-                            dev_i_handle,
-                            get_numa_nodeset_size(),
-                            pyixml.NVML_AFFINITY_SCOPE_NODE,
-                        )
-                        ret.devices_numa_affinities[i] = bitmask_to_str(
-                            list(dev_i_memset),
-                        )
-                    except pyixml.NVMLError:
-                        debug_log_exception(
-                            logger,
-                            "Failed to get NUMA affinity for device %d",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
 
                 # Get distances to other devices.
                 for j, dev_j in enumerate(devices):
@@ -302,9 +289,6 @@ class IluvatarDetector(Detector):
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
-        except pyixml.NVMLError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
gpustack_runtime/detector/metax.py
@@ -1,7 +1,8 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import logging
 from functools import lru_cache
+from pathlib import Path
 
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
@@ -48,7 +49,7 @@ class MetaXDetector(Detector):
     """
 
    @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def is_supported() -> bool:
         """
         Check if the MetaX detector is supported.
@@ -76,7 +77,7 @@ class MetaXDetector(Detector):
         return supported
 
    @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=MetaX.
         pci_devs = get_pci_devices(vendor="0x9999")
@@ -124,7 +125,6 @@ class MetaXDetector(Detector):
             dev_name = dev_info.deviceName
             if dev_info.mode == pymxsml.MXSML_VIRTUALIZATION_MODE_PF:
                 continue
-            dev_is_vgpu = dev_info.mode == pymxsml.MXSML_VIRTUALIZATION_MODE_VF
 
             dev_core_util = pymxsml.mxSmlGetDeviceIpUsage(
                 dev_idx,
@@ -165,10 +165,28 @@ class MetaXDetector(Detector):
                 // 1000  # mW to W
             )
 
+            dev_bdf = dev_info.bdfId
+            dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
+
+            dev_is_vgpu = dev_info.mode == pymxsml.MXSML_VIRTUALIZATION_MODE_VF
+
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_node_affinity = pymxsml.mxSmlGetNodeAffinity(
+                    dev_idx,
+                    get_numa_nodeset_size(),
+                )
+                dev_numa = bitmask_to_str(list(dev_node_affinity))
+
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
-                "bdf": dev_info.bdfId,
+                "bdf": dev_bdf,
+                "numa": dev_numa,
             }
+            if dev_card_id is not None:
+                dev_appendix["card_id"] = dev_card_id
+            if dev_renderd_id is not None:
+                dev_appendix["renderd_id"] = dev_renderd_id
 
             ret.append(
                 Device(
@@ -226,35 +244,11 @@ class MetaXDetector(Detector):
             pymxsml.mxSmlInit()
 
             for i, dev_i in enumerate(devices):
-                # Get affinity with PCIe BDF if possible.
-                if dev_i_bdf := dev_i.appendix.get("bdf", ""):
-                    ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
-                        dev_i_bdf,
-                    )
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
-                # Otherwise, get affinity by MXSML.
-                if not ret.devices_cpu_affinities[i]:
-                    # Get NUMA affinity.
-                    try:
-                        dev_i_nodeaff = pymxsml.mxSmlGetNodeAffinity(
-                            dev_i.index,
-                            get_numa_nodeset_size(),
-                        )
-                        ret.devices_numa_affinities[i] = bitmask_to_str(
-                            list(dev_i_nodeaff),
-                        )
-                    except pymxsml.MXSMLError:
-                        debug_log_warning(
-                            logger,
-                            "Failed to get device %d NUMA node affinity",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
 
                 # Get distances to other devices.
                 for j, dev_j in enumerate(devices):
@@ -281,11 +275,34 @@ class MetaXDetector(Detector):
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
-        except pymxsml.MXSMLError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
 
         return ret
+
+
+def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
+    """
+    Get the card ID and renderD ID for a given device bdf.
+
+    Args:
+        dev_bdf:
+            The device bdf.
+
+    Returns:
+        A tuple of (card_id, renderd_id).
+
+    """
+    card_id = None
+    renderd_id = None
+
+    drm_path = Path(f"/sys/module/metax/drivers/pci:metax/{dev_bdf}/drm")
+    if drm_path.exists():
+        for dir_path in drm_path.iterdir():
+            if dir_path.name.startswith("card"):
+                card_id = int(dir_path.name[4:])
+            elif dir_path.name.startswith("renderD"):
+                renderd_id = int(dir_path.name[7:])
+
+    return card_id, renderd_id
gpustack_runtime/detector/mthreads.py
@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import logging
 from functools import lru_cache
@@ -47,7 +47,7 @@ class MThreadsDetector(Detector):
     """
 
    @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def is_supported() -> bool:
         """
         Check if the MThreads detector is supported.
@@ -76,7 +76,7 @@ class MThreadsDetector(Detector):
         return supported
 
    @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Moore+Threads.
         pci_devs = get_pci_devices(vendor="0x1ed5")
@@ -117,6 +117,7 @@ class MThreadsDetector(Detector):
             dev_cores = 0
             dev_power_used = None
             dev_pci_info = None
+            dev_is_vgpu = False
             dev = pymtml.mtmlLibraryInitDeviceByIndex(dev_idx)
             try:
                 dev_props = pymtml.mtmlDeviceGetProperty(dev)
@@ -163,9 +164,20 @@ class MThreadsDetector(Detector):
 
             dev_bdf = f"{dev_pci_info.segment:04x}:{dev_pci_info.bus:02x}:{dev_pci_info.device:02x}.0"
 
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_node_affinity = pymtml.mtmlDeviceGetMemoryAffinityWithinNode(
+                    dev,
+                    get_numa_nodeset_size(),
+                )
+                dev_numa = bitmask_to_str(
+                    list(dev_node_affinity),
+                )
+
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
                 "bdf": dev_bdf,
+                "numa": dev_numa,
             }
 
             ret.append(
@@ -228,35 +240,24 @@ class MThreadsDetector(Detector):
                 dev_i_handle = pymtml.mtmlLibraryInitDeviceByIndex(dev_i.index)
 
                 try:
-                    # Get affinity with PCIe BDF if possible.
-                    if dev_i_bdf := dev_i.appendix.get("bdf", ""):
-                        ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
-                            dev_i_bdf,
-                        )
-                        ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                            ret.devices_numa_affinities[i],
-                        )
-                    # Otherwise, get affinity via MTML.
-                    if not ret.devices_cpu_affinities[i]:
-                        # Get NUMA affinity.
-                        try:
-                            dev_i_memset = pymtml.mtmlDeviceGetMemoryAffinityWithinNode(
-                                dev_i_handle,
-                                get_numa_nodeset_size(),
-                            )
-                            ret.devices_numa_affinities[i] = bitmask_to_str(
-                                list(dev_i_memset),
-                            )
-                        except pymtml.MTMLError:
-                            debug_log_warning(
-                                logger,
-                                "Failed to get NUMA affinity for device %d",
-                                dev_i.index,
-                            )
-                        # Get CPU affinity.
-                        ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                            ret.devices_numa_affinities[i],
-                        )
+                    # Get NUMA and CPU affinities.
+                    ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                        ret.devices_numa_affinities[i],
+                    )
+
+                    # Get links state if applicable.
+                    if dev_i_links_state := _get_links_state(dev_i_handle):
+                        ret.appendices[i].update(dev_i_links_state)
+                        # In practice, if a card has an active *Link,
+                        # then other cards in the same machine should be interconnected with it through the *Link.
+                        if dev_i_links_state.get("links_active_count", 0) > 0:
+                            for j, dev_j in enumerate(devices):
+                                if dev_i.index == dev_j.index:
+                                    continue
+                                ret.devices_distances[i][j] = TopologyDistanceEnum.LINK
+                                ret.devices_distances[j][i] = TopologyDistanceEnum.LINK
+                            continue
 
                     # Get distances to other devices.
                     for j, dev_j in enumerate(devices):
@@ -278,7 +279,6 @@ class MThreadsDetector(Detector):
                                 topo,
                                 distance,
                             )
-                            # TODO(thxCode): Support LINK distance.
                         except pymtml.MTMLError:
                             debug_log_warning(
                                 logger,
@@ -295,9 +295,6 @@ class MThreadsDetector(Detector):
                 finally:
                     pymtml.mtmlLibraryFreeDevice(dev_i_handle)
 
-        except pymtml.MTMLError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
@@ -305,3 +302,44 @@ class MThreadsDetector(Detector):
             pymtml.mtmlLibraryShutDown()
 
         return ret
+
+
+def _get_links_state(
+    dev: pymtml.c_mtmlDevice_t,
+) -> dict | None:
+    """
+    Get the MTLink links count and state for a device.
+
+    Args:
+        dev:
+            The MTLink device handle.
+
+    Returns:
+        A dict includes links state or None if failed.
+
+    """
+    dev_links_count = 0
+    try:
+        dev_link_spec = pymtml.mtmlDeviceGetMtLinkSpec(dev)
+        dev_links_count = dev_link_spec.linkNum
+    except pymtml.MTMLError:
+        debug_log_warning(logger, "Failed to get MTLink links count")
+    if not dev_links_count:
+        return None
+
+    dev_links_state = 0
+    dev_links_active_count = 0
+    try:
+        for link_idx in range(int(dev_links_count)):
+            dev_link_state = pymtml.mtmlDeviceGetMtLinkState(dev, link_idx)
+            if dev_link_state == pymtml.MTML_MTLINK_STATE_UP:
+                dev_links_state |= 1 << link_idx
+                dev_links_active_count += 1
+    except pymtml.MTMLError:
+        debug_log_warning(logger, "Failed to get MTLink link state")
+
+    return {
+        "links_count": dev_links_count,
+        "links_state": dev_links_state,
+        "links_active_count": dev_links_active_count,
+    }
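
The returned links_state packs per-link status into a bitmask, one bit per link index, so the appendix records both which links are up and how many. For example, on a device with four MTLink ports where links 0 and 2 are up:

    # Standalone illustration of the bitmask packing above.
    UP, DOWN = 1, 0            # stand-ins for the MTML state constants
    states = [UP, DOWN, UP, DOWN]

    links_state = 0
    links_active_count = 0
    for link_idx, state in enumerate(states):
        if state == UP:
            links_state |= 1 << link_idx
            links_active_count += 1

    print(bin(links_state), links_active_count)  # 0b101 2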