PyPI - gpustack-runtime - Versions diffs - 0.1.40.post1__py3-none-any.whl → 0.1.41__py3-none-any.whl - Mend

gpustack-runtime 0.1.40.post1__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66)

gpustack_runtime/__init__.py +1 -1
gpustack_runtime/__main__.py +5 -3
gpustack_runtime/_version.py +2 -2
gpustack_runtime/_version_appendix.py +1 -1
gpustack_runtime/cmds/__init__.py +5 -3
gpustack_runtime/cmds/__types__.py +1 -1
gpustack_runtime/cmds/deployer.py +140 -18
gpustack_runtime/cmds/detector.py +1 -1
gpustack_runtime/cmds/images.py +1 -1
gpustack_runtime/deployer/__init__.py +28 -2
gpustack_runtime/deployer/__patches__.py +1 -1
gpustack_runtime/deployer/__types__.py +2 -1
gpustack_runtime/deployer/__utils__.py +2 -2
gpustack_runtime/deployer/cdi/__init__.py +85 -5
gpustack_runtime/deployer/cdi/__types__.py +92 -29
gpustack_runtime/deployer/cdi/__utils__.py +178 -0
gpustack_runtime/deployer/cdi/amd.py +146 -0
gpustack_runtime/deployer/cdi/ascend.py +164 -0
gpustack_runtime/deployer/cdi/hygon.py +147 -0
gpustack_runtime/deployer/cdi/iluvatar.py +136 -0
gpustack_runtime/deployer/cdi/metax.py +148 -0
gpustack_runtime/deployer/cdi/thead.py +57 -23
gpustack_runtime/deployer/docker.py +9 -8
gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +240 -0
gpustack_runtime/deployer/k8s/deviceplugin/__types__.py +131 -0
gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +586 -0
gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py +3 -0
gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto +212 -0
gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py +86 -0
gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi +168 -0
gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py +358 -0
gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py +34 -0
gpustack_runtime/deployer/kuberentes.py +37 -4
gpustack_runtime/deployer/podman.py +9 -8
gpustack_runtime/detector/__init__.py +42 -5
gpustack_runtime/detector/__types__.py +8 -24
gpustack_runtime/detector/__utils__.py +46 -39
gpustack_runtime/detector/amd.py +55 -66
gpustack_runtime/detector/ascend.py +29 -41
gpustack_runtime/detector/cambricon.py +3 -3
gpustack_runtime/detector/hygon.py +21 -49
gpustack_runtime/detector/iluvatar.py +44 -60
gpustack_runtime/detector/metax.py +54 -37
gpustack_runtime/detector/mthreads.py +74 -36
gpustack_runtime/detector/nvidia.py +130 -93
gpustack_runtime/detector/pyacl/__init__.py +1 -1
gpustack_runtime/detector/pyamdgpu/__init__.py +1 -1
gpustack_runtime/detector/pyamdsmi/__init__.py +1 -1
gpustack_runtime/detector/pycuda/__init__.py +1 -1
gpustack_runtime/detector/pydcmi/__init__.py +1 -1
gpustack_runtime/detector/pyhsa/__init__.py +1 -1
gpustack_runtime/detector/pymxsml/__init__.py +1553 -1
gpustack_runtime/detector/pyrocmcore/__init__.py +1 -1
gpustack_runtime/detector/pyrocmsmi/__init__.py +1 -1
gpustack_runtime/detector/thead.py +41 -60
gpustack_runtime/envs.py +104 -12
gpustack_runtime/logging.py +6 -2
{gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/METADATA +6 -1
gpustack_runtime-0.1.41.dist-info/RECORD +67 -0
gpustack_runtime/detector/pymxsml/mxsml.py +0 -1580
gpustack_runtime/detector/pymxsml/mxsml_extension.py +0 -816
gpustack_runtime/detector/pymxsml/mxsml_mcm.py +0 -476
gpustack_runtime-0.1.40.post1.dist-info/RECORD +0 -55
{gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/WHEEL +0 -0
{gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/entry_points.txt +0 -0
{gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/licenses/LICENSE +0 -0

gpustack_runtime/detector/__types__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -122,28 +122,6 @@ def backend_to_manufacturer(backend: str) -> ManufacturerEnum:
     return ManufacturerEnum.UNKNOWN
-def supported_manufacturers() -> list[ManufacturerEnum]:
-    """
-    Get a list of supported manufacturers.
-    Returns:
-        A list of supported manufacturers.
-    """
-    return list(_MANUFACTURER_BACKEND_MAPPING.keys())
-def supported_backends() -> list[str]:
-    """
-    Get a list of supported backends.
-    Returns:
-        A list of supported backends.
-    """
-    return list(_MANUFACTURER_BACKEND_MAPPING.values())
 @dataclass_json
 @dataclass
 class Device:
@@ -258,6 +236,11 @@ class Topology:
     A list representing the NUMA affinity associated with each device.
     The value at index i represents the Memory set for device i.
     """
+    appendices: list[dict[str, Any]]
+    """
+    Appendices information of devices.
+    Each entry corresponds to a device and contains additional metadata.
+    """
     def __init__(
         self,
@@ -278,6 +261,7 @@ class Topology:
         self.devices_distances = [[0] * devices_count for _ in range(devices_count)]
         self.devices_cpu_affinities = [""] * devices_count
         self.devices_numa_affinities = [""] * devices_count
+        self.appendices = [{}] * devices_count
     def stringify(self) -> list[list[str]]:
         """
@@ -501,7 +485,7 @@ class Detector(ABC):
         """
         raise NotImplementedError
-    def get_topology(self, devices: Devices | None = None) -> Topology | None:  # noqa: ARG002
+    def get_topology(self, devices: Devices | None = None) -> Topology | None:
         """
         Get the Topology object between the given devices.

gpustack_runtime/detector/__utils__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 import contextlib
 import os
@@ -746,7 +746,7 @@ def get_numa_node_cpu_mapping() -> dict[int, list[int]]:
     return numa_cpu_mapping
-@lru_cache(maxsize=128)
+@lru_cache
 def get_numa_node_by_bdf(bdf: str) -> str:
     """
     Get the NUMA node for a given PCI device BDF (Bus:Device.Function) address.
@@ -792,20 +792,7 @@ def map_cpu_affinity_to_numa_node(cpu_affinity: int | str | None) -> str:
     else:
         if not cpu_affinity:
             return ""
-        cpu_indices: list[int] = []
-        for part in cpu_affinity.split(","):
-            if "-" in part:
-                lo, hi = part.split("-")
-                lo_idx = safe_int(lo, -1)
-                hi_idx = safe_int(hi, -1)
-                if lo_idx == -1 or hi_idx == -1 or lo_idx > hi_idx:
-                    continue
-                cpu_indices.extend(list(range(lo_idx, hi_idx + 1)))
-            else:
-                idx = safe_int(part, -1)
-                if idx == -1:
-                    continue
-                cpu_indices.append(idx)
+        cpu_indices: list[int] = str_range_to_list(cpu_affinity)
     cpu_numa_mapping = get_cpu_numa_node_mapping()
@@ -818,7 +805,7 @@ def map_cpu_affinity_to_numa_node(cpu_affinity: int | str | None) -> str:
     if not numa_nodes:
         return ""
-    return list_to_range_str(sorted(numa_nodes))
+    return list_to_str_range(sorted(numa_nodes))
 @lru_cache
@@ -843,20 +830,7 @@ def map_numa_node_to_cpu_affinity(numa_node: int | str | None) -> str:
     else:
         if not numa_node:
             return ""
-        numa_indices: list[int] = []
-        for part in numa_node.split(","):
-            if "-" in part:
-                lo, hi = part.split("-")
-                lo_idx = safe_int(lo, -1)
-                hi_idx = safe_int(hi, -1)
-                if lo_idx == -1 or hi_idx == -1 or lo_idx > hi_idx:
-                    continue
-                numa_indices.extend(list(range(lo_idx, hi_idx + 1)))
-            else:
-                idx = safe_int(part, -1)
-                if idx == -1:
-                    continue
-                numa_indices.append(idx)
+        numa_indices: list[int] = str_range_to_list(numa_node)
     numa_cpu_mapping = get_numa_node_cpu_mapping()
@@ -867,7 +841,7 @@ def map_numa_node_to_cpu_affinity(numa_node: int | str | None) -> str:
     if not cpu_cores:
         return ""
-    return list_to_range_str(sorted(cpu_cores))
+    return list_to_str_range(sorted(cpu_cores))
 def bitmask_to_list(bitmask: int, offset: int = 0) -> list[int]:
@@ -889,7 +863,7 @@ def bitmask_to_list(bitmask: int, offset: int = 0) -> list[int]:
     return indices
-def list_to_range_str(indices: list[int]) -> str:
+def list_to_str_range(indices: list[int]) -> str:
     """
     Convert a list of indices to a comma-separated string with ranges.
@@ -919,15 +893,48 @@ def list_to_range_str(indices: list[int]) -> str:
             start, end = i, i
     ranges.append((start, end))
-    range_str_parts: list[str] = []
+    str_range_parts: list[str] = []
     for start, end in ranges:
         if start == end:
-            range_str_parts.append(f"{start}")
+            str_range_parts.append(f"{start}")
         else:
-            range_str_parts.append(f"{start}-{end}")
-    range_str = ",".join(range_str_parts)
+            str_range_parts.append(f"{start}-{end}")
+    str_range = ",".join(str_range_parts)
+    return str_range
+def str_range_to_list(str_range: str) -> list[int]:
+    """
+    Convert a comma-separated string with ranges to a list of indices.
+    Args:
+        str_range:
+            A comma-separated string with ranges (e.g., "0,2-4,6").
+    Returns:
+        A list of indices.
+    """
+    str_range_parts = str_range.split(",")
+    indices: set[int] = set()
+    for _part in str_range_parts:
+        part = _part.strip()
+        if "-" in part:
+            lo, hi = part.split("-")
+            lo_idx = safe_int(lo, -1)
+            hi_idx = safe_int(hi, -1)
+            if lo_idx == -1 or hi_idx == -1 or lo_idx > hi_idx:
+                continue
+            indices.update(range(lo_idx, hi_idx + 1))
+        else:
+            idx = safe_int(part, -1)
+            if idx == -1:
+                continue
+            indices.add(idx)
-    return range_str
+    return sorted(indices)
 def bitmask_to_str(bitmask_list: list) -> str:
@@ -950,7 +957,7 @@ def bitmask_to_str(bitmask_list: list) -> str:
             bits_lists.extend(bitmask_to_list(bitmask, offset))
         offset += get_bits_size()
-    return list_to_range_str(sorted(bits_lists))
+    return list_to_str_range(sorted(bits_lists))
 def get_physical_function_by_bdf(bdf: str) -> str:

gpustack_runtime/detector/amd.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 import contextlib
 import logging
@@ -30,7 +30,7 @@ class AMDDetector(Detector):
     """
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def is_supported() -> bool:
         """
         Check if the AMD detector is supported.
@@ -59,7 +59,7 @@ class AMDDetector(Detector):
         return supported
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=AMD.
         pci_devs = get_pci_devices(vendor="0x1002")
@@ -108,11 +108,7 @@ class AMDDetector(Detector):
                     asic_serial = dev_gpu_asic_info.get("asic_serial")
                     dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
                 else:
-                    dev_uuid = ""
-                    with contextlib.suppress(pyrocmsmi.ROCMSMIError):
-                        dev_uuid = (
-                            f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
-                        )
+                    dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
                 dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
                 dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
@@ -132,12 +128,8 @@ class AMDDetector(Detector):
                                 dev_idx,
                             )
-                dev_bdf = None
-                dev_card_id = None
-                dev_renderd_id = None
-                with contextlib.suppress(pyamdsmi.AmdSmiException):
-                    dev_bdf = pyamdsmi.amdsmi_get_gpu_device_bdf(dev)
-                    dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
+                dev_bdf = pyamdsmi.amdsmi_get_gpu_device_bdf(dev)
+                dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
                 dev_cores = dev_hsa_agent.compute_units
                 dev_asic_family_id = dev_hsa_agent.asic_family_id
@@ -205,27 +197,25 @@ class AMDDetector(Detector):
                         dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
                         dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
-                dev_is_vgpu = False
-                if dev_bdf:
-                    dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+                dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+                dev_numa = get_numa_node_by_bdf(dev_bdf)
+                if not dev_numa:
+                    dev_numa = str(pyamdsmi.amdsmi_topo_get_numa_node_number(dev))
                 dev_appendix = {
                     "arch_family": _get_arch_family(dev_asic_family_id),
                     "vgpu": dev_is_vgpu,
+                    "bdf": dev_bdf,
+                    "numa": dev_numa,
                 }
-                if dev_bdf:
-                    dev_appendix["bdf"] = dev_bdf
                 if dev_card_id is not None:
                     dev_appendix["card_id"] = dev_card_id
                 if dev_renderd_id is not None:
                     dev_appendix["renderd_id"] = dev_renderd_id
-                with contextlib.suppress(pyamdsmi.AmdSmiException):
-                    dev_xgmi = pyamdsmi.amdsmi_get_xgmi_info(dev)
-                    if xgmi_lanes := dev_xgmi.get("xgmi_lanes", None):
-                        dev_appendix["xgmi_lanes"] = xgmi_lanes
-                        dev_appendix["xgmi_hive_id"] = dev_xgmi.get("xgmi_hive_id")
-                        dev_appendix["xgmi_node_id"] = dev_xgmi.get("xgmi_node_id")
+                if dev_xgmi_info := _get_xgmi_info(dev):
+                    dev_appendix.update(dev_xgmi_info)
                 ret.append(
                     Device(
@@ -285,9 +275,9 @@ class AMDDetector(Detector):
         devs_mapping = None
         def get_device_handle(dev: Device):
-            if bdf := dev.appendix.get("bdf", None):
-                with contextlib.suppress(pyamdsmi.AmdSmiException):
-                    return pyamdsmi.amdsmi_get_processor_handle_from_bdf(bdf)
+            with contextlib.suppress(pyamdsmi.AmdSmiException):
+                bdf = dev.appendix["bdf"]
+                return pyamdsmi.amdsmi_get_processor_handle_from_bdf(bdf)
             nonlocal devs_mapping
             if devs_mapping is None:
                 devs = pyamdsmi.amdsmi_get_processor_handles()
@@ -295,7 +285,7 @@ class AMDDetector(Detector):
             return devs_mapping.get(dev.index)
         try:
-            pci_devices = self.detect_pci_devices()
+            pci_devs = self.detect_pci_devices()
             def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
                 """
@@ -311,8 +301,8 @@ class AMDDetector(Detector):
                     The TopologyDistanceEnum representing the distance.
                 """
-                pcid_a = pci_devices.get(bdf_a, None)
-                pcid_b = pci_devices.get(bdf_b, None)
+                pcid_a = pci_devs.get(bdf_a, None)
+                pcid_b = pci_devs.get(bdf_b, None)
                 score = compare_pci_devices(pcid_a, pcid_b)
                 if score > 0:
@@ -323,41 +313,16 @@ class AMDDetector(Detector):
             pyamdsmi.amdsmi_init()
-            # Get NUMA and CPU affinities.
-            for i, dev_i in enumerate(devices):
-                # Get affinity with PCIe BDF if possible.
-                if dev_i_bdf := dev_i.appendix.get("bdf", ""):
-                    ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
-                        dev_i_bdf,
-                    )
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
-                # Otherwise, get affinity via AMD SMI.
-                if not ret.devices_cpu_affinities[i]:
-                    dev_i_handle = get_device_handle(dev_i)
-                    # Get NUMA affinity.
-                    try:
-                        dev_i_numa_node = pyamdsmi.amdsmi_topo_get_numa_node_number(
-                            dev_i_handle,
-                        )
-                        ret.devices_numa_affinities[i] = str(dev_i_numa_node)
-                    except pyamdsmi.AmdSmiException:
-                        debug_log_exception(
-                            logger,
-                            "Failed to get NUMA affinity for device %d",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
-            # Get distances to other devices.
             for i, dev_i in enumerate(devices):
                 dev_i_handle = get_device_handle(dev_i)
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
+                # Get distances to other devices.
                 for j, dev_j in enumerate(devices):
                     if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                         continue
@@ -402,9 +367,6 @@ class AMDDetector(Detector):
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
-        except pyamdsmi.AmdSmiException:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
@@ -465,6 +427,7 @@ def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
     """
     card_id = None
     renderd_id = None
     drm_path = Path(f"/sys/module/amdgpu/drivers/pci:amdgpu/{dev_bdf}/drm")
     if drm_path.exists():
         for dir_path in drm_path.iterdir():
@@ -474,3 +437,29 @@ def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
                 renderd_id = int(dir_path.name[7:])
     return card_id, renderd_id
+def _get_xgmi_info(dev) -> dict | None:
+    """
+    Get the XGMI information for a given device.
+    Args:
+        dev:
+            The device handle.
+    Returns:
+        A dictionary containing XGMI information, or None if not available.
+    """
+    try:
+        dev_xgmi = pyamdsmi.amdsmi_get_xgmi_info(dev)
+        if xgmi_lanes := dev_xgmi.get("xgmi_lanes", None):
+            return {
+                "xgmi_lanes": xgmi_lanes,
+                "xgmi_hive_id": dev_xgmi.get("xgmi_hive_id"),
+                "xgmi_node_id": dev_xgmi.get("xgmi_node_id"),
+            }
+    except pyamdsmi.AmdSmiException:
+        debug_log_exception(logger, "Failed to get XGMI information")
+    return None

gpustack_runtime/detector/ascend.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 import contextlib
 import logging
@@ -47,7 +47,7 @@ class AscendDetector(Detector):
     """
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def is_supported() -> bool:
         """
         Check if the Ascend detector is supported.
@@ -75,7 +75,7 @@ class AscendDetector(Detector):
         return supported
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Huawei.
         pci_devs = get_pci_devices(vendor="0x19e5")
@@ -184,12 +184,29 @@ class AscendDetector(Detector):
                     if dev_power_used:
                         dev_power_used = dev_power_used / 10  # 0.1W to W
+                    dev_bdf = pydcmi.dcmi_get_device_bdf(
+                        dev_card_id,
+                        dev_device_id,
+                    )
+                    dev_numa = get_numa_node_by_bdf(dev_bdf)
+                    if not dev_numa:
+                        dev_cpu_affinity = (
+                            pydcmi.dcmi_get_affinity_cpu_info_by_device_id(
+                                dev_card_id,
+                                dev_device_id,
+                            )
+                        )
+                        dev_numa = map_cpu_affinity_to_numa_node(dev_cpu_affinity)
                     dev_appendix = {
                         "arch_family": (
                             pyacl.aclrtGetSocName()
                             or _guess_soc_name_from_dev_name(dev_name)
                         ),
                         "vgpu": dev_is_vgpu,
+                        "bdf": dev_bdf,
+                        "numa": dev_numa,
                         "card_id": dev_card_id,
                         "device_id": dev_device_id,
                         "device_id_max": device_num_in_card - 1,
@@ -208,13 +225,6 @@ class AscendDetector(Detector):
                     if dev_roce_gateway:
                         dev_appendix["roce_gateway"] = str(dev_roce_gateway)
-                    with contextlib.suppress(pydcmi.DCMIError):
-                        dev_bdf = pydcmi.dcmi_get_device_bdf(
-                            dev_card_id,
-                            dev_device_id,
-                        )
-                        dev_appendix["bdf"] = dev_bdf
                     ret.append(
                         Device(
                             manufacturer=self.manufacturer,
@@ -270,44 +280,22 @@ class AscendDetector(Detector):
             pydcmi.dcmi_init()
             for i, dev_i in enumerate(devices):
-                dev_i_card_id = dev_i.appendix["card_id"]
-                dev_i_device_id = dev_i.appendix["device_id"]
+                dev_i_card_id = dev_i.appendix.get("card_id", i)
+                dev_i_device_id = dev_i.appendix.get("device_id", 0)
-                # Get affinity with PCIe BDF if possible.
-                if dev_i_bdf := dev_i.appendix.get("bdf", ""):
-                    ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
-                        dev_i_bdf,
-                    )
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
-                # Otherwise, get affinity via DCMI.
-                if not ret.devices_cpu_affinities[i]:
-                    # Get CPU affinity.
-                    try:
-                        cpu_affinity = pydcmi.dcmi_get_affinity_cpu_info_by_device_id(
-                            dev_i.appendix["card_id"],
-                            dev_i.appendix["device_id"],
-                        )
-                        ret.devices_cpu_affinities[i] = cpu_affinity
-                    except pydcmi.DCMIError:
-                        debug_log_exception(
-                            slogger,
-                            "Failed to get CPU affinity for device %d",
-                            dev_i.index,
-                        )
-                    # Get NUMA affinity.
-                    ret.devices_numa_affinities[i] = map_cpu_affinity_to_numa_node(
-                        ret.devices_cpu_affinities[i],
-                    )
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
                 # Get distances to other devices.
                 for j, dev_j in enumerate(devices):
                     if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                         continue
-                    dev_j_card_id = dev_j.appendix["card_id"]
-                    dev_j_device_id = dev_j.appendix["device_id"]
+                    dev_j_card_id = dev_j.appendix.get("card_id", j)
+                    dev_j_device_id = dev_j.appendix.get("device_id", 0)
                     # If two devices are the same card,
                     # skip distance calculation.

gpustack_runtime/detector/cambricon.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 import json
 import logging
@@ -26,7 +26,7 @@ class CambriconDetector(Detector):
     """
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def is_supported() -> bool:
         """
         Check if the Cambricon detector is supported.
@@ -50,7 +50,7 @@ class CambriconDetector(Detector):
         return supported
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Cambricon.
         pci_devs = get_pci_devices(vendor="0xcabc")

gpustack-runtime 0.1.40.post1__py3-none-any.whl → 0.1.41__py3-none-any.whl

gpustack-runtime 0.1.40.post1__py3-none-any.whl → 0.1.41__py3-none-any.whl