gpustack-runtime 0.1.39.post2__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. gpustack_runtime/__main__.py +7 -3
  2. gpustack_runtime/_version.py +2 -2
  3. gpustack_runtime/_version_appendix.py +1 -1
  4. gpustack_runtime/cmds/__init__.py +2 -0
  5. gpustack_runtime/cmds/deployer.py +84 -2
  6. gpustack_runtime/cmds/images.py +2 -0
  7. gpustack_runtime/deployer/__init__.py +2 -0
  8. gpustack_runtime/deployer/__types__.py +52 -28
  9. gpustack_runtime/deployer/__utils__.py +99 -112
  10. gpustack_runtime/deployer/cdi/__init__.py +81 -0
  11. gpustack_runtime/deployer/cdi/__types__.py +667 -0
  12. gpustack_runtime/deployer/cdi/thead.py +103 -0
  13. gpustack_runtime/deployer/docker.py +42 -24
  14. gpustack_runtime/deployer/kuberentes.py +8 -4
  15. gpustack_runtime/deployer/podman.py +41 -23
  16. gpustack_runtime/detector/__init__.py +62 -3
  17. gpustack_runtime/detector/__types__.py +11 -0
  18. gpustack_runtime/detector/__utils__.py +23 -0
  19. gpustack_runtime/detector/amd.py +17 -9
  20. gpustack_runtime/detector/hygon.py +6 -1
  21. gpustack_runtime/detector/iluvatar.py +20 -5
  22. gpustack_runtime/detector/mthreads.py +8 -12
  23. gpustack_runtime/detector/nvidia.py +365 -168
  24. gpustack_runtime/detector/pyacl/__init__.py +9 -1
  25. gpustack_runtime/detector/pyamdgpu/__init__.py +8 -0
  26. gpustack_runtime/detector/pycuda/__init__.py +9 -1
  27. gpustack_runtime/detector/pydcmi/__init__.py +9 -2
  28. gpustack_runtime/detector/pyhgml/__init__.py +5879 -0
  29. gpustack_runtime/detector/pyhgml/libhgml.so +0 -0
  30. gpustack_runtime/detector/pyhgml/libuki.so +0 -0
  31. gpustack_runtime/detector/pyhsa/__init__.py +9 -0
  32. gpustack_runtime/detector/pyixml/__init__.py +89 -164
  33. gpustack_runtime/detector/pyrocmcore/__init__.py +42 -24
  34. gpustack_runtime/detector/pyrocmsmi/__init__.py +141 -138
  35. gpustack_runtime/detector/thead.py +733 -0
  36. gpustack_runtime/envs.py +128 -55
  37. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/METADATA +4 -2
  38. gpustack_runtime-0.1.40.dist-info/RECORD +55 -0
  39. gpustack_runtime/detector/pymtml/__init__.py +0 -770
  40. gpustack_runtime-0.1.39.post2.dist-info/RECORD +0 -49
  41. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/WHEEL +0 -0
  42. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/entry_points.txt +0 -0
  43. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/nvidia.py

@@ -2,9 +2,10 @@ from __future__ import annotations
 
 import contextlib
 import logging
+import math
+import time
 from _ctypes import byref
 from functools import lru_cache
-from math import ceil
 
 import pynvml
 
@@ -125,103 +126,104 @@ class NVIDIADetector(Detector):
         for dev_idx in range(dev_count):
             dev = pynvml.nvmlDeviceGetHandleByIndex(dev_idx)
 
-            dev_index = dev_idx
-            if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
-                if dev_files is None:
-                    dev_files = get_device_files(pattern=r"nvidia(?P<number>\d+)")
-                if len(dev_files) >= dev_count:
-                    dev_file = dev_files[dev_idx]
-                    if dev_file.number is not None:
-                        dev_index = dev_file.number
-            dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
-
-            dev_cores = None
-            if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
-                with contextlib.suppress(pycuda.CUDAError):
-                    dev_gpudev = pycuda.cuDeviceGet(dev_idx)
-                    dev_cores = pycuda.cuDeviceGetAttribute(
-                        dev_gpudev,
-                        pycuda.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-                    )
+            dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
+            dev_cc = ".".join(map(str, dev_cc_t))
 
-            dev_mem = 0
-            dev_mem_used = 0
+            dev_bdf = None
             with contextlib.suppress(pynvml.NVMLError):
-                dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
-                dev_mem = byte_to_mebibyte(  # byte to MiB
-                    dev_mem_info.total,
-                )
-                dev_mem_used = byte_to_mebibyte(  # byte to MiB
-                    dev_mem_info.used,
-                )
-            if dev_mem == 0:
-                dev_mem, dev_mem_used = get_memory()
+                dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
+                dev_bdf = str(dev_pci_info.busIdLegacy).lower()
 
-            dev_cores_util = None
+            dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
             with contextlib.suppress(pynvml.NVMLError):
-                dev_util_rates = pynvml.nvmlDeviceGetUtilizationRates(dev)
-                dev_cores_util = dev_util_rates.gpu
-            if dev_cores_util is None:
-                debug_log_warning(
-                    logger,
-                    "Failed to get device %d cores utilization, setting to 0",
-                    dev_index,
-                )
-                dev_cores_util = 0
+                dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
 
-            dev_temp = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_temp = pynvml.nvmlDeviceGetTemperature(
-                    dev,
-                    pynvml.NVML_TEMPERATURE_GPU,
-                )
+            # With MIG disabled, treat as a single device.
 
-            dev_power = None
-            dev_power_used = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
-                dev_power = dev_power // 1000  # mW to W
-                dev_power_used = (
-                    pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
-                )  # mW to W
+            if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
+                dev_index = dev_idx
+                if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+                    if dev_files is None:
+                        dev_files = get_device_files(
+                            pattern=r"nvidia(?P<number>\d+)",
+                        )
+                    if len(dev_files) >= dev_count:
+                        dev_file = dev_files[dev_idx]
+                        if dev_file.number is not None:
+                            dev_index = dev_file.number
 
-            dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
-            dev_cc = ".".join(map(str, dev_cc_t))
+                dev_name = pynvml.nvmlDeviceGetName(dev)
 
-            dev_is_vgpu = False
-            dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
-            for addr in [dev_pci_info.busIdLegacy, dev_pci_info.busId]:
-                if addr in pci_devs:
-                    dev_is_vgpu = _is_vgpu(pci_devs[addr].config)
-                    break
+                dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
+
+                dev_cores = None
+                if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
+                    with contextlib.suppress(pycuda.CUDAError):
+                        dev_gpudev = pycuda.cuDeviceGet(dev_idx)
+                        dev_cores = pycuda.cuDeviceGetAttribute(
+                            dev_gpudev,
+                            pycuda.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+                        )
+
+                dev_cores_util = _get_sm_util_from_gpm_metrics(dev)
+                if dev_cores_util is None:
+                    with contextlib.suppress(pynvml.NVMLError):
+                        dev_util_rates = pynvml.nvmlDeviceGetUtilizationRates(dev)
+                        dev_cores_util = dev_util_rates.gpu
+                if dev_cores_util is None:
+                    debug_log_warning(
+                        logger,
+                        "Failed to get device %d cores utilization, setting to 0",
+                        dev_index,
+                    )
+                    dev_cores_util = 0
 
-            dev_appendix = {
-                "arch_family": _get_arch_family(dev_cc_t),
-                "vgpu": dev_is_vgpu,
-                "bdf": str(dev_pci_info.busIdLegacy).lower(),
-            }
+                dev_mem = 0
+                dev_mem_used = 0
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
+                    dev_mem = byte_to_mebibyte(  # byte to MiB
+                        dev_mem_info.total,
+                    )
+                    dev_mem_used = byte_to_mebibyte(  # byte to MiB
+                        dev_mem_info.used,
+                    )
+                if dev_mem == 0:
+                    dev_mem, dev_mem_used = get_memory()
 
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
-                r = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
-                if r != pynvml.NVML_SUCCESS:
-                    dev_fabric = None
-                if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
-                    dev_fabric = None
-                if dev_fabric:
-                    dev_appendix["fabric_cluster_uuid"] = stringify_uuid(
-                        bytes(dev_fabric.clusterUuid),
+                dev_temp = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_temp = pynvml.nvmlDeviceGetTemperature(
+                        dev,
+                        pynvml.NVML_TEMPERATURE_GPU,
                     )
-                    dev_appendix["fabric_clique_id"] = dev_fabric.cliqueId
 
-            dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
+                dev_power = None
+                dev_power_used = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
+                    dev_power = dev_power // 1000  # mW to W
+                    dev_power_used = (
+                        pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
+                    )  # mW to W
 
-            # If MIG is not enabled, return the GPU itself.
+                dev_is_vgpu = False
+                if dev_bdf and dev_bdf in pci_devs:
+                    dev_is_vgpu = _is_vgpu(pci_devs[dev_bdf].config)
+
+                dev_appendix = {
+                    "arch_family": _get_arch_family(dev_cc_t),
+                    "vgpu": dev_is_vgpu,
+                }
+                if dev_bdf:
+                    dev_appendix["bdf"] = dev_bdf
+
+                if dev_links_state := _get_links_state(dev):
+                    dev_appendix.update(dev_links_state)
+
+                if dev_fabric_info := _get_fabric_info(dev):
+                    dev_appendix.update(dev_fabric_info)
 
-            if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
-                dev_name = pynvml.nvmlDeviceGetName(dev)
                 ret.append(
                     Device(
                         manufacturer=self.manufacturer,
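
Note: after this hunk, the per-device `dev_appendix` always carries `arch_family` and `vgpu`, adds `bdf` only when `nvmlDeviceGetPciInfo` succeeded, and merges whatever the new `_get_links_state` / `_get_fabric_info` helpers return. A rough sketch of the resulting shape, with invented values (a real appendix depends on the GPU, the driver, and whether the card sits on an NVSwitch fabric):

# Illustrative only -- the values below are made up, the keys match the code above.
dev_appendix = {
    "arch_family": "Hopper",                  # _get_arch_family(dev_cc_t)
    "vgpu": False,                            # _is_vgpu() on the cached PCI config space
    "bdf": "0000:65:00.0",                    # lower-cased busIdLegacy, only when PCI info was readable
    "links_count": 18,                        # merged from _get_links_state()
    "links_state": 0b1111111111111111110,
    "fabric_cluster_uuid": "<cluster-uuid>",  # merged from _get_fabric_info(), NVSwitch systems only
    "fabric_clique_id": 1,
}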
@@ -250,7 +252,7 @@ class NVIDIADetector(Detector):
             # inspired by https://github.com/NVIDIA/go-nvlib/blob/fdfe25d0ffc9d7a8c166f4639ef236da81116262/pkg/nvlib/device/mig_device.go#L61-L154.
 
             mdev_name = ""
-            mdev_cores = 1
+            mdev_cores = None
             mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
             for mdev_idx in range(mdev_count):
                 mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
@@ -283,16 +285,21 @@ class NVIDIADetector(Detector):
                     pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
                 )  # mW to W
 
-                mdev_appendix = dev_appendix.copy()
+                mdev_appendix = {
+                    "arch_family": _get_arch_family(dev_cc_t),
+                    "vgpu": True,
+                }
+                if dev_bdf:
+                    mdev_appendix["bdf"] = dev_bdf
 
                 mdev_gi_id = pynvml.nvmlDeviceGetGpuInstanceId(mdev)
                 mdev_appendix["gpu_instance_id"] = mdev_gi_id
                 mdev_ci_id = pynvml.nvmlDeviceGetComputeInstanceId(mdev)
                 mdev_appendix["compute_instance_id"] = mdev_ci_id
 
-                if not mdev_name:
-                    mdev_attrs = pynvml.nvmlDeviceGetAttributes(mdev)
+                mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
 
+                if not mdev_name:
                     mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
                     mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
                         mdev_gi,
@@ -310,11 +317,6 @@ class NVIDIADetector(Detector):
                             )
                             if dev_gi_prf.id != mdev_gi_info.profileId:
                                 continue
-                            mdev_cores = getattr(
-                                dev_gi_prf,
-                                "multiprocessorCount",
-                                1,
-                            )
                         except pynvml.NVMLError:
                             continue
 
@@ -335,31 +337,31 @@ class NVIDIADetector(Detector):
                             except pynvml.NVMLError:
                                 continue
 
-                            gi_slices = _get_gpu_instance_slices(dev_gi_prf_id)
-                            gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
-                            gi_neg_attrs = _get_gpu_instance_negative_attrs(
-                                dev_gi_prf_id,
-                            )
-                            ci_slices = _get_compute_instance_slices(
+                            ci_slice = _get_compute_instance_slice(
                                 dev_ci_prf_id,
                             )
-                            ci_mem = _get_compute_instance_memory_in_gib(
+                            gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
+                            gi_mem = _get_gpu_instance_memory(
                                 dev_mem_info,
-                                mdev_attrs,
+                                dev_gi_prf,
+                            )
+                            gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
+                            gi_neg_attrs = _get_gpu_instance_negattrs(
+                                dev_gi_prf_id,
                             )
 
-                            if gi_slices == ci_slices:
-                                mdev_name = f"{gi_slices}g.{ci_mem}gb"
+                            if ci_slice == gi_slice:
+                                mdev_name = f"{gi_slice}g.{gi_mem}gb"
                             else:
                                 mdev_name = (
-                                    f"{ci_slices}c.{gi_slices}g.{ci_mem}gb"
+                                    f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
                                 )
                             if gi_attrs:
                                 mdev_name += f"+{gi_attrs}"
                             if gi_neg_attrs:
                                 mdev_name += f"-{gi_neg_attrs}"
 
-                            mdev_cores = ci_slices
+                            mdev_cores = mdev_ci_prf.multiprocessorCount
 
                             break
 
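
Note: this hunk is where the MIG profile name is assembled. When the compute instance spans the whole GPU instance the name is "{gi_slice}g.{gi_mem}gb"; otherwise a compute-instance prefix is added. A minimal sketch with assumed slice and memory values (not read from a real MIG profile):

# Assumed values, for illustration only.
gi_slice, ci_slice, gi_mem = 3, 1, 40
name = (
    f"{gi_slice}g.{gi_mem}gb"
    if ci_slice == gi_slice
    else f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
)
print(name)  # -> "1c.3g.40gb"

A "+me" / "-me" style suffix is then appended when `_get_gpu_instance_attrs` or `_get_gpu_instance_negattrs` reports one, and `mdev_cores` now comes from the compute instance profile's `multiprocessorCount` instead of the slice count used before.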
@@ -374,6 +376,7 @@ class NVIDIADetector(Detector):
                         runtime_version_original=sys_runtime_ver_original,
                         compute_capability=dev_cc,
                         cores=mdev_cores,
+                        cores_utilization=mdev_cores_util,
                         memory=mdev_mem,
                         memory_used=mdev_mem_used,
                         memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
@@ -467,8 +470,7 @@ class NVIDIADetector(Detector):
                         dev_i_handle,
                         dev_j_handle,
                     )
-                    # In practice, there may not be NVLINK nodes that are not interconnected.
-                    if "fabric_cluster_uuid" in dev_i.appendix:
+                    if dev_i.appendix.get("links_state", 0) > 0:
                         distance = TopologyDistanceEnum.LINK
                 except pynvml.NVMLError:
                     debug_log_exception(
@@ -492,6 +494,201 @@ class NVIDIADetector(Detector):
         return ret
 
 
+def _get_gpm_metrics(
+    metrics: list[int],
+    dev: pynvml.c_nvmlDevice_t,
+    gpu_instance_id: int | None = None,
+    interval: float = 0.1,
+) -> list[pynvml.c_nvmlGpmMetric_t] | None:
+    """
+    Get GPM metrics for a device or a MIG GPU instance.
+
+    Args:
+        metrics:
+            A list of GPM metric IDs to query.
+        dev:
+            The NVML device handle.
+        gpu_instance_id:
+            The GPU instance ID for MIG devices.
+        interval:
+            Interval in seconds between two samples.
+
+    Returns:
+        A list of GPM metric structures, or None if failed.
+
+    """
+    try:
+        dev_gpm_support = pynvml.nvmlGpmQueryDeviceSupport(dev)
+        if not bool(dev_gpm_support.isSupportedDevice):
+            return None
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Unsupported GPM query")
+        return None
+
+    dev_gpm_metrics = pynvml.c_nvmlGpmMetricsGet_t()
+    try:
+        dev_gpm_metrics.sample1 = pynvml.nvmlGpmSampleAlloc()
+        dev_gpm_metrics.sample2 = pynvml.nvmlGpmSampleAlloc()
+        if gpu_instance_id is None:
+            pynvml.nvmlGpmSampleGet(dev, dev_gpm_metrics.sample1)
+            time.sleep(interval)
+            pynvml.nvmlGpmSampleGet(dev, dev_gpm_metrics.sample2)
+        else:
+            pynvml.nvmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample1)
+            time.sleep(interval)
+            pynvml.nvmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample2)
+        dev_gpm_metrics.version = pynvml.NVML_GPM_METRICS_GET_VERSION
+        dev_gpm_metrics.numMetrics = len(metrics)
+        for metric_idx, metric in enumerate(metrics):
+            dev_gpm_metrics.metrics[metric_idx].metricId = metric
+        pynvml.nvmlGpmMetricsGet(dev_gpm_metrics)
+    except pynvml.NVMLError:
+        debug_log_exception(logger, "Failed to get GPM metrics")
+        return None
+    finally:
+        if dev_gpm_metrics.sample1:
+            pynvml.nvmlGpmSampleFree(dev_gpm_metrics.sample1)
+        if dev_gpm_metrics.sample2:
+            pynvml.nvmlGpmSampleFree(dev_gpm_metrics.sample2)
+    return list(dev_gpm_metrics.metrics)
+
+
+def _get_sm_util_from_gpm_metrics(
+    dev: pynvml.c_nvmlDevice_t,
+    gpu_instance_id: int | None = None,
+    interval: float = 0.1,
+) -> int | None:
+    """
+    Get SM utilization from GPM metrics.
+
+    Args:
+        dev:
+            The NVML device handle.
+        gpu_instance_id:
+            The GPU instance ID for MIG devices.
+        interval:
+            Interval in seconds between two samples.
+
+    Returns:
+        The SM utilization as an integer percentage, or None if failed.
+
+    """
+    dev_gpm_metrics = _get_gpm_metrics(
+        metrics=[pynvml.NVML_GPM_METRIC_SM_UTIL],
+        dev=dev,
+        gpu_instance_id=gpu_instance_id,
+        interval=interval,
+    )
+    if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
+        return int(dev_gpm_metrics[0].value)
+
+    return None
+
+
+def _extract_field_value(
+    field_value: pynvml.c_nvmlFieldValue_t,
+) -> int | float | None:
+    """
+    Extract the value from a NVML field value structure.
+
+    Args:
+        field_value:
+            The NVML field value structure.
+
+    Returns:
+        The extracted value as int, float, or None if unknown.
+
+    """
+    if field_value.nvmlReturn != pynvml.NVML_SUCCESS:
+        return None
+    match field_value.valueType:
+        case pynvml.NVML_VALUE_TYPE_DOUBLE:
+            return field_value.value.dVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_INT:
+            return field_value.value.uiVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_LONG:
+            return field_value.value.ulVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_LONG_LONG:
+            return field_value.value.ullVal
+        case pynvml.NVML_VALUE_TYPE_SIGNED_LONG_LONG:
+            return field_value.value.sllVal
+        case pynvml.NVML_VALUE_TYPE_SIGNED_INT:
+            return field_value.value.siVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_SHORT:
+            return field_value.value.usVal
+    return None
+
+
+def _get_links_state(
+    dev: pynvml.c_nvmlDevice_t,
+) -> dict | None:
+    """
+    Get the NVLink links count and state for a device.
+
+    Args:
+        dev:
+            The NVML device handle.
+
+    Returns:
+        A dict includes links state or None if failed.
+
+    """
+    dev_links_count = 0
+    try:
+        dev_fields = pynvml.nvmlDeviceGetFieldValues(
+            dev,
+            fieldIds=[pynvml.NVML_FI_DEV_NVLINK_LINK_COUNT],
+        )
+        dev_links_count = _extract_field_value(dev_fields[0])
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Failed to get NVLink links count")
+    if not dev_links_count:
+        return None
+
+    dev_links_state = 0
+    try:
+        for link_idx in range(int(dev_links_count)):
+            dev_link_state = pynvml.nvmlDeviceGetNvLinkState(dev, link_idx)
+            if dev_link_state:
+                dev_links_state |= 1 << (link_idx + 1)
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Failed to get NVLink link state")
+
+    return {
+        "links_count": dev_links_count,
+        "links_state": dev_links_state,
+    }
+
+
+def _get_fabric_info(
+    dev: pynvml.c_nvmlDevice_t,
+) -> dict | None:
+    """
+    Get the NVSwitch fabric information for a device.
+
+    Args:
+        dev:
+            The NVML device handle.
+
+    Returns:
+        A dict includes fabric info or None if failed.
+
+    """
+    try:
+        dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
+        ret = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
+        if ret != pynvml.NVML_SUCCESS:
+            return None
+        if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
+            return None
+        return {
+            "fabric_cluster_uuid": stringify_uuid(bytes(dev_fabric.clusterUuid)),
+            "fabric_clique_id": dev_fabric.cliqueId,
+        }
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Failed to get NVSwitch fabric info")
+
+
 def _get_arch_family(dev_cc_t: list[int]) -> str:
     """
     Get the architecture family based on the CUDA compute capability.
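
Note on the new helpers above: `_get_sm_util_from_gpm_metrics` takes two GPM samples `interval` seconds apart and lets `nvmlGpmMetricsGet` derive the SM utilization from the delta; the caller falls back to `nvmlDeviceGetUtilizationRates` when GPM is unsupported. `_get_links_state` packs per-link NVLink state into a bitmask, recording link `i` at bit position `i + 1`, and the topology code later treats any non-zero `links_state` as a LINK-distance neighbour. A small decoding sketch with assumed values:

# Assumed values for illustration; links_state is built as in _get_links_state above,
# where link i is recorded at bit position i + 1 (bit 0 stays clear).
links_count = 4
links_state = 0b11100  # links 1, 2 and 3 up, link 0 down
active = [i for i in range(links_count) if links_state & (1 << (i + 1))]
print(active)  # [1, 2, 3]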
@@ -528,9 +725,9 @@ def _get_arch_family(dev_cc_t: list[int]) -> str:
     return "Unknown"
 
 
-def _get_gpu_instance_slices(dev_gi_prf_id: int) -> int:
+def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
     """
-    Get the number of slices for a given GPU Instance Profile ID.
+    Get the number of slice for a given GPU Instance Profile ID.
 
     Args:
         dev_gi_prf_id:
@@ -576,61 +773,33 @@ def _get_gpu_instance_slices(dev_gi_prf_id: int) -> int:
     raise AttributeError(msg)
 
 
-def _get_gpu_instance_attrs(dev_gi_prf_id: int) -> str:
+def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
     """
-    Get attributes for a given GPU Instance Profile ID.
+    Compute the memory size of a MIG compute instance in GiB.
 
     Args:
-        dev_gi_prf_id:
-            The GPU Instance Profile ID.
+        dev_mem:
+            The total memory info of the parent GPU device.
+        dev_gi_prf:
+            The profile info of the GPU instance.
 
     Returns:
-        A string representing the attributes, or an empty string if none.
-
-    """
-    match dev_gi_prf_id:
-        case (
-            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1
-        ):
-            return "me"
-        case (
-            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_ALL_ME
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_ALL_ME
-        ):
-            return "me.all"
-        case (
-            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_GFX
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_GFX
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_4_SLICE_GFX
-        ):
-            return "gfx"
-    return ""
-
+        The memory size in GiB.
 
-def _get_gpu_instance_negative_attrs(dev_gi_prf_id) -> str:
     """
-    Get negative attributes for a given GPU Instance Profile ID.
+    mem = dev_gi_prf.memorySizeMB * (1 << 20)  # MiB to byte
 
-    Args:
-        dev_gi_prf_id:
-            The GPU Instance Profile ID.
-
-    Returns:
-        A string representing the negative attributes, or an empty string if none.
-
-    """
-    if dev_gi_prf_id in [
-        pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_NO_ME,
-        pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_NO_ME,
-    ]:
-        return "me"
-    return ""
+    gib = round(
+        math.ceil(mem / dev_mem.total * 8)
+        / 8
+        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
+    )
+    return gib
 
 
-def _get_compute_instance_slices(dev_ci_prf_id: int) -> int:
+def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
     """
-    Get the number of slices for a given Compute Instance Profile ID.
+    Get the number of slice for a given Compute Instance Profile ID.
 
     Args:
         dev_ci_prf_id:
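
Note: `_get_gpu_instance_memory` replaces the old `_get_compute_instance_memory_in_gib` and now sizes the MIG instance from the GPU instance profile's `memorySizeMB` instead of `nvmlDeviceGetAttributes`. The rounding first expresses the instance as eighths of the parent GPU's memory, then scales that fraction by the parent's size in GiB. A worked example with assumed numbers (an exactly 80 GiB parent and a 1g.10gb-style profile; both figures are illustrative):

import math

# Assumed figures, for illustration only.
total = 80 * (1 << 30)      # dev_mem.total, in bytes
memory_size_mb = 9856       # dev_gi_prf.memorySizeMB

mem = memory_size_mb * (1 << 20)      # MiB -> bytes
eighths = math.ceil(mem / total * 8)  # -> 1, i.e. one eighth of the GPU
gib = round(eighths / 8 * ((total + (1 << 30) - 1) / (1 << 30)))
print(gib)  # -> 10, the "10gb" part of the profile name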
@@ -663,28 +832,56 @@ def _get_compute_instance_slices(dev_ci_prf_id: int) -> int:
     raise AttributeError(msg)
 
 
-def _get_compute_instance_memory_in_gib(dev_mem, mdev_attrs) -> int:
+def _get_gpu_instance_attrs(dev_gi_prf_id: int) -> str:
     """
-    Compute the memory size of a MIG compute instance in GiB.
+    Get attributes for a given GPU Instance Profile ID.
 
     Args:
-        dev_mem:
-            The total memory info of the parent GPU device.
-        mdev_attrs:
-            The attributes of the MIG device.
+        dev_gi_prf_id:
+            The GPU Instance Profile ID.
 
     Returns:
-        The memory size in GiB.
+        A string representing the attributes, or an empty string if none.
 
     """
-    gib = round(
-        ceil(
-            (mdev_attrs.memorySizeMB * (1 << 20)) / dev_mem.total * 8,
-        )
-        / 8
-        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
-    )
-    return gib
+    match dev_gi_prf_id:
+        case (
+            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1
+        ):
+            return "me"
+        case (
+            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_ALL_ME
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_ALL_ME
+        ):
+            return "me.all"
+        case (
+            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_GFX
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_GFX
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_4_SLICE_GFX
+        ):
+            return "gfx"
+    return ""
+
+
+def _get_gpu_instance_negattrs(dev_gi_prf_id) -> str:
+    """
+    Get negative attributes for a given GPU Instance Profile ID.
+
+    Args:
+        dev_gi_prf_id:
+            The GPU Instance Profile ID.
+
+    Returns:
+        A string representing the negative attributes, or an empty string if none.
+
+    """
+    if dev_gi_prf_id in [
+        pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_NO_ME,
+        pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_NO_ME,
+    ]:
+        return "me"
+    return ""
 
 
 def _is_vgpu(dev_config: bytes) -> bool: