gpustack-runtime 0.1.39.post1__py3-none-any.whl → 0.1.39.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,7 +28,7 @@ from podman.domain.containers_create import CreateMixin
28
28
  from tqdm import tqdm
29
29
 
30
30
  from .. import envs
31
- from ..logging import debug_log_exception
31
+ from ..logging import debug_log_exception, debug_log_warning
32
32
  from .__patches__ import patch_render_payload
33
33
  from .__types__ import (
34
34
  Container,
@@ -37,7 +37,7 @@ from .__types__ import (
37
37
  ContainerMountModeEnum,
38
38
  ContainerProfileEnum,
39
39
  ContainerRestartPolicyEnum,
40
- Deployer,
40
+ EndoscopicDeployer,
41
41
  OperationError,
42
42
  UnsupportedError,
43
43
  WorkloadExecStream,
@@ -49,7 +49,13 @@ from .__types__ import (
49
49
  WorkloadStatusOperation,
50
50
  WorkloadStatusStateEnum,
51
51
  )
52
- from .__utils__ import _MiB, bytes_to_human_readable, replace_image_with, safe_json
52
+ from .__utils__ import (
53
+ _MiB,
54
+ bytes_to_human_readable,
55
+ replace_image_with,
56
+ safe_json,
57
+ sensitive_env_var,
58
+ )
53
59
 
54
60
  if TYPE_CHECKING:
55
61
  from collections.abc import Callable, Generator
@@ -144,7 +150,7 @@ class PodmanWorkloadPlan(WorkloadPlan):
144
150
  super().validate_and_default()
145
151
 
146
152
  # Adjust default image namespace if needed.
147
- if namespace := envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_NAMESPACE:
153
+ if namespace := envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_NAMESPACE:
148
154
  self.pause_image = replace_image_with(
149
155
  image=self.pause_image,
150
156
  namespace=namespace,
@@ -299,7 +305,7 @@ Name of the Podman deployer.
299
305
  """
300
306
 
301
307
 
302
- class PodmanDeployer(Deployer):
308
+ class PodmanDeployer(EndoscopicDeployer):
303
309
  """
304
310
  Deployer implementation for Podman containers.
305
311
  """
@@ -429,12 +435,12 @@ class PodmanDeployer(Deployer):
429
435
  tag = tag or "latest"
430
436
  auth_config = None
431
437
  if (
432
- envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME
433
- and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD
438
+ envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_USERNAME
439
+ and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_PASSWORD
434
440
  ):
435
441
  auth_config = {
436
- "username": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME,
437
- "password": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD,
442
+ "username": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_USERNAME,
443
+ "password": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_PASSWORD,
438
444
  }
439
445
 
440
446
  logs = self._client.api.pull(
@@ -1150,39 +1156,29 @@ class PodmanDeployer(Deployer):
1150
1156
  super().__init__(_NAME)
1151
1157
  self._client = self._get_client()
1152
1158
 
1153
- def _prepare_create(self):
1159
+ def _prepare_mirrored_deployment(self):
1154
1160
  """
1155
- Prepare for creation.
1161
+ Prepare for mirrored deployment.
1156
1162
 
1157
1163
  """
1158
1164
  # Prepare mirrored deployment if enabled.
1159
1165
  if self._mutate_create_options:
1160
1166
  return
1161
1167
  self._mutate_create_options = lambda o: o
1162
- if not envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT:
1163
- logger.debug("Mirrored deployment disabled")
1164
- return
1165
1168
 
1166
1169
  # Retrieve self-container info.
1167
- ## - Get Container name, default to hostname if not set.
1168
- self_container_id = envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME
1169
- if not self_container_id:
1170
- self_container_id = socket.gethostname()
1171
- logger.warning(
1172
- "Mirrored deployment enabled, but no Container name set, using hostname(%s) instead",
1173
- self_container_id,
1174
- )
1175
1170
  try:
1176
- self_container = self._find_self_container(self_container_id)
1171
+ self_container = self._find_self_container()
1172
+ if not self_container:
1173
+ return
1177
1174
  logger.info(
1178
1175
  "Mirrored deployment enabled, using self Container %s for options mirroring",
1179
- self_container.id[:12],
1176
+ self_container.short_id,
1180
1177
  )
1181
1178
  self_image = self_container.image
1182
1179
  except podman.errors.APIError:
1183
1180
  logger.exception(
1184
- "Mirrored deployment enabled, but failed to get self Container %s, skipping",
1185
- self_container_id,
1181
+ "Mirrored deployment enabled, but failed to get self Container, skipping",
1186
1182
  )
1187
1183
  return
1188
1184
 
@@ -1193,8 +1189,12 @@ class PodmanDeployer(Deployer):
1193
1189
  self_container_envs: dict[str, str] = dict(
1194
1190
  item.split("=", 1) for item in self_container.attrs["Config"].get("Env", [])
1195
1191
  )
1196
- self_image_envs: dict[str, str] = dict(
1197
- item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
1192
+ self_image_envs: dict[str, str] = (
1193
+ dict(
1194
+ item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
1195
+ )
1196
+ if self_image.attrs["Config"]
1197
+ else {}
1198
1198
  )
1199
1199
  mirrored_envs: dict[str, str] = {
1200
1200
  # Filter out gpustack-internal envs and same-as-image envs.
@@ -1342,17 +1342,10 @@ class PodmanDeployer(Deployer):
1342
1342
 
1343
1343
  self._mutate_create_options = mutate_create_options
1344
1344
 
1345
- def _find_self_container(
1346
- self,
1347
- self_container_id: str,
1348
- ) -> podman.domain.containers.Container:
1345
def _find_self_container(self) -> podman.domain.containers.Container | None:
    """
    Find the current container if running inside a Podman container.

    When `GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME` is set, the container is
    fetched directly by that name or ID. Otherwise the local hostname is
    matched against the `Config.Hostname` of the listed containers.

    Returns:
        The Podman container if found,
        None if mirrored deployment is disabled.

    Raises:
        podman.errors.NotFound:
            If the hostname lookup matches no Container or more than one.
            Errors raised by the Podman client calls propagate unchanged.

    """
    if not envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT:
        logger.debug("Mirrored deployment disabled")
        return None

    # An explicitly configured name or ID wins: fetch it directly.
    self_container_id = envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME
    if self_container_id:
        return self._client.containers.get(self_container_id)

    # Fall back to the hostname.
    self_container_id = socket.gethostname()
    debug_log_warning(
        logger,
        "Mirrored deployment enabled, but no Container name set, using hostname(%s) instead",
        self_container_id,
    )

    # Find containers that match the hostname.
    filter_labels = envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS
    containers: list[podman.domain.containers.Container] = []
    for c in self._client.containers.list(compatible=True):
        # Ignore workload containers with host network enabled.
        if _LABEL_WORKLOAD in c.labels:
            continue
        # Ignore containers that do not match the hostname.
        if c.attrs["Config"].get("Hostname", "") != self_container_id:
            continue
        # Ignore containers that do not match the filter labels.
        if filter_labels and any(
            c.labels.get(k) != v for k, v in filter_labels.items()
        ):
            continue
        containers.append(c)

    # Validate found containers.
    if len(containers) != 1:
        # Build the reason first, then append the hint, so that BOTH cases
        # carry the hint (adjacent string literals bind tighter than the
        # conditional expression and would attach it to one branch only).
        reason = (
            f"Found multiple Containers with the same hostname {self_container_id}, "
            if len(containers) > 1
            else f"Not found Container with hostname {self_container_id}, "
        )
        msg = (
            reason
            + "please use `--env GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=...` to specify the exact Container name"
        )
        raise podman.errors.NotFound(msg)

    return containers[0]
1395
1404
 
1396
1405
  @_supported
1397
1406
  def _create(self, workload: WorkloadPlan):
@@ -1417,7 +1426,7 @@ class PodmanDeployer(Deployer):
1417
1426
  msg = f"Invalid workload type: {type(workload)}"
1418
1427
  raise TypeError(msg)
1419
1428
 
1420
- self._prepare_create()
1429
+ self._prepare_mirrored_deployment()
1421
1430
 
1422
1431
  if isinstance(workload, WorkloadPlan):
1423
1432
  workload = PodmanWorkloadPlan(**workload.__dict__)
@@ -1731,6 +1740,11 @@ class PodmanDeployer(Deployer):
1731
1740
  msg = f"Failed to fetch logs for container {container.name} of workload {name}{_detail_api_call_error(e)}"
1732
1741
  raise OperationError(msg) from e
1733
1742
  else:
1743
+ if not follow:
1744
+ result = bytearray()
1745
+ for chunk in output:
1746
+ result.extend(chunk)
1747
+ return result.decode("utf-8")
1734
1748
  return output
1735
1749
 
1736
1750
  @_supported
@@ -1815,6 +1829,216 @@ class PodmanDeployer(Deployer):
1815
1829
  return output
1816
1830
  return PodmanWorkloadExecStream(output)
1817
1831
 
1832
+ @_supported
1833
+ def _inspect(
1834
+ self,
1835
+ name: WorkloadName,
1836
+ namespace: WorkloadNamespace | None = None,
1837
+ ) -> str | None:
1838
+ """
1839
+ Inspect a Podman workload.
1840
+
1841
+ Args:
1842
+ name:
1843
+ The name of the workload.
1844
+ namespace:
1845
+ The namespace of the workload.
1846
+
1847
+ Returns:
1848
+ The inspection result as a JSON string. None if not found.
1849
+
1850
+ Raises:
1851
+ UnsupportedError:
1852
+ If Podman is not supported in the current environment.
1853
+ OperationError:
1854
+ If the Podman workload fails to inspect.
1855
+
1856
+ """
1857
+ workload = self._get(name=name, namespace=namespace)
1858
+ if not workload:
1859
+ return None
1860
+
1861
+ d_containers = getattr(workload, "_d_containers", [])
1862
+ if not d_containers:
1863
+ return None
1864
+
1865
+ result = []
1866
+ for c in d_containers:
1867
+ c_attrs = c.attrs
1868
+ # Mask sensitive environment variables
1869
+ if "Env" in c_attrs["Config"]:
1870
+ for i, env in enumerate(c_attrs["Config"]["Env"] or []):
1871
+ env_name, _ = env.split("=", maxsplit=1)
1872
+ if sensitive_env_var(env_name):
1873
+ c_attrs["Config"]["Env"][i] = f"{env_name}=******"
1874
+ result.append(c_attrs)
1875
+ return safe_json(result, indent=2)
1876
+
1877
+ def _find_self_container_for_endoscopy(self) -> podman.domain.containers.Container:
1878
+ """
1879
+ Find the self container for endoscopy.
1880
+ Only works in mirrored deployment mode.
1881
+
1882
+ Returns:
1883
+ The self container object.
1884
+
1885
+ Raises:
1886
+ UnsupportedError:
1887
+ If endoscopy is not supported in the current environment.
1888
+
1889
+ """
1890
+ try:
1891
+ self_container = self._find_self_container()
1892
+ except podman.errors.APIError as e:
1893
+ msg = "Endoscopy is not supported in the current environment: Mirrored deployment enabled, but failed to get self Container"
1894
+ raise UnsupportedError(msg) from e
1895
+ except Exception as e:
1896
+ msg = "Endoscopy is not supported in the current environment: Failed to get self Container"
1897
+ raise UnsupportedError(msg) from e
1898
+
1899
+ if not self_container:
1900
+ msg = "Endoscopy is not supported in the current environment: Mirrored deployment disabled"
1901
+ raise UnsupportedError(msg)
1902
+ return self_container
1903
+
1904
+ def _endoscopic_logs(
1905
+ self,
1906
+ timestamps: bool = False,
1907
+ tail: int | None = None,
1908
+ since: int | None = None,
1909
+ follow: bool = False,
1910
+ ) -> Generator[bytes | str, None, None] | bytes | str:
1911
+ """
1912
+ Get the logs of the deployer itself.
1913
+ Only works in mirrored deployment mode.
1914
+
1915
+ Args:
1916
+ timestamps:
1917
+ Show timestamps in the logs.
1918
+ tail:
1919
+ Number of lines to show from the end of the logs.
1920
+ since:
1921
+ Show logs since the given epoch in seconds.
1922
+ follow:
1923
+ Whether to follow the logs.
1924
+
1925
+ Returns:
1926
+ The logs as a byte string or a generator yielding byte strings if follow is True.
1927
+
1928
+ Raises:
1929
+ UnsupportedError:
1930
+ If endoscopy is not supported in the current environment.
1931
+ OperationError:
1932
+ If the deployer fails to get logs.
1933
+
1934
+ """
1935
+ self_container = self._find_self_container_for_endoscopy()
1936
+
1937
+ logs_options = {
1938
+ "timestamps": timestamps,
1939
+ "tail": tail if tail >= 0 else None,
1940
+ "since": since,
1941
+ "follow": follow,
1942
+ }
1943
+
1944
+ try:
1945
+ output = self_container.logs(
1946
+ stream=follow,
1947
+ **logs_options,
1948
+ )
1949
+ except podman.errors.APIError as e:
1950
+ msg = f"Failed to fetch logs for self Container {self_container.short_id}{_detail_api_call_error(e)}"
1951
+ raise OperationError(msg) from e
1952
+ else:
1953
+ if not follow:
1954
+ result = bytearray()
1955
+ for chunk in output:
1956
+ result.extend(chunk)
1957
+ return result.decode("utf-8")
1958
+ return output
1959
+
1960
+ def _endoscopic_exec(
1961
+ self,
1962
+ detach: bool = True,
1963
+ command: list[str] | None = None,
1964
+ args: list[str] | None = None,
1965
+ ) -> WorkloadExecStream | bytes | str:
1966
+ """
1967
+ Execute a command in the deployer itself.
1968
+ Only works in mirrored deployment mode.
1969
+
1970
+ Args:
1971
+ detach:
1972
+ Whether to detach from the command.
1973
+ command:
1974
+ The command to execute.
1975
+ If not specified, use /bin/sh and implicitly attach.
1976
+ args:
1977
+ The arguments to pass to the command.
1978
+
1979
+ Returns:
1980
+ If detach is False, return a WorkloadExecStream.
1981
+ otherwise, return the output of the command as a byte string or string.
1982
+
1983
+ Raises:
1984
+ UnsupportedError:
1985
+ If endoscopy is not supported in the current environment.
1986
+ OperationError:
1987
+ If the deployer fails to execute the command.
1988
+
1989
+ """
1990
+ self_container = self._find_self_container_for_endoscopy()
1991
+
1992
+ attach = not detach or not command
1993
+ exec_options = {
1994
+ "stdout": True,
1995
+ "stderr": True,
1996
+ "stdin": attach,
1997
+ "socket": attach,
1998
+ "tty": attach,
1999
+ "cmd": [*command, *(args or [])] if command else ["/bin/sh"],
2000
+ }
2001
+
2002
+ try:
2003
+ _, output = self_container.exec_run(
2004
+ detach=False,
2005
+ **exec_options,
2006
+ )
2007
+ except podman.errors.APIError as e:
2008
+ msg = f"Failed to exec command in self Container {self_container.short_id}{_detail_api_call_error(e)}"
2009
+ raise OperationError(msg) from e
2010
+ else:
2011
+ if not attach:
2012
+ return output
2013
+ return PodmanWorkloadExecStream(output)
2014
+
2015
def _endoscopic_inspect(self) -> str:
    """
    Inspect the deployer itself.
    Only works in mirrored deployment mode.

    Returns:
        The attributes of the self container as a JSON string, with the
        values of sensitive environment variables replaced by `******`.

    Raises:
        UnsupportedError:
            If endoscopy is not supported in the current environment.

    """
    self_container = self._find_self_container_for_endoscopy()

    c_attrs = self_container.attrs
    # Tolerate a missing or null "Config" section.
    c_config = c_attrs.get("Config") or {}
    c_envs = c_config.get("Env")
    if c_envs:
        # Mask sensitive environment variables on a copy,
        # so the container object's own attrs stay untouched.
        masked_envs = []
        for env in c_envs:
            # partition() never raises, even for an entry without "=".
            env_name, sep, _ = env.partition("=")
            if sep and sensitive_env_var(env_name):
                masked_envs.append(f"{env_name}=******")
            else:
                masked_envs.append(env)
        c_attrs = {**c_attrs, "Config": {**c_config, "Env": masked_envs}}

    return safe_json(c_attrs, indent=2)
2041
+
1818
2042
 
1819
2043
  def _has_restart_policy(
1820
2044
  container: podman.domain.containers.Container,
@@ -951,3 +951,26 @@ def bitmask_to_str(bitmask_list: list) -> str:
951
951
  offset += get_bits_size()
952
952
 
953
953
  return list_to_range_str(sorted(bits_lists))
954
+
955
+
956
def get_physical_function_by_bdf(bdf: str) -> str:
    """
    Map a PCI device BDF address to the BDF of its physical function.

    The lookup reads `/sys/bus/pci/devices/<bdf>/physfn` and returns the
    final path component of its resolved target. Any error during the
    lookup is suppressed.

    Args:
        bdf:
            The PCI device BDF address (e.g., "0000:00:1f.0").

    Returns:
        The physical function BDF if found,
        otherwise the original BDF unchanged.

    """
    if not bdf:
        return bdf

    # Best effort: any failure while probing sysfs falls through to `bdf`.
    with contextlib.suppress(Exception):
        device_dir = Path(f"/sys/bus/pci/devices/{bdf}")
        physfn_entry = device_dir / "physfn"
        if device_dir.exists() and physfn_entry.exists():
            return physfn_entry.resolve().name

    return bdf
@@ -16,6 +16,7 @@ from .__utils__ import (
16
16
  get_brief_version,
17
17
  get_numa_node_by_bdf,
18
18
  get_pci_devices,
19
+ get_physical_function_by_bdf,
19
20
  get_utilization,
20
21
  map_numa_node_to_cpu_affinity,
21
22
  )
@@ -107,8 +108,12 @@ class AMDDetector(Detector):
107
108
  asic_serial = dev_gpu_asic_info.get("asic_serial")
108
109
  dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
109
110
  else:
110
- dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
111
- dev_hsa_agent = hsa_agents.get(dev_uuid)
111
+ dev_uuid = ""
112
+ with contextlib.suppress(pyrocmsmi.ROCMSMIError):
113
+ dev_uuid = (
114
+ f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
115
+ )
116
+ dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
112
117
 
113
118
  dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
114
119
  dev_driver_ver = dev_gpu_driver_info.get("driver_version")
@@ -119,8 +124,13 @@ class AMDDetector(Detector):
119
124
 
120
125
  dev_cc = dev_hsa_agent.compute_capability
121
126
  if not dev_cc:
122
- with contextlib.suppress(pyrocmsmi.ROCMSMIError):
123
- dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
127
+ if "target_graphics_version" in dev_gpu_asic_info:
128
+ dev_cc = dev_gpu_asic_info.get("target_graphics_version")
129
+ else:
130
+ with contextlib.suppress(pyrocmsmi.ROCMSMIError):
131
+ dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(
132
+ dev_idx,
133
+ )
124
134
 
125
135
  dev_bdf = None
126
136
  dev_card_id = None
@@ -195,15 +205,13 @@ class AMDDetector(Detector):
195
205
  dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
196
206
  dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
197
207
 
198
- dev_compute_partition = None
199
- with contextlib.suppress(pyamdsmi.AmdSmiException):
200
- dev_compute_partition = pyamdsmi.amdsmi_get_gpu_compute_partition(
201
- dev,
202
- )
208
+ dev_is_vgpu = False
209
+ if dev_bdf:
210
+ dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
203
211
 
204
212
  dev_appendix = {
205
213
  "arch_family": _get_arch_family(dev_asic_family_id),
206
- "vgpu": dev_compute_partition is not None,
214
+ "vgpu": dev_is_vgpu,
207
215
  }
208
216
  if dev_bdf:
209
217
  dev_appendix["bdf"] = dev_bdf
@@ -16,6 +16,7 @@ from .__utils__ import (
16
16
  get_brief_version,
17
17
  get_numa_node_by_bdf,
18
18
  get_pci_devices,
19
+ get_physical_function_by_bdf,
19
20
  get_utilization,
20
21
  map_numa_node_to_cpu_affinity,
21
22
  )
@@ -108,7 +109,7 @@ class HygonDetector(Detector):
108
109
  dev_index = dev_idx
109
110
 
110
111
  dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
111
- dev_hsa_agent = hsa_agents.get(dev_uuid)
112
+ dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
112
113
 
113
114
  dev_name = dev_hsa_agent.name
114
115
  if not dev_name:
@@ -156,8 +157,12 @@ class HygonDetector(Detector):
156
157
  dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
157
158
  dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
158
159
 
160
+ dev_is_vgpu = False
161
+ if dev_bdf:
162
+ dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
163
+
159
164
  dev_appendix = {
160
- "vgpu": False,
165
+ "vgpu": dev_is_vgpu,
161
166
  }
162
167
  if dev_bdf is not None:
163
168
  dev_appendix["bdf"] = dev_bdf
@@ -23,6 +23,7 @@ from .__utils__ import (
23
23
  get_numa_node_by_bdf,
24
24
  get_numa_nodeset_size,
25
25
  get_pci_devices,
26
+ get_physical_function_by_bdf,
26
27
  get_utilization,
27
28
  map_numa_node_to_cpu_affinity,
28
29
  support_command,
@@ -165,13 +166,20 @@ class IluvatarDetector(Detector):
165
166
  if dev_cc_t:
166
167
  dev_cc = ".".join(map(str, dev_cc_t))
167
168
 
169
+ dev_bdf = None
170
+ with contextlib.suppress(pyixml.NVMLError):
171
+ dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
172
+ dev_bdf = str(dev_pci_info.busIdLegacy).lower()
173
+
168
174
  dev_is_vgpu = False
169
- dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
175
+ if dev_bdf:
176
+ dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
170
177
 
171
178
  dev_appendix = {
172
179
  "vgpu": dev_is_vgpu,
173
- "bdf": str(dev_pci_info.busIdLegacy).lower(),
174
180
  }
181
+ if dev_bdf:
182
+ dev_appendix["bdf"] = dev_bdf
175
183
 
176
184
  ret.append(
177
185
  Device(
@@ -3,9 +3,10 @@ from __future__ import annotations
3
3
  import logging
4
4
  from functools import lru_cache
5
5
 
6
+ import pymtml
7
+
6
8
  from .. import envs
7
9
  from ..logging import debug_log_exception, debug_log_warning
8
- from . import pymtml
9
10
  from .__types__ import (
10
11
  Detector,
11
12
  Device,
@@ -105,9 +106,8 @@ class MThreadsDetector(Detector):
105
106
 
106
107
  try:
107
108
  pymtml.mtmlLibraryInit()
108
-
109
- sys_driver_ver = pymtml.mtmlSystemGetDriverVersion()
110
-
109
+ system = pymtml.mtmlLibraryInitSystem()
110
+ sys_driver_ver = pymtml.mtmlSystemGetDriverVersion(system)
111
111
  dev_count = pymtml.mtmlLibraryCountDevice()
112
112
  for dev_idx in range(dev_count):
113
113
  dev_index = dev_idx
@@ -139,25 +139,20 @@ class MThreadsDetector(Detector):
139
139
 
140
140
  dev_mem = 0
141
141
  dev_mem_used = 0
142
- devmem = pymtml.mtmlDeviceInitMemory(dev)
143
- try:
142
+ with pymtml.mtmlMemoryContext(dev) as devmem:
144
143
  dev_mem = byte_to_mebibyte( # byte to MiB
145
144
  pymtml.mtmlMemoryGetTotal(devmem),
146
145
  )
147
146
  dev_mem_used = byte_to_mebibyte( # byte to MiB
148
147
  pymtml.mtmlMemoryGetUsed(devmem),
149
148
  )
150
- finally:
151
- pymtml.mtmlDeviceFreeMemory(devmem)
152
149
 
153
150
  dev_cores_util = None
154
151
  dev_temp = None
155
- devgpu = pymtml.mtmlDeviceInitGpu(dev)
156
- try:
152
+ with pymtml.mtmlGpuContext(dev) as devgpu:
157
153
  dev_cores_util = pymtml.mtmlGpuGetUtilization(devgpu)
158
154
  dev_temp = pymtml.mtmlGpuGetTemperature(devgpu)
159
- finally:
160
- pymtml.mtmlDeviceFreeGpu(devgpu)
155
+
161
156
  if dev_cores_util is None:
162
157
  debug_log_warning(
163
158
  logger,
@@ -198,6 +193,7 @@ class MThreadsDetector(Detector):
198
193
  debug_log_exception(logger, "Failed to process devices fetching")
199
194
  raise
200
195
  finally:
196
+ pymtml.mtmlLibraryFreeSystem(system)
201
197
  pymtml.mtmlLibraryShutDown()
202
198
 
203
199
  return ret