gpustack-runtime 0.1.40.post1__py3-none-any.whl → 0.1.41.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. gpustack_runtime/__init__.py +1 -1
  2. gpustack_runtime/__main__.py +5 -3
  3. gpustack_runtime/_version.py +2 -2
  4. gpustack_runtime/_version_appendix.py +1 -1
  5. gpustack_runtime/cmds/__init__.py +5 -3
  6. gpustack_runtime/cmds/__types__.py +1 -1
  7. gpustack_runtime/cmds/deployer.py +140 -18
  8. gpustack_runtime/cmds/detector.py +1 -1
  9. gpustack_runtime/cmds/images.py +1 -1
  10. gpustack_runtime/deployer/__init__.py +28 -2
  11. gpustack_runtime/deployer/__patches__.py +1 -1
  12. gpustack_runtime/deployer/__types__.py +2 -1
  13. gpustack_runtime/deployer/__utils__.py +2 -2
  14. gpustack_runtime/deployer/cdi/__init__.py +86 -5
  15. gpustack_runtime/deployer/cdi/__types__.py +92 -29
  16. gpustack_runtime/deployer/cdi/__utils__.py +180 -0
  17. gpustack_runtime/deployer/cdi/amd.py +146 -0
  18. gpustack_runtime/deployer/cdi/ascend.py +164 -0
  19. gpustack_runtime/deployer/cdi/hygon.py +147 -0
  20. gpustack_runtime/deployer/cdi/iluvatar.py +136 -0
  21. gpustack_runtime/deployer/cdi/metax.py +148 -0
  22. gpustack_runtime/deployer/cdi/thead.py +57 -23
  23. gpustack_runtime/deployer/docker.py +9 -8
  24. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +325 -0
  25. gpustack_runtime/deployer/k8s/deviceplugin/__types__.py +131 -0
  26. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +590 -0
  27. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py +3 -0
  28. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto +212 -0
  29. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py +86 -0
  30. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi +168 -0
  31. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py +358 -0
  32. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py +34 -0
  33. gpustack_runtime/deployer/kuberentes.py +50 -4
  34. gpustack_runtime/deployer/podman.py +9 -8
  35. gpustack_runtime/detector/__init__.py +42 -5
  36. gpustack_runtime/detector/__types__.py +8 -24
  37. gpustack_runtime/detector/__utils__.py +46 -39
  38. gpustack_runtime/detector/amd.py +55 -66
  39. gpustack_runtime/detector/ascend.py +29 -41
  40. gpustack_runtime/detector/cambricon.py +3 -3
  41. gpustack_runtime/detector/hygon.py +21 -49
  42. gpustack_runtime/detector/iluvatar.py +44 -60
  43. gpustack_runtime/detector/metax.py +54 -37
  44. gpustack_runtime/detector/mthreads.py +74 -36
  45. gpustack_runtime/detector/nvidia.py +130 -93
  46. gpustack_runtime/detector/pyacl/__init__.py +1 -1
  47. gpustack_runtime/detector/pyamdgpu/__init__.py +1 -1
  48. gpustack_runtime/detector/pyamdsmi/__init__.py +1 -1
  49. gpustack_runtime/detector/pycuda/__init__.py +1 -1
  50. gpustack_runtime/detector/pydcmi/__init__.py +1 -1
  51. gpustack_runtime/detector/pyhsa/__init__.py +1 -1
  52. gpustack_runtime/detector/pymxsml/__init__.py +1553 -1
  53. gpustack_runtime/detector/pyrocmcore/__init__.py +1 -1
  54. gpustack_runtime/detector/pyrocmsmi/__init__.py +1 -1
  55. gpustack_runtime/detector/thead.py +41 -60
  56. gpustack_runtime/envs.py +106 -12
  57. gpustack_runtime/logging.py +6 -2
  58. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/METADATA +6 -1
  59. gpustack_runtime-0.1.41.post1.dist-info/RECORD +67 -0
  60. gpustack_runtime/detector/pymxsml/mxsml.py +0 -1580
  61. gpustack_runtime/detector/pymxsml/mxsml_extension.py +0 -816
  62. gpustack_runtime/detector/pymxsml/mxsml_mcm.py +0 -476
  63. gpustack_runtime-0.1.40.post1.dist-info/RECORD +0 -55
  64. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/WHEEL +0 -0
  65. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/entry_points.txt +0 -0
  66. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/hygon.py
@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import contextlib
 import logging
@@ -30,7 +30,7 @@ class HygonDetector(Detector):
     """
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def is_supported() -> bool:
         """
         Check if the Hygon detector is supported.
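
The bare @lru_cache form keeps a default 128-entry cache, which is needless bookkeeping for a zero-argument staticmethod that can only ever produce one result; @lru_cache(maxsize=1) states the intent directly. A minimal standalone illustration (not package code):

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def is_supported() -> bool:
        print("probing hardware once")  # expensive probe runs a single time
        return True

    is_supported()  # prints, computes, caches
    is_supported()  # served from the one-slot cache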
@@ -58,7 +58,7 @@ class HygonDetector(Detector):
         return supported
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Higon.
         pci_devs = get_pci_devices(vendor="0x1d94")
@@ -120,12 +120,8 @@ class HygonDetector(Detector):
             with contextlib.suppress(pyrocmsmi.ROCMSMIError):
                 dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
 
-            dev_bdf = None
-            dev_card_id = None
-            dev_renderd_id = None
-            with contextlib.suppress(Exception):
-                dev_bdf = pyrocmsmi.rsmi_dev_pci_id_get(dev_idx)
-                dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
+            dev_bdf = pyrocmsmi.rsmi_dev_pci_id_get(dev_idx)
+            dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
 
             dev_cores = dev_hsa_agent.compute_units
             if not dev_cores and dev_card_id is not None:
@@ -157,15 +153,17 @@ class HygonDetector(Detector):
                 dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
                 dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
 
-            dev_is_vgpu = False
-            if dev_bdf:
-                dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+            dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_numa = str(pyrocmsmi.rsmi_topo_get_numa_node_number(dev_idx))
 
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
+                "bdf": dev_bdf,
+                "numa": dev_numa,
             }
-            if dev_bdf is not None:
-                dev_appendix["bdf"] = dev_bdf
             if dev_card_id is not None:
                 dev_appendix["card_id"] = dev_card_id
             if dev_renderd_id is not None:
                 dev_appendix["renderd_id"] = dev_renderd_id
@@ -253,37 +251,14 @@ class HygonDetector(Detector):
 
             pyrocmsmi.rsmi_init()
 
-            # Get NUMA and CPU affinities.
             for i, dev_i in enumerate(devices):
-                # Get affinity with PCIe BDF if possible.
-                if dev_i_bdf := dev_i.appendix.get("bdf", ""):
-                    ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
-                        dev_i_bdf,
-                    )
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
-                # Otherwise, get affinity via ROCM SMI.
-                if not ret.devices_numa_affinities[i]:
-                    # Get NUMA affinity.
-                    try:
-                        dev_i_numa_node = pyrocmsmi.rsmi_topo_get_numa_node_number(
-                            dev_i.index,
-                        )
-                        ret.devices_numa_affinities[i] = str(dev_i_numa_node)
-                    except pyrocmsmi.ROCMSMIError:
-                        debug_log_exception(
-                            logger,
-                            "Failed to get NUMA affinity for device %d",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
 
-            # Get distances to other devices.
-            for i, dev_i in enumerate(devices):
+                # Get distances to other devices.
                 for j, dev_j in enumerate(devices):
                     if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                         continue
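
With the NUMA node precomputed into appendix["numa"], topology fetching no longer needs the BDF/SMI fallback chain and shrinks to a lookup plus a NUMA-to-CPU mapping. map_numa_node_to_cpu_affinity is also a package utility not shown here; a plausible sysfs-based sketch:

    from pathlib import Path

    def cpu_affinity_for_numa_node(numa_node: str) -> str:
        # Illustrative only: node0's CPU list lives at
        # /sys/devices/system/node/node0/cpulist, e.g. "0-31,64-95".
        if not numa_node:
            return ""
        cpulist = Path(f"/sys/devices/system/node/node{numa_node}/cpulist")
        try:
            return cpulist.read_text().strip()
        except OSError:
            return ""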
@@ -326,9 +301,6 @@ class HygonDetector(Detector):
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
-        except pyrocmsmi.ROCMSMIError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
@@ -351,12 +323,12 @@ def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
     card_id = None
     renderd_id = None
 
-    for path in [
+    for drm_path in [
         Path(f"/sys/module/hycu/drivers/pci:hycu/{dev_bdf}/drm"),
         Path(f"/sys/module/hydcu/drivers/pci:hydcu/{dev_bdf}/drm"),
     ]:
-        if path.exists():
-            for dir_path in path.iterdir():
+        if drm_path.exists():
+            for dir_path in drm_path.iterdir():
                 if dir_path.name.startswith("card"):
                     card_id = int(dir_path.name[4:])
                 elif dir_path.name.startswith("renderD"):
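
The rename from path to drm_path names what the loop actually iterates. The slicing relies on the Linux DRM naming convention, where each GPU exposes directory entries such as card0 and renderD128:

    # Worked example of the prefix slicing above.
    for name in ("card0", "renderD128"):
        if name.startswith("card"):
            print("card_id:", int(name[4:]))     # "card0" -> 0
        elif name.startswith("renderD"):
            print("renderd_id:", int(name[7:]))  # "renderD128" -> 128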
gpustack_runtime/detector/iluvatar.py
@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import contextlib
 import logging
@@ -37,7 +37,7 @@ class IluvatarDetector(Detector):
     """
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def is_supported() -> bool:
         """
         Check if the Iluvatar detector is supported.
@@ -66,7 +66,7 @@ class IluvatarDetector(Detector):
         return supported
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Iluvatar.
         pci_devs = get_pci_devices(vendor="0x1e3e")
@@ -99,29 +99,36 @@ class IluvatarDetector(Detector):
 
         sys_driver_ver = pyixml.nvmlSystemGetDriverVersion()
 
-        sys_runtime_ver_original = pyixml.nvmlSystemGetCudaDriverVersion()
-        sys_runtime_ver_original = ".".join(
-            map(
-                str,
-                [
-                    sys_runtime_ver_original // 1000,
-                    (sys_runtime_ver_original % 1000) // 10,
-                    (sys_runtime_ver_original % 10),
-                ],
-            ),
-        )
-        sys_runtime_ver = get_brief_version(
-            sys_runtime_ver_original,
-        )
+        sys_runtime_ver_original = None
+        sys_runtime_ver = None
+        with contextlib.suppress(pyixml.NVMLError):
+            sys_runtime_ver_original = pyixml.nvmlSystemGetCudaDriverVersion()
+            sys_runtime_ver_original = ".".join(
+                map(
+                    str,
+                    [
+                        sys_runtime_ver_original // 1000,
+                        (sys_runtime_ver_original % 1000) // 10,
+                        (sys_runtime_ver_original % 10),
+                    ],
+                ),
+            )
+            sys_runtime_ver = get_brief_version(
+                sys_runtime_ver_original,
+            )
 
         dev_count = pyixml.nvmlDeviceGetCount()
         for dev_idx in range(dev_count):
             dev = pyixml.nvmlDeviceGetHandleByIndex(dev_idx)
 
             dev_index = dev_idx
-            dev_uuid = pyixml.nvmlDeviceGetUUID(dev)
+            if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+                dev_index = pyixml.nvmlDeviceGetMinorNumber(dev)
+
             dev_name = pyixml.nvmlDeviceGetName(dev)
 
+            dev_uuid = pyixml.nvmlDeviceGetUUID(dev)
+
             dev_cores = None
             with contextlib.suppress(pyixml.NVMLError):
                 dev_cores = pyixml.nvmlDeviceGetNumGpuCores(dev)
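
Besides reordering the UUID lookup, this hunk wraps the CUDA driver version query in contextlib.suppress, so a failing nvmlSystemGetCudaDriverVersion no longer aborts detection; the version fields simply stay None. The decoding arithmetic follows NVML's packed-integer convention (major*1000 + minor*10 + patch), for example:

    raw = 12040                   # packed CUDA driver version
    major = raw // 1000           # 12
    minor = (raw % 1000) // 10    # 4
    patch = raw % 10              # 0
    print(".".join(map(str, (major, minor, patch))))  # 12.4.0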
@@ -171,20 +178,25 @@ class IluvatarDetector(Detector):
             if dev_cc_t:
                 dev_cc = ".".join(map(str, dev_cc_t))
 
-            dev_bdf = None
-            with contextlib.suppress(pyixml.NVMLError):
-                dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
-                dev_bdf = str(dev_pci_info.busIdLegacy).lower()
+            dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
+            dev_bdf = str(dev_pci_info.busIdLegacy).lower()
+
+            dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
 
-            dev_is_vgpu = False
-            if dev_bdf:
-                dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_node_affinity = pyixml.nvmlDeviceGetMemoryAffinity(
+                    dev,
+                    get_numa_nodeset_size(),
+                    pyixml.NVML_AFFINITY_SCOPE_NODE,
+                )
+                dev_numa = bitmask_to_str(list(dev_node_affinity))
 
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
+                "bdf": dev_bdf,
+                "numa": dev_numa,
             }
-            if dev_bdf:
-                dev_appendix["bdf"] = dev_bdf
 
             ret.append(
                 Device(
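
The vGPU check becomes unconditional here because dev_bdf is no longer optional. get_physical_function_by_bdf is a package utility not shown in this diff; on Linux, the standard way to resolve a virtual function to its physical function is the sysfs physfn symlink, roughly:

    from pathlib import Path

    def physical_function_by_bdf(bdf: str) -> str:
        # Illustrative only: SR-IOV virtual functions carry a "physfn"
        # symlink to their physical function; PFs map to themselves.
        physfn = Path(f"/sys/bus/pci/devices/{bdf}/physfn")
        if physfn.exists():
            return physfn.resolve().name  # e.g. "0000:3b:00.0"
        return bdf

    # A device is then treated as a vGPU exactly when
    # physical_function_by_bdf(bdf) != bdf.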
@@ -247,36 +259,11 @@ class IluvatarDetector(Detector):
             for i, dev_i in enumerate(devices):
                 dev_i_handle = pyixml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
 
-                # Get affinity with PCIe BDF if possible.
-                if dev_i_bdf := dev_i.appendix.get("bdf", ""):
-                    ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
-                        dev_i_bdf,
-                    )
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
-                # Otherwise, get affinity via IXML.
-                if not ret.devices_cpu_affinities[i]:
-                    # Get NUMA affinity.
-                    try:
-                        dev_i_memset = pyixml.nvmlDeviceGetMemoryAffinity(
-                            dev_i_handle,
-                            get_numa_nodeset_size(),
-                            pyixml.NVML_AFFINITY_SCOPE_NODE,
-                        )
-                        ret.devices_numa_affinities[i] = bitmask_to_str(
-                            list(dev_i_memset),
-                        )
-                    except pyixml.NVMLError:
-                        debug_log_exception(
-                            logger,
-                            "Failed to get NUMA affinity for device %d",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
 
                 # Get distances to other devices.
                 for j, dev_j in enumerate(devices):
@@ -302,9 +289,6 @@ class IluvatarDetector(Detector):
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
-        except pyixml.NVMLError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
gpustack_runtime/detector/metax.py
@@ -1,7 +1,8 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import logging
 from functools import lru_cache
+from pathlib import Path
 
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
@@ -48,7 +49,7 @@ class MetaXDetector(Detector):
     """
 
    @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def is_supported() -> bool:
         """
         Check if the MetaX detector is supported.
@@ -76,7 +77,7 @@ class MetaXDetector(Detector):
         return supported
 
    @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=MetaX.
         pci_devs = get_pci_devices(vendor="0x9999")
@@ -124,7 +125,6 @@ class MetaXDetector(Detector):
             dev_name = dev_info.deviceName
             if dev_info.mode == pymxsml.MXSML_VIRTUALIZATION_MODE_PF:
                 continue
-            dev_is_vgpu = dev_info.mode == pymxsml.MXSML_VIRTUALIZATION_MODE_VF
 
             dev_core_util = pymxsml.mxSmlGetDeviceIpUsage(
                 dev_idx,
@@ -165,10 +165,28 @@ class MetaXDetector(Detector):
                 // 1000  # mW to W
             )
 
+            dev_bdf = dev_info.bdfId
+            dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
+
+            dev_is_vgpu = dev_info.mode == pymxsml.MXSML_VIRTUALIZATION_MODE_VF
+
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_node_affinity = pymxsml.mxSmlGetNodeAffinity(
+                    dev_idx,
+                    get_numa_nodeset_size(),
+                )
+                dev_numa = bitmask_to_str(list(dev_node_affinity))
+
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
-                "bdf": dev_info.bdfId,
+                "bdf": dev_bdf,
+                "numa": dev_numa,
             }
+            if dev_card_id is not None:
+                dev_appendix["card_id"] = dev_card_id
+            if dev_renderd_id is not None:
+                dev_appendix["renderd_id"] = dev_renderd_id
 
             ret.append(
                 Device(
@@ -226,35 +244,11 @@ class MetaXDetector(Detector):
             pymxsml.mxSmlInit()
 
             for i, dev_i in enumerate(devices):
-                # Get affinity with PCIe BDF if possible.
-                if dev_i_bdf := dev_i.appendix.get("bdf", ""):
-                    ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
-                        dev_i_bdf,
-                    )
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
-                # Otherwise, get affinity by MXSML.
-                if not ret.devices_cpu_affinities[i]:
-                    # Get NUMA affinity.
-                    try:
-                        dev_i_nodeaff = pymxsml.mxSmlGetNodeAffinity(
-                            dev_i.index,
-                            get_numa_nodeset_size(),
-                        )
-                        ret.devices_numa_affinities[i] = bitmask_to_str(
-                            list(dev_i_nodeaff),
-                        )
-                    except pymxsml.MXSMLError:
-                        debug_log_warning(
-                            logger,
-                            "Failed to get device %d NUMA node affinity",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
 
                 # Get distances to other devices.
                 for j, dev_j in enumerate(devices):
@@ -281,11 +275,34 @@ class MetaXDetector(Detector):
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
-        except pymxsml.MXSMLError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
 
         return ret
+
+
+def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
+    """
+    Get the card ID and renderD ID for a given device bdf.
+
+    Args:
+        dev_bdf:
+            The device bdf.
+
+    Returns:
+        A tuple of (card_id, renderd_id).
+
+    """
+    card_id = None
+    renderd_id = None
+
+    drm_path = Path(f"/sys/module/metax/drivers/pci:metax/{dev_bdf}/drm")
+    if drm_path.exists():
+        for dir_path in drm_path.iterdir():
+            if dir_path.name.startswith("card"):
+                card_id = int(dir_path.name[4:])
+            elif dir_path.name.startswith("renderD"):
+                renderd_id = int(dir_path.name[7:])
+
+    return card_id, renderd_id
gpustack_runtime/detector/mthreads.py
@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import logging
 from functools import lru_cache
@@ -47,7 +47,7 @@ class MThreadsDetector(Detector):
     """
 
    @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def is_supported() -> bool:
         """
         Check if the MThreads detector is supported.
@@ -76,7 +76,7 @@ class MThreadsDetector(Detector):
         return supported
 
    @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
    def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Moore+Threads.
         pci_devs = get_pci_devices(vendor="0x1ed5")
@@ -117,6 +117,7 @@ class MThreadsDetector(Detector):
             dev_cores = 0
             dev_power_used = None
             dev_pci_info = None
+            dev_is_vgpu = False
             dev = pymtml.mtmlLibraryInitDeviceByIndex(dev_idx)
             try:
                 dev_props = pymtml.mtmlDeviceGetProperty(dev)
@@ -163,9 +164,20 @@ class MThreadsDetector(Detector):
 
             dev_bdf = f"{dev_pci_info.segment:04x}:{dev_pci_info.bus:02x}:{dev_pci_info.device:02x}.0"
 
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_node_affinity = pymtml.mtmlDeviceGetMemoryAffinityWithinNode(
+                    dev,
+                    get_numa_nodeset_size(),
+                )
+                dev_numa = bitmask_to_str(
+                    list(dev_node_affinity),
+                )
+
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
                 "bdf": dev_bdf,
+                "numa": dev_numa,
             }
 
             ret.append(
@@ -228,35 +240,24 @@ class MThreadsDetector(Detector):
                 dev_i_handle = pymtml.mtmlLibraryInitDeviceByIndex(dev_i.index)
 
                 try:
-                    # Get affinity with PCIe BDF if possible.
-                    if dev_i_bdf := dev_i.appendix.get("bdf", ""):
-                        ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
-                            dev_i_bdf,
-                        )
-                        ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                            ret.devices_numa_affinities[i],
-                        )
-                    # Otherwise, get affinity via MTML.
-                    if not ret.devices_cpu_affinities[i]:
-                        # Get NUMA affinity.
-                        try:
-                            dev_i_memset = pymtml.mtmlDeviceGetMemoryAffinityWithinNode(
-                                dev_i_handle,
-                                get_numa_nodeset_size(),
-                            )
-                            ret.devices_numa_affinities[i] = bitmask_to_str(
-                                list(dev_i_memset),
-                            )
-                        except pymtml.MTMLError:
-                            debug_log_warning(
-                                logger,
-                                "Failed to get NUMA affinity for device %d",
-                                dev_i.index,
-                            )
-                        # Get CPU affinity.
-                        ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                            ret.devices_numa_affinities[i],
-                        )
+                    # Get NUMA and CPU affinities.
+                    ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                        ret.devices_numa_affinities[i],
+                    )
+
+                    # Get links state if applicable.
+                    if dev_i_links_state := _get_links_state(dev_i_handle):
+                        ret.appendices[i].update(dev_i_links_state)
+                        # In practice, if a card has an active *Link,
+                        # then other cards in the same machine should be interconnected with it through the *Link.
+                        if dev_i_links_state.get("links_active_count", 0) > 0:
+                            for j, dev_j in enumerate(devices):
+                                if dev_i.index == dev_j.index:
+                                    continue
+                                ret.devices_distances[i][j] = TopologyDistanceEnum.LINK
+                                ret.devices_distances[j][i] = TopologyDistanceEnum.LINK
+                            continue
 
                     # Get distances to other devices.
                     for j, dev_j in enumerate(devices):
@@ -278,7 +279,6 @@ class MThreadsDetector(Detector):
                                 topo,
                                 distance,
                             )
-                            # TODO(thxCode): Support LINK distance.
                         except pymtml.MTMLError:
                             debug_log_warning(
                                 logger,
@@ -295,9 +295,6 @@ class MThreadsDetector(Detector):
                 finally:
                     pymtml.mtmlLibraryFreeDevice(dev_i_handle)
 
-        except pymtml.MTMLError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
@@ -305,3 +302,44 @@ class MThreadsDetector(Detector):
             pymtml.mtmlLibraryShutDown()
 
         return ret
+
+
+def _get_links_state(
+    dev: pymtml.c_mtmlDevice_t,
+) -> dict | None:
+    """
+    Get the MTLink links count and state for a device.
+
+    Args:
+        dev:
+            The MTLink device handle.
+
+    Returns:
+        A dict includes links state or None if failed.
+
+    """
+    dev_links_count = 0
+    try:
+        dev_link_spec = pymtml.mtmlDeviceGetMtLinkSpec(dev)
+        dev_links_count = dev_link_spec.linkNum
+    except pymtml.MTMLError:
+        debug_log_warning(logger, "Failed to get MTLink links count")
+    if not dev_links_count:
+        return None
+
+    dev_links_state = 0
+    dev_links_active_count = 0
+    try:
+        for link_idx in range(int(dev_links_count)):
+            dev_link_state = pymtml.mtmlDeviceGetMtLinkState(dev, link_idx)
+            if dev_link_state == pymtml.MTML_MTLINK_STATE_UP:
+                dev_links_state |= 1 << link_idx
+                dev_links_active_count += 1
+    except pymtml.MTMLError:
+        debug_log_warning(logger, "Failed to get MTLink link state")
+
+    return {
+        "links_count": dev_links_count,
+        "links_state": dev_links_state,
+        "links_active_count": dev_links_active_count,
+    }
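
The returned links_state packs per-link status into a bitmask, one bit per link index, so the appendix records both which links are up and how many. For example, on a device with four MTLink ports where links 0 and 2 are up:

    # Standalone illustration of the bitmask packing above.
    UP, DOWN = 1, 0            # stand-ins for the MTML state constants
    states = [UP, DOWN, UP, DOWN]

    links_state = 0
    links_active_count = 0
    for link_idx, state in enumerate(states):
        if state == UP:
            links_state |= 1 << link_idx
            links_active_count += 1

    print(bin(links_state), links_active_count)  # 0b101 2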