gpustack-runtime 0.1.40.post1-py3-none-any.whl → 0.1.41.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/__init__.py +1 -1
- gpustack_runtime/__main__.py +5 -3
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/__init__.py +5 -3
- gpustack_runtime/cmds/__types__.py +1 -1
- gpustack_runtime/cmds/deployer.py +140 -18
- gpustack_runtime/cmds/detector.py +1 -1
- gpustack_runtime/cmds/images.py +1 -1
- gpustack_runtime/deployer/__init__.py +28 -2
- gpustack_runtime/deployer/__patches__.py +1 -1
- gpustack_runtime/deployer/__types__.py +2 -1
- gpustack_runtime/deployer/__utils__.py +2 -2
- gpustack_runtime/deployer/cdi/__init__.py +86 -5
- gpustack_runtime/deployer/cdi/__types__.py +92 -29
- gpustack_runtime/deployer/cdi/__utils__.py +180 -0
- gpustack_runtime/deployer/cdi/amd.py +146 -0
- gpustack_runtime/deployer/cdi/ascend.py +164 -0
- gpustack_runtime/deployer/cdi/hygon.py +147 -0
- gpustack_runtime/deployer/cdi/iluvatar.py +136 -0
- gpustack_runtime/deployer/cdi/metax.py +148 -0
- gpustack_runtime/deployer/cdi/thead.py +57 -23
- gpustack_runtime/deployer/docker.py +9 -8
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +325 -0
- gpustack_runtime/deployer/k8s/deviceplugin/__types__.py +131 -0
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +590 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py +3 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto +212 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py +86 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi +168 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py +358 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py +34 -0
- gpustack_runtime/deployer/kuberentes.py +50 -4
- gpustack_runtime/deployer/podman.py +9 -8
- gpustack_runtime/detector/__init__.py +42 -5
- gpustack_runtime/detector/__types__.py +8 -24
- gpustack_runtime/detector/__utils__.py +46 -39
- gpustack_runtime/detector/amd.py +55 -66
- gpustack_runtime/detector/ascend.py +29 -41
- gpustack_runtime/detector/cambricon.py +3 -3
- gpustack_runtime/detector/hygon.py +21 -49
- gpustack_runtime/detector/iluvatar.py +44 -60
- gpustack_runtime/detector/metax.py +54 -37
- gpustack_runtime/detector/mthreads.py +74 -36
- gpustack_runtime/detector/nvidia.py +130 -93
- gpustack_runtime/detector/pyacl/__init__.py +1 -1
- gpustack_runtime/detector/pyamdgpu/__init__.py +1 -1
- gpustack_runtime/detector/pyamdsmi/__init__.py +1 -1
- gpustack_runtime/detector/pycuda/__init__.py +1 -1
- gpustack_runtime/detector/pydcmi/__init__.py +1 -1
- gpustack_runtime/detector/pyhsa/__init__.py +1 -1
- gpustack_runtime/detector/pymxsml/__init__.py +1553 -1
- gpustack_runtime/detector/pyrocmcore/__init__.py +1 -1
- gpustack_runtime/detector/pyrocmsmi/__init__.py +1 -1
- gpustack_runtime/detector/thead.py +41 -60
- gpustack_runtime/envs.py +106 -12
- gpustack_runtime/logging.py +6 -2
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/METADATA +6 -1
- gpustack_runtime-0.1.41.post1.dist-info/RECORD +67 -0
- gpustack_runtime/detector/pymxsml/mxsml.py +0 -1580
- gpustack_runtime/detector/pymxsml/mxsml_extension.py +0 -816
- gpustack_runtime/detector/pymxsml/mxsml_mcm.py +0 -476
- gpustack_runtime-0.1.40.post1.dist-info/RECORD +0 -55
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/licenses/LICENSE +0 -0
The hunks below are the rendered diffs for the four detector modules whose device detection and NUMA/topology logic changed in this release: hygon.py, iluvatar.py, metax.py, and mthreads.py.

gpustack_runtime/detector/hygon.py

@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import contextlib
 import logging

@@ -30,7 +30,7 @@ class HygonDetector(Detector):
     """
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def is_supported() -> bool:
         """
         Check if the Hygon detector is supported.

@@ -58,7 +58,7 @@ class HygonDetector(Detector):
         return supported
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Higon.
         pci_devs = get_pci_devices(vendor="0x1d94")
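A note on the recurring decorator change above, which repeats in all four modules: functools.lru_cache used bare defaults to a 128-slot cache, while a zero-argument staticmethod can only ever occupy one slot, so maxsize=1 states the real bound. A minimal, self-contained sketch of the pattern (the probe body here is hypothetical, not the package's):

from functools import lru_cache


@lru_cache(maxsize=1)
def is_supported() -> bool:
    # Imagine an expensive probe here (library load, sysfs scan, ...);
    # it runs once, and every later call returns the memoized result.
    print("probing hardware ...")
    return True


is_supported()  # prints "probing hardware ...", returns True
is_supported()  # cache hit: no second probe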
@@ -120,12 +120,8 @@ class HygonDetector(Detector):
             with contextlib.suppress(pyrocmsmi.ROCMSMIError):
                 dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
 
-            dev_bdf = None
-            dev_card_id = None
-            dev_renderd_id = None
-            with contextlib.suppress(Exception):
-                dev_bdf = pyrocmsmi.rsmi_dev_pci_id_get(dev_idx)
-                dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
+            dev_bdf = pyrocmsmi.rsmi_dev_pci_id_get(dev_idx)
+            dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
 
             dev_cores = dev_hsa_agent.compute_units
             if not dev_cores and dev_card_id is not None:

@@ -157,15 +153,17 @@ class HygonDetector(Detector):
                 dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
                 dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
 
-            dev_is_vgpu =
-
-
+            dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_numa = str(pyrocmsmi.rsmi_topo_get_numa_node_number(dev_idx))
 
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
+                "bdf": dev_bdf,
+                "numa": dev_numa,
             }
-            if dev_bdf is not None:
-                dev_appendix["bdf"] = dev_bdf
             if dev_card_id is not None:
                 dev_appendix["card_id"] = dev_card_id
             if dev_renderd_id is not None:
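The new per-device fields lean on two helpers, get_numa_node_by_bdf and get_physical_function_by_bdf, whose bodies are outside this diff. A hedged sketch of what such helpers typically read on Linux sysfs; the real implementations in gpustack_runtime/detector/__utils__.py may differ:

from pathlib import Path


def get_numa_node_by_bdf(bdf: str) -> str:
    # /sys/bus/pci/devices/<bdf>/numa_node holds the NUMA node id,
    # or "-1" when the platform does not report NUMA locality.
    node_path = Path(f"/sys/bus/pci/devices/{bdf}/numa_node")
    if node_path.exists():
        node = node_path.read_text().strip()
        if node != "-1":
            return node
    return ""


def get_physical_function_by_bdf(bdf: str) -> str:
    # For an SR-IOV virtual function, "physfn" is a symlink to the parent
    # physical function; for a physical function the link is absent.
    # That is what makes the vGPU test in the hunk above work:
    # get_physical_function_by_bdf(bdf) != bdf exactly when bdf is a VF.
    physfn = Path(f"/sys/bus/pci/devices/{bdf}/physfn")
    if physfn.is_symlink():
        return physfn.resolve().name
    return bdf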
@@ -253,37 +251,14 @@ class HygonDetector(Detector):
 
             pyrocmsmi.rsmi_init()
 
-            # Get NUMA and CPU affinities.
             for i, dev_i in enumerate(devices):
-                # Get
-
-
-
-
-                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                    ret.devices_numa_affinities[i],
-                )
-                # Otherwise, get affinity via ROCM SMI.
-                if not ret.devices_numa_affinities[i]:
-                    # Get NUMA affinity.
-                    try:
-                        dev_i_numa_node = pyrocmsmi.rsmi_topo_get_numa_node_number(
-                            dev_i.index,
-                        )
-                        ret.devices_numa_affinities[i] = str(dev_i_numa_node)
-                    except pyrocmsmi.ROCMSMIError:
-                        debug_log_exception(
-                            logger,
-                            "Failed to get NUMA affinity for device %d",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
 
-
-            for i, dev_i in enumerate(devices):
+                # Get distances to other devices.
                 for j, dev_j in enumerate(devices):
                     if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                         continue

@@ -326,9 +301,6 @@ class HygonDetector(Detector):
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
-        except pyrocmsmi.ROCMSMIError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
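The refactored loop now just copies the NUMA node cached in the device appendix at detection time and maps it to CPUs. map_numa_node_to_cpu_affinity is also a utility whose body is outside this diff; on Linux the mapping is typically a one-file read, sketched here under that assumption (the real helper presumably also handles multi-node strings such as "0,1"):

from pathlib import Path


def map_numa_node_to_cpu_affinity(numa: str) -> str:
    # /sys/devices/system/node/node<N>/cpulist holds ranges like "0-15,32-47".
    if not numa:
        return ""
    cpulist = Path(f"/sys/devices/system/node/node{numa}/cpulist")
    if cpulist.exists():
        return cpulist.read_text().strip()
    return ""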
@@ -351,12 +323,12 @@ def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
     card_id = None
     renderd_id = None
 
-    for
+    for drm_path in [
         Path(f"/sys/module/hycu/drivers/pci:hycu/{dev_bdf}/drm"),
         Path(f"/sys/module/hydcu/drivers/pci:hydcu/{dev_bdf}/drm"),
     ]:
-        if
-        for dir_path in
+        if drm_path.exists():
+            for dir_path in drm_path.iterdir():
                 if dir_path.name.startswith("card"):
                     card_id = int(dir_path.name[4:])
                 elif dir_path.name.startswith("renderD"):
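An illustrative call of the rewritten loop above, with a hypothetical BDF. On a host where the hydcu driver is bound, the drm directory contains entries such as card0 and renderD128, which correspond to /dev/dri/card0 and /dev/dri/renderD128:

card_id, renderd_id = _get_card_and_renderd_id("0000:03:00.0")
# e.g. card_id == 0 and renderd_id == 128 when the drm directory holds
# "card0" and "renderD128"; both stay None if neither sysfs path exists.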
gpustack_runtime/detector/iluvatar.py

@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import contextlib
 import logging

@@ -37,7 +37,7 @@ class IluvatarDetector(Detector):
     """
 
    @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def is_supported() -> bool:
         """
         Check if the Iluvatar detector is supported.

@@ -66,7 +66,7 @@ class IluvatarDetector(Detector):
         return supported
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Iluvatar.
         pci_devs = get_pci_devices(vendor="0x1e3e")
@@ -99,29 +99,36 @@ class IluvatarDetector(Detector):
 
         sys_driver_ver = pyixml.nvmlSystemGetDriverVersion()
 
-        sys_runtime_ver_original =
-
-
-
-
-
-
-
-
-
-
-
-
-
+        sys_runtime_ver_original = None
+        sys_runtime_ver = None
+        with contextlib.suppress(pyixml.NVMLError):
+            sys_runtime_ver_original = pyixml.nvmlSystemGetCudaDriverVersion()
+            sys_runtime_ver_original = ".".join(
+                map(
+                    str,
+                    [
+                        sys_runtime_ver_original // 1000,
+                        (sys_runtime_ver_original % 1000) // 10,
+                        (sys_runtime_ver_original % 10),
+                    ],
+                ),
+            )
+            sys_runtime_ver = get_brief_version(
+                sys_runtime_ver_original,
+            )
 
         dev_count = pyixml.nvmlDeviceGetCount()
         for dev_idx in range(dev_count):
             dev = pyixml.nvmlDeviceGetHandleByIndex(dev_idx)
 
             dev_index = dev_idx
-
+            if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+                dev_index = pyixml.nvmlDeviceGetMinorNumber(dev)
+
             dev_name = pyixml.nvmlDeviceGetName(dev)
 
+            dev_uuid = pyixml.nvmlDeviceGetUUID(dev)
+
             dev_cores = None
             with contextlib.suppress(pyixml.NVMLError):
                 dev_cores = pyixml.nvmlDeviceGetNumGpuCores(dev)
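The added block decodes the integer that nvmlSystemGetCudaDriverVersion returns, packed CUDA-style as major*1000 + minor*10 + patch. A worked example with an assumed raw value:

v = 10020  # assumed raw value for a 10.2.0 runtime
print(".".join(map(str, [v // 1000, (v % 1000) // 10, v % 10])))
# -> "10.2.0"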
@@ -171,20 +178,25 @@ class IluvatarDetector(Detector):
             if dev_cc_t:
                 dev_cc = ".".join(map(str, dev_cc_t))
 
-
-
-
-
+            dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
+            dev_bdf = str(dev_pci_info.busIdLegacy).lower()
+
+            dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
 
-
-            if
-
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_node_affinity = pyixml.nvmlDeviceGetMemoryAffinity(
+                    dev,
+                    get_numa_nodeset_size(),
+                    pyixml.NVML_AFFINITY_SCOPE_NODE,
+                )
+                dev_numa = bitmask_to_str(list(dev_node_affinity))
 
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
+                "bdf": dev_bdf,
+                "numa": dev_numa,
             }
-            if dev_bdf:
-                dev_appendix["bdf"] = dev_bdf
 
             ret.append(
                 Device(
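Both the fallback above and the Hygon/MetaX variants funnel the affinity result through bitmask_to_str, whose body is not part of this diff. A sketch assuming the common NVML-style layout, where the call returns an array of 64-bit words and bit k of word w marks node w*64+k:

def bitmask_to_str(words: list[int]) -> str:
    # Render every set bit as a node id; the real helper in
    # __utils__.py may collapse runs into ranges instead.
    nodes = [
        w * 64 + k
        for w, word in enumerate(words)
        for k in range(64)
        if word >> k & 1
    ]
    return ",".join(map(str, nodes))


print(bitmask_to_str([0b11]))  # "0,1": local to NUMA nodes 0 and 1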
@@ -247,36 +259,11 @@ class IluvatarDetector(Detector):
             for i, dev_i in enumerate(devices):
                 dev_i_handle = pyixml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
 
-                # Get
-
-
-
-
-                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                    ret.devices_numa_affinities[i],
-                )
-                # Otherwise, get affinity via IXML.
-                if not ret.devices_cpu_affinities[i]:
-                    # Get NUMA affinity.
-                    try:
-                        dev_i_memset = pyixml.nvmlDeviceGetMemoryAffinity(
-                            dev_i_handle,
-                            get_numa_nodeset_size(),
-                            pyixml.NVML_AFFINITY_SCOPE_NODE,
-                        )
-                        ret.devices_numa_affinities[i] = bitmask_to_str(
-                            list(dev_i_memset),
-                        )
-                    except pyixml.NVMLError:
-                        debug_log_exception(
-                            logger,
-                            "Failed to get NUMA affinity for device %d",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
 
                 # Get distances to other devices.
                 for j, dev_j in enumerate(devices):

@@ -302,9 +289,6 @@ class IluvatarDetector(Detector):
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
-        except pyixml.NVMLError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
gpustack_runtime/detector/metax.py

@@ -1,7 +1,8 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import logging
 from functools import lru_cache
+from pathlib import Path
 
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning

@@ -48,7 +49,7 @@ class MetaXDetector(Detector):
     """
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def is_supported() -> bool:
         """
         Check if the MetaX detector is supported.

@@ -76,7 +77,7 @@ class MetaXDetector(Detector):
         return supported
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=MetaX.
         pci_devs = get_pci_devices(vendor="0x9999")
@@ -124,7 +125,6 @@ class MetaXDetector(Detector):
             dev_name = dev_info.deviceName
             if dev_info.mode == pymxsml.MXSML_VIRTUALIZATION_MODE_PF:
                 continue
-            dev_is_vgpu = dev_info.mode == pymxsml.MXSML_VIRTUALIZATION_MODE_VF
 
             dev_core_util = pymxsml.mxSmlGetDeviceIpUsage(
                 dev_idx,

@@ -165,10 +165,28 @@ class MetaXDetector(Detector):
                 // 1000  # mW to W
             )
 
+            dev_bdf = dev_info.bdfId
+            dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
+
+            dev_is_vgpu = dev_info.mode == pymxsml.MXSML_VIRTUALIZATION_MODE_VF
+
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_node_affinity = pymxsml.mxSmlGetNodeAffinity(
+                    dev_idx,
+                    get_numa_nodeset_size(),
+                )
+                dev_numa = bitmask_to_str(list(dev_node_affinity))
+
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
-                "bdf":
+                "bdf": dev_bdf,
+                "numa": dev_numa,
             }
+            if dev_card_id is not None:
+                dev_appendix["card_id"] = dev_card_id
+            if dev_renderd_id is not None:
+                dev_appendix["renderd_id"] = dev_renderd_id
 
             ret.append(
                 Device(
@@ -226,35 +244,11 @@ class MetaXDetector(Detector):
             pymxsml.mxSmlInit()
 
             for i, dev_i in enumerate(devices):
-                # Get
-
-
-
-
-                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                    ret.devices_numa_affinities[i],
-                )
-                # Otherwise, get affinity by MXSML.
-                if not ret.devices_cpu_affinities[i]:
-                    # Get NUMA affinity.
-                    try:
-                        dev_i_nodeaff = pymxsml.mxSmlGetNodeAffinity(
-                            dev_i.index,
-                            get_numa_nodeset_size(),
-                        )
-                        ret.devices_numa_affinities[i] = bitmask_to_str(
-                            list(dev_i_nodeaff),
-                        )
-                    except pymxsml.MXSMLError:
-                        debug_log_warning(
-                            logger,
-                            "Failed to get device %d NUMA node affinity",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
+                # Get NUMA and CPU affinities.
+                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                    ret.devices_numa_affinities[i],
+                )
 
                 # Get distances to other devices.
                 for j, dev_j in enumerate(devices):
@@ -281,11 +275,34 @@ class MetaXDetector(Detector):
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
-        except pymxsml.MXSMLError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
 
         return ret
+
+
+def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
+    """
+    Get the card ID and renderD ID for a given device bdf.
+
+    Args:
+        dev_bdf:
+            The device bdf.
+
+    Returns:
+        A tuple of (card_id, renderd_id).
+
+    """
+    card_id = None
+    renderd_id = None
+
+    drm_path = Path(f"/sys/module/metax/drivers/pci:metax/{dev_bdf}/drm")
+    if drm_path.exists():
+        for dir_path in drm_path.iterdir():
+            if dir_path.name.startswith("card"):
+                card_id = int(dir_path.name[4:])
+            elif dir_path.name.startswith("renderD"):
+                renderd_id = int(dir_path.name[7:])
+
+    return card_id, renderd_id
gpustack_runtime/detector/mthreads.py

@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations as __future_annotations__
 
 import logging
 from functools import lru_cache

@@ -47,7 +47,7 @@ class MThreadsDetector(Detector):
     """
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def is_supported() -> bool:
         """
         Check if the MThreads detector is supported.

@@ -76,7 +76,7 @@ class MThreadsDetector(Detector):
         return supported
 
     @staticmethod
-    @lru_cache
+    @lru_cache(maxsize=1)
     def detect_pci_devices() -> dict[str, PCIDevice]:
         # See https://pcisig.com/membership/member-companies?combine=Moore+Threads.
         pci_devs = get_pci_devices(vendor="0x1ed5")
@@ -117,6 +117,7 @@ class MThreadsDetector(Detector):
             dev_cores = 0
             dev_power_used = None
             dev_pci_info = None
+            dev_is_vgpu = False
             dev = pymtml.mtmlLibraryInitDeviceByIndex(dev_idx)
             try:
                 dev_props = pymtml.mtmlDeviceGetProperty(dev)

@@ -163,9 +164,20 @@ class MThreadsDetector(Detector):
 
             dev_bdf = f"{dev_pci_info.segment:04x}:{dev_pci_info.bus:02x}:{dev_pci_info.device:02x}.0"
 
+            dev_numa = get_numa_node_by_bdf(dev_bdf)
+            if not dev_numa:
+                dev_node_affinity = pymtml.mtmlDeviceGetMemoryAffinityWithinNode(
+                    dev,
+                    get_numa_nodeset_size(),
+                )
+                dev_numa = bitmask_to_str(
+                    list(dev_node_affinity),
+                )
+
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
                 "bdf": dev_bdf,
+                "numa": dev_numa,
             }
 
             ret.append(
@@ -228,35 +240,24 @@ class MThreadsDetector(Detector):
                 dev_i_handle = pymtml.mtmlLibraryInitDeviceByIndex(dev_i.index)
 
                 try:
-                    # Get
-
-
-
-
-
-
-
-
-                    #
-
-
-
-
-
-
-                    )
-                    except pymtml.MTMLError:
-                        debug_log_warning(
-                            logger,
-                            "Failed to get NUMA affinity for device %d",
-                            dev_i.index,
-                        )
-                    # Get CPU affinity.
-                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
-                        ret.devices_numa_affinities[i],
-                    )
+                    # Get NUMA and CPU affinities.
+                    ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
+                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                        ret.devices_numa_affinities[i],
+                    )
+
+                    # Get links state if applicable.
+                    if dev_i_links_state := _get_links_state(dev_i_handle):
+                        ret.appendices[i].update(dev_i_links_state)
+                        # In practice, if a card has an active *Link,
+                        # then other cards in the same machine should be interconnected with it through the *Link.
+                        if dev_i_links_state.get("links_active_count", 0) > 0:
+                            for j, dev_j in enumerate(devices):
+                                if dev_i.index == dev_j.index:
+                                    continue
+                                ret.devices_distances[i][j] = TopologyDistanceEnum.LINK
+                                ret.devices_distances[j][i] = TopologyDistanceEnum.LINK
+                            continue
 
                     # Get distances to other devices.
                     for j, dev_j in enumerate(devices):
@@ -278,7 +279,6 @@ class MThreadsDetector(Detector):
                         topo,
                         distance,
                     )
-                    # TODO(thxCode): Support LINK distance.
                 except pymtml.MTMLError:
                     debug_log_warning(
                         logger,

@@ -295,9 +295,6 @@ class MThreadsDetector(Detector):
             finally:
                 pymtml.mtmlLibraryFreeDevice(dev_i_handle)
 
-        except pymtml.MTMLError:
-            debug_log_exception(logger, "Failed to fetch topology")
-            raise
         except Exception:
             debug_log_exception(logger, "Failed to process topology fetching")
             raise
@@ -305,3 +302,44 @@ class MThreadsDetector(Detector):
             pymtml.mtmlLibraryShutDown()
 
         return ret
+
+
+def _get_links_state(
+    dev: pymtml.c_mtmlDevice_t,
+) -> dict | None:
+    """
+    Get the MTLink links count and state for a device.
+
+    Args:
+        dev:
+            The MTLink device handle.
+
+    Returns:
+        A dict includes links state or None if failed.
+
+    """
+    dev_links_count = 0
+    try:
+        dev_link_spec = pymtml.mtmlDeviceGetMtLinkSpec(dev)
+        dev_links_count = dev_link_spec.linkNum
+    except pymtml.MTMLError:
+        debug_log_warning(logger, "Failed to get MTLink links count")
+    if not dev_links_count:
+        return None
+
+    dev_links_state = 0
+    dev_links_active_count = 0
+    try:
+        for link_idx in range(int(dev_links_count)):
+            dev_link_state = pymtml.mtmlDeviceGetMtLinkState(dev, link_idx)
+            if dev_link_state == pymtml.MTML_MTLINK_STATE_UP:
+                dev_links_state |= 1 << link_idx
+                dev_links_active_count += 1
+    except pymtml.MTMLError:
+        debug_log_warning(logger, "Failed to get MTLink link state")
+
+    return {
+        "links_count": dev_links_count,
+        "links_state": dev_links_state,
+        "links_active_count": dev_links_active_count,
+    }