gpustack-runtime 0.1.40.post1__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/__init__.py +1 -1
- gpustack_runtime/__main__.py +5 -3
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/__init__.py +5 -3
- gpustack_runtime/cmds/__types__.py +1 -1
- gpustack_runtime/cmds/deployer.py +140 -18
- gpustack_runtime/cmds/detector.py +1 -1
- gpustack_runtime/cmds/images.py +1 -1
- gpustack_runtime/deployer/__init__.py +28 -2
- gpustack_runtime/deployer/__patches__.py +1 -1
- gpustack_runtime/deployer/__types__.py +2 -1
- gpustack_runtime/deployer/__utils__.py +2 -2
- gpustack_runtime/deployer/cdi/__init__.py +85 -5
- gpustack_runtime/deployer/cdi/__types__.py +92 -29
- gpustack_runtime/deployer/cdi/__utils__.py +178 -0
- gpustack_runtime/deployer/cdi/amd.py +146 -0
- gpustack_runtime/deployer/cdi/ascend.py +164 -0
- gpustack_runtime/deployer/cdi/hygon.py +147 -0
- gpustack_runtime/deployer/cdi/iluvatar.py +136 -0
- gpustack_runtime/deployer/cdi/metax.py +148 -0
- gpustack_runtime/deployer/cdi/thead.py +57 -23
- gpustack_runtime/deployer/docker.py +9 -8
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +240 -0
- gpustack_runtime/deployer/k8s/deviceplugin/__types__.py +131 -0
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +586 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py +3 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto +212 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py +86 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi +168 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py +358 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py +34 -0
- gpustack_runtime/deployer/kuberentes.py +37 -4
- gpustack_runtime/deployer/podman.py +9 -8
- gpustack_runtime/detector/__init__.py +42 -5
- gpustack_runtime/detector/__types__.py +8 -24
- gpustack_runtime/detector/__utils__.py +46 -39
- gpustack_runtime/detector/amd.py +55 -66
- gpustack_runtime/detector/ascend.py +29 -41
- gpustack_runtime/detector/cambricon.py +3 -3
- gpustack_runtime/detector/hygon.py +21 -49
- gpustack_runtime/detector/iluvatar.py +44 -60
- gpustack_runtime/detector/metax.py +54 -37
- gpustack_runtime/detector/mthreads.py +74 -36
- gpustack_runtime/detector/nvidia.py +130 -93
- gpustack_runtime/detector/pyacl/__init__.py +1 -1
- gpustack_runtime/detector/pyamdgpu/__init__.py +1 -1
- gpustack_runtime/detector/pyamdsmi/__init__.py +1 -1
- gpustack_runtime/detector/pycuda/__init__.py +1 -1
- gpustack_runtime/detector/pydcmi/__init__.py +1 -1
- gpustack_runtime/detector/pyhsa/__init__.py +1 -1
- gpustack_runtime/detector/pymxsml/__init__.py +1553 -1
- gpustack_runtime/detector/pyrocmcore/__init__.py +1 -1
- gpustack_runtime/detector/pyrocmsmi/__init__.py +1 -1
- gpustack_runtime/detector/thead.py +41 -60
- gpustack_runtime/envs.py +104 -12
- gpustack_runtime/logging.py +6 -2
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/METADATA +6 -1
- gpustack_runtime-0.1.41.dist-info/RECORD +67 -0
- gpustack_runtime/detector/pymxsml/mxsml.py +0 -1580
- gpustack_runtime/detector/pymxsml/mxsml_extension.py +0 -816
- gpustack_runtime/detector/pymxsml/mxsml_mcm.py +0 -476
- gpustack_runtime-0.1.40.post1.dist-info/RECORD +0 -55
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
1
|
+
from __future__ import annotations as __future_annotations__
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
4
|
from dataclasses import dataclass
|
|
@@ -122,28 +122,6 @@ def backend_to_manufacturer(backend: str) -> ManufacturerEnum:
|
|
|
122
122
|
return ManufacturerEnum.UNKNOWN
|
|
123
123
|
|
|
124
124
|
|
|
125
|
-
def supported_manufacturers() -> list[ManufacturerEnum]:
|
|
126
|
-
"""
|
|
127
|
-
Get a list of supported manufacturers.
|
|
128
|
-
|
|
129
|
-
Returns:
|
|
130
|
-
A list of supported manufacturers.
|
|
131
|
-
|
|
132
|
-
"""
|
|
133
|
-
return list(_MANUFACTURER_BACKEND_MAPPING.keys())
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
def supported_backends() -> list[str]:
|
|
137
|
-
"""
|
|
138
|
-
Get a list of supported backends.
|
|
139
|
-
|
|
140
|
-
Returns:
|
|
141
|
-
A list of supported backends.
|
|
142
|
-
|
|
143
|
-
"""
|
|
144
|
-
return list(_MANUFACTURER_BACKEND_MAPPING.values())
|
|
145
|
-
|
|
146
|
-
|
|
147
125
|
@dataclass_json
|
|
148
126
|
@dataclass
|
|
149
127
|
class Device:
|
|
@@ -258,6 +236,11 @@ class Topology:
|
|
|
258
236
|
A list representing the NUMA affinity associated with each device.
|
|
259
237
|
The value at index i represents the Memory set for device i.
|
|
260
238
|
"""
|
|
239
|
+
appendices: list[dict[str, Any]]
|
|
240
|
+
"""
|
|
241
|
+
Appendices information of devices.
|
|
242
|
+
Each entry corresponds to a device and contains additional metadata.
|
|
243
|
+
"""
|
|
261
244
|
|
|
262
245
|
def __init__(
|
|
263
246
|
self,
|
|
@@ -278,6 +261,7 @@ class Topology:
|
|
|
278
261
|
self.devices_distances = [[0] * devices_count for _ in range(devices_count)]
|
|
279
262
|
self.devices_cpu_affinities = [""] * devices_count
|
|
280
263
|
self.devices_numa_affinities = [""] * devices_count
|
|
264
|
+
self.appendices = [{}] * devices_count
|
|
281
265
|
|
|
282
266
|
def stringify(self) -> list[list[str]]:
|
|
283
267
|
"""
|
|
@@ -501,7 +485,7 @@ class Detector(ABC):
|
|
|
501
485
|
"""
|
|
502
486
|
raise NotImplementedError
|
|
503
487
|
|
|
504
|
-
def get_topology(self, devices: Devices | None = None) -> Topology | None:
|
|
488
|
+
def get_topology(self, devices: Devices | None = None) -> Topology | None:
|
|
505
489
|
"""
|
|
506
490
|
Get the Topology object between the given devices.
|
|
507
491
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
1
|
+
from __future__ import annotations as __future_annotations__
|
|
2
2
|
|
|
3
3
|
import contextlib
|
|
4
4
|
import os
|
|
@@ -746,7 +746,7 @@ def get_numa_node_cpu_mapping() -> dict[int, list[int]]:
|
|
|
746
746
|
return numa_cpu_mapping
|
|
747
747
|
|
|
748
748
|
|
|
749
|
-
@lru_cache
|
|
749
|
+
@lru_cache
|
|
750
750
|
def get_numa_node_by_bdf(bdf: str) -> str:
|
|
751
751
|
"""
|
|
752
752
|
Get the NUMA node for a given PCI device BDF (Bus:Device.Function) address.
|
|
@@ -792,20 +792,7 @@ def map_cpu_affinity_to_numa_node(cpu_affinity: int | str | None) -> str:
|
|
|
792
792
|
else:
|
|
793
793
|
if not cpu_affinity:
|
|
794
794
|
return ""
|
|
795
|
-
cpu_indices: list[int] = []
|
|
796
|
-
for part in cpu_affinity.split(","):
|
|
797
|
-
if "-" in part:
|
|
798
|
-
lo, hi = part.split("-")
|
|
799
|
-
lo_idx = safe_int(lo, -1)
|
|
800
|
-
hi_idx = safe_int(hi, -1)
|
|
801
|
-
if lo_idx == -1 or hi_idx == -1 or lo_idx > hi_idx:
|
|
802
|
-
continue
|
|
803
|
-
cpu_indices.extend(list(range(lo_idx, hi_idx + 1)))
|
|
804
|
-
else:
|
|
805
|
-
idx = safe_int(part, -1)
|
|
806
|
-
if idx == -1:
|
|
807
|
-
continue
|
|
808
|
-
cpu_indices.append(idx)
|
|
795
|
+
cpu_indices: list[int] = str_range_to_list(cpu_affinity)
|
|
809
796
|
|
|
810
797
|
cpu_numa_mapping = get_cpu_numa_node_mapping()
|
|
811
798
|
|
|
@@ -818,7 +805,7 @@ def map_cpu_affinity_to_numa_node(cpu_affinity: int | str | None) -> str:
|
|
|
818
805
|
if not numa_nodes:
|
|
819
806
|
return ""
|
|
820
807
|
|
|
821
|
-
return list_to_range_str(sorted(numa_nodes))
|
|
808
|
+
return list_to_str_range(sorted(numa_nodes))
|
|
822
809
|
|
|
823
810
|
|
|
824
811
|
@lru_cache
|
|
@@ -843,20 +830,7 @@ def map_numa_node_to_cpu_affinity(numa_node: int | str | None) -> str:
|
|
|
843
830
|
else:
|
|
844
831
|
if not numa_node:
|
|
845
832
|
return ""
|
|
846
|
-
numa_indices: list[int] = []
|
|
847
|
-
for part in numa_node.split(","):
|
|
848
|
-
if "-" in part:
|
|
849
|
-
lo, hi = part.split("-")
|
|
850
|
-
lo_idx = safe_int(lo, -1)
|
|
851
|
-
hi_idx = safe_int(hi, -1)
|
|
852
|
-
if lo_idx == -1 or hi_idx == -1 or lo_idx > hi_idx:
|
|
853
|
-
continue
|
|
854
|
-
numa_indices.extend(list(range(lo_idx, hi_idx + 1)))
|
|
855
|
-
else:
|
|
856
|
-
idx = safe_int(part, -1)
|
|
857
|
-
if idx == -1:
|
|
858
|
-
continue
|
|
859
|
-
numa_indices.append(idx)
|
|
833
|
+
numa_indices: list[int] = str_range_to_list(numa_node)
|
|
860
834
|
|
|
861
835
|
numa_cpu_mapping = get_numa_node_cpu_mapping()
|
|
862
836
|
|
|
@@ -867,7 +841,7 @@ def map_numa_node_to_cpu_affinity(numa_node: int | str | None) -> str:
|
|
|
867
841
|
if not cpu_cores:
|
|
868
842
|
return ""
|
|
869
843
|
|
|
870
|
-
return list_to_range_str(sorted(cpu_cores))
|
|
844
|
+
return list_to_str_range(sorted(cpu_cores))
|
|
871
845
|
|
|
872
846
|
|
|
873
847
|
def bitmask_to_list(bitmask: int, offset: int = 0) -> list[int]:
|
|
@@ -889,7 +863,7 @@ def bitmask_to_list(bitmask: int, offset: int = 0) -> list[int]:
|
|
|
889
863
|
return indices
|
|
890
864
|
|
|
891
865
|
|
|
892
|
-
def list_to_range_str(indices: list[int]) -> str:
|
|
866
|
+
def list_to_str_range(indices: list[int]) -> str:
|
|
893
867
|
"""
|
|
894
868
|
Convert a list of indices to a comma-separated string with ranges.
|
|
895
869
|
|
|
@@ -919,15 +893,48 @@ def list_to_range_str(indices: list[int]) -> str:
|
|
|
919
893
|
start, end = i, i
|
|
920
894
|
ranges.append((start, end))
|
|
921
895
|
|
|
922
|
-
|
|
896
|
+
str_range_parts: list[str] = []
|
|
923
897
|
for start, end in ranges:
|
|
924
898
|
if start == end:
|
|
925
|
-
|
|
899
|
+
str_range_parts.append(f"{start}")
|
|
926
900
|
else:
|
|
927
|
-
|
|
928
|
-
|
|
901
|
+
str_range_parts.append(f"{start}-{end}")
|
|
902
|
+
str_range = ",".join(str_range_parts)
|
|
903
|
+
|
|
904
|
+
return str_range
|
|
905
|
+
|
|
906
|
+
|
|
907
|
+
def str_range_to_list(str_range: str) -> list[int]:
|
|
908
|
+
"""
|
|
909
|
+
Convert a comma-separated string with ranges to a list of indices.
|
|
910
|
+
|
|
911
|
+
Args:
|
|
912
|
+
str_range:
|
|
913
|
+
A comma-separated string with ranges (e.g., "0,2-4,6").
|
|
914
|
+
|
|
915
|
+
Returns:
|
|
916
|
+
A list of indices.
|
|
917
|
+
|
|
918
|
+
"""
|
|
919
|
+
str_range_parts = str_range.split(",")
|
|
920
|
+
|
|
921
|
+
indices: set[int] = set()
|
|
922
|
+
for _part in str_range_parts:
|
|
923
|
+
part = _part.strip()
|
|
924
|
+
if "-" in part:
|
|
925
|
+
lo, hi = part.split("-")
|
|
926
|
+
lo_idx = safe_int(lo, -1)
|
|
927
|
+
hi_idx = safe_int(hi, -1)
|
|
928
|
+
if lo_idx == -1 or hi_idx == -1 or lo_idx > hi_idx:
|
|
929
|
+
continue
|
|
930
|
+
indices.update(range(lo_idx, hi_idx + 1))
|
|
931
|
+
else:
|
|
932
|
+
idx = safe_int(part, -1)
|
|
933
|
+
if idx == -1:
|
|
934
|
+
continue
|
|
935
|
+
indices.add(idx)
|
|
929
936
|
|
|
930
|
-
return
|
|
937
|
+
return sorted(indices)
|
|
931
938
|
|
|
932
939
|
|
|
933
940
|
def bitmask_to_str(bitmask_list: list) -> str:
|
|
@@ -950,7 +957,7 @@ def bitmask_to_str(bitmask_list: list) -> str:
|
|
|
950
957
|
bits_lists.extend(bitmask_to_list(bitmask, offset))
|
|
951
958
|
offset += get_bits_size()
|
|
952
959
|
|
|
953
|
-
return list_to_range_str(sorted(bits_lists))
|
|
960
|
+
return list_to_str_range(sorted(bits_lists))
|
|
954
961
|
|
|
955
962
|
|
|
956
963
|
def get_physical_function_by_bdf(bdf: str) -> str:
|
gpustack_runtime/detector/amd.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
1
|
+
from __future__ import annotations as __future_annotations__
|
|
2
2
|
|
|
3
3
|
import contextlib
|
|
4
4
|
import logging
|
|
@@ -30,7 +30,7 @@ class AMDDetector(Detector):
|
|
|
30
30
|
"""
|
|
31
31
|
|
|
32
32
|
@staticmethod
|
|
33
|
-
@lru_cache
|
|
33
|
+
@lru_cache(maxsize=1)
|
|
34
34
|
def is_supported() -> bool:
|
|
35
35
|
"""
|
|
36
36
|
Check if the AMD detector is supported.
|
|
@@ -59,7 +59,7 @@ class AMDDetector(Detector):
|
|
|
59
59
|
return supported
|
|
60
60
|
|
|
61
61
|
@staticmethod
|
|
62
|
-
@lru_cache
|
|
62
|
+
@lru_cache(maxsize=1)
|
|
63
63
|
def detect_pci_devices() -> dict[str, PCIDevice]:
|
|
64
64
|
# See https://pcisig.com/membership/member-companies?combine=AMD.
|
|
65
65
|
pci_devs = get_pci_devices(vendor="0x1002")
|
|
@@ -108,11 +108,7 @@ class AMDDetector(Detector):
|
|
|
108
108
|
asic_serial = dev_gpu_asic_info.get("asic_serial")
|
|
109
109
|
dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
|
|
110
110
|
else:
|
|
111
|
-
dev_uuid = ""
|
|
112
|
-
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
|
|
113
|
-
dev_uuid = (
|
|
114
|
-
f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
|
|
115
|
-
)
|
|
111
|
+
dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
|
|
116
112
|
dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
|
|
117
113
|
|
|
118
114
|
dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
|
|
@@ -132,12 +128,8 @@ class AMDDetector(Detector):
|
|
|
132
128
|
dev_idx,
|
|
133
129
|
)
|
|
134
130
|
|
|
135
|
-
dev_bdf =
|
|
136
|
-
dev_card_id =
|
|
137
|
-
dev_renderd_id = None
|
|
138
|
-
with contextlib.suppress(pyamdsmi.AmdSmiException):
|
|
139
|
-
dev_bdf = pyamdsmi.amdsmi_get_gpu_device_bdf(dev)
|
|
140
|
-
dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
|
|
131
|
+
dev_bdf = pyamdsmi.amdsmi_get_gpu_device_bdf(dev)
|
|
132
|
+
dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
|
|
141
133
|
|
|
142
134
|
dev_cores = dev_hsa_agent.compute_units
|
|
143
135
|
dev_asic_family_id = dev_hsa_agent.asic_family_id
|
|
@@ -205,27 +197,25 @@ class AMDDetector(Detector):
|
|
|
205
197
|
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
|
|
206
198
|
dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
|
|
207
199
|
|
|
208
|
-
dev_is_vgpu =
|
|
209
|
-
|
|
210
|
-
|
|
200
|
+
dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
|
|
201
|
+
|
|
202
|
+
dev_numa = get_numa_node_by_bdf(dev_bdf)
|
|
203
|
+
if not dev_numa:
|
|
204
|
+
dev_numa = str(pyamdsmi.amdsmi_topo_get_numa_node_number(dev))
|
|
211
205
|
|
|
212
206
|
dev_appendix = {
|
|
213
207
|
"arch_family": _get_arch_family(dev_asic_family_id),
|
|
214
208
|
"vgpu": dev_is_vgpu,
|
|
209
|
+
"bdf": dev_bdf,
|
|
210
|
+
"numa": dev_numa,
|
|
215
211
|
}
|
|
216
|
-
if dev_bdf:
|
|
217
|
-
dev_appendix["bdf"] = dev_bdf
|
|
218
212
|
if dev_card_id is not None:
|
|
219
213
|
dev_appendix["card_id"] = dev_card_id
|
|
220
214
|
if dev_renderd_id is not None:
|
|
221
215
|
dev_appendix["renderd_id"] = dev_renderd_id
|
|
222
216
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
if xgmi_lanes := dev_xgmi.get("xgmi_lanes", None):
|
|
226
|
-
dev_appendix["xgmi_lanes"] = xgmi_lanes
|
|
227
|
-
dev_appendix["xgmi_hive_id"] = dev_xgmi.get("xgmi_hive_id")
|
|
228
|
-
dev_appendix["xgmi_node_id"] = dev_xgmi.get("xgmi_node_id")
|
|
217
|
+
if dev_xgmi_info := _get_xgmi_info(dev):
|
|
218
|
+
dev_appendix.update(dev_xgmi_info)
|
|
229
219
|
|
|
230
220
|
ret.append(
|
|
231
221
|
Device(
|
|
@@ -285,9 +275,9 @@ class AMDDetector(Detector):
|
|
|
285
275
|
devs_mapping = None
|
|
286
276
|
|
|
287
277
|
def get_device_handle(dev: Device):
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
278
|
+
with contextlib.suppress(pyamdsmi.AmdSmiException):
|
|
279
|
+
bdf = dev.appendix["bdf"]
|
|
280
|
+
return pyamdsmi.amdsmi_get_processor_handle_from_bdf(bdf)
|
|
291
281
|
nonlocal devs_mapping
|
|
292
282
|
if devs_mapping is None:
|
|
293
283
|
devs = pyamdsmi.amdsmi_get_processor_handles()
|
|
@@ -295,7 +285,7 @@ class AMDDetector(Detector):
|
|
|
295
285
|
return devs_mapping.get(dev.index)
|
|
296
286
|
|
|
297
287
|
try:
|
|
298
|
-
|
|
288
|
+
pci_devs = self.detect_pci_devices()
|
|
299
289
|
|
|
300
290
|
def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
|
|
301
291
|
"""
|
|
@@ -311,8 +301,8 @@ class AMDDetector(Detector):
|
|
|
311
301
|
The TopologyDistanceEnum representing the distance.
|
|
312
302
|
|
|
313
303
|
"""
|
|
314
|
-
pcid_a =
|
|
315
|
-
pcid_b =
|
|
304
|
+
pcid_a = pci_devs.get(bdf_a, None)
|
|
305
|
+
pcid_b = pci_devs.get(bdf_b, None)
|
|
316
306
|
|
|
317
307
|
score = compare_pci_devices(pcid_a, pcid_b)
|
|
318
308
|
if score > 0:
|
|
@@ -323,41 +313,16 @@ class AMDDetector(Detector):
|
|
|
323
313
|
|
|
324
314
|
pyamdsmi.amdsmi_init()
|
|
325
315
|
|
|
326
|
-
# Get NUMA and CPU affinities.
|
|
327
|
-
for i, dev_i in enumerate(devices):
|
|
328
|
-
# Get affinity with PCIe BDF if possible.
|
|
329
|
-
if dev_i_bdf := dev_i.appendix.get("bdf", ""):
|
|
330
|
-
ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
|
|
331
|
-
dev_i_bdf,
|
|
332
|
-
)
|
|
333
|
-
ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
|
|
334
|
-
ret.devices_numa_affinities[i],
|
|
335
|
-
)
|
|
336
|
-
# Otherwise, get affinity via AMD SMI.
|
|
337
|
-
if not ret.devices_cpu_affinities[i]:
|
|
338
|
-
dev_i_handle = get_device_handle(dev_i)
|
|
339
|
-
|
|
340
|
-
# Get NUMA affinity.
|
|
341
|
-
try:
|
|
342
|
-
dev_i_numa_node = pyamdsmi.amdsmi_topo_get_numa_node_number(
|
|
343
|
-
dev_i_handle,
|
|
344
|
-
)
|
|
345
|
-
ret.devices_numa_affinities[i] = str(dev_i_numa_node)
|
|
346
|
-
except pyamdsmi.AmdSmiException:
|
|
347
|
-
debug_log_exception(
|
|
348
|
-
logger,
|
|
349
|
-
"Failed to get NUMA affinity for device %d",
|
|
350
|
-
dev_i.index,
|
|
351
|
-
)
|
|
352
|
-
# Get CPU affinity.
|
|
353
|
-
ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
|
|
354
|
-
ret.devices_numa_affinities[i],
|
|
355
|
-
)
|
|
356
|
-
|
|
357
|
-
# Get distances to other devices.
|
|
358
316
|
for i, dev_i in enumerate(devices):
|
|
359
317
|
dev_i_handle = get_device_handle(dev_i)
|
|
360
318
|
|
|
319
|
+
# Get NUMA and CPU affinities.
|
|
320
|
+
ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
|
|
321
|
+
ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
|
|
322
|
+
ret.devices_numa_affinities[i],
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
# Get distances to other devices.
|
|
361
326
|
for j, dev_j in enumerate(devices):
|
|
362
327
|
if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
|
|
363
328
|
continue
|
|
@@ -402,9 +367,6 @@ class AMDDetector(Detector):
|
|
|
402
367
|
|
|
403
368
|
ret.devices_distances[i][j] = distance
|
|
404
369
|
ret.devices_distances[j][i] = distance
|
|
405
|
-
except pyamdsmi.AmdSmiException:
|
|
406
|
-
debug_log_exception(logger, "Failed to fetch topology")
|
|
407
|
-
raise
|
|
408
370
|
except Exception:
|
|
409
371
|
debug_log_exception(logger, "Failed to process topology fetching")
|
|
410
372
|
raise
|
|
@@ -465,6 +427,7 @@ def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
|
|
|
465
427
|
"""
|
|
466
428
|
card_id = None
|
|
467
429
|
renderd_id = None
|
|
430
|
+
|
|
468
431
|
drm_path = Path(f"/sys/module/amdgpu/drivers/pci:amdgpu/{dev_bdf}/drm")
|
|
469
432
|
if drm_path.exists():
|
|
470
433
|
for dir_path in drm_path.iterdir():
|
|
@@ -474,3 +437,29 @@ def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
|
|
|
474
437
|
renderd_id = int(dir_path.name[7:])
|
|
475
438
|
|
|
476
439
|
return card_id, renderd_id
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def _get_xgmi_info(dev) -> dict | None:
|
|
443
|
+
"""
|
|
444
|
+
Get the XGMI information for a given device.
|
|
445
|
+
|
|
446
|
+
Args:
|
|
447
|
+
dev:
|
|
448
|
+
The device handle.
|
|
449
|
+
|
|
450
|
+
Returns:
|
|
451
|
+
A dictionary containing XGMI information, or None if not available.
|
|
452
|
+
|
|
453
|
+
"""
|
|
454
|
+
try:
|
|
455
|
+
dev_xgmi = pyamdsmi.amdsmi_get_xgmi_info(dev)
|
|
456
|
+
if xgmi_lanes := dev_xgmi.get("xgmi_lanes", None):
|
|
457
|
+
return {
|
|
458
|
+
"xgmi_lanes": xgmi_lanes,
|
|
459
|
+
"xgmi_hive_id": dev_xgmi.get("xgmi_hive_id"),
|
|
460
|
+
"xgmi_node_id": dev_xgmi.get("xgmi_node_id"),
|
|
461
|
+
}
|
|
462
|
+
except pyamdsmi.AmdSmiException:
|
|
463
|
+
debug_log_exception(logger, "Failed to get XGMI information")
|
|
464
|
+
|
|
465
|
+
return None
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
1
|
+
from __future__ import annotations as __future_annotations__
|
|
2
2
|
|
|
3
3
|
import contextlib
|
|
4
4
|
import logging
|
|
@@ -47,7 +47,7 @@ class AscendDetector(Detector):
|
|
|
47
47
|
"""
|
|
48
48
|
|
|
49
49
|
@staticmethod
|
|
50
|
-
@lru_cache
|
|
50
|
+
@lru_cache(maxsize=1)
|
|
51
51
|
def is_supported() -> bool:
|
|
52
52
|
"""
|
|
53
53
|
Check if the Ascend detector is supported.
|
|
@@ -75,7 +75,7 @@ class AscendDetector(Detector):
|
|
|
75
75
|
return supported
|
|
76
76
|
|
|
77
77
|
@staticmethod
|
|
78
|
-
@lru_cache
|
|
78
|
+
@lru_cache(maxsize=1)
|
|
79
79
|
def detect_pci_devices() -> dict[str, PCIDevice]:
|
|
80
80
|
# See https://pcisig.com/membership/member-companies?combine=Huawei.
|
|
81
81
|
pci_devs = get_pci_devices(vendor="0x19e5")
|
|
@@ -184,12 +184,29 @@ class AscendDetector(Detector):
|
|
|
184
184
|
if dev_power_used:
|
|
185
185
|
dev_power_used = dev_power_used / 10 # 0.1W to W
|
|
186
186
|
|
|
187
|
+
dev_bdf = pydcmi.dcmi_get_device_bdf(
|
|
188
|
+
dev_card_id,
|
|
189
|
+
dev_device_id,
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
dev_numa = get_numa_node_by_bdf(dev_bdf)
|
|
193
|
+
if not dev_numa:
|
|
194
|
+
dev_cpu_affinity = (
|
|
195
|
+
pydcmi.dcmi_get_affinity_cpu_info_by_device_id(
|
|
196
|
+
dev_card_id,
|
|
197
|
+
dev_device_id,
|
|
198
|
+
)
|
|
199
|
+
)
|
|
200
|
+
dev_numa = map_cpu_affinity_to_numa_node(dev_cpu_affinity)
|
|
201
|
+
|
|
187
202
|
dev_appendix = {
|
|
188
203
|
"arch_family": (
|
|
189
204
|
pyacl.aclrtGetSocName()
|
|
190
205
|
or _guess_soc_name_from_dev_name(dev_name)
|
|
191
206
|
),
|
|
192
207
|
"vgpu": dev_is_vgpu,
|
|
208
|
+
"bdf": dev_bdf,
|
|
209
|
+
"numa": dev_numa,
|
|
193
210
|
"card_id": dev_card_id,
|
|
194
211
|
"device_id": dev_device_id,
|
|
195
212
|
"device_id_max": device_num_in_card - 1,
|
|
@@ -208,13 +225,6 @@ class AscendDetector(Detector):
|
|
|
208
225
|
if dev_roce_gateway:
|
|
209
226
|
dev_appendix["roce_gateway"] = str(dev_roce_gateway)
|
|
210
227
|
|
|
211
|
-
with contextlib.suppress(pydcmi.DCMIError):
|
|
212
|
-
dev_bdf = pydcmi.dcmi_get_device_bdf(
|
|
213
|
-
dev_card_id,
|
|
214
|
-
dev_device_id,
|
|
215
|
-
)
|
|
216
|
-
dev_appendix["bdf"] = dev_bdf
|
|
217
|
-
|
|
218
228
|
ret.append(
|
|
219
229
|
Device(
|
|
220
230
|
manufacturer=self.manufacturer,
|
|
@@ -270,44 +280,22 @@ class AscendDetector(Detector):
|
|
|
270
280
|
pydcmi.dcmi_init()
|
|
271
281
|
|
|
272
282
|
for i, dev_i in enumerate(devices):
|
|
273
|
-
dev_i_card_id = dev_i.appendix["card_id"]
|
|
274
|
-
dev_i_device_id = dev_i.appendix["device_id"]
|
|
283
|
+
dev_i_card_id = dev_i.appendix.get("card_id", i)
|
|
284
|
+
dev_i_device_id = dev_i.appendix.get("device_id", 0)
|
|
275
285
|
|
|
276
|
-
# Get
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
|
|
282
|
-
ret.devices_numa_affinities[i],
|
|
283
|
-
)
|
|
284
|
-
# Otherwise, get affinity via DCMI.
|
|
285
|
-
if not ret.devices_cpu_affinities[i]:
|
|
286
|
-
# Get CPU affinity.
|
|
287
|
-
try:
|
|
288
|
-
cpu_affinity = pydcmi.dcmi_get_affinity_cpu_info_by_device_id(
|
|
289
|
-
dev_i.appendix["card_id"],
|
|
290
|
-
dev_i.appendix["device_id"],
|
|
291
|
-
)
|
|
292
|
-
ret.devices_cpu_affinities[i] = cpu_affinity
|
|
293
|
-
except pydcmi.DCMIError:
|
|
294
|
-
debug_log_exception(
|
|
295
|
-
slogger,
|
|
296
|
-
"Failed to get CPU affinity for device %d",
|
|
297
|
-
dev_i.index,
|
|
298
|
-
)
|
|
299
|
-
# Get NUMA affinity.
|
|
300
|
-
ret.devices_numa_affinities[i] = map_cpu_affinity_to_numa_node(
|
|
301
|
-
ret.devices_cpu_affinities[i],
|
|
302
|
-
)
|
|
286
|
+
# Get NUMA and CPU affinities.
|
|
287
|
+
ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
|
|
288
|
+
ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
|
|
289
|
+
ret.devices_numa_affinities[i],
|
|
290
|
+
)
|
|
303
291
|
|
|
304
292
|
# Get distances to other devices.
|
|
305
293
|
for j, dev_j in enumerate(devices):
|
|
306
294
|
if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
|
|
307
295
|
continue
|
|
308
296
|
|
|
309
|
-
dev_j_card_id = dev_j.appendix["card_id"]
|
|
310
|
-
dev_j_device_id = dev_j.appendix["device_id"]
|
|
297
|
+
dev_j_card_id = dev_j.appendix.get("card_id", j)
|
|
298
|
+
dev_j_device_id = dev_j.appendix.get("device_id", 0)
|
|
311
299
|
|
|
312
300
|
# If two devices are the same card,
|
|
313
301
|
# skip distance calculation.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
1
|
+
from __future__ import annotations as __future_annotations__
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
@@ -26,7 +26,7 @@ class CambriconDetector(Detector):
|
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
28
|
@staticmethod
|
|
29
|
-
@lru_cache
|
|
29
|
+
@lru_cache(maxsize=1)
|
|
30
30
|
def is_supported() -> bool:
|
|
31
31
|
"""
|
|
32
32
|
Check if the Cambricon detector is supported.
|
|
@@ -50,7 +50,7 @@ class CambriconDetector(Detector):
|
|
|
50
50
|
return supported
|
|
51
51
|
|
|
52
52
|
@staticmethod
|
|
53
|
-
@lru_cache
|
|
53
|
+
@lru_cache(maxsize=1)
|
|
54
54
|
def detect_pci_devices() -> dict[str, PCIDevice]:
|
|
55
55
|
# See https://pcisig.com/membership/member-companies?combine=Cambricon.
|
|
56
56
|
pci_devs = get_pci_devices(vendor="0xcabc")
|