gpustack-runtime 0.1.40.post1__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. gpustack_runtime/__init__.py +1 -1
  2. gpustack_runtime/__main__.py +5 -3
  3. gpustack_runtime/_version.py +2 -2
  4. gpustack_runtime/_version_appendix.py +1 -1
  5. gpustack_runtime/cmds/__init__.py +5 -3
  6. gpustack_runtime/cmds/__types__.py +1 -1
  7. gpustack_runtime/cmds/deployer.py +140 -18
  8. gpustack_runtime/cmds/detector.py +1 -1
  9. gpustack_runtime/cmds/images.py +1 -1
  10. gpustack_runtime/deployer/__init__.py +28 -2
  11. gpustack_runtime/deployer/__patches__.py +1 -1
  12. gpustack_runtime/deployer/__types__.py +2 -1
  13. gpustack_runtime/deployer/__utils__.py +2 -2
  14. gpustack_runtime/deployer/cdi/__init__.py +85 -5
  15. gpustack_runtime/deployer/cdi/__types__.py +92 -29
  16. gpustack_runtime/deployer/cdi/__utils__.py +178 -0
  17. gpustack_runtime/deployer/cdi/amd.py +146 -0
  18. gpustack_runtime/deployer/cdi/ascend.py +164 -0
  19. gpustack_runtime/deployer/cdi/hygon.py +147 -0
  20. gpustack_runtime/deployer/cdi/iluvatar.py +136 -0
  21. gpustack_runtime/deployer/cdi/metax.py +148 -0
  22. gpustack_runtime/deployer/cdi/thead.py +57 -23
  23. gpustack_runtime/deployer/docker.py +9 -8
  24. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +240 -0
  25. gpustack_runtime/deployer/k8s/deviceplugin/__types__.py +131 -0
  26. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +586 -0
  27. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py +3 -0
  28. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto +212 -0
  29. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py +86 -0
  30. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi +168 -0
  31. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py +358 -0
  32. gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py +34 -0
  33. gpustack_runtime/deployer/kuberentes.py +37 -4
  34. gpustack_runtime/deployer/podman.py +9 -8
  35. gpustack_runtime/detector/__init__.py +42 -5
  36. gpustack_runtime/detector/__types__.py +8 -24
  37. gpustack_runtime/detector/__utils__.py +46 -39
  38. gpustack_runtime/detector/amd.py +55 -66
  39. gpustack_runtime/detector/ascend.py +29 -41
  40. gpustack_runtime/detector/cambricon.py +3 -3
  41. gpustack_runtime/detector/hygon.py +21 -49
  42. gpustack_runtime/detector/iluvatar.py +44 -60
  43. gpustack_runtime/detector/metax.py +54 -37
  44. gpustack_runtime/detector/mthreads.py +74 -36
  45. gpustack_runtime/detector/nvidia.py +130 -93
  46. gpustack_runtime/detector/pyacl/__init__.py +1 -1
  47. gpustack_runtime/detector/pyamdgpu/__init__.py +1 -1
  48. gpustack_runtime/detector/pyamdsmi/__init__.py +1 -1
  49. gpustack_runtime/detector/pycuda/__init__.py +1 -1
  50. gpustack_runtime/detector/pydcmi/__init__.py +1 -1
  51. gpustack_runtime/detector/pyhsa/__init__.py +1 -1
  52. gpustack_runtime/detector/pymxsml/__init__.py +1553 -1
  53. gpustack_runtime/detector/pyrocmcore/__init__.py +1 -1
  54. gpustack_runtime/detector/pyrocmsmi/__init__.py +1 -1
  55. gpustack_runtime/detector/thead.py +41 -60
  56. gpustack_runtime/envs.py +104 -12
  57. gpustack_runtime/logging.py +6 -2
  58. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/METADATA +6 -1
  59. gpustack_runtime-0.1.41.dist-info/RECORD +67 -0
  60. gpustack_runtime/detector/pymxsml/mxsml.py +0 -1580
  61. gpustack_runtime/detector/pymxsml/mxsml_extension.py +0 -816
  62. gpustack_runtime/detector/pymxsml/mxsml_mcm.py +0 -476
  63. gpustack_runtime-0.1.40.post1.dist-info/RECORD +0 -55
  64. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/WHEEL +0 -0
  65. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/entry_points.txt +0 -0
  66. {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/licenses/LICENSE +0 -0
@@ -1,4 +1,4 @@
1
- from __future__ import annotations
1
+ from __future__ import annotations as __future_annotations__
2
2
 
3
3
  from abc import ABC, abstractmethod
4
4
  from dataclasses import dataclass
@@ -122,28 +122,6 @@ def backend_to_manufacturer(backend: str) -> ManufacturerEnum:
122
122
  return ManufacturerEnum.UNKNOWN
123
123
 
124
124
 
125
- def supported_manufacturers() -> list[ManufacturerEnum]:
126
- """
127
- Get a list of supported manufacturers.
128
-
129
- Returns:
130
- A list of supported manufacturers.
131
-
132
- """
133
- return list(_MANUFACTURER_BACKEND_MAPPING.keys())
134
-
135
-
136
- def supported_backends() -> list[str]:
137
- """
138
- Get a list of supported backends.
139
-
140
- Returns:
141
- A list of supported backends.
142
-
143
- """
144
- return list(_MANUFACTURER_BACKEND_MAPPING.values())
145
-
146
-
147
125
  @dataclass_json
148
126
  @dataclass
149
127
  class Device:
@@ -258,6 +236,11 @@ class Topology:
258
236
  A list representing the NUMA affinity associated with each device.
259
237
  The value at index i represents the Memory set for device i.
260
238
  """
239
+ appendices: list[dict[str, Any]]
240
+ """
241
+ Appendices information of devices.
242
+ Each entry corresponds to a device and contains additional metadata.
243
+ """
261
244
 
262
245
  def __init__(
263
246
  self,
@@ -278,6 +261,7 @@ class Topology:
278
261
  self.devices_distances = [[0] * devices_count for _ in range(devices_count)]
279
262
  self.devices_cpu_affinities = [""] * devices_count
280
263
  self.devices_numa_affinities = [""] * devices_count
264
# Build one independent dict per device. `[{}] * devices_count` would repeat
# a reference to a single shared dict, so an in-place update of one device's
# appendix would appear in every entry.
self.appendices = [{} for _ in range(devices_count)]
281
265
 
282
266
  def stringify(self) -> list[list[str]]:
283
267
  """
@@ -501,7 +485,7 @@ class Detector(ABC):
501
485
  """
502
486
  raise NotImplementedError
503
487
 
504
- def get_topology(self, devices: Devices | None = None) -> Topology | None: # noqa: ARG002
488
+ def get_topology(self, devices: Devices | None = None) -> Topology | None:
505
489
  """
506
490
  Get the Topology object between the given devices.
507
491
 
@@ -1,4 +1,4 @@
1
- from __future__ import annotations
1
+ from __future__ import annotations as __future_annotations__
2
2
 
3
3
  import contextlib
4
4
  import os
@@ -746,7 +746,7 @@ def get_numa_node_cpu_mapping() -> dict[int, list[int]]:
746
746
  return numa_cpu_mapping
747
747
 
748
748
 
749
- @lru_cache(maxsize=128)
749
+ @lru_cache
750
750
  def get_numa_node_by_bdf(bdf: str) -> str:
751
751
  """
752
752
  Get the NUMA node for a given PCI device BDF (Bus:Device.Function) address.
@@ -792,20 +792,7 @@ def map_cpu_affinity_to_numa_node(cpu_affinity: int | str | None) -> str:
792
792
  else:
793
793
  if not cpu_affinity:
794
794
  return ""
795
- cpu_indices: list[int] = []
796
- for part in cpu_affinity.split(","):
797
- if "-" in part:
798
- lo, hi = part.split("-")
799
- lo_idx = safe_int(lo, -1)
800
- hi_idx = safe_int(hi, -1)
801
- if lo_idx == -1 or hi_idx == -1 or lo_idx > hi_idx:
802
- continue
803
- cpu_indices.extend(list(range(lo_idx, hi_idx + 1)))
804
- else:
805
- idx = safe_int(part, -1)
806
- if idx == -1:
807
- continue
808
- cpu_indices.append(idx)
795
+ cpu_indices: list[int] = str_range_to_list(cpu_affinity)
809
796
 
810
797
  cpu_numa_mapping = get_cpu_numa_node_mapping()
811
798
 
@@ -818,7 +805,7 @@ def map_cpu_affinity_to_numa_node(cpu_affinity: int | str | None) -> str:
818
805
  if not numa_nodes:
819
806
  return ""
820
807
 
821
- return list_to_range_str(sorted(numa_nodes))
808
+ return list_to_str_range(sorted(numa_nodes))
822
809
 
823
810
 
824
811
  @lru_cache
@@ -843,20 +830,7 @@ def map_numa_node_to_cpu_affinity(numa_node: int | str | None) -> str:
843
830
  else:
844
831
  if not numa_node:
845
832
  return ""
846
- numa_indices: list[int] = []
847
- for part in numa_node.split(","):
848
- if "-" in part:
849
- lo, hi = part.split("-")
850
- lo_idx = safe_int(lo, -1)
851
- hi_idx = safe_int(hi, -1)
852
- if lo_idx == -1 or hi_idx == -1 or lo_idx > hi_idx:
853
- continue
854
- numa_indices.extend(list(range(lo_idx, hi_idx + 1)))
855
- else:
856
- idx = safe_int(part, -1)
857
- if idx == -1:
858
- continue
859
- numa_indices.append(idx)
833
+ numa_indices: list[int] = str_range_to_list(numa_node)
860
834
 
861
835
  numa_cpu_mapping = get_numa_node_cpu_mapping()
862
836
 
@@ -867,7 +841,7 @@ def map_numa_node_to_cpu_affinity(numa_node: int | str | None) -> str:
867
841
  if not cpu_cores:
868
842
  return ""
869
843
 
870
- return list_to_range_str(sorted(cpu_cores))
844
+ return list_to_str_range(sorted(cpu_cores))
871
845
 
872
846
 
873
847
  def bitmask_to_list(bitmask: int, offset: int = 0) -> list[int]:
@@ -889,7 +863,7 @@ def bitmask_to_list(bitmask: int, offset: int = 0) -> list[int]:
889
863
  return indices
890
864
 
891
865
 
892
- def list_to_range_str(indices: list[int]) -> str:
866
+ def list_to_str_range(indices: list[int]) -> str:
893
867
  """
894
868
  Convert a list of indices to a comma-separated string with ranges.
895
869
 
@@ -919,15 +893,48 @@ def list_to_range_str(indices: list[int]) -> str:
919
893
  start, end = i, i
920
894
  ranges.append((start, end))
921
895
 
922
- range_str_parts: list[str] = []
896
+ str_range_parts: list[str] = []
923
897
  for start, end in ranges:
924
898
  if start == end:
925
- range_str_parts.append(f"{start}")
899
+ str_range_parts.append(f"{start}")
926
900
  else:
927
- range_str_parts.append(f"{start}-{end}")
928
- range_str = ",".join(range_str_parts)
901
+ str_range_parts.append(f"{start}-{end}")
902
+ str_range = ",".join(str_range_parts)
903
+
904
+ return str_range
905
+
906
+
907
def str_range_to_list(str_range: str) -> list[int]:
    """
    Convert a comma-separated string with ranges to a list of indices.

    Malformed parts are skipped instead of raising: a part is ignored when it
    does not parse as an integer, when either bound of a range does not parse,
    or when the lower bound is greater than the upper bound.

    Args:
        str_range:
            A comma-separated string with ranges (e.g., "0,2-4,6").

    Returns:
        A sorted list of unique indices.

    """
    indices: set[int] = set()
    for raw_part in str_range.split(","):
        part = raw_part.strip()
        # Split on the first hyphen only. Unpacking `part.split("-")` into two
        # names raises ValueError for parts with extra hyphens (e.g. "1-2-3");
        # with partition such a part falls through to the sentinel check below.
        # NOTE(review): relies on safe_int returning the given default for
        # text it cannot parse, as the -1 checks here already assume.
        lo, sep, hi = part.partition("-")
        if sep:
            lo_idx = safe_int(lo, -1)
            hi_idx = safe_int(hi, -1)
            if lo_idx == -1 or hi_idx == -1 or lo_idx > hi_idx:
                continue
            indices.update(range(lo_idx, hi_idx + 1))
        else:
            idx = safe_int(part, -1)
            if idx == -1:
                continue
            indices.add(idx)

    return sorted(indices)
931
938
 
932
939
 
933
940
  def bitmask_to_str(bitmask_list: list) -> str:
@@ -950,7 +957,7 @@ def bitmask_to_str(bitmask_list: list) -> str:
950
957
  bits_lists.extend(bitmask_to_list(bitmask, offset))
951
958
  offset += get_bits_size()
952
959
 
953
- return list_to_range_str(sorted(bits_lists))
960
+ return list_to_str_range(sorted(bits_lists))
954
961
 
955
962
 
956
963
  def get_physical_function_by_bdf(bdf: str) -> str:
@@ -1,4 +1,4 @@
1
- from __future__ import annotations
1
+ from __future__ import annotations as __future_annotations__
2
2
 
3
3
  import contextlib
4
4
  import logging
@@ -30,7 +30,7 @@ class AMDDetector(Detector):
30
30
  """
31
31
 
32
32
  @staticmethod
33
- @lru_cache
33
+ @lru_cache(maxsize=1)
34
34
  def is_supported() -> bool:
35
35
  """
36
36
  Check if the AMD detector is supported.
@@ -59,7 +59,7 @@ class AMDDetector(Detector):
59
59
  return supported
60
60
 
61
61
  @staticmethod
62
- @lru_cache
62
+ @lru_cache(maxsize=1)
63
63
  def detect_pci_devices() -> dict[str, PCIDevice]:
64
64
  # See https://pcisig.com/membership/member-companies?combine=AMD.
65
65
  pci_devs = get_pci_devices(vendor="0x1002")
@@ -108,11 +108,7 @@ class AMDDetector(Detector):
108
108
  asic_serial = dev_gpu_asic_info.get("asic_serial")
109
109
  dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
110
110
  else:
111
- dev_uuid = ""
112
- with contextlib.suppress(pyrocmsmi.ROCMSMIError):
113
- dev_uuid = (
114
- f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
115
- )
111
+ dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
116
112
  dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
117
113
 
118
114
  dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
@@ -132,12 +128,8 @@ class AMDDetector(Detector):
132
128
  dev_idx,
133
129
  )
134
130
 
135
- dev_bdf = None
136
- dev_card_id = None
137
- dev_renderd_id = None
138
- with contextlib.suppress(pyamdsmi.AmdSmiException):
139
- dev_bdf = pyamdsmi.amdsmi_get_gpu_device_bdf(dev)
140
- dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
131
+ dev_bdf = pyamdsmi.amdsmi_get_gpu_device_bdf(dev)
132
+ dev_card_id, dev_renderd_id = _get_card_and_renderd_id(dev_bdf)
141
133
 
142
134
  dev_cores = dev_hsa_agent.compute_units
143
135
  dev_asic_family_id = dev_hsa_agent.asic_family_id
@@ -205,27 +197,25 @@ class AMDDetector(Detector):
205
197
  dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
206
198
  dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
207
199
 
208
- dev_is_vgpu = False
209
- if dev_bdf:
210
- dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
200
+ dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
201
+
202
+ dev_numa = get_numa_node_by_bdf(dev_bdf)
203
+ if not dev_numa:
204
+ dev_numa = str(pyamdsmi.amdsmi_topo_get_numa_node_number(dev))
211
205
 
212
206
  dev_appendix = {
213
207
  "arch_family": _get_arch_family(dev_asic_family_id),
214
208
  "vgpu": dev_is_vgpu,
209
+ "bdf": dev_bdf,
210
+ "numa": dev_numa,
215
211
  }
216
- if dev_bdf:
217
- dev_appendix["bdf"] = dev_bdf
218
212
  if dev_card_id is not None:
219
213
  dev_appendix["card_id"] = dev_card_id
220
214
  if dev_renderd_id is not None:
221
215
  dev_appendix["renderd_id"] = dev_renderd_id
222
216
 
223
- with contextlib.suppress(pyamdsmi.AmdSmiException):
224
- dev_xgmi = pyamdsmi.amdsmi_get_xgmi_info(dev)
225
- if xgmi_lanes := dev_xgmi.get("xgmi_lanes", None):
226
- dev_appendix["xgmi_lanes"] = xgmi_lanes
227
- dev_appendix["xgmi_hive_id"] = dev_xgmi.get("xgmi_hive_id")
228
- dev_appendix["xgmi_node_id"] = dev_xgmi.get("xgmi_node_id")
217
+ if dev_xgmi_info := _get_xgmi_info(dev):
218
+ dev_appendix.update(dev_xgmi_info)
229
219
 
230
220
  ret.append(
231
221
  Device(
@@ -285,9 +275,9 @@ class AMDDetector(Detector):
285
275
  devs_mapping = None
286
276
 
287
277
  def get_device_handle(dev: Device):
288
- if bdf := dev.appendix.get("bdf", None):
289
- with contextlib.suppress(pyamdsmi.AmdSmiException):
290
- return pyamdsmi.amdsmi_get_processor_handle_from_bdf(bdf)
278
+ with contextlib.suppress(pyamdsmi.AmdSmiException):
279
+ bdf = dev.appendix["bdf"]
280
+ return pyamdsmi.amdsmi_get_processor_handle_from_bdf(bdf)
291
281
  nonlocal devs_mapping
292
282
  if devs_mapping is None:
293
283
  devs = pyamdsmi.amdsmi_get_processor_handles()
@@ -295,7 +285,7 @@ class AMDDetector(Detector):
295
285
  return devs_mapping.get(dev.index)
296
286
 
297
287
  try:
298
- pci_devices = self.detect_pci_devices()
288
+ pci_devs = self.detect_pci_devices()
299
289
 
300
290
  def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
301
291
  """
@@ -311,8 +301,8 @@ class AMDDetector(Detector):
311
301
  The TopologyDistanceEnum representing the distance.
312
302
 
313
303
  """
314
- pcid_a = pci_devices.get(bdf_a, None)
315
- pcid_b = pci_devices.get(bdf_b, None)
304
+ pcid_a = pci_devs.get(bdf_a, None)
305
+ pcid_b = pci_devs.get(bdf_b, None)
316
306
 
317
307
  score = compare_pci_devices(pcid_a, pcid_b)
318
308
  if score > 0:
@@ -323,41 +313,16 @@ class AMDDetector(Detector):
323
313
 
324
314
  pyamdsmi.amdsmi_init()
325
315
 
326
- # Get NUMA and CPU affinities.
327
- for i, dev_i in enumerate(devices):
328
- # Get affinity with PCIe BDF if possible.
329
- if dev_i_bdf := dev_i.appendix.get("bdf", ""):
330
- ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
331
- dev_i_bdf,
332
- )
333
- ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
334
- ret.devices_numa_affinities[i],
335
- )
336
- # Otherwise, get affinity via AMD SMI.
337
- if not ret.devices_cpu_affinities[i]:
338
- dev_i_handle = get_device_handle(dev_i)
339
-
340
- # Get NUMA affinity.
341
- try:
342
- dev_i_numa_node = pyamdsmi.amdsmi_topo_get_numa_node_number(
343
- dev_i_handle,
344
- )
345
- ret.devices_numa_affinities[i] = str(dev_i_numa_node)
346
- except pyamdsmi.AmdSmiException:
347
- debug_log_exception(
348
- logger,
349
- "Failed to get NUMA affinity for device %d",
350
- dev_i.index,
351
- )
352
- # Get CPU affinity.
353
- ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
354
- ret.devices_numa_affinities[i],
355
- )
356
-
357
- # Get distances to other devices.
358
316
  for i, dev_i in enumerate(devices):
359
317
  dev_i_handle = get_device_handle(dev_i)
360
318
 
319
+ # Get NUMA and CPU affinities.
320
+ ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
321
+ ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
322
+ ret.devices_numa_affinities[i],
323
+ )
324
+
325
+ # Get distances to other devices.
361
326
  for j, dev_j in enumerate(devices):
362
327
  if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
363
328
  continue
@@ -402,9 +367,6 @@ class AMDDetector(Detector):
402
367
 
403
368
  ret.devices_distances[i][j] = distance
404
369
  ret.devices_distances[j][i] = distance
405
- except pyamdsmi.AmdSmiException:
406
- debug_log_exception(logger, "Failed to fetch topology")
407
- raise
408
370
  except Exception:
409
371
  debug_log_exception(logger, "Failed to process topology fetching")
410
372
  raise
@@ -465,6 +427,7 @@ def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
465
427
  """
466
428
  card_id = None
467
429
  renderd_id = None
430
+
468
431
  drm_path = Path(f"/sys/module/amdgpu/drivers/pci:amdgpu/{dev_bdf}/drm")
469
432
  if drm_path.exists():
470
433
  for dir_path in drm_path.iterdir():
@@ -474,3 +437,29 @@ def _get_card_and_renderd_id(dev_bdf: str) -> tuple[int | None, int | None]:
474
437
  renderd_id = int(dir_path.name[7:])
475
438
 
476
439
  return card_id, renderd_id
440
+
441
+
442
def _get_xgmi_info(dev) -> dict | None:
    """
    Look up the XGMI details of a device.

    Args:
        dev:
            The device handle passed to `pyamdsmi.amdsmi_get_xgmi_info`.

    Returns:
        A dictionary with the keys "xgmi_lanes", "xgmi_hive_id" and
        "xgmi_node_id", or None when the reported "xgmi_lanes" value is
        missing or falsy, or when the query raises `AmdSmiException`
        (which is logged at debug level).

    """
    try:
        info = pyamdsmi.amdsmi_get_xgmi_info(dev)
        lanes = info.get("xgmi_lanes", None)
        if not lanes:
            return None
        return {
            "xgmi_lanes": lanes,
            "xgmi_hive_id": info.get("xgmi_hive_id"),
            "xgmi_node_id": info.get("xgmi_node_id"),
        }
    except pyamdsmi.AmdSmiException:
        debug_log_exception(logger, "Failed to get XGMI information")
        return None
@@ -1,4 +1,4 @@
1
- from __future__ import annotations
1
+ from __future__ import annotations as __future_annotations__
2
2
 
3
3
  import contextlib
4
4
  import logging
@@ -47,7 +47,7 @@ class AscendDetector(Detector):
47
47
  """
48
48
 
49
49
  @staticmethod
50
- @lru_cache
50
+ @lru_cache(maxsize=1)
51
51
  def is_supported() -> bool:
52
52
  """
53
53
  Check if the Ascend detector is supported.
@@ -75,7 +75,7 @@ class AscendDetector(Detector):
75
75
  return supported
76
76
 
77
77
  @staticmethod
78
- @lru_cache
78
+ @lru_cache(maxsize=1)
79
79
  def detect_pci_devices() -> dict[str, PCIDevice]:
80
80
  # See https://pcisig.com/membership/member-companies?combine=Huawei.
81
81
  pci_devs = get_pci_devices(vendor="0x19e5")
@@ -184,12 +184,29 @@ class AscendDetector(Detector):
184
184
  if dev_power_used:
185
185
  dev_power_used = dev_power_used / 10 # 0.1W to W
186
186
 
187
+ dev_bdf = pydcmi.dcmi_get_device_bdf(
188
+ dev_card_id,
189
+ dev_device_id,
190
+ )
191
+
192
+ dev_numa = get_numa_node_by_bdf(dev_bdf)
193
+ if not dev_numa:
194
+ dev_cpu_affinity = (
195
+ pydcmi.dcmi_get_affinity_cpu_info_by_device_id(
196
+ dev_card_id,
197
+ dev_device_id,
198
+ )
199
+ )
200
+ dev_numa = map_cpu_affinity_to_numa_node(dev_cpu_affinity)
201
+
187
202
  dev_appendix = {
188
203
  "arch_family": (
189
204
  pyacl.aclrtGetSocName()
190
205
  or _guess_soc_name_from_dev_name(dev_name)
191
206
  ),
192
207
  "vgpu": dev_is_vgpu,
208
+ "bdf": dev_bdf,
209
+ "numa": dev_numa,
193
210
  "card_id": dev_card_id,
194
211
  "device_id": dev_device_id,
195
212
  "device_id_max": device_num_in_card - 1,
@@ -208,13 +225,6 @@ class AscendDetector(Detector):
208
225
  if dev_roce_gateway:
209
226
  dev_appendix["roce_gateway"] = str(dev_roce_gateway)
210
227
 
211
- with contextlib.suppress(pydcmi.DCMIError):
212
- dev_bdf = pydcmi.dcmi_get_device_bdf(
213
- dev_card_id,
214
- dev_device_id,
215
- )
216
- dev_appendix["bdf"] = dev_bdf
217
-
218
228
  ret.append(
219
229
  Device(
220
230
  manufacturer=self.manufacturer,
@@ -270,44 +280,22 @@ class AscendDetector(Detector):
270
280
  pydcmi.dcmi_init()
271
281
 
272
282
  for i, dev_i in enumerate(devices):
273
- dev_i_card_id = dev_i.appendix["card_id"]
274
- dev_i_device_id = dev_i.appendix["device_id"]
283
+ dev_i_card_id = dev_i.appendix.get("card_id", i)
284
+ dev_i_device_id = dev_i.appendix.get("device_id", 0)
275
285
 
276
- # Get affinity with PCIe BDF if possible.
277
- if dev_i_bdf := dev_i.appendix.get("bdf", ""):
278
- ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
279
- dev_i_bdf,
280
- )
281
- ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
282
- ret.devices_numa_affinities[i],
283
- )
284
- # Otherwise, get affinity via DCMI.
285
- if not ret.devices_cpu_affinities[i]:
286
- # Get CPU affinity.
287
- try:
288
- cpu_affinity = pydcmi.dcmi_get_affinity_cpu_info_by_device_id(
289
- dev_i.appendix["card_id"],
290
- dev_i.appendix["device_id"],
291
- )
292
- ret.devices_cpu_affinities[i] = cpu_affinity
293
- except pydcmi.DCMIError:
294
- debug_log_exception(
295
- slogger,
296
- "Failed to get CPU affinity for device %d",
297
- dev_i.index,
298
- )
299
- # Get NUMA affinity.
300
- ret.devices_numa_affinities[i] = map_cpu_affinity_to_numa_node(
301
- ret.devices_cpu_affinities[i],
302
- )
286
+ # Get NUMA and CPU affinities.
287
+ ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
288
+ ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
289
+ ret.devices_numa_affinities[i],
290
+ )
303
291
 
304
292
  # Get distances to other devices.
305
293
  for j, dev_j in enumerate(devices):
306
294
  if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
307
295
  continue
308
296
 
309
- dev_j_card_id = dev_j.appendix["card_id"]
310
- dev_j_device_id = dev_j.appendix["device_id"]
297
+ dev_j_card_id = dev_j.appendix.get("card_id", j)
298
+ dev_j_device_id = dev_j.appendix.get("device_id", 0)
311
299
 
312
300
  # If two devices are the same card,
313
301
  # skip distance calculation.
@@ -1,4 +1,4 @@
1
- from __future__ import annotations
1
+ from __future__ import annotations as __future_annotations__
2
2
 
3
3
  import json
4
4
  import logging
@@ -26,7 +26,7 @@ class CambriconDetector(Detector):
26
26
  """
27
27
 
28
28
  @staticmethod
29
- @lru_cache
29
+ @lru_cache(maxsize=1)
30
30
  def is_supported() -> bool:
31
31
  """
32
32
  Check if the Cambricon detector is supported.
@@ -50,7 +50,7 @@ class CambriconDetector(Detector):
50
50
  return supported
51
51
 
52
52
  @staticmethod
53
- @lru_cache
53
+ @lru_cache(maxsize=1)
54
54
  def detect_pci_devices() -> dict[str, PCIDevice]:
55
55
  # See https://pcisig.com/membership/member-companies?combine=Cambricon.
56
56
  pci_devs = get_pci_devices(vendor="0xcabc")