gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +4 -2
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__init__.py +1 -1
  6. gpustack_runtime/deployer/cdi/__types__.py +2 -2
  7. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  8. gpustack_runtime/deployer/cdi/amd.py +6 -8
  9. gpustack_runtime/deployer/cdi/ascend.py +7 -9
  10. gpustack_runtime/deployer/cdi/hygon.py +6 -8
  11. gpustack_runtime/deployer/cdi/iluvatar.py +6 -8
  12. gpustack_runtime/deployer/cdi/metax.py +6 -8
  13. gpustack_runtime/deployer/cdi/thead.py +6 -8
  14. gpustack_runtime/deployer/docker.py +133 -146
  15. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +13 -8
  16. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +26 -21
  17. gpustack_runtime/deployer/kuberentes.py +89 -108
  18. gpustack_runtime/deployer/podman.py +113 -120
  19. gpustack_runtime/detector/__init__.py +2 -0
  20. gpustack_runtime/detector/__types__.py +26 -0
  21. gpustack_runtime/detector/__utils__.py +3 -0
  22. gpustack_runtime/detector/amd.py +32 -10
  23. gpustack_runtime/detector/ascend.py +67 -13
  24. gpustack_runtime/detector/cambricon.py +3 -0
  25. gpustack_runtime/detector/hygon.py +22 -3
  26. gpustack_runtime/detector/iluvatar.py +15 -7
  27. gpustack_runtime/detector/metax.py +16 -6
  28. gpustack_runtime/detector/mthreads.py +22 -8
  29. gpustack_runtime/detector/nvidia.py +148 -140
  30. gpustack_runtime/detector/pyacl/__init__.py +34 -14
  31. gpustack_runtime/detector/pydcmi/__init__.py +4 -2
  32. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  33. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  34. gpustack_runtime/detector/thead.py +145 -134
  35. gpustack_runtime/envs.py +7 -6
  36. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/METADATA +2 -2
  37. gpustack_runtime-0.1.42.post1.dist-info/RECORD +67 -0
  38. gpustack_runtime-0.1.41.post3.dist-info/RECORD +0 -67
  39. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/WHEEL +0 -0
  40. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/entry_points.txt +0 -0
  41. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/licenses/LICENSE +0 -0
@@ -15,12 +15,11 @@ from dataclasses_json import dataclass_json
15
15
  from .. import envs
16
16
  from ..detector import (
17
17
  ManufacturerEnum,
18
- Topology,
19
18
  detect_devices,
20
- get_devices_topologies,
21
19
  group_devices_by_manufacturer,
22
20
  manufacturer_to_backend,
23
21
  )
22
+ from ..detector.__utils__ import map_numa_node_to_cpu_affinity
24
23
  from .__utils__ import (
25
24
  adjust_image_with_envs,
26
25
  correct_runner_image,
@@ -31,6 +30,7 @@ from .__utils__ import (
31
30
  safe_yaml,
32
31
  validate_rfc1123_subdomain_name,
33
32
  )
33
+ from .k8s.deviceplugin import cdi_kind_to_kdp_resource
34
34
 
35
35
  if TYPE_CHECKING:
36
36
  from collections.abc import AsyncGenerator, Generator
@@ -839,17 +839,6 @@ class WorkloadPlan(WorkloadSecurity):
839
839
  Base plan class for all workloads.
840
840
 
841
841
  Attributes:
842
- resource_key_runtime_env_mapping: (dict[str, str]):
843
- Mapping from resource names to environment variable names for device allocation,
844
- which is used to tell the Container Runtime which GPUs to mount into the container.
845
- For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
846
- which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
847
- With privileged mode, the container can access all GPUs even if specified.
848
- resource_key_backend_env_mapping: (dict[str, list[str]]):
849
- Mapping from resource names to environment variable names for device runtime,
850
- which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
851
- For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
852
- which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
853
842
  name (WorkloadName):
854
843
  Name for the workload, it should be unique in the deployer.
855
844
  labels (dict[str, str] | None):
@@ -876,25 +865,6 @@ class WorkloadPlan(WorkloadSecurity):
876
865
 
877
866
  """
878
867
 
879
- resource_key_runtime_env_mapping: dict[str, str] = field(
880
- default_factory=lambda: envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES,
881
- )
882
- """
883
- Mapping from resource names to environment variable names for device allocation,
884
- which is used to tell the Container Runtime which GPUs to mount into the container.
885
- For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
886
- which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
887
- With privileged mode, the container can access all GPUs even if specified.
888
- """
889
- resource_key_backend_env_mapping: dict[str, list[str]] = field(
890
- default_factory=lambda: envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_BACKEND_VISIBLE_DEVICES,
891
- )
892
- """
893
- Mapping from resource names to environment variable names for device runtime,
894
- which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
895
- For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
896
- which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
897
- """
898
868
  namespace: WorkloadNamespace | None = None
899
869
  """
900
870
  Namespace for the workload.
@@ -1256,83 +1226,98 @@ def _default_args(func):
1256
1226
  return wrapper
1257
1227
 
1258
1228
 
1259
- class Deployer(ABC):
1229
+ @dataclass
1230
+ class DevicesMaterial:
1231
+ manufacturer: ManufacturerEnum = ManufacturerEnum.UNKNOWN
1260
1232
  """
1261
- Base class for all deployers.
1233
+ Manufacturer of devices,
1234
+ e.g. for NVIDIA, it is ManufacturerEnum.NVIDIA.
1262
1235
  """
1263
-
1264
- _name: str = "unknown"
1236
+ runtime_env: str = ""
1265
1237
  """
1266
- Name of the deployer.
1238
+ Runtime visible devices env name for devices,
1239
+ e.g. for NVIDIA, it is NVIDIA_VISIBLE_DEVICES.
1267
1240
  """
1268
- _pool: ThreadPoolExecutor | None = None
1241
+ backend_env: list[str] = field(default_factory=list)
1269
1242
  """
1270
- Thread pool for the deployer.
1243
+ Backend visible devices env name for devices,
1244
+ e.g. for AMD, it can be both HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES.
1271
1245
  """
1272
- _visible_devices_manufacturers: dict[str, ManufacturerEnum] | None = None
1246
+ cdi: str = ""
1273
1247
  """
1274
- Recorded visible devices manufacturers,
1275
- the key is the runtime visible devices env name,
1276
- the value is the corresponding manufacturer.
1277
- For example:
1278
- {
1279
- "NVIDIA_VISIBLE_DEVICES": ManufacturerEnum.NVIDIA,
1280
- "AMD_VISIBLE_DEVICES": ManufacturerEnum.AMD
1281
- }.
1248
+ CDI key for devices,
1249
+ e.g. for NVIDIA, it is nvidia.com/gpu.
1282
1250
  """
1283
- _visible_devices_env: dict[str, list[str]] | None = None
1251
+ runtime_values: dict[str, str] = field(default_factory=dict)
1284
1252
  """
1285
- Recorded visible devices envs,
1286
- the key is the runtime visible devices env name,
1287
- the value is the list of backend visible devices env names.
1288
- For example:
1289
- {
1290
- "NVIDIA_VISIBLE_DEVICES": ["CUDA_VISIBLE_DEVICES"],
1291
- "AMD_VISIBLE_DEVICES": ["HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"]
1292
- }.
1253
+ Mapping devices to runtime visible devices env values,
1254
+ the key is the device index string,
1255
+ the value is the device index string or uuid.
1256
+ For example, {"0": "GPU-11111111-2222-3333-4444-555555555555"} for NVIDIA.
1293
1257
  """
1294
- _visible_devices_cdis: dict[str, str] | None = None
1258
+ backend_values: dict[str, dict[str, str]] = field(default_factory=dict)
1295
1259
  """
1296
- Recorded visible devices envs to CDI mapping,
1297
- the key is the runtime visible devices env name,
1298
- the value is the corresponding CDI key.
1299
- For example:
1300
- {
1301
- "NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu",
1302
- "AMD_VISIBLE_DEVICES": "amd.com/gpu"
1303
- }.
1260
+ Mapping devices to backend visible devices env values,
1261
+ the outer key is the backend visible devices env name, the inner key is the device index string,
1262
+ the value is the device index string or aligned device index string.
1263
+ For example, {"NPU_VISIBLE_DEVICES": {"4": "0", "5": "1"}} for Ascend.
1304
1264
  """
1305
- _visible_devices_values: dict[str, list[str]] | None = None
1265
+ numa_affinities: dict[str, str] = field(default_factory=dict)
1306
1266
  """
1307
- Recorded visible devices values,
1308
- the key is the runtime visible devices env name,
1309
- the value is the list of device indexes or uuids.
1310
- For example:
1311
- {
1312
- "NVIDIA_VISIBLE_DEVICES": ["0"],
1313
- "AMD_VISIBLE_DEVICES": ["0", "1"]
1314
- }.
1267
+ Mapping devices to NUMA affinities,
1268
+ the key is the device index string,
1269
+ the value is the NUMA node string.
1270
+ For example, {"0": "0-1"}.
1315
1271
  """
1316
- _visible_devices_topologies: dict[str, Topology] | None = None
1272
+ cpus_affinities: dict[str, str] = field(default_factory=dict)
1317
1273
  """
1318
- Recorded visible devices topologies,
1319
- the key is the runtime visible devices env name,
1320
- the value is the corresponding topology.
1321
- For example:
1322
- {
1323
- "NVIDIA_VISIBLE_DEVICES": Topology(...),
1324
- "AMD_VISIBLE_DEVICES": Topology(...)
1325
- }.
1274
+ Mapping devices to CPUs affinities,
1275
+ the key is the device index string,
1276
+ the value is the CPU cores string.
1277
+ For example, {"0": "0-7"}.
1326
1278
  """
1327
- _backend_visible_devices_values_alignment: dict[str, dict[str, str]] | None = None
1279
+
1280
+
1281
+ class Deployer(ABC):
1328
1282
  """
1329
- Recorded backend visible devices values alignment,
1283
+ Base class for all deployers.
1284
+ """
1285
+
1286
+ _name: str = "unknown"
1287
+ """
1288
+ Name of the deployer.
1289
+ """
1290
+ _pool: ThreadPoolExecutor | None = None
1291
+ """
1292
+ Thread pool for the deployer.
1293
+ """
1294
+ _materials: dict[str, DevicesMaterial] | None = None
1295
+ """
1296
+ Mapping devices materials,
1330
1297
  the key is the runtime visible devices env name,
1331
- the value is the mapping from backend device index to aligned index.
1298
+ the value is the corresponding devices material.
1332
1299
  For example:
1333
1300
  {
1334
- "CUDA_VISIBLE_DEVICES": {"0": "0"},
1335
- "HIP_VISIBLE_DEVICES": {"0": "0", "1": "1"}
1301
+ "NVIDIA_VISIBLE_DEVICES": DevicesMaterial(
1302
+ manufacturer=ManufacturerEnum.NVIDIA,
1303
+ runtime_env="NVIDIA_VISIBLE_DEVICES",
1304
+ backend_env=["CUDA_VISIBLE_DEVICES"],
1305
+ cdi="nvidia.com/gpu",
1306
+ runtime_values={"0": "GPU-11111111-2222-3333-4444-555555555555"},
1307
+ backend_values={"CUDA_VISIBLE_DEVICES": {"0": "0"}},
1308
+ numa_affinities={"0": "0-1"},
1309
+ cpus_affinities={"0": "0-7"},
1310
+ ),
1311
+ "AMD_VISIBLE_DEVICES": DevicesMaterial(
1312
+ manufacturer=ManufacturerEnum.AMD,
1313
+ runtime_env="AMD_VISIBLE_DEVICES",
1314
+ backend_env=["HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"],
1315
+ cdi="amd.com/gpu",
1316
+ runtime_values={"0": "0", "1": "1"},
1317
+ backend_values={"HIP_VISIBLE_DEVICES": {"0": "0", "1": "1"},"ROCR_VISIBLE_DEVICES": {"0": "0", "1": "1"}},
1318
+ numa_affinities={"0": "0-1", "1": "0-1"},
1319
+ cpus_affinities={"0": "0-7", "1": "8-15"},
1320
+ ),
1336
1321
  }.
1337
1322
  """
1338
1323
 
@@ -1359,21 +1344,12 @@ class Deployer(ABC):
1359
1344
 
1360
1345
  def _prepare(self):
1361
1346
  """
1362
- Detect devices once, and construct critical elements for post-processing, including:
1363
- - Prepare visible devices manufacturers mapping.
1364
- - Prepare visible devices environment variables mapping.
1365
- - Prepare visible devices values mapping.
1366
- - Prepare visible devices topologies mapping.
1347
+ Detect devices and prepare materials.
1367
1348
  """
1368
- if self._visible_devices_manufacturers is not None:
1349
+ if self._materials is not None:
1369
1350
  return
1370
1351
 
1371
- self._visible_devices_manufacturers = {}
1372
- self._visible_devices_env = {}
1373
- self._visible_devices_cdis = {}
1374
- self._visible_devices_values = {}
1375
- self._visible_devices_topologies = {}
1376
- self._backend_visible_devices_values_alignment = {}
1352
+ self._materials = {}
1377
1353
 
1378
1354
  group_devices = group_devices_by_manufacturer(
1379
1355
  detect_devices(fast=False),
@@ -1398,178 +1374,272 @@ class Deployer(ABC):
1398
1374
  )
1399
1375
  if ren and ben_list:
1400
1376
  valued_uuid = (
1401
- ren
1402
- in envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
1377
+ self.allowed_uuid_values
1378
+ and (
1379
+ ren
1380
+ in envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
1381
+ )
1403
1382
  and manu != ManufacturerEnum.ASCEND
1404
1383
  )
1405
- dev_uuids: list[str] = []
1406
- dev_indexes: list[str] = []
1407
- dev_indexes_alignment: dict[str, str] = {}
1384
+ dev_runtime_values: dict[str, str] = {}
1385
+ dev_backend_values: dict[str, str] = {}
1386
+ dev_backend_aligned_values: dict[str, str] = {}
1387
+ dev_numa_affinities: dict[str, str] = {}
1388
+ dev_cpus_affinities: dict[str, str] = {}
1408
1389
  for dev_i, dev in enumerate(devs):
1409
- dev_uuids.append(dev.uuid)
1410
- dev_indexes.append(str(dev.index))
1411
- dev_indexes_alignment[str(dev.index)] = str(dev_i)
1412
- # Map runtime visible devices env <-> manufacturer.
1413
- self._visible_devices_manufacturers[ren] = manu
1414
- # Map runtime visible devices env <-> backend visible devices env list.
1415
- self._visible_devices_env[ren] = ben_list
1416
- # Map runtime visible devices env <-> CDI key.
1417
- self._visible_devices_cdis[ren] = cdi
1418
- # Map runtime visible devices env <-> device indexes or uuids.
1419
- self._visible_devices_values[ren] = (
1420
- dev_uuids if valued_uuid else dev_indexes
1421
- )
1422
- # Map runtime visible devices env <-> topology.
1423
- if (
1424
- envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
1425
- or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
1426
- ):
1427
- topos = get_devices_topologies(devices=devs)
1428
- if topos:
1429
- self._visible_devices_topologies[ren] = topos[0]
1430
- # Map backend visible devices env <-> devices alignment.
1431
- if not valued_uuid:
1432
- for ben in ben_list:
1433
- valued_alignment = (
1434
- ben
1390
+ dev_index = str(dev.index)
1391
+ if valued_uuid:
1392
+ dev_runtime_values[dev_index] = dev.uuid
1393
+ else:
1394
+ dev_runtime_values[dev_index] = dev_index
1395
+ dev_backend_values[dev_index] = dev_index
1396
+ dev_backend_aligned_values[dev_index] = str(dev_i)
1397
+ dev_numa_affinities[dev_index] = dev.appendix.get("numa", "")
1398
+ dev_cpus_affinities[dev_index] = map_numa_node_to_cpu_affinity(
1399
+ dev_numa_affinities[dev_index],
1400
+ )
1401
+
1402
+ self._materials[ren] = DevicesMaterial(
1403
+ manufacturer=manu,
1404
+ runtime_env=ren,
1405
+ backend_env=ben_list,
1406
+ cdi=cdi,
1407
+ runtime_values=dev_runtime_values,
1408
+ backend_values={
1409
+ ben: (
1410
+ dev_backend_aligned_values
1411
+ if ben
1435
1412
  in envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
1413
+ else dev_backend_values
1436
1414
  )
1437
- if valued_alignment:
1438
- self._backend_visible_devices_values_alignment[ben] = (
1439
- dev_indexes_alignment
1440
- )
1415
+ for ben in ben_list
1416
+ },
1417
+ numa_affinities=dev_numa_affinities,
1418
+ cpus_affinities=dev_cpus_affinities,
1419
+ )
1441
1420
 
1442
- if self._visible_devices_env:
1421
+ if self._materials:
1443
1422
  return
1444
1423
 
1445
1424
  # Fallback to unknown backend
1446
1425
  ren = "UNKNOWN_RUNTIME_VISIBLE_DEVICES"
1447
- self._visible_devices_manufacturers[ren] = ManufacturerEnum.UNKNOWN
1448
- self._visible_devices_env[ren] = []
1449
- self._visible_devices_cdis[ren] = "unknown/devices"
1450
- self._visible_devices_values[ren] = ["all"]
1426
+ ben_list = ["UNKNOWN_BACKEND_VISIBLE_DEVICES"]
1427
+ cdi = "unknown.com/gpu"
1428
+ self._materials[ren] = DevicesMaterial(
1429
+ manufacturer=ManufacturerEnum.UNKNOWN,
1430
+ runtime_env=ren,
1431
+ backend_env=ben_list,
1432
+ cdi=cdi,
1433
+ runtime_values={"all": "all"},
1434
+ backend_values={ben: {"all": "all"} for ben in ben_list},
1435
+ )
1451
1436
 
1452
- def get_visible_devices_materials(
1437
+ def _get_materials(
1453
1438
  self,
1454
- ) -> (
1455
- dict[str, ManufacturerEnum],
1456
- dict[str, list[str]],
1457
- dict[str, str],
1458
- dict[str, list[str]],
1459
- ):
1439
+ ) -> dict[str, DevicesMaterial]:
1460
1440
  """
1461
- Return the visible devices environment variables, cdis and values mappings.
1462
- For example:
1463
- (
1464
- {
1465
- "NVIDIA_VISIBLE_DEVICES": ManufacturerEnum.NVIDIA,
1466
- "AMD_VISIBLE_DEVICES": ManufacturerEnum.AMD
1467
- },
1468
- {
1469
- "NVIDIA_VISIBLE_DEVICES": ["CUDA_VISIBLE_DEVICES"],
1470
- "AMD_VISIBLE_DEVICES": ["HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"]
1471
- },
1472
- {
1473
- "NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu",
1474
- "AMD_VISIBLE_DEVICES": "amd.com/gpu"
1475
- },
1476
- {
1477
- "NVIDIA_VISIBLE_DEVICES": ["0"],
1478
- "AMD_VISIBLE_DEVICES": ["0", "1"]
1479
- }
1480
- ).
1441
+ Return the devices materials mapping.
1481
1442
 
1482
1443
  Returns:
1483
- A tuple of four dictionaries:
1484
- - The first dictionary maps runtime visible devices environment variable names
1485
- to corresponding manufacturers.
1486
- - The second dictionary maps runtime visible devices environment variable names
1487
- to lists of backend visible devices environment variable names.
1488
- - The third dictionary maps runtime visible devices environment variable names
1489
- to corresponding CDI keys.
1490
- - The last dictionary maps runtime visible devices environment variable names
1491
- to lists of device indexes or UUIDs.
1444
+ A dictionary mapping runtime visible devices environment variable names
1445
+ to corresponding devices materials.
1492
1446
 
1493
1447
  """
1494
1448
  self._prepare()
1495
- return (
1496
- self._visible_devices_manufacturers,
1497
- self._visible_devices_env,
1498
- self._visible_devices_cdis,
1499
- self._visible_devices_values,
1500
- )
1449
+ return self._materials
1501
1450
 
1502
- def get_visible_devices_affinities(
1451
+ def get_manufacturer(
1503
1452
  self,
1504
- runtime_env: list[str],
1505
- resource_value: str,
1506
- ) -> tuple[str, str]:
1453
+ runtime_env: str,
1454
+ ) -> ManufacturerEnum:
1507
1455
  """
1508
- Get the CPU and NUMA affinities for the given runtime environment and resource value.
1456
+ Return the manufacturer for the given runtime visible devices env name.
1509
1457
 
1510
1458
  Args:
1511
1459
  runtime_env:
1512
- The list of runtime visible devices environment variable names.
1513
- resource_value:
1514
- The resource value, which can be "all" or a comma-separated list of device indexes
1460
+ The runtime visible devices environment variable name.
1515
1461
 
1516
1462
  Returns:
1517
- A tuple containing:
1518
- - A comma-separated string of CPU affinities.
1519
- - A comma-separated string of NUMA affinities.
1463
+ The manufacturer enum.
1520
1464
 
1521
1465
  """
1522
- dev_indexes = []
1523
- if resource_value != "all":
1524
- dev_indexes = [int(v.strip()) for v in resource_value.split(",")]
1466
+ m = self._get_materials()
1525
1467
 
1526
- cpus_set: list[str] = []
1527
- numas_set: list[str] = []
1528
- for re_ in runtime_env:
1529
- topo = self._visible_devices_topologies.get(re_)
1530
- if topo:
1531
- cs, ns = topo.get_affinities(dev_indexes, deduplicate=False)
1532
- cpus_set.extend(cs)
1533
- numas_set.extend(ns)
1468
+ if runtime_env not in m:
1469
+ return ManufacturerEnum.UNKNOWN
1534
1470
 
1535
- return ",".join(set(cpus_set)), ",".join(set(numas_set))
1471
+ return m[runtime_env].manufacturer
1536
1472
 
1537
- def align_backend_visible_devices_env_values(
1473
+ def get_runtime_envs(
1538
1474
  self,
1539
- backend_visible_devices_env: str,
1540
- resource_key_values: str,
1541
- ) -> str:
1475
+ ) -> list[str]:
1476
+ """
1477
+ Return the supported runtime visible devices env names.
1478
+
1479
+ Returns:
1480
+ A list of supported runtime visible devices environment variable names.
1481
+
1482
+ """
1483
+ m = self._get_materials()
1484
+ return list(m.keys())
1485
+
1486
+ def get_runtime_visible_devices(
1487
+ self,
1488
+ runtime_env: str,
1489
+ fmt: str = "plain",
1490
+ ) -> list[str]:
1542
1491
  """
1543
- Return the aligned backend visible devices environment variable values.
1544
- For example, if the backend visible devices env is "ASCEND_RT_VISIBLE_DEVICES",
1545
- and the `resource_key_values` is "4,6", and the detected devices are with indexes
1546
- [4,5,6,7], then the aligned result will be "0,2".
1492
+ Return the runtime visible devices values for the given runtime visible devices env name.
1547
1493
 
1548
1494
  Args:
1549
- backend_visible_devices_env:
1550
- The backend visible devices environment variable name.
1551
- resource_key_values:
1552
- The resource key values to align.
1495
+ runtime_env:
1496
+ The runtime visible devices environment variable name.
1497
+ fmt:
1498
+ The format of the returned values,
1499
+ can be "cdi", "kdp", or "plain".
1553
1500
 
1554
1501
  Returns:
1555
- The aligned backend visible devices environment variable values.
1556
- If no alignment is needed, return the original `resource_key_values`.
1502
+ A list of runtime visible devices values.
1557
1503
 
1558
1504
  """
1559
- if (
1560
- backend_visible_devices_env
1561
- not in envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
1562
- ):
1563
- return resource_key_values
1564
- self._prepare()
1565
- alignments = self._backend_visible_devices_values_alignment.get(
1566
- backend_visible_devices_env,
1567
- )
1568
- if not alignments:
1569
- return resource_key_values
1570
- return ",".join(
1571
- [alignments.get(v, v) for v in resource_key_values.split(",")],
1505
+ m = self._get_materials()
1506
+
1507
+ if runtime_env not in m:
1508
+ return []
1509
+
1510
+ rm = m[runtime_env]
1511
+ match fmt:
1512
+ case "cdi":
1513
+ return [f"{rm.cdi}={v}" for v in rm.runtime_values.values()]
1514
+ case "kdp":
1515
+ return [
1516
+ cdi_kind_to_kdp_resource(rm.cdi, v)
1517
+ for v in rm.runtime_values.values()
1518
+ ]
1519
+ return list(rm.runtime_values.values())
1520
+
1521
+ def map_runtime_visible_devices(
1522
+ self,
1523
+ runtime_env: str,
1524
+ resource_values: list[str],
1525
+ fmt: str = "plain",
1526
+ ) -> list[str]:
1527
+ """
1528
+ Map the given resource values to runtime visible devices values
1529
+ for the given runtime visible devices env name.
1530
+
1531
+ Args:
1532
+ runtime_env:
1533
+ The runtime visible devices environment variable name.
1534
+ resource_values:
1535
+ The resource values to map.
1536
+ fmt:
1537
+ The format of the returned values,
1538
+ can be "cdi", "kdp", or "plain".
1539
+
1540
+ Returns:
1541
+ A list of mapped runtime visible devices values.
1542
+
1543
+ """
1544
+ m = self._get_materials()
1545
+
1546
+ if runtime_env not in m:
1547
+ return []
1548
+
1549
+ rm = m[runtime_env]
1550
+ match fmt:
1551
+ case "cdi":
1552
+ return [
1553
+ f"{rm.cdi}={rm.runtime_values.get(v, v)}" for v in resource_values
1554
+ ]
1555
+ case "kdp":
1556
+ return [
1557
+ cdi_kind_to_kdp_resource(rm.cdi, rm.runtime_values.get(v, v))
1558
+ for v in resource_values
1559
+ ]
1560
+ return [rm.runtime_values.get(v, v) for v in resource_values]
1561
+
1562
+ def map_backend_visible_devices(
1563
+ self,
1564
+ runtime_envs: list[str],
1565
+ resource_values: list[str],
1566
+ ) -> dict[str, str]:
1567
+ """
1568
+ Map the given resource values to backend visible devices values
1569
+ for the given runtime visible devices env names.
1570
+
1571
+ Args:
1572
+ runtime_envs:
1573
+ The runtime visible devices environment variable names.
1574
+ resource_values:
1575
+ The resource values to map.
1576
+
1577
+ Returns:
1578
+ A dictionary mapping backend visible devices environment variable names
1579
+ to corresponding mapped backend visible devices values.
1580
+
1581
+ """
1582
+ m = self._get_materials()
1583
+
1584
+ ret = {}
1585
+ for runtime_env in runtime_envs:
1586
+ if runtime_env not in m:
1587
+ continue
1588
+ rm = m[runtime_env]
1589
+ for ben in rm.backend_env:
1590
+ ret[ben] = ",".join(
1591
+ [rm.backend_values[ben].get(v, v) for v in resource_values],
1592
+ )
1593
+ return ret
1594
+
1595
+ def map_visible_devices_affinities(
1596
+ self,
1597
+ runtime_envs: list[str],
1598
+ resource_values: list[str],
1599
+ ) -> dict[str, str]:
1600
+ """
1601
+ Map the given resource values to visible devices affinities
1602
+ for the given runtime visible devices env names.
1603
+
1604
+ Args:
1605
+ runtime_envs:
1606
+ The runtime visible devices environment variable names.
1607
+ resource_values:
1608
+ The resource values to map.
1609
+
1610
+ Returns:
1611
+ A dictionary mapping "cpuset_cpus" and/or "cpuset_mems"
1612
+ to corresponding mapped affinities strings.
1613
+
1614
+ """
1615
+ valued_affinity = (
1616
+ envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
1617
+ or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
1572
1618
  )
1619
+ if not valued_affinity:
1620
+ return {}
1621
+
1622
+ m = self._get_materials()
1623
+
1624
+ ret = {}
1625
+ for runtime_env in runtime_envs:
1626
+ if runtime_env not in m:
1627
+ continue
1628
+ rm = m[runtime_env]
1629
+ cpus_set = set[str]()
1630
+ numas_set = set[str]()
1631
+ for v in resource_values:
1632
+ if v in rm.cpus_affinities:
1633
+ cpus_set.add(rm.cpus_affinities[v])
1634
+ if not envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
1635
+ continue
1636
+ if v in rm.numa_affinities:
1637
+ numas_set.add(rm.numa_affinities[v])
1638
+ if cpus := ",".join(sorted(cpus_set)):
1639
+ ret["cpuset_cpus"] = cpus
1640
+ if numas := ",".join(sorted(numas_set)):
1641
+ ret["cpuset_mems"] = numas
1642
+ return ret
1573
1643
 
1574
1644
  @property
1575
1645
  def name(self) -> str:
@@ -1582,6 +1652,17 @@ class Deployer(ABC):
1582
1652
  """
1583
1653
  return self._name
1584
1654
 
1655
+ @property
1656
+ def allowed_uuid_values(self) -> bool:
1657
+ """
1658
+ Return whether the deployer allows using UUIDs as visible devices values.
1659
+
1660
+ Returns:
1661
+ True if allowed, False otherwise.
1662
+
1663
+ """
1664
+ return True
1665
+
1585
1666
  def close(self):
1586
1667
  if self._pool:
1587
1668
  self._pool.shutdown(cancel_futures=True)