gpustack-runtime 0.1.41.post2__py3-none-any.whl → 0.1.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/detector.py +3 -1
- gpustack_runtime/deployer/__types__.py +314 -233
- gpustack_runtime/deployer/cdi/__utils__.py +4 -1
- gpustack_runtime/deployer/docker.py +109 -148
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +21 -3
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
- gpustack_runtime/deployer/kuberentes.py +91 -126
- gpustack_runtime/deployer/podman.py +89 -122
- gpustack_runtime/detector/__init__.py +2 -0
- gpustack_runtime/detector/__types__.py +26 -0
- gpustack_runtime/detector/amd.py +28 -8
- gpustack_runtime/detector/ascend.py +49 -4
- gpustack_runtime/detector/cambricon.py +3 -0
- gpustack_runtime/detector/hygon.py +16 -1
- gpustack_runtime/detector/iluvatar.py +6 -0
- gpustack_runtime/detector/metax.py +8 -0
- gpustack_runtime/detector/mthreads.py +11 -0
- gpustack_runtime/detector/nvidia.py +139 -134
- gpustack_runtime/detector/pyixml/__init__.py +16 -0
- gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
- gpustack_runtime/detector/thead.py +135 -127
- gpustack_runtime/envs.py +7 -6
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
|
@@ -15,12 +15,11 @@ from dataclasses_json import dataclass_json
|
|
|
15
15
|
from .. import envs
|
|
16
16
|
from ..detector import (
|
|
17
17
|
ManufacturerEnum,
|
|
18
|
-
Topology,
|
|
19
18
|
detect_devices,
|
|
20
|
-
get_devices_topologies,
|
|
21
19
|
group_devices_by_manufacturer,
|
|
22
20
|
manufacturer_to_backend,
|
|
23
21
|
)
|
|
22
|
+
from ..detector.__utils__ import map_numa_node_to_cpu_affinity
|
|
24
23
|
from .__utils__ import (
|
|
25
24
|
adjust_image_with_envs,
|
|
26
25
|
correct_runner_image,
|
|
@@ -31,6 +30,7 @@ from .__utils__ import (
|
|
|
31
30
|
safe_yaml,
|
|
32
31
|
validate_rfc1123_subdomain_name,
|
|
33
32
|
)
|
|
33
|
+
from .k8s.deviceplugin import cdi_kind_to_kdp_resource
|
|
34
34
|
|
|
35
35
|
if TYPE_CHECKING:
|
|
36
36
|
from collections.abc import AsyncGenerator, Generator
|
|
@@ -839,17 +839,6 @@ class WorkloadPlan(WorkloadSecurity):
|
|
|
839
839
|
Base plan class for all workloads.
|
|
840
840
|
|
|
841
841
|
Attributes:
|
|
842
|
-
resource_key_runtime_env_mapping: (dict[str, str]):
|
|
843
|
-
Mapping from resource names to environment variable names for device allocation,
|
|
844
|
-
which is used to tell the Container Runtime which GPUs to mount into the container.
|
|
845
|
-
For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
|
|
846
|
-
which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
|
|
847
|
-
With privileged mode, the container can access all GPUs even if specified.
|
|
848
|
-
resource_key_backend_env_mapping: (dict[str, list[str]]):
|
|
849
|
-
Mapping from resource names to environment variable names for device runtime,
|
|
850
|
-
which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
|
|
851
|
-
For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
|
|
852
|
-
which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
|
|
853
842
|
name (WorkloadName):
|
|
854
843
|
Name for the workload, it should be unique in the deployer.
|
|
855
844
|
labels (dict[str, str] | None):
|
|
@@ -876,25 +865,6 @@ class WorkloadPlan(WorkloadSecurity):
|
|
|
876
865
|
|
|
877
866
|
"""
|
|
878
867
|
|
|
879
|
-
resource_key_runtime_env_mapping: dict[str, str] = field(
|
|
880
|
-
default_factory=lambda: envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES,
|
|
881
|
-
)
|
|
882
|
-
"""
|
|
883
|
-
Mapping from resource names to environment variable names for device allocation,
|
|
884
|
-
which is used to tell the Container Runtime which GPUs to mount into the container.
|
|
885
|
-
For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
|
|
886
|
-
which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
|
|
887
|
-
With privileged mode, the container can access all GPUs even if specified.
|
|
888
|
-
"""
|
|
889
|
-
resource_key_backend_env_mapping: dict[str, list[str]] = field(
|
|
890
|
-
default_factory=lambda: envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_BACKEND_VISIBLE_DEVICES,
|
|
891
|
-
)
|
|
892
|
-
"""
|
|
893
|
-
Mapping from resource names to environment variable names for device runtime,
|
|
894
|
-
which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
|
|
895
|
-
For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
|
|
896
|
-
which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
|
|
897
|
-
"""
|
|
898
868
|
namespace: WorkloadNamespace | None = None
|
|
899
869
|
"""
|
|
900
870
|
Namespace for the workload.
|
|
@@ -1256,83 +1226,98 @@ def _default_args(func):
|
|
|
1256
1226
|
return wrapper
|
|
1257
1227
|
|
|
1258
1228
|
|
|
1259
|
-
|
|
1229
|
+
@dataclass
|
|
1230
|
+
class DevicesMaterial:
|
|
1231
|
+
manufacturer: ManufacturerEnum = ManufacturerEnum.UNKNOWN
|
|
1260
1232
|
"""
|
|
1261
|
-
|
|
1233
|
+
Manufacturer of devices,
|
|
1234
|
+
e.g. for NVIDIA, it is ManufacturerEnum.NVIDIA.
|
|
1262
1235
|
"""
|
|
1263
|
-
|
|
1264
|
-
_name: str = "unknown"
|
|
1236
|
+
runtime_env: str = ""
|
|
1265
1237
|
"""
|
|
1266
|
-
|
|
1238
|
+
Runtime visible devices env name for devices,
|
|
1239
|
+
e.g. for NVIDIA, it is NVIDIA_VISIBLE_DEVICES.
|
|
1267
1240
|
"""
|
|
1268
|
-
|
|
1241
|
+
backend_env: list[str] = field(default_factory=list)
|
|
1269
1242
|
"""
|
|
1270
|
-
|
|
1243
|
+
Backend visible devices env name for devices,
|
|
1244
|
+
e.g. for AMD, it can be both HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES.
|
|
1271
1245
|
"""
|
|
1272
|
-
|
|
1246
|
+
cdi: str = ""
|
|
1273
1247
|
"""
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
the value is the corresponding manufacturer.
|
|
1277
|
-
For example:
|
|
1278
|
-
{
|
|
1279
|
-
"NVIDIA_VISIBLE_DEVICES": ManufacturerEnum.NVIDIA,
|
|
1280
|
-
"AMD_VISIBLE_DEVICES": ManufacturerEnum.AMD
|
|
1281
|
-
}.
|
|
1248
|
+
CDI key for devices,
|
|
1249
|
+
e.g. for NVIDIA, it is nvidia.com/gpu.
|
|
1282
1250
|
"""
|
|
1283
|
-
|
|
1251
|
+
runtime_values: dict[str, str] = field(default_factory=dict)
|
|
1284
1252
|
"""
|
|
1285
|
-
|
|
1286
|
-
the key is the
|
|
1287
|
-
the value is the
|
|
1288
|
-
For example:
|
|
1289
|
-
{
|
|
1290
|
-
"NVIDIA_VISIBLE_DEVICES": ["CUDA_VISIBLE_DEVICES"],
|
|
1291
|
-
"AMD_VISIBLE_DEVICES": ["HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"]
|
|
1292
|
-
}.
|
|
1253
|
+
Mapping devices to runtime visible devices env values,
|
|
1254
|
+
the key is the device index string,
|
|
1255
|
+
the value is the device index string or uuid.
|
|
1256
|
+
For example, {"0": "GPU-11111111-2222-3333-4444-555555555555"} for NVIDIA.
|
|
1293
1257
|
"""
|
|
1294
|
-
|
|
1258
|
+
backend_values: dict[str, dict[str, str]] = field(default_factory=dict)
|
|
1295
1259
|
"""
|
|
1296
|
-
|
|
1297
|
-
the key is the
|
|
1298
|
-
the value is the
|
|
1299
|
-
For example:
|
|
1300
|
-
{
|
|
1301
|
-
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu",
|
|
1302
|
-
"AMD_VISIBLE_DEVICES": "amd.com/gpu"
|
|
1303
|
-
}.
|
|
1260
|
+
Mapping devices to backend visible devices env values,
|
|
1261
|
+
the key is the backend visible devices env name,
|
|
1262
|
+
the value maps each device index string to the device index string or aligned device index string.
|
|
1263
|
+
For example, {"NPU_VISIBLE_DEVICES": {"4": "0", "5": "1"}} for Ascend.
|
|
1304
1264
|
"""
|
|
1305
|
-
|
|
1265
|
+
numa_affinities: dict[str, str] = field(default_factory=dict)
|
|
1306
1266
|
"""
|
|
1307
|
-
|
|
1308
|
-
the key is the
|
|
1309
|
-
the value is the
|
|
1310
|
-
For example:
|
|
1311
|
-
{
|
|
1312
|
-
"NVIDIA_VISIBLE_DEVICES": ["0"],
|
|
1313
|
-
"AMD_VISIBLE_DEVICES": ["0", "1"]
|
|
1314
|
-
}.
|
|
1267
|
+
Mapping devices to NUMA affinities,
|
|
1268
|
+
the key is the device index string,
|
|
1269
|
+
the value is the NUMA node string.
|
|
1270
|
+
For example, {"0": "0-1"}.
|
|
1315
1271
|
"""
|
|
1316
|
-
|
|
1272
|
+
cpus_affinities: dict[str, str] = field(default_factory=dict)
|
|
1317
1273
|
"""
|
|
1318
|
-
|
|
1319
|
-
the key is the
|
|
1320
|
-
the value is the
|
|
1321
|
-
For example:
|
|
1322
|
-
{
|
|
1323
|
-
"NVIDIA_VISIBLE_DEVICES": Topology(...),
|
|
1324
|
-
"AMD_VISIBLE_DEVICES": Topology(...)
|
|
1325
|
-
}.
|
|
1274
|
+
Mapping devices to CPUs affinities,
|
|
1275
|
+
the key is the device index string,
|
|
1276
|
+
the value is the CPU cores string.
|
|
1277
|
+
For example, {"0": "0-7"}.
|
|
1326
1278
|
"""
|
|
1327
|
-
|
|
1279
|
+
|
|
1280
|
+
|
|
1281
|
+
class Deployer(ABC):
|
|
1328
1282
|
"""
|
|
1329
|
-
|
|
1283
|
+
Base class for all deployers.
|
|
1284
|
+
"""
|
|
1285
|
+
|
|
1286
|
+
_name: str = "unknown"
|
|
1287
|
+
"""
|
|
1288
|
+
Name of the deployer.
|
|
1289
|
+
"""
|
|
1290
|
+
_pool: ThreadPoolExecutor | None = None
|
|
1291
|
+
"""
|
|
1292
|
+
Thread pool for the deployer.
|
|
1293
|
+
"""
|
|
1294
|
+
_materials: dict[str, DevicesMaterial] | None = None
|
|
1295
|
+
"""
|
|
1296
|
+
Mapping devices materials,
|
|
1330
1297
|
the key is the runtime visible devices env name,
|
|
1331
|
-
the value is the
|
|
1298
|
+
the value is the corresponding devices material.
|
|
1332
1299
|
For example:
|
|
1333
1300
|
{
|
|
1334
|
-
"
|
|
1335
|
-
|
|
1301
|
+
"NVIDIA_VISIBLE_DEVICES": DevicesMaterial(
|
|
1302
|
+
manufacturer=ManufacturerEnum.NVIDIA,
|
|
1303
|
+
runtime_env="NVIDIA_VISIBLE_DEVICES",
|
|
1304
|
+
backend_env=["CUDA_VISIBLE_DEVICES"],
|
|
1305
|
+
cdi="nvidia.com/gpu",
|
|
1306
|
+
runtime_values={"0": "GPU-11111111-2222-3333-4444-555555555555"},
|
|
1307
|
+
backend_values={"CUDA_VISIBLE_DEVICES": {"0": "0"}},
|
|
1308
|
+
numa_affinities={"0": "0-1"},
|
|
1309
|
+
cpus_affinities={"0": "0-7"},
|
|
1310
|
+
),
|
|
1311
|
+
"AMD_VISIBLE_DEVICES": DevicesMaterial(
|
|
1312
|
+
manufacturer=ManufacturerEnum.AMD,
|
|
1313
|
+
runtime_env="AMD_VISIBLE_DEVICES",
|
|
1314
|
+
backend_env=["HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"],
|
|
1315
|
+
cdi="amd.com/gpu",
|
|
1316
|
+
runtime_values={"0": "0", "1": "1"},
|
|
1317
|
+
backend_values={"HIP_VISIBLE_DEVICES": {"0": "0", "1": "1"}, "ROCR_VISIBLE_DEVICES": {"0": "0", "1": "1"}},
|
|
1318
|
+
numa_affinities={"0": "0-1", "1": "0-1"},
|
|
1319
|
+
cpus_affinities={"0": "0-7", "1": "8-15"},
|
|
1320
|
+
),
|
|
1336
1321
|
}.
|
|
1337
1322
|
"""
|
|
1338
1323
|
|
|
@@ -1359,21 +1344,12 @@ class Deployer(ABC):
|
|
|
1359
1344
|
|
|
1360
1345
|
def _prepare(self):
|
|
1361
1346
|
"""
|
|
1362
|
-
Detect devices
|
|
1363
|
-
- Prepare visible devices manufacturers mapping.
|
|
1364
|
-
- Prepare visible devices environment variables mapping.
|
|
1365
|
-
- Prepare visible devices values mapping.
|
|
1366
|
-
- Prepare visible devices topologies mapping.
|
|
1347
|
+
Detect devices and prepare materials.
|
|
1367
1348
|
"""
|
|
1368
|
-
if self.
|
|
1349
|
+
if self._materials is not None:
|
|
1369
1350
|
return
|
|
1370
1351
|
|
|
1371
|
-
self.
|
|
1372
|
-
self._visible_devices_env = {}
|
|
1373
|
-
self._visible_devices_cdis = {}
|
|
1374
|
-
self._visible_devices_values = {}
|
|
1375
|
-
self._visible_devices_topologies = {}
|
|
1376
|
-
self._backend_visible_devices_values_alignment = {}
|
|
1352
|
+
self._materials = {}
|
|
1377
1353
|
|
|
1378
1354
|
group_devices = group_devices_by_manufacturer(
|
|
1379
1355
|
detect_devices(fast=False),
|
|
@@ -1398,178 +1374,272 @@ class Deployer(ABC):
|
|
|
1398
1374
|
)
|
|
1399
1375
|
if ren and ben_list:
|
|
1400
1376
|
valued_uuid = (
|
|
1401
|
-
|
|
1402
|
-
|
|
1377
|
+
self.allowed_uuid_values
|
|
1378
|
+
and (
|
|
1379
|
+
ren
|
|
1380
|
+
in envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
|
|
1381
|
+
)
|
|
1403
1382
|
and manu != ManufacturerEnum.ASCEND
|
|
1404
1383
|
)
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1384
|
+
dev_runtime_values: dict[str, str] = {}
|
|
1385
|
+
dev_backend_values: dict[str, str] = {}
|
|
1386
|
+
dev_backend_aligned_values: dict[str, str] = {}
|
|
1387
|
+
dev_numa_affinities: dict[str, str] = {}
|
|
1388
|
+
dev_cpus_affinities: dict[str, str] = {}
|
|
1408
1389
|
for dev_i, dev in enumerate(devs):
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
if not valued_uuid:
|
|
1432
|
-
for ben in ben_list:
|
|
1433
|
-
valued_alignment = (
|
|
1434
|
-
ben
|
|
1390
|
+
dev_index = str(dev.index)
|
|
1391
|
+
if valued_uuid:
|
|
1392
|
+
dev_runtime_values[dev_index] = dev.uuid
|
|
1393
|
+
else:
|
|
1394
|
+
dev_runtime_values[dev_index] = dev_index
|
|
1395
|
+
dev_backend_values[dev_index] = dev_index
|
|
1396
|
+
dev_backend_aligned_values[dev_index] = str(dev_i)
|
|
1397
|
+
dev_numa_affinities[dev_index] = dev.appendix.get("numa", "")
|
|
1398
|
+
dev_cpus_affinities[dev_index] = map_numa_node_to_cpu_affinity(
|
|
1399
|
+
dev_numa_affinities[dev_index],
|
|
1400
|
+
)
|
|
1401
|
+
|
|
1402
|
+
self._materials[ren] = DevicesMaterial(
|
|
1403
|
+
manufacturer=manu,
|
|
1404
|
+
runtime_env=ren,
|
|
1405
|
+
backend_env=ben_list,
|
|
1406
|
+
cdi=cdi,
|
|
1407
|
+
runtime_values=dev_runtime_values,
|
|
1408
|
+
backend_values={
|
|
1409
|
+
ben: (
|
|
1410
|
+
dev_backend_aligned_values
|
|
1411
|
+
if ben
|
|
1435
1412
|
in envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
|
|
1413
|
+
else dev_backend_values
|
|
1436
1414
|
)
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1415
|
+
for ben in ben_list
|
|
1416
|
+
},
|
|
1417
|
+
numa_affinities=dev_numa_affinities,
|
|
1418
|
+
cpus_affinities=dev_cpus_affinities,
|
|
1419
|
+
)
|
|
1441
1420
|
|
|
1442
|
-
if self.
|
|
1421
|
+
if self._materials:
|
|
1443
1422
|
return
|
|
1444
1423
|
|
|
1445
1424
|
# Fallback to unknown backend
|
|
1446
1425
|
ren = "UNKNOWN_RUNTIME_VISIBLE_DEVICES"
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
self.
|
|
1450
|
-
|
|
1426
|
+
ben_list = ["UNKNOWN_BACKEND_VISIBLE_DEVICES"]
|
|
1427
|
+
cdi = "unknown.com/gpu"
|
|
1428
|
+
self._materials[ren] = DevicesMaterial(
|
|
1429
|
+
manufacturer=ManufacturerEnum.UNKNOWN,
|
|
1430
|
+
runtime_env=ren,
|
|
1431
|
+
backend_env=ben_list,
|
|
1432
|
+
cdi=cdi,
|
|
1433
|
+
runtime_values={"all": "all"},
|
|
1434
|
+
backend_values={ben: {"all": "all"} for ben in ben_list},
|
|
1435
|
+
)
|
|
1451
1436
|
|
|
1452
|
-
def
|
|
1437
|
+
def _get_materials(
|
|
1453
1438
|
self,
|
|
1454
|
-
) ->
|
|
1455
|
-
dict[str, ManufacturerEnum],
|
|
1456
|
-
dict[str, list[str]],
|
|
1457
|
-
dict[str, str],
|
|
1458
|
-
dict[str, list[str]],
|
|
1459
|
-
):
|
|
1439
|
+
) -> dict[str, DevicesMaterial]:
|
|
1460
1440
|
"""
|
|
1461
|
-
Return the
|
|
1462
|
-
For example:
|
|
1463
|
-
(
|
|
1464
|
-
{
|
|
1465
|
-
"NVIDIA_VISIBLE_DEVICES": ManufacturerEnum.NVIDIA,
|
|
1466
|
-
"AMD_VISIBLE_DEVICES": ManufacturerEnum.AMD
|
|
1467
|
-
},
|
|
1468
|
-
{
|
|
1469
|
-
"NVIDIA_VISIBLE_DEVICES": ["CUDA_VISIBLE_DEVICES"],
|
|
1470
|
-
"AMD_VISIBLE_DEVICES": ["HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"]
|
|
1471
|
-
},
|
|
1472
|
-
{
|
|
1473
|
-
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu",
|
|
1474
|
-
"AMD_VISIBLE_DEVICES": "amd.com/gpu"
|
|
1475
|
-
},
|
|
1476
|
-
{
|
|
1477
|
-
"NVIDIA_VISIBLE_DEVICES": ["0"],
|
|
1478
|
-
"AMD_VISIBLE_DEVICES": ["0", "1"]
|
|
1479
|
-
}
|
|
1480
|
-
).
|
|
1441
|
+
Return the devices materials mapping.
|
|
1481
1442
|
|
|
1482
1443
|
Returns:
|
|
1483
|
-
A
|
|
1484
|
-
|
|
1485
|
-
to corresponding manufacturers.
|
|
1486
|
-
- The second dictionary maps runtime visible devices environment variable names
|
|
1487
|
-
to lists of backend visible devices environment variable names.
|
|
1488
|
-
- The third dictionary maps runtime visible devices environment variable names
|
|
1489
|
-
to corresponding CDI keys.
|
|
1490
|
-
- The last dictionary maps runtime visible devices environment variable names
|
|
1491
|
-
to lists of device indexes or UUIDs.
|
|
1444
|
+
A dictionary mapping runtime visible devices environment variable names
|
|
1445
|
+
to corresponding devices materials.
|
|
1492
1446
|
|
|
1493
1447
|
"""
|
|
1494
1448
|
self._prepare()
|
|
1495
|
-
return
|
|
1496
|
-
self._visible_devices_manufacturers,
|
|
1497
|
-
self._visible_devices_env,
|
|
1498
|
-
self._visible_devices_cdis,
|
|
1499
|
-
self._visible_devices_values,
|
|
1500
|
-
)
|
|
1449
|
+
return self._materials
|
|
1501
1450
|
|
|
1502
|
-
def
|
|
1451
|
+
def get_manufacturer(
|
|
1503
1452
|
self,
|
|
1504
|
-
runtime_env:
|
|
1505
|
-
|
|
1506
|
-
) -> tuple[str, str]:
|
|
1453
|
+
runtime_env: str,
|
|
1454
|
+
) -> ManufacturerEnum:
|
|
1507
1455
|
"""
|
|
1508
|
-
|
|
1456
|
+
Return the manufacturer for the given runtime visible devices env name.
|
|
1509
1457
|
|
|
1510
1458
|
Args:
|
|
1511
1459
|
runtime_env:
|
|
1512
|
-
The
|
|
1513
|
-
resource_value:
|
|
1514
|
-
The resource value, which can be "all" or a comma-separated list of device indexes
|
|
1460
|
+
The runtime visible devices environment variable name.
|
|
1515
1461
|
|
|
1516
1462
|
Returns:
|
|
1517
|
-
|
|
1518
|
-
- A comma-separated string of CPU affinities.
|
|
1519
|
-
- A comma-separated string of NUMA affinities.
|
|
1463
|
+
The manufacturer enum.
|
|
1520
1464
|
|
|
1521
1465
|
"""
|
|
1522
|
-
|
|
1523
|
-
if resource_value != "all":
|
|
1524
|
-
dev_indexes = [int(v.strip()) for v in resource_value.split(",")]
|
|
1466
|
+
m = self._get_materials()
|
|
1525
1467
|
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
for re_ in runtime_env:
|
|
1529
|
-
topo = self._visible_devices_topologies.get(re_)
|
|
1530
|
-
if topo:
|
|
1531
|
-
cs, ns = topo.get_affinities(dev_indexes, deduplicate=False)
|
|
1532
|
-
cpus_set.extend(cs)
|
|
1533
|
-
numas_set.extend(ns)
|
|
1468
|
+
if runtime_env not in m:
|
|
1469
|
+
return ManufacturerEnum.UNKNOWN
|
|
1534
1470
|
|
|
1535
|
-
return
|
|
1471
|
+
return m[runtime_env].manufacturer
|
|
1536
1472
|
|
|
1537
|
-
def
|
|
1473
|
+
def get_runtime_envs(
|
|
1538
1474
|
self,
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
|
|
1475
|
+
) -> list[str]:
|
|
1476
|
+
"""
|
|
1477
|
+
Return the supported runtime visible devices env names.
|
|
1478
|
+
|
|
1479
|
+
Returns:
|
|
1480
|
+
A list of supported runtime visible devices environment variable names.
|
|
1481
|
+
|
|
1482
|
+
"""
|
|
1483
|
+
m = self._get_materials()
|
|
1484
|
+
return list(m.keys())
|
|
1485
|
+
|
|
1486
|
+
def get_runtime_visible_devices(
|
|
1487
|
+
self,
|
|
1488
|
+
runtime_env: str,
|
|
1489
|
+
fmt: str = "plain",
|
|
1490
|
+
) -> list[str]:
|
|
1542
1491
|
"""
|
|
1543
|
-
Return the
|
|
1544
|
-
For example, if the backend visible devices env is "ASCEND_RT_VISIBLE_DEVICES",
|
|
1545
|
-
and the `resource_key_values` is "4,6", and the detected devices are with indexes
|
|
1546
|
-
[4,5,6,7], then the aligned result will be "0,2".
|
|
1492
|
+
Return the runtime visible devices values for the given runtime visible devices env name.
|
|
1547
1493
|
|
|
1548
1494
|
Args:
|
|
1549
|
-
|
|
1550
|
-
The
|
|
1551
|
-
|
|
1552
|
-
The
|
|
1495
|
+
runtime_env:
|
|
1496
|
+
The runtime visible devices environment variable name.
|
|
1497
|
+
fmt:
|
|
1498
|
+
The format of the returned values,
|
|
1499
|
+
can be "cdi", "kdp", or "plain".
|
|
1553
1500
|
|
|
1554
1501
|
Returns:
|
|
1555
|
-
|
|
1556
|
-
If no alignment is needed, return the original `resource_key_values`.
|
|
1502
|
+
A list of runtime visible devices values.
|
|
1557
1503
|
|
|
1558
1504
|
"""
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1571
|
-
|
|
1505
|
+
m = self._get_materials()
|
|
1506
|
+
|
|
1507
|
+
if runtime_env not in m:
|
|
1508
|
+
return []
|
|
1509
|
+
|
|
1510
|
+
rm = m[runtime_env]
|
|
1511
|
+
match fmt:
|
|
1512
|
+
case "cdi":
|
|
1513
|
+
return [f"{rm.cdi}={v}" for v in rm.runtime_values.values()]
|
|
1514
|
+
case "kdp":
|
|
1515
|
+
return [
|
|
1516
|
+
cdi_kind_to_kdp_resource(rm.cdi, v)
|
|
1517
|
+
for v in rm.runtime_values.values()
|
|
1518
|
+
]
|
|
1519
|
+
return list(rm.runtime_values.values())
|
|
1520
|
+
|
|
1521
|
+
def map_runtime_visible_devices(
|
|
1522
|
+
self,
|
|
1523
|
+
runtime_env: str,
|
|
1524
|
+
resource_values: list[str],
|
|
1525
|
+
fmt: str = "plain",
|
|
1526
|
+
) -> list[str]:
|
|
1527
|
+
"""
|
|
1528
|
+
Map the given resource values to runtime visible devices values
|
|
1529
|
+
for the given runtime visible devices env name.
|
|
1530
|
+
|
|
1531
|
+
Args:
|
|
1532
|
+
runtime_env:
|
|
1533
|
+
The runtime visible devices environment variable name.
|
|
1534
|
+
resource_values:
|
|
1535
|
+
The resource values to map.
|
|
1536
|
+
fmt:
|
|
1537
|
+
The format of the returned values,
|
|
1538
|
+
can be "cdi", "kdp", or "plain".
|
|
1539
|
+
|
|
1540
|
+
Returns:
|
|
1541
|
+
A list of mapped runtime visible devices values.
|
|
1542
|
+
|
|
1543
|
+
"""
|
|
1544
|
+
m = self._get_materials()
|
|
1545
|
+
|
|
1546
|
+
if runtime_env not in m:
|
|
1547
|
+
return []
|
|
1548
|
+
|
|
1549
|
+
rm = m[runtime_env]
|
|
1550
|
+
match fmt:
|
|
1551
|
+
case "cdi":
|
|
1552
|
+
return [
|
|
1553
|
+
f"{rm.cdi}={rm.runtime_values.get(v, v)}" for v in resource_values
|
|
1554
|
+
]
|
|
1555
|
+
case "kdp":
|
|
1556
|
+
return [
|
|
1557
|
+
cdi_kind_to_kdp_resource(rm.cdi, rm.runtime_values.get(v, v))
|
|
1558
|
+
for v in resource_values
|
|
1559
|
+
]
|
|
1560
|
+
return [rm.runtime_values.get(v, v) for v in resource_values]
|
|
1561
|
+
|
|
1562
|
+
def map_backend_visible_devices(
|
|
1563
|
+
self,
|
|
1564
|
+
runtime_envs: list[str],
|
|
1565
|
+
resource_values: list[str],
|
|
1566
|
+
) -> dict[str, str]:
|
|
1567
|
+
"""
|
|
1568
|
+
Map the given resource values to backend visible devices values
|
|
1569
|
+
for the given runtime visible devices env names.
|
|
1570
|
+
|
|
1571
|
+
Args:
|
|
1572
|
+
runtime_envs:
|
|
1573
|
+
The runtime visible devices environment variable names.
|
|
1574
|
+
resource_values:
|
|
1575
|
+
The resource values to map.
|
|
1576
|
+
|
|
1577
|
+
Returns:
|
|
1578
|
+
A dictionary mapping backend visible devices environment variable names
|
|
1579
|
+
to corresponding mapped backend visible devices values.
|
|
1580
|
+
|
|
1581
|
+
"""
|
|
1582
|
+
m = self._get_materials()
|
|
1583
|
+
|
|
1584
|
+
ret = {}
|
|
1585
|
+
for runtime_env in runtime_envs:
|
|
1586
|
+
if runtime_env not in m:
|
|
1587
|
+
continue
|
|
1588
|
+
rm = m[runtime_env]
|
|
1589
|
+
for ben in rm.backend_env:
|
|
1590
|
+
ret[ben] = ",".join(
|
|
1591
|
+
[rm.backend_values[ben].get(v, v) for v in resource_values],
|
|
1592
|
+
)
|
|
1593
|
+
return ret
|
|
1594
|
+
|
|
1595
|
+
def map_visible_devices_affinities(
|
|
1596
|
+
self,
|
|
1597
|
+
runtime_envs: list[str],
|
|
1598
|
+
resource_values: list[str],
|
|
1599
|
+
) -> dict[str, str]:
|
|
1600
|
+
"""
|
|
1601
|
+
Map the given resource values to visible devices affinities
|
|
1602
|
+
for the given runtime visible devices env names.
|
|
1603
|
+
|
|
1604
|
+
Args:
|
|
1605
|
+
runtime_envs:
|
|
1606
|
+
The runtime visible devices environment variable names.
|
|
1607
|
+
resource_values:
|
|
1608
|
+
The resource values to map.
|
|
1609
|
+
|
|
1610
|
+
Returns:
|
|
1611
|
+
A dictionary mapping "cpuset_cpus" and/or "cpuset_mems"
|
|
1612
|
+
to corresponding mapped affinities strings.
|
|
1613
|
+
|
|
1614
|
+
"""
|
|
1615
|
+
valued_affinity = (
|
|
1616
|
+
envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
|
|
1617
|
+
or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
|
|
1572
1618
|
)
|
|
1619
|
+
if not valued_affinity:
|
|
1620
|
+
return {}
|
|
1621
|
+
|
|
1622
|
+
m = self._get_materials()
|
|
1623
|
+
|
|
1624
|
+
ret = {}
|
|
1625
|
+
for runtime_env in runtime_envs:
|
|
1626
|
+
if runtime_env not in m:
|
|
1627
|
+
continue
|
|
1628
|
+
rm = m[runtime_env]
|
|
1629
|
+
cpus_set = set[str]()
|
|
1630
|
+
numas_set = set[str]()
|
|
1631
|
+
for v in resource_values:
|
|
1632
|
+
if v in rm.cpus_affinities:
|
|
1633
|
+
cpus_set.add(rm.cpus_affinities[v])
|
|
1634
|
+
if not envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
|
|
1635
|
+
continue
|
|
1636
|
+
if v in rm.numa_affinities:
|
|
1637
|
+
numas_set.add(rm.numa_affinities[v])
|
|
1638
|
+
if cpus := ",".join(sorted(cpus_set)):
|
|
1639
|
+
ret["cpuset_cpus"] = cpus
|
|
1640
|
+
if numas := ",".join(sorted(numas_set)):
|
|
1641
|
+
ret["cpuset_mems"] = numas
|
|
1642
|
+
return ret
|
|
1573
1643
|
|
|
1574
1644
|
@property
|
|
1575
1645
|
def name(self) -> str:
|
|
@@ -1582,6 +1652,17 @@ class Deployer(ABC):
|
|
|
1582
1652
|
"""
|
|
1583
1653
|
return self._name
|
|
1584
1654
|
|
|
1655
|
+
@property
|
|
1656
|
+
def allowed_uuid_values(self) -> bool:
|
|
1657
|
+
"""
|
|
1658
|
+
Return whether the deployer allows using UUIDs as visible devices values.
|
|
1659
|
+
|
|
1660
|
+
Returns:
|
|
1661
|
+
True if allowed, False otherwise.
|
|
1662
|
+
|
|
1663
|
+
"""
|
|
1664
|
+
return True
|
|
1665
|
+
|
|
1585
1666
|
def close(self):
|
|
1586
1667
|
if self._pool:
|
|
1587
1668
|
self._pool.shutdown(cancel_futures=True)
|