xpk 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +33 -12
- xpk/commands/cluster_gcluster_test.py +5 -1
- xpk/commands/cluster_test.py +125 -0
- xpk/commands/config.py +3 -3
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +2 -0
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/workload.py +125 -139
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +3 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +2 -0
- xpk/core/cluster.py +18 -47
- xpk/core/cluster_test.py +76 -1
- xpk/core/config.py +81 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/kjob.py +17 -16
- xpk/core/kueue_manager.py +13 -19
- xpk/core/kueue_manager_test.py +27 -1
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +17 -15
- xpk/core/nodepool_test.py +25 -4
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +128 -132
- xpk/core/scheduling_test.py +215 -2
- xpk/core/system_characteristics.py +179 -0
- xpk/core/system_characteristics_test.py +49 -1
- xpk/core/telemetry.py +4 -4
- xpk/core/telemetry_test.py +9 -9
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +2 -0
- xpk/parser/cluster.py +22 -88
- xpk/parser/cluster_test.py +41 -0
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -41
- xpk/parser/workload_test.py +2 -48
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/utils/feature_flags.py +3 -0
- xpk/utils/validation.py +2 -2
- xpk-0.16.1.dist-info/METADATA +127 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/RECORD +67 -48
- xpk-0.15.0.dist-info/METADATA +0 -1666
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/WHEEL +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import pytest
|
|
18
|
+
from unittest.mock import MagicMock
|
|
19
|
+
from xpk.core.testing.commands_tester import CommandsTester
|
|
20
|
+
from .pathways import get_pathways_machine_types
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.fixture(autouse=True)
|
|
24
|
+
def commands_tester(mocker: MagicMock):
|
|
25
|
+
return CommandsTester(
|
|
26
|
+
mocker,
|
|
27
|
+
run_command_with_updates_path=(
|
|
28
|
+
"xpk.core.pathways.run_command_with_updates"
|
|
29
|
+
),
|
|
30
|
+
run_command_for_value_path="xpk.core.pathways.run_command_for_value",
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_get_pathways_machine_types_when_command_fails_returns_failed_exit_code(
|
|
35
|
+
commands_tester: CommandsTester,
|
|
36
|
+
):
|
|
37
|
+
commands_tester.set_result_for_command(
|
|
38
|
+
(1, ""), "gcloud compute machine-types list"
|
|
39
|
+
)
|
|
40
|
+
return_code, machine_types = get_pathways_machine_types(
|
|
41
|
+
project="gke-project", zone="us-central1-a"
|
|
42
|
+
)
|
|
43
|
+
assert return_code == 1
|
|
44
|
+
assert machine_types == []
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_get_pathways_machine_types_when_command_suceeds_returns_machine_types(
|
|
48
|
+
commands_tester: CommandsTester,
|
|
49
|
+
):
|
|
50
|
+
commands_tester.set_result_for_command(
|
|
51
|
+
(0, "abc\ncba"), "gcloud compute machine-types list"
|
|
52
|
+
)
|
|
53
|
+
return_code, machine_types = get_pathways_machine_types(
|
|
54
|
+
project="gke-project", zone="us-central1-a"
|
|
55
|
+
)
|
|
56
|
+
assert return_code == 0
|
|
57
|
+
assert machine_types == ["abc", "cba"]
|
xpk/core/resources.py
CHANGED
|
@@ -15,6 +15,7 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from dataclasses import dataclass
|
|
18
|
+
import os
|
|
18
19
|
|
|
19
20
|
from ..utils.console import xpk_print
|
|
20
21
|
from ..utils.file import write_tmp_file
|
|
@@ -30,9 +31,13 @@ from .capacity import (
|
|
|
30
31
|
from .commands import run_command_for_value, run_commands
|
|
31
32
|
from .config import XPK_CURRENT_VERSION
|
|
32
33
|
from .system_characteristics import AcceleratorType, get_system_characteristics_by_device_type, SystemCharacteristics
|
|
34
|
+
from enum import Enum
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ConfigMapType(Enum):
|
|
38
|
+
RESOURCES = 'resources-configmap'
|
|
39
|
+
METADATA = 'metadata-configmap'
|
|
33
40
|
|
|
34
|
-
CLUSTER_RESOURCES_CONFIGMAP = 'resources-configmap'
|
|
35
|
-
CLUSTER_METADATA_CONFIGMAP = 'metadata-configmap'
|
|
36
41
|
|
|
37
42
|
CLUSTER_CONFIGMAP_YAML = """kind: ConfigMap
|
|
38
43
|
apiVersion: v1
|
|
@@ -50,7 +55,15 @@ class AutoprovisioningConfig:
|
|
|
50
55
|
maximum_chips: int
|
|
51
56
|
|
|
52
57
|
|
|
53
|
-
def
|
|
58
|
+
def get_config_map_name(
|
|
59
|
+
cluster_name: str, config_map_type: ConfigMapType
|
|
60
|
+
) -> str:
|
|
61
|
+
return f'{cluster_name}-{config_map_type.value}'
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def get_cluster_configmap(
|
|
65
|
+
cluster_name: str, config_map_type: ConfigMapType
|
|
66
|
+
) -> dict[str, str] | None:
|
|
54
67
|
"""Run the Get GKE Cluster ConfigMap request.
|
|
55
68
|
|
|
56
69
|
Args:
|
|
@@ -59,15 +72,17 @@ def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
|
|
|
59
72
|
Returns:
|
|
60
73
|
key:value pairs stored in cluster ConfigMap.
|
|
61
74
|
"""
|
|
75
|
+
config_map_name = get_config_map_name(cluster_name, config_map_type)
|
|
62
76
|
command = (
|
|
63
77
|
'kubectl get configmap'
|
|
64
|
-
f' {
|
|
78
|
+
f' {config_map_name} -o=custom-columns="ConfigData:data"'
|
|
79
|
+
' --no-headers=true'
|
|
65
80
|
)
|
|
66
81
|
|
|
67
82
|
return_code, return_value = run_command_for_value(
|
|
68
83
|
command,
|
|
69
84
|
'GKE Cluster Get ConfigMap',
|
|
70
|
-
dry_run_return_val=
|
|
85
|
+
dry_run_return_val=_get_dry_run_config_map_value(config_map_type),
|
|
71
86
|
)
|
|
72
87
|
if return_code != 0:
|
|
73
88
|
xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
|
|
@@ -89,9 +104,18 @@ def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
|
|
|
89
104
|
return config_map
|
|
90
105
|
|
|
91
106
|
|
|
107
|
+
def _get_dry_run_config_map_value(config_map_type: ConfigMapType) -> str:
|
|
108
|
+
default_value = 'map[]'
|
|
109
|
+
|
|
110
|
+
if config_map_type == ConfigMapType.RESOURCES:
|
|
111
|
+
return os.getenv('DRY_RUN_RESOURCES_CONFIG_MAP', default_value)
|
|
112
|
+
|
|
113
|
+
return default_value
|
|
114
|
+
|
|
115
|
+
|
|
92
116
|
def create_cluster_configmaps(
|
|
93
117
|
args,
|
|
94
|
-
system,
|
|
118
|
+
system: SystemCharacteristics,
|
|
95
119
|
tensorboard_config: dict,
|
|
96
120
|
autoprovisioning_config: AutoprovisioningConfig | None,
|
|
97
121
|
) -> int:
|
|
@@ -127,9 +151,11 @@ def create_cluster_configmaps(
|
|
|
127
151
|
resources_data = (
|
|
128
152
|
f'{device_type}: "{int(args.num_slices) * system.vms_per_slice}"'
|
|
129
153
|
)
|
|
130
|
-
resources_configmap_name =
|
|
154
|
+
resources_configmap_name = get_config_map_name(
|
|
155
|
+
args.cluster, ConfigMapType.RESOURCES
|
|
156
|
+
)
|
|
131
157
|
resources_yml = CLUSTER_CONFIGMAP_YAML.format(
|
|
132
|
-
|
|
158
|
+
name=resources_configmap_name, data=resources_data
|
|
133
159
|
)
|
|
134
160
|
configmap_yml[resources_configmap_name] = resources_yml
|
|
135
161
|
|
|
@@ -148,15 +174,17 @@ def create_cluster_configmaps(
|
|
|
148
174
|
# Reservation ID if applicable.
|
|
149
175
|
if capacity_type == CapacityType.RESERVATION:
|
|
150
176
|
metadata += f'\n {RESERVATION_CONFIG_KEY}: {args.reservation}'
|
|
151
|
-
metadata_configmap_name =
|
|
177
|
+
metadata_configmap_name = get_config_map_name(
|
|
178
|
+
args.cluster, ConfigMapType.METADATA
|
|
179
|
+
)
|
|
152
180
|
metadata_yml = CLUSTER_CONFIGMAP_YAML.format(
|
|
153
|
-
|
|
181
|
+
name=metadata_configmap_name, data=metadata
|
|
154
182
|
)
|
|
155
183
|
configmap_yml[metadata_configmap_name] = metadata_yml
|
|
156
|
-
return
|
|
184
|
+
return _create_or_update_cluster_configmap(configmap_yml)
|
|
157
185
|
|
|
158
186
|
|
|
159
|
-
def
|
|
187
|
+
def _create_or_update_cluster_configmap(configmap_yml: dict[str, str]) -> int:
|
|
160
188
|
"""
|
|
161
189
|
Args:
|
|
162
190
|
configmap_yml: dict containing ConfigMap name and yml string.
|
|
@@ -187,7 +215,18 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
|
|
|
187
215
|
return 0
|
|
188
216
|
|
|
189
217
|
|
|
190
|
-
def
|
|
218
|
+
def update_cluster_configmap(
|
|
219
|
+
cluster_name: str, config_map_type: ConfigMapType, data: str
|
|
220
|
+
) -> int:
|
|
221
|
+
config_map_name = get_config_map_name(cluster_name, config_map_type)
|
|
222
|
+
yaml = CLUSTER_CONFIGMAP_YAML.format(name=config_map_name, data=data)
|
|
223
|
+
config_map_dict = {config_map_name: yaml}
|
|
224
|
+
return _create_or_update_cluster_configmap(config_map_dict)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def check_cluster_resources(
|
|
228
|
+
args, system: SystemCharacteristics
|
|
229
|
+
) -> tuple[bool, bool]:
|
|
191
230
|
"""Check if cluster has resources of a specified device_type/gke_accelerator.
|
|
192
231
|
This check will be skipped if <args.cluster>-<_CLUSTER_RESOURCES_CONFIGMAP> ConfigMap doesn't exist for the cluster.
|
|
193
232
|
|
|
@@ -200,8 +239,9 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
|
|
|
200
239
|
True if resources in the cluster should be checked, False otherwise.
|
|
201
240
|
True if device_type/gke_accelerator exists in the cluster, False otherwise.
|
|
202
241
|
"""
|
|
203
|
-
|
|
204
|
-
|
|
242
|
+
resources_config_map = get_cluster_configmap(
|
|
243
|
+
args.cluster, ConfigMapType.RESOURCES
|
|
244
|
+
)
|
|
205
245
|
if resources_config_map is None:
|
|
206
246
|
xpk_print(
|
|
207
247
|
f'No ConfigMap exist for cluster with the name {resources_config_map}.'
|
|
@@ -216,20 +256,35 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
|
|
|
216
256
|
|
|
217
257
|
|
|
218
258
|
def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
|
|
219
|
-
"""Get
|
|
259
|
+
"""Get SystemCharcteristics based on the cluster resources configMap.
|
|
260
|
+
|
|
220
261
|
Args:
|
|
221
262
|
args: user provided arguments for running the command.
|
|
222
263
|
|
|
223
264
|
Returns:
|
|
224
|
-
returns system characteristics
|
|
265
|
+
returns system characteristics, or None if not found.
|
|
266
|
+
"""
|
|
267
|
+
resources_config_map = get_cluster_configmap(
|
|
268
|
+
args.cluster, ConfigMapType.RESOURCES
|
|
269
|
+
)
|
|
270
|
+
return get_cluster_system_characteristics_from_config_map(
|
|
271
|
+
resources_config_map
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def get_cluster_system_characteristics_from_config_map(
|
|
276
|
+
resources_config_map: dict[str, str] | None,
|
|
277
|
+
) -> SystemCharacteristics | None:
|
|
278
|
+
"""Get SystemCharcteristics based on the cluster resources configMap.
|
|
279
|
+
|
|
280
|
+
Returns:
|
|
281
|
+
returns system characteristics, or None if not found.
|
|
225
282
|
"""
|
|
226
|
-
resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
|
|
227
|
-
cluster_config_map = get_cluster_configmap(resources_configmap_name)
|
|
228
283
|
|
|
229
|
-
if
|
|
284
|
+
if resources_config_map is None:
|
|
230
285
|
return None
|
|
231
286
|
|
|
232
|
-
for key in
|
|
287
|
+
for key in resources_config_map:
|
|
233
288
|
system, result_code = get_system_characteristics_by_device_type(key)
|
|
234
289
|
if result_code == 0:
|
|
235
290
|
return system
|
|
@@ -238,20 +293,22 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
|
|
|
238
293
|
|
|
239
294
|
|
|
240
295
|
def get_cluster_capacity_type(args) -> CapacityType | None:
|
|
241
|
-
"""Get
|
|
296
|
+
"""Get CapacityType based on the cluster metadata configMap.
|
|
297
|
+
|
|
242
298
|
Args:
|
|
243
299
|
args: user provided arguments for running the command.
|
|
244
300
|
|
|
245
301
|
Returns:
|
|
246
|
-
returns
|
|
302
|
+
returns CapacityType, or None if not found.
|
|
247
303
|
"""
|
|
248
|
-
metadata_configmap_name =
|
|
249
|
-
|
|
304
|
+
metadata_configmap_name = get_cluster_configmap(
|
|
305
|
+
args.cluster, ConfigMapType.METADATA
|
|
306
|
+
)
|
|
250
307
|
|
|
251
|
-
if
|
|
308
|
+
if metadata_configmap_name is None:
|
|
252
309
|
return None
|
|
253
310
|
|
|
254
|
-
capacityValue =
|
|
311
|
+
capacityValue = metadata_configmap_name.get('capacity_type')
|
|
255
312
|
if capacityValue is not None:
|
|
256
313
|
return CapacityType[capacityValue.upper()]
|
|
257
314
|
|
xpk/core/scheduling.py
CHANGED
|
@@ -14,61 +14,63 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from enum import Enum
|
|
18
|
+
|
|
19
|
+
from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled
|
|
20
|
+
from ..utils.feature_flags import FeatureFlags
|
|
17
21
|
from ..utils.topology import get_slice_topology_level
|
|
18
22
|
from ..utils.console import xpk_print
|
|
19
23
|
from ..utils.topology import is_topology_valid
|
|
20
24
|
from ..utils.execution_context import is_dry_run
|
|
21
25
|
from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
|
|
22
|
-
from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
|
|
23
26
|
from .system_characteristics import (
|
|
27
|
+
SUB_SLICING_TOPOLOGIES,
|
|
24
28
|
AcceleratorType,
|
|
25
|
-
AcceleratorTypeToAcceleratorCharacteristics,
|
|
26
29
|
SystemCharacteristics,
|
|
30
|
+
create_accelerator_label,
|
|
31
|
+
create_machine_label,
|
|
27
32
|
)
|
|
33
|
+
from packaging.version import Version
|
|
28
34
|
|
|
35
|
+
_SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
|
|
29
36
|
|
|
30
|
-
def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
|
|
31
|
-
"""Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster).
|
|
32
37
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
38
|
+
class WorkloadScheduling(Enum):
|
|
39
|
+
UNAVAILABLE = 0
|
|
40
|
+
AVAILABLE = 1
|
|
41
|
+
SUB_SLICING_AVAILABLE = 2
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def check_if_workload_can_schedule(
|
|
45
|
+
args,
|
|
46
|
+
workload_system: SystemCharacteristics,
|
|
47
|
+
cluster_system: SystemCharacteristics | None,
|
|
48
|
+
resources_config_map: dict[str, str] | None,
|
|
49
|
+
) -> WorkloadScheduling:
|
|
50
|
+
"""Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster).
|
|
36
51
|
|
|
37
52
|
Returns:
|
|
38
|
-
returns
|
|
53
|
+
returns WorkloadScheduling describing scheduling option.
|
|
39
54
|
"""
|
|
40
|
-
|
|
41
|
-
|
|
55
|
+
if is_dry_run() and not cluster_system:
|
|
56
|
+
xpk_print('Skipping workload scheduling validation in dry run.')
|
|
57
|
+
return WorkloadScheduling.AVAILABLE
|
|
42
58
|
|
|
43
|
-
|
|
44
|
-
if cluster_config_map is None:
|
|
59
|
+
if resources_config_map is None:
|
|
45
60
|
xpk_print(
|
|
46
|
-
|
|
47
|
-
|
|
61
|
+
"Skipping workload scheduling validation, because there's no Resources"
|
|
62
|
+
' ConfigMap in the cluster.'
|
|
48
63
|
)
|
|
49
|
-
return
|
|
50
|
-
|
|
51
|
-
if is_dry_run():
|
|
52
|
-
return True
|
|
64
|
+
return WorkloadScheduling.AVAILABLE
|
|
53
65
|
|
|
54
|
-
|
|
55
|
-
missing_gke_accelerator_type = False
|
|
56
|
-
if not cluster_config_map.get(system.gke_accelerator):
|
|
57
|
-
xpk_print(
|
|
58
|
-
f'GKE Accelerator Type Check: {args.workload} is requesting'
|
|
59
|
-
f' {system.gke_accelerator} but cluster only contains'
|
|
60
|
-
f' {cluster_config_map.keys()}. '
|
|
61
|
-
)
|
|
62
|
-
missing_gke_accelerator_type = True
|
|
63
|
-
elif (
|
|
64
|
-
cluster_config_map[system.gke_accelerator]
|
|
65
|
-
== AUTOPROVISIONING_CONFIG_VALUE
|
|
66
|
-
):
|
|
66
|
+
if _is_cluster_set_up_for_nap(workload_system, resources_config_map):
|
|
67
67
|
# Run total chip check when in autoprovisioning mode.
|
|
68
68
|
max_chips_in_cluster = int(
|
|
69
|
-
|
|
69
|
+
resources_config_map[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY]
|
|
70
|
+
)
|
|
71
|
+
num_chips_in_workload = get_total_chips_requested_from_args(
|
|
72
|
+
args, workload_system
|
|
70
73
|
)
|
|
71
|
-
num_chips_in_workload = get_total_chips_requested_from_args(args, system)
|
|
72
74
|
|
|
73
75
|
if num_chips_in_workload > max_chips_in_cluster:
|
|
74
76
|
xpk_print(
|
|
@@ -77,44 +79,100 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
|
|
|
77
79
|
' Resize the cluster to support more chips with'
|
|
78
80
|
' `xpk cluster create --autoprovisioning-max-chips=X ...`'
|
|
79
81
|
)
|
|
80
|
-
return
|
|
81
|
-
return
|
|
82
|
+
return WorkloadScheduling.UNAVAILABLE
|
|
83
|
+
return WorkloadScheduling.AVAILABLE
|
|
84
|
+
|
|
85
|
+
if workload_system.device_type in resources_config_map:
|
|
86
|
+
if _check_workload_size_fits(
|
|
87
|
+
args,
|
|
88
|
+
workload_system,
|
|
89
|
+
max_vm_in_cluster=int(
|
|
90
|
+
resources_config_map[workload_system.device_type]
|
|
91
|
+
),
|
|
92
|
+
):
|
|
93
|
+
return WorkloadScheduling.AVAILABLE
|
|
94
|
+
else:
|
|
95
|
+
return WorkloadScheduling.UNAVAILABLE
|
|
96
|
+
|
|
97
|
+
if _check_sub_slicing_availability(
|
|
98
|
+
workload_system=workload_system, cluster_system=cluster_system
|
|
99
|
+
):
|
|
100
|
+
assert cluster_system
|
|
101
|
+
if _check_workload_size_fits(
|
|
102
|
+
args,
|
|
103
|
+
workload_system,
|
|
104
|
+
max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
|
|
105
|
+
):
|
|
106
|
+
return WorkloadScheduling.SUB_SLICING_AVAILABLE
|
|
107
|
+
else:
|
|
108
|
+
return WorkloadScheduling.UNAVAILABLE
|
|
109
|
+
|
|
110
|
+
xpk_print(
|
|
111
|
+
'Workload scheduling validation failed. XPK will not create the workload'
|
|
112
|
+
f' {args.workload}.'
|
|
113
|
+
)
|
|
114
|
+
return WorkloadScheduling.UNAVAILABLE
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _is_cluster_set_up_for_nap(
|
|
118
|
+
workload_system: SystemCharacteristics, resources_config_map: dict[str, str]
|
|
119
|
+
) -> bool:
|
|
120
|
+
return (
|
|
121
|
+
resources_config_map.get(workload_system.gke_accelerator, None)
|
|
122
|
+
== AUTOPROVISIONING_CONFIG_VALUE
|
|
123
|
+
)
|
|
82
124
|
|
|
83
|
-
# Check for device type
|
|
84
|
-
missing_device_type = False
|
|
85
|
-
device_type = system.device_type
|
|
86
|
-
if device_type not in cluster_config_map:
|
|
87
|
-
xpk_print(
|
|
88
|
-
f'Device Type Check: {args.workload} is requesting {device_type} but '
|
|
89
|
-
f'cluster only contains {cluster_config_map.keys()}. '
|
|
90
|
-
)
|
|
91
|
-
missing_device_type = True
|
|
92
125
|
|
|
93
|
-
|
|
126
|
+
def _check_workload_size_fits(
|
|
127
|
+
args,
|
|
128
|
+
workload_system: SystemCharacteristics,
|
|
129
|
+
max_vm_in_cluster: int,
|
|
130
|
+
) -> bool:
|
|
131
|
+
if workload_system.accelerator_type == AcceleratorType.GPU:
|
|
132
|
+
vm_required_by_workload = args.num_nodes
|
|
133
|
+
else:
|
|
134
|
+
vm_required_by_workload = args.num_slices * workload_system.vms_per_slice
|
|
135
|
+
|
|
136
|
+
if vm_required_by_workload > max_vm_in_cluster:
|
|
94
137
|
xpk_print(
|
|
95
|
-
'
|
|
96
|
-
f'
|
|
138
|
+
f'{args.workload} is requesting {args.num_slices} slice/slices of'
|
|
139
|
+
f' {workload_system.device_type}, which is'
|
|
140
|
+
f' {vm_required_by_workload} VMs, but the cluster only contains'
|
|
141
|
+
f' {max_vm_in_cluster} VMs of {workload_system.device_type}. XPK will'
|
|
142
|
+
' not create this workload.'
|
|
97
143
|
)
|
|
98
144
|
return False
|
|
99
|
-
else:
|
|
100
|
-
# Check if the size of the workload will fit in the cluster.
|
|
101
|
-
max_vm_in_cluster = int(cluster_config_map[device_type])
|
|
102
|
-
if system.accelerator_type == AcceleratorType.GPU:
|
|
103
|
-
vm_required_by_workload = args.num_nodes
|
|
104
|
-
else:
|
|
105
|
-
vm_required_by_workload = args.num_slices * system.vms_per_slice
|
|
106
|
-
if vm_required_by_workload > max_vm_in_cluster:
|
|
107
|
-
xpk_print(
|
|
108
|
-
f'{args.workload} is requesting {args.num_slices} slice/slices of'
|
|
109
|
-
f' {device_type}, which is {vm_required_by_workload} VMs, but the'
|
|
110
|
-
f' cluster only contains {max_vm_in_cluster} VMs of {device_type}.'
|
|
111
|
-
' XPK will not create this workload.'
|
|
112
|
-
)
|
|
113
|
-
return False
|
|
114
|
-
|
|
115
145
|
return True
|
|
116
146
|
|
|
117
147
|
|
|
148
|
+
def _check_sub_slicing_availability(
|
|
149
|
+
workload_system: SystemCharacteristics,
|
|
150
|
+
cluster_system: SystemCharacteristics | None,
|
|
151
|
+
) -> bool:
|
|
152
|
+
if (
|
|
153
|
+
(not FeatureFlags.SUB_SLICING_ENABLED)
|
|
154
|
+
or (not cluster_system)
|
|
155
|
+
or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
|
|
156
|
+
or (not cluster_system.supports_sub_slicing)
|
|
157
|
+
or (workload_system.topology not in SUB_SLICING_TOPOLOGIES)
|
|
158
|
+
):
|
|
159
|
+
return False
|
|
160
|
+
|
|
161
|
+
return_code, sub_slicing_enabled = has_sub_slicing_enabled()
|
|
162
|
+
if return_code != 0 or not sub_slicing_enabled:
|
|
163
|
+
return False
|
|
164
|
+
|
|
165
|
+
return_code, current_version = get_installed_kueue_version(
|
|
166
|
+
dry_run_version=Version('0.13')
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
return (
|
|
170
|
+
return_code == 0
|
|
171
|
+
and current_version is not None
|
|
172
|
+
and current_version >= _SUB_SLICING_MINIMUM_KUEUE_VERSION
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
|
|
118
176
|
def get_total_chips_requested_from_args(
|
|
119
177
|
args, system: SystemCharacteristics
|
|
120
178
|
) -> int:
|
|
@@ -135,7 +193,7 @@ def get_total_chips_requested_from_args(
|
|
|
135
193
|
return int(num_chips)
|
|
136
194
|
|
|
137
195
|
|
|
138
|
-
def get_cpu_affinity(accelerator_type) -> str:
|
|
196
|
+
def get_cpu_affinity(accelerator_type: AcceleratorType) -> str:
|
|
139
197
|
"""Generate affinity rules for CPU nodepools, so that workload pods are
|
|
140
198
|
not scheduled on the default pool machines.
|
|
141
199
|
Args:
|
|
@@ -199,10 +257,8 @@ def get_gpu_scheduler(
|
|
|
199
257
|
"""
|
|
200
258
|
gpu_scheduler = gpu_scheduler_yaml.format(
|
|
201
259
|
scheduler_name=args.scheduler,
|
|
202
|
-
accelerator_label=create_accelerator_label(
|
|
203
|
-
|
|
204
|
-
),
|
|
205
|
-
machine_label=create_machine_label(system.accelerator_type, system),
|
|
260
|
+
accelerator_label=create_accelerator_label(system),
|
|
261
|
+
machine_label=create_machine_label(system),
|
|
206
262
|
node_pool_name=f'{args.cluster}-np-0',
|
|
207
263
|
autoprovisioning_args=autoprovisioning_args,
|
|
208
264
|
)
|
|
@@ -217,74 +273,14 @@ def get_gpu_scheduler(
|
|
|
217
273
|
return gpu_scheduler, return_code
|
|
218
274
|
|
|
219
275
|
|
|
220
|
-
def
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
Args:
|
|
224
|
-
accelerator_type: type of accelerator.
|
|
225
|
-
system: system characteristics.
|
|
226
|
-
|
|
227
|
-
Returns:
|
|
228
|
-
The accelerator label.
|
|
229
|
-
"""
|
|
230
|
-
if accelerator_type == AcceleratorType.CPU:
|
|
231
|
-
return ''
|
|
232
|
-
return (
|
|
233
|
-
f'{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].accelerator_label}:'
|
|
234
|
-
f' {system.gke_accelerator}'
|
|
235
|
-
)
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
def create_tpu_machine_type(accelerator_type, system) -> str:
|
|
239
|
-
"""Generates TPU machine type..
|
|
240
|
-
|
|
241
|
-
Args:
|
|
242
|
-
accelerator_type: type of accelerator.
|
|
243
|
-
system: system characteristics.
|
|
244
|
-
|
|
245
|
-
Returns:
|
|
246
|
-
The accelerator label.
|
|
247
|
-
"""
|
|
248
|
-
if accelerator_type == AcceleratorType.TPU:
|
|
276
|
+
def create_tpu_machine_type(system: SystemCharacteristics) -> str:
|
|
277
|
+
if system.accelerator_type == AcceleratorType.TPU:
|
|
249
278
|
return f'{system.gce_machine_type}'
|
|
250
279
|
return ''
|
|
251
280
|
|
|
252
281
|
|
|
253
|
-
def
|
|
254
|
-
|
|
255
|
-
) -> str:
|
|
256
|
-
"""Generates machine label.
|
|
257
|
-
|
|
258
|
-
Args:
|
|
259
|
-
accelerator_type: type of accelerator.
|
|
260
|
-
system: system characteristics.
|
|
261
|
-
autoprovisioning_enabled: describes autoprovisioning enablement.
|
|
262
|
-
|
|
263
|
-
Returns:
|
|
264
|
-
The machine label.
|
|
265
|
-
"""
|
|
266
|
-
if accelerator_type == AcceleratorType.TPU and not autoprovisioning_enabled:
|
|
267
|
-
return (
|
|
268
|
-
f'{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].machine_label}:'
|
|
269
|
-
f' {system.topology}'
|
|
270
|
-
)
|
|
271
|
-
return ''
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
def create_tpu_topology(
|
|
275
|
-
accelerator_type, system, autoprovisioning_enabled: bool = False
|
|
276
|
-
) -> str:
|
|
277
|
-
"""Generates TPU topology.
|
|
278
|
-
|
|
279
|
-
Args:
|
|
280
|
-
accelerator_type: type of accelerator.
|
|
281
|
-
system: system characteristics.
|
|
282
|
-
autoprovisioning_enabled: describes autoprovisioning enablement.
|
|
283
|
-
|
|
284
|
-
Returns:
|
|
285
|
-
The machine label.
|
|
286
|
-
"""
|
|
287
|
-
if accelerator_type == AcceleratorType.TPU and not autoprovisioning_enabled:
|
|
282
|
+
def create_tpu_topology(system: SystemCharacteristics) -> str:
|
|
283
|
+
if system.accelerator_type == AcceleratorType.TPU:
|
|
288
284
|
return f'{system.topology}'
|
|
289
285
|
return ''
|
|
290
286
|
|