xpk 0.14.2__py3-none-any.whl → 0.14.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +57 -22
- xpk/commands/cluster_gcluster_test.py +2 -2
- xpk/commands/cluster_test.py +197 -25
- xpk/commands/inspector.py +20 -7
- xpk/commands/kind.py +1 -1
- xpk/commands/workload.py +42 -4
- xpk/commands/workload_test.py +88 -5
- xpk/core/blueprint/blueprint_definitions.py +16 -1
- xpk/core/blueprint/blueprint_generator.py +11 -11
- xpk/core/capacity.py +17 -0
- xpk/core/capacity_test.py +50 -0
- xpk/core/config.py +1 -1
- xpk/core/docker_container.py +4 -4
- xpk/core/docker_resources.py +11 -11
- xpk/core/kjob.py +3 -5
- xpk/core/kueue_manager.py +21 -10
- xpk/core/kueue_manager_test.py +379 -536
- xpk/core/nap.py +1 -1
- xpk/core/nodepool.py +9 -9
- xpk/core/nodepool_test.py +4 -4
- xpk/core/pathways.py +1 -1
- xpk/core/resources.py +1 -1
- xpk/core/scheduling.py +7 -13
- xpk/core/system_characteristics.py +42 -35
- xpk/core/system_characteristics_test.py +3 -3
- xpk/core/testing/__init__.py +15 -0
- xpk/core/testing/commands_tester.py +131 -0
- xpk/core/testing/commands_tester_test.py +129 -0
- xpk/core/updates.py +57 -0
- xpk/core/updates_test.py +80 -0
- xpk/main.py +7 -4
- xpk/parser/common.py +8 -0
- xpk/utils/execution_context.py +20 -2
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/METADATA +1 -3
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/RECORD +39 -33
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/WHEEL +0 -0
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py
CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
 from tabulate import tabulate

 from ..utils.feature_flags import FeatureFlags
-from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
+from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE, get_reservation_deployment_type
 from ..core.cluster import (
     get_all_clusters_programmatic,
     get_cluster_credentials,
@@ -60,7 +60,7 @@ from ..core.nodepool import (
 )
 from ..core.ray import install_ray_cluster
 from ..core.mtc import install_mtc_on_cluster
-from ..core.resources import create_cluster_configmaps
+from ..core.resources import AutoprovisioningConfig, create_cluster_configmaps
 from ..core.scheduling import get_total_chips_requested_from_args
 from ..core.storage import install_storage_crd
 from ..core.system_characteristics import (
@@ -110,7 +110,7 @@ def cluster_adapt(args) -> None:
   )
   add_zone_and_project(args)

-  if system.accelerator_type == AcceleratorType
+  if system.accelerator_type == AcceleratorType.GPU and not getattr(
       args, 'num_nodes'
   ):
     xpk_print(
@@ -180,10 +180,12 @@ def cluster_adapt(args) -> None:
   # if set_pathways_job_on_cluster_code != 0:
   # xpk_exit(set_pathways_job_on_cluster_code)

-
+  install_kueue_code = _install_kueue(args, system, autoprovisioning_config)
+  if install_kueue_code != 0:
+    xpk_exit(install_kueue_code)

   install_kjob(args)
-  if system.accelerator_type == AcceleratorType
+  if system.accelerator_type == AcceleratorType.GPU:
     prepare_gpus(system)

   if args.enable_ray_cluster:
@@ -204,6 +206,38 @@ def cluster_adapt(args) -> None:
 def _validate_cluster_create_args(args, system: SystemCharacteristics):
   if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
     validate_sub_slicing_system(system)
+    _validate_sub_slicing_reservation(args)
+
+
+def _validate_sub_slicing_reservation(args):
+  if args.reservation is None:
+    xpk_print(
+        'Error: Validation failed: Sub-slicing cluster creation requires'
+        ' Cluster Director reservation to be specified.'
+    )
+    xpk_exit(1)
+
+  deployment_type = get_reservation_deployment_type(
+      reservation=args.reservation, project=args.project, zone=args.zone
+  )
+  if deployment_type != 'DENSE':
+    xpk_print(
+        'Error: Validation failed: The specified reservation'
+        f' "{args.reservation}" is not a Cluster Director reservation.'
+    )
+    xpk_print(
+        'Please provide a reservation created for Cluster Director to proceed.'
+    )
+    xpk_print('To list valid Cluster Director reservations, run:')
+    xpk_print(
+        ' gcloud compute reservations list --filter="deploymentType=DENSE"'
+    )
+    xpk_print(
+        'Refer to the documentation for more information on creating Cluster'
+        ' Director reservations:'
+        ' https://cloud.google.com/cluster-director/docs/reserve-capacity'
+    )
+    xpk_exit(1)


 def cluster_create(args) -> None:
@@ -346,11 +380,13 @@ def cluster_create(args) -> None:
   if set_pathways_job_on_cluster_code != 0:
     xpk_exit(set_pathways_job_on_cluster_code)

-
+  install_kueue_code = _install_kueue(args, system, autoprovisioning_config)
+  if install_kueue_code != 0:
+    xpk_exit(install_kueue_code)

   install_kjob(args)

-  if system.accelerator_type == AcceleratorType
+  if system.accelerator_type == AcceleratorType.GPU:
     prepare_gpus(system)

   if args.enable_ray_cluster:
@@ -1106,12 +1142,6 @@ def run_gke_cluster_create_command(
   # benefit from a larger initial `--num-nodes`. After the cluster is created,
   # the auto-scaler can reduce/increase the nodes based on the load.

-  # If the user passes in the gke version then we use that directly instead of the rapid release.
-  # This allows users to directly pass a specified gke version without release channel constraints.
-  rapid_release_cmd = ''
-  if args.gke_version is not None:
-    rapid_release_cmd = ' --release-channel rapid'
-
   command = (
       'gcloud beta container clusters create'
       f' {args.cluster} --project={args.project}'
@@ -1122,25 +1152,23 @@ def run_gke_cluster_create_command(
       ' --enable-autoscaling'
       ' --total-min-nodes 1 --total-max-nodes 1000'
       f' --num-nodes {args.default_pool_cpu_num_nodes}'
-      f' {args.custom_cluster_arguments}'
-      f' {rapid_release_cmd}'
       ' --enable-dns-access'
       ' --autoscaling-profile=optimize-utilization'
       ' --labels=gke_product_type=xpk'
   )

+  if args.gke_version or system.accelerator_type == AcceleratorType.GPU:
+    command += ' --no-enable-autoupgrade'
+
   enable_ip_alias = False

   if args.private or args.authorized_networks is not None:
     enable_ip_alias = True
     command += ' --enable-master-authorized-networks --enable-private-nodes'

-  if system.accelerator_type == AcceleratorType
+  if system.accelerator_type == AcceleratorType.GPU:
     enable_ip_alias = True
-    command +=
-        ' --enable-dataplane-v2'
-        ' --enable-multi-networking --no-enable-autoupgrade'
-    )
+    command += ' --enable-dataplane-v2 --enable-multi-networking'
   else:
     command += ' --location-policy=BALANCED --scopes=storage-full,gke-default'

@@ -1180,6 +1208,9 @@ def run_gke_cluster_create_command(
     addons_str = ','.join(addons)
     command += f' --addons={addons_str}'

+  if args.custom_cluster_arguments:
+    command += f' {args.custom_cluster_arguments}'
+
   return_code = run_command_with_updates(command, 'GKE Cluster Create')
   if return_code != 0:
     xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
@@ -1240,7 +1271,11 @@ def install_kjob(args):
     xpk_exit(err_code)


-def
+def _install_kueue(
+    args,
+    system: SystemCharacteristics,
+    autoprovisioning_config: AutoprovisioningConfig | None,
+) -> int:
   xpk_print('Enabling Kueue on the cluster')
   autoprovisioning_enabled = False
   if autoprovisioning_config:
@@ -1251,7 +1286,7 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
   # Determine total chips based on user specified topology.
   total_chips = get_total_chips_requested_from_args(args, system)
   kueue_manager = KueueManager()
-  kueue_manager.install_or_upgrade(
+  return kueue_manager.install_or_upgrade(
      KueueConfig(
          system,
          total_chips=total_chips,
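The new Sub-slicing validation above calls `get_reservation_deployment_type`, which this release adds to `xpk/core/capacity.py` (+17 lines, not shown in this section). As a rough, hedged illustration of what such a lookup could do, assuming it simply reads the reservation's `deploymentType` field via gcloud; the function below is a sketch, not the actual xpk implementation:

```python
# Illustrative sketch only; the real helper lives in xpk/core/capacity.py and is
# not part of this diff. Assumes gcloud is installed and authenticated.
import subprocess


def get_reservation_deployment_type_sketch(
    reservation: str, project: str, zone: str
) -> str | None:
  """Returns the reservation's deploymentType (e.g. 'DENSE'), or None on error."""
  result = subprocess.run(
      [
          'gcloud', 'compute', 'reservations', 'describe', reservation,
          f'--project={project}', f'--zone={zone}',
          '--format=value(deploymentType)',
      ],
      capture_output=True, text=True, check=False,
  )
  if result.returncode != 0:
    return None
  return result.stdout.strip() or None
```

With a helper along these lines, `_validate_cluster_create_args` only lets `--sub-slicing` cluster creation proceed when the reservation reports `DENSE`, i.e. a Cluster Director reservation.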
xpk/commands/cluster_gcluster_test.py
CHANGED
@@ -93,7 +93,7 @@ def test_install_kueue_standard(
         gke_accelerator="nvidia-h100-mega-80gb",
         gce_machine_type="a3-megagpu-8g",
         chips_per_vm=8,
-        accelerator_type=AcceleratorType
+        accelerator_type=AcceleratorType.GPU,
         device_type="h100-mega-80gb-8",
         supports_sub_slicing=False,
     )
@@ -140,7 +140,7 @@ def test_install_kueue_with_autoprovisioning(
         gke_accelerator="nvidia-h100-mega-80gb",
         gce_machine_type="a3-megagpu-8g",
         chips_per_vm=8,
-        accelerator_type=AcceleratorType
+        accelerator_type=AcceleratorType.GPU,
         device_type="h100-mega-80gb-8",
         supports_sub_slicing=False,
     )
xpk/commands/cluster_test.py
CHANGED
@@ -16,77 +16,249 @@ limitations under the License.

 from argparse import Namespace
 from dataclasses import dataclass
-from
+from typing import Any
+from unittest.mock import MagicMock, patch
 import pytest

-from xpk.commands.cluster import _validate_cluster_create_args
+from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command
 from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
+from xpk.core.testing.commands_tester import CommandsTester
 from xpk.utils.feature_flags import FeatureFlags


 @dataclass
 class _Mocks:
   common_print_mock: MagicMock
-
+  commands_print_mock: MagicMock
+  commands_get_reservation_deployment_type: MagicMock
+  commands_tester: CommandsTester


 @pytest.fixture
-def
+def mocks(mocker) -> _Mocks:
   common_print_mock = mocker.patch(
       'xpk.commands.common.xpk_print',
       return_value=None,
   )
-
-      'xpk.commands.
-
+  commands_print_mock = mocker.patch(
+      'xpk.commands.cluster.xpk_print', return_value=None
+  )
+  commands_get_reservation_deployment_type = mocker.patch(
+      'xpk.commands.cluster.get_reservation_deployment_type',
+      return_value='DENSE',
   )
   return _Mocks(
-      common_print_mock=common_print_mock,
+      common_print_mock=common_print_mock,
+      commands_get_reservation_deployment_type=commands_get_reservation_deployment_type,
+      commands_print_mock=commands_print_mock,
+      commands_tester=CommandsTester(
+          mocker,
+          run_command_with_updates_path=(
+              'xpk.commands.cluster.run_command_with_updates'
+          ),
+      ),
   )


-
-
-
+def construct_args(**kwargs: Any) -> Namespace:
+  args_dict = dict(
+      project='project',
+      zone='us-central1-a',
+      reservation='',
+      default_pool_cpu_machine_type='test-machine-type',
+      cluster='test-cluster',
+      default_pool_cpu_num_nodes='100',
+      sub_slicing=False,
+      gke_version='',
+      private=False,
+      authorized_networks=None,
+      enable_pathways=False,
+      enable_ray_cluster=False,
+      enable_workload_identity=False,
+      enable_gcsfuse_csi_driver=False,
+      enable_gcpfilestore_csi_driver=False,
+      enable_parallelstore_csi_driver=False,
+      enable_pd_csi_driver=False,
+      enable_lustre_csi_driver=False,
+      custom_cluster_arguments='',
+      num_slices=1,
+      num_nodes=1,
+      flex=False,
+      memory_limit='100Gi',
+      cpu_limit=100,
+      cluster_cpu_machine_type='',
+  )
+  args_dict.update(kwargs)
+  return Namespace(**args_dict)
+
+
+GPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
+    'l4-1'
+]
 SUB_SLICING_SYSTEM: SystemCharacteristics = (
     UserFacingNameToSystemCharacteristics['v6e-4x4']
 )
+TPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
+    'v6e-4x4'
+]


 def test_validate_cluster_create_args_for_correct_args_pass(
-
+    mocks: _Mocks,
 ):
   args = Namespace()

-  _validate_cluster_create_args(args,
+  _validate_cluster_create_args(args, GPU_TEST_SYSTEM)

-  assert
-  assert mock_common_print_and_exit.common_exit_mock.call_count == 0
+  assert mocks.common_print_mock.call_count == 0


 def test_validate_cluster_create_args_for_correct_sub_slicing_args_pass(
-
+    mocks: _Mocks,
 ):
   FeatureFlags.SUB_SLICING_ENABLED = True
-  args =
+  args = construct_args(
+      sub_slicing=True,
+      reservation='test-reservation',
+  )

   _validate_cluster_create_args(args, SUB_SLICING_SYSTEM)

-  assert
-  assert mock_common_print_and_exit.common_exit_mock.call_count == 0
+  assert mocks.common_print_mock.call_count == 0


 def test_validate_cluster_create_args_for_not_supported_system_throws(
-
+    mocks: _Mocks,
 ):
   FeatureFlags.SUB_SLICING_ENABLED = True
-  args =
+  args = construct_args(
+      sub_slicing=True,
+      reservation='test-reservation',
+  )

-
+  with pytest.raises(SystemExit):
+    _validate_cluster_create_args(args, GPU_TEST_SYSTEM)

-  assert
+  assert mocks.common_print_mock.call_count == 1
   assert (
-
+      mocks.common_print_mock.call_args[0][0]
       == 'Error: l4-1 does not support Sub-slicing.'
   )
-
+
+
+def test_validate_cluster_create_args_for_missing_reservation(
+    mocks: _Mocks,
+):
+  FeatureFlags.SUB_SLICING_ENABLED = True
+  args = construct_args(
+      sub_slicing=True,
+      reservation=None,
+  )
+
+  with pytest.raises(SystemExit):
+    _validate_cluster_create_args(args, SUB_SLICING_SYSTEM)
+
+  assert mocks.commands_print_mock.call_count == 1
+  assert (
+      'Validation failed: Sub-slicing cluster creation requires'
+      in mocks.commands_print_mock.call_args[0][0]
+  )
+
+
+def test_validate_cluster_create_args_for_invalid_reservation(
+    mocks: _Mocks,
+):
+  FeatureFlags.SUB_SLICING_ENABLED = True
+  args = construct_args(
+      sub_slicing=True,
+      reservation='test-reservation',
+  )
+  mocks.commands_get_reservation_deployment_type.return_value = 'SPARSE'
+
+  with pytest.raises(SystemExit):
+    _validate_cluster_create_args(args, SUB_SLICING_SYSTEM)
+
+  assert mocks.commands_print_mock.call_count == 5
+  assert (
+      'Refer to the documentation for more information on creating Cluster'
+      in mocks.commands_print_mock.call_args[0][0]
+  )
+
+
+@patch('xpk.commands.cluster.KueueManager.install_or_upgrade')
+def test_install_kueue_returns_kueue_installation_code(
+    mock_kueue_manager_install: MagicMock,
+):
+  mock_kueue_manager_install.return_value = 17
+
+  code = _install_kueue(
+      args=construct_args(),
+      system=GPU_TEST_SYSTEM,
+      autoprovisioning_config=None,
+  )
+
+  assert code == 17
+
+
+def test_run_gke_cluster_create_command_specifies_custom_cluster_arguments_last(
+    mocks: _Mocks,
+):
+  result = run_gke_cluster_create_command(
+      args=construct_args(
+          custom_cluster_arguments='--enable-autoscaling=False --foo=baz'
+      ),
+      gke_control_plane_version='1.2.3',
+      system=TPU_TEST_SYSTEM,
+  )
+
+  assert result == 0
+  mocks.commands_tester.assert_command_run(
+      'clusters create',
+      ' --enable-autoscaling',
+      ' --enable-autoscaling=False --foo=baz',
+  )
+
+
+def test_run_gke_cluster_create_command_without_gke_version_does_not_have_no_autoupgrade_flag(
+    mocks: _Mocks,
+):
+  result = run_gke_cluster_create_command(
+      args=construct_args(gke_version=''),
+      gke_control_plane_version='1.2.3',
+      system=TPU_TEST_SYSTEM,
+  )
+
+  assert result == 0
+  mocks.commands_tester.assert_command_not_run(
+      'clusters create', ' --no-enable-autoupgrade'
+  )
+
+
+def test_run_gke_cluster_create_command_with_gke_version_has_no_autoupgrade_flag(
+    mocks: _Mocks,
+):
+  result = run_gke_cluster_create_command(
+      args=construct_args(gke_version='1.2.3'),
+      gke_control_plane_version='1.2.3',
+      system=TPU_TEST_SYSTEM,
+  )
+
+  assert result == 0
+  mocks.commands_tester.assert_command_run(
+      'clusters create', ' --no-enable-autoupgrade'
+  )
+
+
+def test_run_gke_cluster_create_command_with_gpu_system_has_no_enable_autoupgrade(
+    mocks: _Mocks,
+):
+  result = run_gke_cluster_create_command(
+      args=construct_args(gke_version=''),
+      gke_control_plane_version='1.2.3',
+      system=GPU_TEST_SYSTEM,
+  )
+
+  assert result == 0
+  mocks.commands_tester.assert_command_run(
+      'clusters create', ' --no-enable-autoupgrade'
+  )
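These tests exercise `run_gke_cluster_create_command` through the new `CommandsTester` helper (`xpk/core/testing/commands_tester.py`, +131 lines, not shown in this section). A minimal sketch of what such a helper could look like, assuming it stubs `run_command_with_updates`, records every command string, and checks that fragments appear in order within a recorded command; the class name and details below are illustrative, not the actual xpk implementation:

```python
# Illustrative sketch; the real CommandsTester is constructed with a pytest-mock
# `mocker` and a run_command_with_updates_path, as shown in the diff above.
from unittest import mock


class CommandsTesterSketch:
  """Records commands sent to a patched run_command_with_updates."""

  def __init__(self, run_command_with_updates_path: str, return_value: int = 0):
    self.recorded_commands: list[str] = []

    def _record(command: str, *args, **kwargs) -> int:
      self.recorded_commands.append(command)
      return return_value

    self._patcher = mock.patch(run_command_with_updates_path, side_effect=_record)
    self._patcher.start()

  def stop(self) -> None:
    self._patcher.stop()

  def _matching(self, fragment: str) -> list[str]:
    return [c for c in self.recorded_commands if fragment in c]

  def assert_command_run(self, fragment: str, *parts_in_order: str) -> None:
    for command in self._matching(fragment):
      position = 0
      for part in parts_in_order:
        position = command.find(part, position)
        if position < 0:
          break
        position += len(part)
      else:
        return  # every part found, in order, within this command
    raise AssertionError(
        f'No command matching {fragment!r} contains {parts_in_order!r} in order'
    )

  def assert_command_not_run(self, fragment: str, part: str) -> None:
    assert not any(part in c for c in self._matching(fragment))
```

Ordered matching is what lets `test_run_gke_cluster_create_command_specifies_custom_cluster_arguments_last` verify that the user-supplied flags come after xpk's own flags, which is the apparent intent of moving `{args.custom_cluster_arguments}` to the end of the command in cluster.py.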
xpk/commands/inspector.py
CHANGED
@@ -23,6 +23,10 @@ from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import append_tmp_file, write_tmp_file
 from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from .workload import get_workload_list
+from ..core.kueue_manager import has_sub_slicing_enabled
+
+
+_SPACER = '========================================================'


 def inspector_run_command_helper(
@@ -40,7 +44,6 @@ def inspector_run_command_helper(
     0 if successful and 1 otherwise.
   """
   prefix = f'Command: {command}\nCommand Description: {command_description}\n'
-  postfix = '========================================================'
   return_code, command_output = run_command_for_value(
       command, f'{command_description}'
   )
@@ -51,7 +54,7 @@ def inspector_run_command_helper(
     )
     return 1

-  inspector_command_output = f'{prefix} \n{command_output} \n{
+  inspector_command_output = f'{prefix} \n{command_output} \n{_SPACER} \n'
   append_tmp_file(inspector_command_output, file)

   if args.print_to_terminal:
@@ -71,17 +74,27 @@ def inspector_run_workload_list_helper(args, command_description, file) -> int:
     0 if successful and 1 otherwise.
   """
   prefix = f'Command Description: {command_description}\n'
-  postfix = '========================================================'
   return_code, command_output = get_workload_list(args)
   if return_code != 0:
     xpk_exit(return_code)
-  inspector_command_output = f'{prefix} \n{command_output} \n{
+  inspector_command_output = f'{prefix} \n{command_output} \n{_SPACER} \n'
   append_tmp_file(inspector_command_output, file)
   if args.print_to_terminal:
     xpk_print(inspector_command_output)
   return 0


+def inspector_run_sub_slicing_helper(args, file: str):
+  return_code, result = has_sub_slicing_enabled()
+  if return_code != 0:
+    xpk_exit(return_code)
+  if result:
+    output = f'Sub-slicing topology set up.\n{_SPACER}'
+    append_tmp_file(output, file)
+    if args.print_to_terminal:
+      xpk_print(output)
+
+
 def inspector_output_link_helper(args, link, link_description, file) -> int:
   """Outputs a link for xpk inspector to the output file.

@@ -95,9 +108,7 @@ def inspector_output_link_helper(args, link, link_description, file) -> int:
     0 if successful and 1 otherwise.
   """
   inspector_link = (
-      f'Link Description: {link_description}\n'
-      f'Link: {link}\n'
-      '========================================================'
+      f'Link Description: {link_description}\nLink: {link}\n{_SPACER}\n'
   )
   append_tmp_file(inspector_link, file)
   if args.print_to_terminal:
@@ -308,6 +319,8 @@ def inspector(args) -> None:
         f' {command_description} return code: {return_code}'
     )

+  inspector_run_sub_slicing_helper(args, inspector_file)
+
   # Cloud Console Links:
   workload_links = []
   if args.workload:
xpk/commands/kind.py
CHANGED
xpk/commands/workload.py
CHANGED
@@ -27,6 +27,7 @@ from ..core.cluster import (
     setup_k8s_env,
 )
 from ..core.commands import run_command_with_updates, run_commands
+from ..core.kueue_manager import KueueManager, has_sub_slicing_enabled
 from ..core.config import (VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION)
 from ..core.docker_container import (
     get_main_container_docker_image,
@@ -95,6 +96,7 @@ from ..core.workload_decorators import (
     tcpxo_decorator,
 )
 from ..utils.console import get_user_input, xpk_exit, xpk_print
+from packaging.version import Version
 from ..utils.file import write_tmp_file
 from ..utils.execution_context import is_dry_run
 from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
@@ -283,6 +285,7 @@ PW_WORKLOAD_CREATE_YAML = """
 """

 SUB_SLICING_TOPOLOGIES = ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
+SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')


 def workload_create_pathways(args) -> None:
@@ -340,6 +343,7 @@ def workload_create(args) -> None:
     xpk_exit(return_code)

   if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None:
+    _validate_sub_slicing_availability()
     _validate_sub_slicing_topology(system, args.sub_slicing_topology)

   if not check_if_workload_can_schedule(args, system):
@@ -483,7 +487,7 @@ def workload_create(args) -> None:
       values: [{restart_on_exit_codes}]"""

   # Create the workload file based on accelerator type or workload type.
-  if system.accelerator_type == AcceleratorType
+  if system.accelerator_type == AcceleratorType.GPU:
     container, debugging_dashboard_id = get_user_workload_container(
         args, system
     )
@@ -566,7 +570,7 @@ def workload_create(args) -> None:
         container=container,
         vms_per_slice=(
             compute_vms_per_slice(args.sub_slicing_topology)
-            if system.accelerator_type == AcceleratorType
+            if system.accelerator_type == AcceleratorType.TPU
             and FeatureFlags.SUB_SLICING_ENABLED
             and args.sub_slicing_topology is not None
             else system.vms_per_slice
@@ -594,7 +598,7 @@ def workload_create(args) -> None:
         tpu_toleration="""
                 - operator: "Exists"
                   key: google.com/tpu
-        """ if system.accelerator_type == AcceleratorType
+        """ if system.accelerator_type == AcceleratorType.TPU else '',
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
     )
@@ -611,7 +615,7 @@ def workload_create(args) -> None:

   # Get GKE outlier dashboard for TPU
   outlier_dashboard_id = None
-  if system.accelerator_type == AcceleratorType
+  if system.accelerator_type == AcceleratorType.TPU:
     outlier_dashboard_id = get_gke_outlier_dashboard(args)

   # Outlier and debugging dashboards
@@ -678,6 +682,40 @@ def workload_create(args) -> None:
   xpk_exit(0)


+def _validate_sub_slicing_availability():
+  return_code, sub_slicing_enabled = has_sub_slicing_enabled()
+  if return_code != 0:
+    xpk_print(
+        'Error: Unable to validate sub-slicing support on a given cluster.'
+    )
+    xpk_exit(1)
+
+  if not sub_slicing_enabled:
+    xpk_print(
+        'Error: Cluster has not been not set up for Sub-slicing. Please enable'
+        ' --sub-slicing in "cluster create" command first.'
+    )
+    xpk_exit(1)
+
+  kueue_manager = KueueManager()
+  return_code, current_version = kueue_manager.get_installed_kueue_version()
+  if return_code != 0:
+    xpk_print(
+        'Error: Unable to validate sub-slicing support on a given cluster.'
+    )
+    xpk_exit(1)
+
+  if current_version < SUB_SLICING_MINIMUM_KUEUE_VERSION:
+    xpk_print(
+        "Error: Current Kueue version ({current_version}) doesn't support"
+        ' Sub-slicing. The minimal required version is'
+        ' v{SUB_SLICING_MINIMUM_KUEUE_VERSION}. Please either update Kueue'
+        ' manually, or run "cluster create --sub-slicing" on the existing'
+        ' cluster.'
+    )
+    xpk_exit(1)
+
+
 def _validate_sub_slicing_topology(
     system_characteristics: SystemCharacteristics, sub_slicing_topology: str
 ) -> None: