xpk 0.17.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +4 -35
- xpk/commands/cluster_gcluster.py +1 -13
- xpk/commands/cluster_gcluster_test.py +2 -10
- xpk/commands/cluster_test.py +0 -4
- xpk/commands/workload.py +10 -3
- xpk/commands/workload_test.py +1 -0
- xpk/core/cluster.py +10 -9
- xpk/core/config.py +5 -17
- xpk/core/kueue_manager_test.py +2 -0
- xpk/core/nodepool.py +6 -0
- xpk/core/nodepool_test.py +4 -0
- xpk/core/scheduling.py +28 -3
- xpk/core/scheduling_test.py +38 -1
- xpk/core/system_characteristics.py +39 -16
- xpk/core/system_characteristics_test.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +0 -15
- xpk/core/workload_decorators/tcpx_decorator.py +0 -8
- xpk/core/workload_decorators/tcpx_decorator_test.py +0 -78
- xpk/core/workload_decorators/tcpxo_decorator.py +0 -16
- xpk/parser/common.py +0 -17
- xpk/parser/core.py +0 -39
- xpk/parser/storage.py +0 -11
- xpk/utils/feature_flags.py +1 -1
- xpk/utils/validation.py +0 -8
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/METADATA +15 -4
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/RECORD +30 -41
- xpk/commands/batch.py +0 -144
- xpk/commands/job.py +0 -244
- xpk/commands/kind.py +0 -286
- xpk/commands/kjob_common.py +0 -60
- xpk/commands/run.py +0 -140
- xpk/commands/shell.py +0 -142
- xpk/parser/batch.py +0 -43
- xpk/parser/job.py +0 -147
- xpk/parser/kind.py +0 -95
- xpk/parser/run.py +0 -47
- xpk/parser/shell.py +0 -59
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/WHEEL +0 -0
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py
CHANGED
```diff
@@ -49,7 +49,6 @@ from ..core.gcloud_context import (
     zone_to_region,
 )
 from ..core.jobset import update_jobset_resources_if_necessary
-from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
 from ..core.kueue_manager import (KueueConfig, KueueManager)
 from ..core.nap import enable_autoprovisioning_on_cluster
 from ..core.network import (
@@ -98,7 +97,6 @@ def cluster_adapt(args) -> None:
   if should_validate_dependencies(args):
     validate_dependencies_list([
         SystemDependency.KUBECTL,
-        SystemDependency.KJOB,
         SystemDependency.GCLOUD,
     ])
   args.enable_pathways = False
@@ -188,7 +186,6 @@ def cluster_adapt(args) -> None:
   if install_kueue_code != 0:
     xpk_exit(install_kueue_code)
 
-  install_kjob(args)
   if system.accelerator_type == AcceleratorType.GPU:
     prepare_gpus(system)
 
@@ -308,7 +305,6 @@ def cluster_create(args) -> None:
   if should_validate_dependencies(args):
     validate_dependencies_list([
         SystemDependency.KUBECTL,
-        SystemDependency.KJOB,
         SystemDependency.GCLOUD,
     ])
 
@@ -455,8 +451,6 @@ def cluster_create(args) -> None:
   if install_kueue_code != 0:
     xpk_exit(install_kueue_code)
 
-  install_kjob(args)
-
   if system.accelerator_type == AcceleratorType.GPU:
     prepare_gpus(system)
 
@@ -1239,29 +1233,20 @@ def run_gke_cluster_create_command(
       ' --autoscaling-profile=optimize-utilization'
       ' --labels=gke_product_type=xpk'
       f' --release-channel={release_channel.value.lower()}'
+      ' --enable-ip-alias'
+      ' --enable-dataplane-v2'
+      ' --enable-multi-networking'
   )
 
   if args.gke_version:
     command += ' --no-enable-autoupgrade'
 
-  enable_ip_alias = False
-
   if args.private or args.authorized_networks is not None:
-    enable_ip_alias = True
     command += ' --enable-master-authorized-networks --enable-private-nodes'
 
-  if system.accelerator_type == AcceleratorType.GPU:
-    enable_ip_alias = True
-    command += ' --enable-dataplane-v2 --enable-multi-networking'
-  else:
+  if system.accelerator_type != AcceleratorType.GPU:
     command += ' --location-policy=BALANCED --scopes=storage-full,gke-default'
 
-  if args.enable_pathways:
-    enable_ip_alias = True
-
-  if enable_ip_alias:
-    command += ' --enable-ip-alias'
-
   if args.enable_ray_cluster:
     command += ' --addons RayOperator'
 
@@ -1343,22 +1328,6 @@ def install_storage_csis(args):
     xpk_exit(update_cluster_command_code)
 
 
-def install_kjob(args):
-  xpk_print('Verifying kjob installation')
-  err_code = verify_kjob_installed()
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  xpk_print('Applying kjob CDRs')
-  err_code = apply_kjob_crds()
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  err_code = prepare_kjob(args)
-  if err_code > 0:
-    xpk_exit(err_code)
-
-
 def _install_kueue(
     args,
     system: SystemCharacteristics,
```
xpk/commands/cluster_gcluster.py
CHANGED
```diff
@@ -38,7 +38,6 @@ from ..core.commands import run_command_for_value
 from ..core.docker_manager import DockerManager
 from ..core.gcloud_context import zone_to_region
 from ..core.gcluster_manager import GclusterManager
-from ..core.kjob import apply_kjob_crds, prepare_kjob
 from ..core.remote_state.fuse_remote_state import FuseStateClient
 from ..core.remote_state.remote_state_client import RemoteStateClient
 from ..utils.console import xpk_exit, xpk_print
@@ -112,18 +111,7 @@ def cluster_create(
   get_cluster_credentials(args)
 
   err_code = __install_kueue(args)
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  err_code = apply_kjob_crds()
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  err_code = prepare_kjob(args)
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  xpk_exit(0)
+  xpk_exit(err_code)
 
 
 def __install_kueue(args) -> int:
```
xpk/commands/cluster_gcluster_test.py
CHANGED
```diff
@@ -46,8 +46,6 @@ def mock_cluster_create_deps(request):
   """Mocks dependencies for cluster_create."""
   with (
       patch("xpk.commands.cluster_gcluster.xpk_exit") as mock_exit,
-      patch("xpk.commands.cluster_gcluster.prepare_kjob") as mock_prep_kjob,
-      patch("xpk.commands.cluster_gcluster.apply_kjob_crds") as mock_apply_kjob,
       patch(
           "xpk.commands.cluster_gcluster.get_cluster_credentials"
       ) as mock_get_creds,
@@ -68,8 +66,6 @@ def mock_cluster_create_deps(request):
   ):
     yield {
         "xpk_exit": mock_exit,
-        "prepare_kjob": mock_prep_kjob,
-        "apply_kjob_crds": mock_apply_kjob,
         "get_cluster_credentials": mock_get_creds,
         "generate_blueprint": mock_gen_bp,
         "prepare_gcluster_manager": mock_prep_gcm,
@@ -85,9 +81,6 @@ def test_install_kueue_standard(
     mock_get_total_chips, mock_args, mock_cluster_create_deps
 ):
   """Tests __install_kueue for a standard installation."""
-  mock_cluster_create_deps["prepare_kjob"].return_value = 0
-  mock_cluster_create_deps["apply_kjob_crds"].return_value = 0
-
   mock_system = SystemCharacteristics(
       topology="N/A",
       vms_per_slice=1,
@@ -98,6 +91,7 @@ def test_install_kueue_standard(
       device_type="h100-mega-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -138,9 +132,6 @@ def test_install_kueue_with_autoprovisioning(
     mock_enable_autoprovisioning, mock_args, mock_cluster_create_deps
 ):
   """Tests __install_kueue with autoprovisioning enabled."""
-  mock_cluster_create_deps["prepare_kjob"].return_value = 0
-  mock_cluster_create_deps["apply_kjob_crds"].return_value = 0
-
   mock_args.enable_autoprovisioning = True
   mock_system = SystemCharacteristics(
       topology="N/A",
@@ -152,6 +143,7 @@ def test_install_kueue_with_autoprovisioning(
       device_type="h100-mega-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
```
xpk/commands/cluster_test.py
CHANGED
```diff
@@ -56,7 +56,6 @@ class _ClusterCreateMocks:
   create_cluster_configmaps: MagicMock
   set_jobset_on_cluster: MagicMock
   get_cluster_location: MagicMock
-  install_kjob: MagicMock
   xpk_exit: MagicMock
   update_jobset_resources_if_necessary: MagicMock
   _install_kueue: MagicMock
@@ -204,9 +203,6 @@ def cluster_create_mocks(mocker) -> _ClusterCreateMocks:
           'xpk.commands.cluster.get_cluster_location',
           return_value='us-central1',
       ),
-      install_kjob=mocker.patch(
-          'xpk.commands.cluster.install_kjob', return_value=0
-      ),
      xpk_exit=mocker.patch('xpk.commands.cluster.xpk_exit'),
      update_jobset_resources_if_necessary=mocker.patch(
          'xpk.commands.cluster.update_jobset_resources_if_necessary',
```
xpk/commands/workload.py
CHANGED
```diff
@@ -57,6 +57,7 @@ from ..core.scheduling import (
     WorkloadScheduling,
     check_if_workload_can_schedule,
     create_tpu_machine_type,
+    create_tpu_slice_topology_annotation,
     create_tpu_topology,
     get_cpu_affinity,
     get_gpu_scheduler,
@@ -132,7 +133,7 @@ spec:
       annotations:
         {storage_annotations}
         {sub_slicing_annotations}
-        {
+        {tpu_slice_topology_annotation}
      spec:
        schedulerName: {args.scheduler}
        imagePullSecrets:
@@ -518,6 +519,8 @@ def workload_create(args) -> None:
       workload_system, super_slicing=False
   )
 
+  # TODO(b/466943057): Add ANP label for NAP (if not possible, use CCC)
+
   # Create the workload file based on accelerator type or workload type.
   if workload_system.accelerator_type == AcceleratorType.GPU:
     container, debugging_dashboard_id = get_user_workload_container(
@@ -640,7 +643,11 @@ def workload_create(args) -> None:
       else create_machine_label(workload_system)
   )
   node_selector_machine_label = machine_label if not use_super_slicing else ''
-
+  tpu_slice_topology_annotation = (
+      create_tpu_slice_topology_annotation(workload_system.topology)
+      if use_super_slicing
+      else ''
+  )
 
   yml_string = WORKLOAD_CREATE_YAML.format(
       args=args,
@@ -657,7 +664,7 @@ def workload_create(args) -> None:
       ),
       placement_policy_label=placement_policy_label,
       node_selector_machine_label=node_selector_machine_label,
-
+      tpu_slice_topology_annotation=tpu_slice_topology_annotation,
       local_queue_name=LOCAL_QUEUE_NAME,
       autoprovisioning_args=autoprovisioning_args,
       volumes=get_volumes(args, workload_system),
```
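For context on the template change: `WORKLOAD_CREATE_YAML` is rendered with `str.format`, so the new `{tpu_slice_topology_annotation}` placeholder receives either the annotation line or an empty string. A rough sketch of the substitution, using an abridged template and a hypothetical storage annotation value:

```python
# Rough sketch; the template fragment is abridged from WORKLOAD_CREATE_YAML
# and the storage annotation value is hypothetical.
TEMPLATE = """metadata:
  annotations:
    {storage_annotations}
    {tpu_slice_topology_annotation}
"""

def create_tpu_slice_topology_annotation(workload_topology: str) -> str:
  return f'cloud.google.com/gke-tpu-slice-topology: {workload_topology}'

use_super_slicing = True
annotation = (
    create_tpu_slice_topology_annotation('4x4x8') if use_super_slicing else ''
)
print(TEMPLATE.format(
    storage_annotations='example.com/storage: "fast"',
    tpu_slice_topology_annotation=annotation,
))
```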
xpk/commands/workload_test.py
CHANGED
xpk/core/cluster.py
CHANGED
```diff
@@ -391,14 +391,13 @@ def project_id_to_project_number(project_id: str) -> str:
 
 
 def setup_k8s_env(args) -> k8s_client.ApiClient:
-
-
-
-
-
-
-
-  )
+  add_zone_and_project(args)
+  get_cluster_credentials(args)
+  args.project_number = (
+      project_id_to_project_number(args.project)
+      if not args.dry_run
+      else abs(hash(args.project) % (10**12))  # 12 digit hash
+  )
 
   config.load_kube_config()
   return k8s_client.ApiClient()
@@ -717,8 +716,10 @@ def get_cluster_credentials(args) -> int:
       location=location,
       dns_endpoint=True,
   )
+  if return_code != 0:
+    return return_code
 
-  if return_code != 0:
+  if not _are_credentials_valid():
     xpk_print('Detected error. Retrying without --dns-endpoint flag...')
     return_code = _get_credentials(
         project=args.project,
```
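One detail of the rewritten `setup_k8s_env` worth spelling out: in dry-run mode the project number is faked with `abs(hash(args.project) % (10**12))` instead of a Cloud API call. Since Python salts `str` hashes per process (see PYTHONHASHSEED), the value is stable within one invocation but generally differs between runs, which is fine for a dry run:

```python
# Standalone illustration of the dry-run fallback above. str hashes are
# randomized per process, so this is stable only within a single run.
project = 'my-gcp-project'  # placeholder project id
fake_project_number = abs(hash(project) % (10**12))  # at most 12 digits
print(fake_project_number)
```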
xpk/core/config.py
CHANGED
```diff
@@ -19,6 +19,7 @@ import os
 import ruamel.yaml
 from abc import ABC, abstractmethod
 from ..utils import file
+from ..utils.execution_context import is_dry_run
 from ..utils.console import xpk_print
 from setuptools_scm import get_version as setuptools_get_version
 from importlib.metadata import version, PackageNotFoundError
@@ -53,14 +54,6 @@ PROJECT_KEY = 'project-id'
 CLIENT_ID_KEY = 'client-id'
 SEND_TELEMETRY_KEY = 'send-telemetry'
 ZONE_KEY = 'zone'
-KJOB_BATCH_IMAGE = 'batch-image'
-KJOB_BATCH_WORKING_DIRECTORY = 'batch-working-directory'
-KJOB_SHELL_IMAGE = 'shell-image'
-KJOB_SHELL_INTERACTIVE_COMMAND = 'shell-interactive-command'
-KJOB_SHELL_WORKING_DIRECTORY = 'shell-working-directory'
-CONFIGS_KEY = 'configs'
-GKE_ENDPOINT_KEY = 'gke-endpoint'
-DEPENDENCIES_KEY = 'deps-verified-version'
 
 DEFAULT_KEYS = [
     CFG_BUCKET_KEY,
@@ -69,13 +62,6 @@ DEFAULT_KEYS = [
     CLIENT_ID_KEY,
     SEND_TELEMETRY_KEY,
     ZONE_KEY,
-    GKE_ENDPOINT_KEY,
-    DEPENDENCIES_KEY,
-    KJOB_BATCH_IMAGE,
-    KJOB_BATCH_WORKING_DIRECTORY,
-    KJOB_SHELL_IMAGE,
-    KJOB_SHELL_INTERACTIVE_COMMAND,
-    KJOB_SHELL_WORKING_DIRECTORY,
 ]
 VERTEX_TENSORBOARD_FEATURE_FLAG = XPK_CURRENT_VERSION >= '0.4.0'
 
@@ -111,8 +97,7 @@ class FileSystemConfig(Config):
     self._allowed_keys = DEFAULT_KEYS
 
   def _open_configs(self) -> dict | None:
-    dir_path = os.path.dirname(self._config)
-    file.ensure_directory_exists(dir_path)
+    file.ensure_directory_exists(os.path.dirname(self._config))
 
     if not os.path.exists(self._config):
       return None
@@ -122,6 +107,9 @@ class FileSystemConfig(Config):
     return config_yaml
 
   def _save_configs(self, config_yaml: dict) -> None:
+    if is_dry_run():
+      return None
+
     with open(self._config, encoding='utf-8', mode='w') as stream:
       yaml.dump(config_yaml, stream)
 
```
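The `_save_configs` change makes config writes a no-op under dry-run while reads keep working. A minimal sketch of the guard pattern, with `is_dry_run` stubbed in place of `xpk.utils.execution_context.is_dry_run` and JSON standing in for the ruamel.yaml dump:

```python
# Minimal sketch of the dry-run write guard; is_dry_run is a stub and
# json stands in for the ruamel.yaml serialization xpk actually uses.
import json

def is_dry_run() -> bool:
  return True  # stub; the real flag comes from the CLI arguments

def save_configs(path: str, config: dict) -> None:
  if is_dry_run():
    return  # suppress side effects, mirroring FileSystemConfig._save_configs
  with open(path, encoding='utf-8', mode='w') as stream:
    json.dump(config, stream)

save_configs('/tmp/xpk-config.json', {'zone': 'us-central1-a'})  # writes nothing
```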
xpk/core/kueue_manager_test.py
CHANGED
```diff
@@ -36,6 +36,7 @@ TPU_SYSTEM: SystemCharacteristics = SystemCharacteristics(
     device_type="v5p-8",
     supports_sub_slicing=False,
     supports_super_slicing=False,
+    supports_accelerator_network_profile=False,
     docker_platform=DockerPlatform.ARM,
 )
 
@@ -411,6 +412,7 @@ def test_configure_generates_correct_manifest_with_gke_default_topology(
         supports_sub_slicing=False,
         supports_super_slicing=False,
         docker_platform=DockerPlatform.ARM,
+        supports_accelerator_network_profile=True,
         gpu_config=GpuConfig(requires_topology=True),
     ),
 )
```
xpk/core/nodepool.py
CHANGED
```diff
@@ -289,6 +289,12 @@ def run_gke_node_pool_create_command(
       f'{placement_args}'
       ' --enable-gvnic'
   )
+
+  if system.supports_accelerator_network_profile:
+    command += (
+        ' --accelerator-network-profile=auto'
+        ' --node-labels=cloud.google.com/gke-networking-dra-driver=true'
+    )
   if system.accelerator_type == AcceleratorType.TPU:
     command += f' --node-version={gke_node_pool_version}'
     if capacity_type == CapacityType.FLEX_START:
```
xpk/core/nodepool_test.py
CHANGED
```diff
@@ -251,6 +251,7 @@ def test_placement_policy_created_for_gpu_with_valid_topology(
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -284,6 +285,7 @@ def test_placement_policy_not_created_for_gpu_with_invalid_topology(
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -320,6 +322,7 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
       requires_workload_policy=True,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
 
@@ -354,6 +357,7 @@ def test_placement_policy_not_created_for_non7x_tpu(
       device_type="v6e-4",
       supports_sub_slicing=True,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
   )
 
```
xpk/core/scheduling.py
CHANGED
```diff
@@ -18,7 +18,7 @@ from enum import Enum
 
 from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled, has_super_slicing_enabled
 from ..utils.feature_flags import FeatureFlags
-from ..utils.topology import get_slice_topology_level
+from ..utils.topology import get_slice_topology_level, parse_topology
 from ..utils.console import xpk_print
 from ..utils.topology import is_topology_valid
 from ..utils.execution_context import is_dry_run
@@ -34,6 +34,7 @@ from packaging.version import Version
 
 _SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
 _SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.14.0')
+_SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)
 
 
 class WorkloadScheduling(Enum):
@@ -115,7 +116,7 @@ def check_if_workload_can_schedule(
         args,
         workload_system,
         max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
-    ):
+    ) and _check_super_slicing_topology(workload_system):
       return WorkloadScheduling.SUPER_SLICING_AVAILABLE
     else:
       return WorkloadScheduling.UNAVAILABLE
@@ -189,7 +190,6 @@ def _check_super_slicing_availability(
     workload_system: SystemCharacteristics,
     cluster_system: SystemCharacteristics,
 ) -> bool:
-  # TODO: b/465447813 - Add super-slicing workload topology validation.
   if (
       (not FeatureFlags.SUPER_SLICING_ENABLED)
       or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
@@ -212,6 +212,27 @@ def _check_super_slicing_availability(
   )
 
 
+def _check_super_slicing_topology(
+    workload_system: SystemCharacteristics,
+) -> bool:
+  topology = parse_topology(workload_system.topology)
+  result = (
+      all(size % 4 == 0 and size >= 4 for size in topology)
+      and len(topology) == len(_SUPER_SLICING_MAX_TOPOLOGY)
+      and topology[0] <= topology[1] <= topology[2]
+      and all(a <= b for a, b in zip(topology, _SUPER_SLICING_MAX_TOPOLOGY))
+  )
+
+  if not result:
+    xpk_print(
+        'Error: Invalid super-slicing topology. It must adhere to the format of'
+        ' 4i x 4j x 4k, where i <= j <= k, and i, j, k are integers, with a'
+        ' maximum of 16x24x24.'
+    )
+
+  return result
+
+
 def get_total_chips_requested_from_args(
     args, system: SystemCharacteristics
 ) -> int:
@@ -342,6 +363,10 @@ def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
   ]
 
 
+def create_tpu_slice_topology_annotation(workload_topology: str) -> str:
+  return f'cloud.google.com/gke-tpu-slice-topology: {workload_topology}'
+
+
 def create_placement_policy_label(
     system: SystemCharacteristics, super_slicing: bool
 ) -> str:
```
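The new `_check_super_slicing_topology` accepts only three-axis topologies whose sizes are multiples of four (at least 4), sorted ascending, and bounded by 16x24x24. A self-contained sketch of the same checks, assuming `parse_topology` splits an 'AxBxC' string into integers (the real helper lives in `xpk.utils.topology`):

```python
# Self-contained sketch of the validation above; parse_topology is assumed
# to split an 'AxBxC' string into ints (the real one is in xpk.utils.topology).
_SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)

def parse_topology(topology: str) -> tuple[int, ...]:
  return tuple(int(size) for size in topology.split('x'))

def check_super_slicing_topology(topology_str: str) -> bool:
  topology = parse_topology(topology_str)
  return (
      all(size % 4 == 0 and size >= 4 for size in topology)
      and len(topology) == len(_SUPER_SLICING_MAX_TOPOLOGY)
      and topology[0] <= topology[1] <= topology[2]
      and all(a <= b for a, b in zip(topology, _SUPER_SLICING_MAX_TOPOLOGY))
  )

assert check_super_slicing_topology('4x4x8')       # valid: 4i x 4j x 4k, ascending
assert not check_super_slicing_topology('2x2x1')   # axes not divisible by four
assert not check_super_slicing_topology('4x4x32')  # 32 exceeds the 24 bound
```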
xpk/core/scheduling_test.py
CHANGED
```diff
@@ -22,7 +22,7 @@ from pytest_mock import MockerFixture
 from xpk.core.capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
 from xpk.core.testing.commands_tester import CommandsTester
 from xpk.utils.feature_flags import FeatureFlags
-from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, get_placement_policy_name, is_placement_policy_supported
+from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, create_tpu_slice_topology_annotation, get_placement_policy_name, is_placement_policy_supported
 from .system_characteristics import SystemCharacteristics, AcceleratorType, DockerPlatform, get_system_characteristics_by_device_type
 
 
@@ -66,6 +66,7 @@ def test_create_placement_policy_label_returns_valid_label():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   label = create_placement_policy_label(
@@ -89,6 +90,7 @@ def test_get_placement_policy_name_returns_valid_name():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   name = get_placement_policy_name(system_characteristics, super_slicing=False)
@@ -107,6 +109,7 @@ def test_get_placement_policy_name_super_slicing_returns_valid_name():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   name = get_placement_policy_name(system_characteristics, super_slicing=True)
@@ -125,6 +128,7 @@ def test_is_placement_policy_supported_returns_true_for_system_characteristics_s
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is True
@@ -142,6 +146,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is False
@@ -159,6 +164,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is False
@@ -369,6 +375,28 @@ SUPER_SLICING_CASE = SchedulingTestCase(
         ),
         WorkloadScheduling.UNAVAILABLE,
     ),
+    (
+        'Super-slicing, but workload topology is not divisible by four',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die(
+                'tpu7x-2x2x1'
+            ),
+        ),
+        WorkloadScheduling.UNAVAILABLE,
+    ),
+    (
+        'Super-slicing, but workload topology is too big for super-slice',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die(
+                'tpu7x-4x4x32'
+            ),
+            # 10 cubes, to make sure vms fit:
+            resources_config_map={'tpu7x-128': str(64 // 4 * 10)},
+        ),
+        WorkloadScheduling.UNAVAILABLE,
+    ),
     (
         (
             'Super-slicing should be ignored when a given device is already'
@@ -426,3 +454,12 @@ def test_check_if_workload_can_schedule(
         )
         == expected
     )
+
+
+def test_create_tpu_slice_topology_annotation():
+  workload_system = _get_system_characteristics_or_die('tpu7x-4x4x8')
+
+  assert (
+      create_tpu_slice_topology_annotation(workload_system.topology)
+      == 'cloud.google.com/gke-tpu-slice-topology: 4x4x8'
+  )
```