xpk 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +128 -115
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +10 -28
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +43 -22
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/vertex.py
CHANGED
|
@@ -66,7 +66,7 @@ def create_vertex_experiment(args) -> dict | None:
|
|
|
66
66
|
)
|
|
67
67
|
|
|
68
68
|
metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
|
|
69
|
-
cluster_config_map = get_cluster_configmap(
|
|
69
|
+
cluster_config_map = get_cluster_configmap(metadata_configmap_name)
|
|
70
70
|
|
|
71
71
|
if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map:
|
|
72
72
|
xpk_print(
|
xpk/core/workload.py
CHANGED
|
@@ -17,7 +17,7 @@ limitations under the License.
|
|
|
17
17
|
import re
|
|
18
18
|
from ..utils.console import xpk_exit, xpk_print
|
|
19
19
|
from .commands import run_command_for_value
|
|
20
|
-
from .gcloud_context import
|
|
20
|
+
from .gcloud_context import get_cluster_location
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
def workload_list_awk_command(filter_key) -> str:
|
|
@@ -131,7 +131,7 @@ def get_workload_list(args) -> tuple[int, str]:
|
|
|
131
131
|
if hasattr(args, 'filter_by_job'):
|
|
132
132
|
task += f' with filter-by-job={args.filter_by_job}'
|
|
133
133
|
|
|
134
|
-
return_code, return_value = run_command_for_value(command, task
|
|
134
|
+
return_code, return_value = run_command_for_value(command, task)
|
|
135
135
|
return return_code, return_value
|
|
136
136
|
|
|
137
137
|
|
|
@@ -152,7 +152,7 @@ def check_if_workload_exists(args) -> bool:
|
|
|
152
152
|
|
|
153
153
|
command = f"kubectl get workloads -o=custom-columns='{s}'"
|
|
154
154
|
return_code, return_msg = run_command_for_value(
|
|
155
|
-
command, 'Check if Workload Already Exists'
|
|
155
|
+
command, 'Check if Workload Already Exists'
|
|
156
156
|
)
|
|
157
157
|
|
|
158
158
|
if return_code != 0:
|
|
@@ -186,7 +186,7 @@ def wait_for_job_completion(args) -> int:
|
|
|
186
186
|
# Get the full workload name
|
|
187
187
|
get_workload_name_cmd = f'kubectl get workloads | grep jobset-{args.workload}'
|
|
188
188
|
return_code, return_value = run_command_for_value(
|
|
189
|
-
get_workload_name_cmd, 'Get full workload name'
|
|
189
|
+
get_workload_name_cmd, 'Get full workload name'
|
|
190
190
|
)
|
|
191
191
|
if return_code != 0:
|
|
192
192
|
xpk_print(f'Get full workload name request returned ERROR {return_code}')
|
|
@@ -205,7 +205,6 @@ def wait_for_job_completion(args) -> int:
|
|
|
205
205
|
return_code, return_value = run_command_for_value(
|
|
206
206
|
wait_cmd,
|
|
207
207
|
f'Wait for workload to finish with timeout of {timeout_msg}',
|
|
208
|
-
args,
|
|
209
208
|
print_timer=True,
|
|
210
209
|
)
|
|
211
210
|
if return_code != 0:
|
|
@@ -214,7 +213,7 @@ def wait_for_job_completion(args) -> int:
|
|
|
214
213
|
f'Timed out waiting for your workload after {timeout_msg}, see your'
|
|
215
214
|
' workload here:'
|
|
216
215
|
# pylint: disable=line-too-long
|
|
217
|
-
f' https://console.cloud.google.com/kubernetes/service/{
|
|
216
|
+
f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
|
|
218
217
|
)
|
|
219
218
|
return 124
|
|
220
219
|
else:
|
|
@@ -224,14 +223,14 @@ def wait_for_job_completion(args) -> int:
|
|
|
224
223
|
xpk_print(
|
|
225
224
|
'Finished waiting for your workload, see your workload here:'
|
|
226
225
|
# pylint: disable=line-too-long
|
|
227
|
-
f' https://console.cloud.google.com/kubernetes/service/{
|
|
226
|
+
f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
|
|
228
227
|
)
|
|
229
228
|
status_cmd = (
|
|
230
229
|
f'kubectl get jobset {args.workload} -o'
|
|
231
230
|
" jsonpath='{.status.conditions[-1].type}'"
|
|
232
231
|
)
|
|
233
232
|
return_code, return_value = run_command_for_value(
|
|
234
|
-
status_cmd, 'Get jobset status'
|
|
233
|
+
status_cmd, 'Get jobset status'
|
|
235
234
|
)
|
|
236
235
|
if return_code != 0:
|
|
237
236
|
xpk_print(f'Get workload status request returned ERROR {return_code}')
|
xpk/main.py
CHANGED
|
@@ -36,7 +36,6 @@ import sys
|
|
|
36
36
|
|
|
37
37
|
from .parser.core import set_parser
|
|
38
38
|
from .utils.console import xpk_print
|
|
39
|
-
from .utils.validation import validate_dependencies
|
|
40
39
|
from .utils.execution_context import set_dry_run
|
|
41
40
|
################### Compatibility Check ###################
|
|
42
41
|
# Check that the user runs the below version or greater.
|
|
@@ -66,9 +65,8 @@ def main() -> None:
|
|
|
66
65
|
xpk_print('Starting xpk', flush=True)
|
|
67
66
|
main_args = parser.parse_args()
|
|
68
67
|
main_args.enable_ray_cluster = False
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
validate_dependencies()
|
|
68
|
+
dry_run = 'dry_run' in main_args and main_args.dry_run
|
|
69
|
+
set_dry_run(dry_run)
|
|
72
70
|
main_args.func(main_args)
|
|
73
71
|
xpk_print('XPK Done.', flush=True)
|
|
74
72
|
|
xpk/parser/cluster.py
CHANGED
|
@@ -31,6 +31,7 @@ from ..core.config import CFG_BUCKET_KEY
|
|
|
31
31
|
from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
|
|
32
32
|
from .common import add_shared_arguments, ParserOrArgumentGroup
|
|
33
33
|
from .validators import name_type
|
|
34
|
+
from ..utils.feature_flags import FeatureFlags
|
|
34
35
|
|
|
35
36
|
|
|
36
37
|
def set_cluster_parser(cluster_parser: ArgumentParser):
|
|
@@ -142,6 +143,12 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
142
143
|
' enable cluster to accept Pathways workloads.'
|
|
143
144
|
),
|
|
144
145
|
)
|
|
146
|
+
if FeatureFlags.SUB_SLICING_ENABLED:
|
|
147
|
+
cluster_create_optional_arguments.add_argument(
|
|
148
|
+
'--sub-slicing',
|
|
149
|
+
action='store_true',
|
|
150
|
+
help='Whether to set up cluster to support sub-slicing',
|
|
151
|
+
)
|
|
145
152
|
|
|
146
153
|
autoprovisioning_arguments = cluster_create_parser.add_argument_group(
|
|
147
154
|
'Autoprovisioning Arguments',
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
from xpk.parser.cluster import set_cluster_create_parser
|
|
19
|
+
import pytest
|
|
20
|
+
from ..utils.feature_flags import FeatureFlags
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.fixture(autouse=True)
def with_sub_slicing_enabled():
  """Turn the sub-slicing feature flag on for each test in this module.

  The flag lives on a shared object, so the value seen before the test is put
  back afterwards; otherwise the override (or a test's own override) would
  leak into whichever test module runs next.
  """
  previous = FeatureFlags.SUB_SLICING_ENABLED
  FeatureFlags.SUB_SLICING_ENABLED = True
  yield
  # Runs after the test body, including tests that flipped the flag themselves.
  FeatureFlags.SUB_SLICING_ENABLED = previous
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_cluster_create_sub_slicing_is_hidden_with_flag_off():
  """With the feature flag off, the help text does not mention --sub-slicing."""
  FeatureFlags.SUB_SLICING_ENABLED = False
  create_parser = argparse.ArgumentParser()
  set_cluster_create_parser(create_parser)

  rendered_help = create_parser.format_help()

  assert "--sub-slicing" not in rendered_help
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_cluster_create_sub_slicing_is_shown_with_flag_on():
  """With the feature flag on, the help text mentions --sub-slicing."""
  create_parser = argparse.ArgumentParser()
  set_cluster_create_parser(create_parser)

  rendered_help = create_parser.format_help()

  assert "--sub-slicing" in rendered_help
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_cluster_create_sub_slicing_is_false_by_default():
  """When --sub-slicing is not passed, args.sub_slicing is False."""
  create_parser = argparse.ArgumentParser()
  set_cluster_create_parser(create_parser)
  argv = ["--cluster", "test-cluster", "--tpu-type", "test-tpu"]

  parsed = create_parser.parse_args(argv)

  assert parsed.sub_slicing is False
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_cluster_create_sub_slicing_can_be_set():
  """Passing --sub-slicing sets args.sub_slicing to True."""
  create_parser = argparse.ArgumentParser()
  set_cluster_create_parser(create_parser)
  argv = ["--cluster", "test-cluster", "--tpu-type", "test-tpu", "--sub-slicing"]

  parsed = create_parser.parse_args(argv)

  assert parsed.sub_slicing is True
|
xpk/parser/common.py
CHANGED
|
@@ -62,6 +62,17 @@ def add_shared_arguments(
|
|
|
62
62
|
),
|
|
63
63
|
required=required,
|
|
64
64
|
)
|
|
65
|
+
custom_parser_or_group.add_argument(
|
|
66
|
+
'--skip-validation',
|
|
67
|
+
type=bool,
|
|
68
|
+
action=argparse.BooleanOptionalAction,
|
|
69
|
+
default=False,
|
|
70
|
+
help=(
|
|
71
|
+
'Skip dependency validation checks (kubectl, gcloud, docker, etc). '
|
|
72
|
+
'Independent of --dry-run.'
|
|
73
|
+
),
|
|
74
|
+
required=required,
|
|
75
|
+
)
|
|
65
76
|
|
|
66
77
|
|
|
67
78
|
def add_cluster_arguments(
|
xpk/parser/workload.py
CHANGED
|
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from argparse import ArgumentParser
|
|
17
18
|
from ..commands.workload import (
|
|
18
19
|
workload_create,
|
|
19
20
|
workload_create_pathways,
|
|
@@ -23,9 +24,10 @@ from ..commands.workload import (
|
|
|
23
24
|
from ..core.docker_image import DEFAULT_DOCKER_IMAGE, DEFAULT_SCRIPT_DIR
|
|
24
25
|
from .common import add_shared_arguments
|
|
25
26
|
from .validators import directory_path_type, name_type
|
|
27
|
+
from ..utils.feature_flags import FeatureFlags
|
|
26
28
|
|
|
27
29
|
|
|
28
|
-
def set_workload_parsers(workload_parser):
|
|
30
|
+
def set_workload_parsers(workload_parser: ArgumentParser):
|
|
29
31
|
workload_subcommands = workload_parser.add_subparsers(
|
|
30
32
|
title='workload subcommands',
|
|
31
33
|
dest='xpk_workload_subcommands',
|
|
@@ -39,6 +41,28 @@ def set_workload_parsers(workload_parser):
|
|
|
39
41
|
workload_create_parser = workload_subcommands.add_parser(
|
|
40
42
|
'create', help='Create a new job.'
|
|
41
43
|
)
|
|
44
|
+
set_workload_create_parser(workload_create_parser)
|
|
45
|
+
|
|
46
|
+
# "workload create-pathways" command parser.
|
|
47
|
+
workload_create_pathways_parser = workload_subcommands.add_parser(
|
|
48
|
+
'create-pathways', help='Create a new job.'
|
|
49
|
+
)
|
|
50
|
+
set_workload_create_pathways_parser(workload_create_pathways_parser)
|
|
51
|
+
|
|
52
|
+
# "workload delete" command parser.
|
|
53
|
+
workload_delete_parser = workload_subcommands.add_parser(
|
|
54
|
+
'delete', help='Delete job.'
|
|
55
|
+
)
|
|
56
|
+
set_workload_delete_parser(workload_delete_parser)
|
|
57
|
+
|
|
58
|
+
# "workload list" command parser.
|
|
59
|
+
workload_list_parser = workload_subcommands.add_parser(
|
|
60
|
+
'list', help='List jobs.'
|
|
61
|
+
)
|
|
62
|
+
set_workload_list_parser(workload_list_parser)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def set_workload_create_parser(workload_create_parser: ArgumentParser):
|
|
42
66
|
workload_create_parser_required_arguments = (
|
|
43
67
|
workload_create_parser.add_argument_group(
|
|
44
68
|
'Workload Built-in Arguments',
|
|
@@ -193,10 +217,33 @@ def set_workload_parsers(workload_parser):
|
|
|
193
217
|
),
|
|
194
218
|
)
|
|
195
219
|
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
220
|
+
add_shared_workload_create_required_arguments([
|
|
221
|
+
workload_create_parser_required_arguments,
|
|
222
|
+
])
|
|
223
|
+
add_shared_workload_create_optional_arguments([
|
|
224
|
+
workload_create_parser_optional_arguments,
|
|
225
|
+
])
|
|
226
|
+
add_shared_workload_create_env_arguments([
|
|
227
|
+
workload_create_parser_optional_arguments,
|
|
228
|
+
])
|
|
229
|
+
add_shared_workload_base_docker_image_arguments([
|
|
230
|
+
workload_base_docker_image_arguments,
|
|
231
|
+
])
|
|
232
|
+
add_shared_workload_docker_image_arguments([
|
|
233
|
+
workload_docker_image_arguments,
|
|
234
|
+
])
|
|
235
|
+
add_shared_workload_create_tensorboard_arguments([
|
|
236
|
+
workload_vertex_tensorboard_arguments,
|
|
237
|
+
])
|
|
238
|
+
add_shared_workload_create_autoprovisioning_arguments([
|
|
239
|
+
workload_create_autoprovisioning_arguments,
|
|
240
|
+
])
|
|
241
|
+
workload_create_parser.set_defaults(func=workload_create)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def set_workload_create_pathways_parser(
|
|
245
|
+
workload_create_pathways_parser: ArgumentParser,
|
|
246
|
+
):
|
|
200
247
|
workload_create_pathways_parser_required_arguments = (
|
|
201
248
|
workload_create_pathways_parser.add_argument_group(
|
|
202
249
|
'Workload create-pathways Built-in Arguments',
|
|
@@ -232,7 +279,6 @@ def set_workload_parsers(workload_parser):
|
|
|
232
279
|
'Arguments for creating Vertex AI Experiment in workload create.',
|
|
233
280
|
)
|
|
234
281
|
)
|
|
235
|
-
|
|
236
282
|
### "workload create-pathways" Required arguments, specific to Pathways
|
|
237
283
|
workload_create_pathways_parser_required_arguments.add_argument(
|
|
238
284
|
'--tpu-type',
|
|
@@ -353,42 +399,30 @@ def set_workload_parsers(workload_parser):
|
|
|
353
399
|
)
|
|
354
400
|
|
|
355
401
|
add_shared_workload_create_required_arguments([
|
|
356
|
-
workload_create_parser_required_arguments,
|
|
357
402
|
workload_create_pathways_parser_required_arguments,
|
|
358
403
|
])
|
|
359
404
|
add_shared_workload_create_optional_arguments([
|
|
360
|
-
workload_create_parser_optional_arguments,
|
|
361
405
|
workload_create_pathways_parser_optional_arguments,
|
|
362
406
|
])
|
|
363
407
|
add_shared_workload_create_env_arguments([
|
|
364
|
-
workload_create_parser_optional_arguments,
|
|
365
408
|
workload_create_pathways_parser_optional_arguments,
|
|
366
409
|
])
|
|
367
410
|
add_shared_workload_base_docker_image_arguments([
|
|
368
|
-
workload_base_docker_image_arguments,
|
|
369
411
|
workload_create_pathways_base_docker_image_arguments,
|
|
370
412
|
])
|
|
371
413
|
add_shared_workload_docker_image_arguments([
|
|
372
|
-
workload_docker_image_arguments,
|
|
373
414
|
workload_create_pathways_docker_image_arguments,
|
|
374
415
|
])
|
|
375
416
|
add_shared_workload_create_tensorboard_arguments([
|
|
376
|
-
workload_vertex_tensorboard_arguments,
|
|
377
417
|
workload_create_pathways_vertex_tensorboard_arguments,
|
|
378
418
|
])
|
|
379
419
|
add_shared_workload_create_autoprovisioning_arguments([
|
|
380
|
-
workload_create_autoprovisioning_arguments,
|
|
381
420
|
workload_create_pathways_autoprovisioning_arguments,
|
|
382
421
|
])
|
|
383
|
-
|
|
384
|
-
# Set defaults for both workload create and workload create-pathways after adding all shared args.
|
|
385
|
-
workload_create_parser.set_defaults(func=workload_create)
|
|
386
422
|
workload_create_pathways_parser.set_defaults(func=workload_create_pathways)
|
|
387
423
|
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
'delete', help='Delete job.'
|
|
391
|
-
)
|
|
424
|
+
|
|
425
|
+
def set_workload_delete_parser(workload_delete_parser: ArgumentParser):
|
|
392
426
|
workload_delete_parser_required_arguments = (
|
|
393
427
|
workload_delete_parser.add_argument_group(
|
|
394
428
|
'Required Arguments',
|
|
@@ -454,14 +488,10 @@ def set_workload_parsers(workload_parser):
|
|
|
454
488
|
'Forces workload deletion command to run without additional approval.'
|
|
455
489
|
),
|
|
456
490
|
)
|
|
457
|
-
|
|
458
491
|
workload_delete_parser.set_defaults(func=workload_delete)
|
|
459
492
|
|
|
460
|
-
# "workload list" command parser.
|
|
461
|
-
workload_list_parser = workload_subcommands.add_parser(
|
|
462
|
-
'list', help='List jobs.'
|
|
463
|
-
)
|
|
464
493
|
|
|
494
|
+
def set_workload_list_parser(workload_list_parser: ArgumentParser):
|
|
465
495
|
workload_list_parser.add_argument(
|
|
466
496
|
'--cluster',
|
|
467
497
|
type=name_type,
|
|
@@ -629,6 +659,13 @@ def add_shared_workload_create_optional_arguments(args_parsers):
|
|
|
629
659
|
' the workload.'
|
|
630
660
|
),
|
|
631
661
|
)
|
|
662
|
+
if FeatureFlags.SUB_SLICING_ENABLED:
|
|
663
|
+
custom_parser.add_argument(
|
|
664
|
+
'--sub-slicing-topology',
|
|
665
|
+
type=str,
|
|
666
|
+
help='Sub-slicing topology to use.',
|
|
667
|
+
required=False,
|
|
668
|
+
)
|
|
632
669
|
|
|
633
670
|
|
|
634
671
|
def add_shared_workload_create_env_arguments(args_parsers):
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
from xpk.parser.workload import set_workload_create_parser
|
|
19
|
+
from ..utils.feature_flags import FeatureFlags
|
|
20
|
+
import pytest
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.fixture(autouse=True)
def with_sub_slicing_enabled():
  """Turn the sub-slicing feature flag on for each test in this module.

  The flag lives on a shared object, so the value seen before the test is put
  back afterwards; otherwise the override (or a test's own override) would
  leak into whichever test module runs next.
  """
  previous = FeatureFlags.SUB_SLICING_ENABLED
  FeatureFlags.SUB_SLICING_ENABLED = True
  yield
  # Runs after the test body, including tests that flipped the flag themselves.
  FeatureFlags.SUB_SLICING_ENABLED = previous
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_workload_create_sub_slicing_topology_is_hidden_with_flag_off():
  """With the feature flag off, the help text has no --sub-slicing option."""
  FeatureFlags.SUB_SLICING_ENABLED = False
  create_parser = argparse.ArgumentParser()
  set_workload_create_parser(create_parser)

  rendered_help = create_parser.format_help()

  assert "--sub-slicing" not in rendered_help
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_workload_create_sub_slicing_topology_is_shown_with_flag_on():
  """With the feature flag on, the help text contains a --sub-slicing option."""
  create_parser = argparse.ArgumentParser()
  set_workload_create_parser(create_parser)

  rendered_help = create_parser.format_help()

  assert "--sub-slicing" in rendered_help
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_workload_create_sub_slicing_topology_is_none_by_default():
  """When --sub-slicing-topology is not passed, the parsed value is None."""
  create_parser = argparse.ArgumentParser()
  set_workload_create_parser(create_parser)
  argv = [
      "--cluster",
      "test-cluster",
      "--command",
      "python3",
      "--workload",
      "test",
      "--tpu-type",
      "test-tpu",
  ]

  parsed = create_parser.parse_args(argv)

  assert parsed.sub_slicing_topology is None
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_workload_create_sub_slicing_topology_can_be_set():
  """Passing --sub-slicing-topology 2x2 stores "2x2" on the parsed args."""
  parser = argparse.ArgumentParser()

  set_workload_create_parser(parser)
  args = parser.parse_args([
      "--cluster",
      "test-cluster",
      "--command",
      "python3",
      "--workload",
      "test",
      "--tpu-type",
      "test-tpu",
      "--sub-slicing-topology",
      "2x2",
  ])

  # Compare by value. `is` against a str literal checks object identity, which
  # only happens to hold when both strings are the same interned object, and
  # CPython emits a SyntaxWarning for it.
  assert args.sub_slicing_topology == "2x2"
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _get_boolean_flag(flag: str, default: bool) -> bool:
|
|
21
|
+
return os.getenv(flag, str(default)).lower() == "true"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class _FeatureFlags:
  """Holder for feature switches resolved from environment variables."""

  # Evaluated once, when this class body runs; an unset variable means off.
  SUB_SLICING_ENABLED = _get_boolean_flag(
      "SUB_SLICING_ENABLED",
      default=False,
  )


# Module-level instance; this is the object imported as `FeatureFlags`.
FeatureFlags = _FeatureFlags()
|
xpk/utils/kueue.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def is_queued_cluster(num_slices: int) -> bool:
  """Determines if admission checks should be enabled and cluster queued.

  Args:
    num_slices: slice count to evaluate.

  Returns:
    True when `num_slices` is at most one, False otherwise.
  """
  if num_slices > 1:
    return False
  return True
|
xpk/utils/templates.py
CHANGED
xpk/utils/topology.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from functools import reduce
|
|
18
|
+
from operator import mul
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def is_topology_valid(topology: str) -> bool:
  """Tell whether `topology` is accepted by `parse_topology`.

  Returns:
    False if `parse_topology` raises ValueError for the string, True if it
    returns normally.
  """
  try:
    parse_topology(topology)
  except ValueError:
    return False
  else:
    return True
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_topology_product(topology: str) -> int:
  """Multiply together every dimension of a topology string.

  Parsing is delegated to `parse_topology`, so any error it raises propagates
  to the caller.
  """
  product = 1
  for dimension in parse_topology(topology):
    product = product * dimension
  return product
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def parse_topology(topology: str) -> list[int]:
  """Split an "AxBxC"-style topology string into its integer dimensions.

  Args:
    topology: dimensions separated by "x" (upper or lower case), e.g. "2x2x4".

  Returns:
    The dimensions as integers, in the order they appear in the string.

  Raises:
    ValueError: if the string is empty, if any part is not an integer, or if
      any dimension is zero or negative.
  """
  if not topology:
    raise ValueError("Topology is an empty string")

  dimensions = [int(el) for el in topology.lower().split("x")]
  # int() alone accepts "0" and "-1"; reject them so that is_topology_valid and
  # get_topology_product never work with a zero or negative dimension.
  if any(dimension <= 0 for dimension in dimensions):
    raise ValueError(f"Topology dimensions must be positive: {topology!r}")
  return dimensions
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import pytest
|
|
18
|
+
from .topology import is_topology_valid, get_topology_product, parse_topology
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_is_topology_valid_with_invalid_topology():
  """A string that is not a topology ("N/A") is reported as invalid."""
  assert is_topology_valid("N/A") is False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_is_topology_valid_with_valid_topology():
  """A well-formed topology string ("1x1x1") is reported as valid."""
  assert is_topology_valid("1x1x1") is True
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_parse_topology_with_valid_topology():
  """"1x2x3" parses to its dimensions in order."""
  expected = [1, 2, 3]
  assert parse_topology("1x2x3") == expected
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_parse_topology_with_empty_input():
  """An empty topology string makes parse_topology raise ValueError."""
  empty_topology = ""
  with pytest.raises(ValueError):
    parse_topology(empty_topology)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_get_topology_product():
  """The product of the dimensions of "1x2x3" is 6."""
  assert get_topology_product("1x2x3") == 6
|