xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +33 -12
- xpk/commands/cluster_gcluster_test.py +5 -1
- xpk/commands/cluster_test.py +125 -0
- xpk/commands/config.py +3 -3
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +2 -0
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/workload.py +124 -139
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +3 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +2 -0
- xpk/core/cluster.py +18 -47
- xpk/core/cluster_test.py +76 -1
- xpk/core/config.py +81 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/kjob.py +17 -16
- xpk/core/kueue_manager.py +13 -19
- xpk/core/kueue_manager_test.py +27 -1
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +17 -15
- xpk/core/nodepool_test.py +25 -4
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +128 -132
- xpk/core/scheduling_test.py +215 -2
- xpk/core/system_characteristics.py +179 -0
- xpk/core/system_characteristics_test.py +49 -1
- xpk/core/telemetry.py +4 -4
- xpk/core/telemetry_test.py +9 -9
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +2 -0
- xpk/parser/cluster.py +22 -88
- xpk/parser/cluster_test.py +41 -0
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -41
- xpk/parser/workload_test.py +2 -48
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/utils/feature_flags.py +3 -0
- xpk/utils/validation.py +2 -2
- xpk-0.16.0.dist-info/METADATA +127 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
- xpk-0.15.0.dist-info/METADATA +0 -1666
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
|
@@ -39,12 +39,16 @@ def decorate_job(job_manifest: dict) -> dict:
|
|
|
39
39
|
return job_manifest
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
def decorate_jobset(
|
|
42
|
+
def decorate_jobset( # pylint: disable=dangerous-default-value
|
|
43
|
+
jobset_manifest_str: str,
|
|
44
|
+
sub_networks: list[str] = [], # pylint: disable=unused-argument
|
|
45
|
+
) -> str:
|
|
43
46
|
"""
|
|
44
47
|
Decorates a JobSet manifest with the necessary components for tcpxo-daemon.
|
|
45
48
|
|
|
46
49
|
Args:
|
|
47
50
|
jobset_manifest_str: The JobSet manifest as a YAML string.
|
|
51
|
+
sub_networks: This parameter is accepted for interface consistency but is not used.
|
|
48
52
|
|
|
49
53
|
Returns:
|
|
50
54
|
The modified JobSet manifest as a YAML string.
|
xpk/main.py
CHANGED
|
@@ -37,6 +37,7 @@ import sys
|
|
|
37
37
|
|
|
38
38
|
from .parser.core import set_parser
|
|
39
39
|
from .core.updates import print_xpk_hello
|
|
40
|
+
from .core.config import set_config, FileSystemConfig
|
|
40
41
|
from .core.telemetry import MetricsCollector, send_clearcut_payload, should_send_telemetry
|
|
41
42
|
from .utils.console import xpk_print, exit_code_to_int
|
|
42
43
|
from .utils.execution_context import set_context
|
|
@@ -69,6 +70,7 @@ def main() -> None:
|
|
|
69
70
|
|
|
70
71
|
main_args = parser.parse_args()
|
|
71
72
|
main_args.enable_ray_cluster = False
|
|
73
|
+
set_config(FileSystemConfig())
|
|
72
74
|
set_context(
|
|
73
75
|
dry_run_value='dry_run' in main_args and main_args.dry_run,
|
|
74
76
|
quiet_value=(
|
xpk/parser/cluster.py
CHANGED
|
@@ -26,11 +26,10 @@ from ..commands.cluster import (
|
|
|
26
26
|
cluster_describe,
|
|
27
27
|
cluster_list,
|
|
28
28
|
)
|
|
29
|
-
from ..core.config import
|
|
30
|
-
from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType
|
|
29
|
+
from ..core.config import get_config
|
|
31
30
|
from ..core.config import CFG_BUCKET_KEY
|
|
32
31
|
from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
|
|
33
|
-
from .common import add_shared_arguments, ParserOrArgumentGroup
|
|
32
|
+
from .common import add_shared_arguments, ParserOrArgumentGroup, add_tpu_type_argument, add_tpu_and_device_type_arguments
|
|
34
33
|
from .validators import name_type
|
|
35
34
|
from ..utils.feature_flags import FeatureFlags
|
|
36
35
|
|
|
@@ -99,27 +98,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
99
98
|
required=True
|
|
100
99
|
)
|
|
101
100
|
)
|
|
102
|
-
cluster_device_group
|
|
103
|
-
'--tpu-type',
|
|
104
|
-
type=str,
|
|
105
|
-
default=None,
|
|
106
|
-
help='The tpu type to use, v5litepod-16, etc.',
|
|
107
|
-
metavar='TPU_TYPE',
|
|
108
|
-
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
109
|
-
[AcceleratorType.TPU]
|
|
110
|
-
),
|
|
111
|
-
)
|
|
112
|
-
cluster_device_group.add_argument(
|
|
113
|
-
'--device-type',
|
|
114
|
-
type=str,
|
|
115
|
-
default=None,
|
|
116
|
-
help=(
|
|
117
|
-
'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
|
|
118
|
-
' h100-80gb-8, n2-standard-32-4 etc.'
|
|
119
|
-
),
|
|
120
|
-
metavar='DEVICE_TYPE',
|
|
121
|
-
choices=get_system_characteristics_keys_by_accelerator_type(),
|
|
122
|
-
)
|
|
101
|
+
add_tpu_and_device_type_arguments(cluster_device_group)
|
|
123
102
|
|
|
124
103
|
### Optional arguments specific to "cluster create"
|
|
125
104
|
cluster_create_optional_arguments = cluster_create_parser.add_argument_group(
|
|
@@ -131,7 +110,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
131
110
|
cluster_create_optional_arguments.add_argument(
|
|
132
111
|
'--cluster-state-gcs-bucket',
|
|
133
112
|
type=str,
|
|
134
|
-
default=
|
|
113
|
+
default=get_config().get(CFG_BUCKET_KEY),
|
|
135
114
|
help='The name of the bucket to store cluster state.',
|
|
136
115
|
required=False,
|
|
137
116
|
)
|
|
@@ -150,6 +129,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
150
129
|
' enable cluster to accept Pathways workloads.'
|
|
151
130
|
),
|
|
152
131
|
)
|
|
132
|
+
|
|
153
133
|
if FeatureFlags.SUB_SLICING_ENABLED:
|
|
154
134
|
add_cluster_create_sub_slicing_arguments(cluster_create_optional_arguments)
|
|
155
135
|
|
|
@@ -207,15 +187,8 @@ def set_cluster_create_pathways_parser(
|
|
|
207
187
|
add_shared_cluster_create_required_arguments(
|
|
208
188
|
cluster_create_pathways_required_arguments
|
|
209
189
|
)
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
type=str,
|
|
213
|
-
default=None,
|
|
214
|
-
help='The tpu type to use, v5litepod-16, etc.',
|
|
215
|
-
metavar='TPU_TYPE',
|
|
216
|
-
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
217
|
-
[AcceleratorType.TPU]
|
|
218
|
-
),
|
|
190
|
+
add_tpu_type_argument(
|
|
191
|
+
cluster_create_pathways_required_arguments, required=True
|
|
219
192
|
)
|
|
220
193
|
|
|
221
194
|
### Optional arguments specific to "cluster create-pathways"
|
|
@@ -292,17 +265,8 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
|
|
|
292
265
|
add_shared_cluster_create_required_arguments(
|
|
293
266
|
cluster_create_ray_required_arguments
|
|
294
267
|
)
|
|
295
|
-
cluster_create_ray_required_arguments
|
|
296
|
-
|
|
297
|
-
type=str,
|
|
298
|
-
default=None,
|
|
299
|
-
help='The tpu type to use, v5litepod-16, etc.',
|
|
300
|
-
required=True,
|
|
301
|
-
metavar='TPU_TYPE',
|
|
302
|
-
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
303
|
-
[AcceleratorType.TPU]
|
|
304
|
-
),
|
|
305
|
-
)
|
|
268
|
+
add_tpu_type_argument(cluster_create_ray_required_arguments, required=True)
|
|
269
|
+
|
|
306
270
|
# TODO(bzmarke): Add --device-type to support GPU/CPU
|
|
307
271
|
cluster_create_ray_required_arguments.add_argument(
|
|
308
272
|
'--ray-version',
|
|
@@ -392,7 +356,7 @@ def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
|
|
|
392
356
|
cluster_delete_optional_arguments.add_argument(
|
|
393
357
|
'--cluster-state-gcs-bucket',
|
|
394
358
|
type=str,
|
|
395
|
-
default=
|
|
359
|
+
default=get_config().get(CFG_BUCKET_KEY),
|
|
396
360
|
help='The name of the bucket to store cluster state.',
|
|
397
361
|
required=False,
|
|
398
362
|
)
|
|
@@ -421,27 +385,7 @@ def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
|
|
|
421
385
|
)
|
|
422
386
|
|
|
423
387
|
### Device Type Argument
|
|
424
|
-
cluster_cacheimage_group
|
|
425
|
-
'--tpu-type',
|
|
426
|
-
type=str,
|
|
427
|
-
default=None,
|
|
428
|
-
help='The tpu type to cache images on, v5litepod-16, etc.',
|
|
429
|
-
metavar='TPU_TYPE',
|
|
430
|
-
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
431
|
-
[AcceleratorType.TPU]
|
|
432
|
-
),
|
|
433
|
-
)
|
|
434
|
-
cluster_cacheimage_group.add_argument(
|
|
435
|
-
'--device-type',
|
|
436
|
-
type=str,
|
|
437
|
-
default=None,
|
|
438
|
-
help=(
|
|
439
|
-
'The device type to cache images on (can be tpu or gpu),'
|
|
440
|
-
' v5litepod-16, h100-80gb-8, etc.'
|
|
441
|
-
),
|
|
442
|
-
metavar='DEVICE_TYPE',
|
|
443
|
-
choices=get_system_characteristics_keys_by_accelerator_type(),
|
|
444
|
-
)
|
|
388
|
+
add_tpu_and_device_type_arguments(cluster_cacheimage_group)
|
|
445
389
|
|
|
446
390
|
### Required arguments
|
|
447
391
|
cluster_cacheimage_required_arguments.add_argument(
|
|
@@ -526,27 +470,7 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
|
|
|
526
470
|
required=True
|
|
527
471
|
)
|
|
528
472
|
)
|
|
529
|
-
cluster_adapt_device_group
|
|
530
|
-
'--tpu-type',
|
|
531
|
-
type=str,
|
|
532
|
-
default=None,
|
|
533
|
-
help='The tpu type used on cluster, v5litepod-16, etc.',
|
|
534
|
-
metavar='TPU_TYPE',
|
|
535
|
-
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
536
|
-
[AcceleratorType.TPU]
|
|
537
|
-
),
|
|
538
|
-
)
|
|
539
|
-
cluster_adapt_device_group.add_argument(
|
|
540
|
-
'--device-type',
|
|
541
|
-
type=str,
|
|
542
|
-
default=None,
|
|
543
|
-
help=(
|
|
544
|
-
'The device type used on cluster (can be tpu or gpu or cpu), eg.'
|
|
545
|
-
' h100-80gb-8, n2-standard-32-4 etc.'
|
|
546
|
-
),
|
|
547
|
-
metavar='DEVICE_TYPE',
|
|
548
|
-
choices=get_system_characteristics_keys_by_accelerator_type(),
|
|
549
|
-
)
|
|
473
|
+
add_tpu_and_device_type_arguments(cluster_adapt_device_group)
|
|
550
474
|
|
|
551
475
|
cluster_adapt_optional_arguments = cluster_adapt_parser.add_argument_group(
|
|
552
476
|
'Optional Arguments',
|
|
@@ -691,6 +615,11 @@ def add_shared_cluster_create_optional_arguments(
|
|
|
691
615
|
' regional clusters, all zones must support the machine type.'
|
|
692
616
|
),
|
|
693
617
|
)
|
|
618
|
+
parser_or_group.add_argument(
|
|
619
|
+
'--managed-mldiagnostics',
|
|
620
|
+
action='store_true',
|
|
621
|
+
help='Enables the installation of required ML Diagnostics components.',
|
|
622
|
+
)
|
|
694
623
|
parser_or_group.add_argument(
|
|
695
624
|
'--cluster-cpu-machine-type',
|
|
696
625
|
type=str,
|
|
@@ -819,6 +748,11 @@ def add_driver_arguments(parser_or_group: ParserOrArgumentGroup):
|
|
|
819
748
|
action='store_true',
|
|
820
749
|
help='Enable Lustre CSI driver on the cluster.',
|
|
821
750
|
)
|
|
751
|
+
parser_or_group.add_argument(
|
|
752
|
+
'--enable-legacy-lustre-port',
|
|
753
|
+
action='store_true',
|
|
754
|
+
help='Enable legacy port for Lustre CSI driver on the cluster.',
|
|
755
|
+
)
|
|
822
756
|
|
|
823
757
|
|
|
824
758
|
def add_shared_cluster_create_tensorboard_arguments(
|
xpk/parser/cluster_test.py
CHANGED
|
@@ -103,3 +103,44 @@ def test_cluster_create_ray_sub_slicing_is_hidden_but_set_to_false():
|
|
|
103
103
|
|
|
104
104
|
assert args.sub_slicing is False
|
|
105
105
|
assert "--sub-slicing" not in help_str
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def test_cluster_create_managed_mldiagnostics():
|
|
109
|
+
parser = argparse.ArgumentParser()
|
|
110
|
+
|
|
111
|
+
set_cluster_create_parser(parser)
|
|
112
|
+
args = parser.parse_args([
|
|
113
|
+
"--cluster",
|
|
114
|
+
"test-cluster",
|
|
115
|
+
"--tpu-type",
|
|
116
|
+
"v5p-8",
|
|
117
|
+
"--managed-mldiagnostics",
|
|
118
|
+
])
|
|
119
|
+
|
|
120
|
+
assert args.managed_mldiagnostics is True
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def test_cluster_create_enable_lustre_legacy_port_is_false_by_default():
|
|
124
|
+
parser = argparse.ArgumentParser()
|
|
125
|
+
|
|
126
|
+
set_cluster_create_parser(parser)
|
|
127
|
+
args = parser.parse_args(
|
|
128
|
+
["--cluster", "test-cluster", "--tpu-type", "tpu7x-2"]
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
assert args.enable_legacy_lustre_port is False
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_cluster_create_enable_lustre_legacy_port_can_be_set():
|
|
135
|
+
parser = argparse.ArgumentParser()
|
|
136
|
+
|
|
137
|
+
set_cluster_create_parser(parser)
|
|
138
|
+
args = parser.parse_args([
|
|
139
|
+
"--cluster",
|
|
140
|
+
"test-cluster",
|
|
141
|
+
"--tpu-type",
|
|
142
|
+
"tpu7x-2",
|
|
143
|
+
"--enable-legacy-lustre-port",
|
|
144
|
+
])
|
|
145
|
+
|
|
146
|
+
assert args.enable_legacy_lustre_port is True
|
xpk/parser/common.py
CHANGED
|
@@ -16,6 +16,10 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import argparse
|
|
18
18
|
from typing import Protocol, Any
|
|
19
|
+
from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType
|
|
20
|
+
import difflib
|
|
21
|
+
from argcomplete import ChoicesCompleter
|
|
22
|
+
from argparse import Action, ArgumentError
|
|
19
23
|
|
|
20
24
|
|
|
21
25
|
class ParserOrArgumentGroup(Protocol):
|
|
@@ -24,6 +28,46 @@ class ParserOrArgumentGroup(Protocol):
|
|
|
24
28
|
...
|
|
25
29
|
|
|
26
30
|
|
|
31
|
+
class ManyChoicesAction(Action):
|
|
32
|
+
"""An action class to output better error message for arguments with large lists of choices."""
|
|
33
|
+
|
|
34
|
+
def __init__(self, *args, large_choice_list, **kwargs):
|
|
35
|
+
self.large_list_of_choices = large_choice_list
|
|
36
|
+
super().__init__(*args, **kwargs)
|
|
37
|
+
|
|
38
|
+
def __call__(self, parser, namespace, value, option_string=None):
|
|
39
|
+
if value not in self.large_list_of_choices:
|
|
40
|
+
close_matches = difflib.get_close_matches(
|
|
41
|
+
value, self.large_list_of_choices, n=5, cutoff=0
|
|
42
|
+
)
|
|
43
|
+
msg = (
|
|
44
|
+
f"invalid choice: '{value}' (closest matches:"
|
|
45
|
+
f" {', '.join(close_matches)})"
|
|
46
|
+
)
|
|
47
|
+
raise ArgumentError(self, msg)
|
|
48
|
+
setattr(namespace, self.dest, value)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def add_many_choices_argument(
|
|
52
|
+
parserOrGroup: ParserOrArgumentGroup,
|
|
53
|
+
flag_name,
|
|
54
|
+
choices: list[str],
|
|
55
|
+
metavar: str,
|
|
56
|
+
help_msg: str,
|
|
57
|
+
required: bool = False,
|
|
58
|
+
) -> None:
|
|
59
|
+
parserOrGroup.add_argument(
|
|
60
|
+
flag_name,
|
|
61
|
+
action=ManyChoicesAction,
|
|
62
|
+
large_choice_list=choices,
|
|
63
|
+
type=str,
|
|
64
|
+
metavar=metavar,
|
|
65
|
+
help=help_msg,
|
|
66
|
+
required=required,
|
|
67
|
+
default=None,
|
|
68
|
+
).completer = ChoicesCompleter(choices)
|
|
69
|
+
|
|
70
|
+
|
|
27
71
|
def add_shared_arguments(
|
|
28
72
|
custom_parser_or_group: ParserOrArgumentGroup, required=False
|
|
29
73
|
) -> None:
|
|
@@ -285,3 +329,43 @@ def add_slurm_arguments(custom_parser_or_group: ParserOrArgumentGroup):
|
|
|
285
329
|
' `very-high`. Defaults to `medium`.'
|
|
286
330
|
),
|
|
287
331
|
)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def add_tpu_type_argument(
|
|
335
|
+
custom_parser_or_group: ParserOrArgumentGroup,
|
|
336
|
+
required: bool = False,
|
|
337
|
+
) -> None:
|
|
338
|
+
add_many_choices_argument(
|
|
339
|
+
custom_parser_or_group,
|
|
340
|
+
'--tpu-type',
|
|
341
|
+
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
342
|
+
[AcceleratorType.TPU]
|
|
343
|
+
),
|
|
344
|
+
metavar='TPU_TYPE',
|
|
345
|
+
help_msg='The tpu type to use, v5litepod-16, etc.',
|
|
346
|
+
required=required,
|
|
347
|
+
)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def add_device_type_argument(
|
|
351
|
+
custom_parser_or_group: ParserOrArgumentGroup,
|
|
352
|
+
required: bool = False,
|
|
353
|
+
) -> None:
|
|
354
|
+
add_many_choices_argument(
|
|
355
|
+
custom_parser_or_group,
|
|
356
|
+
'--device-type',
|
|
357
|
+
choices=get_system_characteristics_keys_by_accelerator_type(),
|
|
358
|
+
metavar='DEVICE_TYPE',
|
|
359
|
+
help_msg=(
|
|
360
|
+
'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
|
|
361
|
+
' h100-80gb-8, n2-standard-32-4 etc.'
|
|
362
|
+
),
|
|
363
|
+
required=required,
|
|
364
|
+
)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def add_tpu_and_device_type_arguments(
|
|
368
|
+
custom_parser_or_group: ParserOrArgumentGroup,
|
|
369
|
+
) -> None:
|
|
370
|
+
add_tpu_type_argument(custom_parser_or_group)
|
|
371
|
+
add_device_type_argument(custom_parser_or_group)
|
xpk/parser/storage.py
CHANGED
|
@@ -104,6 +104,16 @@ def add_storage_attach_parser(
|
|
|
104
104
|
help='If true workloads can only read from storage',
|
|
105
105
|
)
|
|
106
106
|
|
|
107
|
+
lustre_args = storage_attach_parser.add_argument_group(
|
|
108
|
+
'Lustre arguments',
|
|
109
|
+
'Arguments used when --type=lustre',
|
|
110
|
+
)
|
|
111
|
+
lustre_args.add_argument(
|
|
112
|
+
'--enable-legacy-lustre-port',
|
|
113
|
+
action='store_true',
|
|
114
|
+
help='Enable legacy port for Lustre CSI driver on the cluster.',
|
|
115
|
+
)
|
|
116
|
+
|
|
107
117
|
gcsfuse_args = storage_attach_parser.add_argument_group(
|
|
108
118
|
'FUSE arguments',
|
|
109
119
|
'Arguments used when --type=gcsfuse',
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
from xpk.parser.storage import set_storage_parser
|
|
19
|
+
|
|
20
|
+
DEFAULT_ATTACH_ARGUMENTS = (
|
|
21
|
+
"attach test-storage --cluster test-cluster --zone test-zone"
|
|
22
|
+
" --project test-project --mount-point test-mount-point"
|
|
23
|
+
" --readonly false --auto-mount true"
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
DEFAULT_LUSTRE_ATTACH_ARGUMENTS = (
|
|
27
|
+
DEFAULT_ATTACH_ARGUMENTS + " --type lustre --manifest test-manifest"
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_cluster_create_enable_lustre_legacy_port_is_false_by_default():
|
|
32
|
+
parser = argparse.ArgumentParser()
|
|
33
|
+
|
|
34
|
+
set_storage_parser(parser)
|
|
35
|
+
args = parser.parse_args(DEFAULT_LUSTRE_ATTACH_ARGUMENTS.split())
|
|
36
|
+
|
|
37
|
+
assert args.enable_legacy_lustre_port is False
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_cluster_create_enable_lustre_legacy_port_can_be_set():
|
|
41
|
+
parser = argparse.ArgumentParser()
|
|
42
|
+
set_storage_parser(parser)
|
|
43
|
+
args = parser.parse_args(
|
|
44
|
+
DEFAULT_LUSTRE_ATTACH_ARGUMENTS.split() + ["--enable-legacy-lustre-port"]
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
assert args.enable_legacy_lustre_port is True
|
xpk/parser/workload.py
CHANGED
|
@@ -22,10 +22,8 @@ from ..commands.workload import (
|
|
|
22
22
|
workload_list,
|
|
23
23
|
)
|
|
24
24
|
from ..core.docker_image import DEFAULT_DOCKER_IMAGE, DEFAULT_SCRIPT_DIR
|
|
25
|
-
from .common import add_shared_arguments
|
|
25
|
+
from .common import add_shared_arguments, add_tpu_type_argument, add_tpu_and_device_type_arguments
|
|
26
26
|
from .validators import directory_path_type, name_type
|
|
27
|
-
from ..utils.feature_flags import FeatureFlags
|
|
28
|
-
from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType, SUB_SLICING_TOPOLOGIES
|
|
29
27
|
|
|
30
28
|
|
|
31
29
|
def set_workload_parsers(workload_parser: ArgumentParser):
|
|
@@ -119,27 +117,7 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
|
|
|
119
117
|
required=True
|
|
120
118
|
)
|
|
121
119
|
)
|
|
122
|
-
workload_device_group
|
|
123
|
-
'--tpu-type',
|
|
124
|
-
type=str,
|
|
125
|
-
default=None,
|
|
126
|
-
help='The tpu type to use, v5litepod-16, etc.',
|
|
127
|
-
metavar='TPU_TYPE',
|
|
128
|
-
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
129
|
-
[AcceleratorType.TPU]
|
|
130
|
-
),
|
|
131
|
-
)
|
|
132
|
-
workload_device_group.add_argument(
|
|
133
|
-
'--device-type',
|
|
134
|
-
type=str,
|
|
135
|
-
default=None,
|
|
136
|
-
help=(
|
|
137
|
-
'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
|
|
138
|
-
' h100-80gb-8, n2-standard-32-4 etc.'
|
|
139
|
-
),
|
|
140
|
-
metavar='DEVICE_TYPE',
|
|
141
|
-
choices=get_system_characteristics_keys_by_accelerator_type(),
|
|
142
|
-
)
|
|
120
|
+
add_tpu_and_device_type_arguments(workload_device_group)
|
|
143
121
|
|
|
144
122
|
workload_create_parser_optional_arguments.add_argument(
|
|
145
123
|
'--storage',
|
|
@@ -287,15 +265,8 @@ def set_workload_create_pathways_parser(
|
|
|
287
265
|
)
|
|
288
266
|
)
|
|
289
267
|
### "workload create-pathways" Required arguments, specific to Pathways
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
type=str,
|
|
293
|
-
default=None,
|
|
294
|
-
help='The tpu type to use, v5litepod-16, etc.',
|
|
295
|
-
metavar='TPU_TYPE',
|
|
296
|
-
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
297
|
-
[AcceleratorType.TPU]
|
|
298
|
-
),
|
|
268
|
+
add_tpu_type_argument(
|
|
269
|
+
workload_create_pathways_parser_required_arguments, required=True
|
|
299
270
|
)
|
|
300
271
|
|
|
301
272
|
### "workload create-pathways" Optional arguments, specific to Pathways
|
|
@@ -612,6 +583,16 @@ def add_shared_workload_create_optional_arguments(args_parsers):
|
|
|
612
583
|
' `jax-tpu`.'
|
|
613
584
|
),
|
|
614
585
|
)
|
|
586
|
+
custom_parser.add_argument(
|
|
587
|
+
'--output-manifest-file',
|
|
588
|
+
type=str,
|
|
589
|
+
default=None,
|
|
590
|
+
help=(
|
|
591
|
+
'If you want to see the generated manifest, provide a file path'
|
|
592
|
+
' here. This will write the manifest to the file. If used with'
|
|
593
|
+
' --dry-run, it will skip the actual deployment and cluster checks.'
|
|
594
|
+
),
|
|
595
|
+
)
|
|
615
596
|
custom_parser.add_argument(
|
|
616
597
|
'--num-slices',
|
|
617
598
|
type=int,
|
|
@@ -670,14 +651,6 @@ def add_shared_workload_create_optional_arguments(args_parsers):
|
|
|
670
651
|
' the workload.'
|
|
671
652
|
),
|
|
672
653
|
)
|
|
673
|
-
if FeatureFlags.SUB_SLICING_ENABLED:
|
|
674
|
-
custom_parser.add_argument(
|
|
675
|
-
'--sub-slicing-topology',
|
|
676
|
-
type=str,
|
|
677
|
-
help='Sub-slicing topology to use.',
|
|
678
|
-
required=False,
|
|
679
|
-
choices=SUB_SLICING_TOPOLOGIES,
|
|
680
|
-
)
|
|
681
654
|
|
|
682
655
|
|
|
683
656
|
def add_shared_workload_create_env_arguments(args_parsers):
|
xpk/parser/workload_test.py
CHANGED
|
@@ -16,35 +16,9 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import argparse
|
|
18
18
|
from xpk.parser.workload import set_workload_create_parser
|
|
19
|
-
from ..utils.feature_flags import FeatureFlags
|
|
20
|
-
import pytest
|
|
21
19
|
|
|
22
20
|
|
|
23
|
-
|
|
24
|
-
def with_sub_slicing_enabled():
|
|
25
|
-
FeatureFlags.SUB_SLICING_ENABLED = True
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def test_workload_create_sub_slicing_topology_is_hidden_with_flag_off():
|
|
29
|
-
FeatureFlags.SUB_SLICING_ENABLED = False
|
|
30
|
-
parser = argparse.ArgumentParser()
|
|
31
|
-
|
|
32
|
-
set_workload_create_parser(parser)
|
|
33
|
-
help_str = parser.format_help()
|
|
34
|
-
|
|
35
|
-
assert "--sub-slicing" not in help_str
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def test_workload_create_sub_slicing_topology_is_shown_with_flag_on():
|
|
39
|
-
parser = argparse.ArgumentParser()
|
|
40
|
-
|
|
41
|
-
set_workload_create_parser(parser)
|
|
42
|
-
help_str = parser.format_help()
|
|
43
|
-
|
|
44
|
-
assert "--sub-slicing" in help_str
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def test_workload_create_sub_slicing_topology_is_none_by_default():
|
|
21
|
+
def test_workload_create_parses():
|
|
48
22
|
parser = argparse.ArgumentParser()
|
|
49
23
|
|
|
50
24
|
set_workload_create_parser(parser)
|
|
@@ -59,24 +33,4 @@ def test_workload_create_sub_slicing_topology_is_none_by_default():
|
|
|
59
33
|
"tpu7x-2",
|
|
60
34
|
])
|
|
61
35
|
|
|
62
|
-
assert args
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def test_workload_create_sub_slicing_topology_can_be_set():
|
|
66
|
-
parser = argparse.ArgumentParser()
|
|
67
|
-
|
|
68
|
-
set_workload_create_parser(parser)
|
|
69
|
-
args = parser.parse_args([
|
|
70
|
-
"--cluster",
|
|
71
|
-
"test-cluster",
|
|
72
|
-
"--command",
|
|
73
|
-
"python3",
|
|
74
|
-
"--workload",
|
|
75
|
-
"test",
|
|
76
|
-
"--tpu-type",
|
|
77
|
-
"tpu7x-8",
|
|
78
|
-
"--sub-slicing-topology",
|
|
79
|
-
"2x4",
|
|
80
|
-
])
|
|
81
|
-
|
|
82
|
-
assert args.sub_slicing_topology is "2x4"
|
|
36
|
+
assert args
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
apiVersion: jobset.x-k8s.io/v1alpha2
|
|
2
|
+
kind: JobSet
|
|
3
|
+
metadata:
|
|
4
|
+
name: {{ workload }}
|
|
5
|
+
labels:
|
|
6
|
+
kueue.x-k8s.io/queue-name: multislice-queue # Name of the LocalQueue
|
|
7
|
+
xpk.google.com/workload: {{ workload }}
|
|
8
|
+
spec:
|
|
9
|
+
ttlSecondsAfterFinished: {{ ttl_seconds_after_finished }}
|
|
10
|
+
failurePolicy:
|
|
11
|
+
{{ failure_policy_rules }}
|
|
12
|
+
maxRestarts: {{ max_restarts }}
|
|
13
|
+
replicatedJobs:
|
|
14
|
+
- name: slice-job
|
|
15
|
+
replicas: 1
|
|
16
|
+
template:
|
|
17
|
+
spec:
|
|
18
|
+
parallelism: {{ num_nodes }}
|
|
19
|
+
completions: {{ num_nodes }}
|
|
20
|
+
backoffLimit: 0 # When any pod fails, the job is failed
|
|
21
|
+
{{ pod_failure_policy }}
|
|
22
|
+
template:
|
|
23
|
+
metadata:
|
|
24
|
+
labels:
|
|
25
|
+
xpk.google.com/workload: {{ workload }}
|
|
26
|
+
annotations:
|
|
27
|
+
{{ annotations }}
|
|
28
|
+
spec:
|
|
29
|
+
priorityClassName: {{ priority }}
|
|
30
|
+
restartPolicy: Never
|
|
31
|
+
nodeSelector:
|
|
32
|
+
{{ placement_policy_label }}
|
|
33
|
+
imagePullSecrets:
|
|
34
|
+
- name: {{ docker_image_pull_secret }}
|
|
35
|
+
dnsPolicy: ClusterFirstWithHostNet
|
|
36
|
+
terminationGracePeriodSeconds: {{ termination_grace_period_seconds }}
|
|
37
|
+
serviceAccountName: {{ service_account }}
|
|
38
|
+
tolerations:
|
|
39
|
+
- operator: "Exists"
|
|
40
|
+
key: nvidia.com/gpu
|
|
41
|
+
- key: "kubernetes.io/arch"
|
|
42
|
+
operator: "Equal"
|
|
43
|
+
value: "arm64"
|
|
44
|
+
effect: "NoSchedule"
|
|
45
|
+
containers:
|
|
46
|
+
{{ container }}
|
xpk/utils/feature_flags.py
CHANGED
|
@@ -29,6 +29,9 @@ def _get_boolean_flag(flag: str, default: bool) -> bool:
|
|
|
29
29
|
class _FeatureFlags:
|
|
30
30
|
SUB_SLICING_ENABLED = _get_boolean_flag("SUB_SLICING_ENABLED", default=False)
|
|
31
31
|
TELEMETRY_ENABLED = _get_boolean_flag("TELEMETRY_ENABLED", default=False)
|
|
32
|
+
SUPER_SLICING_ENABLED = _get_boolean_flag(
|
|
33
|
+
"SUPER_SLICING_ENABLED", default=False
|
|
34
|
+
)
|
|
32
35
|
|
|
33
36
|
|
|
34
37
|
FeatureFlags = _FeatureFlags()
|
xpk/utils/validation.py
CHANGED
|
@@ -72,8 +72,8 @@ class SystemDependency(Enum):
|
|
|
72
72
|
|
|
73
73
|
|
|
74
74
|
def should_validate_dependencies(args):
|
|
75
|
-
skip_validation = 'skip_validation'
|
|
76
|
-
dry_run = 'dry_run'
|
|
75
|
+
skip_validation = hasattr(args, 'skip_validation') and args.skip_validation
|
|
76
|
+
dry_run = hasattr(args, 'dry_run') and args.dry_run
|
|
77
77
|
return not skip_validation and not dry_run
|
|
78
78
|
|
|
79
79
|
|