xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68):
  1. integration/README.md +19 -0
  2. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  3. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  4. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  5. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  6. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  7. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  8. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  9. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  10. xpk/blueprints/a4/storage_crd.yaml +52 -0
  11. xpk/commands/cluster.py +33 -12
  12. xpk/commands/cluster_gcluster_test.py +5 -1
  13. xpk/commands/cluster_test.py +125 -0
  14. xpk/commands/config.py +3 -3
  15. xpk/commands/inspector.py +5 -3
  16. xpk/commands/kind.py +2 -0
  17. xpk/commands/managed_ml_diagnostics.py +249 -0
  18. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  19. xpk/commands/workload.py +124 -139
  20. xpk/commands/workload_test.py +160 -118
  21. xpk/core/blueprint/blueprint_generator.py +3 -0
  22. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  23. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  24. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  25. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  26. xpk/core/capacity.py +2 -0
  27. xpk/core/cluster.py +18 -47
  28. xpk/core/cluster_test.py +76 -1
  29. xpk/core/config.py +81 -7
  30. xpk/core/config_test.py +67 -11
  31. xpk/core/docker_container.py +3 -1
  32. xpk/core/docker_image.py +10 -6
  33. xpk/core/docker_resources.py +1 -10
  34. xpk/core/kjob.py +17 -16
  35. xpk/core/kueue_manager.py +13 -19
  36. xpk/core/kueue_manager_test.py +27 -1
  37. xpk/core/nap.py +13 -14
  38. xpk/core/nodepool.py +17 -15
  39. xpk/core/nodepool_test.py +25 -4
  40. xpk/core/pathways.py +23 -0
  41. xpk/core/pathways_test.py +57 -0
  42. xpk/core/resources.py +84 -27
  43. xpk/core/scheduling.py +128 -132
  44. xpk/core/scheduling_test.py +215 -2
  45. xpk/core/system_characteristics.py +179 -0
  46. xpk/core/system_characteristics_test.py +49 -1
  47. xpk/core/telemetry.py +4 -4
  48. xpk/core/telemetry_test.py +9 -9
  49. xpk/core/vertex.py +4 -3
  50. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  51. xpk/main.py +2 -0
  52. xpk/parser/cluster.py +22 -88
  53. xpk/parser/cluster_test.py +41 -0
  54. xpk/parser/common.py +84 -0
  55. xpk/parser/storage.py +10 -0
  56. xpk/parser/storage_test.py +47 -0
  57. xpk/parser/workload.py +14 -41
  58. xpk/parser/workload_test.py +2 -48
  59. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  60. xpk/utils/feature_flags.py +3 -0
  61. xpk/utils/validation.py +2 -2
  62. xpk-0.16.0.dist-info/METADATA +127 -0
  63. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
  64. xpk-0.15.0.dist-info/METADATA +0 -1666
  65. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
@@ -39,12 +39,16 @@ def decorate_job(job_manifest: dict) -> dict:
39
39
  return job_manifest
40
40
 
41
41
 
42
- def decorate_jobset(jobset_manifest_str: str) -> str:
42
+ def decorate_jobset( # pylint: disable=dangerous-default-value
43
+ jobset_manifest_str: str,
44
+ sub_networks: list[str] = [], # pylint: disable=unused-argument
45
+ ) -> str:
43
46
  """
44
47
  Decorates a JobSet manifest with the necessary components for tcpxo-daemon.
45
48
 
46
49
  Args:
47
50
  jobset_manifest_str: The JobSet manifest as a YAML string.
51
+ sub_networks: This parameter is accepted for interface consistency but is not used.
48
52
 
49
53
  Returns:
50
54
  The modified JobSet manifest as a YAML string.
xpk/main.py CHANGED
@@ -37,6 +37,7 @@ import sys
37
37
 
38
38
  from .parser.core import set_parser
39
39
  from .core.updates import print_xpk_hello
40
+ from .core.config import set_config, FileSystemConfig
40
41
  from .core.telemetry import MetricsCollector, send_clearcut_payload, should_send_telemetry
41
42
  from .utils.console import xpk_print, exit_code_to_int
42
43
  from .utils.execution_context import set_context
@@ -69,6 +70,7 @@ def main() -> None:
69
70
 
70
71
  main_args = parser.parse_args()
71
72
  main_args.enable_ray_cluster = False
73
+ set_config(FileSystemConfig())
72
74
  set_context(
73
75
  dry_run_value='dry_run' in main_args and main_args.dry_run,
74
76
  quiet_value=(
xpk/parser/cluster.py CHANGED
@@ -26,11 +26,10 @@ from ..commands.cluster import (
26
26
  cluster_describe,
27
27
  cluster_list,
28
28
  )
29
- from ..core.config import xpk_config
30
- from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType
29
+ from ..core.config import get_config
31
30
  from ..core.config import CFG_BUCKET_KEY
32
31
  from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
33
- from .common import add_shared_arguments, ParserOrArgumentGroup
32
+ from .common import add_shared_arguments, ParserOrArgumentGroup, add_tpu_type_argument, add_tpu_and_device_type_arguments
34
33
  from .validators import name_type
35
34
  from ..utils.feature_flags import FeatureFlags
36
35
 
@@ -99,27 +98,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
99
98
  required=True
100
99
  )
101
100
  )
102
- cluster_device_group.add_argument(
103
- '--tpu-type',
104
- type=str,
105
- default=None,
106
- help='The tpu type to use, v5litepod-16, etc.',
107
- metavar='TPU_TYPE',
108
- choices=get_system_characteristics_keys_by_accelerator_type(
109
- [AcceleratorType.TPU]
110
- ),
111
- )
112
- cluster_device_group.add_argument(
113
- '--device-type',
114
- type=str,
115
- default=None,
116
- help=(
117
- 'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
118
- ' h100-80gb-8, n2-standard-32-4 etc.'
119
- ),
120
- metavar='DEVICE_TYPE',
121
- choices=get_system_characteristics_keys_by_accelerator_type(),
122
- )
101
+ add_tpu_and_device_type_arguments(cluster_device_group)
123
102
 
124
103
  ### Optional arguments specific to "cluster create"
125
104
  cluster_create_optional_arguments = cluster_create_parser.add_argument_group(
@@ -131,7 +110,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
131
110
  cluster_create_optional_arguments.add_argument(
132
111
  '--cluster-state-gcs-bucket',
133
112
  type=str,
134
- default=xpk_config.get(CFG_BUCKET_KEY),
113
+ default=get_config().get(CFG_BUCKET_KEY),
135
114
  help='The name of the bucket to store cluster state.',
136
115
  required=False,
137
116
  )
@@ -150,6 +129,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
150
129
  ' enable cluster to accept Pathways workloads.'
151
130
  ),
152
131
  )
132
+
153
133
  if FeatureFlags.SUB_SLICING_ENABLED:
154
134
  add_cluster_create_sub_slicing_arguments(cluster_create_optional_arguments)
155
135
 
@@ -207,15 +187,8 @@ def set_cluster_create_pathways_parser(
207
187
  add_shared_cluster_create_required_arguments(
208
188
  cluster_create_pathways_required_arguments
209
189
  )
210
- cluster_create_pathways_required_arguments.add_argument(
211
- '--tpu-type',
212
- type=str,
213
- default=None,
214
- help='The tpu type to use, v5litepod-16, etc.',
215
- metavar='TPU_TYPE',
216
- choices=get_system_characteristics_keys_by_accelerator_type(
217
- [AcceleratorType.TPU]
218
- ),
190
+ add_tpu_type_argument(
191
+ cluster_create_pathways_required_arguments, required=True
219
192
  )
220
193
 
221
194
  ### Optional arguments specific to "cluster create-pathways"
@@ -292,17 +265,8 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
292
265
  add_shared_cluster_create_required_arguments(
293
266
  cluster_create_ray_required_arguments
294
267
  )
295
- cluster_create_ray_required_arguments.add_argument(
296
- '--tpu-type',
297
- type=str,
298
- default=None,
299
- help='The tpu type to use, v5litepod-16, etc.',
300
- required=True,
301
- metavar='TPU_TYPE',
302
- choices=get_system_characteristics_keys_by_accelerator_type(
303
- [AcceleratorType.TPU]
304
- ),
305
- )
268
+ add_tpu_type_argument(cluster_create_ray_required_arguments, required=True)
269
+
306
270
  # TODO(bzmarke): Add --device-type to support GPU/CPU
307
271
  cluster_create_ray_required_arguments.add_argument(
308
272
  '--ray-version',
@@ -392,7 +356,7 @@ def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
392
356
  cluster_delete_optional_arguments.add_argument(
393
357
  '--cluster-state-gcs-bucket',
394
358
  type=str,
395
- default=xpk_config.get(CFG_BUCKET_KEY),
359
+ default=get_config().get(CFG_BUCKET_KEY),
396
360
  help='The name of the bucket to store cluster state.',
397
361
  required=False,
398
362
  )
@@ -421,27 +385,7 @@ def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
421
385
  )
422
386
 
423
387
  ### Device Type Argument
424
- cluster_cacheimage_group.add_argument(
425
- '--tpu-type',
426
- type=str,
427
- default=None,
428
- help='The tpu type to cache images on, v5litepod-16, etc.',
429
- metavar='TPU_TYPE',
430
- choices=get_system_characteristics_keys_by_accelerator_type(
431
- [AcceleratorType.TPU]
432
- ),
433
- )
434
- cluster_cacheimage_group.add_argument(
435
- '--device-type',
436
- type=str,
437
- default=None,
438
- help=(
439
- 'The device type to cache images on (can be tpu or gpu),'
440
- ' v5litepod-16, h100-80gb-8, etc.'
441
- ),
442
- metavar='DEVICE_TYPE',
443
- choices=get_system_characteristics_keys_by_accelerator_type(),
444
- )
388
+ add_tpu_and_device_type_arguments(cluster_cacheimage_group)
445
389
 
446
390
  ### Required arguments
447
391
  cluster_cacheimage_required_arguments.add_argument(
@@ -526,27 +470,7 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
526
470
  required=True
527
471
  )
528
472
  )
529
- cluster_adapt_device_group.add_argument(
530
- '--tpu-type',
531
- type=str,
532
- default=None,
533
- help='The tpu type used on cluster, v5litepod-16, etc.',
534
- metavar='TPU_TYPE',
535
- choices=get_system_characteristics_keys_by_accelerator_type(
536
- [AcceleratorType.TPU]
537
- ),
538
- )
539
- cluster_adapt_device_group.add_argument(
540
- '--device-type',
541
- type=str,
542
- default=None,
543
- help=(
544
- 'The device type used on cluster (can be tpu or gpu or cpu), eg.'
545
- ' h100-80gb-8, n2-standard-32-4 etc.'
546
- ),
547
- metavar='DEVICE_TYPE',
548
- choices=get_system_characteristics_keys_by_accelerator_type(),
549
- )
473
+ add_tpu_and_device_type_arguments(cluster_adapt_device_group)
550
474
 
551
475
  cluster_adapt_optional_arguments = cluster_adapt_parser.add_argument_group(
552
476
  'Optional Arguments',
@@ -691,6 +615,11 @@ def add_shared_cluster_create_optional_arguments(
691
615
  ' regional clusters, all zones must support the machine type.'
692
616
  ),
693
617
  )
618
+ parser_or_group.add_argument(
619
+ '--managed-mldiagnostics',
620
+ action='store_true',
621
+ help='Enables the installation of required ML Diagnostics components.',
622
+ )
694
623
  parser_or_group.add_argument(
695
624
  '--cluster-cpu-machine-type',
696
625
  type=str,
@@ -819,6 +748,11 @@ def add_driver_arguments(parser_or_group: ParserOrArgumentGroup):
819
748
  action='store_true',
820
749
  help='Enable Lustre CSI driver on the cluster.',
821
750
  )
751
+ parser_or_group.add_argument(
752
+ '--enable-legacy-lustre-port',
753
+ action='store_true',
754
+ help='Enable legacy port for Lustre CSI driver on the cluster.',
755
+ )
822
756
 
823
757
 
824
758
  def add_shared_cluster_create_tensorboard_arguments(
@@ -103,3 +103,44 @@ def test_cluster_create_ray_sub_slicing_is_hidden_but_set_to_false():
103
103
 
104
104
  assert args.sub_slicing is False
105
105
  assert "--sub-slicing" not in help_str
106
+
107
+
108
+ def test_cluster_create_managed_mldiagnostics():
109
+ parser = argparse.ArgumentParser()
110
+
111
+ set_cluster_create_parser(parser)
112
+ args = parser.parse_args([
113
+ "--cluster",
114
+ "test-cluster",
115
+ "--tpu-type",
116
+ "v5p-8",
117
+ "--managed-mldiagnostics",
118
+ ])
119
+
120
+ assert args.managed_mldiagnostics is True
121
+
122
+
123
+ def test_cluster_create_enable_lustre_legacy_port_is_false_by_default():
124
+ parser = argparse.ArgumentParser()
125
+
126
+ set_cluster_create_parser(parser)
127
+ args = parser.parse_args(
128
+ ["--cluster", "test-cluster", "--tpu-type", "tpu7x-2"]
129
+ )
130
+
131
+ assert args.enable_legacy_lustre_port is False
132
+
133
+
134
+ def test_cluster_create_enable_lustre_legacy_port_can_be_set():
135
+ parser = argparse.ArgumentParser()
136
+
137
+ set_cluster_create_parser(parser)
138
+ args = parser.parse_args([
139
+ "--cluster",
140
+ "test-cluster",
141
+ "--tpu-type",
142
+ "tpu7x-2",
143
+ "--enable-legacy-lustre-port",
144
+ ])
145
+
146
+ assert args.enable_legacy_lustre_port is True
xpk/parser/common.py CHANGED
@@ -16,6 +16,10 @@ limitations under the License.
16
16
 
17
17
  import argparse
18
18
  from typing import Protocol, Any
19
+ from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType
20
+ import difflib
21
+ from argcomplete import ChoicesCompleter
22
+ from argparse import Action, ArgumentError
19
23
 
20
24
 
21
25
  class ParserOrArgumentGroup(Protocol):
@@ -24,6 +28,46 @@ class ParserOrArgumentGroup(Protocol):
24
28
  ...
25
29
 
26
30
 
31
+ class ManyChoicesAction(Action):
32
+ """An action class to output better error message for arguments with large lists of choices."""
33
+
34
+ def __init__(self, *args, large_choice_list, **kwargs):
35
+ self.large_list_of_choices = large_choice_list
36
+ super().__init__(*args, **kwargs)
37
+
38
+ def __call__(self, parser, namespace, value, option_string=None):
39
+ if value not in self.large_list_of_choices:
40
+ close_matches = difflib.get_close_matches(
41
+ value, self.large_list_of_choices, n=5, cutoff=0
42
+ )
43
+ msg = (
44
+ f"invalid choice: '{value}' (closest matches:"
45
+ f" {', '.join(close_matches)})"
46
+ )
47
+ raise ArgumentError(self, msg)
48
+ setattr(namespace, self.dest, value)
49
+
50
+
51
+ def add_many_choices_argument(
52
+ parserOrGroup: ParserOrArgumentGroup,
53
+ flag_name,
54
+ choices: list[str],
55
+ metavar: str,
56
+ help_msg: str,
57
+ required: bool = False,
58
+ ) -> None:
59
+ parserOrGroup.add_argument(
60
+ flag_name,
61
+ action=ManyChoicesAction,
62
+ large_choice_list=choices,
63
+ type=str,
64
+ metavar=metavar,
65
+ help=help_msg,
66
+ required=required,
67
+ default=None,
68
+ ).completer = ChoicesCompleter(choices)
69
+
70
+
27
71
  def add_shared_arguments(
28
72
  custom_parser_or_group: ParserOrArgumentGroup, required=False
29
73
  ) -> None:
@@ -285,3 +329,43 @@ def add_slurm_arguments(custom_parser_or_group: ParserOrArgumentGroup):
285
329
  ' `very-high`. Defaults to `medium`.'
286
330
  ),
287
331
  )
332
+
333
+
334
+ def add_tpu_type_argument(
335
+ custom_parser_or_group: ParserOrArgumentGroup,
336
+ required: bool = False,
337
+ ) -> None:
338
+ add_many_choices_argument(
339
+ custom_parser_or_group,
340
+ '--tpu-type',
341
+ choices=get_system_characteristics_keys_by_accelerator_type(
342
+ [AcceleratorType.TPU]
343
+ ),
344
+ metavar='TPU_TYPE',
345
+ help_msg='The tpu type to use, v5litepod-16, etc.',
346
+ required=required,
347
+ )
348
+
349
+
350
+ def add_device_type_argument(
351
+ custom_parser_or_group: ParserOrArgumentGroup,
352
+ required: bool = False,
353
+ ) -> None:
354
+ add_many_choices_argument(
355
+ custom_parser_or_group,
356
+ '--device-type',
357
+ choices=get_system_characteristics_keys_by_accelerator_type(),
358
+ metavar='DEVICE_TYPE',
359
+ help_msg=(
360
+ 'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
361
+ ' h100-80gb-8, n2-standard-32-4 etc.'
362
+ ),
363
+ required=required,
364
+ )
365
+
366
+
367
+ def add_tpu_and_device_type_arguments(
368
+ custom_parser_or_group: ParserOrArgumentGroup,
369
+ ) -> None:
370
+ add_tpu_type_argument(custom_parser_or_group)
371
+ add_device_type_argument(custom_parser_or_group)
xpk/parser/storage.py CHANGED
@@ -104,6 +104,16 @@ def add_storage_attach_parser(
104
104
  help='If true workloads can only read from storage',
105
105
  )
106
106
 
107
+ lustre_args = storage_attach_parser.add_argument_group(
108
+ 'Lustre arguments',
109
+ 'Arguments used when --type=lustre',
110
+ )
111
+ lustre_args.add_argument(
112
+ '--enable-legacy-lustre-port',
113
+ action='store_true',
114
+ help='Enable legacy port for Lustre CSI driver on the cluster.',
115
+ )
116
+
107
117
  gcsfuse_args = storage_attach_parser.add_argument_group(
108
118
  'FUSE arguments',
109
119
  'Arguments used when --type=gcsfuse',
@@ -0,0 +1,47 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import argparse
18
+ from xpk.parser.storage import set_storage_parser
19
+
20
+ DEFAULT_ATTACH_ARGUMENTS = (
21
+ "attach test-storage --cluster test-cluster --zone test-zone"
22
+ " --project test-project --mount-point test-mount-point"
23
+ " --readonly false --auto-mount true"
24
+ )
25
+
26
+ DEFAULT_LUSTRE_ATTACH_ARGUMENTS = (
27
+ DEFAULT_ATTACH_ARGUMENTS + " --type lustre --manifest test-manifest"
28
+ )
29
+
30
+
31
+ def test_cluster_create_enable_lustre_legacy_port_is_false_by_default():
32
+ parser = argparse.ArgumentParser()
33
+
34
+ set_storage_parser(parser)
35
+ args = parser.parse_args(DEFAULT_LUSTRE_ATTACH_ARGUMENTS.split())
36
+
37
+ assert args.enable_legacy_lustre_port is False
38
+
39
+
40
+ def test_cluster_create_enable_lustre_legacy_port_can_be_set():
41
+ parser = argparse.ArgumentParser()
42
+ set_storage_parser(parser)
43
+ args = parser.parse_args(
44
+ DEFAULT_LUSTRE_ATTACH_ARGUMENTS.split() + ["--enable-legacy-lustre-port"]
45
+ )
46
+
47
+ assert args.enable_legacy_lustre_port is True
xpk/parser/workload.py CHANGED
@@ -22,10 +22,8 @@ from ..commands.workload import (
22
22
  workload_list,
23
23
  )
24
24
  from ..core.docker_image import DEFAULT_DOCKER_IMAGE, DEFAULT_SCRIPT_DIR
25
- from .common import add_shared_arguments
25
+ from .common import add_shared_arguments, add_tpu_type_argument, add_tpu_and_device_type_arguments
26
26
  from .validators import directory_path_type, name_type
27
- from ..utils.feature_flags import FeatureFlags
28
- from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType, SUB_SLICING_TOPOLOGIES
29
27
 
30
28
 
31
29
  def set_workload_parsers(workload_parser: ArgumentParser):
@@ -119,27 +117,7 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
119
117
  required=True
120
118
  )
121
119
  )
122
- workload_device_group.add_argument(
123
- '--tpu-type',
124
- type=str,
125
- default=None,
126
- help='The tpu type to use, v5litepod-16, etc.',
127
- metavar='TPU_TYPE',
128
- choices=get_system_characteristics_keys_by_accelerator_type(
129
- [AcceleratorType.TPU]
130
- ),
131
- )
132
- workload_device_group.add_argument(
133
- '--device-type',
134
- type=str,
135
- default=None,
136
- help=(
137
- 'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
138
- ' h100-80gb-8, n2-standard-32-4 etc.'
139
- ),
140
- metavar='DEVICE_TYPE',
141
- choices=get_system_characteristics_keys_by_accelerator_type(),
142
- )
120
+ add_tpu_and_device_type_arguments(workload_device_group)
143
121
 
144
122
  workload_create_parser_optional_arguments.add_argument(
145
123
  '--storage',
@@ -287,15 +265,8 @@ def set_workload_create_pathways_parser(
287
265
  )
288
266
  )
289
267
  ### "workload create-pathways" Required arguments, specific to Pathways
290
- workload_create_pathways_parser_required_arguments.add_argument(
291
- '--tpu-type',
292
- type=str,
293
- default=None,
294
- help='The tpu type to use, v5litepod-16, etc.',
295
- metavar='TPU_TYPE',
296
- choices=get_system_characteristics_keys_by_accelerator_type(
297
- [AcceleratorType.TPU]
298
- ),
268
+ add_tpu_type_argument(
269
+ workload_create_pathways_parser_required_arguments, required=True
299
270
  )
300
271
 
301
272
  ### "workload create-pathways" Optional arguments, specific to Pathways
@@ -612,6 +583,16 @@ def add_shared_workload_create_optional_arguments(args_parsers):
612
583
  ' `jax-tpu`.'
613
584
  ),
614
585
  )
586
+ custom_parser.add_argument(
587
+ '--output-manifest-file',
588
+ type=str,
589
+ default=None,
590
+ help=(
591
+ 'If you want to see the generated manifest, provide a file path'
592
+ ' here. This will write the manifest to the file. If used with'
593
+ ' --dry-run, it will skip the actual deployment and cluster checks.'
594
+ ),
595
+ )
615
596
  custom_parser.add_argument(
616
597
  '--num-slices',
617
598
  type=int,
@@ -670,14 +651,6 @@ def add_shared_workload_create_optional_arguments(args_parsers):
670
651
  ' the workload.'
671
652
  ),
672
653
  )
673
- if FeatureFlags.SUB_SLICING_ENABLED:
674
- custom_parser.add_argument(
675
- '--sub-slicing-topology',
676
- type=str,
677
- help='Sub-slicing topology to use.',
678
- required=False,
679
- choices=SUB_SLICING_TOPOLOGIES,
680
- )
681
654
 
682
655
 
683
656
  def add_shared_workload_create_env_arguments(args_parsers):
@@ -16,35 +16,9 @@ limitations under the License.
16
16
 
17
17
  import argparse
18
18
  from xpk.parser.workload import set_workload_create_parser
19
- from ..utils.feature_flags import FeatureFlags
20
- import pytest
21
19
 
22
20
 
23
- @pytest.fixture(autouse=True)
24
- def with_sub_slicing_enabled():
25
- FeatureFlags.SUB_SLICING_ENABLED = True
26
-
27
-
28
- def test_workload_create_sub_slicing_topology_is_hidden_with_flag_off():
29
- FeatureFlags.SUB_SLICING_ENABLED = False
30
- parser = argparse.ArgumentParser()
31
-
32
- set_workload_create_parser(parser)
33
- help_str = parser.format_help()
34
-
35
- assert "--sub-slicing" not in help_str
36
-
37
-
38
- def test_workload_create_sub_slicing_topology_is_shown_with_flag_on():
39
- parser = argparse.ArgumentParser()
40
-
41
- set_workload_create_parser(parser)
42
- help_str = parser.format_help()
43
-
44
- assert "--sub-slicing" in help_str
45
-
46
-
47
- def test_workload_create_sub_slicing_topology_is_none_by_default():
21
+ def test_workload_create_parses():
48
22
  parser = argparse.ArgumentParser()
49
23
 
50
24
  set_workload_create_parser(parser)
@@ -59,24 +33,4 @@ def test_workload_create_sub_slicing_topology_is_none_by_default():
59
33
  "tpu7x-2",
60
34
  ])
61
35
 
62
- assert args.sub_slicing_topology is None
63
-
64
-
65
- def test_workload_create_sub_slicing_topology_can_be_set():
66
- parser = argparse.ArgumentParser()
67
-
68
- set_workload_create_parser(parser)
69
- args = parser.parse_args([
70
- "--cluster",
71
- "test-cluster",
72
- "--command",
73
- "python3",
74
- "--workload",
75
- "test",
76
- "--tpu-type",
77
- "tpu7x-8",
78
- "--sub-slicing-topology",
79
- "2x4",
80
- ])
81
-
82
- assert args.sub_slicing_topology is "2x4"
36
+ assert args
@@ -0,0 +1,46 @@
1
+ apiVersion: jobset.x-k8s.io/v1alpha2
2
+ kind: JobSet
3
+ metadata:
4
+ name: {{ workload }}
5
+ labels:
6
+ kueue.x-k8s.io/queue-name: multislice-queue # Name of the LocalQueue
7
+ xpk.google.com/workload: {{ workload }}
8
+ spec:
9
+ ttlSecondsAfterFinished: {{ ttl_seconds_after_finished }}
10
+ failurePolicy:
11
+ {{ failure_policy_rules }}
12
+ maxRestarts: {{ max_restarts }}
13
+ replicatedJobs:
14
+ - name: slice-job
15
+ replicas: 1
16
+ template:
17
+ spec:
18
+ parallelism: {{ num_nodes }}
19
+ completions: {{ num_nodes }}
20
+ backoffLimit: 0 # When any pod fails, the job is failed
21
+ {{ pod_failure_policy }}
22
+ template:
23
+ metadata:
24
+ labels:
25
+ xpk.google.com/workload: {{ workload }}
26
+ annotations:
27
+ {{ annotations }}
28
+ spec:
29
+ priorityClassName: {{ priority }}
30
+ restartPolicy: Never
31
+ nodeSelector:
32
+ {{ placement_policy_label }}
33
+ imagePullSecrets:
34
+ - name: {{ docker_image_pull_secret }}
35
+ dnsPolicy: ClusterFirstWithHostNet
36
+ terminationGracePeriodSeconds: {{ termination_grace_period_seconds }}
37
+ serviceAccountName: {{ service_account }}
38
+ tolerations:
39
+ - operator: "Exists"
40
+ key: nvidia.com/gpu
41
+ - key: "kubernetes.io/arch"
42
+ operator: "Equal"
43
+ value: "arm64"
44
+ effect: "NoSchedule"
45
+ containers:
46
+ {{ container }}
@@ -29,6 +29,9 @@ def _get_boolean_flag(flag: str, default: bool) -> bool:
29
29
  class _FeatureFlags:
30
30
  SUB_SLICING_ENABLED = _get_boolean_flag("SUB_SLICING_ENABLED", default=False)
31
31
  TELEMETRY_ENABLED = _get_boolean_flag("TELEMETRY_ENABLED", default=False)
32
+ SUPER_SLICING_ENABLED = _get_boolean_flag(
33
+ "SUPER_SLICING_ENABLED", default=False
34
+ )
32
35
 
33
36
 
34
37
  FeatureFlags = _FeatureFlags()
xpk/utils/validation.py CHANGED
@@ -72,8 +72,8 @@ class SystemDependency(Enum):
72
72
 
73
73
 
74
74
  def should_validate_dependencies(args):
75
- skip_validation = 'skip_validation' in args and args.skip_validation
76
- dry_run = 'dry_run' in args and args.dry_run
75
+ skip_validation = hasattr(args, 'skip_validation') and args.skip_validation
76
+ dry_run = hasattr(args, 'dry_run') and args.dry_run
77
77
  return not skip_validation and not dry_run
78
78
 
79
79