xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. xpk/commands/batch.py +17 -10
  2. xpk/commands/cluster.py +137 -123
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +13 -27
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +22 -11
  8. xpk/commands/job.py +53 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +26 -2
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +58 -30
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +96 -195
  20. xpk/core/cluster_private.py +9 -12
  21. xpk/core/commands.py +21 -25
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +17 -9
  24. xpk/core/docker_resources.py +9 -4
  25. xpk/core/gcloud_context.py +26 -2
  26. xpk/core/gcloud_context_test.py +96 -0
  27. xpk/core/gcluster_manager.py +0 -3
  28. xpk/core/jobset.py +5 -8
  29. xpk/core/kjob.py +19 -29
  30. xpk/core/kueue_manager.py +383 -0
  31. xpk/core/kueue_manager_test.py +542 -0
  32. xpk/core/monitoring.py +1 -1
  33. xpk/core/nap.py +11 -16
  34. xpk/core/network.py +18 -19
  35. xpk/core/nodepool.py +65 -71
  36. xpk/core/nodepool_test.py +198 -1
  37. xpk/core/pathways.py +9 -5
  38. xpk/core/ray.py +11 -15
  39. xpk/core/resources.py +15 -10
  40. xpk/core/scheduling.py +23 -1
  41. xpk/core/scheduling_test.py +31 -0
  42. xpk/core/system_characteristics.py +335 -229
  43. xpk/core/vertex.py +1 -1
  44. xpk/core/workload.py +7 -8
  45. xpk/main.py +3 -2
  46. xpk/parser/cluster.py +50 -0
  47. xpk/parser/cluster_test.py +66 -0
  48. xpk/parser/common.py +11 -0
  49. xpk/parser/workload.py +62 -25
  50. xpk/parser/workload_test.py +82 -0
  51. xpk/utils/execution_context.py +28 -0
  52. xpk/utils/feature_flags.py +28 -0
  53. xpk/utils/file.py +25 -10
  54. xpk/utils/kueue.py +20 -0
  55. xpk/utils/network.py +4 -0
  56. xpk/utils/templates.py +2 -0
  57. xpk/utils/topology.py +37 -0
  58. xpk/utils/topology_test.py +43 -0
  59. xpk/utils/validation.py +79 -55
  60. xpk/utils/validation_test.py +37 -0
  61. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  62. xpk-0.14.0.dist-info/RECORD +112 -0
  63. xpk/core/kueue.py +0 -545
  64. xpk-0.12.0.dist-info/RECORD +0 -100
  65. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/vertex.py CHANGED
@@ -66,7 +66,7 @@ def create_vertex_experiment(args) -> dict | None:
66
66
  )
67
67
 
68
68
  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
69
- cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
69
+ cluster_config_map = get_cluster_configmap(metadata_configmap_name)
70
70
 
71
71
  if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map:
72
72
  xpk_print(
xpk/core/workload.py CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
17
17
  import re
18
18
  from ..utils.console import xpk_exit, xpk_print
19
19
  from .commands import run_command_for_value
20
- from .gcloud_context import zone_to_region
20
+ from .gcloud_context import get_cluster_location
21
21
 
22
22
 
23
23
  def workload_list_awk_command(filter_key) -> str:
@@ -131,7 +131,7 @@ def get_workload_list(args) -> tuple[int, str]:
131
131
  if hasattr(args, 'filter_by_job'):
132
132
  task += f' with filter-by-job={args.filter_by_job}'
133
133
 
134
- return_code, return_value = run_command_for_value(command, task, args)
134
+ return_code, return_value = run_command_for_value(command, task)
135
135
  return return_code, return_value
136
136
 
137
137
 
@@ -152,7 +152,7 @@ def check_if_workload_exists(args) -> bool:
152
152
 
153
153
  command = f"kubectl get workloads -o=custom-columns='{s}'"
154
154
  return_code, return_msg = run_command_for_value(
155
- command, 'Check if Workload Already Exists', args
155
+ command, 'Check if Workload Already Exists'
156
156
  )
157
157
 
158
158
  if return_code != 0:
@@ -186,7 +186,7 @@ def wait_for_job_completion(args) -> int:
186
186
  # Get the full workload name
187
187
  get_workload_name_cmd = f'kubectl get workloads | grep jobset-{args.workload}'
188
188
  return_code, return_value = run_command_for_value(
189
- get_workload_name_cmd, 'Get full workload name', args
189
+ get_workload_name_cmd, 'Get full workload name'
190
190
  )
191
191
  if return_code != 0:
192
192
  xpk_print(f'Get full workload name request returned ERROR {return_code}')
@@ -205,7 +205,6 @@ def wait_for_job_completion(args) -> int:
205
205
  return_code, return_value = run_command_for_value(
206
206
  wait_cmd,
207
207
  f'Wait for workload to finish with timeout of {timeout_msg}',
208
- args,
209
208
  print_timer=True,
210
209
  )
211
210
  if return_code != 0:
@@ -214,7 +213,7 @@ def wait_for_job_completion(args) -> int:
214
213
  f'Timed out waiting for your workload after {timeout_msg}, see your'
215
214
  ' workload here:'
216
215
  # pylint: disable=line-too-long
217
- f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
216
+ f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
218
217
  )
219
218
  return 124
220
219
  else:
@@ -224,14 +223,14 @@ def wait_for_job_completion(args) -> int:
224
223
  xpk_print(
225
224
  'Finished waiting for your workload, see your workload here:'
226
225
  # pylint: disable=line-too-long
227
- f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
226
+ f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
228
227
  )
229
228
  status_cmd = (
230
229
  f'kubectl get jobset {args.workload} -o'
231
230
  " jsonpath='{.status.conditions[-1].type}'"
232
231
  )
233
232
  return_code, return_value = run_command_for_value(
234
- status_cmd, 'Get jobset status', args
233
+ status_cmd, 'Get jobset status'
235
234
  )
236
235
  if return_code != 0:
237
236
  xpk_print(f'Get workload status request returned ERROR {return_code}')
xpk/main.py CHANGED
@@ -36,7 +36,7 @@ import sys
36
36
 
37
37
  from .parser.core import set_parser
38
38
  from .utils.console import xpk_print
39
- from .utils.validation import validate_dependencies
39
+ from .utils.execution_context import set_dry_run
40
40
  ################### Compatibility Check ###################
41
41
  # Check that the user runs the below version or greater.
42
42
 
@@ -63,9 +63,10 @@ def main() -> None:
63
63
  set_parser(parser=parser)
64
64
 
65
65
  xpk_print('Starting xpk', flush=True)
66
- validate_dependencies()
67
66
  main_args = parser.parse_args()
68
67
  main_args.enable_ray_cluster = False
68
+ dry_run = 'dry_run' in main_args and main_args.dry_run
69
+ set_dry_run(dry_run)
69
70
  main_args.func(main_args)
70
71
  xpk_print('XPK Done.', flush=True)
71
72
 
xpk/parser/cluster.py CHANGED
@@ -31,6 +31,7 @@ from ..core.config import CFG_BUCKET_KEY
31
31
  from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
32
32
  from .common import add_shared_arguments, ParserOrArgumentGroup
33
33
  from .validators import name_type
34
+ from ..utils.feature_flags import FeatureFlags
34
35
 
35
36
 
36
37
  def set_cluster_parser(cluster_parser: ArgumentParser):
@@ -142,6 +143,12 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
142
143
  ' enable cluster to accept Pathways workloads.'
143
144
  ),
144
145
  )
146
+ if FeatureFlags.SUB_SLICING_ENABLED:
147
+ cluster_create_optional_arguments.add_argument(
148
+ '--sub-slicing',
149
+ action='store_true',
150
+ help='Whether to set up cluster to support sub-slicing',
151
+ )
145
152
 
146
153
  autoprovisioning_arguments = cluster_create_parser.add_argument_group(
147
154
  'Autoprovisioning Arguments',
@@ -174,6 +181,13 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
174
181
  'Arguments for configuring MTC in cluster create.',
175
182
  )
176
183
  add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
184
+
185
+ cluster_create_resource_limits = cluster_create_parser.add_argument_group(
186
+ 'Optional Resource Limits Arguments',
187
+ 'Arguments for configuring resource limits in cluster create.',
188
+ )
189
+ add_resource_limits(cluster_create_resource_limits)
190
+
177
191
  cluster_create_parser.set_defaults(func=cluster_create)
178
192
 
179
193
 
@@ -245,6 +259,15 @@ def set_cluster_create_pathways_parser(
245
259
  )
246
260
  )
247
261
  add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
262
+
263
+ cluster_create_resource_limits = (
264
+ cluster_create_pathways_parser.add_argument_group(
265
+ 'Optional Resource Limits Arguments',
266
+ 'Arguments for configuring resource limits in cluster create.',
267
+ )
268
+ )
269
+ add_resource_limits(cluster_create_resource_limits)
270
+
248
271
  cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways)
249
272
 
250
273
 
@@ -320,6 +343,13 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
320
343
  'Arguments for configuring MTC in cluster create.',
321
344
  )
322
345
  add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
346
+
347
+ cluster_create_resource_limits = cluster_create_ray_parser.add_argument_group(
348
+ 'Optional Resource Limits Arguments',
349
+ 'Arguments for configuring resource limits in cluster create.',
350
+ )
351
+ add_resource_limits(cluster_create_resource_limits)
352
+
323
353
  cluster_create_ray_parser.set_defaults(func=cluster_create_ray_cluster)
324
354
 
325
355
 
@@ -887,3 +917,23 @@ def add_shared_cluster_create_mtc_arguments(
887
917
  ' checkpointing. By default, it is set to "google.com/tpu".'
888
918
  ),
889
919
  )
920
+
921
+
922
+ def add_resource_limits(parser_or_group: ParserOrArgumentGroup):
923
+ """Add resource limits arguments in cluster create.
924
+
925
+ Args:
926
+ List of cluster create resource limits arguments parsers or group
927
+ """
928
+ parser_or_group.add_argument(
929
+ '--memory-limit',
930
+ type=str,
931
+ default=None,
932
+ help='The memory limit for the Kueue controller manager.',
933
+ )
934
+ parser_or_group.add_argument(
935
+ '--cpu-limit',
936
+ type=int,
937
+ default=None,
938
+ help='The CPU limit for the Kueue controller manager.',
939
+ )
@@ -0,0 +1,66 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import argparse
18
+ from xpk.parser.cluster import set_cluster_create_parser
19
+ import pytest
20
+ from ..utils.feature_flags import FeatureFlags
21
+
22
+
23
@pytest.fixture(autouse=True)
def with_sub_slicing_enabled():
  """Enables the sub-slicing feature flag for every test in this module.

  Saves and restores the previous value so the class-level flag does not
  leak into other test modules (tests here also flip it to False inline).
  """
  original = FeatureFlags.SUB_SLICING_ENABLED
  FeatureFlags.SUB_SLICING_ENABLED = True
  yield
  FeatureFlags.SUB_SLICING_ENABLED = original
26
+
27
+
28
+ def test_cluster_create_sub_slicing_is_hidden_with_flag_off():
29
+ FeatureFlags.SUB_SLICING_ENABLED = False
30
+ parser = argparse.ArgumentParser()
31
+
32
+ set_cluster_create_parser(parser)
33
+ help_str = parser.format_help()
34
+
35
+ assert "--sub-slicing" not in help_str
36
+
37
+
38
+ def test_cluster_create_sub_slicing_is_shown_with_flag_on():
39
+ parser = argparse.ArgumentParser()
40
+
41
+ set_cluster_create_parser(parser)
42
+ help_str = parser.format_help()
43
+
44
+ assert "--sub-slicing" in help_str
45
+
46
+
47
+ def test_cluster_create_sub_slicing_is_false_by_default():
48
+ parser = argparse.ArgumentParser()
49
+
50
+ set_cluster_create_parser(parser)
51
+ args = parser.parse_args(
52
+ ["--cluster", "test-cluster", "--tpu-type", "test-tpu"]
53
+ )
54
+
55
+ assert args.sub_slicing is False
56
+
57
+
58
+ def test_cluster_create_sub_slicing_can_be_set():
59
+ parser = argparse.ArgumentParser()
60
+
61
+ set_cluster_create_parser(parser)
62
+ args = parser.parse_args(
63
+ ["--cluster", "test-cluster", "--tpu-type", "test-tpu", "--sub-slicing"]
64
+ )
65
+
66
+ assert args.sub_slicing is True
xpk/parser/common.py CHANGED
@@ -62,6 +62,17 @@ def add_shared_arguments(
62
62
  ),
63
63
  required=required,
64
64
  )
65
+ custom_parser_or_group.add_argument(
66
+ '--skip-validation',
67
+ type=bool,
68
+ action=argparse.BooleanOptionalAction,
69
+ default=False,
70
+ help=(
71
+ 'Skip dependency validation checks (kubectl, gcloud, docker, etc). '
72
+ 'Independent of --dry-run.'
73
+ ),
74
+ required=required,
75
+ )
65
76
 
66
77
 
67
78
  def add_cluster_arguments(
xpk/parser/workload.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ from argparse import ArgumentParser
17
18
  from ..commands.workload import (
18
19
  workload_create,
19
20
  workload_create_pathways,
@@ -23,9 +24,10 @@ from ..commands.workload import (
23
24
  from ..core.docker_image import DEFAULT_DOCKER_IMAGE, DEFAULT_SCRIPT_DIR
24
25
  from .common import add_shared_arguments
25
26
  from .validators import directory_path_type, name_type
27
+ from ..utils.feature_flags import FeatureFlags
26
28
 
27
29
 
28
- def set_workload_parsers(workload_parser):
30
+ def set_workload_parsers(workload_parser: ArgumentParser):
29
31
  workload_subcommands = workload_parser.add_subparsers(
30
32
  title='workload subcommands',
31
33
  dest='xpk_workload_subcommands',
@@ -39,6 +41,28 @@ def set_workload_parsers(workload_parser):
39
41
  workload_create_parser = workload_subcommands.add_parser(
40
42
  'create', help='Create a new job.'
41
43
  )
44
+ set_workload_create_parser(workload_create_parser)
45
+
46
+ # "workload create-pathways" command parser.
47
+ workload_create_pathways_parser = workload_subcommands.add_parser(
48
+ 'create-pathways', help='Create a new job.'
49
+ )
50
+ set_workload_create_pathways_parser(workload_create_pathways_parser)
51
+
52
+ # "workload delete" command parser.
53
+ workload_delete_parser = workload_subcommands.add_parser(
54
+ 'delete', help='Delete job.'
55
+ )
56
+ set_workload_delete_parser(workload_delete_parser)
57
+
58
+ # "workload list" command parser.
59
+ workload_list_parser = workload_subcommands.add_parser(
60
+ 'list', help='List jobs.'
61
+ )
62
+ set_workload_list_parser(workload_list_parser)
63
+
64
+
65
+ def set_workload_create_parser(workload_create_parser: ArgumentParser):
42
66
  workload_create_parser_required_arguments = (
43
67
  workload_create_parser.add_argument_group(
44
68
  'Workload Built-in Arguments',
@@ -193,10 +217,33 @@ def set_workload_parsers(workload_parser):
193
217
  ),
194
218
  )
195
219
 
196
- # "workload create-pathways" command parser.
197
- workload_create_pathways_parser = workload_subcommands.add_parser(
198
- 'create-pathways', help='Create a new job.'
199
- )
220
+ add_shared_workload_create_required_arguments([
221
+ workload_create_parser_required_arguments,
222
+ ])
223
+ add_shared_workload_create_optional_arguments([
224
+ workload_create_parser_optional_arguments,
225
+ ])
226
+ add_shared_workload_create_env_arguments([
227
+ workload_create_parser_optional_arguments,
228
+ ])
229
+ add_shared_workload_base_docker_image_arguments([
230
+ workload_base_docker_image_arguments,
231
+ ])
232
+ add_shared_workload_docker_image_arguments([
233
+ workload_docker_image_arguments,
234
+ ])
235
+ add_shared_workload_create_tensorboard_arguments([
236
+ workload_vertex_tensorboard_arguments,
237
+ ])
238
+ add_shared_workload_create_autoprovisioning_arguments([
239
+ workload_create_autoprovisioning_arguments,
240
+ ])
241
+ workload_create_parser.set_defaults(func=workload_create)
242
+
243
+
244
+ def set_workload_create_pathways_parser(
245
+ workload_create_pathways_parser: ArgumentParser,
246
+ ):
200
247
  workload_create_pathways_parser_required_arguments = (
201
248
  workload_create_pathways_parser.add_argument_group(
202
249
  'Workload create-pathways Built-in Arguments',
@@ -232,7 +279,6 @@ def set_workload_parsers(workload_parser):
232
279
  'Arguments for creating Vertex AI Experiment in workload create.',
233
280
  )
234
281
  )
235
-
236
282
  ### "workload create-pathways" Required arguments, specific to Pathways
237
283
  workload_create_pathways_parser_required_arguments.add_argument(
238
284
  '--tpu-type',
@@ -353,42 +399,30 @@ def set_workload_parsers(workload_parser):
353
399
  )
354
400
 
355
401
  add_shared_workload_create_required_arguments([
356
- workload_create_parser_required_arguments,
357
402
  workload_create_pathways_parser_required_arguments,
358
403
  ])
359
404
  add_shared_workload_create_optional_arguments([
360
- workload_create_parser_optional_arguments,
361
405
  workload_create_pathways_parser_optional_arguments,
362
406
  ])
363
407
  add_shared_workload_create_env_arguments([
364
- workload_create_parser_optional_arguments,
365
408
  workload_create_pathways_parser_optional_arguments,
366
409
  ])
367
410
  add_shared_workload_base_docker_image_arguments([
368
- workload_base_docker_image_arguments,
369
411
  workload_create_pathways_base_docker_image_arguments,
370
412
  ])
371
413
  add_shared_workload_docker_image_arguments([
372
- workload_docker_image_arguments,
373
414
  workload_create_pathways_docker_image_arguments,
374
415
  ])
375
416
  add_shared_workload_create_tensorboard_arguments([
376
- workload_vertex_tensorboard_arguments,
377
417
  workload_create_pathways_vertex_tensorboard_arguments,
378
418
  ])
379
419
  add_shared_workload_create_autoprovisioning_arguments([
380
- workload_create_autoprovisioning_arguments,
381
420
  workload_create_pathways_autoprovisioning_arguments,
382
421
  ])
383
-
384
- # Set defaults for both workload create and workload create-pathways after adding all shared args.
385
- workload_create_parser.set_defaults(func=workload_create)
386
422
  workload_create_pathways_parser.set_defaults(func=workload_create_pathways)
387
423
 
388
- # "workload delete" command parser.
389
- workload_delete_parser = workload_subcommands.add_parser(
390
- 'delete', help='Delete job.'
391
- )
424
+
425
+ def set_workload_delete_parser(workload_delete_parser: ArgumentParser):
392
426
  workload_delete_parser_required_arguments = (
393
427
  workload_delete_parser.add_argument_group(
394
428
  'Required Arguments',
@@ -454,14 +488,10 @@ def set_workload_parsers(workload_parser):
454
488
  'Forces workload deletion command to run without additional approval.'
455
489
  ),
456
490
  )
457
-
458
491
  workload_delete_parser.set_defaults(func=workload_delete)
459
492
 
460
- # "workload list" command parser.
461
- workload_list_parser = workload_subcommands.add_parser(
462
- 'list', help='List jobs.'
463
- )
464
493
 
494
+ def set_workload_list_parser(workload_list_parser: ArgumentParser):
465
495
  workload_list_parser.add_argument(
466
496
  '--cluster',
467
497
  type=name_type,
@@ -629,6 +659,13 @@ def add_shared_workload_create_optional_arguments(args_parsers):
629
659
  ' the workload.'
630
660
  ),
631
661
  )
662
+ if FeatureFlags.SUB_SLICING_ENABLED:
663
+ custom_parser.add_argument(
664
+ '--sub-slicing-topology',
665
+ type=str,
666
+ help='Sub-slicing topology to use.',
667
+ required=False,
668
+ )
632
669
 
633
670
 
634
671
  def add_shared_workload_create_env_arguments(args_parsers):
@@ -0,0 +1,82 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import argparse
18
+ from xpk.parser.workload import set_workload_create_parser
19
+ from ..utils.feature_flags import FeatureFlags
20
+ import pytest
21
+
22
+
23
@pytest.fixture(autouse=True)
def with_sub_slicing_enabled():
  """Enables the sub-slicing feature flag for every test in this module.

  Saves and restores the previous value so the class-level flag does not
  leak into other test modules (one test here also flips it to False).
  """
  original = FeatureFlags.SUB_SLICING_ENABLED
  FeatureFlags.SUB_SLICING_ENABLED = True
  yield
  FeatureFlags.SUB_SLICING_ENABLED = original
26
+
27
+
28
+ def test_workload_create_sub_slicing_topology_is_hidden_with_flag_off():
29
+ FeatureFlags.SUB_SLICING_ENABLED = False
30
+ parser = argparse.ArgumentParser()
31
+
32
+ set_workload_create_parser(parser)
33
+ help_str = parser.format_help()
34
+
35
+ assert "--sub-slicing" not in help_str
36
+
37
+
38
+ def test_workload_create_sub_slicing_topology_is_shown_with_flag_on():
39
+ parser = argparse.ArgumentParser()
40
+
41
+ set_workload_create_parser(parser)
42
+ help_str = parser.format_help()
43
+
44
+ assert "--sub-slicing" in help_str
45
+
46
+
47
+ def test_workload_create_sub_slicing_topology_is_none_by_default():
48
+ parser = argparse.ArgumentParser()
49
+
50
+ set_workload_create_parser(parser)
51
+ args = parser.parse_args([
52
+ "--cluster",
53
+ "test-cluster",
54
+ "--command",
55
+ "python3",
56
+ "--workload",
57
+ "test",
58
+ "--tpu-type",
59
+ "test-tpu",
60
+ ])
61
+
62
+ assert args.sub_slicing_topology is None
63
+
64
+
65
+ def test_workload_create_sub_slicing_topology_can_be_set():
66
+ parser = argparse.ArgumentParser()
67
+
68
+ set_workload_create_parser(parser)
69
+ args = parser.parse_args([
70
+ "--cluster",
71
+ "test-cluster",
72
+ "--command",
73
+ "python3",
74
+ "--workload",
75
+ "test",
76
+ "--tpu-type",
77
+ "test-tpu",
78
+ "--sub-slicing-topology",
79
+ "2x2",
80
+ ])
81
+
82
+ assert args.sub_slicing_topology is "2x2"
@@ -0,0 +1,28 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
# Process-wide flag: True when xpk runs in dry-run mode (no side effects).
dry_run = False


def set_dry_run(value: bool) -> None:
  """Records *value* as the process-wide dry-run flag."""
  global dry_run
  dry_run = value


def is_dry_run() -> bool:
  """Reports whether dry-run mode is currently active."""
  return dry_run
@@ -0,0 +1,28 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import os
18
+
19
+
20
+ def _get_boolean_flag(flag: str, default: bool) -> bool:
21
+ return os.getenv(flag, str(default)).lower() == "true"
22
+
23
+
24
+ class _FeatureFlags:
25
+ SUB_SLICING_ENABLED = _get_boolean_flag("SUB_SLICING_ENABLED", default=False)
26
+
27
+
28
+ FeatureFlags = _FeatureFlags()
xpk/utils/file.py CHANGED
@@ -16,10 +16,11 @@ limitations under the License.
16
16
 
17
17
  import tempfile
18
18
  import os
19
- from .console import xpk_print
19
+ import hashlib
20
+ from .execution_context import is_dry_run
20
21
 
21
22
 
22
- def make_tmp_files(per_command_name):
23
+ def make_tmp_files(per_command_name: list[str]) -> list[str]:
23
24
  """Make temporary files for each command.
24
25
 
25
26
  Args:
@@ -28,16 +29,19 @@ def make_tmp_files(per_command_name):
28
29
  Returns:
29
30
  A list of temporary files for each command.
30
31
  """
32
+ if is_dry_run():
33
+ return [_hash_filename(command) for command in per_command_name]
34
+
31
35
  # Supports removal of spaces from command names before converting to file name.
32
36
  return [
33
37
  tempfile.NamedTemporaryFile(
34
38
  delete=False, prefix=command.replace(' ', '-') + '-'
35
- )
39
+ ).file.name
36
40
  for command in per_command_name
37
41
  ]
38
42
 
39
43
 
40
- def write_tmp_file(payload):
44
+ def write_tmp_file(payload: str) -> str:
41
45
  """Writes `payload` to a temporary file.
42
46
 
43
47
  Args:
@@ -46,14 +50,17 @@ def write_tmp_file(payload):
46
50
  Returns:
47
51
  A file object that was written to.
48
52
  """
53
+ if is_dry_run():
54
+ return _hash_filename(payload)
55
+
49
56
  with tempfile.NamedTemporaryFile(delete=False) as tmp:
50
57
  with open(file=tmp.name, mode='w', encoding='utf=8') as f:
51
58
  f.write(payload)
52
59
  f.flush()
53
- return tmp
60
+ return tmp.file.name
54
61
 
55
62
 
56
- def append_tmp_file(payload, file):
63
+ def append_tmp_file(payload: str, file: str) -> str:
57
64
  """Appends `payload` to an already created file.
58
65
 
59
66
  Use `write_temporary_file` to create a file.
@@ -65,18 +72,26 @@ def append_tmp_file(payload, file):
65
72
  Returns:
66
73
  A file object that was written to.
67
74
  """
68
- with open(file=file.name, mode='a', encoding='utf=8') as f:
75
+ if is_dry_run():
76
+ return file
77
+
78
+ with open(file=file, mode='a', encoding='utf=8') as f:
69
79
  f.write(payload)
70
80
  f.flush()
71
81
  return file
72
82
 
73
83
 
74
- def ensure_directory_exists(directory_path):
84
+ def ensure_directory_exists(directory_path: str) -> None:
75
85
  """Checks if a directory exists and creates it if it doesn't.
76
86
 
77
87
  Args:
78
88
  directory_path: The path to the directory.
79
89
  """
80
- if not os.path.exists(directory_path):
90
+ if not is_dry_run() and not os.path.exists(directory_path):
81
91
  os.makedirs(directory_path)
82
- xpk_print(f"Directory '{directory_path}' created successfully.")
92
+
93
+
94
+ def _hash_filename(seed: str) -> str:
95
+ m = hashlib.sha256()
96
+ m.update(seed.encode('utf-8'))
97
+ return m.hexdigest()
xpk/utils/kueue.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+
18
def is_queued_cluster(num_slices: int) -> bool:
  """Determines if admission checks should be enabled and cluster queued."""
  # Only single-slice (or degenerate zero-slice) clusters are queued.
  return not num_slices > 1