xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. integration/__init__.py +15 -0
  2. integration/docker_manager_test.py +102 -0
  3. integration/gcluster_a3mega_test.py +204 -0
  4. integration/gcluster_a3ultra_test.py +176 -0
  5. integration/gcluster_a4_test.py +176 -0
  6. integration/gcluster_test.py +107 -0
  7. xpk/commands/batch.py +9 -2
  8. xpk/commands/cluster.py +143 -117
  9. xpk/commands/cluster_gcluster.py +81 -14
  10. xpk/commands/cluster_gcluster_test.py +177 -0
  11. xpk/commands/cluster_test.py +92 -0
  12. xpk/commands/common.py +14 -26
  13. xpk/commands/info.py +11 -9
  14. xpk/commands/inspector.py +21 -10
  15. xpk/commands/job.py +25 -9
  16. xpk/commands/kind.py +39 -40
  17. xpk/commands/kjob_common.py +4 -4
  18. xpk/commands/run.py +9 -2
  19. xpk/commands/shell.py +13 -10
  20. xpk/commands/storage.py +21 -0
  21. xpk/commands/version.py +0 -4
  22. xpk/commands/workload.py +84 -29
  23. xpk/commands/workload_test.py +81 -0
  24. xpk/core/blueprint/blueprint_generator.py +4 -40
  25. xpk/core/blueprint/blueprint_test.py +0 -6
  26. xpk/core/blueprint/testing/__init__.py +15 -0
  27. xpk/core/capacity.py +6 -5
  28. xpk/core/cluster.py +91 -194
  29. xpk/core/cluster_private.py +6 -11
  30. xpk/core/commands.py +11 -18
  31. xpk/core/config.py +1 -1
  32. xpk/core/docker_image.py +3 -4
  33. xpk/core/gcloud_context.py +26 -2
  34. xpk/core/gcloud_context_test.py +96 -0
  35. xpk/core/gcluster_manager.py +0 -3
  36. xpk/core/jobset.py +4 -7
  37. xpk/core/kjob.py +14 -27
  38. xpk/core/kueue_manager.py +423 -0
  39. xpk/core/kueue_manager_test.py +574 -0
  40. xpk/core/monitoring.py +1 -1
  41. xpk/core/nap.py +10 -15
  42. xpk/core/network.py +17 -18
  43. xpk/core/nodepool.py +66 -77
  44. xpk/core/nodepool_test.py +198 -1
  45. xpk/core/pathways.py +5 -5
  46. xpk/core/ray.py +10 -14
  47. xpk/core/resources.py +6 -11
  48. xpk/core/scheduling.py +19 -1
  49. xpk/core/scheduling_test.py +31 -0
  50. xpk/core/system_characteristics.py +350 -232
  51. xpk/core/system_characteristics_test.py +73 -0
  52. xpk/core/vertex.py +1 -1
  53. xpk/core/workload.py +7 -8
  54. xpk/main.py +2 -4
  55. xpk/parser/cluster.py +7 -0
  56. xpk/parser/cluster_test.py +66 -0
  57. xpk/parser/common.py +11 -0
  58. xpk/parser/workload.py +62 -25
  59. xpk/parser/workload_test.py +82 -0
  60. xpk/templates/cluster_preheat.yaml.j2 +31 -0
  61. xpk/templates/filestore-pv.yaml +17 -0
  62. xpk/templates/filestore-pvc.yaml +11 -0
  63. xpk/templates/filestore-sc.yaml +10 -0
  64. xpk/templates/fuse-pv.yaml +17 -0
  65. xpk/templates/fuse-pvc.yaml +13 -0
  66. xpk/templates/kueue_config.yaml.j2 +95 -0
  67. xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
  68. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
  69. xpk/templates/mtc-cpc.yaml +15 -0
  70. xpk/templates/volume_bundle.yaml +7 -0
  71. xpk/utils/feature_flags.py +28 -0
  72. xpk/utils/kueue.py +20 -0
  73. xpk/utils/templates.py +15 -0
  74. xpk/utils/topology.py +46 -0
  75. xpk/utils/topology_test.py +63 -0
  76. xpk/utils/validation.py +79 -55
  77. xpk/utils/validation_test.py +37 -0
  78. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
  79. xpk-0.14.1.dist-info/RECORD +133 -0
  80. xpk-0.14.1.dist-info/top_level.txt +2 -0
  81. xpk/core/kueue.py +0 -561
  82. xpk-0.13.0.dist-info/RECORD +0 -101
  83. xpk-0.13.0.dist-info/top_level.txt +0 -1
  84. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
  85. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
  86. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,73 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from .system_characteristics import get_tpu_system_characteristics_map, SystemCharacteristics
18
+
19
+
20
+ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topology():
21
+ result = get_tpu_system_characteristics_map(
22
+ prefix="test",
23
+ tensorcores_per_chip=1,
24
+ gke_accelerator="test",
25
+ machine_type="test",
26
+ supported_topologies=["1x1"],
27
+ supports_sub_slicing=False,
28
+ requires_workload_policy=True,
29
+ )
30
+
31
+ expected_system_characteristics = SystemCharacteristics(
32
+ topology="1x1",
33
+ vms_per_slice=1,
34
+ gke_accelerator="test",
35
+ gce_machine_type="test",
36
+ chips_per_vm=1,
37
+ accelerator_type=1,
38
+ device_type="test-1",
39
+ supports_sub_slicing=False,
40
+ requires_workload_policy=True,
41
+ )
42
+ assert result == {
43
+ "test-1": expected_system_characteristics,
44
+ "test-1x1": expected_system_characteristics,
45
+ }
46
+
47
+
48
+ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topology():
49
+ result = get_tpu_system_characteristics_map(
50
+ prefix="test",
51
+ tensorcores_per_chip=2,
52
+ gke_accelerator="test",
53
+ machine_type="test",
54
+ supported_topologies=["2x2"],
55
+ supports_sub_slicing=False,
56
+ requires_workload_policy=True,
57
+ )
58
+
59
+ expected_system_characteristics = SystemCharacteristics(
60
+ topology="2x2",
61
+ vms_per_slice=1,
62
+ gke_accelerator="test",
63
+ gce_machine_type="test",
64
+ chips_per_vm=4,
65
+ accelerator_type=1,
66
+ device_type="test-8",
67
+ supports_sub_slicing=False,
68
+ requires_workload_policy=True,
69
+ )
70
+ assert result == {
71
+ "test-8": expected_system_characteristics,
72
+ "test-2x2": expected_system_characteristics,
73
+ }
xpk/core/vertex.py CHANGED
@@ -66,7 +66,7 @@ def create_vertex_experiment(args) -> dict | None:
66
66
  )
67
67
 
68
68
  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
69
- cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
69
+ cluster_config_map = get_cluster_configmap(metadata_configmap_name)
70
70
 
71
71
  if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map:
72
72
  xpk_print(
xpk/core/workload.py CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
17
17
  import re
18
18
  from ..utils.console import xpk_exit, xpk_print
19
19
  from .commands import run_command_for_value
20
- from .gcloud_context import zone_to_region
20
+ from .gcloud_context import get_cluster_location
21
21
 
22
22
 
23
23
  def workload_list_awk_command(filter_key) -> str:
@@ -131,7 +131,7 @@ def get_workload_list(args) -> tuple[int, str]:
131
131
  if hasattr(args, 'filter_by_job'):
132
132
  task += f' with filter-by-job={args.filter_by_job}'
133
133
 
134
- return_code, return_value = run_command_for_value(command, task, args)
134
+ return_code, return_value = run_command_for_value(command, task)
135
135
  return return_code, return_value
136
136
 
137
137
 
@@ -152,7 +152,7 @@ def check_if_workload_exists(args) -> bool:
152
152
 
153
153
  command = f"kubectl get workloads -o=custom-columns='{s}'"
154
154
  return_code, return_msg = run_command_for_value(
155
- command, 'Check if Workload Already Exists', args
155
+ command, 'Check if Workload Already Exists'
156
156
  )
157
157
 
158
158
  if return_code != 0:
@@ -186,7 +186,7 @@ def wait_for_job_completion(args) -> int:
186
186
  # Get the full workload name
187
187
  get_workload_name_cmd = f'kubectl get workloads | grep jobset-{args.workload}'
188
188
  return_code, return_value = run_command_for_value(
189
- get_workload_name_cmd, 'Get full workload name', args
189
+ get_workload_name_cmd, 'Get full workload name'
190
190
  )
191
191
  if return_code != 0:
192
192
  xpk_print(f'Get full workload name request returned ERROR {return_code}')
@@ -205,7 +205,6 @@ def wait_for_job_completion(args) -> int:
205
205
  return_code, return_value = run_command_for_value(
206
206
  wait_cmd,
207
207
  f'Wait for workload to finish with timeout of {timeout_msg}',
208
- args,
209
208
  print_timer=True,
210
209
  )
211
210
  if return_code != 0:
@@ -214,7 +213,7 @@ def wait_for_job_completion(args) -> int:
214
213
  f'Timed out waiting for your workload after {timeout_msg}, see your'
215
214
  ' workload here:'
216
215
  # pylint: disable=line-too-long
217
- f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
216
+ f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
218
217
  )
219
218
  return 124
220
219
  else:
@@ -224,14 +223,14 @@ def wait_for_job_completion(args) -> int:
224
223
  xpk_print(
225
224
  'Finished waiting for your workload, see your workload here:'
226
225
  # pylint: disable=line-too-long
227
- f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
226
+ f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
228
227
  )
229
228
  status_cmd = (
230
229
  f'kubectl get jobset {args.workload} -o'
231
230
  " jsonpath='{.status.conditions[-1].type}'"
232
231
  )
233
232
  return_code, return_value = run_command_for_value(
234
- status_cmd, 'Get jobset status', args
233
+ status_cmd, 'Get jobset status'
235
234
  )
236
235
  if return_code != 0:
237
236
  xpk_print(f'Get workload status request returned ERROR {return_code}')
xpk/main.py CHANGED
@@ -36,7 +36,6 @@ import sys
36
36
 
37
37
  from .parser.core import set_parser
38
38
  from .utils.console import xpk_print
39
- from .utils.validation import validate_dependencies
40
39
  from .utils.execution_context import set_dry_run
41
40
  ################### Compatibility Check ###################
42
41
  # Check that the user runs the below version or greater.
@@ -66,9 +65,8 @@ def main() -> None:
66
65
  xpk_print('Starting xpk', flush=True)
67
66
  main_args = parser.parse_args()
68
67
  main_args.enable_ray_cluster = False
69
- set_dry_run('dry_run' in main_args and main_args.dry_run)
70
- if not main_args.dry_run:
71
- validate_dependencies()
68
+ dry_run = 'dry_run' in main_args and main_args.dry_run
69
+ set_dry_run(dry_run)
72
70
  main_args.func(main_args)
73
71
  xpk_print('XPK Done.', flush=True)
74
72
 
xpk/parser/cluster.py CHANGED
@@ -31,6 +31,7 @@ from ..core.config import CFG_BUCKET_KEY
31
31
  from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
32
32
  from .common import add_shared_arguments, ParserOrArgumentGroup
33
33
  from .validators import name_type
34
+ from ..utils.feature_flags import FeatureFlags
34
35
 
35
36
 
36
37
  def set_cluster_parser(cluster_parser: ArgumentParser):
@@ -142,6 +143,12 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
142
143
  ' enable cluster to accept Pathways workloads.'
143
144
  ),
144
145
  )
146
+ if FeatureFlags.SUB_SLICING_ENABLED:
147
+ cluster_create_optional_arguments.add_argument(
148
+ '--sub-slicing',
149
+ action='store_true',
150
+ help='Whether to set up cluster to support sub-slicing',
151
+ )
145
152
 
146
153
  autoprovisioning_arguments = cluster_create_parser.add_argument_group(
147
154
  'Autoprovisioning Arguments',
@@ -0,0 +1,66 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import argparse
18
+ from xpk.parser.cluster import set_cluster_create_parser
19
+ import pytest
20
+ from ..utils.feature_flags import FeatureFlags
21
+
22
+
23
+ @pytest.fixture(autouse=True)
24
+ def with_sub_slicing_enabled():
25
+ FeatureFlags.SUB_SLICING_ENABLED = True
26
+
27
+
28
+ def test_cluster_create_sub_slicing_is_hidden_with_flag_off():
29
+ FeatureFlags.SUB_SLICING_ENABLED = False
30
+ parser = argparse.ArgumentParser()
31
+
32
+ set_cluster_create_parser(parser)
33
+ help_str = parser.format_help()
34
+
35
+ assert "--sub-slicing" not in help_str
36
+
37
+
38
+ def test_cluster_create_sub_slicing_is_shown_with_flag_on():
39
+ parser = argparse.ArgumentParser()
40
+
41
+ set_cluster_create_parser(parser)
42
+ help_str = parser.format_help()
43
+
44
+ assert "--sub-slicing" in help_str
45
+
46
+
47
+ def test_cluster_create_sub_slicing_is_false_by_default():
48
+ parser = argparse.ArgumentParser()
49
+
50
+ set_cluster_create_parser(parser)
51
+ args = parser.parse_args(
52
+ ["--cluster", "test-cluster", "--tpu-type", "test-tpu"]
53
+ )
54
+
55
+ assert args.sub_slicing is False
56
+
57
+
58
+ def test_cluster_create_sub_slicing_can_be_set():
59
+ parser = argparse.ArgumentParser()
60
+
61
+ set_cluster_create_parser(parser)
62
+ args = parser.parse_args(
63
+ ["--cluster", "test-cluster", "--tpu-type", "test-tpu", "--sub-slicing"]
64
+ )
65
+
66
+ assert args.sub_slicing is True
xpk/parser/common.py CHANGED
@@ -62,6 +62,17 @@ def add_shared_arguments(
62
62
  ),
63
63
  required=required,
64
64
  )
65
+ custom_parser_or_group.add_argument(
66
+ '--skip-validation',
67
+ type=bool,
68
+ action=argparse.BooleanOptionalAction,
69
+ default=False,
70
+ help=(
71
+ 'Skip dependency validation checks (kubectl, gcloud, docker, etc). '
72
+ 'Independent of --dry-run.'
73
+ ),
74
+ required=required,
75
+ )
65
76
 
66
77
 
67
78
  def add_cluster_arguments(
xpk/parser/workload.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ from argparse import ArgumentParser
17
18
  from ..commands.workload import (
18
19
  workload_create,
19
20
  workload_create_pathways,
@@ -23,9 +24,10 @@ from ..commands.workload import (
23
24
  from ..core.docker_image import DEFAULT_DOCKER_IMAGE, DEFAULT_SCRIPT_DIR
24
25
  from .common import add_shared_arguments
25
26
  from .validators import directory_path_type, name_type
27
+ from ..utils.feature_flags import FeatureFlags
26
28
 
27
29
 
28
- def set_workload_parsers(workload_parser):
30
+ def set_workload_parsers(workload_parser: ArgumentParser):
29
31
  workload_subcommands = workload_parser.add_subparsers(
30
32
  title='workload subcommands',
31
33
  dest='xpk_workload_subcommands',
@@ -39,6 +41,28 @@ def set_workload_parsers(workload_parser):
39
41
  workload_create_parser = workload_subcommands.add_parser(
40
42
  'create', help='Create a new job.'
41
43
  )
44
+ set_workload_create_parser(workload_create_parser)
45
+
46
+ # "workload create-pathways" command parser.
47
+ workload_create_pathways_parser = workload_subcommands.add_parser(
48
+ 'create-pathways', help='Create a new job.'
49
+ )
50
+ set_workload_create_pathways_parser(workload_create_pathways_parser)
51
+
52
+ # "workload delete" command parser.
53
+ workload_delete_parser = workload_subcommands.add_parser(
54
+ 'delete', help='Delete job.'
55
+ )
56
+ set_workload_delete_parser(workload_delete_parser)
57
+
58
+ # "workload list" command parser.
59
+ workload_list_parser = workload_subcommands.add_parser(
60
+ 'list', help='List jobs.'
61
+ )
62
+ set_workload_list_parser(workload_list_parser)
63
+
64
+
65
+ def set_workload_create_parser(workload_create_parser: ArgumentParser):
42
66
  workload_create_parser_required_arguments = (
43
67
  workload_create_parser.add_argument_group(
44
68
  'Workload Built-in Arguments',
@@ -193,10 +217,33 @@ def set_workload_parsers(workload_parser):
193
217
  ),
194
218
  )
195
219
 
196
- # "workload create-pathways" command parser.
197
- workload_create_pathways_parser = workload_subcommands.add_parser(
198
- 'create-pathways', help='Create a new job.'
199
- )
220
+ add_shared_workload_create_required_arguments([
221
+ workload_create_parser_required_arguments,
222
+ ])
223
+ add_shared_workload_create_optional_arguments([
224
+ workload_create_parser_optional_arguments,
225
+ ])
226
+ add_shared_workload_create_env_arguments([
227
+ workload_create_parser_optional_arguments,
228
+ ])
229
+ add_shared_workload_base_docker_image_arguments([
230
+ workload_base_docker_image_arguments,
231
+ ])
232
+ add_shared_workload_docker_image_arguments([
233
+ workload_docker_image_arguments,
234
+ ])
235
+ add_shared_workload_create_tensorboard_arguments([
236
+ workload_vertex_tensorboard_arguments,
237
+ ])
238
+ add_shared_workload_create_autoprovisioning_arguments([
239
+ workload_create_autoprovisioning_arguments,
240
+ ])
241
+ workload_create_parser.set_defaults(func=workload_create)
242
+
243
+
244
+ def set_workload_create_pathways_parser(
245
+ workload_create_pathways_parser: ArgumentParser,
246
+ ):
200
247
  workload_create_pathways_parser_required_arguments = (
201
248
  workload_create_pathways_parser.add_argument_group(
202
249
  'Workload create-pathways Built-in Arguments',
@@ -232,7 +279,6 @@ def set_workload_parsers(workload_parser):
232
279
  'Arguments for creating Vertex AI Experiment in workload create.',
233
280
  )
234
281
  )
235
-
236
282
  ### "workload create-pathways" Required arguments, specific to Pathways
237
283
  workload_create_pathways_parser_required_arguments.add_argument(
238
284
  '--tpu-type',
@@ -353,42 +399,30 @@ def set_workload_parsers(workload_parser):
353
399
  )
354
400
 
355
401
  add_shared_workload_create_required_arguments([
356
- workload_create_parser_required_arguments,
357
402
  workload_create_pathways_parser_required_arguments,
358
403
  ])
359
404
  add_shared_workload_create_optional_arguments([
360
- workload_create_parser_optional_arguments,
361
405
  workload_create_pathways_parser_optional_arguments,
362
406
  ])
363
407
  add_shared_workload_create_env_arguments([
364
- workload_create_parser_optional_arguments,
365
408
  workload_create_pathways_parser_optional_arguments,
366
409
  ])
367
410
  add_shared_workload_base_docker_image_arguments([
368
- workload_base_docker_image_arguments,
369
411
  workload_create_pathways_base_docker_image_arguments,
370
412
  ])
371
413
  add_shared_workload_docker_image_arguments([
372
- workload_docker_image_arguments,
373
414
  workload_create_pathways_docker_image_arguments,
374
415
  ])
375
416
  add_shared_workload_create_tensorboard_arguments([
376
- workload_vertex_tensorboard_arguments,
377
417
  workload_create_pathways_vertex_tensorboard_arguments,
378
418
  ])
379
419
  add_shared_workload_create_autoprovisioning_arguments([
380
- workload_create_autoprovisioning_arguments,
381
420
  workload_create_pathways_autoprovisioning_arguments,
382
421
  ])
383
-
384
- # Set defaults for both workload create and workload create-pathways after adding all shared args.
385
- workload_create_parser.set_defaults(func=workload_create)
386
422
  workload_create_pathways_parser.set_defaults(func=workload_create_pathways)
387
423
 
388
- # "workload delete" command parser.
389
- workload_delete_parser = workload_subcommands.add_parser(
390
- 'delete', help='Delete job.'
391
- )
424
+
425
+ def set_workload_delete_parser(workload_delete_parser: ArgumentParser):
392
426
  workload_delete_parser_required_arguments = (
393
427
  workload_delete_parser.add_argument_group(
394
428
  'Required Arguments',
@@ -454,14 +488,10 @@ def set_workload_parsers(workload_parser):
454
488
  'Forces workload deletion command to run without additional approval.'
455
489
  ),
456
490
  )
457
-
458
491
  workload_delete_parser.set_defaults(func=workload_delete)
459
492
 
460
- # "workload list" command parser.
461
- workload_list_parser = workload_subcommands.add_parser(
462
- 'list', help='List jobs.'
463
- )
464
493
 
494
+ def set_workload_list_parser(workload_list_parser: ArgumentParser):
465
495
  workload_list_parser.add_argument(
466
496
  '--cluster',
467
497
  type=name_type,
@@ -629,6 +659,13 @@ def add_shared_workload_create_optional_arguments(args_parsers):
629
659
  ' the workload.'
630
660
  ),
631
661
  )
662
+ if FeatureFlags.SUB_SLICING_ENABLED:
663
+ custom_parser.add_argument(
664
+ '--sub-slicing-topology',
665
+ type=str,
666
+ help='Sub-slicing topology to use.',
667
+ required=False,
668
+ )
632
669
 
633
670
 
634
671
  def add_shared_workload_create_env_arguments(args_parsers):
@@ -0,0 +1,82 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import argparse
18
+ from xpk.parser.workload import set_workload_create_parser
19
+ from ..utils.feature_flags import FeatureFlags
20
+ import pytest
21
+
22
+
23
+ @pytest.fixture(autouse=True)
24
+ def with_sub_slicing_enabled():
25
+ FeatureFlags.SUB_SLICING_ENABLED = True
26
+
27
+
28
+ def test_workload_create_sub_slicing_topology_is_hidden_with_flag_off():
29
+ FeatureFlags.SUB_SLICING_ENABLED = False
30
+ parser = argparse.ArgumentParser()
31
+
32
+ set_workload_create_parser(parser)
33
+ help_str = parser.format_help()
34
+
35
+ assert "--sub-slicing" not in help_str
36
+
37
+
38
+ def test_workload_create_sub_slicing_topology_is_shown_with_flag_on():
39
+ parser = argparse.ArgumentParser()
40
+
41
+ set_workload_create_parser(parser)
42
+ help_str = parser.format_help()
43
+
44
+ assert "--sub-slicing" in help_str
45
+
46
+
47
+ def test_workload_create_sub_slicing_topology_is_none_by_default():
48
+ parser = argparse.ArgumentParser()
49
+
50
+ set_workload_create_parser(parser)
51
+ args = parser.parse_args([
52
+ "--cluster",
53
+ "test-cluster",
54
+ "--command",
55
+ "python3",
56
+ "--workload",
57
+ "test",
58
+ "--tpu-type",
59
+ "test-tpu",
60
+ ])
61
+
62
+ assert args.sub_slicing_topology is None
63
+
64
+
65
+ def test_workload_create_sub_slicing_topology_can_be_set():
66
+ parser = argparse.ArgumentParser()
67
+
68
+ set_workload_create_parser(parser)
69
+ args = parser.parse_args([
70
+ "--cluster",
71
+ "test-cluster",
72
+ "--command",
73
+ "python3",
74
+ "--workload",
75
+ "test",
76
+ "--tpu-type",
77
+ "test-tpu",
78
+ "--sub-slicing-topology",
79
+ "2x2",
80
+ ])
81
+
82
+ assert args.sub_slicing_topology == "2x2"
@@ -0,0 +1,31 @@
1
+ apiVersion: apps/v1
2
+ kind: DaemonSet
3
+ metadata:
4
+ name: {{ cachekey }}
5
+ labels:
6
+ k8s-app: {{ cachekey }}
7
+ spec:
8
+ selector:
9
+ matchLabels:
10
+ k8s-app: {{ cachekey }}
11
+ updateStrategy:
12
+ type: RollingUpdate
13
+ template:
14
+ metadata:
15
+ labels:
16
+ name: {{ cachekey }}
17
+ k8s-app: {{ cachekey }}
18
+ spec:
19
+ affinity:
20
+ nodeAffinity:
21
+ requiredDuringSchedulingIgnoredDuringExecution:
22
+ nodeSelectorTerms:
23
+ - matchExpressions:
24
+ - key: {{ nodeSelectorKey }}
25
+ operator: Exists
26
+ tolerations:
27
+ - operator: "Exists"
28
+ containers:
29
+ - image: {{ image_name }}
30
+ name: {{ cachekey }}
31
+ command: [ "sleep", "inf" ]
@@ -0,0 +1,17 @@
1
+ apiVersion: v1
2
+ kind: PersistentVolume
3
+ metadata:
4
+ name: xpk-filestore-pv
5
+ spec:
6
+ storageClassName:
7
+ capacity:
8
+ storage:
9
+ accessModes:
10
+ persistentVolumeReclaimPolicy: Retain
11
+ volumeMode: Filesystem
12
+ csi:
13
+ driver: filestore.csi.storage.gke.io
14
+ volumeHandle:
15
+ volumeAttributes:
16
+ ip:
17
+ volume:
@@ -0,0 +1,11 @@
1
+ kind: PersistentVolumeClaim
2
+ apiVersion: v1
3
+ metadata:
4
+ name:
5
+ spec:
6
+ accessModes:
7
+ storageClassName:
8
+ volumeName:
9
+ resources:
10
+ requests:
11
+ storage:
@@ -0,0 +1,10 @@
1
+ apiVersion: storage.k8s.io/v1
2
+ kind: StorageClass
3
+ metadata:
4
+ name:
5
+ provisioner: filestore.csi.storage.gke.io
6
+ volumeBindingMode: Immediate
7
+ allowVolumeExpansion: true
8
+ parameters:
9
+ tier: standard
10
+ network: default
@@ -0,0 +1,17 @@
1
+ apiVersion: v1
2
+ kind: PersistentVolume
3
+ metadata:
4
+ name:
5
+ spec:
6
+ accessModes:
7
+ - ReadWriteMany
8
+ capacity:
9
+ storage:
10
+ storageClassName: example-storage-class
11
+ mountOptions:
12
+ - implicit-dirs
13
+ csi:
14
+ driver: gcsfuse.csi.storage.gke.io
15
+ volumeHandle:
16
+ volumeAttributes:
17
+ gcsfuseLoggingSeverity: warning
@@ -0,0 +1,13 @@
1
+ apiVersion: v1
2
+ kind: PersistentVolumeClaim
3
+ metadata:
4
+ name:
5
+ namespace: default
6
+ spec:
7
+ accessModes:
8
+ - ReadWriteMany
9
+ resources:
10
+ requests:
11
+ storage:
12
+ volumeName:
13
+ storageClassName: example-storage-class