xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. integration/__init__.py +15 -0
  2. integration/docker_manager_test.py +102 -0
  3. integration/gcluster_a3mega_test.py +204 -0
  4. integration/gcluster_a3ultra_test.py +176 -0
  5. integration/gcluster_a4_test.py +176 -0
  6. integration/gcluster_test.py +107 -0
  7. xpk/commands/batch.py +9 -2
  8. xpk/commands/cluster.py +143 -117
  9. xpk/commands/cluster_gcluster.py +81 -14
  10. xpk/commands/cluster_gcluster_test.py +177 -0
  11. xpk/commands/cluster_test.py +92 -0
  12. xpk/commands/common.py +14 -26
  13. xpk/commands/info.py +11 -9
  14. xpk/commands/inspector.py +21 -10
  15. xpk/commands/job.py +25 -9
  16. xpk/commands/kind.py +39 -40
  17. xpk/commands/kjob_common.py +4 -4
  18. xpk/commands/run.py +9 -2
  19. xpk/commands/shell.py +13 -10
  20. xpk/commands/storage.py +21 -0
  21. xpk/commands/version.py +0 -4
  22. xpk/commands/workload.py +84 -29
  23. xpk/commands/workload_test.py +81 -0
  24. xpk/core/blueprint/blueprint_generator.py +4 -40
  25. xpk/core/blueprint/blueprint_test.py +0 -6
  26. xpk/core/blueprint/testing/__init__.py +15 -0
  27. xpk/core/capacity.py +6 -5
  28. xpk/core/cluster.py +91 -194
  29. xpk/core/cluster_private.py +6 -11
  30. xpk/core/commands.py +11 -18
  31. xpk/core/config.py +1 -1
  32. xpk/core/docker_image.py +3 -4
  33. xpk/core/gcloud_context.py +26 -2
  34. xpk/core/gcloud_context_test.py +96 -0
  35. xpk/core/gcluster_manager.py +0 -3
  36. xpk/core/jobset.py +4 -7
  37. xpk/core/kjob.py +14 -27
  38. xpk/core/kueue_manager.py +423 -0
  39. xpk/core/kueue_manager_test.py +574 -0
  40. xpk/core/monitoring.py +1 -1
  41. xpk/core/nap.py +10 -15
  42. xpk/core/network.py +17 -18
  43. xpk/core/nodepool.py +66 -77
  44. xpk/core/nodepool_test.py +198 -1
  45. xpk/core/pathways.py +5 -5
  46. xpk/core/ray.py +10 -14
  47. xpk/core/resources.py +6 -11
  48. xpk/core/scheduling.py +19 -1
  49. xpk/core/scheduling_test.py +31 -0
  50. xpk/core/system_characteristics.py +350 -232
  51. xpk/core/system_characteristics_test.py +73 -0
  52. xpk/core/vertex.py +1 -1
  53. xpk/core/workload.py +7 -8
  54. xpk/main.py +2 -4
  55. xpk/parser/cluster.py +7 -0
  56. xpk/parser/cluster_test.py +66 -0
  57. xpk/parser/common.py +11 -0
  58. xpk/parser/workload.py +62 -25
  59. xpk/parser/workload_test.py +82 -0
  60. xpk/templates/cluster_preheat.yaml.j2 +31 -0
  61. xpk/templates/filestore-pv.yaml +17 -0
  62. xpk/templates/filestore-pvc.yaml +11 -0
  63. xpk/templates/filestore-sc.yaml +10 -0
  64. xpk/templates/fuse-pv.yaml +17 -0
  65. xpk/templates/fuse-pvc.yaml +13 -0
  66. xpk/templates/kueue_config.yaml.j2 +95 -0
  67. xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
  68. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
  69. xpk/templates/mtc-cpc.yaml +15 -0
  70. xpk/templates/volume_bundle.yaml +7 -0
  71. xpk/utils/feature_flags.py +28 -0
  72. xpk/utils/kueue.py +20 -0
  73. xpk/utils/templates.py +15 -0
  74. xpk/utils/topology.py +46 -0
  75. xpk/utils/topology_test.py +63 -0
  76. xpk/utils/validation.py +79 -55
  77. xpk/utils/validation_test.py +37 -0
  78. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
  79. xpk-0.14.1.dist-info/RECORD +133 -0
  80. xpk-0.14.1.dist-info/top_level.txt +2 -0
  81. xpk/core/kueue.py +0 -561
  82. xpk-0.13.0.dist-info/RECORD +0 -101
  83. xpk-0.13.0.dist-info/top_level.txt +0 -1
  84. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
  85. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
  86. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
xpk/commands/workload.py CHANGED
@@ -34,7 +34,7 @@ from ..core.docker_container import (
 )
 from ..core.docker_resources import get_volumes, parse_env_config
 from ..core.gcloud_context import add_zone_and_project
-from ..core.kueue import LOCAL_QUEUE_NAME
+from ..core.kueue_manager import LOCAL_QUEUE_NAME
 from ..core.monitoring import get_gke_outlier_dashboard
 from ..core.nap import (
     get_autoprovisioning_node_selector_args,
@@ -52,10 +52,7 @@ from ..core.pathways import (
     get_user_workload_for_pathways,
     try_to_delete_pathwaysjob_first,
 )
-from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
-from ..core.capacity import (
-    CapacityType,
-)
+from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics, SystemCharacteristics
 from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
 from ..core.scheduling import (
     check_if_workload_can_schedule,
@@ -65,6 +62,7 @@ from ..core.scheduling import (
     create_tpu_topology,
     get_cpu_affinity,
     get_gpu_scheduler,
+    create_sub_slicing_annotations,
 )
 from ..core.storage import (
     GCE_PD_TYPE,
@@ -80,6 +78,7 @@ from ..core.storage import (
 from ..core.system_characteristics import (
     AcceleratorType,
     get_system_characteristics,
+    compute_vms_per_slice,
 )
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
@@ -87,7 +86,7 @@ from ..core.workload import (
     get_jobsets_list_gcp_link,
     get_workload_list,
     wait_for_job_completion,
-    zone_to_region,
+    get_cluster_location,
 )
 from ..core.workload_decorators import (
     rdma_decorator,
@@ -98,8 +97,11 @@ from ..core.workload_decorators import (
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from ..utils.execution_context import is_dry_run
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from . import cluster_gcluster
-from .common import is_TAS_possible
+from .common import is_TAS_possible, validate_sub_slicing_system
+from ..utils.topology import is_topology_contained
+from ..utils.feature_flags import FeatureFlags

 WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
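FeatureFlags is imported from the new xpk/utils/feature_flags.py (+28 lines), which this diff view does not show. A minimal sketch of the shape such a module could take, assuming environment-variable gating (the variable name XPK_SUB_SLICING_ENABLED is a guess, not the shipped interface):

    import os

    def _env_flag(name: str, default: str = 'false') -> bool:
      # Interpret common truthy spellings; anything else counts as disabled.
      return os.environ.get(name, default).strip().lower() in ('1', 'true', 'yes')

    class FeatureFlags:
      # Gates the sub-slicing code paths referenced throughout workload.py.
      SUB_SLICING_ENABLED: bool = _env_flag('XPK_SUB_SLICING_ENABLED')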
@@ -120,8 +122,8 @@ spec:
       replicas: {args.num_slices}
       template:
         spec:
-          parallelism: {system.vms_per_slice} # Equal to the number of VMs per slice
-          completions: {system.vms_per_slice} # Same as the above.
+          parallelism: {vms_per_slice} # Equal to the number of VMs per slice (or sub-slice).
+          completions: {vms_per_slice} # Same as the above.
           backoffLimit: 0 # When any pod fails, the job is failed
           {pod_failure_policy}
           template:
@@ -130,6 +132,7 @@ spec:
                 xpk.google.com/workload: {args.workload}
               annotations:
                 {storage_annotations}
+                {sub_slicing_annotations}
             spec:
               schedulerName: {args.scheduler}
               imagePullSecrets:
@@ -267,6 +270,8 @@ PW_WORKLOAD_CREATE_YAML = """
       maxSliceRestarts: {args.max_slice_restarts}
       terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
       priorityClassName: {args.priority}
+      nodeSelector:
+        {autoprovisioning_args}
       pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
       controller:
         # #Pod template for training, default mode.
@@ -277,6 +282,8 @@ PW_WORKLOAD_CREATE_YAML = """
   {user_workload}
 """

+SUB_SLICING_TOPOLOGIES = ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
+

 def workload_create_pathways(args) -> None:
   """Run jobset apply command for a file, specifically for Pathways.
@@ -307,6 +314,12 @@ def workload_create(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.GCLOUD,
+        SystemDependency.DOCKER,
+    ])
   k8s_api_client = None
   if not is_dry_run():
     k8s_api_client = setup_k8s_env(args)
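validate_dependencies_list and should_validate_dependencies come from the reworked xpk/utils/validation.py (+79 -55), not shown in this diff. A plausible minimal sketch of the pattern, assuming a simple PATH lookup per tool (the shipped helpers may check versions and print richer guidance):

    import enum
    import shutil
    import sys

    class SystemDependency(enum.Enum):
      KUBECTL = 'kubectl'
      GCLOUD = 'gcloud'
      DOCKER = 'docker'

    def should_validate_dependencies(args) -> bool:
      # Assumption: validation is skipped for dry runs; the real predicate is unknown.
      return not getattr(args, 'dry_run', False)

    def validate_dependencies_list(deps: list) -> None:
      missing = [dep.value for dep in deps if shutil.which(dep.value) is None]
      if missing:
        print(f'Missing required command-line tools: {", ".join(missing)}')
        sys.exit(1)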
@@ -321,20 +334,21 @@ def workload_create(args) -> None:
     )
     xpk_exit(1)

-  xpk_print('Starting workload create', flush=True)
   system, return_code = get_system_characteristics(args)
-
   if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)

+  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None:
+    _validate_sub_slicing_topology(system, args.sub_slicing_topology)
+
   if not check_if_workload_can_schedule(args, system):
     xpk_exit(1)

   xpk_print('Starting workload create', flush=True)

   metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
+  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
   cluster_xpk_version = None
   if cluster_config_map is None:
     xpk_print(
@@ -482,16 +496,12 @@ def workload_create(args) -> None:
   capacity_type = get_cluster_capacity_type(args)

   annotations = (
-      ''
-      if not is_TAS_possible(
-          system_characteristics,
-          capacity_type,
-          flex=True if capacity_type == CapacityType.FLEX_START else False,
-      )
-      else (
+      (
           'kueue.x-k8s.io/podset-preferred-topology:'
           ' "cloud.google.com/gce-topology-host"'
       )
+      if is_TAS_possible(system_characteristics, capacity_type)
+      else ''
   )

   if (
@@ -507,7 +517,7 @@ def workload_create(args) -> None:
       annotations=annotations,
   )

-  sub_networks = get_cluster_subnetworks(args)
+  sub_networks = get_cluster_subnetworks()
   if args.device_type == a3high_device_type:
     yml_string = tcpx_decorator.decorate_jobset(yml_string)
   elif args.device_type == a3mega_device_type:
@@ -545,6 +555,7 @@ def workload_create(args) -> None:
         colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
         user_workload=get_user_workload_for_pathways(args, system),
         local_queue_name=LOCAL_QUEUE_NAME,
+        autoprovisioning_args=autoprovisioning_args,
     )
   else:
     container, debugging_dashboard_id = get_user_workload_container(
@@ -552,12 +563,26 @@ def workload_create(args) -> None:
     )
     yml_string = WORKLOAD_CREATE_YAML.format(
         args=args,
-        system=system,
         container=container,
+        vms_per_slice=(
+            compute_vms_per_slice(args.sub_slicing_topology)
+            if system.accelerator_type == AcceleratorType['TPU']
+            and FeatureFlags.SUB_SLICING_ENABLED
+            and args.sub_slicing_topology is not None
+            else system.vms_per_slice
+        ),
         affinity=get_cpu_affinity(system.accelerator_type),
         accelerator_label=create_accelerator_label(
             system.accelerator_type, system
         ),
+        sub_slicing_annotations=(
+            ''
+            if not FeatureFlags.SUB_SLICING_ENABLED
+            or args.sub_slicing_topology is None
+            else ('\n' + (' ' * 16)).join(
+                create_sub_slicing_annotations(args.sub_slicing_topology)
+            )
+        ),
         machine_label=create_machine_label(system.accelerator_type, system),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
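The join with '\n' + 16 spaces exists so each generated annotation lines up under the {sub_slicing_annotations} placeholder, which sits at a 16-space indent inside WORKLOAD_CREATE_YAML. A quick illustration (the annotation keys here are placeholders, not necessarily what create_sub_slicing_annotations emits):

    annotations = [
        'example.com/sub-slice-topology: "4x4"',
        'example.com/sub-slice-mode: "enabled"',
    ]
    # The template supplies the indent for the first line; the join indents the rest.
    print(('\n' + ' ' * 16).join(annotations))
    # example.com/sub-slice-topology: "4x4"
    #                 example.com/sub-slice-mode: "enabled"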
@@ -575,7 +600,7 @@ def workload_create(args) -> None:
     )
   tmp = write_tmp_file(yml_string)
   command = f'kubectl apply -f {str(tmp)}'
-  return_code = run_command_with_updates(command, 'Creating Workload', args)
+  return_code = run_command_with_updates(command, 'Creating Workload')

   if return_code != 0:
     xpk_print(f'Create Workload request returned ERROR {return_code}')
@@ -622,7 +647,9 @@ def workload_create(args) -> None:
       ' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'
       " python -c 'import pathwaysutils; import jax; print(jax.devices())'"
   )
-  pathways_proxy_link = f'https://console.cloud.google.com/kubernetes/job/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
+  pathways_proxy_link = (
+      f'https://console.cloud.google.com/kubernetes/job/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
+  )
   xpk_print(
       'Follow the proxy here:'
       # pylint: disable=line-too-long)
@@ -636,7 +663,7 @@ def workload_create(args) -> None:
   xpk_print(
       'Follow your workload here:'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
+      f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
   )
   duration_of_logs = 'P1D' # Past 1 Day
   xpk_print(
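These console links previously derived a region from the zone string alone; get_cluster_location additionally takes the project and cluster, presumably so it can return the cluster's actual location whether the cluster is zonal or regional. For reference, the old mapping was a pure suffix strip (sketch, assuming standard GCP zone names such as us-central1-a):

    def zone_to_region(zone: str) -> str:
      # A GCP zone is '<region>-<zone-letter>'; drop the final component.
      return zone.rsplit('-', 1)[0]

    assert zone_to_region('us-central1-a') == 'us-central1'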
@@ -645,12 +672,35 @@ def workload_create(args) -> None:
       ' ([prefix]-slice-job-[slice_number]-[worker_number])'
       ' after clicking the url if you want other worker logs.'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{zone_to_region(args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
+      f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{get_cluster_location(args.project, args.cluster, args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
   )

   xpk_exit(0)


+def _validate_sub_slicing_topology(
+    system_characteristics: SystemCharacteristics, sub_slicing_topology: str
+) -> None:
+  if sub_slicing_topology not in SUB_SLICING_TOPOLOGIES:
+    xpk_print(
+        f'Error: --sub-slicing-topology={sub_slicing_topology} shape is'
+        f' invalid. It has to be one of: {", ".join(SUB_SLICING_TOPOLOGIES)}.'
+    )
+    xpk_exit(1)
+
+  if not is_topology_contained(
+      contained=sub_slicing_topology, container=system_characteristics.topology
+  ):
+    xpk_print(
+        f'Error: --sub-slicing-topology={sub_slicing_topology} shape is too'
+        ' large. The shape cannot be bigger than'
+        f' {system_characteristics.topology}.'
+    )
+    xpk_exit(1)
+
+  validate_sub_slicing_system(system_characteristics)
+
+
 def get_restart_exit_codes(args) -> list:
   exit_codes = [42]
   exit_codes.extend(range(127, 256, 1))
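is_topology_contained lives in the new xpk/utils/topology.py (+46 lines), which this diff omits. One reading consistent with the calls above (contained='4x4' inside container='8x8') is a per-dimension comparison of 'AxB[xC]' shapes; a sketch under that assumption:

    def is_topology_contained(contained: str, container: str) -> bool:
      # Hypothetical: the packaged helper may normalize or reorder dimensions.
      inner = [int(dim) for dim in contained.split('x')]
      outer = [int(dim) for dim in container.split('x')]
      return len(inner) == len(outer) and all(
          i <= o for i, o in zip(inner, outer)
      )

    assert is_topology_contained(contained='4x4', container='8x8')
    assert not is_topology_contained(contained='16x16', container='8x8')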
@@ -678,6 +728,10 @@ def workload_delete(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print('Starting Workload delete', flush=True)
   add_zone_and_project(args)
   get_cluster_credentials(args)
@@ -725,16 +779,13 @@ def workload_delete(args) -> None:

   # Not batching deletion for single workload
   if len(workloads) == 1:
-    return_code = run_command_with_updates(
-        commands[0], 'Delete Workload', args
-    )
+    return_code = run_command_with_updates(commands[0], 'Delete Workload')
   else:
     return_code = run_commands(
         commands,
         'Delete Workload',
         task_names,
         batch=100,
-        dry_run=args.dry_run,
     )

   if return_code != 0:
@@ -752,6 +803,10 @@ def workload_list(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print('Starting workload list', flush=True)
   add_zone_and_project(args)
   get_cluster_credentials(args)
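A change that recurs across these hunks: run_command_with_updates, run_command_for_value, and run_commands no longer take args/dry_run parameters. Together with the is_dry_run import added above, this suggests dry-run state now lives in a process-wide execution context, roughly like the following sketch (assuming xpk/utils/execution_context.py pairs is_dry_run with a setter; the actual module is not shown in this diff):

    _dry_run = False

    def set_dry_run(value: bool) -> None:
      # Called once during argument parsing instead of threading args everywhere.
      global _dry_run
      _dry_run = value

    def is_dry_run() -> bool:
      return _dry_run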
xpk/commands/workload_test.py ADDED
@@ -0,0 +1,81 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import dataclasses
+from unittest.mock import MagicMock, patch
+import pytest
+from ..core.system_characteristics import SystemCharacteristics
+from .workload import _validate_sub_slicing_topology
+
+
+SYSTEM_CHARACTERISTICS = SystemCharacteristics(
+    topology='8x8',
+    vms_per_slice=1,
+    gke_accelerator='nvidia-l4',
+    gce_machine_type='g2-standard-12',
+    chips_per_vm=1,
+    accelerator_type=1,
+    device_type='l4-1',
+    supports_sub_slicing=True,
+    requires_workload_policy=False,
+)
+
+
+@pytest.fixture(autouse=True)
+def xpk_print(mocker):
+  return mocker.patch('xpk.commands.workload.xpk_print')
+
+
+def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
+    xpk_print,
+):
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '2x1')
+
+  assert (
+      'shape is invalid. It has to be one of' in xpk_print.mock_calls[0].args[0]
+  )
+
+
+def test_validate_sub_slicing_topology_exits_for_too_large_topology(xpk_print):
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '16x16')
+
+  assert (
+      'shape is too large. The shape cannot be'
+      in xpk_print.mock_calls[0].args[0]
+  )
+
+
+def test_validate_sub_slicing_topology_does_nothing_for_supported_topology():
+  _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '4x4')
+
+
+@patch('xpk.commands.common.xpk_print')
+def test_validate_sub_slicing_topology_fails_for_unsupported_system(
+    common_xpk_print: MagicMock,
+):
+  unsupported_system = dataclasses.replace(
+      SYSTEM_CHARACTERISTICS, supports_sub_slicing=False
+  )
+
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_topology(unsupported_system, '4x4')
+
+  assert (
+      'l4-1 does not support Sub-slicing.'
+      in common_xpk_print.mock_calls[0].args[0]
+  )
xpk/core/blueprint/blueprint_generator.py CHANGED
@@ -32,7 +32,6 @@ from ..capacity import (
 )
 from ..system_characteristics import get_system_characteristics_by_device_type
 from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
-from ..kueue import KUEUE_VERSION

 yaml_parser = yaml.YAML()

@@ -53,6 +52,7 @@ blueprint_dependencies_dir = {

 cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
 cluster_toolkit_version = "v1.62.2"
+common_cluster_labels = {"gke_product_type": "xpk"}


 class BlueprintGeneratorOutput:
@@ -216,26 +216,11 @@ class BlueprintGenerator:
     a3_megagpu_pool_0.settings.update({"static_node_count": num_nodes})

     set_placement_policy = capacity_type != CapacityType.SPOT
-    num_chips = num_nodes * system.chips_per_vm
     workload = DeploymentModule(
         id="workload_component_install",
         source="modules/management/kubectl-apply",
         use=["gke_cluster"],
         settings={
-            "kueue": {
-                "install": True,
-                "version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
-                "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
-                "config_template_vars": {
-                    "num_chips": num_chips,
-                    "reservation": (
-                        1 if capacity_type == CapacityType.RESERVATION else 0
-                    ),
-                    "flex_start": (
-                        1 if capacity_type == CapacityType.FLEX_START else 0
-                    ),
-                },
-            },
             "jobset": {"install": True, "version": "v0.7.2"},
             "apply_manifests": [{
                 "source": f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
@@ -298,6 +283,7 @@ class BlueprintGenerator:
             "deployment_name": blueprint_name,
             "region": region,
             "zone": zone,
+            "labels": common_cluster_labels,
         },
     )

@@ -598,24 +584,12 @@ class BlueprintGenerator:
     else:
       gpu_pool.settings.update({"static_node_count": num_nodes})

-    num_chips = num_nodes * system.chips_per_vm
     workload_manager_install_id = "workload-manager-install"
     workload_manager_install = DeploymentModule(
         id=workload_manager_install_id,
         source="modules/management/kubectl-apply",
         use=[cluster_id],
         settings={
-            "kueue": {
-                "install": True,
-                "version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
-                "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
-                "config_template_vars": {
-                    "num_chips": num_chips,
-                    "flex_start": (
-                        1 if capacity_type == CapacityType.FLEX_START else 0
-                    ),
-                },
-            },
             "jobset": {"install": True, "version": "v0.7.2"},
             "apply_manifests": [
                 {"source": nccl_installer_path},
@@ -676,6 +650,7 @@ class BlueprintGenerator:
             "deployment_name": blueprint_name,
             "region": region,
             "zone": zone,
+            "labels": common_cluster_labels,
         },
     )

@@ -884,24 +859,12 @@ class BlueprintGenerator:
     else:
       gpu_pool.settings.update({"static_node_count": num_nodes})

-    num_chips = num_nodes * system.chips_per_vm
     workload_manager_install_id = "workload-manager-install"
     workload_manager_install = DeploymentModule(
         id=workload_manager_install_id,
         source="modules/management/kubectl-apply",
         use=[cluster_id],
         settings={
-            "kueue": {
-                "install": True,
-                "version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
-                "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
-                "config_template_vars": {
-                    "num_chips": num_chips,
-                    "flex_start": (
-                        1 if capacity_type == CapacityType.FLEX_START else 0
-                    ),
-                },
-            },
             "jobset": {"install": True, "version": "v0.7.2"},
             "apply_manifests": [
                 {"source": nccl_installer_path},
@@ -962,6 +925,7 @@ class BlueprintGenerator:
             "deployment_name": blueprint_name,
             "region": region,
             "zone": zone,
+            "labels": common_cluster_labels,
         },
     )

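With common_cluster_labels attached to every gcluster deployment, clusters provisioned through xpk can be found by label, for example (illustrative gcloud invocation):

    gcloud container clusters list --filter='resourceLabels.gke_product_type=xpk'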
xpk/core/blueprint/blueprint_test.py CHANGED
@@ -32,7 +32,6 @@ a3_spot_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml"
 a3_ultra_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_ultra.yaml"
 a4_yaml_test_path = "src/xpk/core/blueprint/testing/data/a4.yaml"
 config_map_filename = "config-map.yaml.tftpl"
-kueue_conf_filename = "kueue-xpk-configuration.yaml.tftpl"
 tmp_test_dir = "/tmp/xpk_test"

@@ -82,11 +81,6 @@ def test_generate_a3_mega_blueprint():
           tmp_test_dir, "prefix", blueprint_name, config_map_filename
       )
   )
-  assert os.path.exists(
-      os.path.join(
-          tmp_test_dir, "prefix", blueprint_name, kueue_conf_filename
-      )
-  )

  shutil.rmtree(tmp_test_dir)

xpk/core/blueprint/testing/__init__.py ADDED
@@ -0,0 +1,15 @@
+"""
+Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
xpk/core/capacity.py CHANGED
@@ -17,6 +17,7 @@ limitations under the License.
 import enum

 from ..utils.console import xpk_print, xpk_exit
+from ..utils.kueue import is_queued_cluster
 from .commands import run_command_with_updates, run_command_for_value

 AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
@@ -50,7 +51,7 @@ def print_reservations(args) -> int:
   """
   command = f'gcloud beta compute reservations list --project={args.project}'
   return_code = run_command_with_updates(
-      command, 'Get all reservations in the project', args
+      command, 'Get all reservations in the project'
   )
   if return_code != 0:
     xpk_print(f'Get all reservations returned ERROR {return_code}')
@@ -119,7 +120,7 @@ def get_reservation_maintenance_interval(
       f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
   )
   return_code, output = run_command_for_value(
-      command, 'Get reservation maintenance interval', None
+      command, 'Get reservation maintenance interval'
   )
   if return_code != 0:
     xpk_print(f'Get reservation maintenance interval ERROR {return_code}')
@@ -143,7 +144,7 @@ def get_reservation_placement_policy(
       f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
   )
   return_code, output = run_command_for_value(
-      command, 'Get reservation placement policy', None
+      command, 'Get reservation placement policy'
   )
   if return_code != 0:
     xpk_print(f'Get reservation placement policy ERROR {return_code}')
@@ -164,7 +165,7 @@ def verify_reservation_exists(args) -> int:
       f'gcloud beta compute reservations describe {args.reservation}'
       f' --project={args.project} --zone={args.zone}'
   )
-  return_code = run_command_with_updates(command, 'Describe reservation', args)
+  return_code = run_command_with_updates(command, 'Describe reservation')
   if return_code != 0:
     xpk_print(f'Describe reservation returned ERROR {return_code}')
     xpk_print('Please confirm that your reservation name is correct.')
@@ -199,7 +200,7 @@ def get_capacity_arguments_from_capacity_type(
           ' --location-policy=ANY --reservation-affinity=none'
           f' --no-enable-autorepair --max-nodes={max_nodes}'
       )
-      if args.num_slices <= 1:
+      if is_queued_cluster(args.num_slices):
         capacity_args += ' --enable-queued-provisioning'
     case CapacityType.RESERVATION:
       capacity_args = (
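is_queued_cluster comes from the new xpk/utils/kueue.py (+20 lines), which this diff does not show. Judging from the inline condition it replaces, a minimal sketch could be (an assumption; the shipped helper may do more):

    def is_queued_cluster(num_slices: int) -> bool:
      # Queued provisioning previously applied only when num_slices <= 1.
      return num_slices <= 1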