xpk 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/commands/cluster.py CHANGED
@@ -369,7 +369,7 @@ def cluster_create(args) -> None:

  get_cluster_credentials(args)

- update_coredns_command_code = update_coredns_if_necessary()
+ update_coredns_command_code = update_coredns_if_necessary(args)
  if update_coredns_command_code != 0:
  xpk_exit(update_coredns_command_code)

@@ -927,7 +927,7 @@ def scale_down_deployment(deployment_name: str, namespace: str = 'kube-system'):
  xpk_print(f'{deployment_name} has been scaled down.')


- def scale_up_coredns(replicas: int = 15, namespace: str = 'kube-system'):
+ def scale_up_coredns(replicas: int, namespace: str = 'kube-system'):
  """Scales up the CoreDNS deployment to a specified number of replicas."""
  command_coredns_scale = (
  f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
@@ -1008,7 +1008,14 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
  xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')


- def update_coredns() -> int:
+ def _get_coredns_replica_count(args) -> int:
+ # XPK large scale guide recommends 15 coreDNS replicas for clusters with 5000 VMs.
+ # Otherwise, limit the replica count to the desired number of default pool nodes.
+ default_pool_node_count: int = args.default_pool_cpu_num_nodes
+ return min(15, default_pool_node_count)
+
+
+ def update_coredns(args) -> int:
  """Updates and deploys CoreDNS within a cluster.

  Returns:
@@ -1018,6 +1025,8 @@ def update_coredns() -> int:
  coredns_repo_dir_name = 'deployment'
  coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
  coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
+ coredns_replica_count = _get_coredns_replica_count(args)
+
  # 1. Install jq
  install_jq()

@@ -1034,7 +1043,7 @@ def update_coredns() -> int:
  scale_down_deployment('kube-dns')

  # 6. Scale up coredns and verify readiness
- scale_up_coredns(replicas=15)
+ scale_up_coredns(coredns_replica_count)
  verify_coredns_readiness()

  xpk_print('The CoreDNS setup process has been completed.')
@@ -1074,7 +1083,7 @@ def coredns_deployment_exists(namespace: str = 'kube-system') -> bool:
  return False


- def update_coredns_if_necessary() -> int:
+ def update_coredns_if_necessary(args) -> int:
  """Updates and deploys CoreDNS within the cluster if it's not already present.

  This function checks for the existence of the CoreDNS deployment.
@@ -1089,7 +1098,7 @@ def update_coredns_if_necessary() -> int:
  return 0
  else:
  xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
- return update_coredns()
+ return update_coredns(args)


  def create_cluster_if_necessary(
xpk/commands/cluster_test.py CHANGED
@@ -22,7 +22,7 @@ from unittest.mock import MagicMock, patch
  import pytest

  from xpk.core.telemetry import MetricsCollector
- from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command, cluster_create, _log_cluster_create_telemetry
+ from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, _get_coredns_replica_count, run_gke_cluster_create_command, cluster_create, _log_cluster_create_telemetry
  from xpk.core.capacity import CapacityType
  from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
  from xpk.core.testing.commands_tester import CommandsTester
@@ -787,3 +787,18 @@ def test_validate_cluster_create_args_sets_correct_num_slices(
  _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)

  assert args.num_slices == expected
+
+
+ def test_get_coredns_replica_count_lower_limit_is_number_of_nodes():
+ args = construct_args(
+ default_pool_cpu_num_nodes=7,
+ )
+
+ assert _get_coredns_replica_count(args) == 7
+
+
+ def test_get_coredns_replica_count_upper_limit_is_15():
+ args = construct_args(
+ default_pool_cpu_num_nodes=20,
+ )
+ assert _get_coredns_replica_count(args) == 15
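Note on the sizing change above: cluster_create now passes args through so the CoreDNS replica count can be derived from the default pool size instead of the previous hard-coded 15. A minimal sketch of that rule, using plain integers rather than the real xpk argument object:

def coredns_replicas(default_pool_cpu_num_nodes: int) -> int:
  # Mirrors _get_coredns_replica_count: cap at the large-scale recommendation
  # of 15 replicas, but never ask for more replicas than default pool nodes.
  return min(15, default_pool_cpu_num_nodes)

assert coredns_replicas(7) == 7    # small default pool: one replica per node
assert coredns_replicas(20) == 15  # large default pool: capped at 15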
xpk/core/cluster.py CHANGED
@@ -21,6 +21,8 @@ from kubernetes import client as k8s_client
  from kubernetes import config
  from kubernetes.client.exceptions import ApiException

+ from .kubectl_common import PatchResources, patch_controller_manager_resources
+ from ..utils.feature_flags import FeatureFlags
  from ..utils.console import xpk_exit, xpk_print
  from .capacity import H200_DEVICE_TYPE
  from .commands import (
@@ -33,6 +35,7 @@ from .gcloud_context import (
  get_cluster_location,
  zone_to_region,
  )
+ from .nodepool import recreate_nodes_in_existing_node_pools
  from .resources import get_cluster_system_characteristics
  from .system_characteristics import INSTALLER_NCCL_TCPXO, SystemCharacteristics

@@ -72,7 +75,21 @@ def set_jobset_on_cluster(args) -> int:
  ' https://github.com/google/xpk/blob/main/README.md#troubleshooting for'
  ' instructions on how to fix these permissions.'
  )
- return return_code
+ return return_code
+
+ if FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing:
+ return patch_controller_manager_resources(
+ name='jobset-controller-manager',
+ namespace='jobset-system',
+ patch_resources=PatchResources(
+ cpu_request=4,
+ cpu_limit=4,
+ memory_request='16Gi',
+ memory_limit='16Gi',
+ ),
+ )
+
+ return 0


  def set_pathways_job_on_cluster(args) -> int:
@@ -605,6 +622,19 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
  if return_code != 0:
  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
  return 1
+
+ xpk_print(
+ 'Recreating existing nodes (if any) to complete the Lustre CSI driver'
+ ' installation.'
+ )
+ return_code = recreate_nodes_in_existing_node_pools(args)
+ if return_code != 0:
+ xpk_print(
+ f'Node recreation failed with ERROR {return_code}. You must recreate'
+ ' the nodes manually in order to access Lustre storage from your'
+ ' workloads.'
+ )
+ return 1
  return 0

xpk/core/cluster_test.py CHANGED
@@ -14,10 +14,12 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

+ from unittest.mock import MagicMock
  import pytest
  from .testing.commands_tester import CommandsTester
- from .cluster import get_cluster_credentials, update_gke_cluster_with_lustre_driver_enabled, update_cluster_with_lustre_driver_if_necessary
+ from .cluster import get_cluster_credentials, update_gke_cluster_with_lustre_driver_enabled, update_cluster_with_lustre_driver_if_necessary, set_jobset_on_cluster
  from pytest_mock import MockerFixture
+ from ..utils.feature_flags import FeatureFlags


  @pytest.fixture(autouse=True)
@@ -26,6 +28,9 @@ def commands_tester(mocker: MockerFixture) -> CommandsTester:
  mocker=mocker,
  run_command_for_value_path="xpk.core.cluster.run_command_for_value",
  run_command_with_updates_path="xpk.core.cluster.run_command_with_updates",
+ run_command_with_updates_retry_path=(
+ "xpk.core.cluster.run_command_with_updates_retry"
+ ),
  )


@@ -38,7 +43,17 @@ def mock_location(mocker: MockerFixture):

  @pytest.fixture(autouse=True)
  def command_args(mocker: MockerFixture):
- return mocker.Mock(cluster="cluster", project="project", zone="zone")
+ return mocker.Mock(
+ cluster="cluster", project="project", zone="zone", super_slicing=False
+ )
+
+
+ @pytest.fixture(autouse=True)
+ def mock_patch_controller_manager_resources(mocker: MockerFixture) -> MagicMock:
+ return mocker.patch(
+ "xpk.core.cluster.patch_controller_manager_resources",
+ return_value=0,
+ )


  def test_get_cluster_credentials_returns_1_when_retrieval_commands_fail(
@@ -166,11 +181,14 @@ def test_update_cluster_with_lustre_driver_if_necessary_with_legacy_port_runs_co


  def test_update_gke_cluster_with_lustre_driver_enabled_default_port(
- commands_tester: CommandsTester, command_args
+ commands_tester: CommandsTester, command_args, mocker: MockerFixture
  ):
  commands_tester.set_result_for_command(
  (0, ""), "gcloud container clusters update"
  )
+ mocker.patch(
+ "xpk.core.cluster.recreate_nodes_in_existing_node_pools", return_value=0
+ )
  command_args.enable_legacy_lustre_port = None
  update_gke_cluster_with_lustre_driver_enabled(command_args)

@@ -181,12 +199,30 @@ def test_update_gke_cluster_with_lustre_driver_enabled_default_port(
  ]


+ def test_update_gke_cluster_with_lustre_driver_enabled_fails_if_node_recreation_failed(
+ commands_tester: CommandsTester, command_args, mocker: MockerFixture
+ ):
+ commands_tester.set_result_for_command(
+ (0, ""), "gcloud container clusters update"
+ )
+ mocker.patch(
+ "xpk.core.cluster.recreate_nodes_in_existing_node_pools", return_value=123
+ )
+ command_args.enable_legacy_lustre_port = None
+ return_code = update_gke_cluster_with_lustre_driver_enabled(command_args)
+
+ assert return_code != 0
+
+
  def test_update_gke_cluster_with_lustre_driver_enabled_legacy_port(
- commands_tester: CommandsTester, command_args
+ commands_tester: CommandsTester, command_args, mocker: MockerFixture
  ):
  commands_tester.set_result_for_command(
  (0, ""), "gcloud container clusters update"
  )
+ mocker.patch(
+ "xpk.core.cluster.recreate_nodes_in_existing_node_pools", return_value=0
+ )
  command_args.enable_legacy_lustre_port = True
  update_gke_cluster_with_lustre_driver_enabled(command_args)

@@ -195,3 +231,24 @@ def test_update_gke_cluster_with_lustre_driver_enabled_legacy_port(
  "gcloud container clusters update cluster --project=project"
  " --location=us-central1 --quiet --enable-legacy-lustre-port"
  ]
+
+
+ def test_set_jobset_on_cluster_not_setting_resources_by_default(
+ mock_patch_controller_manager_resources: MagicMock, command_args
+ ):
+ result = set_jobset_on_cluster(command_args)
+
+ assert result == 0
+ mock_patch_controller_manager_resources.assert_not_called()
+
+
+ def test_set_jobset_on_cluster_super_slicing_resources(
+ mock_patch_controller_manager_resources: MagicMock, command_args
+ ):
+ FeatureFlags.SUPER_SLICING_ENABLED = True
+ command_args.super_slicing = True
+
+ result = set_jobset_on_cluster(command_args)
+
+ assert result == 0
+ mock_patch_controller_manager_resources.assert_called()
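For orientation: when FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing are both set, set_jobset_on_cluster (in xpk/core/cluster.py above) hands the new kubectl_common helper a 4-CPU / 16Gi request-and-limit pair for jobset-controller-manager. Based on how patch_controller_manager_resources builds its patch (see the new module further down in this diff), the strategic-merge payload would look roughly like the dict below, written out by hand here for illustration:

# Illustrative only: the patch body implied by the super-slicing jobset settings.
illustrative_jobset_patch = {
    "spec": {
        "template": {
            "spec": {
                "containers": [{
                    "name": "manager",
                    "resources": {
                        "requests": {"cpu": "4", "memory": "16Gi"},
                        "limits": {"cpu": "4", "memory": "16Gi"},
                    },
                }]
            }
        }
    }
}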
xpk/core/docker_container.py CHANGED
@@ -181,7 +181,9 @@ def get_main_container(
  tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command,
  gpu_workload_terminate_command=gpu_workload_terminate_command,
  xpk_internal_commands=xpk_internal_commands,
- resources=get_main_container_resources(args, system, resource_type),
+ resources=get_main_container_resources(
+ args, system, resource_type, parallel_containers
+ ),
  volume_mounts=volume_mounts,
  )
  )
xpk/core/docker_resources.py CHANGED
@@ -23,7 +23,10 @@ from ..utils.execution_context import is_dry_run


  def get_main_container_resources(
- args, system: SystemCharacteristics, resource_type
+ args,
+ system: SystemCharacteristics,
+ resource_type: str,
+ parallel_containers: int,
  ) -> str:
  """Resources for the main container.
  Args:
@@ -53,10 +56,7 @@ def get_main_container_resources(
  offset_vCPUs = int(system.chips_per_vm) * 0.95
  return f'{resource_type}: {offset_vCPUs}'

- return (
- f'{resource_type}:'
- f' {int(system.chips_per_vm / system.parallel_containers)}'
- )
+ return f'{resource_type}: {int(system.chips_per_vm / parallel_containers)}'


  def get_env_container(args, system: SystemCharacteristics) -> str:
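The resource request is now computed from the parallel_containers value supplied by the caller rather than read from system.parallel_containers. A small, self-contained illustration of the division (the values and resource name are made up; the real ones come from SystemCharacteristics and the workload args):

# Hypothetical example: 8 chips per VM split across 2 parallel containers.
chips_per_vm = 8
parallel_containers = 2
resource_type = 'example.com/accelerator'  # placeholder resource name
print(f'{resource_type}: {int(chips_per_vm / parallel_containers)}')
# -> example.com/accelerator: 4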
xpk/core/kubectl_common.py ADDED
@@ -0,0 +1,77 @@
+ """
+ Copyright 2026 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ from collections import defaultdict
+ from dataclasses import dataclass
+ import json
+ from typing import Any
+
+ from .commands import run_command_with_updates_retry
+
+
+ @dataclass(frozen=True)
+ class PatchResources:
+ cpu_request: int | None = None
+ cpu_limit: int | None = None
+ memory_request: str | None = None
+ memory_limit: str | None = None
+
+
+ _EMPTY_PATCH_RESOURCES = PatchResources()
+
+
+ def patch_controller_manager_resources(
+ name: str,
+ namespace: str,
+ replicas: int | None = None,
+ patch_resources: PatchResources | None = None,
+ ) -> int:
+ if replicas is None and patch_resources is None:
+ return 0
+
+ patch: dict[str, Any] = {"spec": {}}
+
+ if replicas is not None:
+ patch["spec"]["replicas"] = replicas
+
+ if patch_resources and patch_resources != _EMPTY_PATCH_RESOURCES:
+ resources: dict[str, dict[str, str]] = defaultdict(dict)
+ if patch_resources.cpu_request is not None:
+ resources["requests"]["cpu"] = str(patch_resources.cpu_request)
+ if patch_resources.cpu_limit is not None:
+ resources["limits"]["cpu"] = str(patch_resources.cpu_limit)
+ if patch_resources.memory_request is not None:
+ resources["requests"]["memory"] = patch_resources.memory_request
+ if patch_resources.memory_limit is not None:
+ resources["limits"]["memory"] = patch_resources.memory_limit
+ patch["spec"]["template"] = {
+ "spec": {
+ "containers": [{
+ "name": "manager",
+ "resources": resources,
+ }]
+ }
+ }
+
+ patch_str = json.dumps(patch)
+ patch_command = (
+ f"kubectl patch deployment {name} -n {namespace}"
+ f" --type='strategic' --patch='{patch_str}'"
+ )
+ return run_command_with_updates_retry(
+ patch_command,
+ "Updating Controller Manager resources",
+ )
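A quick, hand-written usage sketch of the new helper (not part of the package): a replicas-only call builds a minimal strategic patch and runs it through run_command_with_updates_retry.

# Hypothetical call; the names mirror the kueue usage later in this diff.
patch_controller_manager_resources(
    name="kueue-controller-manager",
    namespace="kueue-system",
    replicas=3,
)
# Roughly equivalent to running:
# kubectl patch deployment kueue-controller-manager -n kueue-system \
#   --type='strategic' --patch='{"spec": {"replicas": 3}}'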
xpk/core/kubectl_common_test.py ADDED
@@ -0,0 +1,174 @@
+ """
+ Copyright 2026 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ import json
+ import pytest
+ from xpk.core.kubectl_common import PatchResources, patch_controller_manager_resources
+ from xpk.core.testing.commands_tester import CommandsTester
+
+
+ @pytest.fixture
+ def commands_tester(mocker):
+ return CommandsTester(
+ mocker,
+ run_command_with_updates_retry_path=(
+ "xpk.core.kubectl_common.run_command_with_updates_retry"
+ ),
+ )
+
+
+ def test_patch_controller_manager_resources_full(
+ commands_tester: CommandsTester,
+ ):
+ result = patch_controller_manager_resources(
+ name="name",
+ namespace="namespace",
+ replicas=7,
+ patch_resources=PatchResources(
+ cpu_request=1,
+ cpu_limit=2,
+ memory_request="10Gi",
+ memory_limit="20Gi",
+ ),
+ )
+
+ assert result == 0
+ commands_tester.assert_command_run(
+ "kubectl patch deployment", "name", "-n namespace"
+ )
+ expected_patch_dict = {
+ "spec": {
+ "replicas": 7,
+ "template": {
+ "spec": {
+ "containers": [{
+ "name": "manager",
+ "resources": {
+ "requests": {"cpu": "1", "memory": "10Gi"},
+ "limits": {"cpu": "2", "memory": "20Gi"},
+ },
+ }]
+ }
+ },
+ }
+ }
+ commands_tester.assert_command_run(
+ "kubectl patch", json.dumps(expected_patch_dict)
+ )
+
+
+ def test_patch_controller_manager_resources_only_replicas(
+ commands_tester: CommandsTester,
+ ):
+ result = patch_controller_manager_resources(
+ name="name",
+ namespace="namespace",
+ replicas=7,
+ )
+
+ assert result == 0
+ expected_patch_dict = {
+ "spec": {
+ "replicas": 7,
+ }
+ }
+ commands_tester.assert_command_run(
+ "kubectl patch", json.dumps(expected_patch_dict)
+ )
+
+
+ def test_patch_controller_manager_resources_only_requests(
+ commands_tester: CommandsTester,
+ ):
+ result = patch_controller_manager_resources(
+ name="name",
+ namespace="namespace",
+ patch_resources=PatchResources(
+ cpu_request=1,
+ memory_request="10Gi",
+ ),
+ )
+
+ assert result == 0
+ commands_tester.assert_command_run(
+ "kubectl patch deployment", "name", "-n namespace"
+ )
+ expected_patch_dict = {
+ "spec": {
+ "template": {
+ "spec": {
+ "containers": [{
+ "name": "manager",
+ "resources": {
+ "requests": {"cpu": "1", "memory": "10Gi"},
+ },
+ }]
+ }
+ },
+ }
+ }
+ commands_tester.assert_command_run(
+ "kubectl patch", json.dumps(expected_patch_dict)
+ )
+
+
+ def test_patch_controller_manager_resources_only_limits(
+ commands_tester: CommandsTester,
+ ):
+ result = patch_controller_manager_resources(
+ name="name",
+ namespace="namespace",
+ patch_resources=PatchResources(
+ cpu_limit=2,
+ memory_limit="20Gi",
+ ),
+ )
+
+ assert result == 0
+ commands_tester.assert_command_run(
+ "kubectl patch deployment", "name", "-n namespace"
+ )
+ expected_patch_dict = {
+ "spec": {
+ "template": {
+ "spec": {
+ "containers": [{
+ "name": "manager",
+ "resources": {
+ "limits": {"cpu": "2", "memory": "20Gi"},
+ },
+ }]
+ }
+ },
+ }
+ }
+ commands_tester.assert_command_run(
+ "kubectl patch", json.dumps(expected_patch_dict)
+ )
+
+
+ def test_patch_controller_manager_resources_propagates_error(
+ commands_tester: CommandsTester,
+ ):
+ commands_tester.set_result_for_command((123, "kubectl patch"))
+
+ result = patch_controller_manager_resources(
+ name="name",
+ namespace="namespace",
+ replicas=7,
+ )
+
+ assert result == 123
xpk/core/kueue_manager.py CHANGED
@@ -20,6 +20,7 @@ from typing import Optional, List, Dict, Any
  import json
  from jinja2 import Environment, FileSystemLoader

+ from .kubectl_common import PatchResources, patch_controller_manager_resources
  from ..utils.topology import get_slice_topology_level, get_topology_product, is_topology_contained
  from ..utils.kueue import is_queued_cluster
  from kubernetes.utils import parse_quantity
@@ -304,7 +305,9 @@ class KueueManager:
  if return_code != 0:
  return return_code

- return self.__update_kueue_resources_if_necessary()
+ return self.__update_kueue_resources_if_necessary(
+ configure_super_slicing=kueue_config.configure_super_slicing
+ )

  def __build_template_context(
  self,
@@ -452,8 +455,23 @@ class KueueManager:
  command = f"kubectl apply -f {tmp_file}"
  return run_command_with_updates(command, task)

- def __update_kueue_resources_if_necessary(self) -> int:
+ def __update_kueue_resources_if_necessary(
+ self, configure_super_slicing: bool
+ ) -> int:
  """Patch memory size limit if necessary."""
+ if configure_super_slicing:
+ return patch_controller_manager_resources(
+ name="kueue-controller-manager",
+ namespace="kueue-system",
+ replicas=3,
+ patch_resources=PatchResources(
+ cpu_request=16,
+ cpu_limit=16,
+ memory_request="64Gi",
+ memory_limit="64Gi",
+ ),
+ )
+
  # Get total number of nodes
  cmd_total_node_num = "kubectl get node --no-headers | wc -l"
  return_code, out = run_command_for_value(
@@ -465,31 +483,13 @@ class KueueManager:
  new_memory_limit = (
  f"{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi"
  )
- patch = {
- "spec": {
- "template": {
- "spec": {
- "containers": [{
- "name": "manager",
- "resources": {"limits": {"memory": new_memory_limit}},
- }]
- }
- }
- }
- }
- patch_str = json.dumps(patch)
- patch_command = (
- "kubectl patch deployment kueue-controller-manager -n kueue-system"
- f" --type='strategic' --patch='{patch_str}'"
- )
- task = "Updating Kueue Controller Manager resources"
- return_code = run_command_with_updates_retry(
- patch_command,
- task,
+ return patch_controller_manager_resources(
+ name="kueue-controller-manager",
+ namespace="kueue-system",
+ patch_resources=PatchResources(
+ memory_limit=new_memory_limit,
+ ),
  )
- if return_code != 0:
- xpk_print(f"{task} returned ERROR {return_code}")
- return return_code

  def __autocorrect_resource_limits(
  self, kueue_config: KueueConfig
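The non-super-slicing path keeps the existing sizing rule: the kueue-controller-manager memory limit is max(ceil(node_count * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE) in Mi. A sketch of that arithmetic, with constants inferred from the test comments below (roughly 1.2 Mi per VM and a 4096 Mi floor); the authoritative values live in kueue_manager.py:

import math

MEMORY_SIZE_PER_VM = 1.2      # assumed from "100 * 1.2 = 120" in the tests
MIN_MEMORY_LIMIT_SIZE = 4096  # assumed from the 4096Mi floor in the tests

def kueue_memory_limit(node_count: int) -> str:
  # Mirrors the limit computed in __update_kueue_resources_if_necessary.
  return f"{max(math.ceil(node_count * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi"

assert kueue_memory_limit(100) == "4096Mi"   # small cluster
assert kueue_memory_limit(5000) == "6000Mi"  # large cluster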
xpk/core/kueue_manager_test.py CHANGED
@@ -21,6 +21,7 @@ from pytest_mock import MockerFixture
  import yaml
  from unittest.mock import MagicMock, patch

+ from xpk.core.kubectl_common import PatchResources
  from xpk.core.kueue_manager import KueueConfig, KueueManager, has_sub_slicing_enabled, has_super_slicing_enabled
  from xpk.core.system_characteristics import GpuConfig, DockerPlatform, AcceleratorType, SystemCharacteristics, UserFacingNameToSystemCharacteristics
  from xpk.core.testing.commands_tester import CommandsTester
@@ -86,6 +87,14 @@ def mock_commands(mocker: MockerFixture) -> CommandsTester:
  )


+ @pytest.fixture(autouse=True)
+ def mock_patch_controller_manager_resources(mocker: MockerFixture) -> MagicMock:
+ return mocker.patch(
+ "xpk.core.kueue_manager.patch_controller_manager_resources",
+ return_value=0,
+ )
+
+
  @pytest.fixture(autouse=True)
  @patch("jinja2.Environment", return_value=MagicMock())
  def kueue_manager(mock_env: MagicMock) -> KueueManager:
@@ -239,7 +248,9 @@ def test_installation_without_tolerations(


  def test_resource_update_for_small_cluster(
- mock_commands: CommandsTester, kueue_manager: KueueManager
+ mock_commands: CommandsTester,
+ kueue_manager: KueueManager,
+ mock_patch_controller_manager_resources: MagicMock,
  ):
  """Test resource update logic for a small cluster."""
  set_installed_kueue_version(mock_commands, None)
@@ -248,17 +259,21 @@ def test_resource_update_for_small_cluster(
  result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)

  assert result == 0
+
  # 100 * 1.2 = 120, which is less than 4096. So it should be 4096.
- mock_commands.assert_command_run(
- "kubectl patch deployment kueue-controller-manager -n kueue-system"
- ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
- ' {"containers": [{"name": "manager", "resources": {"limits":'
- ' {"memory": "4096Mi"}}}]}}}}\'',
+ mock_patch_controller_manager_resources.assert_called_with(
+ name="kueue-controller-manager",
+ namespace="kueue-system",
+ patch_resources=PatchResources(
+ memory_limit="4096Mi",
+ ),
  )


  def test_resource_update_for_large_cluster(
- mock_commands: CommandsTester, kueue_manager: KueueManager
+ mock_commands: CommandsTester,
+ kueue_manager: KueueManager,
+ mock_patch_controller_manager_resources: MagicMock,
  ):
  """Test resource update logic for a large cluster."""
  set_installed_kueue_version(mock_commands, None)
@@ -268,11 +283,36 @@ def test_resource_update_for_large_cluster(

  assert result == 0
  # 5000 * 1.2 = 6000, which is > 4096.
- mock_commands.assert_command_run(
- "kubectl patch deployment kueue-controller-manager -n kueue-system"
- ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
- ' {"containers": [{"name": "manager", "resources": {"limits":'
- ' {"memory": "6000Mi"}}}]}}}}\'',
+ mock_patch_controller_manager_resources.assert_called_with(
+ name="kueue-controller-manager",
+ namespace="kueue-system",
+ patch_resources=PatchResources(
+ memory_limit="6000Mi",
+ ),
+ )
+
+
+ def test_resource_update_for_super_slicing_cluster(
+ mock_commands: CommandsTester,
+ kueue_manager: KueueManager,
+ mock_patch_controller_manager_resources: MagicMock,
+ ):
+ set_installed_kueue_version(mock_commands, None)
+ kueue_config = dataclasses.replace(KUEUE_CONFIG, configure_super_slicing=True)
+
+ result = kueue_manager.install_or_upgrade(kueue_config)
+
+ assert result == 0
+ mock_patch_controller_manager_resources.assert_called_with(
+ name="kueue-controller-manager",
+ namespace="kueue-system",
+ replicas=3,
+ patch_resources=PatchResources(
+ cpu_request=16,
+ cpu_limit=16,
+ memory_request="64Gi",
+ memory_limit="64Gi",
+ ),
  )


xpk/core/nodepool.py CHANGED
@@ -691,3 +691,37 @@ def _validate_reservation_count(
  )
  return 1
  return 0
+
+
+ def recreate_nodes_in_existing_node_pools(args) -> int:
+ """Triggers a manual upgrade of nodepools to the same version to force recreation
+ of nodes.
+ """
+
+ existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
+ if return_code > 0:
+ xpk_print('Listing all node pools failed!')
+ return return_code
+
+ commands = [
+ (
+ f'gcloud container clusters upgrade {args.cluster}'
+ f' --project={args.project}'
+ f' --node-pool={node_pool_name}'
+ f' --location={get_cluster_location(args.project, args.cluster, args.zone)}'
+ ' --quiet'
+ )
+ for node_pool_name in existing_node_pool_names
+ ]
+ task_names = [
+ f'NodesRecreate-{node_pool_name}'
+ for node_pool_name in existing_node_pool_names
+ ]
+ for i, command in enumerate(commands):
+ xpk_print(f'To complete {task_names[i]} we are executing {command}')
+ maybe_failure = run_commands(
+ commands,
+ 'Recreate nodes in nodepools',
+ task_names,
+ )
+ return maybe_failure.return_code if maybe_failure is not None else 0
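As the docstring says, node recreation is driven by a same-version node pool upgrade. Using the fixture values from the tests below (cluster test-cluster, project test-project, node pool nodepool1, location us-central1), each generated command has the following shape; the snippet simply re-assembles the f-string for illustration:

# Values taken from the test fixtures; real values come from args and get_cluster_location().
cluster, project, node_pool, location = 'test-cluster', 'test-project', 'nodepool1', 'us-central1'
command = (
    f'gcloud container clusters upgrade {cluster}'
    f' --project={project}'
    f' --node-pool={node_pool}'
    f' --location={location}'
    ' --quiet'
)
# -> gcloud container clusters upgrade test-cluster --project=test-project
#    --node-pool=nodepool1 --location=us-central1 --quiet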
xpk/core/nodepool_test.py CHANGED
@@ -20,6 +20,7 @@ from xpk.core.nodepool import (
  ensure_resource_policy_exists,
  get_desired_node_pool_names,
  run_gke_node_pool_create_command,
+ recreate_nodes_in_existing_node_pools,
  _validate_reservation_count,
  )
  from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
@@ -568,3 +569,106 @@ def test_run_gke_node_pool_create_command_partial_reservations(
  commands_tester.assert_command_run(
  "gcloud", "node-pools create", "test-cluster-np-2", "--reservation=res2"
  )
+
+
+ def test_recreate_nodes_in_existing_node_pools_upgrades_existing_nodepools(
+ mocker,
+ commands_tester: CommandsTester,
+ ):
+ mocker.patch(
+ "xpk.core.nodepool.get_all_nodepools_programmatic",
+ return_value=(["nodepool1", "nodepool2"], 0),
+ )
+ mocker.patch(
+ "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
+ )
+ args = mocker.Mock(
+ cluster="test-cluster",
+ project="test-project",
+ zone="us-central1-a",
+ )
+
+ commands_tester.set_result_for_command(
+ (0, ""), "gcloud container clusters upgrade"
+ )
+
+ result = recreate_nodes_in_existing_node_pools(args)
+
+ assert result == 0
+ commands_tester.assert_command_run(
+ "gcloud",
+ "container clusters upgrade test-cluster",
+ "--project=test-project",
+ "--node-pool=nodepool1",
+ "--location=us-central1",
+ "--quiet",
+ )
+ commands_tester.assert_command_run(
+ "gcloud",
+ "container clusters upgrade test-cluster",
+ "--project=test-project",
+ "--node-pool=nodepool2",
+ "--location=us-central1",
+ "--quiet",
+ )
+
+
+ def test_recreate_nodes_in_existing_node_pools_returns_error_code_if_listing_fails(
+ mocker,
+ commands_tester: CommandsTester,
+ ):
+ mocker.patch(
+ "xpk.core.nodepool.get_all_nodepools_programmatic",
+ return_value=([], 123),
+ )
+ args = mocker.Mock(
+ cluster="test-cluster",
+ project="test-project",
+ zone="us-central1-a",
+ )
+
+ result = recreate_nodes_in_existing_node_pools(args)
+
+ assert result == 123
+
+
+ def test_recreate_nodes_in_existing_node_pools_returns_error_code_if_upgrade_fails(
+ mocker,
+ commands_tester: CommandsTester,
+ ):
+ mocker.patch(
+ "xpk.core.nodepool.get_all_nodepools_programmatic",
+ return_value=(["nodepool1", "nodepool2"], 0),
+ )
+ mocker.patch(
+ "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
+ )
+ args = mocker.Mock(
+ cluster="test-cluster",
+ project="test-project",
+ zone="us-central1-a",
+ )
+
+ commands_tester.set_result_for_command(
+ (123, ""), "gcloud container clusters upgrade"
+ )
+
+ result = recreate_nodes_in_existing_node_pools(args)
+
+ assert result == 123
+ commands_tester.assert_command_run(
+ "gcloud",
+ "container clusters upgrade test-cluster",
+ "--project=test-project",
+ "--node-pool=nodepool1",
+ "--location=us-central1",
+ "--quiet",
+ )
+ commands_tester.assert_command_not_run(
+ "gcloud",
+ "container clusters upgrade test-cluster",
+ "--project=test-project",
+ "--node-pool=nodepool2",
+ "--location=us-central1",
+ "--quiet",
+ )
xpk-1.1.1.dist-info/METADATA → xpk-1.2.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: xpk
- Version: 1.1.1
+ Version: 1.2.0
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
  Author-email: XPK team <xpk-code-reviewers@google.com>
  License: Apache-2.0
xpk-1.1.1.dist-info/RECORD → xpk-1.2.0.dist-info/RECORD CHANGED
@@ -13,10 +13,10 @@ xpk/blueprints/a4/config-map.yaml.tftpl,sha256=o6LeGIYUfFGyj3vj-8ztV5ildQ46QZVl7
  xpk/blueprints/a4/nccl-rdma-installer-a4.yaml,sha256=if3WOmNLVGTJIJHU76EWC1FyiIXDTRIXcwo4OsBxarQ,2113
  xpk/blueprints/a4/storage_crd.yaml,sha256=r4WFXnSJJ25EUF-t4Ljfbl-cJoSaiFiZkP8451eTub4,1260
  xpk/commands/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
- xpk/commands/cluster.py,sha256=9LGVlrXg5C2w7wRKYTWe6lh3wSl7ZFpaNXSlpBfd0tc,45311
+ xpk/commands/cluster.py,sha256=9Eqj0d77_4FZ4ztsYFiIvX514jp-XvOTkZIVMdvitOI,45715
  xpk/commands/cluster_gcluster.py,sha256=H6_pphIrIzDCHZg-ZH3o-xA2AgVQKSjE2HKbhIW6-Xo,13727
  xpk/commands/cluster_gcluster_test.py,sha256=zdxz5gAMu3HRVNsj7F-VYRf4TYSPMjuOG7DolQN2Pb4,6263
- xpk/commands/cluster_test.py,sha256=va1ODWx0Y2fSZIGzIsqEIVN25tIsfYr95ubcyxTMDfA,23494
+ xpk/commands/cluster_test.py,sha256=osBhaZTdB2kgQw1aY3W_ybFOqyb0UZncvVLyZNuKRpU,23878
  xpk/commands/common.py,sha256=fsM4Sud3y1RU6a8JHi99l13O4raYvW2oPahCBzvMwh4,3884
  xpk/commands/common_test.py,sha256=BDYFtN-cVfpEpj6akZy4R2KrnP53AIV1Lh1FEImhXx0,6106
  xpk/commands/config.py,sha256=L_zRpQTxMcSh6rxOT8gG263V6YGqzVoz4UxdWywTFdA,850
@@ -32,30 +32,32 @@ xpk/commands/workload_test.py,sha256=m79x6YDYn-36BX0CttTtAMdt_O-WJY40FLTGa6KwKg8
  xpk/core/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
  xpk/core/capacity.py,sha256=MGiNOwBCwg8Ci-hsssbZYIJ2xXTm6Y5yKTO4J5ozqEk,11053
  xpk/core/capacity_test.py,sha256=04ecANSECL3APmFCjdtkw2Wz6JxWkRZwE_QHa2m1_68,6026
- xpk/core/cluster.py,sha256=RC-91Dk4nx9F-jLmZPP78ALlUgSBq670geaIqnRMZxY,24184
+ xpk/core/cluster.py,sha256=JLHFlrcc3Ch3ggLv2EJrOdQOOTn9MB-0IF5fQLIRPD0,25159
  xpk/core/cluster_private.py,sha256=43BEO49MA-p1KfKuchxVZk6wwIlRCUU17j-6MMMeq4I,6868
- xpk/core/cluster_test.py,sha256=VeC1C7kN0OJe6yeoL8GCaFk4uPhijP6CjvQAcE7q9xw,6653
+ xpk/core/cluster_test.py,sha256=0ymagvKxyX909NjA5J23lutq7xyHXlIWwvGRFoVE0n8,8492
  xpk/core/commands.py,sha256=at73VJHdZ4rVA8uvW997tNrvnCjP9v6zaw96bU0kd74,10841
  xpk/core/config.py,sha256=U2JDXx-XBuqQpZJf2iUDoww5--E8ejZfgmIxKeGu-gU,4668
  xpk/core/config_test.py,sha256=POSuofK0LFbNNygDAo2fjtKY4NMrRjUFeGcpBh9JOS4,3569
- xpk/core/docker_container.py,sha256=SZ3msIMx-2gBIok3j5N73KO1wVcMfzw1pQpgTdyZOQA,8243
+ xpk/core/docker_container.py,sha256=9kJpTEholW_d_GamjcqunCWT4XwrDyZs3fcvcPNCb8Y,8294
  xpk/core/docker_image.py,sha256=9vwqbb6Mc3C5ZEOph03WS-EWI5hxMYGGigqzIMkDTjE,6909
  xpk/core/docker_manager.py,sha256=vGPCWPDB507sxEsXvSD4IM-h5HqQzYLk7WSdCUmSDb4,10568
- xpk/core/docker_resources.py,sha256=7EXV1CvwCVogE5-m6utSE1GXxwf6EpB4QDYeuGXWHmI,12547
+ xpk/core/docker_resources.py,sha256=bwHGNh_gOtprVOeoFC8NObgKGD9aDjNc2XBMS6syD2Q,12562
  xpk/core/filestore.py,sha256=mcuUzsAPARbnrBG4fIGsEoN8NmzjaQ6k0tvIwMtjO9k,8068
  xpk/core/gcloud_context.py,sha256=d1wQ76zp7QMdG5BxB3sJz4b4OF5Mc8OzmPd_m0xd-Ys,6810
  xpk/core/gcloud_context_test.py,sha256=M8rp6S1zaEcAI7u4Bt8ukWKzv82HH5h9oYVojBcKgHk,5987
  xpk/core/gcluster_manager.py,sha256=lyv_MvdnkByy9_PEBj_ugAEBwnCbFNiWTSrEFjrMlPc,6236
  xpk/core/gcsfuse.py,sha256=kg5pgxdTjgiqquuGjev9fXzJPb8oiWPTK6wzCddzheQ,2125
  xpk/core/jobset.py,sha256=PJ4Fd8TNNLuYKNOMehoMYRIUEXyc5jsbHctJGqfW_8Y,4037
- xpk/core/kueue_manager.py,sha256=qpz4Df7tfWNKzBFlTbMUfsHnXl15SdI7r_mHlCFRYdc,19998
- xpk/core/kueue_manager_test.py,sha256=iJZFQE-fhQAI8MVXe66zUJpSbU5HHUZmNFnnCPCXNZs,22042
+ xpk/core/kubectl_common.py,sha256=mVuHJERdRXVEigU9Fxcmq7SHABkWDUmfYTJFvkHEbLs,2353
+ xpk/core/kubectl_common_test.py,sha256=2PTTcr5d8pSifuki2L7uA8-UWOqcIpv6PJQ1hlFruJQ,4469
+ xpk/core/kueue_manager.py,sha256=8VVOEvkh6Cif3s9ii0HfUR27HuxEJWUwnt4x4YM6ydc,20031
+ xpk/core/kueue_manager_test.py,sha256=Xb-w_eF11doqkpfNPb5EYm9BneEo3dtOxHuH-jtfjAI,23001
  xpk/core/monitoring.py,sha256=__bzTq_DIDAK8yIaN4F3MJh-yjYw5X1OlxmRgYOpf1g,4332
  xpk/core/mtc.py,sha256=pO7p3l-EzLFdTE8MdwWV8i0Zu-7epGql_kPoksVofIU,6259
  xpk/core/nap.py,sha256=gBxXu8Png1-BlAHbxLWZgbSXeLMGVixufkQVMR0fmvk,12963
  xpk/core/network.py,sha256=Oulb7U69lWkpOKxOC1C7ekJDpC51TLwd7XdZA3NQ7E0,10505
- xpk/core/nodepool.py,sha256=FX2ljKvwMsG3fXfn_CDCRwiKH4UAArQeDiFLq3XK9F0,25495
- xpk/core/nodepool_test.py,sha256=9xSFpn-1j9Vd0J8KFzbq8ywS_Ibsbx4CgR1er68mRnw,17542
+ xpk/core/nodepool.py,sha256=dgE4l-HETNvnVj6WfCNWZkegzqrLiNg61RotGTMPDd0,26575
+ xpk/core/nodepool_test.py,sha256=Zf1NW9Hoj0D517NxSsxcj6p8NxeEbFmRpdIACz9NPdM,20207
  xpk/core/pathways.py,sha256=9w_VrpLLjQSSdNd8HJLWWtIYzA0NpR7t70knRSVLK0w,11574
  xpk/core/pathways_test.py,sha256=UeuSo_g9BNI27to-wflQwc6dJFVSA5-kOK_cjmY5qgU,1809
  xpk/core/ray.py,sha256=JWhc_ToRHpF4_URGnuE_47FMgamaRsA4KVUMpqThWzw,6145
@@ -142,9 +144,9 @@ xpk/utils/validation.py,sha256=rE9LTkXJT7jIesodFb9pONL7ixhLqiQleyoaz7N39Dw,2765
  xpk/utils/validation_test.py,sha256=PEDSMUqZdt_Lx1FSR-LOTXKKtsJ47JH1fxugM0Gfz6Y,1168
  xpk/utils/versions.py,sha256=_Ep68W70a9605XjiaOOpBa9Is9jXlsoOiwL8v5Xt-WA,897
  xpk/utils/yaml.py,sha256=j8xuAJ9yAAwnQi6ozwZ-nMnDyDnc3xWkeBZMtSuP4RU,844
- xpk-1.1.1.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
- xpk-1.1.1.dist-info/METADATA,sha256=CqGGSACNDUVw7uPODTokMvMqpBMc1gDbY99AUmvQ68Q,10013
- xpk-1.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- xpk-1.1.1.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
- xpk-1.1.1.dist-info/top_level.txt,sha256=aDe4N0jicmuWExx_6w0TxWQJaEuPSs9BnLU-3aF1GLo,4
- xpk-1.1.1.dist-info/RECORD,,
+ xpk-1.2.0.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+ xpk-1.2.0.dist-info/METADATA,sha256=Jkgz_1jDxS-vJnbqKI_Kdd5LRH9mikljvnEFXTk8lWA,10013
+ xpk-1.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ xpk-1.2.0.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
+ xpk-1.2.0.dist-info/top_level.txt,sha256=aDe4N0jicmuWExx_6w0TxWQJaEuPSs9BnLU-3aF1GLo,4
+ xpk-1.2.0.dist-info/RECORD,,
xpk-1.1.1.dist-info/WHEEL → xpk-1.2.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.9.0)
+ Generator: setuptools (80.10.2)
  Root-Is-Purelib: true
  Tag: py3-none-any