xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -13
- xpk/commands/cluster.py +240 -71
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/common.py +33 -1
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +30 -18
- xpk/commands/run.py +17 -12
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +75 -19
- xpk/commands/workload.py +161 -324
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +335 -45
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +193 -12
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +5 -1
- xpk/core/gcsfuse.py +27 -6
- xpk/core/kjob.py +66 -20
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/nap.py +4 -0
- xpk/core/network.py +34 -22
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/resources.py +21 -0
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +28 -83
- xpk/core/workload_decorators/rdma_decorator.py +11 -15
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
- xpk/parser/cluster.py +574 -381
- xpk/parser/storage.py +25 -5
- xpk/parser/workload.py +59 -31
- xpk/utils/kubectl.py +4 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/core/mtc.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import requests
|
|
18
|
+
import yaml
|
|
19
|
+
|
|
20
|
+
from ..core.cluster import JOBSET_VERSION
|
|
21
|
+
from ..core.cluster import setup_k8s_env
|
|
22
|
+
from ..utils import templates
|
|
23
|
+
from ..utils.console import xpk_exit
|
|
24
|
+
from ..utils.console import xpk_print
|
|
25
|
+
from ..utils.kubectl import apply_kubectl_manifest
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
MTC_CPC_PATH = "/../templates/mtc-cpc.yaml"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def create_mtc_cpc(
    mtc_gcs_bucket: str,
    mtc_machine_type: str,
    mtc_toleration_key: str,
    mtc_ramdisk_size: str,
) -> dict:
  """Build the MTC CheckpointConfiguration manifest from the bundled template.

  Args:
    mtc_gcs_bucket: GCS bucket for MTC
    mtc_machine_type: Machine type for MTC
    mtc_toleration_key: Toleration key for MTC
    mtc_ramdisk_size: Ramdisk size for MTC

  Returns:
    MTC Checkpoint Configuration
  """
  manifest = templates.load(MTC_CPC_PATH)

  # Fill in the user-supplied values on the template's spec.
  spec = manifest["spec"]
  spec["cloudStorageBucketName"] = mtc_gcs_bucket
  spec["nodeSelector"]["node.kubernetes.io/instance-type"] = mtc_machine_type
  spec["tolerations"][0]["key"] = mtc_toleration_key
  spec["inMemoryVolumeSize"] = mtc_ramdisk_size

  return manifest
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def install_mtc_on_cluster(args, system) -> int:
  """Install MTC on the cluster.

  Validates the MTC-related arguments, updates the jobset controller
  resources when required, and applies the MTC CheckpointConfiguration.

  Args:
    args: user provided arguments for running the command.
    system: system related information.

  Returns:
    return code of the command.
  """
  # --mtc-gcs-bucket is mandatory for MTC.
  if args.mtc_gcs_bucket is None:
    xpk_print("MTC GCS bucket is required.")
    xpk_exit(1)
  # The CheckpointConfiguration expects a bare bucket name, not a gs:// URI.
  if args.mtc_gcs_bucket.startswith("gs://"):
    args.mtc_gcs_bucket = args.mtc_gcs_bucket.replace("gs://", "")

  if args.mtc_ramdisk_size is None:
    xpk_print("MTC ramdisk size is required.")
    xpk_exit(1)

  # Default toleration for TPU nodes when the user did not supply one.
  if args.mtc_toleration_key is None:
    args.mtc_toleration_key = "google.com/tpu"

  k8s_api_client = setup_k8s_env(args)
  jobset_manifest = update_jobset_manifest()
  if jobset_manifest is None:
    # Bug fix: previously the code only printed this message and then still
    # applied [None] as a manifest. Skip the jobset apply entirely when
    # there is nothing to update.
    xpk_print(
        "Updated jobset manifest is empty, not updating the jobset controller."
    )
  else:
    xpk_print("Applying Jobset with MTC Configuration")
    return_code = apply_kubectl_manifest(k8s_api_client, [jobset_manifest])
    if return_code != 0:
      return return_code

  mtc_checkpoint_configuration_crd_data = create_mtc_cpc(
      args.mtc_gcs_bucket,
      system.gce_machine_type,
      args.mtc_toleration_key,
      args.mtc_ramdisk_size,
  )
  xpk_print("Applying MTC Checkpoint Configuration")
  return_code = apply_kubectl_manifest(
      k8s_api_client, [mtc_checkpoint_configuration_crd_data]
  )

  return return_code
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def update_jobset_manifest():
  """Update the jobset manifest to increase the resources for the jobset controller manager.

  Downloads the upstream jobset release manifest for JOBSET_VERSION and, if
  the jobset-controller-manager Deployment requests fewer resources than
  needed, raises its CPU/memory requests and memory limit.

  Returns:
    The updated Deployment document, or None when no update is required.
  """
  manifest_url = f"https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml"
  manifest_content = None
  # Fetch the manifest content
  try:
    response = requests.get(manifest_url, timeout=10)
    response.raise_for_status()  # Raise an exception for HTTP errors
    manifest_content = response.text
  except requests.exceptions.Timeout as e:
    # Bug fix: the original message read "Request to {url} after 10 seconds"
    # and was missing the words "timed out".
    xpk_print(f"Error: Request to {manifest_url} timed out after 10 seconds: {e}")
    xpk_exit(1)
  except requests.exceptions.RequestException as e:
    xpk_print(f"Error fetching manifest from {manifest_url}: {e}")
    xpk_exit(1)

  if manifest_content is None:
    xpk_print("Manifest content not found.")
    xpk_exit(1)

  # Load all YAML documents from the manifest (a multi-document stream).
  yaml_data_list = list(yaml.safe_load_all(manifest_content))
  # Iterate through the yaml_data to find the Deployment for
  # jobset-controller-manager
  update_manifest = False
  for yaml_data in yaml_data_list:
    if (
        yaml_data
        and yaml_data.get("apiVersion") == "apps/v1"
        and yaml_data.get("kind") == "Deployment"
        and yaml_data.get("metadata", {}).get("name")
        == "jobset-controller-manager"
    ):
      # Found the Deployment, now modify the resources
      containers = yaml_data["spec"]["template"]["spec"]["containers"]
      for container in containers:
        if container["name"] == "manager":
          # Current resource limits and requests (defaults mean "unset").
          current_cpu_request = (
              container["resources"].get("requests", {}).get("cpu", "0m")
          )
          current_memory_request = (
              container["resources"].get("requests", {}).get("memory", "0Mi")
          )
          current_memory_limit = (
              container["resources"].get("limits", {}).get("memory", "0Mi")
          )

          # Minimum values the controller needs for MTC.
          new_cpu_request = "1000m"
          new_memory_request = "1Gi"
          new_memory_limit = "2Gi"

          # Only ever raise values; never lower an already-larger setting.
          if parse_resource_value(current_cpu_request) < parse_resource_value(
              new_cpu_request
          ):
            container["resources"]["requests"]["cpu"] = new_cpu_request
            update_manifest = True
          if parse_resource_value(
              current_memory_request
          ) < parse_resource_value(new_memory_request):
            container["resources"]["requests"]["memory"] = new_memory_request
            update_manifest = True
          if parse_resource_value(current_memory_limit) < parse_resource_value(
              new_memory_limit
          ):
            container["resources"]["limits"]["memory"] = new_memory_limit
            update_manifest = True
          break
      if update_manifest:
        xpk_print("Jobset controller updation required.")
        return yaml_data
  xpk_print("Jobset controller no updation required.")
  # Explicit None (was implicit fall-through) to signal "no update needed".
  return None
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def parse_resource_value(value) -> int:
  """Convert a Kubernetes resource quantity string into a comparable integer.

  CPU quantities are compared in millicores ("m" suffix) and memory
  quantities in MiB ("Mi", with "Gi" scaled to MiB). A bare number is
  returned at face value, so it is only meaningful when compared against a
  quantity expressed in the same implicit unit.

  Args:
    value: quantity string such as "500m", "1Gi", "512Mi" or "2".

  Returns:
    The numeric magnitude, normalized as described above.
  """
  if value.endswith("m"):
    return int(value[:-1])
  if value.endswith("Mi"):
    return int(value[:-2])
  if value.endswith("Gi"):
    return int(value[:-2]) * 1024
  # Robustness fix: bare quantities may be fractional (e.g. cpu: "1.5");
  # int("1.5") raises ValueError, so go through float first.
  return int(float(value))
|
xpk/core/nap.py
CHANGED
|
@@ -255,6 +255,10 @@ def is_autoprovisioning_enabled(
|
|
|
255
255
|
bool is true if autoprovisioning is enabled, false otherwise.
|
|
256
256
|
int of 0 if successful and 1 otherwise.
|
|
257
257
|
"""
|
|
258
|
+
# Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
|
|
259
|
+
if args.use_pathways:
|
|
260
|
+
return False, 0
|
|
261
|
+
|
|
258
262
|
resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
|
|
259
263
|
cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
|
|
260
264
|
|
xpk/core/network.py
CHANGED
|
@@ -14,12 +14,10 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from ..utils.console import xpk_print
|
|
17
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
18
18
|
from ..utils.file import write_tmp_file
|
|
19
|
-
from .capacity import H100_DEVICE_TYPE
|
|
20
19
|
from .commands import run_command_for_value, run_command_with_updates
|
|
21
20
|
from .gcloud_context import zone_to_region
|
|
22
|
-
from .system_characteristics import SystemCharacteristics
|
|
23
21
|
|
|
24
22
|
# cluster_network_yaml: the config when creating the network for a3 cluster
|
|
25
23
|
CLUSTER_NETWORK_YAML = """
|
|
@@ -175,16 +173,6 @@ def create_cluster_subnet(args, index) -> int:
|
|
|
175
173
|
return 0
|
|
176
174
|
|
|
177
175
|
|
|
178
|
-
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
|
|
179
|
-
return [f'{cluster_name}-gpunet-{i}-subnet' for i in range(8)]
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
|
|
183
|
-
return [f'{cluster_name}-sub-1'] + [
|
|
184
|
-
f'{cluster_name}-rdma-sub-{i}' for i in range(8)
|
|
185
|
-
]
|
|
186
|
-
|
|
187
|
-
|
|
188
176
|
def create_cluster_firewall_rule(args, index) -> int:
|
|
189
177
|
"""Create one GKE Cluster firewall rule.
|
|
190
178
|
|
|
@@ -247,20 +235,40 @@ def create_cluster_network_config(args) -> int:
|
|
|
247
235
|
return 0
|
|
248
236
|
|
|
249
237
|
|
|
250
|
-
def
|
|
251
|
-
"""
|
|
252
|
-
|
|
253
|
-
|
|
238
|
+
def get_cluster_subnetworks(args) -> list[str]:
  """Gets the list of cluster networks.

  Args:
    args: user provided arguments for running the command.

  Returns:
    list[str]: list of cluster networks
  """
  return_code, stdout = run_command_for_value(
      'kubectl get GKENetworkParamSet', 'Get Cluster Networks', args
  )
  if return_code != 0:
    xpk_print('GKE Cluster Get NetworkParamSet failed')
    xpk_exit(return_code)

  # First whitespace-separated token of every output line is the name.
  names = []
  for line in stdout.splitlines():
    names.append(line.split()[0])

  # Drop the kubectl header row.
  return names[1:]
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def set_up_cluster_network_for_a3(args) -> int:
|
|
261
|
+
"""Set up GKE Cluster networks, subnets and firewall rules for A3.
|
|
262
|
+
Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node.
|
|
254
263
|
|
|
255
264
|
Args:
|
|
256
265
|
args: user provided arguments for running the command.
|
|
257
|
-
system: system characteristics.
|
|
258
266
|
|
|
259
267
|
Returns:
|
|
260
268
|
0 if successful and 1 otherwise.
|
|
261
269
|
"""
|
|
262
|
-
num_networks =
|
|
263
|
-
for i in range(1, num_networks):
|
|
270
|
+
num_networks = 4
|
|
271
|
+
for i in range(1, num_networks + 1):
|
|
264
272
|
return_code = create_cluster_network(args, i)
|
|
265
273
|
if return_code != 0:
|
|
266
274
|
return 1
|
|
@@ -315,7 +323,10 @@ def get_all_networks_programmatic(args) -> tuple[list[str], int]:
|
|
|
315
323
|
Returns:
|
|
316
324
|
List of networks and 0 if successful and 1 otherwise.
|
|
317
325
|
"""
|
|
318
|
-
command =
|
|
326
|
+
command = (
|
|
327
|
+
'gcloud compute networks list --format="csv[no-heading](name)" '
|
|
328
|
+
f' --project={args.project}'
|
|
329
|
+
)
|
|
319
330
|
return_code, raw_network_output = run_command_for_value(
|
|
320
331
|
command, 'Get All Networks', args
|
|
321
332
|
)
|
|
@@ -365,7 +376,8 @@ def get_all_firewall_rules_programmatic(args) -> tuple[list[str], int]:
|
|
|
365
376
|
List of firewall rules and 0 if successful and 1 otherwise.
|
|
366
377
|
"""
|
|
367
378
|
command = (
|
|
368
|
-
'gcloud compute firewall-rules list --format="csv[no-heading](name)"'
|
|
379
|
+
'gcloud compute firewall-rules list --format="csv[no-heading](name)" '
|
|
380
|
+
f' --project={args.project}'
|
|
369
381
|
)
|
|
370
382
|
return_code, raw_subnets_output = run_command_for_value(
|
|
371
383
|
command, 'Get All Firewall Rules', args
|
xpk/core/nodepool.py
CHANGED
|
@@ -37,6 +37,8 @@ CLOUD_PLATFORM_AUTH_SCOPE_URL = (
|
|
|
37
37
|
'"https://www.googleapis.com/auth/cloud-platform"'
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
+
OLDER_PATHWAYS_CPU_NP_TO_DELETE = ['cpu-rm-np', 'cpu-proxy-np', 'cpu-user-np']
|
|
41
|
+
|
|
40
42
|
|
|
41
43
|
def run_gke_node_pool_create_command(
|
|
42
44
|
args, system, gke_node_pool_version
|
|
@@ -122,7 +124,10 @@ def run_gke_node_pool_create_command(
|
|
|
122
124
|
args, system, existing_node_pool_names, desired_node_pool_names
|
|
123
125
|
)
|
|
124
126
|
for node_pool_name in existing_node_pool_names:
|
|
125
|
-
if
|
|
127
|
+
if (
|
|
128
|
+
node_pool_name.find(f'{args.cluster}-np-') != 0
|
|
129
|
+
and node_pool_name not in OLDER_PATHWAYS_CPU_NP_TO_DELETE
|
|
130
|
+
):
|
|
126
131
|
continue
|
|
127
132
|
|
|
128
133
|
if node_pool_name in node_pools_to_delete:
|
|
@@ -283,28 +288,15 @@ def run_gke_node_pool_create_command(
|
|
|
283
288
|
command += (
|
|
284
289
|
' --accelerator'
|
|
285
290
|
f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
|
|
286
|
-
' --no-enable-autoupgrade '
|
|
287
|
-
f' --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL} --additional-node-network'
|
|
288
|
-
f' network={args.cluster}-net-1,subnetwork={subnet_prefix}-sub-1'
|
|
289
|
-
' --additional-node-network'
|
|
290
|
-
f' network={args.cluster}-net-2,subnetwork={subnet_prefix}-sub-2'
|
|
291
|
-
' --additional-node-network'
|
|
292
|
-
f' network={args.cluster}-net-3,subnetwork={subnet_prefix}-sub-3'
|
|
293
|
-
' --additional-node-network'
|
|
294
|
-
f' network={args.cluster}-net-4,subnetwork={subnet_prefix}-sub-4'
|
|
291
|
+
f' --no-enable-autoupgrade --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL}'
|
|
295
292
|
)
|
|
296
293
|
if device_type == H100_MEGA_DEVICE_TYPE:
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
f' network={args.cluster}-net-7,subnetwork={subnet_prefix}-sub-7'
|
|
304
|
-
' --additional-node-network'
|
|
305
|
-
f' network={args.cluster}-net-8,subnetwork={subnet_prefix}-sub-8'
|
|
306
|
-
' --max-pods-per-node=32'
|
|
307
|
-
)
|
|
294
|
+
for i in range(1, 9):
|
|
295
|
+
command += (
|
|
296
|
+
' --additional-node-network'
|
|
297
|
+
f' network={args.cluster}-net-{i},subnetwork={subnet_prefix}-sub-{i}'
|
|
298
|
+
)
|
|
299
|
+
command += ' --max-pods-per-node=32'
|
|
308
300
|
elif system.accelerator_type == AcceleratorType['CPU']:
|
|
309
301
|
command += f' --num-nodes={system.vms_per_slice}'
|
|
310
302
|
command += (
|
|
@@ -318,7 +310,7 @@ def run_gke_node_pool_create_command(
|
|
|
318
310
|
create_commands.append(command)
|
|
319
311
|
create_task_names.append(task)
|
|
320
312
|
|
|
321
|
-
desired_pw_cpu_node_pools = ['cpu-
|
|
313
|
+
desired_pw_cpu_node_pools = ['cpu-np']
|
|
322
314
|
if args.enable_pathways:
|
|
323
315
|
# Pathways needs CPU nodepools in addition to TPU nodepools
|
|
324
316
|
for node_pool_name in desired_pw_cpu_node_pools:
|
|
@@ -368,11 +360,9 @@ def get_node_pools_to_delete(
|
|
|
368
360
|
check_resource, is_requested_resource_in_cluster = check_cluster_resources(
|
|
369
361
|
args, system
|
|
370
362
|
)
|
|
371
|
-
|
|
372
|
-
# Deletion logic would leave behind any Pathways CPU nodepools.
|
|
373
|
-
if existing_node_pool_name.find(f'{args.cluster}-np-') != 0:
|
|
374
|
-
continue
|
|
363
|
+
xpk_print('Existing node pool names ', existing_node_pool_names)
|
|
375
364
|
|
|
365
|
+
for existing_node_pool_name in existing_node_pool_names:
|
|
376
366
|
# Nodepools will be deleted in two scenarios:
|
|
377
367
|
# Scenario 1: Cluster exists with 3 nodepools of 'x' device_type/gke_accelerator and now we are updating
|
|
378
368
|
# the cluster to 2 nodepools of 'x' device_type/gke_accelerator. In this case, we will delete
|
|
@@ -380,6 +370,18 @@ def get_node_pools_to_delete(
|
|
|
380
370
|
# Scenario 2: Cluster exists with 2 nodepools of 'x' device_type/gke_accelerator and now we are updating
|
|
381
371
|
# the cluster to 2 nodepools of 'y' device_type/gke_accelerator. In this case, we will delete
|
|
382
372
|
# '{args.cluster}-np-0' and '{args.cluster}-np-1' from the cluster.
|
|
373
|
+
# Scenario 3: Deletes older Pathways CPU nodepools named cpu-rm-np, cpu-proxy-np and cpu-user-np
|
|
374
|
+
|
|
375
|
+
if existing_node_pool_name in OLDER_PATHWAYS_CPU_NP_TO_DELETE:
|
|
376
|
+
node_pools_to_delete.append(existing_node_pool_name)
|
|
377
|
+
xpk_print(
|
|
378
|
+
'Upgrading Pathways version on the cluster. Deleting older pathways'
|
|
379
|
+
' nodepool ',
|
|
380
|
+
existing_node_pool_name,
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
if existing_node_pool_name.find(f'{args.cluster}-np-') != 0:
|
|
384
|
+
continue
|
|
383
385
|
if existing_node_pool_name not in desired_node_pool_names or (
|
|
384
386
|
check_resource and not is_requested_resource_in_cluster
|
|
385
387
|
):
|