xpk-0.6.0-py3-none-any.whl → xpk-0.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/batch.py +27 -5
- xpk/commands/cluster.py +104 -80
- xpk/commands/cluster_gcluster.py +94 -10
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +8 -10
- xpk/commands/inspector.py +5 -11
- xpk/commands/job.py +9 -7
- xpk/commands/kind.py +34 -4
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +27 -7
- xpk/commands/storage.py +280 -0
- xpk/commands/version.py +6 -18
- xpk/commands/workload.py +381 -184
- xpk/core/blueprint/blueprint_definitions.py +1 -0
- xpk/core/blueprint/blueprint_generator.py +132 -76
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +6 -3
- xpk/core/commands.py +18 -14
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +20 -2
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +257 -18
- xpk/core/kueue.py +12 -6
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +32 -20
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +124 -45
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +38 -1
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +209 -1
- xpk/core/workload_decorators/rdma_decorator.py +25 -5
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
- xpk/main.py +3 -1
- xpk/parser/batch.py +10 -151
- xpk/parser/cluster.py +49 -8
- xpk/parser/common.py +189 -1
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +27 -1
- xpk/parser/info.py +2 -1
- xpk/parser/inspector.py +3 -3
- xpk/parser/job.py +25 -4
- xpk/parser/kind.py +3 -2
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +10 -1
- xpk/parser/storage.py +326 -0
- xpk/parser/validators.py +3 -3
- xpk/parser/workload.py +118 -76
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/objects.py +8 -5
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
- xpk-0.7.1.dist-info/RECORD +92 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
- xpk/core/core.py +0 -2824
- xpk-0.6.0.dist-info/RECORD +0 -57
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/core/nodepool.py
ADDED
@@ -0,0 +1,581 @@
"""
Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from ..utils.console import get_user_input, xpk_print
from .capacity import (
    AUTOPROVISIONING_CONFIG_VALUE,
    H100_MEGA_DEVICE_TYPE,
    CapacityType,
    get_capacity_arguments_from_capacity_type,
    get_capacity_type,
    print_reservations,
)
from .commands import run_command_for_value, run_commands
from .gcloud_context import GkeServerConfig, zone_to_region
from .resources import (
    CLUSTER_CONFIGMAP_YAML,
    CLUSTER_RESOURCES_CONFIGMAP,
    check_cluster_resources,
    create_or_update_cluster_configmap,
)
from .system_characteristics import AcceleratorType

CLOUD_PLATFORM_AUTH_SCOPE_URL = (
    '"https://www.googleapis.com/auth/cloud-platform"'
)


def run_gke_node_pool_create_command(
    args, system, gke_node_pool_version
) -> int:
  """Run the Create GKE Node Pool request.

  Args:
    args: user provided arguments for running the command.
    system: System characteristics based on device type/topology.
    gke_node_pool_version: GKE version to use to create node pools.

  Returns:
    0 if successful and 1 otherwise.
  """
  device_type = args.tpu_type if args.tpu_type else args.device_type
  xpk_print(
      f'Creating {args.num_slices} node pool or pools of {device_type}\n'
      f'We assume that the underlying system is: {system}'
  )
  existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
  if return_code > 0:
    xpk_print('Listing all node pools failed!')
    return return_code

  capacity_type, return_code = get_capacity_type(args)
  if return_code > 0:
    xpk_print('Parsing capacity type failed!')
    return return_code
  if capacity_type == CapacityType.UNKNOWN:
    return_code = print_reservations(args)
    xpk_print(
        'ERROR: User needs to provide the capacity type. Please specify one of'
        ' the following `--reservation=$RESERVATION_NAME`, `--on-demand`'
        ' or `--spot`. See the above list of reservations to choose from.'
    )
    if return_code > 0:
      xpk_print('Listing all reservations failed!')
    return_code = 1
  capacity_args, return_code = get_capacity_arguments_from_capacity_type(
      args, capacity_type
  )
  if return_code > 0:
    xpk_print('Parsing capacity arguments failed!')
    return return_code

  if system.accelerator_type == AcceleratorType['GPU']:
    xpk_print(
        f'Creating 1 node pool with {args.num_nodes} nodes of'
        f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
    )
    desired_node_pool_names = [f'{args.cluster}-np-0']
  else:
    xpk_print(
        f'Creating {args.num_slices} node pool or pools of'
        f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
    )
    desired_node_pool_names = [
        f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices)
    ]

  node_pools_to_remain = []
  delete_commands = []
  delete_task_names = []
  node_pools_to_update_WI = []
  update_WI_commands = []
  update_WI_task_names = []
  if existing_node_pool_names:
    return_code, existing_node_pool_zone = get_nodepool_zone(
        args, existing_node_pool_names[0]
    )
    if return_code != 0:
      return 1

    if existing_node_pool_zone and existing_node_pool_zone != args.zone:
      xpk_print(
          f'Cluster {args.cluster} already has nodepools in zone:'
          f' {existing_node_pool_zone}. Use the same zone to update nodepools'
          ' in the cluster.'
      )
      return 1

    node_pools_to_delete = get_node_pools_to_delete(
        args, system, existing_node_pool_names, desired_node_pool_names
    )
    for node_pool_name in existing_node_pool_names:
      if node_pool_name.find(f'{args.cluster}-np-') != 0:
        continue

      if node_pool_name in node_pools_to_delete:
        command = (
            'gcloud beta container node-pools delete'
            f' {node_pool_name} --cluster={args.cluster}'
            f' --zone={zone_to_region(args.zone)}'
            f' --project={args.project} --quiet'
        )
        task = f'NodepoolDelete-{node_pool_name}'
        delete_commands.append(command)
        delete_task_names.append(task)
      else:
        node_pools_to_remain.append(node_pool_name)

    # Workload Identity for existing nodepools
    if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
      for node_pool_name in existing_node_pool_names:
        if not node_pool_name in node_pools_to_delete:
          # Check if workload identity is not already enabled:
          return_code, existing_node_pool_medadata_mode = (
              get_nodepool_workload_metadata_mode(args, node_pool_name)
          )
          if return_code != 0:
            return 1

          if (
              existing_node_pool_zone
              and existing_node_pool_medadata_mode != 'GKE_METADATA'
          ):
            command = (
                'gcloud container node-pools update'
                f' {node_pool_name} --cluster={args.cluster}'
                f' --zone={zone_to_region(args.zone)}'
                f' --project={args.project} --quiet'
                ' --workload-metadata=GKE_METADATA'
            )
            task = (
                'Update nodepool with Workload Identity enabled'
                f' {node_pool_name}'
            )
            update_WI_commands.append(command)
            update_WI_task_names.append(task)
            node_pools_to_update_WI.append(node_pool_name)

  # Deletion of nodepools should happen before attempting to create new nodepools for the case
  # when cluster is getting updated from 'x' device_type/gke_accelerator to 'y' device_type/gke_accelerator.
  # In that case, '{args.cluster}-np-i' nodepool will be re-created for 'y' device_type/gke_accelerator.
  if delete_commands:
    will_delete = True
    if node_pools_to_delete and not args.force:
      will_delete = get_user_input(
          f'Planning to delete {len(node_pools_to_delete)} node pools including'
          f' {node_pools_to_delete}. \nDo you wish to delete: y (yes) / n'
          ' (no):\n'
      )
    if not will_delete:
      xpk_print(
          'You have requested to not delete the existing nodepools in the'
          ' cluster. There will be no change to the cluster.'
      )
      return 1

    for i, command in enumerate(delete_commands):
      xpk_print(
          f'To complete {delete_task_names[i]} we are executing {command}'
      )
    max_return_code = run_commands(
        delete_commands,
        'Delete Nodepools',
        delete_task_names,
        dry_run=args.dry_run,
    )
    if max_return_code != 0:
      xpk_print(f'Delete Nodepools returned ERROR {max_return_code}')
      return 1

  # Enable Workload Identity on existing Nodepools
  if update_WI_commands:
    will_update_WI = True
    if node_pools_to_update_WI and not args.force:
      will_update_WI = get_user_input(
          'Planning to enable Workload Identity Federation on'
          f' {len(node_pools_to_update_WI)} existing node pools including'
          f' {node_pools_to_update_WI}.This immediately enables Workload'
          ' Identity Federation for GKE for any workloads running in the node'
          ' pool. Also, xpk does not support disabling Workload Identity on'
          ' clusters that have it enabled already \nDo you wish to update: y'
          ' (yes) / n (no):\n'
      )
    if not will_update_WI:
      for i, command in enumerate(update_WI_commands):
        xpk_print(
            f'To complete {update_WI_task_names[i]} we are executing {command}'
        )
      max_return_code = run_commands(
          update_WI_commands,
          'Enable Workload Identity on existing Nodepools',
          update_WI_task_names,
          dry_run=args.dry_run,
      )
      if max_return_code != 0:
        xpk_print(
            'Enable Workload Identity on existing Nodepools returned ERROR'
            f' {max_return_code}'
        )
        return 1

  # Update {args.cluster}-{_CLUSTER_RESOURCES_CONFIGMAP} ConfigMap to 'y': '0'
  # and remove 'x' from the ConfigMap when cluster is getting updated from
  # 'x' device_type/gke_accelerator to 'y' device_type/gke_accelerator.
  if not node_pools_to_remain:
    if args.enable_autoprovisioning:
      resources_data = (
          f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
      )
    else:
      resources_data = f'{device_type}: "0"'
    resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
    resources_yml = CLUSTER_CONFIGMAP_YAML.format(
        args=args, name=resources_configmap_name, data=resources_data
    )
    configmap_yml = {}
    configmap_yml[resources_configmap_name] = resources_yml
    return_code = create_or_update_cluster_configmap(configmap_yml)
    if return_code != 0:
      return 1

  create_commands = []
  create_task_names = []
  for node_pool_name in desired_node_pool_names:
    if node_pool_name in node_pools_to_remain:
      continue
    command = (
        'gcloud beta container node-pools create'
        f' {node_pool_name}'
        f' --region={zone_to_region(args.zone)}'
        f' --cluster={args.cluster}'
        f' --project={args.project} --node-locations={args.zone}'
        f' --machine-type={system.gce_machine_type}'
        f' --host-maintenance-interval={args.host_maintenance_interval}'
        f' {capacity_args}'
        ' --enable-gvnic'
        f' {args.custom_nodepool_arguments}'
    )
    if system.accelerator_type == AcceleratorType['TPU']:
      command += f' --node-version={gke_node_pool_version}'
      command += f' --num-nodes={system.vms_per_slice}'
      command += ' --placement-type=COMPACT --max-pods-per-node 15'
      command += (
          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
      )
      command += f' --tpu-topology={system.topology}'
      command += f' {args.custom_tpu_nodepool_arguments}'
    elif system.accelerator_type == AcceleratorType['GPU']:
      subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
      command += f' --num-nodes={args.num_nodes}'
      command += (
          ' --accelerator'
          f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
          ' --no-enable-autoupgrade '
          f' --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL} --additional-node-network'
          f' network={args.cluster}-net-1,subnetwork={subnet_prefix}-sub-1'
          ' --additional-node-network'
          f' network={args.cluster}-net-2,subnetwork={subnet_prefix}-sub-2'
          ' --additional-node-network'
          f' network={args.cluster}-net-3,subnetwork={subnet_prefix}-sub-3'
          ' --additional-node-network'
          f' network={args.cluster}-net-4,subnetwork={subnet_prefix}-sub-4'
      )
      if device_type == H100_MEGA_DEVICE_TYPE:
        command += (
            ' --additional-node-network'
            f' network={args.cluster}-net-5,subnetwork={subnet_prefix}-sub-5'
            ' --additional-node-network'
            f' network={args.cluster}-net-6,subnetwork={subnet_prefix}-sub-6'
            ' --additional-node-network'
            f' network={args.cluster}-net-7,subnetwork={subnet_prefix}-sub-7'
            ' --additional-node-network'
            f' network={args.cluster}-net-8,subnetwork={subnet_prefix}-sub-8'
            ' --max-pods-per-node=32'
        )
    elif system.accelerator_type == AcceleratorType['CPU']:
      command += f' --num-nodes={system.vms_per_slice}'
      command += (
          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
      )

    if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
      command += ' --workload-metadata=GKE_METADATA'

    task = f'NodepoolCreate-{node_pool_name}'
    create_commands.append(command)
    create_task_names.append(task)

  desired_pw_cpu_node_pools = ['cpu-user-np', 'cpu-rm-np', 'cpu-proxy-np']
  if args.enable_pathways:
    # Pathways needs CPU nodepools in addition to TPU nodepools
    for node_pool_name in desired_pw_cpu_node_pools:
      if node_pool_name in existing_node_pool_names:
        continue
      command = (
          'gcloud beta container node-pools create'
          f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
          f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
          ' --min-nodes=1 --max-nodes=20'
      )
      task = f'NodepoolCreate-{node_pool_name}'
      create_commands.append(command)
      create_task_names.append(task)

  for i, command in enumerate(create_commands):
    xpk_print(f'To complete {create_task_names[i]} we are executing {command}')
  max_return_code = run_commands(
      create_commands,
      'Create Nodepools',
      create_task_names,
      dry_run=args.dry_run,
  )
  if max_return_code != 0:
    xpk_print(f'Create Nodepools returned ERROR {max_return_code}')
    return 1

  xpk_print('Create or delete node pool request complete.')
  return 0


def get_node_pools_to_delete(
    args, system, existing_node_pool_names, desired_node_pool_names
) -> list:
  """Get list of nodepools to delete from the cluster.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.
    existing_node_pool_names: names of nodepools that already exist in the cluster.
    desired_node_pool_names: names of nodepools that should exist in the cluster.

  Returns:
    List of nodepool names to delete.
  """
  node_pools_to_delete = []
  check_resource, is_requested_resource_in_cluster = check_cluster_resources(
      args, system
  )
  for existing_node_pool_name in existing_node_pool_names:
    # Deletion logic would leave behind any Pathways CPU nodepools.
    if existing_node_pool_name.find(f'{args.cluster}-np-') != 0:
      continue

    # Nodepools will be deleted in two scenarios:
    # Scenario 1: Cluster exists with 3 nodepools of 'x' device_type/gke_accelerator and now we are updating
    # the cluster to 2 nodepools of 'x' device_type/gke_accelerator. In this case, we will delete
    # '{args.cluster}-np-2' from the cluster.
    # Scenario 2: Cluster exists with 2 nodepools of 'x' device_type/gke_accelerator and now we are updating
    # the cluster to 2 nodepools of 'y' device_type/gke_accelerator. In this case, we will delete
    # '{args.cluster}-np-0' and '{args.cluster}-np-1' from the cluster.
    if existing_node_pool_name not in desired_node_pool_names or (
        check_resource and not is_requested_resource_in_cluster
    ):
      node_pools_to_delete.append(existing_node_pool_name)

  return node_pools_to_delete


def get_all_nodepools_programmatic(args) -> tuple[list[str], int]:
  """Gets all the nodepools associated with the cluster / project / region.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of nodepools and 0 if successful and 1 otherwise.
  """
  command = (
      'gcloud beta container node-pools list'
      ' --cluster'
      f' {args.cluster} --project={args.project} --region={zone_to_region(args.zone)}'
      ' --format="csv[no-heading](name)"'
  )
  return_code, raw_nodepool_output = run_command_for_value(
      command, 'Get All Node Pools', args
  )
  if return_code != 0:
    xpk_print(f'Get All Node Pools returned ERROR {return_code}')
    return [], 1

  return raw_nodepool_output.splitlines(), 0


def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]:
  """Return zone in which nodepool exists in the cluster.

  Args:
    args: user provided arguments for running the command.
    nodepool_name: name of nodepool.

  Returns:
    Tuple of int, str where
    int is the return code - 0 if successful, 1 otherwise.
    str is the zone of nodepool.
  """
  command = (
      f'gcloud beta container node-pools describe {nodepool_name}'
      f' --cluster {args.cluster} --project={args.project}'
      f' --region={zone_to_region(args.zone)} --format="value(locations)"'
  )
  return_code, nodepool_zone = run_command_for_value(
      command, 'Get Node Pool Zone', args
  )
  if return_code != 0:
    xpk_print(f'Get Node Pool Zone returned ERROR {return_code}')
    return 1, None

  return 0, nodepool_zone.strip()


def get_gke_node_pool_version(
    args, gke_server_config: GkeServerConfig
) -> tuple[int, str | None]:
  """Determine the gke node pool version for the node pool.

  Args:
    args: user provided arguments for running the command.
    gke_server_config: holds valid gke versions and recommended default version.

  Returns:
    Tuple of
    int: 0 if successful and 1 otherwise.
    str: gke control plane version to use.
  """

  # By default use the current gke master version for creating node pools.
  command_description = 'Determine current gke master version'
  command = (
      f'gcloud beta container clusters describe {args.cluster}'
      f' --region {zone_to_region(args.zone)} --project {args.project}'
      ' --format="value(currentMasterVersion)"'
  )

  return_code, current_gke_master_version = run_command_for_value(
      command, command_description, args
  )
  if return_code != 0:
    xpk_print(
        f'Unable to get server config for command: {command_description}.'
    )
    return return_code, None

  # Override with user provide gke version if specified.
  if args.gke_version is not None:
    node_pool_gke_version = args.gke_version
  else:
    master_gke_version = current_gke_master_version.strip()
    node_pool_gke_version = ''
    # Select minimum version which is >= master_gke_version and has the same minor version.
    # If this does not exist select maximum version which is < master_gke_version.
    for version in gke_server_config.valid_versions:
      if (
          (node_pool_gke_version == '' or node_pool_gke_version < version)
          and version < master_gke_version
      ) or (
          (node_pool_gke_version == '' or node_pool_gke_version > version)
          and master_gke_version <= version
          and master_gke_version.split('.')[:2] == version.split('.')[:2]
      ):
        node_pool_gke_version = version

  is_supported_node_pool_version = (
      node_pool_gke_version in gke_server_config.valid_versions
  )
  # In rare cases, user's provided gke version may be invalid, but gke will return an error if so.
  # An example scenario is if the user provided gke version is greater than the master version.
  if not is_supported_node_pool_version:
    xpk_print(
        f'Planned node pool version {node_pool_gke_version} is not supported in'
        ' valid version'
        f' {gke_server_config.valid_versions}\nPlease adjust the gke version'
        ' using --gke-version=x or remove the arg and depend on xpk default of'
        f' {current_gke_master_version}'
    )
    return 1, None
  return 0, node_pool_gke_version


def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
  """Upgrade nodepools in the cluster to default rapid gke version. Recreates the nodes.

  Args:
    args: user provided arguments for running the command.
    default_rapid_gke_version: Rapid default version for the upgrade.

  Returns:
    0 if successful and 1 otherwise.
  """
  existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
  if return_code != 0:
    xpk_print('Listing all node pools failed!')
    return return_code

  # Batch execution to upgrade node pools simultaneously
  commands = []
  task_names = []
  for node_pool_name in existing_node_pool_names:
    commands.append(
        'gcloud container clusters upgrade'
        f' {args.cluster} --project={args.project}'
        f' --region={zone_to_region(args.zone)}'
        f' --cluster-version={default_rapid_gke_version}'
        f' --node-pool={node_pool_name}'
        ' --quiet'
    )
    task_names.append(f'Upgrading node pool {node_pool_name}.')

  for i, command in enumerate(commands):
    xpk_print(f'To complete {task_names[i]} we are executing {command}')
  max_return_code = run_commands(
      commands, 'Update GKE node pools to default RAPID GKE version', task_names
  )
  if max_return_code != 0:
    xpk_print(
        'GKE node pools update to default RAPID GKE version returned ERROR:'
        f' {max_return_code}'
    )
    return int(max_return_code)
  return 0


def get_nodepool_workload_metadata_mode(
    args, nodepool_name
) -> tuple[int, str | None]:
  """Return Workload Identity metadata mode of the nodepool.
  Args:
    args: user provided arguments for running the command.
    nodepool_name: name of nodepool.
  Returns:
    Tuple of int, str where
    int is the return code - 0 if successful, 1 otherwise.
    str is the workload metadata mode of nodepool.
  """
  command = (
      f'gcloud beta container node-pools describe {nodepool_name}'
      f' --cluster {args.cluster} --project={args.project}'
      f' --region={zone_to_region(args.zone)} --format="value(config.workloadMetadataConfig.mode)"'
  )
  return_code, nodepool_WI_mode = run_command_for_value(
      command, 'Get Node Pool Workload Identity Metadata Mode', args
  )
  if return_code != 0:
    xpk_print(
        'Get Node Pool Workload Identity Metadata Mode returned ERROR'
        f' {return_code}'
    )
    return 1, None

  return 0, nodepool_WI_mode.strip()