xpk 0.6.0-py3-none-any.whl → 0.7.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. xpk/api/__init__.py +15 -0
  2. xpk/api/storage_crd.yaml +52 -0
  3. xpk/commands/batch.py +27 -5
  4. xpk/commands/cluster.py +104 -80
  5. xpk/commands/cluster_gcluster.py +94 -10
  6. xpk/commands/common.py +44 -0
  7. xpk/commands/config.py +29 -0
  8. xpk/commands/info.py +8 -10
  9. xpk/commands/inspector.py +5 -11
  10. xpk/commands/job.py +9 -7
  11. xpk/commands/kind.py +34 -4
  12. xpk/commands/kjob_common.py +44 -0
  13. xpk/commands/run.py +128 -0
  14. xpk/commands/shell.py +27 -7
  15. xpk/commands/storage.py +280 -0
  16. xpk/commands/version.py +6 -18
  17. xpk/commands/workload.py +381 -184
  18. xpk/core/blueprint/blueprint_definitions.py +1 -0
  19. xpk/core/blueprint/blueprint_generator.py +132 -76
  20. xpk/core/capacity.py +185 -0
  21. xpk/core/cluster.py +564 -0
  22. xpk/core/cluster_private.py +6 -3
  23. xpk/core/commands.py +18 -14
  24. xpk/core/config.py +179 -0
  25. xpk/core/docker_container.py +225 -0
  26. xpk/core/docker_image.py +210 -0
  27. xpk/core/docker_resources.py +350 -0
  28. xpk/core/filestore.py +251 -0
  29. xpk/core/gcloud_context.py +196 -0
  30. xpk/core/gcluster_manager.py +20 -2
  31. xpk/core/gcsfuse.py +50 -0
  32. xpk/core/kjob.py +257 -18
  33. xpk/core/kueue.py +12 -6
  34. xpk/core/monitoring.py +134 -0
  35. xpk/core/nap.py +32 -20
  36. xpk/core/network.py +377 -0
  37. xpk/core/nodepool.py +581 -0
  38. xpk/core/pathways.py +124 -45
  39. xpk/core/remote_state/__init__.py +15 -0
  40. xpk/core/remote_state/fuse_remote_state.py +99 -0
  41. xpk/core/remote_state/remote_state_client.py +38 -0
  42. xpk/core/resources.py +238 -0
  43. xpk/core/scheduling.py +253 -0
  44. xpk/core/storage.py +581 -0
  45. xpk/core/system_characteristics.py +38 -1
  46. xpk/core/vertex.py +105 -0
  47. xpk/core/workload.py +209 -1
  48. xpk/core/workload_decorators/rdma_decorator.py +25 -5
  49. xpk/core/workload_decorators/storage_decorator.py +52 -0
  50. xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
  51. xpk/main.py +3 -1
  52. xpk/parser/batch.py +10 -151
  53. xpk/parser/cluster.py +49 -8
  54. xpk/parser/common.py +189 -1
  55. xpk/parser/config.py +49 -0
  56. xpk/parser/core.py +27 -1
  57. xpk/parser/info.py +2 -1
  58. xpk/parser/inspector.py +3 -3
  59. xpk/parser/job.py +25 -4
  60. xpk/parser/kind.py +3 -2
  61. xpk/parser/run.py +47 -0
  62. xpk/parser/shell.py +10 -1
  63. xpk/parser/storage.py +326 -0
  64. xpk/parser/validators.py +3 -3
  65. xpk/parser/workload.py +118 -76
  66. xpk/templates/__init__.py +15 -0
  67. xpk/templates/storage.yaml +13 -0
  68. xpk/utils/gcs_utils.py +125 -0
  69. xpk/utils/kubectl.py +57 -0
  70. xpk/utils/objects.py +8 -5
  71. xpk/utils/templates.py +28 -0
  72. xpk/utils/validation.py +80 -0
  73. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
  74. xpk-0.7.1.dist-info/RECORD +92 -0
  75. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
  76. xpk/core/core.py +0 -2824
  77. xpk-0.6.0.dist-info/RECORD +0 -57
  78. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
  79. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
  80. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
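The most significant structural change in 0.7.1 is the removal of the monolithic xpk/core/core.py (-2,824 lines) in favor of focused modules such as xpk/core/capacity.py, xpk/core/cluster.py, xpk/core/network.py, xpk/core/nodepool.py, xpk/core/resources.py, and xpk/core/storage.py. The listing below shows one of these extracted modules, xpk/core/nodepool.py, in full. As a rough orientation for anyone importing xpk internals, a minimal sketch of the new layout follows; the import paths and names are inferred from this file list and from the nodepool.py source below, not from xpk documentation:

```python
# Inferred from this diff, not from xpk documentation: node-pool helpers that
# previously lived in the monolithic xpk.core.core module are exposed from
# xpk.core.nodepool in 0.7.1.
from xpk.core.nodepool import (
    get_all_nodepools_programmatic,
    get_gke_node_pool_version,
    run_gke_node_pool_create_command,
)
from xpk.core.gcloud_context import zone_to_region
```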
xpk/core/nodepool.py ADDED
@@ -0,0 +1,581 @@
+ """
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ from ..utils.console import get_user_input, xpk_print
+ from .capacity import (
+     AUTOPROVISIONING_CONFIG_VALUE,
+     H100_MEGA_DEVICE_TYPE,
+     CapacityType,
+     get_capacity_arguments_from_capacity_type,
+     get_capacity_type,
+     print_reservations,
+ )
+ from .commands import run_command_for_value, run_commands
+ from .gcloud_context import GkeServerConfig, zone_to_region
+ from .resources import (
+     CLUSTER_CONFIGMAP_YAML,
+     CLUSTER_RESOURCES_CONFIGMAP,
+     check_cluster_resources,
+     create_or_update_cluster_configmap,
+ )
+ from .system_characteristics import AcceleratorType
+
+ CLOUD_PLATFORM_AUTH_SCOPE_URL = (
+     '"https://www.googleapis.com/auth/cloud-platform"'
+ )
+
+
+ def run_gke_node_pool_create_command(
+     args, system, gke_node_pool_version
+ ) -> int:
+   """Run the Create GKE Node Pool request.
+
+   Args:
+     args: user provided arguments for running the command.
+     system: System characteristics based on device type/topology.
+     gke_node_pool_version: GKE version to use to create node pools.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   device_type = args.tpu_type if args.tpu_type else args.device_type
+   xpk_print(
+       f'Creating {args.num_slices} node pool or pools of {device_type}\n'
+       f'We assume that the underlying system is: {system}'
+   )
+   existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
+   if return_code > 0:
+     xpk_print('Listing all node pools failed!')
+     return return_code
+
+   capacity_type, return_code = get_capacity_type(args)
+   if return_code > 0:
+     xpk_print('Parsing capacity type failed!')
+     return return_code
+   if capacity_type == CapacityType.UNKNOWN:
+     return_code = print_reservations(args)
+     xpk_print(
+         'ERROR: User needs to provide the capacity type. Please specify one of'
+         ' the following `--reservation=$RESERVATION_NAME`, `--on-demand`'
+         ' or `--spot`. See the above list of reservations to choose from.'
+     )
+     if return_code > 0:
+       xpk_print('Listing all reservations failed!')
+     return_code = 1
+   capacity_args, return_code = get_capacity_arguments_from_capacity_type(
+       args, capacity_type
+   )
+   if return_code > 0:
+     xpk_print('Parsing capacity arguments failed!')
+     return return_code
+
+   if system.accelerator_type == AcceleratorType['GPU']:
+     xpk_print(
+         f'Creating 1 node pool with {args.num_nodes} nodes of'
+         f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+     )
+     desired_node_pool_names = [f'{args.cluster}-np-0']
+   else:
+     xpk_print(
+         f'Creating {args.num_slices} node pool or pools of'
+         f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+     )
+     desired_node_pool_names = [
+         f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices)
+     ]
+
+   node_pools_to_remain = []
+   delete_commands = []
+   delete_task_names = []
+   node_pools_to_update_WI = []
+   update_WI_commands = []
+   update_WI_task_names = []
+   if existing_node_pool_names:
+     return_code, existing_node_pool_zone = get_nodepool_zone(
+         args, existing_node_pool_names[0]
+     )
+     if return_code != 0:
+       return 1
+
+     if existing_node_pool_zone and existing_node_pool_zone != args.zone:
+       xpk_print(
+           f'Cluster {args.cluster} already has nodepools in zone:'
+           f' {existing_node_pool_zone}. Use the same zone to update nodepools'
+           ' in the cluster.'
+       )
+       return 1
+
+     node_pools_to_delete = get_node_pools_to_delete(
+         args, system, existing_node_pool_names, desired_node_pool_names
+     )
+     for node_pool_name in existing_node_pool_names:
+       if node_pool_name.find(f'{args.cluster}-np-') != 0:
+         continue
+
+       if node_pool_name in node_pools_to_delete:
+         command = (
+             'gcloud beta container node-pools delete'
+             f' {node_pool_name} --cluster={args.cluster}'
+             f' --zone={zone_to_region(args.zone)}'
+             f' --project={args.project} --quiet'
+         )
+         task = f'NodepoolDelete-{node_pool_name}'
+         delete_commands.append(command)
+         delete_task_names.append(task)
+       else:
+         node_pools_to_remain.append(node_pool_name)
+
+     # Workload Identity for existing nodepools
+     if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
+       for node_pool_name in existing_node_pool_names:
+         if not node_pool_name in node_pools_to_delete:
+           # Check if workload identity is not already enabled:
+           return_code, existing_node_pool_medadata_mode = (
+               get_nodepool_workload_metadata_mode(args, node_pool_name)
+           )
+           if return_code != 0:
+             return 1
+
+           if (
+               existing_node_pool_zone
+               and existing_node_pool_medadata_mode != 'GKE_METADATA'
+           ):
+             command = (
+                 'gcloud container node-pools update'
+                 f' {node_pool_name} --cluster={args.cluster}'
+                 f' --zone={zone_to_region(args.zone)}'
+                 f' --project={args.project} --quiet'
+                 ' --workload-metadata=GKE_METADATA'
+             )
+             task = (
+                 'Update nodepool with Workload Identity enabled'
+                 f' {node_pool_name}'
+             )
+             update_WI_commands.append(command)
+             update_WI_task_names.append(task)
+             node_pools_to_update_WI.append(node_pool_name)
+
+   # Deletion of nodepools should happen before attempting to create new nodepools for the case
+   # when cluster is getting updated from 'x' device_type/gke_accelerator to 'y' device_type/gke_accelerator.
+   # In that case, '{args.cluster}-np-i' nodepool will be re-created for 'y' device_type/gke_accelerator.
+   if delete_commands:
+     will_delete = True
+     if node_pools_to_delete and not args.force:
+       will_delete = get_user_input(
+           f'Planning to delete {len(node_pools_to_delete)} node pools including'
+           f' {node_pools_to_delete}. \nDo you wish to delete: y (yes) / n'
+           ' (no):\n'
+       )
+     if not will_delete:
+       xpk_print(
+           'You have requested to not delete the existing nodepools in the'
+           ' cluster. There will be no change to the cluster.'
+       )
+       return 1
+
+     for i, command in enumerate(delete_commands):
+       xpk_print(
+           f'To complete {delete_task_names[i]} we are executing {command}'
+       )
+     max_return_code = run_commands(
+         delete_commands,
+         'Delete Nodepools',
+         delete_task_names,
+         dry_run=args.dry_run,
+     )
+     if max_return_code != 0:
+       xpk_print(f'Delete Nodepools returned ERROR {max_return_code}')
+       return 1
+
+   # Enable Workload Identity on existing Nodepools
+   if update_WI_commands:
+     will_update_WI = True
+     if node_pools_to_update_WI and not args.force:
+       will_update_WI = get_user_input(
+           'Planning to enable Workload Identity Federation on'
+           f' {len(node_pools_to_update_WI)} existing node pools including'
+           f' {node_pools_to_update_WI}.This immediately enables Workload'
+           ' Identity Federation for GKE for any workloads running in the node'
+           ' pool. Also, xpk does not support disabling Workload Identity on'
+           ' clusters that have it enabled already \nDo you wish to update: y'
+           ' (yes) / n (no):\n'
+       )
+     if not will_update_WI:
+       for i, command in enumerate(update_WI_commands):
+         xpk_print(
+             f'To complete {update_WI_task_names[i]} we are executing {command}'
+         )
+       max_return_code = run_commands(
+           update_WI_commands,
+           'Enable Workload Identity on existing Nodepools',
+           update_WI_task_names,
+           dry_run=args.dry_run,
+       )
+       if max_return_code != 0:
+         xpk_print(
+             'Enable Workload Identity on existing Nodepools returned ERROR'
+             f' {max_return_code}'
+         )
+         return 1
+
+   # Update {args.cluster}-{_CLUSTER_RESOURCES_CONFIGMAP} ConfigMap to 'y': '0'
+   # and remove 'x' from the ConfigMap when cluster is getting updated from
+   # 'x' device_type/gke_accelerator to 'y' device_type/gke_accelerator.
+   if not node_pools_to_remain:
+     if args.enable_autoprovisioning:
+       resources_data = (
+           f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
+       )
+     else:
+       resources_data = f'{device_type}: "0"'
+     resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
+     resources_yml = CLUSTER_CONFIGMAP_YAML.format(
+         args=args, name=resources_configmap_name, data=resources_data
+     )
+     configmap_yml = {}
+     configmap_yml[resources_configmap_name] = resources_yml
+     return_code = create_or_update_cluster_configmap(configmap_yml)
+     if return_code != 0:
+       return 1
+
+   create_commands = []
+   create_task_names = []
+   for node_pool_name in desired_node_pool_names:
+     if node_pool_name in node_pools_to_remain:
+       continue
+     command = (
+         'gcloud beta container node-pools create'
+         f' {node_pool_name}'
+         f' --region={zone_to_region(args.zone)}'
+         f' --cluster={args.cluster}'
+         f' --project={args.project} --node-locations={args.zone}'
+         f' --machine-type={system.gce_machine_type}'
+         f' --host-maintenance-interval={args.host_maintenance_interval}'
+         f' {capacity_args}'
+         ' --enable-gvnic'
+         f' {args.custom_nodepool_arguments}'
+     )
+     if system.accelerator_type == AcceleratorType['TPU']:
+       command += f' --node-version={gke_node_pool_version}'
+       command += f' --num-nodes={system.vms_per_slice}'
+       command += ' --placement-type=COMPACT --max-pods-per-node 15'
+       command += (
+           f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
+       )
+       command += f' --tpu-topology={system.topology}'
+       command += f' {args.custom_tpu_nodepool_arguments}'
+     elif system.accelerator_type == AcceleratorType['GPU']:
+       subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
+       command += f' --num-nodes={args.num_nodes}'
+       command += (
+           ' --accelerator'
+           f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
+           ' --no-enable-autoupgrade '
+           f' --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL} --additional-node-network'
+           f' network={args.cluster}-net-1,subnetwork={subnet_prefix}-sub-1'
+           ' --additional-node-network'
+           f' network={args.cluster}-net-2,subnetwork={subnet_prefix}-sub-2'
+           ' --additional-node-network'
+           f' network={args.cluster}-net-3,subnetwork={subnet_prefix}-sub-3'
+           ' --additional-node-network'
+           f' network={args.cluster}-net-4,subnetwork={subnet_prefix}-sub-4'
+       )
+       if device_type == H100_MEGA_DEVICE_TYPE:
+         command += (
+             ' --additional-node-network'
+             f' network={args.cluster}-net-5,subnetwork={subnet_prefix}-sub-5'
+             ' --additional-node-network'
+             f' network={args.cluster}-net-6,subnetwork={subnet_prefix}-sub-6'
+             ' --additional-node-network'
+             f' network={args.cluster}-net-7,subnetwork={subnet_prefix}-sub-7'
+             ' --additional-node-network'
+             f' network={args.cluster}-net-8,subnetwork={subnet_prefix}-sub-8'
+             ' --max-pods-per-node=32'
+         )
+     elif system.accelerator_type == AcceleratorType['CPU']:
+       command += f' --num-nodes={system.vms_per_slice}'
+       command += (
+           f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
+       )
+
+     if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
+       command += ' --workload-metadata=GKE_METADATA'
+
+     task = f'NodepoolCreate-{node_pool_name}'
+     create_commands.append(command)
+     create_task_names.append(task)
+
+   desired_pw_cpu_node_pools = ['cpu-user-np', 'cpu-rm-np', 'cpu-proxy-np']
+   if args.enable_pathways:
+     # Pathways needs CPU nodepools in addition to TPU nodepools
+     for node_pool_name in desired_pw_cpu_node_pools:
+       if node_pool_name in existing_node_pool_names:
+         continue
+       command = (
+           'gcloud beta container node-pools create'
+           f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
+           f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
+           ' --min-nodes=1 --max-nodes=20'
+       )
+       task = f'NodepoolCreate-{node_pool_name}'
+       create_commands.append(command)
+       create_task_names.append(task)
+
+   for i, command in enumerate(create_commands):
+     xpk_print(f'To complete {create_task_names[i]} we are executing {command}')
+   max_return_code = run_commands(
+       create_commands,
+       'Create Nodepools',
+       create_task_names,
+       dry_run=args.dry_run,
+   )
+   if max_return_code != 0:
+     xpk_print(f'Create Nodepools returned ERROR {max_return_code}')
+     return 1
+
+   xpk_print('Create or delete node pool request complete.')
+   return 0
+
+
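For readers skimming the command construction above, the following standalone sketch assembles the same base-plus-TPU gcloud invocation that run_gke_node_pool_create_command builds before handing it to run_commands. Every concrete value here (cluster, project, zone, machine type, reservation, GKE version, topology) is hypothetical, and nothing is executed:

```python
# Sketch only: mirrors the string assembly in run_gke_node_pool_create_command
# for a TPU node pool; the --custom-nodepool-arguments passthrough is omitted.
cluster = 'demo-cluster'
project = 'my-project'
zone = 'us-central2-b'
region = zone.rsplit('-', 1)[0]  # roughly what zone_to_region() produces
capacity_args = '--reservation=my-reservation'  # from get_capacity_arguments_from_capacity_type

command = (
    'gcloud beta container node-pools create'
    f' {cluster}-np-0'
    f' --region={region}'
    f' --cluster={cluster}'
    f' --project={project} --node-locations={zone}'
    ' --machine-type=ct5p-hightpu-4t'
    ' --host-maintenance-interval=PERIODIC'
    f' {capacity_args}'
    ' --enable-gvnic'
)
# Flags appended by the AcceleratorType['TPU'] branch:
command += ' --node-version=1.29.5-gke.1091002'
command += ' --num-nodes=4'
command += ' --placement-type=COMPACT --max-pods-per-node 15'
command += (
    ' --scopes=storage-full,gke-default,'
    '"https://www.googleapis.com/auth/cloud-platform"'
)
command += ' --tpu-topology=2x2x4'

print(command)
```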
+ def get_node_pools_to_delete(
+     args, system, existing_node_pool_names, desired_node_pool_names
+ ) -> list:
+   """Get list of nodepools to delete from the cluster.
+
+   Args:
+     args: user provided arguments for running the command.
+     system: system characteristics.
+     existing_node_pool_names: names of nodepools that already exist in the cluster.
+     desired_node_pool_names: names of nodepools that should exist in the cluster.
+
+   Returns:
+     List of nodepool names to delete.
+   """
+   node_pools_to_delete = []
+   check_resource, is_requested_resource_in_cluster = check_cluster_resources(
+       args, system
+   )
+   for existing_node_pool_name in existing_node_pool_names:
+     # Deletion logic would leave behind any Pathways CPU nodepools.
+     if existing_node_pool_name.find(f'{args.cluster}-np-') != 0:
+       continue
+
+     # Nodepools will be deleted in two scenarios:
+     # Scenario 1: Cluster exists with 3 nodepools of 'x' device_type/gke_accelerator and now we are updating
+     # the cluster to 2 nodepools of 'x' device_type/gke_accelerator. In this case, we will delete
+     # '{args.cluster}-np-2' from the cluster.
+     # Scenario 2: Cluster exists with 2 nodepools of 'x' device_type/gke_accelerator and now we are updating
+     # the cluster to 2 nodepools of 'y' device_type/gke_accelerator. In this case, we will delete
+     # '{args.cluster}-np-0' and '{args.cluster}-np-1' from the cluster.
+     if existing_node_pool_name not in desired_node_pool_names or (
+         check_resource and not is_requested_resource_in_cluster
+     ):
+       node_pools_to_delete.append(existing_node_pool_name)
+
+   return node_pools_to_delete
+
+
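The two scenarios described in the comments above are easy to check with a standalone sketch of the same name-reconciliation rule. The check_cluster_resources() ConfigMap lookup is reduced to a boolean flag here, and every name is made up:

```python
# Standalone sketch of the rule in get_node_pools_to_delete: drop xpk-managed
# pools that are no longer desired, or all of them when the requested resource
# is no longer the one recorded for the cluster.
def pools_to_delete(cluster, existing, desired, resource_still_in_cluster=True):
  to_delete = []
  for name in existing:
    if not name.startswith(f'{cluster}-np-'):  # Pathways CPU pools are skipped
      continue
    if name not in desired or not resource_still_in_cluster:
      to_delete.append(name)
  return to_delete


# Scenario 1: shrink from 3 slices to 2 of the same device type.
print(pools_to_delete(
    'demo', ['demo-np-0', 'demo-np-1', 'demo-np-2'], ['demo-np-0', 'demo-np-1']
))  # -> ['demo-np-2']

# Scenario 2: same slice count, but the requested device type changed, so every
# xpk-managed pool is recreated.
print(pools_to_delete(
    'demo', ['demo-np-0', 'demo-np-1'], ['demo-np-0', 'demo-np-1'],
    resource_still_in_cluster=False,
))  # -> ['demo-np-0', 'demo-np-1']
```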
+ def get_all_nodepools_programmatic(args) -> tuple[list[str], int]:
+   """Gets all the nodepools associated with the cluster / project / region.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     List of nodepools and 0 if successful and 1 otherwise.
+   """
+   command = (
+       'gcloud beta container node-pools list'
+       ' --cluster'
+       f' {args.cluster} --project={args.project} --region={zone_to_region(args.zone)}'
+       ' --format="csv[no-heading](name)"'
+   )
+   return_code, raw_nodepool_output = run_command_for_value(
+       command, 'Get All Node Pools', args
+   )
+   if return_code != 0:
+     xpk_print(f'Get All Node Pools returned ERROR {return_code}')
+     return [], 1
+
+   return raw_nodepool_output.splitlines(), 0
+
+
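The helpers in this module consistently return a (value, return_code) pair rather than raising. A minimal sketch of consuming get_all_nodepools_programmatic follows; the argparse.Namespace carries only the fields this module reads, with made-up values (the real xpk CLI namespace has many more, and it is passed straight through to run_command_for_value as the global args object):

```python
# Sketch only: field names come from the functions above, values are hypothetical.
from argparse import Namespace

from xpk.core.nodepool import get_all_nodepools_programmatic  # path inferred from this diff

args = Namespace(
    cluster='demo-cluster',
    project='my-project',
    zone='us-central2-b',
    dry_run=True,  # assumption: the shared command runner short-circuits on this
)

node_pools, return_code = get_all_nodepools_programmatic(args)
if return_code != 0:
  raise SystemExit('Listing node pools failed')  # the helper already printed why
print(node_pools)  # e.g. ['demo-cluster-np-0', 'demo-cluster-np-1']
```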
+ def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]:
+   """Return zone in which nodepool exists in the cluster.
+
+   Args:
+     args: user provided arguments for running the command.
+     nodepool_name: name of nodepool.
+
+   Returns:
+     Tuple of int, str where
+     int is the return code - 0 if successful, 1 otherwise.
+     str is the zone of nodepool.
+   """
+   command = (
+       f'gcloud beta container node-pools describe {nodepool_name}'
+       f' --cluster {args.cluster} --project={args.project}'
+       f' --region={zone_to_region(args.zone)} --format="value(locations)"'
+   )
+   return_code, nodepool_zone = run_command_for_value(
+       command, 'Get Node Pool Zone', args
+   )
+   if return_code != 0:
+     xpk_print(f'Get Node Pool Zone returned ERROR {return_code}')
+     return 1, None
+
+   return 0, nodepool_zone.strip()
+
+
+ def get_gke_node_pool_version(
+     args, gke_server_config: GkeServerConfig
+ ) -> tuple[int, str | None]:
+   """Determine the gke node pool version for the node pool.
+
+   Args:
+     args: user provided arguments for running the command.
+     gke_server_config: holds valid gke versions and recommended default version.
+
+   Returns:
+     Tuple of
+     int: 0 if successful and 1 otherwise.
+     str: gke control plane version to use.
+   """
+
+   # By default use the current gke master version for creating node pools.
+   command_description = 'Determine current gke master version'
+   command = (
+       f'gcloud beta container clusters describe {args.cluster}'
+       f' --region {zone_to_region(args.zone)} --project {args.project}'
+       ' --format="value(currentMasterVersion)"'
+   )
+
+   return_code, current_gke_master_version = run_command_for_value(
+       command, command_description, args
+   )
+   if return_code != 0:
+     xpk_print(
+         f'Unable to get server config for command: {command_description}.'
+     )
+     return return_code, None
+
+   # Override with user provide gke version if specified.
+   if args.gke_version is not None:
+     node_pool_gke_version = args.gke_version
+   else:
+     master_gke_version = current_gke_master_version.strip()
+     node_pool_gke_version = ''
+     # Select minimum version which is >= master_gke_version and has the same minor version.
+     # If this does not exist select maximum version which is < master_gke_version.
+     for version in gke_server_config.valid_versions:
+       if (
+           (node_pool_gke_version == '' or node_pool_gke_version < version)
+           and version < master_gke_version
+       ) or (
+           (node_pool_gke_version == '' or node_pool_gke_version > version)
+           and master_gke_version <= version
+           and master_gke_version.split('.')[:2] == version.split('.')[:2]
+       ):
+         node_pool_gke_version = version
+
+   is_supported_node_pool_version = (
+       node_pool_gke_version in gke_server_config.valid_versions
+   )
+   # In rare cases, user's provided gke version may be invalid, but gke will return an error if so.
+   # An example scenario is if the user provided gke version is greater than the master version.
+   if not is_supported_node_pool_version:
+     xpk_print(
+         f'Planned node pool version {node_pool_gke_version} is not supported in'
+         ' valid version'
+         f' {gke_server_config.valid_versions}\nPlease adjust the gke version'
+         ' using --gke-version=x or remove the arg and depend on xpk default of'
+         f' {current_gke_master_version}'
+     )
+     return 1, None
+   return 0, node_pool_gke_version
+
+
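The selection loop in get_gke_node_pool_version is easiest to see with concrete data. The sketch below restates the same loop outside the function, keeping its plain string comparisons; the valid-version list and master versions are hypothetical and written newest-first:

```python
# Restates the version-selection loop from get_gke_node_pool_version above.
def pick_node_pool_version(valid_versions, master_gke_version):
  node_pool_gke_version = ''
  for version in valid_versions:
    if (
        (node_pool_gke_version == '' or node_pool_gke_version < version)
        and version < master_gke_version
    ) or (
        (node_pool_gke_version == '' or node_pool_gke_version > version)
        and master_gke_version <= version
        and master_gke_version.split('.')[:2] == version.split('.')[:2]
    ):
      node_pool_gke_version = version
  return node_pool_gke_version


valid = ['1.30.1-gke.50', '1.29.5-gke.200', '1.29.1-gke.100']
# A valid build in the master's minor release that is >= the master wins:
print(pick_node_pool_version(valid, '1.29.4-gke.100'))  # 1.29.5-gke.200
# No valid 1.31.x build exists, so the largest version below the master wins:
print(pick_node_pool_version(valid, '1.31.0-gke.10'))   # 1.30.1-gke.50
```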
+ def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
+   """Upgrade nodepools in the cluster to default rapid gke version. Recreates the nodes.
+
+   Args:
+     args: user provided arguments for running the command.
+     default_rapid_gke_version: Rapid default version for the upgrade.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
+   if return_code != 0:
+     xpk_print('Listing all node pools failed!')
+     return return_code
+
+   # Batch execution to upgrade node pools simultaneously
+   commands = []
+   task_names = []
+   for node_pool_name in existing_node_pool_names:
+     commands.append(
+         'gcloud container clusters upgrade'
+         f' {args.cluster} --project={args.project}'
+         f' --region={zone_to_region(args.zone)}'
+         f' --cluster-version={default_rapid_gke_version}'
+         f' --node-pool={node_pool_name}'
+         ' --quiet'
+     )
+     task_names.append(f'Upgrading node pool {node_pool_name}.')
+
+   for i, command in enumerate(commands):
+     xpk_print(f'To complete {task_names[i]} we are executing {command}')
+   max_return_code = run_commands(
+       commands, 'Update GKE node pools to default RAPID GKE version', task_names
+   )
+   if max_return_code != 0:
+     xpk_print(
+         'GKE node pools update to default RAPID GKE version returned ERROR:'
+         f' {max_return_code}'
+     )
+     return int(max_return_code)
+   return 0
+
+
+ def get_nodepool_workload_metadata_mode(
+     args, nodepool_name
+ ) -> tuple[int, str | None]:
+   """Return Workload Identity metadata mode of the nodepool.
+   Args:
+     args: user provided arguments for running the command.
+     nodepool_name: name of nodepool.
+   Returns:
+     Tuple of int, str where
+     int is the return code - 0 if successful, 1 otherwise.
+     str is the workload metadata mode of nodepool.
+   """
+   command = (
+       f'gcloud beta container node-pools describe {nodepool_name}'
+       f' --cluster {args.cluster} --project={args.project}'
+       f' --region={zone_to_region(args.zone)} --format="value(config.workloadMetadataConfig.mode)"'
+   )
+   return_code, nodepool_WI_mode = run_command_for_value(
+       command, 'Get Node Pool Workload Identity Metadata Mode', args
+   )
+   if return_code != 0:
+     xpk_print(
+         'Get Node Pool Workload Identity Metadata Mode returned ERROR'
+         f' {return_code}'
+     )
+     return 1, None
+
+   return 0, nodepool_WI_mode.strip()
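Finally, a short usage sketch tying the last helper back to the Workload Identity gate applied earlier in this file: a pool is only queued for a --workload-metadata update when its current mode is not GKE_METADATA. The pool name, Namespace fields, and import path are hypothetical or inferred from this diff:

```python
# Sketch: mirrors the Workload Identity gate in run_gke_node_pool_create_command.
from argparse import Namespace

from xpk.core.nodepool import get_nodepool_workload_metadata_mode

args = Namespace(
    cluster='demo-cluster', project='my-project', zone='us-central2-b', dry_run=False
)

return_code, mode = get_nodepool_workload_metadata_mode(args, 'demo-cluster-np-0')
if return_code == 0 and mode != 'GKE_METADATA':
  print('demo-cluster-np-0 still needs --workload-metadata=GKE_METADATA')
```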