xpk 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/METADATA +456 -32
  88. xpk-0.7.0.dist-info/RECORD +92 -0
  89. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/WHEEL +1 -1
  90. xpk-0.7.0.dist-info/entry_points.txt +2 -0
  91. xpk-0.5.0.dist-info/RECORD +0 -7
  92. xpk-0.5.0.dist-info/entry_points.txt +0 -2
  93. xpk.py +0 -7282
  94. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/LICENSE +0 -0
  95. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/top_level.txt +0 -0
xpk/core/cluster.py ADDED
@@ -0,0 +1,564 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from google.api_core.exceptions import PermissionDenied
18
+ from google.cloud import resourcemanager_v3
19
+ from kubernetes import client as k8s_client
20
+ from kubernetes import config
21
+ from kubernetes.client.exceptions import ApiException
22
+ from .resources import get_cluster_system_characteristics
23
+
24
+ from ..utils.console import xpk_exit, xpk_print
25
+ from .capacity import H100_DEVICE_TYPE
26
+ from .commands import (
27
+ run_command_for_value,
28
+ run_command_with_updates,
29
+ run_command_with_updates_retry,
30
+ )
31
+ from .gcloud_context import add_zone_and_project, get_gke_server_config, zone_to_region
32
+ from .nodepool import upgrade_gke_nodepools_version
33
+ from .system_characteristics import SystemCharacteristics
34
+
35
# Release tag of the JobSet manifests applied by set_jobset_on_cluster().
JOBSET_VERSION = 'v0.7.2'

# NCCL plugin installer manifests applied by install_nccl_on_cluster().
INSTALLER_NCC_TCPX = (
    'https://raw.githubusercontent.com/GoogleCloudPlatform/'
    'container-engine-accelerators/master/gpudirect-tcpx/'
    'nccl-tcpx-installer.yaml'
)
INSTALLER_NCC_TCPXO = (
    'https://raw.githubusercontent.com/GoogleCloudPlatform/'
    'container-engine-accelerators/master/gpudirect-tcpxo/'
    'nccl-tcpxo-installer.yaml'
)

# Namespace and name used when creating the xpk Kubernetes service account.
DEFAULT_NAMESPACE = 'default'
XPK_SA = 'xpk-sa'
41
+
42
+
43
+ # TODO(vbarr): Remove this function when jobsets gets enabled by default on
44
+ # GKE clusters.
45
def set_jobset_on_cluster(args) -> int:
  """Apply the JobSet release manifests to the cluster (server-side apply).

  Args:
    args: user provided arguments for running the command.

  Returns:
    The return code reported by the kubectl command runner; 0 means success.
  """
  manifests_url = (
      'https://github.com/kubernetes-sigs/jobset/releases/download/'
      f'{JOBSET_VERSION}/manifests.yaml'
  )
  task = f'Install Jobset on {args.cluster}'
  return_code = run_command_with_updates_retry(
      f'kubectl apply --server-side -f {manifests_url}', task, args
  )
  if return_code == 0:
    return return_code

  # The command failed: surface the code plus the most common root cause.
  xpk_print(f'{task} returned with ERROR {return_code}.\n')
  permissions_hint = (
      "This LIKELY means you're missing Kubernetes Permissions, you can"
      ' validate this by checking if the error references permission problems'
      ' such as `requires one of ["container.*"] permission(s)`. Follow our'
      ' readme:'
      ' https://github.com/google/xpk/blob/main/README.md#troubleshooting for'
      ' instructions on how to fix these permissions.'
  )
  xpk_print(permissions_hint)
  return return_code
72
+
73
+
74
def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
  """Apply the NCCL plugin installer manifest that matches the device type.

  The TCPX installer is used when the device type is H100_DEVICE_TYPE; every
  other device type gets the TCPXO installer.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    0 if successful and 1 otherwise.
  """
  installer_url = (
      INSTALLER_NCC_TCPX
      if system.device_type == H100_DEVICE_TYPE
      else INSTALLER_NCC_TCPXO
  )
  task = 'Install NCCL Plugin On Cluster'
  return_code = run_command_with_updates(
      f'kubectl apply -f {installer_url}', task, args
  )
  if return_code == 0:
    return 0

  xpk_print(f'{task} request returned ERROR {return_code}')
  return 1
100
+
101
+
102
def get_cluster_network(args) -> str:
  """Look up the VPC network of the cluster with `gcloud ... describe`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The network value printed by gcloud, with surrounding whitespace removed.
    If the command fails, xpk_exit is called with its error code.
  """
  xpk_print("Getting cluster's VPC network...")
  location = zone_to_region(args.zone)
  describe_cmd = (
      f'gcloud container clusters describe {args.cluster}'
      f' --zone={location} --project={args.project}'
      ' --format="value(network)"'
  )
  err_code, network = run_command_for_value(
      command=describe_cmd,
      task='Get network cluster is in',
      global_args=args,
  )
  if err_code != 0:
    xpk_exit(err_code)
  return network.strip()
116
+
117
+
118
def update_cluster_with_gcpfilestore_driver_if_necessary(args) -> int:
  """Enable the GCPFilestore CSI driver on the cluster unless already enabled.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if the driver was already enabled or the update succeeded, otherwise the
    error code of the failed update.
  """
  already_enabled = is_driver_enabled_on_cluster(
      args, driver='gcpFilestoreCsiDriver'
  )
  if already_enabled:
    return 0

  update_code = update_gke_cluster_with_addon(args, 'GcpFilestoreCsiDriver')
  if update_code <= 0:
    return 0

  xpk_print('Updating GKE cluster to enable GCPFilestore CSI driver failed!')
  return update_code
136
+
137
+
138
def is_driver_enabled_on_cluster(args, driver: str) -> bool:
  """Checks if the given CSI driver addon is enabled on the cluster.

  Reads `addonsConfig.<driver>Config.enabled` from `gcloud container clusters
  describe`. If the describe command fails, xpk_exit is called with its
  return code.

  Args:
    args: user provided arguments for running the command.
    driver (str) : name of the driver as it appears under `addonsConfig`,
      without the `Config` suffix (e.g. 'gcpFilestoreCsiDriver').

  Returns:
    True if driver is enabled on the cluster and False otherwise.
  """
  command = (
      f'gcloud container clusters describe {args.cluster}'
      f' --project={args.project} --region={zone_to_region(args.zone)}'
      f' --format="value(addonsConfig.{driver}Config.enabled)"'
  )
  return_code, driver_enabled = run_command_for_value(
      command,
      f'Checks if {driver} driver is enabled in cluster describe.',
      args,
  )
  if return_code != 0:
    xpk_exit(return_code)
  # The captured command output may carry a trailing newline (the same helper's
  # output is stripped in get_cluster_network), so normalise before comparing;
  # otherwise an enabled driver would be reported as disabled.
  if driver_enabled.strip().lower() == 'true':
    xpk_print(f'{driver} driver is enabled on the cluster, no update needed.')
    return True
  return False
162
+
163
+
164
def update_gke_cluster_with_addon(args, addon: str) -> int:
  """Enable one addon on an existing GKE cluster via `gcloud ... update`.

  Args:
    args: user provided arguments for running the command.
    addon: addon name passed to `--update-addons` (it is set to ENABLED).

  Returns:
    0 if successful and 1 otherwise.
  """
  region = zone_to_region(args.zone)
  update_cmd = ' '.join([
      'gcloud container clusters update',
      f'{args.cluster} --project={args.project}',
      f'--region={region}',
      f'--update-addons {addon}=ENABLED',
      '--quiet',
  ])
  xpk_print(f'Updating GKE cluster to enable {addon}, may take a while!')
  return_code = run_command_with_updates(
      update_cmd, f'GKE Cluster Update to enable {addon}', args
  )
  if return_code == 0:
    return 0

  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
  return 1
186
+
187
+
188
def get_all_clusters_programmatic(args) -> tuple[list[str], int]:
  """List the names of all clusters in the project / region.

  Args:
    args: user provided arguments for running the command.

  Returns:
    A (cluster names, code) tuple. On success the names are the lines of the
    gcloud output and the code is 0; on failure the list is empty and the code
    is the return code of the list command.
  """
  region = zone_to_region(args.zone)
  list_cmd = ' '.join([
      'gcloud container clusters list',
      f'--project={args.project} --region={region}',
      '--format="csv[no-heading](name)"',
  ])
  return_code, raw_cluster_output = run_command_for_value(
      list_cmd, 'Find if Cluster Exists', args
  )
  if return_code == 0:
    return raw_cluster_output.splitlines(), 0

  xpk_print(f'Find if Cluster Exists returned ERROR {return_code}')
  return [], return_code
210
+
211
+
212
def project_id_to_project_number(project_id: str) -> str:
  """Translate a project id into its project number via Resource Manager.

  Args:
    project_id: id of the project to look up.

  Returns:
    The part of the returned project resource name after the first '/'.
    On PermissionDenied an error is printed and xpk_exit(1) is called.
  """
  projects_client = resourcemanager_v3.ProjectsClient()
  lookup = resourcemanager_v3.GetProjectRequest()
  lookup.name = f'projects/{project_id}'
  try:
    project = projects_client.get_project(request=lookup)
  except PermissionDenied as e:
    xpk_print(
        f"Couldn't translate project id: {project_id} to project number."
        f' Error: {e}'
    )
    xpk_exit(1)
  # The resource name is split once on '/'; the number is the second part.
  project_number = project.name.split('/', 1)[1]
  xpk_print(f'Project number for project: {project_id} is {project_number}')
  return str(project_number)
227
+
228
+
229
def setup_k8s_env(args) -> k8s_client.ApiClient:
  """Prepare kubectl / Kubernetes client access for the target cluster.

  Unless `args.kind_cluster` is set and truthy, this fills in zone and
  project, fetches cluster credentials and stores the project number on
  `args.project_number`. The local kube config is then loaded in every case.

  Args:
    args: user provided arguments for running the command.

  Returns:
    A new Kubernetes ApiClient.
  """
  is_kind_cluster = getattr(args, 'kind_cluster', False)
  if not is_kind_cluster:
    add_zone_and_project(args)
    get_cluster_credentials(args)
    args.project_number = project_id_to_project_number(args.project)

  config.load_kube_config()
  return k8s_client.ApiClient()  # pytype: disable=bad-return-type
237
+
238
+
239
def get_gpu_type_from_cluster(args) -> str:
  """Return the device type of the cluster's system characteristics.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The `device_type` of the detected system characteristics, or an empty
    string when no system characteristics were found.
  """
  system = get_cluster_system_characteristics(args)
  return '' if system is None else system.device_type
244
+
245
+
246
def create_xpk_k8s_service_account() -> None:
  """Create the xpk service account in the default namespace (best effort).

  An API error never raises out of this function: a 409 (Conflict) status is
  reported as "already exists", and any other API error is reported with its
  status and reason so that it is not mistaken for an existing account.
  """
  k8s_core_client = k8s_client.CoreV1Api()
  sa = k8s_client.V1ServiceAccount(
      metadata=k8s_client.V1ObjectMeta(name=XPK_SA)
  )

  xpk_print(f'Creating a new service account: {XPK_SA}')
  try:
    k8s_core_client.create_namespaced_service_account(
        DEFAULT_NAMESPACE, sa, pretty=True
    )
  except ApiException as e:
    if e.status == 409:
      xpk_print(
          f'Service account: {XPK_SA} already exists. Skipping its creation'
      )
    else:
      # Keep the best-effort behaviour, but report what actually went wrong.
      xpk_print(
          f'Creating service account: {XPK_SA} failed with status'
          f' {e.status}: {e.reason}. Skipping its creation'
      )
    return
  # Print the account name rather than the whole V1ServiceAccount object.
  xpk_print(f'Created a new service account: {XPK_SA} successfully')
262
+
263
+
264
def update_gke_cluster_with_clouddns(args) -> int:
  """Switch an existing GKE cluster to Cloud DNS with VPC scope.

  The DNS domain is set to `<cluster name>-domain`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  region = zone_to_region(args.zone)
  update_cmd = ' '.join([
      'gcloud container clusters update',
      f'{args.cluster} --project={args.project}',
      f'--region={region}',
      '--cluster-dns=clouddns',
      '--cluster-dns-scope=vpc',
      f'--cluster-dns-domain={args.cluster}-domain',
      '--quiet',
  ])
  xpk_print('Updating GKE cluster to use Cloud DNS, may take a while!')
  return_code = run_command_with_updates(
      update_cmd, 'GKE Cluster Update to enable Cloud DNS', args
  )
  if return_code == 0:
    return 0

  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
  return 1
290
+
291
+
292
def update_gke_cluster_with_workload_identity_enabled(args) -> int:
  """Enable Workload Identity Federation on an existing GKE cluster.

  The workload pool is set to `<project>.svc.id.goog`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  region = zone_to_region(args.zone)
  update_cmd = ' '.join([
      'gcloud container clusters update',
      f'{args.cluster} --project={args.project}',
      f'--region={region}',
      f'--workload-pool={args.project}.svc.id.goog',
      '--quiet',
  ])
  xpk_print(
      'Updating GKE cluster to enable Workload Identity Federation, may take a'
      ' while!'
  )
  return_code = run_command_with_updates(
      update_cmd,
      'GKE Cluster Update to enable Workload Identity Federation',
      args,
  )
  if return_code == 0:
    return 0

  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
  return 1
317
+
318
+
319
def update_gke_cluster_with_gcsfuse_driver_enabled(args) -> int:
  """Enable the GCSFuse CSI driver addon on an existing GKE cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  region = zone_to_region(args.zone)
  update_cmd = ' '.join([
      'gcloud container clusters update',
      f'{args.cluster} --project={args.project}',
      f'--region={region}',
      '--update-addons GcsFuseCsiDriver=ENABLED',
      '--quiet',
  ])
  xpk_print(
      'Updating GKE cluster to enable GCSFuse CSI driver, may take a while!'
  )
  return_code = run_command_with_updates(
      update_cmd, 'GKE Cluster Update to enable GCSFuse CSI driver', args
  )
  if return_code == 0:
    return 0

  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
  return 1
343
+
344
+
345
def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int:
  """Upgrade the cluster's control plane (`--master`) to the given version.

  Args:
    args: user provided arguments for running the command.
    default_rapid_gke_version: Rapid default version for the upgrade.

  Returns:
    0 if successful and 1 otherwise.
  """
  region = zone_to_region(args.zone)
  upgrade_cmd = ' '.join([
      'gcloud container clusters upgrade',
      f'{args.cluster} --project={args.project}',
      f'--region={region}',
      f'--cluster-version={default_rapid_gke_version}',
      '--master',
      '--quiet',
  ])
  xpk_print("Updating GKE cluster's control plane version, may take a while!")
  return_code = run_command_with_updates(
      upgrade_cmd,
      'GKE Cluster control plane version update to enable Cloud DNS',
      args,
  )
  if return_code == 0:
    return 0

  xpk_print(
      "GKE cluster's control plane version update request returned"
      f' ERROR {return_code}'
  )
  return 1
376
+
377
+
378
def is_cluster_using_clouddns(args) -> bool:
  """Checks if the cluster description contains `clusterDns: CLOUD_DNS`.

  The describe output is piped through grep with stderr discarded, so any
  non-zero return code of the pipeline is treated as "not using Cloud DNS".

  Args:
    args: user provided arguments for running the command.

  Returns:
    True if cluster is using CloudDNS and False otherwise.
  """
  region = zone_to_region(args.zone)
  describe_cmd = (
      f'gcloud container clusters describe {args.cluster}'
      f' --project={args.project} --region={region}'
  )
  grep_cmd = f'{describe_cmd} 2> /dev/null | grep "clusterDns: CLOUD_DNS"'
  return_code, _ = run_command_for_value(
      grep_cmd,
      'Check if Cloud DNS is enabled in cluster describe.',
      args,
  )
  if return_code != 0:
    return False

  xpk_print('Cloud DNS is enabled on the cluster, no update needed.')
  return True
400
+
401
+
402
def is_workload_identity_enabled_on_cluster(args) -> bool:
  """Checks if Workload Identity Federation is enabled on the cluster.

  Reads `workloadIdentityConfig.workloadPool` from `gcloud container clusters
  describe` and compares it with `<project>.svc.id.goog`. If the describe
  command fails, xpk_exit is called with its return code.

  Args:
    args: user provided arguments for running the command.

  Returns:
    True if Workload Identity Federation is enabled on the cluster and False
    otherwise.
  """
  command = (
      f'gcloud container clusters describe {args.cluster}'
      f' --project={args.project} --region={zone_to_region(args.zone)}'
      ' --format="value(workloadIdentityConfig.workloadPool)"'
  )
  return_code, workload_pool = run_command_for_value(
      command,
      'Checks if Workload Identity Federation is enabled in cluster describe.',
      args,
  )
  if return_code != 0:
    xpk_exit(return_code)
  # The captured command output may carry a trailing newline (the same helper's
  # output is stripped in get_cluster_network), so strip before the exact
  # comparison; otherwise an enabled cluster would be reported as disabled.
  if workload_pool.strip() == f'{args.project}.svc.id.goog':
    xpk_print(
        'Workload Identity Federation is enabled on the cluster, no update'
        ' needed.'
    )
    return True
  return False
428
+
429
+
430
def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
  """Checks if GCSFuse CSI driver is enabled on the cluster.

  Reads `addonsConfig.gcsFuseCsiDriverConfig.enabled` from `gcloud container
  clusters describe`. If the describe command fails, xpk_exit is called with
  its return code.

  Args:
    args: user provided arguments for running the command.

  Returns:
    True if GCSFuse CSI driver is enabled on the cluster and False otherwise.
  """
  command = (
      f'gcloud container clusters describe {args.cluster}'
      f' --project={args.project} --region={zone_to_region(args.zone)}'
      ' --format="value(addonsConfig.gcsFuseCsiDriverConfig.enabled)"'
  )
  return_code, gcsfuse_driver_enabled = run_command_for_value(
      command,
      'Checks if GCSFuse CSI driver is enabled in cluster describe.',
      args,
  )
  if return_code != 0:
    xpk_exit(return_code)
  # The captured command output may carry a trailing newline (the same helper's
  # output is stripped in get_cluster_network), so normalise before comparing;
  # otherwise an enabled driver would be reported as disabled.
  if gcsfuse_driver_enabled.strip().lower() == 'true':
    xpk_print('GCSFuse CSI driver is enabled on the cluster, no update needed.')
    return True
  return False
453
+
454
+
455
def update_cluster_with_clouddns_if_necessary(args) -> int:
  """Move an existing GKE cluster to Cloud DNS when it is not using it yet.

  For a cluster that exists and does not already use Cloud DNS, the steps are:
  enable Cloud DNS, upgrade the control plane to the default rapid GKE
  version, then upgrade the node pools to that same version.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful (or nothing had to be done) and error code otherwise.
  """
  all_clusters, list_code = get_all_clusters_programmatic(args)
  if list_code > 0:
    xpk_print('Listing all clusters failed!')
    return 1

  # Nothing to update when the cluster does not exist.
  if args.cluster not in all_clusters:
    return 0

  # If cluster is already using clouddns, no update necessary!
  if is_cluster_using_clouddns(args):
    return 0

  dns_update_code = update_gke_cluster_with_clouddns(args)
  if dns_update_code > 0:
    xpk_print('Updating GKE cluster to use CloudDNS failed!')
    return dns_update_code

  # Find default rapid control plane version and update the control plane to
  # the same.
  server_config_code, gke_server_config = get_gke_server_config(args)
  if server_config_code != 0:
    xpk_exit(server_config_code)
  control_plane_code = upgrade_gke_control_plane_version(
      args,
      gke_server_config.default_rapid_gke_version,  # pytype: disable=attribute-error
  )
  if control_plane_code > 0:
    xpk_print("Updating GKE cluster's control plane upgrade failed!")
    return control_plane_code

  # Upgrade nodepools version after the master upgrade.
  nodepools_code = upgrade_gke_nodepools_version(
      args,
      gke_server_config.default_rapid_gke_version,  # pytype: disable=attribute-error
  )
  if nodepools_code > 0:
    xpk_print('Upgrading nodepools version failed!')
    return nodepools_code
  return 0
498
+
499
+
500
def update_cluster_with_workload_identity_if_necessary(args) -> int:
  """Enable Workload Identity Federation on the cluster unless already enabled.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if it was already enabled or the update succeeded, otherwise the error
    code of the failed update.
  """
  if is_workload_identity_enabled_on_cluster(args):
    return 0

  update_code = update_gke_cluster_with_workload_identity_enabled(args)
  if update_code <= 0:
    return 0

  xpk_print(
      'Updating GKE cluster to enable Workload Identity Federation failed!'
  )
  return update_code
520
+
521
+
522
def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int:
  """Enable the GCSFuse CSI driver on the cluster unless already enabled.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if the driver was already enabled or the update succeeded, otherwise the
    error code of the failed update.
  """
  if is_gcsfuse_driver_enabled_on_cluster(args):
    return 0

  update_code = update_gke_cluster_with_gcsfuse_driver_enabled(args)
  if update_code <= 0:
    return 0

  xpk_print('Updating GKE cluster to enable GCSFuse CSI driver failed!')
  return update_code
540
+
541
+
542
def get_cluster_credentials(args) -> None:
  """Fetch cluster credentials and point the current kubectl context at them.

  Runs `gcloud container clusters get-credentials`, then prints the kubectl
  config and sets the namespace of the current context to `default`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. If the command fails, the error is printed and xpk_exit is called
    with its return code.
  """
  region = zone_to_region(args.zone)
  fetch_credentials = (
      f'gcloud container clusters get-credentials {args.cluster}'
      f' --region={region} --project={args.project}'
  )
  command = ' && '.join([
      fetch_credentials,
      'kubectl config view',
      'kubectl config set-context --current --namespace=default',
  ])
  task = f'get-credentials to cluster {args.cluster}'
  return_code = run_command_with_updates_retry(
      command, task, args, verbose=False
  )
  if return_code == 0:
    return

  xpk_print(f'{task} returned ERROR {return_code}')
  xpk_exit(return_code)