xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. xpk/commands/batch.py +17 -10
  2. xpk/commands/cluster.py +137 -123
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +13 -27
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +22 -11
  8. xpk/commands/job.py +53 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +26 -2
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +58 -30
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +96 -195
  20. xpk/core/cluster_private.py +9 -12
  21. xpk/core/commands.py +21 -25
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +17 -9
  24. xpk/core/docker_resources.py +9 -4
  25. xpk/core/gcloud_context.py +26 -2
  26. xpk/core/gcloud_context_test.py +96 -0
  27. xpk/core/gcluster_manager.py +0 -3
  28. xpk/core/jobset.py +5 -8
  29. xpk/core/kjob.py +19 -29
  30. xpk/core/kueue_manager.py +383 -0
  31. xpk/core/kueue_manager_test.py +542 -0
  32. xpk/core/monitoring.py +1 -1
  33. xpk/core/nap.py +11 -16
  34. xpk/core/network.py +18 -19
  35. xpk/core/nodepool.py +65 -71
  36. xpk/core/nodepool_test.py +198 -1
  37. xpk/core/pathways.py +9 -5
  38. xpk/core/ray.py +11 -15
  39. xpk/core/resources.py +15 -10
  40. xpk/core/scheduling.py +23 -1
  41. xpk/core/scheduling_test.py +31 -0
  42. xpk/core/system_characteristics.py +335 -229
  43. xpk/core/vertex.py +1 -1
  44. xpk/core/workload.py +7 -8
  45. xpk/main.py +3 -2
  46. xpk/parser/cluster.py +50 -0
  47. xpk/parser/cluster_test.py +66 -0
  48. xpk/parser/common.py +11 -0
  49. xpk/parser/workload.py +62 -25
  50. xpk/parser/workload_test.py +82 -0
  51. xpk/utils/execution_context.py +28 -0
  52. xpk/utils/feature_flags.py +28 -0
  53. xpk/utils/file.py +25 -10
  54. xpk/utils/kueue.py +20 -0
  55. xpk/utils/network.py +4 -0
  56. xpk/utils/templates.py +2 -0
  57. xpk/utils/topology.py +37 -0
  58. xpk/utils/topology_test.py +43 -0
  59. xpk/utils/validation.py +79 -55
  60. xpk/utils/validation_test.py +37 -0
  61. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  62. xpk-0.14.0.dist-info/RECORD +112 -0
  63. xpk/core/kueue.py +0 -545
  64. xpk-0.12.0.dist-info/RECORD +0 -100
  65. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/cluster.py CHANGED
@@ -30,15 +30,14 @@ from .commands import (
30
30
  )
31
31
  from .gcloud_context import (
32
32
  add_zone_and_project,
33
- get_gke_server_config,
33
+ get_cluster_location,
34
34
  zone_to_region,
35
35
  )
36
- from .nodepool import upgrade_gke_nodepools_version
37
36
  from .resources import get_cluster_system_characteristics
38
37
  from .system_characteristics import SystemCharacteristics
39
38
 
40
39
  JOBSET_VERSION = 'v0.8.0'
41
- PATHWAYS_JOB_VERSION = 'v0.1.2'
40
+ PATHWAYS_JOB_VERSION = 'v0.1.3'
42
41
  INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
43
42
  INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
44
43
  INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
@@ -66,7 +65,7 @@ def set_jobset_on_cluster(args) -> int:
66
65
  f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
67
66
  )
68
67
  task = f'Install Jobset on {args.cluster}'
69
- return_code = run_command_with_updates_retry(command, task, args)
68
+ return_code = run_command_with_updates_retry(command, task)
70
69
 
71
70
  if return_code != 0:
72
71
  xpk_print(f'{task} returned with ERROR {return_code}.\n')
@@ -95,7 +94,7 @@ def set_pathways_job_on_cluster(args) -> int:
95
94
  f' https://github.com/google/pathways-job/releases/download/{PATHWAYS_JOB_VERSION}/install.yaml'
96
95
  )
97
96
  task = f'Install PathwaysJob on {args.cluster}'
98
- return_code = run_command_with_updates_retry(command, task, args)
97
+ return_code = run_command_with_updates_retry(command, task)
99
98
 
100
99
  if return_code != 0:
101
100
  xpk_print(f'{task} returned with ERROR {return_code}.\n')
@@ -110,11 +109,10 @@ def set_pathways_job_on_cluster(args) -> int:
110
109
  return return_code
111
110
 
112
111
 
113
- def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
112
+ def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
114
113
  """Install NCCL plugin on the cluster.
115
114
 
116
115
  Args:
117
- args: user provided arguments for running the command.
118
116
  system: system characteristics.
119
117
 
120
118
  Returns:
@@ -128,7 +126,7 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
128
126
  command = f'kubectl apply -f {INSTALLER_NCCL_TCPXO}'
129
127
 
130
128
  return_code = run_command_with_updates(
131
- command, 'Install NCCL Plugin On Cluster', args
129
+ command, 'Install NCCL Plugin On Cluster'
132
130
  )
133
131
 
134
132
  if return_code != 0:
@@ -141,7 +139,7 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
141
139
  command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
142
140
 
143
141
  return_code = run_command_with_updates(
144
- command, 'Install NCCL Config On Cluster', args
142
+ command, 'Install NCCL Config On Cluster'
145
143
  )
146
144
 
147
145
  if return_code != 0:
@@ -153,19 +151,14 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
153
151
  return 0
154
152
 
155
153
 
156
- def disable_mglru_on_cluster(args) -> int:
154
+ def disable_mglru_on_cluster() -> int:
157
155
  """Disable MGLRU on the cluster.
158
156
 
159
- Args:
160
- args: user provided arguments for running the command.
161
-
162
157
  Returns:
163
158
  0 if successful and 1 otherwise.
164
159
  """
165
160
  command = f'kubectl apply -f {MGLRU_DISABLE}'
166
- return_code = run_command_with_updates(
167
- command, 'Disable MGLRU On Cluster', args
168
- )
161
+ return_code = run_command_with_updates(command, 'Disable MGLRU On Cluster')
169
162
 
170
163
  if return_code != 0:
171
164
  xpk_print('Disablig MGLRU On Cluster request returned ERROR')
@@ -174,11 +167,10 @@ def disable_mglru_on_cluster(args) -> int:
174
167
  return 0
175
168
 
176
169
 
177
- def install_nri_on_cluster(args) -> int:
170
+ def install_nri_on_cluster() -> int:
178
171
  """Install NRI Device Injector on the cluster.
179
172
 
180
173
  Args:
181
- args: user provided arguments for running the command.
182
174
  system: system characteristics.
183
175
 
184
176
  Returns:
@@ -186,7 +178,7 @@ def install_nri_on_cluster(args) -> int:
186
178
  """
187
179
  command = f'kubectl apply -f {NRI_DEVICE_INJECTOR}'
188
180
  return_code = run_command_with_updates(
189
- command, 'Install NRI Device Injector On Cluster', args
181
+ command, 'Install NRI Device Injector On Cluster'
190
182
  )
191
183
 
192
184
  if return_code != 0:
@@ -199,12 +191,9 @@ def install_nri_on_cluster(args) -> int:
199
191
  return 0
200
192
 
201
193
 
202
- def get_cluster_nodes_info(args) -> list[dict]:
194
+ def get_cluster_nodes_info() -> list[dict]:
203
195
  """Get list of cluster's nodes descrition in yaml format
204
196
 
205
- Args:
206
- args: user provided arguments for running the command.
207
-
208
197
  Returns:
209
198
  List of nodes info yaml objects.
210
199
  """
@@ -213,7 +202,6 @@ def get_cluster_nodes_info(args) -> list[dict]:
213
202
  err_code, val = run_command_for_value(
214
203
  command=command,
215
204
  task='Get cluster nodes info',
216
- global_args=args,
217
205
  )
218
206
  if err_code != 0:
219
207
  xpk_exit(err_code)
@@ -221,9 +209,9 @@ def get_cluster_nodes_info(args) -> list[dict]:
221
209
  return data['items']
222
210
 
223
211
 
224
- def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
212
+ def count_nodes_on_cluster(system: SystemCharacteristics) -> int:
225
213
  """Count cluster nodes by accelerator type"""
226
- nodes_info = get_cluster_nodes_info(args)
214
+ nodes_info = get_cluster_nodes_info()
227
215
  accelerators = [
228
216
  node['metadata']['labels']['cloud.google.com/gke-accelerator']
229
217
  for node in nodes_info
@@ -243,12 +231,11 @@ def get_cluster_network(args) -> str:
243
231
  xpk_print("Getting cluster's VPC network...")
244
232
  cluster_network_cmd = (
245
233
  'gcloud container clusters describe'
246
- f' {args.cluster} --zone={zone_to_region(args.zone)} --project={args.project} --format="value(network)"'
234
+ f' {args.cluster} --location={get_cluster_location(args.project, args.cluster, args.zone)} --project={args.project} --format="value(network)"'
247
235
  )
248
236
  err_code, val = run_command_for_value(
249
237
  command=cluster_network_cmd,
250
238
  task='Get network cluster is in',
251
- global_args=args,
252
239
  )
253
240
  if err_code != 0:
254
241
  xpk_exit(err_code)
@@ -354,14 +341,13 @@ def is_driver_enabled_on_cluster(
354
341
  """
355
342
  command = (
356
343
  f'gcloud container clusters describe {args.cluster}'
357
- f' --project={args.project} --region={zone_to_region(args.zone)}'
344
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
358
345
  f' --format="value(addonsConfig.{driver}Config.{config_key})"'
359
346
  )
360
347
  return_code, driver_enabled = run_command_for_value(
361
348
  command,
362
349
  f"Checks if {driver} driver's {config_key} is enabled in cluster"
363
350
  ' describe.',
364
- args,
365
351
  )
366
352
  if return_code != 0:
367
353
  xpk_exit(return_code)
@@ -382,14 +368,12 @@ def update_gke_cluster_with_addon(args, addon: str) -> int:
382
368
  """
383
369
  command = (
384
370
  'gcloud container clusters update'
385
- f' {args.cluster} --project={args.project}'
386
- f' --region={zone_to_region(args.zone)}'
387
- f' --update-addons {addon}=ENABLED'
388
- ' --quiet'
371
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --update-addons'
372
+ f' {addon}=ENABLED --quiet'
389
373
  )
390
374
  xpk_print(f'Updating GKE cluster to enable {addon}, may take a while!')
391
375
  return_code = run_command_with_updates(
392
- command, f'GKE Cluster Update to enable {addon}', args
376
+ command, f'GKE Cluster Update to enable {addon}'
393
377
  )
394
378
  if return_code != 0:
395
379
  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
@@ -408,11 +392,12 @@ def get_all_clusters_programmatic(args) -> tuple[list[str], int]:
408
392
  """
409
393
  command = (
410
394
  'gcloud container clusters list'
411
- f' --project={args.project} --region={zone_to_region(args.zone)}'
395
+ f' --project={args.project} '
396
+ f'--filter=location~"{zone_to_region(args.zone)}.*"'
412
397
  ' --format="csv[no-heading](name)"'
413
398
  )
414
399
  return_code, raw_cluster_output = run_command_for_value(
415
- command, 'Find if Cluster Exists', args
400
+ command, 'Find if Cluster Exists'
416
401
  )
417
402
  if return_code != 0:
418
403
  xpk_print(f'Find if Cluster Exists returned ERROR {return_code}')
@@ -442,7 +427,11 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
442
427
  if not getattr(args, 'kind_cluster', False):
443
428
  add_zone_and_project(args)
444
429
  get_cluster_credentials(args)
445
- args.project_number = project_id_to_project_number(args.project)
430
+ args.project_number = (
431
+ project_id_to_project_number(args.project)
432
+ if not args.dry_run
433
+ else abs(hash(args.project) % (10**12)) # 12 digit hash
434
+ )
446
435
 
447
436
  config.load_kube_config()
448
437
  return k8s_client.ApiClient()
@@ -574,34 +563,6 @@ def create_role_binding(sa: str, role_name: str) -> None:
574
563
  xpk_exit(1)
575
564
 
576
565
 
577
- def update_gke_cluster_with_clouddns(args) -> int:
578
- """Run the GKE cluster update command for existing clusters and enable CloudDNS.
579
-
580
- Args:
581
- args: user provided arguments for running the command.
582
-
583
- Returns:
584
- 0 if successful and 1 otherwise.
585
- """
586
- command = (
587
- 'gcloud container clusters update'
588
- f' {args.cluster} --project={args.project}'
589
- f' --region={zone_to_region(args.zone)}'
590
- ' --cluster-dns=clouddns'
591
- ' --cluster-dns-scope=vpc'
592
- f' --cluster-dns-domain={args.cluster}-domain'
593
- ' --quiet'
594
- )
595
- xpk_print('Updating GKE cluster to use Cloud DNS, may take a while!')
596
- return_code = run_command_with_updates(
597
- command, 'GKE Cluster Update to enable Cloud DNS', args
598
- )
599
- if return_code != 0:
600
- xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
601
- return 1
602
- return 0
603
-
604
-
605
566
  def update_gke_cluster_with_workload_identity_enabled(args) -> int:
606
567
  """Run the GKE cluster update command for existing cluster and enable Workload Identity Federation.
607
568
  Args:
@@ -611,9 +572,7 @@ def update_gke_cluster_with_workload_identity_enabled(args) -> int:
611
572
  """
612
573
  command = (
613
574
  'gcloud container clusters update'
614
- f' {args.cluster} --project={args.project}'
615
- f' --region={zone_to_region(args.zone)}'
616
- f' --workload-pool={args.project}.svc.id.goog'
575
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --workload-pool={args.project}.svc.id.goog'
617
576
  ' --quiet'
618
577
  )
619
578
  xpk_print(
@@ -621,7 +580,7 @@ def update_gke_cluster_with_workload_identity_enabled(args) -> int:
621
580
  ' while!'
622
581
  )
623
582
  return_code = run_command_with_updates(
624
- command, 'GKE Cluster Update to enable Workload Identity Federation', args
583
+ command, 'GKE Cluster Update to enable Workload Identity Federation'
625
584
  )
626
585
  if return_code != 0:
627
586
  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
@@ -638,16 +597,14 @@ def update_gke_cluster_with_gcsfuse_driver_enabled(args) -> int:
638
597
  """
639
598
  command = (
640
599
  'gcloud container clusters update'
641
- f' {args.cluster} --project={args.project}'
642
- f' --region={zone_to_region(args.zone)}'
643
- ' --update-addons GcsFuseCsiDriver=ENABLED'
644
- ' --quiet'
600
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --update-addons'
601
+ ' GcsFuseCsiDriver=ENABLED --quiet'
645
602
  )
646
603
  xpk_print(
647
604
  'Updating GKE cluster to enable GCSFuse CSI driver, may take a while!'
648
605
  )
649
606
  return_code = run_command_with_updates(
650
- command, 'GKE Cluster Update to enable GCSFuse CSI driver', args
607
+ command, 'GKE Cluster Update to enable GCSFuse CSI driver'
651
608
  )
652
609
  if return_code != 0:
653
610
  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
@@ -664,16 +621,14 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
664
621
  """
665
622
  command = (
666
623
  'gcloud container clusters update'
667
- f' {args.cluster} --project={args.project}'
668
- f' --region={zone_to_region(args.zone)}'
669
- ' --enable-legacy-lustre-port'
624
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-legacy-lustre-port'
670
625
  ' --quiet'
671
626
  )
672
627
  xpk_print(
673
628
  'Updating GKE cluster to enable Lustre CSI driver, may take a while!'
674
629
  )
675
630
  return_code = run_command_with_updates(
676
- command, 'GKE Cluster Update to enable Lustre CSI driver', args
631
+ command, 'GKE Cluster Update to enable Lustre CSI driver'
677
632
  )
678
633
  if return_code != 0:
679
634
  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
@@ -681,63 +636,6 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
681
636
  return 0
682
637
 
683
638
 
684
- def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int:
685
- """Upgrade GKE cluster's control plane version before updating nodepools to use CloudDNS.
686
-
687
- Args:
688
- args: user provided arguments for running the command.
689
- default_rapid_gke_version: Rapid default version for the upgrade.
690
-
691
- Returns:
692
- 0 if successful and 1 otherwise.
693
- """
694
- command = (
695
- 'gcloud container clusters upgrade'
696
- f' {args.cluster} --project={args.project}'
697
- f' --region={zone_to_region(args.zone)}'
698
- f' --cluster-version={default_rapid_gke_version}'
699
- ' --master'
700
- ' --quiet'
701
- )
702
- xpk_print("Updating GKE cluster's control plane version, may take a while!")
703
- return_code = run_command_with_updates(
704
- command,
705
- 'GKE Cluster control plane version update to enable Cloud DNS',
706
- args,
707
- )
708
- if return_code != 0:
709
- xpk_print(
710
- "GKE cluster's control plane version update request returned"
711
- f' ERROR {return_code}'
712
- )
713
- return 1
714
- return 0
715
-
716
-
717
- def is_cluster_using_clouddns(args) -> bool:
718
- """Checks if cluster is using CloudDNS.
719
- Args:
720
- args: user provided arguments for running the command.
721
-
722
- Returns:
723
- True if cluster is using CloudDNS and False otherwise.
724
- """
725
- command = (
726
- f'gcloud container clusters describe {args.cluster}'
727
- f' --project={args.project} --region={zone_to_region(args.zone)}'
728
- ' 2> /dev/null | grep "clusterDns: CLOUD_DNS"'
729
- )
730
- return_code, _ = run_command_for_value(
731
- command,
732
- 'Check if Cloud DNS is enabled in cluster describe.',
733
- args,
734
- )
735
- if return_code == 0:
736
- xpk_print('Cloud DNS is enabled on the cluster, no update needed.')
737
- return True
738
- return False
739
-
740
-
741
639
  def is_workload_identity_enabled_on_cluster(args) -> bool:
742
640
  """Checks if Workload Identity Federation is enabled on the cluster.
743
641
  Args:
@@ -747,13 +645,12 @@ def is_workload_identity_enabled_on_cluster(args) -> bool:
747
645
  """
748
646
  command = (
749
647
  f'gcloud container clusters describe {args.cluster}'
750
- f' --project={args.project} --region={zone_to_region(args.zone)}'
648
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
751
649
  ' --format="value(workloadIdentityConfig.workloadPool)"'
752
650
  )
753
651
  return_code, workload_pool = run_command_for_value(
754
652
  command,
755
653
  'Checks if Workload Identity Federation is enabled in cluster describe.',
756
- args,
757
654
  )
758
655
  if return_code != 0:
759
656
  xpk_exit(return_code)
@@ -775,13 +672,12 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
775
672
  """
776
673
  command = (
777
674
  f'gcloud container clusters describe {args.cluster}'
778
- f' --project={args.project} --region={zone_to_region(args.zone)}'
675
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
779
676
  ' --format="value(addonsConfig.gcsFuseCsiDriverConfig.enabled)"'
780
677
  )
781
678
  return_code, gcsfuse_driver_enabled = run_command_for_value(
782
679
  command,
783
680
  'Checks if GCSFuse CSI driver is enabled in cluster describe.',
784
- args,
785
681
  )
786
682
  if return_code != 0:
787
683
  xpk_exit(return_code)
@@ -791,53 +687,6 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
791
687
  return False
792
688
 
793
689
 
794
- def update_cluster_with_clouddns_if_necessary(args) -> int:
795
- """Updates a GKE cluster to use CloudDNS, if not enabled already.
796
-
797
- Args:
798
- args: user provided arguments for running the command.
799
-
800
- Returns:
801
- 0 if successful and error code otherwise.
802
- """
803
- all_clusters, return_code = get_all_clusters_programmatic(args)
804
- if return_code > 0:
805
- xpk_print('Listing all clusters failed!')
806
- return 1
807
- if args.cluster in all_clusters:
808
- # If cluster is already using clouddns, no update necessary!
809
- if is_cluster_using_clouddns(args):
810
- return 0
811
- cluster_update_return_code = update_gke_cluster_with_clouddns(args)
812
- if cluster_update_return_code > 0:
813
- xpk_print('Updating GKE cluster to use CloudDNS failed!')
814
- return cluster_update_return_code
815
-
816
- # Find default rapid control plane version and update the control plane to the same.
817
- server_config_return_code, gke_server_config = get_gke_server_config(args)
818
- if server_config_return_code != 0:
819
- xpk_exit(server_config_return_code)
820
- assert gke_server_config
821
-
822
- upgrade_master_return_code = upgrade_gke_control_plane_version(
823
- args,
824
- gke_server_config.default_rapid_gke_version,
825
- )
826
- if upgrade_master_return_code > 0:
827
- xpk_print("Updating GKE cluster's control plane upgrade failed!")
828
- return upgrade_master_return_code
829
-
830
- # Upgrade nodepools version after the master upgrade.
831
- node_pool_update_code = upgrade_gke_nodepools_version(
832
- args,
833
- gke_server_config.default_rapid_gke_version,
834
- )
835
- if node_pool_update_code > 0:
836
- xpk_print('Upgrading nodepools version failed!')
837
- return node_pool_update_code
838
- return 0
839
-
840
-
841
690
  def update_cluster_with_workload_identity_if_necessary(args) -> int:
842
691
  """Updates a GKE cluster to enable Workload Identity Federation, if not enabled already.
843
692
  Args:
@@ -880,26 +729,78 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int:
880
729
  return 0
881
730
 
882
731
 
883
- def get_cluster_credentials(args) -> None:
884
- """Run cluster configuration command to set the kubectl config.
732
+ def test_and_retry_credentials_with_dns_logic(args) -> int:
733
+ """Tests kubectl credentials and retries with default settings if a DNS error is found.
885
734
 
886
735
  Args:
887
736
  args: user provided arguments for running the command.
888
737
 
889
738
  Returns:
890
- 0 if successful and 1 otherwise.
739
+ 0 if credentials are valid after retrying, 1 otherwise.
891
740
  """
892
- command = (
741
+
742
+ xpk_print('Testing credentials with kubectl...')
743
+ kubectl_command = 'kubectl get pods'
744
+ kubectl_return_code, kubectl_output = run_command_for_value(
745
+ kubectl_command, 'kubectl get pods'
746
+ )
747
+ if kubectl_return_code == 0:
748
+ xpk_print('Credentials test succeeded.')
749
+ return 0
750
+
751
+ dns_endpoint_error = (
752
+ 'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic'
753
+ ' is disabled'
754
+ )
755
+ if dns_endpoint_error not in kubectl_output:
756
+ xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}')
757
+ xpk_exit(kubectl_return_code)
758
+ xpk_print(
759
+ 'Detected DNS endpoint-related error. Retrying without --dns-endpoint'
760
+ ' flag...'
761
+ )
762
+
763
+ location = get_cluster_location(args.project, args.cluster, args.zone)
764
+ without_dns_command = (
893
765
  'gcloud container clusters get-credentials'
894
- f' {args.cluster} --region={zone_to_region(args.zone)}'
766
+ f' {args.cluster} --location={location}'
895
767
  f' --project={args.project} &&'
896
768
  ' kubectl config view && kubectl config set-context --current'
897
769
  ' --namespace=default'
898
770
  )
899
- task = f'get-credentials to cluster {args.cluster}'
900
- return_code = run_command_with_updates_retry(
901
- command, task, args, verbose=False
771
+ return_code = run_command_with_updates(
772
+ without_dns_command, 'get-credentials to cluster', verbose=False
902
773
  )
774
+ if return_code != 0:
775
+ xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.')
776
+ xpk_exit(return_code)
777
+ return 0
778
+
779
+
780
+ def get_cluster_credentials(args) -> int:
781
+ """Run cluster configuration command to set the kubectl config.
782
+
783
+ Args:
784
+ args: user provided arguments for running the command.
785
+
786
+ Returns:
787
+ 0 if successful and 1 otherwise.
788
+ """
789
+ location = get_cluster_location(args.project, args.cluster, args.zone)
790
+ command = (
791
+ 'gcloud container clusters get-credentials'
792
+ f' {args.cluster} --location={location} --dns-endpoint'
793
+ f' --project={args.project} && kubectl config view && kubectl config'
794
+ ' set-context --current --namespace=default'
795
+ )
796
+ task = f'get-credentials-dns-endpoint to cluster {args.cluster}'
797
+ return_code = run_command_with_updates_retry(command, task, verbose=False)
798
+
903
799
  if return_code != 0:
904
800
  xpk_print(f'{task} returned ERROR {return_code}')
905
801
  xpk_exit(return_code)
802
+
803
+ return_code = test_and_retry_credentials_with_dns_logic(args)
804
+ xpk_print('Finished get-credentials and kubectl setup.')
805
+
806
+ return return_code
xpk/core/cluster_private.py CHANGED
@@ -19,9 +19,10 @@ from ..utils.network import (
19
19
  add_current_machine_to_networks,
20
20
  is_current_machine_in_any_network,
21
21
  )
22
+ from ..utils.execution_context import is_dry_run
22
23
  from ..utils.objects import is_text_true
23
24
  from .commands import run_command_for_value, run_command_with_updates
24
- from .gcloud_context import zone_to_region
25
+ from .gcloud_context import get_cluster_location
25
26
 
26
27
 
27
28
  def authorize_private_cluster_access_if_necessary(args) -> int:
@@ -37,7 +38,7 @@ def authorize_private_cluster_access_if_necessary(args) -> int:
37
38
  if not args.private and args.authorized_networks is None:
38
39
  xpk_print('Cluster is public and no need to authorize networks.')
39
40
  return 0
40
- else:
41
+ elif not is_dry_run():
41
42
  xpk_print(
42
43
  'Cannot convert an existing public cluster to private. The arguments'
43
44
  ' --private and --authorized-networks are not acceptable for public'
@@ -126,13 +127,12 @@ def is_cluster_private(args) -> bool:
126
127
  """
127
128
  command = (
128
129
  f'gcloud container clusters describe {args.cluster}'
129
- f' --project={args.project} --region={zone_to_region(args.zone)}'
130
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
130
131
  ' --format="value(privateClusterConfig.enablePrivateNodes)"'
131
132
  )
132
133
  return_code, private_nodes_enabled = run_command_for_value(
133
134
  command,
134
135
  'Check if Private Nodes is enabled in cluster.',
135
- args,
136
136
  )
137
137
 
138
138
  if return_code != 0:
@@ -157,13 +157,13 @@ def get_cluster_authorized_networks(args) -> list[str]:
157
157
  """
158
158
  command = (
159
159
  f'gcloud container clusters describe {args.cluster}'
160
- f' --project={args.project} --region={zone_to_region(args.zone)}'
160
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
161
161
  ' --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)"'
162
162
  )
163
163
  return_code, authorized_networks = run_command_for_value(
164
164
  command,
165
165
  'Fetching the list of authorized network from cluster describe.',
166
- args,
166
+ dry_run_return_val='127.0.0.1/32',
167
167
  )
168
168
 
169
169
  if return_code != 0:
@@ -187,15 +187,12 @@ def update_cluster_authorized_networks(args, authorized_networks) -> int:
187
187
  """
188
188
  command = (
189
189
  'gcloud container clusters update'
190
- f' {args.cluster} --project={args.project}'
191
- f' --region={zone_to_region(args.zone)}'
192
- ' --enable-master-authorized-networks'
193
- f' --master-authorized-networks={",".join(authorized_networks)}'
194
- ' --quiet'
190
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-master-authorized-networks'
191
+ f' --master-authorized-networks={",".join(authorized_networks)} --quiet'
195
192
  )
196
193
 
197
194
  return_code = run_command_with_updates(
198
- command, 'GKE Cluster Update master authorized networks', args
195
+ command, 'GKE Cluster Update master authorized networks'
199
196
  )
200
197
 
201
198
  if return_code != 0: