xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. integration/__init__.py +15 -0
  2. integration/docker_manager_test.py +102 -0
  3. integration/gcluster_a3mega_test.py +204 -0
  4. integration/gcluster_a3ultra_test.py +176 -0
  5. integration/gcluster_a4_test.py +176 -0
  6. integration/gcluster_test.py +107 -0
  7. xpk/commands/batch.py +9 -2
  8. xpk/commands/cluster.py +143 -117
  9. xpk/commands/cluster_gcluster.py +81 -14
  10. xpk/commands/cluster_gcluster_test.py +177 -0
  11. xpk/commands/cluster_test.py +92 -0
  12. xpk/commands/common.py +14 -26
  13. xpk/commands/info.py +11 -9
  14. xpk/commands/inspector.py +21 -10
  15. xpk/commands/job.py +25 -9
  16. xpk/commands/kind.py +39 -40
  17. xpk/commands/kjob_common.py +4 -4
  18. xpk/commands/run.py +9 -2
  19. xpk/commands/shell.py +13 -10
  20. xpk/commands/storage.py +21 -0
  21. xpk/commands/version.py +0 -4
  22. xpk/commands/workload.py +84 -29
  23. xpk/commands/workload_test.py +81 -0
  24. xpk/core/blueprint/blueprint_generator.py +4 -40
  25. xpk/core/blueprint/blueprint_test.py +0 -6
  26. xpk/core/blueprint/testing/__init__.py +15 -0
  27. xpk/core/capacity.py +6 -5
  28. xpk/core/cluster.py +91 -194
  29. xpk/core/cluster_private.py +6 -11
  30. xpk/core/commands.py +11 -18
  31. xpk/core/config.py +1 -1
  32. xpk/core/docker_image.py +3 -4
  33. xpk/core/gcloud_context.py +26 -2
  34. xpk/core/gcloud_context_test.py +96 -0
  35. xpk/core/gcluster_manager.py +0 -3
  36. xpk/core/jobset.py +4 -7
  37. xpk/core/kjob.py +14 -27
  38. xpk/core/kueue_manager.py +423 -0
  39. xpk/core/kueue_manager_test.py +574 -0
  40. xpk/core/monitoring.py +1 -1
  41. xpk/core/nap.py +10 -15
  42. xpk/core/network.py +17 -18
  43. xpk/core/nodepool.py +66 -77
  44. xpk/core/nodepool_test.py +198 -1
  45. xpk/core/pathways.py +5 -5
  46. xpk/core/ray.py +10 -14
  47. xpk/core/resources.py +6 -11
  48. xpk/core/scheduling.py +19 -1
  49. xpk/core/scheduling_test.py +31 -0
  50. xpk/core/system_characteristics.py +350 -232
  51. xpk/core/system_characteristics_test.py +73 -0
  52. xpk/core/vertex.py +1 -1
  53. xpk/core/workload.py +7 -8
  54. xpk/main.py +2 -4
  55. xpk/parser/cluster.py +7 -0
  56. xpk/parser/cluster_test.py +66 -0
  57. xpk/parser/common.py +11 -0
  58. xpk/parser/workload.py +62 -25
  59. xpk/parser/workload_test.py +82 -0
  60. xpk/templates/cluster_preheat.yaml.j2 +31 -0
  61. xpk/templates/filestore-pv.yaml +17 -0
  62. xpk/templates/filestore-pvc.yaml +11 -0
  63. xpk/templates/filestore-sc.yaml +10 -0
  64. xpk/templates/fuse-pv.yaml +17 -0
  65. xpk/templates/fuse-pvc.yaml +13 -0
  66. xpk/templates/kueue_config.yaml.j2 +95 -0
  67. xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
  68. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
  69. xpk/templates/mtc-cpc.yaml +15 -0
  70. xpk/templates/volume_bundle.yaml +7 -0
  71. xpk/utils/feature_flags.py +28 -0
  72. xpk/utils/kueue.py +20 -0
  73. xpk/utils/templates.py +15 -0
  74. xpk/utils/topology.py +46 -0
  75. xpk/utils/topology_test.py +63 -0
  76. xpk/utils/validation.py +79 -55
  77. xpk/utils/validation_test.py +37 -0
  78. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
  79. xpk-0.14.1.dist-info/RECORD +133 -0
  80. xpk-0.14.1.dist-info/top_level.txt +2 -0
  81. xpk/core/kueue.py +0 -561
  82. xpk-0.13.0.dist-info/RECORD +0 -101
  83. xpk-0.13.0.dist-info/top_level.txt +0 -1
  84. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
  85. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
  86. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
xpk/core/cluster.py CHANGED
@@ -30,15 +30,14 @@ from .commands import (
30
30
  )
31
31
  from .gcloud_context import (
32
32
  add_zone_and_project,
33
- get_gke_server_config,
33
+ get_cluster_location,
34
34
  zone_to_region,
35
35
  )
36
- from .nodepool import upgrade_gke_nodepools_version
37
36
  from .resources import get_cluster_system_characteristics
38
37
  from .system_characteristics import SystemCharacteristics
39
38
 
40
39
  JOBSET_VERSION = 'v0.8.0'
41
- PATHWAYS_JOB_VERSION = 'v0.1.2'
40
+ PATHWAYS_JOB_VERSION = 'v0.1.3'
42
41
  INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
43
42
  INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
44
43
  INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
@@ -66,7 +65,7 @@ def set_jobset_on_cluster(args) -> int:
66
65
  f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
67
66
  )
68
67
  task = f'Install Jobset on {args.cluster}'
69
- return_code = run_command_with_updates_retry(command, task, args)
68
+ return_code = run_command_with_updates_retry(command, task)
70
69
 
71
70
  if return_code != 0:
72
71
  xpk_print(f'{task} returned with ERROR {return_code}.\n')
@@ -95,7 +94,7 @@ def set_pathways_job_on_cluster(args) -> int:
95
94
  f' https://github.com/google/pathways-job/releases/download/{PATHWAYS_JOB_VERSION}/install.yaml'
96
95
  )
97
96
  task = f'Install PathwaysJob on {args.cluster}'
98
- return_code = run_command_with_updates_retry(command, task, args)
97
+ return_code = run_command_with_updates_retry(command, task)
99
98
 
100
99
  if return_code != 0:
101
100
  xpk_print(f'{task} returned with ERROR {return_code}.\n')
@@ -110,11 +109,10 @@ def set_pathways_job_on_cluster(args) -> int:
110
109
  return return_code
111
110
 
112
111
 
113
- def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
112
+ def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
114
113
  """Install NCCL plugin on the cluster.
115
114
 
116
115
  Args:
117
- args: user provided arguments for running the command.
118
116
  system: system characteristics.
119
117
 
120
118
  Returns:
@@ -128,7 +126,7 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
128
126
  command = f'kubectl apply -f {INSTALLER_NCCL_TCPXO}'
129
127
 
130
128
  return_code = run_command_with_updates(
131
- command, 'Install NCCL Plugin On Cluster', args
129
+ command, 'Install NCCL Plugin On Cluster'
132
130
  )
133
131
 
134
132
  if return_code != 0:
@@ -141,7 +139,7 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
141
139
  command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
142
140
 
143
141
  return_code = run_command_with_updates(
144
- command, 'Install NCCL Config On Cluster', args
142
+ command, 'Install NCCL Config On Cluster'
145
143
  )
146
144
 
147
145
  if return_code != 0:
@@ -153,19 +151,14 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
153
151
  return 0
154
152
 
155
153
 
156
- def disable_mglru_on_cluster(args) -> int:
154
+ def disable_mglru_on_cluster() -> int:
157
155
  """Disable MGLRU on the cluster.
158
156
 
159
- Args:
160
- args: user provided arguments for running the command.
161
-
162
157
  Returns:
163
158
  0 if successful and 1 otherwise.
164
159
  """
165
160
  command = f'kubectl apply -f {MGLRU_DISABLE}'
166
- return_code = run_command_with_updates(
167
- command, 'Disable MGLRU On Cluster', args
168
- )
161
+ return_code = run_command_with_updates(command, 'Disable MGLRU On Cluster')
169
162
 
170
163
  if return_code != 0:
171
164
  xpk_print('Disablig MGLRU On Cluster request returned ERROR')
@@ -174,11 +167,10 @@ def disable_mglru_on_cluster(args) -> int:
174
167
  return 0
175
168
 
176
169
 
177
- def install_nri_on_cluster(args) -> int:
170
+ def install_nri_on_cluster() -> int:
178
171
  """Install NRI Device Injector on the cluster.
179
172
 
180
173
  Args:
181
- args: user provided arguments for running the command.
182
174
  system: system characteristics.
183
175
 
184
176
  Returns:
@@ -186,7 +178,7 @@ def install_nri_on_cluster(args) -> int:
186
178
  """
187
179
  command = f'kubectl apply -f {NRI_DEVICE_INJECTOR}'
188
180
  return_code = run_command_with_updates(
189
- command, 'Install NRI Device Injector On Cluster', args
181
+ command, 'Install NRI Device Injector On Cluster'
190
182
  )
191
183
 
192
184
  if return_code != 0:
@@ -199,12 +191,9 @@ def install_nri_on_cluster(args) -> int:
199
191
  return 0
200
192
 
201
193
 
202
- def get_cluster_nodes_info(args) -> list[dict]:
194
+ def get_cluster_nodes_info() -> list[dict]:
203
195
  """Get list of cluster's nodes descrition in yaml format
204
196
 
205
- Args:
206
- args: user provided arguments for running the command.
207
-
208
197
  Returns:
209
198
  List of nodes info yaml objects.
210
199
  """
@@ -213,7 +202,6 @@ def get_cluster_nodes_info(args) -> list[dict]:
213
202
  err_code, val = run_command_for_value(
214
203
  command=command,
215
204
  task='Get cluster nodes info',
216
- global_args=args,
217
205
  )
218
206
  if err_code != 0:
219
207
  xpk_exit(err_code)
@@ -221,9 +209,9 @@ def get_cluster_nodes_info(args) -> list[dict]:
221
209
  return data['items']
222
210
 
223
211
 
224
- def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
212
+ def count_nodes_on_cluster(system: SystemCharacteristics) -> int:
225
213
  """Count cluster nodes by accelerator type"""
226
- nodes_info = get_cluster_nodes_info(args)
214
+ nodes_info = get_cluster_nodes_info()
227
215
  accelerators = [
228
216
  node['metadata']['labels']['cloud.google.com/gke-accelerator']
229
217
  for node in nodes_info
@@ -243,12 +231,11 @@ def get_cluster_network(args) -> str:
243
231
  xpk_print("Getting cluster's VPC network...")
244
232
  cluster_network_cmd = (
245
233
  'gcloud container clusters describe'
246
- f' {args.cluster} --zone={zone_to_region(args.zone)} --project={args.project} --format="value(network)"'
234
+ f' {args.cluster} --location={get_cluster_location(args.project, args.cluster, args.zone)} --project={args.project} --format="value(network)"'
247
235
  )
248
236
  err_code, val = run_command_for_value(
249
237
  command=cluster_network_cmd,
250
238
  task='Get network cluster is in',
251
- global_args=args,
252
239
  )
253
240
  if err_code != 0:
254
241
  xpk_exit(err_code)
@@ -354,14 +341,13 @@ def is_driver_enabled_on_cluster(
354
341
  """
355
342
  command = (
356
343
  f'gcloud container clusters describe {args.cluster}'
357
- f' --project={args.project} --region={zone_to_region(args.zone)}'
344
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
358
345
  f' --format="value(addonsConfig.{driver}Config.{config_key})"'
359
346
  )
360
347
  return_code, driver_enabled = run_command_for_value(
361
348
  command,
362
349
  f"Checks if {driver} driver's {config_key} is enabled in cluster"
363
350
  ' describe.',
364
- args,
365
351
  )
366
352
  if return_code != 0:
367
353
  xpk_exit(return_code)
@@ -382,14 +368,12 @@ def update_gke_cluster_with_addon(args, addon: str) -> int:
382
368
  """
383
369
  command = (
384
370
  'gcloud container clusters update'
385
- f' {args.cluster} --project={args.project}'
386
- f' --region={zone_to_region(args.zone)}'
387
- f' --update-addons {addon}=ENABLED'
388
- ' --quiet'
371
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --update-addons'
372
+ f' {addon}=ENABLED --quiet'
389
373
  )
390
374
  xpk_print(f'Updating GKE cluster to enable {addon}, may take a while!')
391
375
  return_code = run_command_with_updates(
392
- command, f'GKE Cluster Update to enable {addon}', args
376
+ command, f'GKE Cluster Update to enable {addon}'
393
377
  )
394
378
  if return_code != 0:
395
379
  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
@@ -408,11 +392,12 @@ def get_all_clusters_programmatic(args) -> tuple[list[str], int]:
408
392
  """
409
393
  command = (
410
394
  'gcloud container clusters list'
411
- f' --project={args.project} --region={zone_to_region(args.zone)}'
395
+ f' --project={args.project} '
396
+ f'--filter=location~"{zone_to_region(args.zone)}.*"'
412
397
  ' --format="csv[no-heading](name)"'
413
398
  )
414
399
  return_code, raw_cluster_output = run_command_for_value(
415
- command, 'Find if Cluster Exists', args
400
+ command, 'Find if Cluster Exists'
416
401
  )
417
402
  if return_code != 0:
418
403
  xpk_print(f'Find if Cluster Exists returned ERROR {return_code}')
@@ -578,34 +563,6 @@ def create_role_binding(sa: str, role_name: str) -> None:
578
563
  xpk_exit(1)
579
564
 
580
565
 
581
- def update_gke_cluster_with_clouddns(args) -> int:
582
- """Run the GKE cluster update command for existing clusters and enable CloudDNS.
583
-
584
- Args:
585
- args: user provided arguments for running the command.
586
-
587
- Returns:
588
- 0 if successful and 1 otherwise.
589
- """
590
- command = (
591
- 'gcloud container clusters update'
592
- f' {args.cluster} --project={args.project}'
593
- f' --region={zone_to_region(args.zone)}'
594
- ' --cluster-dns=clouddns'
595
- ' --cluster-dns-scope=vpc'
596
- f' --cluster-dns-domain={args.cluster}-domain'
597
- ' --quiet'
598
- )
599
- xpk_print('Updating GKE cluster to use Cloud DNS, may take a while!')
600
- return_code = run_command_with_updates(
601
- command, 'GKE Cluster Update to enable Cloud DNS', args
602
- )
603
- if return_code != 0:
604
- xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
605
- return 1
606
- return 0
607
-
608
-
609
566
  def update_gke_cluster_with_workload_identity_enabled(args) -> int:
610
567
  """Run the GKE cluster update command for existing cluster and enable Workload Identity Federation.
611
568
  Args:
@@ -615,9 +572,7 @@ def update_gke_cluster_with_workload_identity_enabled(args) -> int:
615
572
  """
616
573
  command = (
617
574
  'gcloud container clusters update'
618
- f' {args.cluster} --project={args.project}'
619
- f' --region={zone_to_region(args.zone)}'
620
- f' --workload-pool={args.project}.svc.id.goog'
575
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --workload-pool={args.project}.svc.id.goog'
621
576
  ' --quiet'
622
577
  )
623
578
  xpk_print(
@@ -625,7 +580,7 @@ def update_gke_cluster_with_workload_identity_enabled(args) -> int:
625
580
  ' while!'
626
581
  )
627
582
  return_code = run_command_with_updates(
628
- command, 'GKE Cluster Update to enable Workload Identity Federation', args
583
+ command, 'GKE Cluster Update to enable Workload Identity Federation'
629
584
  )
630
585
  if return_code != 0:
631
586
  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
@@ -642,16 +597,14 @@ def update_gke_cluster_with_gcsfuse_driver_enabled(args) -> int:
642
597
  """
643
598
  command = (
644
599
  'gcloud container clusters update'
645
- f' {args.cluster} --project={args.project}'
646
- f' --region={zone_to_region(args.zone)}'
647
- ' --update-addons GcsFuseCsiDriver=ENABLED'
648
- ' --quiet'
600
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --update-addons'
601
+ ' GcsFuseCsiDriver=ENABLED --quiet'
649
602
  )
650
603
  xpk_print(
651
604
  'Updating GKE cluster to enable GCSFuse CSI driver, may take a while!'
652
605
  )
653
606
  return_code = run_command_with_updates(
654
- command, 'GKE Cluster Update to enable GCSFuse CSI driver', args
607
+ command, 'GKE Cluster Update to enable GCSFuse CSI driver'
655
608
  )
656
609
  if return_code != 0:
657
610
  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
@@ -668,16 +621,14 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
668
621
  """
669
622
  command = (
670
623
  'gcloud container clusters update'
671
- f' {args.cluster} --project={args.project}'
672
- f' --region={zone_to_region(args.zone)}'
673
- ' --enable-legacy-lustre-port'
624
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-legacy-lustre-port'
674
625
  ' --quiet'
675
626
  )
676
627
  xpk_print(
677
628
  'Updating GKE cluster to enable Lustre CSI driver, may take a while!'
678
629
  )
679
630
  return_code = run_command_with_updates(
680
- command, 'GKE Cluster Update to enable Lustre CSI driver', args
631
+ command, 'GKE Cluster Update to enable Lustre CSI driver'
681
632
  )
682
633
  if return_code != 0:
683
634
  xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
@@ -685,63 +636,6 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
685
636
  return 0
686
637
 
687
638
 
688
- def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int:
689
- """Upgrade GKE cluster's control plane version before updating nodepools to use CloudDNS.
690
-
691
- Args:
692
- args: user provided arguments for running the command.
693
- default_rapid_gke_version: Rapid default version for the upgrade.
694
-
695
- Returns:
696
- 0 if successful and 1 otherwise.
697
- """
698
- command = (
699
- 'gcloud container clusters upgrade'
700
- f' {args.cluster} --project={args.project}'
701
- f' --region={zone_to_region(args.zone)}'
702
- f' --cluster-version={default_rapid_gke_version}'
703
- ' --master'
704
- ' --quiet'
705
- )
706
- xpk_print("Updating GKE cluster's control plane version, may take a while!")
707
- return_code = run_command_with_updates(
708
- command,
709
- 'GKE Cluster control plane version update to enable Cloud DNS',
710
- args,
711
- )
712
- if return_code != 0:
713
- xpk_print(
714
- "GKE cluster's control plane version update request returned"
715
- f' ERROR {return_code}'
716
- )
717
- return 1
718
- return 0
719
-
720
-
721
- def is_cluster_using_clouddns(args) -> bool:
722
- """Checks if cluster is using CloudDNS.
723
- Args:
724
- args: user provided arguments for running the command.
725
-
726
- Returns:
727
- True if cluster is using CloudDNS and False otherwise.
728
- """
729
- command = (
730
- f'gcloud container clusters describe {args.cluster}'
731
- f' --project={args.project} --region={zone_to_region(args.zone)}'
732
- ' 2> /dev/null | grep "clusterDns: CLOUD_DNS"'
733
- )
734
- return_code, _ = run_command_for_value(
735
- command,
736
- 'Check if Cloud DNS is enabled in cluster describe.',
737
- args,
738
- )
739
- if return_code == 0:
740
- xpk_print('Cloud DNS is enabled on the cluster, no update needed.')
741
- return True
742
- return False
743
-
744
-
745
639
  def is_workload_identity_enabled_on_cluster(args) -> bool:
746
640
  """Checks if Workload Identity Federation is enabled on the cluster.
747
641
  Args:
@@ -751,13 +645,12 @@ def is_workload_identity_enabled_on_cluster(args) -> bool:
751
645
  """
752
646
  command = (
753
647
  f'gcloud container clusters describe {args.cluster}'
754
- f' --project={args.project} --region={zone_to_region(args.zone)}'
648
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
755
649
  ' --format="value(workloadIdentityConfig.workloadPool)"'
756
650
  )
757
651
  return_code, workload_pool = run_command_for_value(
758
652
  command,
759
653
  'Checks if Workload Identity Federation is enabled in cluster describe.',
760
- args,
761
654
  )
762
655
  if return_code != 0:
763
656
  xpk_exit(return_code)
@@ -779,13 +672,12 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
779
672
  """
780
673
  command = (
781
674
  f'gcloud container clusters describe {args.cluster}'
782
- f' --project={args.project} --region={zone_to_region(args.zone)}'
675
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
783
676
  ' --format="value(addonsConfig.gcsFuseCsiDriverConfig.enabled)"'
784
677
  )
785
678
  return_code, gcsfuse_driver_enabled = run_command_for_value(
786
679
  command,
787
680
  'Checks if GCSFuse CSI driver is enabled in cluster describe.',
788
- args,
789
681
  )
790
682
  if return_code != 0:
791
683
  xpk_exit(return_code)
@@ -795,53 +687,6 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
795
687
  return False
796
688
 
797
689
 
798
- def update_cluster_with_clouddns_if_necessary(args) -> int:
799
- """Updates a GKE cluster to use CloudDNS, if not enabled already.
800
-
801
- Args:
802
- args: user provided arguments for running the command.
803
-
804
- Returns:
805
- 0 if successful and error code otherwise.
806
- """
807
- all_clusters, return_code = get_all_clusters_programmatic(args)
808
- if return_code > 0:
809
- xpk_print('Listing all clusters failed!')
810
- return 1
811
- if args.cluster in all_clusters:
812
- # If cluster is already using clouddns, no update necessary!
813
- if is_cluster_using_clouddns(args):
814
- return 0
815
- cluster_update_return_code = update_gke_cluster_with_clouddns(args)
816
- if cluster_update_return_code > 0:
817
- xpk_print('Updating GKE cluster to use CloudDNS failed!')
818
- return cluster_update_return_code
819
-
820
- # Find default rapid control plane version and update the control plane to the same.
821
- server_config_return_code, gke_server_config = get_gke_server_config(args)
822
- if server_config_return_code != 0:
823
- xpk_exit(server_config_return_code)
824
- assert gke_server_config
825
-
826
- upgrade_master_return_code = upgrade_gke_control_plane_version(
827
- args,
828
- gke_server_config.default_rapid_gke_version,
829
- )
830
- if upgrade_master_return_code > 0:
831
- xpk_print("Updating GKE cluster's control plane upgrade failed!")
832
- return upgrade_master_return_code
833
-
834
- # Upgrade nodepools version after the master upgrade.
835
- node_pool_update_code = upgrade_gke_nodepools_version(
836
- args,
837
- gke_server_config.default_rapid_gke_version,
838
- )
839
- if node_pool_update_code > 0:
840
- xpk_print('Upgrading nodepools version failed!')
841
- return node_pool_update_code
842
- return 0
843
-
844
-
845
690
  def update_cluster_with_workload_identity_if_necessary(args) -> int:
846
691
  """Updates a GKE cluster to enable Workload Identity Federation, if not enabled already.
847
692
  Args:
@@ -884,26 +729,78 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int:
884
729
  return 0
885
730
 
886
731
 
887
- def get_cluster_credentials(args) -> None:
888
- """Run cluster configuration command to set the kubectl config.
732
+ def test_and_retry_credentials_with_dns_logic(args) -> int:
733
+ """Tests kubectl credentials and retries with default settings if a DNS error is found.
889
734
 
890
735
  Args:
891
736
  args: user provided arguments for running the command.
892
737
 
893
738
  Returns:
894
- 0 if successful and 1 otherwise.
739
+ 0 if credentials are valid after retrying, 1 otherwise.
895
740
  """
896
- command = (
741
+
742
+ xpk_print('Testing credentials with kubectl...')
743
+ kubectl_command = 'kubectl get pods'
744
+ kubectl_return_code, kubectl_output = run_command_for_value(
745
+ kubectl_command, 'kubectl get pods'
746
+ )
747
+ if kubectl_return_code == 0:
748
+ xpk_print('Credentials test succeeded.')
749
+ return 0
750
+
751
+ dns_endpoint_error = (
752
+ 'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic'
753
+ ' is disabled'
754
+ )
755
+ if dns_endpoint_error not in kubectl_output:
756
+ xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}')
757
+ xpk_exit(kubectl_return_code)
758
+ xpk_print(
759
+ 'Detected DNS endpoint-related error. Retrying without --dns-endpoint'
760
+ ' flag...'
761
+ )
762
+
763
+ location = get_cluster_location(args.project, args.cluster, args.zone)
764
+ without_dns_command = (
897
765
  'gcloud container clusters get-credentials'
898
- f' {args.cluster} --region={zone_to_region(args.zone)}'
766
+ f' {args.cluster} --location={location}'
899
767
  f' --project={args.project} &&'
900
768
  ' kubectl config view && kubectl config set-context --current'
901
769
  ' --namespace=default'
902
770
  )
903
- task = f'get-credentials to cluster {args.cluster}'
904
- return_code = run_command_with_updates_retry(
905
- command, task, args, verbose=False
771
+ return_code = run_command_with_updates(
772
+ without_dns_command, 'get-credentials to cluster', verbose=False
906
773
  )
774
+ if return_code != 0:
775
+ xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.')
776
+ xpk_exit(return_code)
777
+ return 0
778
+
779
+
780
+ def get_cluster_credentials(args) -> int:
781
+ """Run cluster configuration command to set the kubectl config.
782
+
783
+ Args:
784
+ args: user provided arguments for running the command.
785
+
786
+ Returns:
787
+ 0 if successful and 1 otherwise.
788
+ """
789
+ location = get_cluster_location(args.project, args.cluster, args.zone)
790
+ command = (
791
+ 'gcloud container clusters get-credentials'
792
+ f' {args.cluster} --location={location} --dns-endpoint'
793
+ f' --project={args.project} && kubectl config view && kubectl config'
794
+ ' set-context --current --namespace=default'
795
+ )
796
+ task = f'get-credentials-dns-endpoint to cluster {args.cluster}'
797
+ return_code = run_command_with_updates_retry(command, task, verbose=False)
798
+
907
799
  if return_code != 0:
908
800
  xpk_print(f'{task} returned ERROR {return_code}')
909
801
  xpk_exit(return_code)
802
+
803
+ return_code = test_and_retry_credentials_with_dns_logic(args)
804
+ xpk_print('Finished get-credentials and kubectl setup.')
805
+
806
+ return return_code
xpk/core/cluster_private.py CHANGED
@@ -22,7 +22,7 @@ from ..utils.network import (
22
22
  from ..utils.execution_context import is_dry_run
23
23
  from ..utils.objects import is_text_true
24
24
  from .commands import run_command_for_value, run_command_with_updates
25
- from .gcloud_context import zone_to_region
25
+ from .gcloud_context import get_cluster_location
26
26
 
27
27
 
28
28
  def authorize_private_cluster_access_if_necessary(args) -> int:
@@ -127,13 +127,12 @@ def is_cluster_private(args) -> bool:
127
127
  """
128
128
  command = (
129
129
  f'gcloud container clusters describe {args.cluster}'
130
- f' --project={args.project} --region={zone_to_region(args.zone)}'
130
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
131
131
  ' --format="value(privateClusterConfig.enablePrivateNodes)"'
132
132
  )
133
133
  return_code, private_nodes_enabled = run_command_for_value(
134
134
  command,
135
135
  'Check if Private Nodes is enabled in cluster.',
136
- args,
137
136
  )
138
137
 
139
138
  if return_code != 0:
@@ -158,13 +157,12 @@ def get_cluster_authorized_networks(args) -> list[str]:
158
157
  """
159
158
  command = (
160
159
  f'gcloud container clusters describe {args.cluster}'
161
- f' --project={args.project} --region={zone_to_region(args.zone)}'
160
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
162
161
  ' --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)"'
163
162
  )
164
163
  return_code, authorized_networks = run_command_for_value(
165
164
  command,
166
165
  'Fetching the list of authorized network from cluster describe.',
167
- args,
168
166
  dry_run_return_val='127.0.0.1/32',
169
167
  )
170
168
 
@@ -189,15 +187,12 @@ def update_cluster_authorized_networks(args, authorized_networks) -> int:
189
187
  """
190
188
  command = (
191
189
  'gcloud container clusters update'
192
- f' {args.cluster} --project={args.project}'
193
- f' --region={zone_to_region(args.zone)}'
194
- ' --enable-master-authorized-networks'
195
- f' --master-authorized-networks={",".join(authorized_networks)}'
196
- ' --quiet'
190
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-master-authorized-networks'
191
+ f' --master-authorized-networks={",".join(authorized_networks)} --quiet'
197
192
  )
198
193
 
199
194
  return_code = run_command_with_updates(
200
- command, 'GKE Cluster Update master authorized networks', args
195
+ command, 'GKE Cluster Update master authorized networks'
201
196
  )
202
197
 
203
198
  if return_code != 0: