xpk 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +128 -115
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +10 -28
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +43 -22
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/cluster.py
CHANGED
|
@@ -30,15 +30,14 @@ from .commands import (
|
|
|
30
30
|
)
|
|
31
31
|
from .gcloud_context import (
|
|
32
32
|
add_zone_and_project,
|
|
33
|
-
|
|
33
|
+
get_cluster_location,
|
|
34
34
|
zone_to_region,
|
|
35
35
|
)
|
|
36
|
-
from .nodepool import upgrade_gke_nodepools_version
|
|
37
36
|
from .resources import get_cluster_system_characteristics
|
|
38
37
|
from .system_characteristics import SystemCharacteristics
|
|
39
38
|
|
|
40
39
|
JOBSET_VERSION = 'v0.8.0'
|
|
41
|
-
PATHWAYS_JOB_VERSION = 'v0.1.
|
|
40
|
+
PATHWAYS_JOB_VERSION = 'v0.1.3'
|
|
42
41
|
INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
|
|
43
42
|
INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
|
|
44
43
|
INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
|
|
@@ -66,7 +65,7 @@ def set_jobset_on_cluster(args) -> int:
|
|
|
66
65
|
f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
|
|
67
66
|
)
|
|
68
67
|
task = f'Install Jobset on {args.cluster}'
|
|
69
|
-
return_code = run_command_with_updates_retry(command, task
|
|
68
|
+
return_code = run_command_with_updates_retry(command, task)
|
|
70
69
|
|
|
71
70
|
if return_code != 0:
|
|
72
71
|
xpk_print(f'{task} returned with ERROR {return_code}.\n')
|
|
@@ -95,7 +94,7 @@ def set_pathways_job_on_cluster(args) -> int:
|
|
|
95
94
|
f' https://github.com/google/pathways-job/releases/download/{PATHWAYS_JOB_VERSION}/install.yaml'
|
|
96
95
|
)
|
|
97
96
|
task = f'Install PathwaysJob on {args.cluster}'
|
|
98
|
-
return_code = run_command_with_updates_retry(command, task
|
|
97
|
+
return_code = run_command_with_updates_retry(command, task)
|
|
99
98
|
|
|
100
99
|
if return_code != 0:
|
|
101
100
|
xpk_print(f'{task} returned with ERROR {return_code}.\n')
|
|
@@ -110,11 +109,10 @@ def set_pathways_job_on_cluster(args) -> int:
|
|
|
110
109
|
return return_code
|
|
111
110
|
|
|
112
111
|
|
|
113
|
-
def install_nccl_on_cluster(
|
|
112
|
+
def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
|
|
114
113
|
"""Install NCCL plugin on the cluster.
|
|
115
114
|
|
|
116
115
|
Args:
|
|
117
|
-
args: user provided arguments for running the command.
|
|
118
116
|
system: system characteristics.
|
|
119
117
|
|
|
120
118
|
Returns:
|
|
@@ -128,7 +126,7 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
|
128
126
|
command = f'kubectl apply -f {INSTALLER_NCCL_TCPXO}'
|
|
129
127
|
|
|
130
128
|
return_code = run_command_with_updates(
|
|
131
|
-
command, 'Install NCCL Plugin On Cluster'
|
|
129
|
+
command, 'Install NCCL Plugin On Cluster'
|
|
132
130
|
)
|
|
133
131
|
|
|
134
132
|
if return_code != 0:
|
|
@@ -141,7 +139,7 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
|
141
139
|
command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
|
|
142
140
|
|
|
143
141
|
return_code = run_command_with_updates(
|
|
144
|
-
command, 'Install NCCL Config On Cluster'
|
|
142
|
+
command, 'Install NCCL Config On Cluster'
|
|
145
143
|
)
|
|
146
144
|
|
|
147
145
|
if return_code != 0:
|
|
@@ -153,19 +151,14 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
|
153
151
|
return 0
|
|
154
152
|
|
|
155
153
|
|
|
156
|
-
def disable_mglru_on_cluster(
|
|
154
|
+
def disable_mglru_on_cluster() -> int:
|
|
157
155
|
"""Disable MGLRU on the cluster.
|
|
158
156
|
|
|
159
|
-
Args:
|
|
160
|
-
args: user provided arguments for running the command.
|
|
161
|
-
|
|
162
157
|
Returns:
|
|
163
158
|
0 if successful and 1 otherwise.
|
|
164
159
|
"""
|
|
165
160
|
command = f'kubectl apply -f {MGLRU_DISABLE}'
|
|
166
|
-
return_code = run_command_with_updates(
|
|
167
|
-
command, 'Disable MGLRU On Cluster', args
|
|
168
|
-
)
|
|
161
|
+
return_code = run_command_with_updates(command, 'Disable MGLRU On Cluster')
|
|
169
162
|
|
|
170
163
|
if return_code != 0:
|
|
171
164
|
xpk_print('Disablig MGLRU On Cluster request returned ERROR')
|
|
@@ -174,11 +167,10 @@ def disable_mglru_on_cluster(args) -> int:
|
|
|
174
167
|
return 0
|
|
175
168
|
|
|
176
169
|
|
|
177
|
-
def install_nri_on_cluster(
|
|
170
|
+
def install_nri_on_cluster() -> int:
|
|
178
171
|
"""Install NRI Device Injector on the cluster.
|
|
179
172
|
|
|
180
173
|
Args:
|
|
181
|
-
args: user provided arguments for running the command.
|
|
182
174
|
system: system characteristics.
|
|
183
175
|
|
|
184
176
|
Returns:
|
|
@@ -186,7 +178,7 @@ def install_nri_on_cluster(args) -> int:
|
|
|
186
178
|
"""
|
|
187
179
|
command = f'kubectl apply -f {NRI_DEVICE_INJECTOR}'
|
|
188
180
|
return_code = run_command_with_updates(
|
|
189
|
-
command, 'Install NRI Device Injector On Cluster'
|
|
181
|
+
command, 'Install NRI Device Injector On Cluster'
|
|
190
182
|
)
|
|
191
183
|
|
|
192
184
|
if return_code != 0:
|
|
@@ -199,12 +191,9 @@ def install_nri_on_cluster(args) -> int:
|
|
|
199
191
|
return 0
|
|
200
192
|
|
|
201
193
|
|
|
202
|
-
def get_cluster_nodes_info(
|
|
194
|
+
def get_cluster_nodes_info() -> list[dict]:
|
|
203
195
|
"""Get list of cluster's nodes descrition in yaml format
|
|
204
196
|
|
|
205
|
-
Args:
|
|
206
|
-
args: user provided arguments for running the command.
|
|
207
|
-
|
|
208
197
|
Returns:
|
|
209
198
|
List of nodes info yaml objects.
|
|
210
199
|
"""
|
|
@@ -213,7 +202,6 @@ def get_cluster_nodes_info(args) -> list[dict]:
|
|
|
213
202
|
err_code, val = run_command_for_value(
|
|
214
203
|
command=command,
|
|
215
204
|
task='Get cluster nodes info',
|
|
216
|
-
global_args=args,
|
|
217
205
|
)
|
|
218
206
|
if err_code != 0:
|
|
219
207
|
xpk_exit(err_code)
|
|
@@ -221,9 +209,9 @@ def get_cluster_nodes_info(args) -> list[dict]:
|
|
|
221
209
|
return data['items']
|
|
222
210
|
|
|
223
211
|
|
|
224
|
-
def count_nodes_on_cluster(
|
|
212
|
+
def count_nodes_on_cluster(system: SystemCharacteristics) -> int:
|
|
225
213
|
"""Count cluster nodes by accelerator type"""
|
|
226
|
-
nodes_info = get_cluster_nodes_info(
|
|
214
|
+
nodes_info = get_cluster_nodes_info()
|
|
227
215
|
accelerators = [
|
|
228
216
|
node['metadata']['labels']['cloud.google.com/gke-accelerator']
|
|
229
217
|
for node in nodes_info
|
|
@@ -243,12 +231,11 @@ def get_cluster_network(args) -> str:
|
|
|
243
231
|
xpk_print("Getting cluster's VPC network...")
|
|
244
232
|
cluster_network_cmd = (
|
|
245
233
|
'gcloud container clusters describe'
|
|
246
|
-
f' {args.cluster} --
|
|
234
|
+
f' {args.cluster} --location={get_cluster_location(args.project, args.cluster, args.zone)} --project={args.project} --format="value(network)"'
|
|
247
235
|
)
|
|
248
236
|
err_code, val = run_command_for_value(
|
|
249
237
|
command=cluster_network_cmd,
|
|
250
238
|
task='Get network cluster is in',
|
|
251
|
-
global_args=args,
|
|
252
239
|
)
|
|
253
240
|
if err_code != 0:
|
|
254
241
|
xpk_exit(err_code)
|
|
@@ -354,14 +341,13 @@ def is_driver_enabled_on_cluster(
|
|
|
354
341
|
"""
|
|
355
342
|
command = (
|
|
356
343
|
f'gcloud container clusters describe {args.cluster}'
|
|
357
|
-
f' --project={args.project} --
|
|
344
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
358
345
|
f' --format="value(addonsConfig.{driver}Config.{config_key})"'
|
|
359
346
|
)
|
|
360
347
|
return_code, driver_enabled = run_command_for_value(
|
|
361
348
|
command,
|
|
362
349
|
f"Checks if {driver} driver's {config_key} is enabled in cluster"
|
|
363
350
|
' describe.',
|
|
364
|
-
args,
|
|
365
351
|
)
|
|
366
352
|
if return_code != 0:
|
|
367
353
|
xpk_exit(return_code)
|
|
@@ -382,14 +368,12 @@ def update_gke_cluster_with_addon(args, addon: str) -> int:
|
|
|
382
368
|
"""
|
|
383
369
|
command = (
|
|
384
370
|
'gcloud container clusters update'
|
|
385
|
-
f' {args.cluster} --project={args.project}'
|
|
386
|
-
f'
|
|
387
|
-
f' --update-addons {addon}=ENABLED'
|
|
388
|
-
' --quiet'
|
|
371
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --update-addons'
|
|
372
|
+
f' {addon}=ENABLED --quiet'
|
|
389
373
|
)
|
|
390
374
|
xpk_print(f'Updating GKE cluster to enable {addon}, may take a while!')
|
|
391
375
|
return_code = run_command_with_updates(
|
|
392
|
-
command, f'GKE Cluster Update to enable {addon}'
|
|
376
|
+
command, f'GKE Cluster Update to enable {addon}'
|
|
393
377
|
)
|
|
394
378
|
if return_code != 0:
|
|
395
379
|
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
@@ -408,11 +392,12 @@ def get_all_clusters_programmatic(args) -> tuple[list[str], int]:
|
|
|
408
392
|
"""
|
|
409
393
|
command = (
|
|
410
394
|
'gcloud container clusters list'
|
|
411
|
-
f' --project={args.project}
|
|
395
|
+
f' --project={args.project} '
|
|
396
|
+
f'--filter=location~"{zone_to_region(args.zone)}.*"'
|
|
412
397
|
' --format="csv[no-heading](name)"'
|
|
413
398
|
)
|
|
414
399
|
return_code, raw_cluster_output = run_command_for_value(
|
|
415
|
-
command, 'Find if Cluster Exists'
|
|
400
|
+
command, 'Find if Cluster Exists'
|
|
416
401
|
)
|
|
417
402
|
if return_code != 0:
|
|
418
403
|
xpk_print(f'Find if Cluster Exists returned ERROR {return_code}')
|
|
@@ -578,34 +563,6 @@ def create_role_binding(sa: str, role_name: str) -> None:
|
|
|
578
563
|
xpk_exit(1)
|
|
579
564
|
|
|
580
565
|
|
|
581
|
-
def update_gke_cluster_with_clouddns(args) -> int:
|
|
582
|
-
"""Run the GKE cluster update command for existing clusters and enable CloudDNS.
|
|
583
|
-
|
|
584
|
-
Args:
|
|
585
|
-
args: user provided arguments for running the command.
|
|
586
|
-
|
|
587
|
-
Returns:
|
|
588
|
-
0 if successful and 1 otherwise.
|
|
589
|
-
"""
|
|
590
|
-
command = (
|
|
591
|
-
'gcloud container clusters update'
|
|
592
|
-
f' {args.cluster} --project={args.project}'
|
|
593
|
-
f' --region={zone_to_region(args.zone)}'
|
|
594
|
-
' --cluster-dns=clouddns'
|
|
595
|
-
' --cluster-dns-scope=vpc'
|
|
596
|
-
f' --cluster-dns-domain={args.cluster}-domain'
|
|
597
|
-
' --quiet'
|
|
598
|
-
)
|
|
599
|
-
xpk_print('Updating GKE cluster to use Cloud DNS, may take a while!')
|
|
600
|
-
return_code = run_command_with_updates(
|
|
601
|
-
command, 'GKE Cluster Update to enable Cloud DNS', args
|
|
602
|
-
)
|
|
603
|
-
if return_code != 0:
|
|
604
|
-
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
605
|
-
return 1
|
|
606
|
-
return 0
|
|
607
|
-
|
|
608
|
-
|
|
609
566
|
def update_gke_cluster_with_workload_identity_enabled(args) -> int:
|
|
610
567
|
"""Run the GKE cluster update command for existing cluster and enable Workload Identity Federation.
|
|
611
568
|
Args:
|
|
@@ -615,9 +572,7 @@ def update_gke_cluster_with_workload_identity_enabled(args) -> int:
|
|
|
615
572
|
"""
|
|
616
573
|
command = (
|
|
617
574
|
'gcloud container clusters update'
|
|
618
|
-
f' {args.cluster} --project={args.project}'
|
|
619
|
-
f' --region={zone_to_region(args.zone)}'
|
|
620
|
-
f' --workload-pool={args.project}.svc.id.goog'
|
|
575
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --workload-pool={args.project}.svc.id.goog'
|
|
621
576
|
' --quiet'
|
|
622
577
|
)
|
|
623
578
|
xpk_print(
|
|
@@ -625,7 +580,7 @@ def update_gke_cluster_with_workload_identity_enabled(args) -> int:
|
|
|
625
580
|
' while!'
|
|
626
581
|
)
|
|
627
582
|
return_code = run_command_with_updates(
|
|
628
|
-
command, 'GKE Cluster Update to enable Workload Identity Federation'
|
|
583
|
+
command, 'GKE Cluster Update to enable Workload Identity Federation'
|
|
629
584
|
)
|
|
630
585
|
if return_code != 0:
|
|
631
586
|
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
@@ -642,16 +597,14 @@ def update_gke_cluster_with_gcsfuse_driver_enabled(args) -> int:
|
|
|
642
597
|
"""
|
|
643
598
|
command = (
|
|
644
599
|
'gcloud container clusters update'
|
|
645
|
-
f' {args.cluster} --project={args.project}'
|
|
646
|
-
|
|
647
|
-
' --update-addons GcsFuseCsiDriver=ENABLED'
|
|
648
|
-
' --quiet'
|
|
600
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --update-addons'
|
|
601
|
+
' GcsFuseCsiDriver=ENABLED --quiet'
|
|
649
602
|
)
|
|
650
603
|
xpk_print(
|
|
651
604
|
'Updating GKE cluster to enable GCSFuse CSI driver, may take a while!'
|
|
652
605
|
)
|
|
653
606
|
return_code = run_command_with_updates(
|
|
654
|
-
command, 'GKE Cluster Update to enable GCSFuse CSI driver'
|
|
607
|
+
command, 'GKE Cluster Update to enable GCSFuse CSI driver'
|
|
655
608
|
)
|
|
656
609
|
if return_code != 0:
|
|
657
610
|
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
@@ -668,16 +621,14 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
|
|
|
668
621
|
"""
|
|
669
622
|
command = (
|
|
670
623
|
'gcloud container clusters update'
|
|
671
|
-
f' {args.cluster} --project={args.project}'
|
|
672
|
-
f' --region={zone_to_region(args.zone)}'
|
|
673
|
-
' --enable-legacy-lustre-port'
|
|
624
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-legacy-lustre-port'
|
|
674
625
|
' --quiet'
|
|
675
626
|
)
|
|
676
627
|
xpk_print(
|
|
677
628
|
'Updating GKE cluster to enable Lustre CSI driver, may take a while!'
|
|
678
629
|
)
|
|
679
630
|
return_code = run_command_with_updates(
|
|
680
|
-
command, 'GKE Cluster Update to enable Lustre CSI driver'
|
|
631
|
+
command, 'GKE Cluster Update to enable Lustre CSI driver'
|
|
681
632
|
)
|
|
682
633
|
if return_code != 0:
|
|
683
634
|
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
@@ -685,63 +636,6 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
|
|
|
685
636
|
return 0
|
|
686
637
|
|
|
687
638
|
|
|
688
|
-
def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int:
|
|
689
|
-
"""Upgrade GKE cluster's control plane version before updating nodepools to use CloudDNS.
|
|
690
|
-
|
|
691
|
-
Args:
|
|
692
|
-
args: user provided arguments for running the command.
|
|
693
|
-
default_rapid_gke_version: Rapid default version for the upgrade.
|
|
694
|
-
|
|
695
|
-
Returns:
|
|
696
|
-
0 if successful and 1 otherwise.
|
|
697
|
-
"""
|
|
698
|
-
command = (
|
|
699
|
-
'gcloud container clusters upgrade'
|
|
700
|
-
f' {args.cluster} --project={args.project}'
|
|
701
|
-
f' --region={zone_to_region(args.zone)}'
|
|
702
|
-
f' --cluster-version={default_rapid_gke_version}'
|
|
703
|
-
' --master'
|
|
704
|
-
' --quiet'
|
|
705
|
-
)
|
|
706
|
-
xpk_print("Updating GKE cluster's control plane version, may take a while!")
|
|
707
|
-
return_code = run_command_with_updates(
|
|
708
|
-
command,
|
|
709
|
-
'GKE Cluster control plane version update to enable Cloud DNS',
|
|
710
|
-
args,
|
|
711
|
-
)
|
|
712
|
-
if return_code != 0:
|
|
713
|
-
xpk_print(
|
|
714
|
-
"GKE cluster's control plane version update request returned"
|
|
715
|
-
f' ERROR {return_code}'
|
|
716
|
-
)
|
|
717
|
-
return 1
|
|
718
|
-
return 0
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
def is_cluster_using_clouddns(args) -> bool:
|
|
722
|
-
"""Checks if cluster is using CloudDNS.
|
|
723
|
-
Args:
|
|
724
|
-
args: user provided arguments for running the command.
|
|
725
|
-
|
|
726
|
-
Returns:
|
|
727
|
-
True if cluster is using CloudDNS and False otherwise.
|
|
728
|
-
"""
|
|
729
|
-
command = (
|
|
730
|
-
f'gcloud container clusters describe {args.cluster}'
|
|
731
|
-
f' --project={args.project} --region={zone_to_region(args.zone)}'
|
|
732
|
-
' 2> /dev/null | grep "clusterDns: CLOUD_DNS"'
|
|
733
|
-
)
|
|
734
|
-
return_code, _ = run_command_for_value(
|
|
735
|
-
command,
|
|
736
|
-
'Check if Cloud DNS is enabled in cluster describe.',
|
|
737
|
-
args,
|
|
738
|
-
)
|
|
739
|
-
if return_code == 0:
|
|
740
|
-
xpk_print('Cloud DNS is enabled on the cluster, no update needed.')
|
|
741
|
-
return True
|
|
742
|
-
return False
|
|
743
|
-
|
|
744
|
-
|
|
745
639
|
def is_workload_identity_enabled_on_cluster(args) -> bool:
|
|
746
640
|
"""Checks if Workload Identity Federation is enabled on the cluster.
|
|
747
641
|
Args:
|
|
@@ -751,13 +645,12 @@ def is_workload_identity_enabled_on_cluster(args) -> bool:
|
|
|
751
645
|
"""
|
|
752
646
|
command = (
|
|
753
647
|
f'gcloud container clusters describe {args.cluster}'
|
|
754
|
-
f' --project={args.project} --
|
|
648
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
755
649
|
' --format="value(workloadIdentityConfig.workloadPool)"'
|
|
756
650
|
)
|
|
757
651
|
return_code, workload_pool = run_command_for_value(
|
|
758
652
|
command,
|
|
759
653
|
'Checks if Workload Identity Federation is enabled in cluster describe.',
|
|
760
|
-
args,
|
|
761
654
|
)
|
|
762
655
|
if return_code != 0:
|
|
763
656
|
xpk_exit(return_code)
|
|
@@ -779,13 +672,12 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
|
|
|
779
672
|
"""
|
|
780
673
|
command = (
|
|
781
674
|
f'gcloud container clusters describe {args.cluster}'
|
|
782
|
-
f' --project={args.project} --
|
|
675
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
783
676
|
' --format="value(addonsConfig.gcsFuseCsiDriverConfig.enabled)"'
|
|
784
677
|
)
|
|
785
678
|
return_code, gcsfuse_driver_enabled = run_command_for_value(
|
|
786
679
|
command,
|
|
787
680
|
'Checks if GCSFuse CSI driver is enabled in cluster describe.',
|
|
788
|
-
args,
|
|
789
681
|
)
|
|
790
682
|
if return_code != 0:
|
|
791
683
|
xpk_exit(return_code)
|
|
@@ -795,53 +687,6 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
|
|
|
795
687
|
return False
|
|
796
688
|
|
|
797
689
|
|
|
798
|
-
def update_cluster_with_clouddns_if_necessary(args) -> int:
|
|
799
|
-
"""Updates a GKE cluster to use CloudDNS, if not enabled already.
|
|
800
|
-
|
|
801
|
-
Args:
|
|
802
|
-
args: user provided arguments for running the command.
|
|
803
|
-
|
|
804
|
-
Returns:
|
|
805
|
-
0 if successful and error code otherwise.
|
|
806
|
-
"""
|
|
807
|
-
all_clusters, return_code = get_all_clusters_programmatic(args)
|
|
808
|
-
if return_code > 0:
|
|
809
|
-
xpk_print('Listing all clusters failed!')
|
|
810
|
-
return 1
|
|
811
|
-
if args.cluster in all_clusters:
|
|
812
|
-
# If cluster is already using clouddns, no update necessary!
|
|
813
|
-
if is_cluster_using_clouddns(args):
|
|
814
|
-
return 0
|
|
815
|
-
cluster_update_return_code = update_gke_cluster_with_clouddns(args)
|
|
816
|
-
if cluster_update_return_code > 0:
|
|
817
|
-
xpk_print('Updating GKE cluster to use CloudDNS failed!')
|
|
818
|
-
return cluster_update_return_code
|
|
819
|
-
|
|
820
|
-
# Find default rapid control plane version and update the control plane to the same.
|
|
821
|
-
server_config_return_code, gke_server_config = get_gke_server_config(args)
|
|
822
|
-
if server_config_return_code != 0:
|
|
823
|
-
xpk_exit(server_config_return_code)
|
|
824
|
-
assert gke_server_config
|
|
825
|
-
|
|
826
|
-
upgrade_master_return_code = upgrade_gke_control_plane_version(
|
|
827
|
-
args,
|
|
828
|
-
gke_server_config.default_rapid_gke_version,
|
|
829
|
-
)
|
|
830
|
-
if upgrade_master_return_code > 0:
|
|
831
|
-
xpk_print("Updating GKE cluster's control plane upgrade failed!")
|
|
832
|
-
return upgrade_master_return_code
|
|
833
|
-
|
|
834
|
-
# Upgrade nodepools version after the master upgrade.
|
|
835
|
-
node_pool_update_code = upgrade_gke_nodepools_version(
|
|
836
|
-
args,
|
|
837
|
-
gke_server_config.default_rapid_gke_version,
|
|
838
|
-
)
|
|
839
|
-
if node_pool_update_code > 0:
|
|
840
|
-
xpk_print('Upgrading nodepools version failed!')
|
|
841
|
-
return node_pool_update_code
|
|
842
|
-
return 0
|
|
843
|
-
|
|
844
|
-
|
|
845
690
|
def update_cluster_with_workload_identity_if_necessary(args) -> int:
|
|
846
691
|
"""Updates a GKE cluster to enable Workload Identity Federation, if not enabled already.
|
|
847
692
|
Args:
|
|
@@ -884,26 +729,78 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int:
|
|
|
884
729
|
return 0
|
|
885
730
|
|
|
886
731
|
|
|
887
|
-
def
|
|
888
|
-
"""
|
|
732
|
+
def test_and_retry_credentials_with_dns_logic(args) -> int:
|
|
733
|
+
"""Tests kubectl credentials and retries with default settings if a DNS error is found.
|
|
889
734
|
|
|
890
735
|
Args:
|
|
891
736
|
args: user provided arguments for running the command.
|
|
892
737
|
|
|
893
738
|
Returns:
|
|
894
|
-
0 if
|
|
739
|
+
0 if credentials are valid after retrying, 1 otherwise.
|
|
895
740
|
"""
|
|
896
|
-
|
|
741
|
+
|
|
742
|
+
xpk_print('Testing credentials with kubectl...')
|
|
743
|
+
kubectl_command = 'kubectl get pods'
|
|
744
|
+
kubectl_return_code, kubectl_output = run_command_for_value(
|
|
745
|
+
kubectl_command, 'kubectl get pods'
|
|
746
|
+
)
|
|
747
|
+
if kubectl_return_code == 0:
|
|
748
|
+
xpk_print('Credentials test succeeded.')
|
|
749
|
+
return 0
|
|
750
|
+
|
|
751
|
+
dns_endpoint_error = (
|
|
752
|
+
'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic'
|
|
753
|
+
' is disabled'
|
|
754
|
+
)
|
|
755
|
+
if dns_endpoint_error not in kubectl_output:
|
|
756
|
+
xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}')
|
|
757
|
+
xpk_exit(kubectl_return_code)
|
|
758
|
+
xpk_print(
|
|
759
|
+
'Detected DNS endpoint-related error. Retrying without --dns-endpoint'
|
|
760
|
+
' flag...'
|
|
761
|
+
)
|
|
762
|
+
|
|
763
|
+
location = get_cluster_location(args.project, args.cluster, args.zone)
|
|
764
|
+
without_dns_command = (
|
|
897
765
|
'gcloud container clusters get-credentials'
|
|
898
|
-
f' {args.cluster} --
|
|
766
|
+
f' {args.cluster} --location={location}'
|
|
899
767
|
f' --project={args.project} &&'
|
|
900
768
|
' kubectl config view && kubectl config set-context --current'
|
|
901
769
|
' --namespace=default'
|
|
902
770
|
)
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
command, task, args, verbose=False
|
|
771
|
+
return_code = run_command_with_updates(
|
|
772
|
+
without_dns_command, 'get-credentials to cluster', verbose=False
|
|
906
773
|
)
|
|
774
|
+
if return_code != 0:
|
|
775
|
+
xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.')
|
|
776
|
+
xpk_exit(return_code)
|
|
777
|
+
return 0
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
def get_cluster_credentials(args) -> int:
|
|
781
|
+
"""Run cluster configuration command to set the kubectl config.
|
|
782
|
+
|
|
783
|
+
Args:
|
|
784
|
+
args: user provided arguments for running the command.
|
|
785
|
+
|
|
786
|
+
Returns:
|
|
787
|
+
0 if successful and 1 otherwise.
|
|
788
|
+
"""
|
|
789
|
+
location = get_cluster_location(args.project, args.cluster, args.zone)
|
|
790
|
+
command = (
|
|
791
|
+
'gcloud container clusters get-credentials'
|
|
792
|
+
f' {args.cluster} --location={location} --dns-endpoint'
|
|
793
|
+
f' --project={args.project} && kubectl config view && kubectl config'
|
|
794
|
+
' set-context --current --namespace=default'
|
|
795
|
+
)
|
|
796
|
+
task = f'get-credentials-dns-endpoint to cluster {args.cluster}'
|
|
797
|
+
return_code = run_command_with_updates_retry(command, task, verbose=False)
|
|
798
|
+
|
|
907
799
|
if return_code != 0:
|
|
908
800
|
xpk_print(f'{task} returned ERROR {return_code}')
|
|
909
801
|
xpk_exit(return_code)
|
|
802
|
+
|
|
803
|
+
return_code = test_and_retry_credentials_with_dns_logic(args)
|
|
804
|
+
xpk_print('Finished get-credentials and kubectl setup.')
|
|
805
|
+
|
|
806
|
+
return return_code
|
xpk/core/cluster_private.py
CHANGED
|
@@ -22,7 +22,7 @@ from ..utils.network import (
|
|
|
22
22
|
from ..utils.execution_context import is_dry_run
|
|
23
23
|
from ..utils.objects import is_text_true
|
|
24
24
|
from .commands import run_command_for_value, run_command_with_updates
|
|
25
|
-
from .gcloud_context import
|
|
25
|
+
from .gcloud_context import get_cluster_location
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def authorize_private_cluster_access_if_necessary(args) -> int:
|
|
@@ -127,13 +127,12 @@ def is_cluster_private(args) -> bool:
|
|
|
127
127
|
"""
|
|
128
128
|
command = (
|
|
129
129
|
f'gcloud container clusters describe {args.cluster}'
|
|
130
|
-
f' --project={args.project} --
|
|
130
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
131
131
|
' --format="value(privateClusterConfig.enablePrivateNodes)"'
|
|
132
132
|
)
|
|
133
133
|
return_code, private_nodes_enabled = run_command_for_value(
|
|
134
134
|
command,
|
|
135
135
|
'Check if Private Nodes is enabled in cluster.',
|
|
136
|
-
args,
|
|
137
136
|
)
|
|
138
137
|
|
|
139
138
|
if return_code != 0:
|
|
@@ -158,13 +157,12 @@ def get_cluster_authorized_networks(args) -> list[str]:
|
|
|
158
157
|
"""
|
|
159
158
|
command = (
|
|
160
159
|
f'gcloud container clusters describe {args.cluster}'
|
|
161
|
-
f' --project={args.project} --
|
|
160
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
162
161
|
' --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)"'
|
|
163
162
|
)
|
|
164
163
|
return_code, authorized_networks = run_command_for_value(
|
|
165
164
|
command,
|
|
166
165
|
'Fetching the list of authorized network from cluster describe.',
|
|
167
|
-
args,
|
|
168
166
|
dry_run_return_val='127.0.0.1/32',
|
|
169
167
|
)
|
|
170
168
|
|
|
@@ -189,15 +187,12 @@ def update_cluster_authorized_networks(args, authorized_networks) -> int:
|
|
|
189
187
|
"""
|
|
190
188
|
command = (
|
|
191
189
|
'gcloud container clusters update'
|
|
192
|
-
f' {args.cluster} --project={args.project}'
|
|
193
|
-
f' --
|
|
194
|
-
' --enable-master-authorized-networks'
|
|
195
|
-
f' --master-authorized-networks={",".join(authorized_networks)}'
|
|
196
|
-
' --quiet'
|
|
190
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-master-authorized-networks'
|
|
191
|
+
f' --master-authorized-networks={",".join(authorized_networks)} --quiet'
|
|
197
192
|
)
|
|
198
193
|
|
|
199
194
|
return_code = run_command_with_updates(
|
|
200
|
-
command, 'GKE Cluster Update master authorized networks'
|
|
195
|
+
command, 'GKE Cluster Update master authorized networks'
|
|
201
196
|
)
|
|
202
197
|
|
|
203
198
|
if return_code != 0:
|