xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +17 -10
- xpk/commands/cluster.py +137 -123
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +13 -27
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +22 -11
- xpk/commands/job.py +53 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +26 -2
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +58 -30
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +96 -195
- xpk/core/cluster_private.py +9 -12
- xpk/core/commands.py +21 -25
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +17 -9
- xpk/core/docker_resources.py +9 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +5 -8
- xpk/core/kjob.py +19 -29
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +11 -16
- xpk/core/network.py +18 -19
- xpk/core/nodepool.py +65 -71
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +9 -5
- xpk/core/ray.py +11 -15
- xpk/core/resources.py +15 -10
- xpk/core/scheduling.py +23 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +3 -2
- xpk/parser/cluster.py +50 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/execution_context.py +28 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/kueue.py +20 -0
- xpk/utils/network.py +4 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -545
- xpk-0.12.0.dist-info/RECORD +0 -100
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/cluster.py
CHANGED
|
@@ -30,15 +30,14 @@ from .commands import (
|
|
|
30
30
|
)
|
|
31
31
|
from .gcloud_context import (
|
|
32
32
|
add_zone_and_project,
|
|
33
|
-
|
|
33
|
+
get_cluster_location,
|
|
34
34
|
zone_to_region,
|
|
35
35
|
)
|
|
36
|
-
from .nodepool import upgrade_gke_nodepools_version
|
|
37
36
|
from .resources import get_cluster_system_characteristics
|
|
38
37
|
from .system_characteristics import SystemCharacteristics
|
|
39
38
|
|
|
40
39
|
JOBSET_VERSION = 'v0.8.0'
|
|
41
|
-
PATHWAYS_JOB_VERSION = 'v0.1.
|
|
40
|
+
PATHWAYS_JOB_VERSION = 'v0.1.3'
|
|
42
41
|
INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
|
|
43
42
|
INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
|
|
44
43
|
INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
|
|
@@ -66,7 +65,7 @@ def set_jobset_on_cluster(args) -> int:
|
|
|
66
65
|
f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
|
|
67
66
|
)
|
|
68
67
|
task = f'Install Jobset on {args.cluster}'
|
|
69
|
-
return_code = run_command_with_updates_retry(command, task
|
|
68
|
+
return_code = run_command_with_updates_retry(command, task)
|
|
70
69
|
|
|
71
70
|
if return_code != 0:
|
|
72
71
|
xpk_print(f'{task} returned with ERROR {return_code}.\n')
|
|
@@ -95,7 +94,7 @@ def set_pathways_job_on_cluster(args) -> int:
|
|
|
95
94
|
f' https://github.com/google/pathways-job/releases/download/{PATHWAYS_JOB_VERSION}/install.yaml'
|
|
96
95
|
)
|
|
97
96
|
task = f'Install PathwaysJob on {args.cluster}'
|
|
98
|
-
return_code = run_command_with_updates_retry(command, task
|
|
97
|
+
return_code = run_command_with_updates_retry(command, task)
|
|
99
98
|
|
|
100
99
|
if return_code != 0:
|
|
101
100
|
xpk_print(f'{task} returned with ERROR {return_code}.\n')
|
|
@@ -110,11 +109,10 @@ def set_pathways_job_on_cluster(args) -> int:
|
|
|
110
109
|
return return_code
|
|
111
110
|
|
|
112
111
|
|
|
113
|
-
def install_nccl_on_cluster(
|
|
112
|
+
def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
|
|
114
113
|
"""Install NCCL plugin on the cluster.
|
|
115
114
|
|
|
116
115
|
Args:
|
|
117
|
-
args: user provided arguments for running the command.
|
|
118
116
|
system: system characteristics.
|
|
119
117
|
|
|
120
118
|
Returns:
|
|
@@ -128,7 +126,7 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
|
128
126
|
command = f'kubectl apply -f {INSTALLER_NCCL_TCPXO}'
|
|
129
127
|
|
|
130
128
|
return_code = run_command_with_updates(
|
|
131
|
-
command, 'Install NCCL Plugin On Cluster'
|
|
129
|
+
command, 'Install NCCL Plugin On Cluster'
|
|
132
130
|
)
|
|
133
131
|
|
|
134
132
|
if return_code != 0:
|
|
@@ -141,7 +139,7 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
|
141
139
|
command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
|
|
142
140
|
|
|
143
141
|
return_code = run_command_with_updates(
|
|
144
|
-
command, 'Install NCCL Config On Cluster'
|
|
142
|
+
command, 'Install NCCL Config On Cluster'
|
|
145
143
|
)
|
|
146
144
|
|
|
147
145
|
if return_code != 0:
|
|
@@ -153,19 +151,14 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
|
153
151
|
return 0
|
|
154
152
|
|
|
155
153
|
|
|
156
|
-
def disable_mglru_on_cluster(
|
|
154
|
+
def disable_mglru_on_cluster() -> int:
|
|
157
155
|
"""Disable MGLRU on the cluster.
|
|
158
156
|
|
|
159
|
-
Args:
|
|
160
|
-
args: user provided arguments for running the command.
|
|
161
|
-
|
|
162
157
|
Returns:
|
|
163
158
|
0 if successful and 1 otherwise.
|
|
164
159
|
"""
|
|
165
160
|
command = f'kubectl apply -f {MGLRU_DISABLE}'
|
|
166
|
-
return_code = run_command_with_updates(
|
|
167
|
-
command, 'Disable MGLRU On Cluster', args
|
|
168
|
-
)
|
|
161
|
+
return_code = run_command_with_updates(command, 'Disable MGLRU On Cluster')
|
|
169
162
|
|
|
170
163
|
if return_code != 0:
|
|
171
164
|
xpk_print('Disablig MGLRU On Cluster request returned ERROR')
|
|
@@ -174,11 +167,10 @@ def disable_mglru_on_cluster(args) -> int:
|
|
|
174
167
|
return 0
|
|
175
168
|
|
|
176
169
|
|
|
177
|
-
def install_nri_on_cluster(
|
|
170
|
+
def install_nri_on_cluster() -> int:
|
|
178
171
|
"""Install NRI Device Injector on the cluster.
|
|
179
172
|
|
|
180
173
|
Args:
|
|
181
|
-
args: user provided arguments for running the command.
|
|
182
174
|
system: system characteristics.
|
|
183
175
|
|
|
184
176
|
Returns:
|
|
@@ -186,7 +178,7 @@ def install_nri_on_cluster(args) -> int:
|
|
|
186
178
|
"""
|
|
187
179
|
command = f'kubectl apply -f {NRI_DEVICE_INJECTOR}'
|
|
188
180
|
return_code = run_command_with_updates(
|
|
189
|
-
command, 'Install NRI Device Injector On Cluster'
|
|
181
|
+
command, 'Install NRI Device Injector On Cluster'
|
|
190
182
|
)
|
|
191
183
|
|
|
192
184
|
if return_code != 0:
|
|
@@ -199,12 +191,9 @@ def install_nri_on_cluster(args) -> int:
|
|
|
199
191
|
return 0
|
|
200
192
|
|
|
201
193
|
|
|
202
|
-
def get_cluster_nodes_info(
|
|
194
|
+
def get_cluster_nodes_info() -> list[dict]:
|
|
203
195
|
"""Get list of cluster's nodes descrition in yaml format
|
|
204
196
|
|
|
205
|
-
Args:
|
|
206
|
-
args: user provided arguments for running the command.
|
|
207
|
-
|
|
208
197
|
Returns:
|
|
209
198
|
List of nodes info yaml objects.
|
|
210
199
|
"""
|
|
@@ -213,7 +202,6 @@ def get_cluster_nodes_info(args) -> list[dict]:
|
|
|
213
202
|
err_code, val = run_command_for_value(
|
|
214
203
|
command=command,
|
|
215
204
|
task='Get cluster nodes info',
|
|
216
|
-
global_args=args,
|
|
217
205
|
)
|
|
218
206
|
if err_code != 0:
|
|
219
207
|
xpk_exit(err_code)
|
|
@@ -221,9 +209,9 @@ def get_cluster_nodes_info(args) -> list[dict]:
|
|
|
221
209
|
return data['items']
|
|
222
210
|
|
|
223
211
|
|
|
224
|
-
def count_nodes_on_cluster(
|
|
212
|
+
def count_nodes_on_cluster(system: SystemCharacteristics) -> int:
|
|
225
213
|
"""Count cluster nodes by accelerator type"""
|
|
226
|
-
nodes_info = get_cluster_nodes_info(
|
|
214
|
+
nodes_info = get_cluster_nodes_info()
|
|
227
215
|
accelerators = [
|
|
228
216
|
node['metadata']['labels']['cloud.google.com/gke-accelerator']
|
|
229
217
|
for node in nodes_info
|
|
@@ -243,12 +231,11 @@ def get_cluster_network(args) -> str:
|
|
|
243
231
|
xpk_print("Getting cluster's VPC network...")
|
|
244
232
|
cluster_network_cmd = (
|
|
245
233
|
'gcloud container clusters describe'
|
|
246
|
-
f' {args.cluster} --
|
|
234
|
+
f' {args.cluster} --location={get_cluster_location(args.project, args.cluster, args.zone)} --project={args.project} --format="value(network)"'
|
|
247
235
|
)
|
|
248
236
|
err_code, val = run_command_for_value(
|
|
249
237
|
command=cluster_network_cmd,
|
|
250
238
|
task='Get network cluster is in',
|
|
251
|
-
global_args=args,
|
|
252
239
|
)
|
|
253
240
|
if err_code != 0:
|
|
254
241
|
xpk_exit(err_code)
|
|
@@ -354,14 +341,13 @@ def is_driver_enabled_on_cluster(
|
|
|
354
341
|
"""
|
|
355
342
|
command = (
|
|
356
343
|
f'gcloud container clusters describe {args.cluster}'
|
|
357
|
-
f' --project={args.project} --
|
|
344
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
358
345
|
f' --format="value(addonsConfig.{driver}Config.{config_key})"'
|
|
359
346
|
)
|
|
360
347
|
return_code, driver_enabled = run_command_for_value(
|
|
361
348
|
command,
|
|
362
349
|
f"Checks if {driver} driver's {config_key} is enabled in cluster"
|
|
363
350
|
' describe.',
|
|
364
|
-
args,
|
|
365
351
|
)
|
|
366
352
|
if return_code != 0:
|
|
367
353
|
xpk_exit(return_code)
|
|
@@ -382,14 +368,12 @@ def update_gke_cluster_with_addon(args, addon: str) -> int:
|
|
|
382
368
|
"""
|
|
383
369
|
command = (
|
|
384
370
|
'gcloud container clusters update'
|
|
385
|
-
f' {args.cluster} --project={args.project}'
|
|
386
|
-
f'
|
|
387
|
-
f' --update-addons {addon}=ENABLED'
|
|
388
|
-
' --quiet'
|
|
371
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --update-addons'
|
|
372
|
+
f' {addon}=ENABLED --quiet'
|
|
389
373
|
)
|
|
390
374
|
xpk_print(f'Updating GKE cluster to enable {addon}, may take a while!')
|
|
391
375
|
return_code = run_command_with_updates(
|
|
392
|
-
command, f'GKE Cluster Update to enable {addon}'
|
|
376
|
+
command, f'GKE Cluster Update to enable {addon}'
|
|
393
377
|
)
|
|
394
378
|
if return_code != 0:
|
|
395
379
|
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
@@ -408,11 +392,12 @@ def get_all_clusters_programmatic(args) -> tuple[list[str], int]:
|
|
|
408
392
|
"""
|
|
409
393
|
command = (
|
|
410
394
|
'gcloud container clusters list'
|
|
411
|
-
f' --project={args.project}
|
|
395
|
+
f' --project={args.project} '
|
|
396
|
+
f'--filter=location~"{zone_to_region(args.zone)}.*"'
|
|
412
397
|
' --format="csv[no-heading](name)"'
|
|
413
398
|
)
|
|
414
399
|
return_code, raw_cluster_output = run_command_for_value(
|
|
415
|
-
command, 'Find if Cluster Exists'
|
|
400
|
+
command, 'Find if Cluster Exists'
|
|
416
401
|
)
|
|
417
402
|
if return_code != 0:
|
|
418
403
|
xpk_print(f'Find if Cluster Exists returned ERROR {return_code}')
|
|
@@ -442,7 +427,11 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
|
|
|
442
427
|
if not getattr(args, 'kind_cluster', False):
|
|
443
428
|
add_zone_and_project(args)
|
|
444
429
|
get_cluster_credentials(args)
|
|
445
|
-
args.project_number =
|
|
430
|
+
args.project_number = (
|
|
431
|
+
project_id_to_project_number(args.project)
|
|
432
|
+
if not args.dry_run
|
|
433
|
+
else abs(hash(args.project) % (10**12)) # 12 digit hash
|
|
434
|
+
)
|
|
446
435
|
|
|
447
436
|
config.load_kube_config()
|
|
448
437
|
return k8s_client.ApiClient()
|
|
@@ -574,34 +563,6 @@ def create_role_binding(sa: str, role_name: str) -> None:
|
|
|
574
563
|
xpk_exit(1)
|
|
575
564
|
|
|
576
565
|
|
|
577
|
-
def update_gke_cluster_with_clouddns(args) -> int:
|
|
578
|
-
"""Run the GKE cluster update command for existing clusters and enable CloudDNS.
|
|
579
|
-
|
|
580
|
-
Args:
|
|
581
|
-
args: user provided arguments for running the command.
|
|
582
|
-
|
|
583
|
-
Returns:
|
|
584
|
-
0 if successful and 1 otherwise.
|
|
585
|
-
"""
|
|
586
|
-
command = (
|
|
587
|
-
'gcloud container clusters update'
|
|
588
|
-
f' {args.cluster} --project={args.project}'
|
|
589
|
-
f' --region={zone_to_region(args.zone)}'
|
|
590
|
-
' --cluster-dns=clouddns'
|
|
591
|
-
' --cluster-dns-scope=vpc'
|
|
592
|
-
f' --cluster-dns-domain={args.cluster}-domain'
|
|
593
|
-
' --quiet'
|
|
594
|
-
)
|
|
595
|
-
xpk_print('Updating GKE cluster to use Cloud DNS, may take a while!')
|
|
596
|
-
return_code = run_command_with_updates(
|
|
597
|
-
command, 'GKE Cluster Update to enable Cloud DNS', args
|
|
598
|
-
)
|
|
599
|
-
if return_code != 0:
|
|
600
|
-
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
601
|
-
return 1
|
|
602
|
-
return 0
|
|
603
|
-
|
|
604
|
-
|
|
605
566
|
def update_gke_cluster_with_workload_identity_enabled(args) -> int:
|
|
606
567
|
"""Run the GKE cluster update command for existing cluster and enable Workload Identity Federation.
|
|
607
568
|
Args:
|
|
@@ -611,9 +572,7 @@ def update_gke_cluster_with_workload_identity_enabled(args) -> int:
|
|
|
611
572
|
"""
|
|
612
573
|
command = (
|
|
613
574
|
'gcloud container clusters update'
|
|
614
|
-
f' {args.cluster} --project={args.project}'
|
|
615
|
-
f' --region={zone_to_region(args.zone)}'
|
|
616
|
-
f' --workload-pool={args.project}.svc.id.goog'
|
|
575
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --workload-pool={args.project}.svc.id.goog'
|
|
617
576
|
' --quiet'
|
|
618
577
|
)
|
|
619
578
|
xpk_print(
|
|
@@ -621,7 +580,7 @@ def update_gke_cluster_with_workload_identity_enabled(args) -> int:
|
|
|
621
580
|
' while!'
|
|
622
581
|
)
|
|
623
582
|
return_code = run_command_with_updates(
|
|
624
|
-
command, 'GKE Cluster Update to enable Workload Identity Federation'
|
|
583
|
+
command, 'GKE Cluster Update to enable Workload Identity Federation'
|
|
625
584
|
)
|
|
626
585
|
if return_code != 0:
|
|
627
586
|
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
@@ -638,16 +597,14 @@ def update_gke_cluster_with_gcsfuse_driver_enabled(args) -> int:
|
|
|
638
597
|
"""
|
|
639
598
|
command = (
|
|
640
599
|
'gcloud container clusters update'
|
|
641
|
-
f' {args.cluster} --project={args.project}'
|
|
642
|
-
|
|
643
|
-
' --update-addons GcsFuseCsiDriver=ENABLED'
|
|
644
|
-
' --quiet'
|
|
600
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --update-addons'
|
|
601
|
+
' GcsFuseCsiDriver=ENABLED --quiet'
|
|
645
602
|
)
|
|
646
603
|
xpk_print(
|
|
647
604
|
'Updating GKE cluster to enable GCSFuse CSI driver, may take a while!'
|
|
648
605
|
)
|
|
649
606
|
return_code = run_command_with_updates(
|
|
650
|
-
command, 'GKE Cluster Update to enable GCSFuse CSI driver'
|
|
607
|
+
command, 'GKE Cluster Update to enable GCSFuse CSI driver'
|
|
651
608
|
)
|
|
652
609
|
if return_code != 0:
|
|
653
610
|
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
@@ -664,16 +621,14 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
|
|
|
664
621
|
"""
|
|
665
622
|
command = (
|
|
666
623
|
'gcloud container clusters update'
|
|
667
|
-
f' {args.cluster} --project={args.project}'
|
|
668
|
-
f' --region={zone_to_region(args.zone)}'
|
|
669
|
-
' --enable-legacy-lustre-port'
|
|
624
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-legacy-lustre-port'
|
|
670
625
|
' --quiet'
|
|
671
626
|
)
|
|
672
627
|
xpk_print(
|
|
673
628
|
'Updating GKE cluster to enable Lustre CSI driver, may take a while!'
|
|
674
629
|
)
|
|
675
630
|
return_code = run_command_with_updates(
|
|
676
|
-
command, 'GKE Cluster Update to enable Lustre CSI driver'
|
|
631
|
+
command, 'GKE Cluster Update to enable Lustre CSI driver'
|
|
677
632
|
)
|
|
678
633
|
if return_code != 0:
|
|
679
634
|
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
@@ -681,63 +636,6 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
|
|
|
681
636
|
return 0
|
|
682
637
|
|
|
683
638
|
|
|
684
|
-
def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int:
|
|
685
|
-
"""Upgrade GKE cluster's control plane version before updating nodepools to use CloudDNS.
|
|
686
|
-
|
|
687
|
-
Args:
|
|
688
|
-
args: user provided arguments for running the command.
|
|
689
|
-
default_rapid_gke_version: Rapid default version for the upgrade.
|
|
690
|
-
|
|
691
|
-
Returns:
|
|
692
|
-
0 if successful and 1 otherwise.
|
|
693
|
-
"""
|
|
694
|
-
command = (
|
|
695
|
-
'gcloud container clusters upgrade'
|
|
696
|
-
f' {args.cluster} --project={args.project}'
|
|
697
|
-
f' --region={zone_to_region(args.zone)}'
|
|
698
|
-
f' --cluster-version={default_rapid_gke_version}'
|
|
699
|
-
' --master'
|
|
700
|
-
' --quiet'
|
|
701
|
-
)
|
|
702
|
-
xpk_print("Updating GKE cluster's control plane version, may take a while!")
|
|
703
|
-
return_code = run_command_with_updates(
|
|
704
|
-
command,
|
|
705
|
-
'GKE Cluster control plane version update to enable Cloud DNS',
|
|
706
|
-
args,
|
|
707
|
-
)
|
|
708
|
-
if return_code != 0:
|
|
709
|
-
xpk_print(
|
|
710
|
-
"GKE cluster's control plane version update request returned"
|
|
711
|
-
f' ERROR {return_code}'
|
|
712
|
-
)
|
|
713
|
-
return 1
|
|
714
|
-
return 0
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
def is_cluster_using_clouddns(args) -> bool:
|
|
718
|
-
"""Checks if cluster is using CloudDNS.
|
|
719
|
-
Args:
|
|
720
|
-
args: user provided arguments for running the command.
|
|
721
|
-
|
|
722
|
-
Returns:
|
|
723
|
-
True if cluster is using CloudDNS and False otherwise.
|
|
724
|
-
"""
|
|
725
|
-
command = (
|
|
726
|
-
f'gcloud container clusters describe {args.cluster}'
|
|
727
|
-
f' --project={args.project} --region={zone_to_region(args.zone)}'
|
|
728
|
-
' 2> /dev/null | grep "clusterDns: CLOUD_DNS"'
|
|
729
|
-
)
|
|
730
|
-
return_code, _ = run_command_for_value(
|
|
731
|
-
command,
|
|
732
|
-
'Check if Cloud DNS is enabled in cluster describe.',
|
|
733
|
-
args,
|
|
734
|
-
)
|
|
735
|
-
if return_code == 0:
|
|
736
|
-
xpk_print('Cloud DNS is enabled on the cluster, no update needed.')
|
|
737
|
-
return True
|
|
738
|
-
return False
|
|
739
|
-
|
|
740
|
-
|
|
741
639
|
def is_workload_identity_enabled_on_cluster(args) -> bool:
|
|
742
640
|
"""Checks if Workload Identity Federation is enabled on the cluster.
|
|
743
641
|
Args:
|
|
@@ -747,13 +645,12 @@ def is_workload_identity_enabled_on_cluster(args) -> bool:
|
|
|
747
645
|
"""
|
|
748
646
|
command = (
|
|
749
647
|
f'gcloud container clusters describe {args.cluster}'
|
|
750
|
-
f' --project={args.project} --
|
|
648
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
751
649
|
' --format="value(workloadIdentityConfig.workloadPool)"'
|
|
752
650
|
)
|
|
753
651
|
return_code, workload_pool = run_command_for_value(
|
|
754
652
|
command,
|
|
755
653
|
'Checks if Workload Identity Federation is enabled in cluster describe.',
|
|
756
|
-
args,
|
|
757
654
|
)
|
|
758
655
|
if return_code != 0:
|
|
759
656
|
xpk_exit(return_code)
|
|
@@ -775,13 +672,12 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
|
|
|
775
672
|
"""
|
|
776
673
|
command = (
|
|
777
674
|
f'gcloud container clusters describe {args.cluster}'
|
|
778
|
-
f' --project={args.project} --
|
|
675
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
779
676
|
' --format="value(addonsConfig.gcsFuseCsiDriverConfig.enabled)"'
|
|
780
677
|
)
|
|
781
678
|
return_code, gcsfuse_driver_enabled = run_command_for_value(
|
|
782
679
|
command,
|
|
783
680
|
'Checks if GCSFuse CSI driver is enabled in cluster describe.',
|
|
784
|
-
args,
|
|
785
681
|
)
|
|
786
682
|
if return_code != 0:
|
|
787
683
|
xpk_exit(return_code)
|
|
@@ -791,53 +687,6 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
|
|
|
791
687
|
return False
|
|
792
688
|
|
|
793
689
|
|
|
794
|
-
def update_cluster_with_clouddns_if_necessary(args) -> int:
|
|
795
|
-
"""Updates a GKE cluster to use CloudDNS, if not enabled already.
|
|
796
|
-
|
|
797
|
-
Args:
|
|
798
|
-
args: user provided arguments for running the command.
|
|
799
|
-
|
|
800
|
-
Returns:
|
|
801
|
-
0 if successful and error code otherwise.
|
|
802
|
-
"""
|
|
803
|
-
all_clusters, return_code = get_all_clusters_programmatic(args)
|
|
804
|
-
if return_code > 0:
|
|
805
|
-
xpk_print('Listing all clusters failed!')
|
|
806
|
-
return 1
|
|
807
|
-
if args.cluster in all_clusters:
|
|
808
|
-
# If cluster is already using clouddns, no update necessary!
|
|
809
|
-
if is_cluster_using_clouddns(args):
|
|
810
|
-
return 0
|
|
811
|
-
cluster_update_return_code = update_gke_cluster_with_clouddns(args)
|
|
812
|
-
if cluster_update_return_code > 0:
|
|
813
|
-
xpk_print('Updating GKE cluster to use CloudDNS failed!')
|
|
814
|
-
return cluster_update_return_code
|
|
815
|
-
|
|
816
|
-
# Find default rapid control plane version and update the control plane to the same.
|
|
817
|
-
server_config_return_code, gke_server_config = get_gke_server_config(args)
|
|
818
|
-
if server_config_return_code != 0:
|
|
819
|
-
xpk_exit(server_config_return_code)
|
|
820
|
-
assert gke_server_config
|
|
821
|
-
|
|
822
|
-
upgrade_master_return_code = upgrade_gke_control_plane_version(
|
|
823
|
-
args,
|
|
824
|
-
gke_server_config.default_rapid_gke_version,
|
|
825
|
-
)
|
|
826
|
-
if upgrade_master_return_code > 0:
|
|
827
|
-
xpk_print("Updating GKE cluster's control plane upgrade failed!")
|
|
828
|
-
return upgrade_master_return_code
|
|
829
|
-
|
|
830
|
-
# Upgrade nodepools version after the master upgrade.
|
|
831
|
-
node_pool_update_code = upgrade_gke_nodepools_version(
|
|
832
|
-
args,
|
|
833
|
-
gke_server_config.default_rapid_gke_version,
|
|
834
|
-
)
|
|
835
|
-
if node_pool_update_code > 0:
|
|
836
|
-
xpk_print('Upgrading nodepools version failed!')
|
|
837
|
-
return node_pool_update_code
|
|
838
|
-
return 0
|
|
839
|
-
|
|
840
|
-
|
|
841
690
|
def update_cluster_with_workload_identity_if_necessary(args) -> int:
|
|
842
691
|
"""Updates a GKE cluster to enable Workload Identity Federation, if not enabled already.
|
|
843
692
|
Args:
|
|
@@ -880,26 +729,78 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int:
|
|
|
880
729
|
return 0
|
|
881
730
|
|
|
882
731
|
|
|
883
|
-
def
|
|
884
|
-
"""
|
|
732
|
+
def test_and_retry_credentials_with_dns_logic(args) -> int:
|
|
733
|
+
"""Tests kubectl credentials and retries with default settings if a DNS error is found.
|
|
885
734
|
|
|
886
735
|
Args:
|
|
887
736
|
args: user provided arguments for running the command.
|
|
888
737
|
|
|
889
738
|
Returns:
|
|
890
|
-
0 if
|
|
739
|
+
0 if credentials are valid after retrying, 1 otherwise.
|
|
891
740
|
"""
|
|
892
|
-
|
|
741
|
+
|
|
742
|
+
xpk_print('Testing credentials with kubectl...')
|
|
743
|
+
kubectl_command = 'kubectl get pods'
|
|
744
|
+
kubectl_return_code, kubectl_output = run_command_for_value(
|
|
745
|
+
kubectl_command, 'kubectl get pods'
|
|
746
|
+
)
|
|
747
|
+
if kubectl_return_code == 0:
|
|
748
|
+
xpk_print('Credentials test succeeded.')
|
|
749
|
+
return 0
|
|
750
|
+
|
|
751
|
+
dns_endpoint_error = (
|
|
752
|
+
'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic'
|
|
753
|
+
' is disabled'
|
|
754
|
+
)
|
|
755
|
+
if dns_endpoint_error not in kubectl_output:
|
|
756
|
+
xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}')
|
|
757
|
+
xpk_exit(kubectl_return_code)
|
|
758
|
+
xpk_print(
|
|
759
|
+
'Detected DNS endpoint-related error. Retrying without --dns-endpoint'
|
|
760
|
+
' flag...'
|
|
761
|
+
)
|
|
762
|
+
|
|
763
|
+
location = get_cluster_location(args.project, args.cluster, args.zone)
|
|
764
|
+
without_dns_command = (
|
|
893
765
|
'gcloud container clusters get-credentials'
|
|
894
|
-
f' {args.cluster} --
|
|
766
|
+
f' {args.cluster} --location={location}'
|
|
895
767
|
f' --project={args.project} &&'
|
|
896
768
|
' kubectl config view && kubectl config set-context --current'
|
|
897
769
|
' --namespace=default'
|
|
898
770
|
)
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
command, task, args, verbose=False
|
|
771
|
+
return_code = run_command_with_updates(
|
|
772
|
+
without_dns_command, 'get-credentials to cluster', verbose=False
|
|
902
773
|
)
|
|
774
|
+
if return_code != 0:
|
|
775
|
+
xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.')
|
|
776
|
+
xpk_exit(return_code)
|
|
777
|
+
return 0
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
def get_cluster_credentials(args) -> int:
|
|
781
|
+
"""Run cluster configuration command to set the kubectl config.
|
|
782
|
+
|
|
783
|
+
Args:
|
|
784
|
+
args: user provided arguments for running the command.
|
|
785
|
+
|
|
786
|
+
Returns:
|
|
787
|
+
0 if successful and 1 otherwise.
|
|
788
|
+
"""
|
|
789
|
+
location = get_cluster_location(args.project, args.cluster, args.zone)
|
|
790
|
+
command = (
|
|
791
|
+
'gcloud container clusters get-credentials'
|
|
792
|
+
f' {args.cluster} --location={location} --dns-endpoint'
|
|
793
|
+
f' --project={args.project} && kubectl config view && kubectl config'
|
|
794
|
+
' set-context --current --namespace=default'
|
|
795
|
+
)
|
|
796
|
+
task = f'get-credentials-dns-endpoint to cluster {args.cluster}'
|
|
797
|
+
return_code = run_command_with_updates_retry(command, task, verbose=False)
|
|
798
|
+
|
|
903
799
|
if return_code != 0:
|
|
904
800
|
xpk_print(f'{task} returned ERROR {return_code}')
|
|
905
801
|
xpk_exit(return_code)
|
|
802
|
+
|
|
803
|
+
return_code = test_and_retry_credentials_with_dns_logic(args)
|
|
804
|
+
xpk_print('Finished get-credentials and kubectl setup.')
|
|
805
|
+
|
|
806
|
+
return return_code
|
xpk/core/cluster_private.py
CHANGED
|
@@ -19,9 +19,10 @@ from ..utils.network import (
|
|
|
19
19
|
add_current_machine_to_networks,
|
|
20
20
|
is_current_machine_in_any_network,
|
|
21
21
|
)
|
|
22
|
+
from ..utils.execution_context import is_dry_run
|
|
22
23
|
from ..utils.objects import is_text_true
|
|
23
24
|
from .commands import run_command_for_value, run_command_with_updates
|
|
24
|
-
from .gcloud_context import
|
|
25
|
+
from .gcloud_context import get_cluster_location
|
|
25
26
|
|
|
26
27
|
|
|
27
28
|
def authorize_private_cluster_access_if_necessary(args) -> int:
|
|
@@ -37,7 +38,7 @@ def authorize_private_cluster_access_if_necessary(args) -> int:
|
|
|
37
38
|
if not args.private and args.authorized_networks is None:
|
|
38
39
|
xpk_print('Cluster is public and no need to authorize networks.')
|
|
39
40
|
return 0
|
|
40
|
-
|
|
41
|
+
elif not is_dry_run():
|
|
41
42
|
xpk_print(
|
|
42
43
|
'Cannot convert an existing public cluster to private. The arguments'
|
|
43
44
|
' --private and --authorized-networks are not acceptable for public'
|
|
@@ -126,13 +127,12 @@ def is_cluster_private(args) -> bool:
|
|
|
126
127
|
"""
|
|
127
128
|
command = (
|
|
128
129
|
f'gcloud container clusters describe {args.cluster}'
|
|
129
|
-
f' --project={args.project} --
|
|
130
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
130
131
|
' --format="value(privateClusterConfig.enablePrivateNodes)"'
|
|
131
132
|
)
|
|
132
133
|
return_code, private_nodes_enabled = run_command_for_value(
|
|
133
134
|
command,
|
|
134
135
|
'Check if Private Nodes is enabled in cluster.',
|
|
135
|
-
args,
|
|
136
136
|
)
|
|
137
137
|
|
|
138
138
|
if return_code != 0:
|
|
@@ -157,13 +157,13 @@ def get_cluster_authorized_networks(args) -> list[str]:
|
|
|
157
157
|
"""
|
|
158
158
|
command = (
|
|
159
159
|
f'gcloud container clusters describe {args.cluster}'
|
|
160
|
-
f' --project={args.project} --
|
|
160
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
161
161
|
' --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)"'
|
|
162
162
|
)
|
|
163
163
|
return_code, authorized_networks = run_command_for_value(
|
|
164
164
|
command,
|
|
165
165
|
'Fetching the list of authorized network from cluster describe.',
|
|
166
|
-
|
|
166
|
+
dry_run_return_val='127.0.0.1/32',
|
|
167
167
|
)
|
|
168
168
|
|
|
169
169
|
if return_code != 0:
|
|
@@ -187,15 +187,12 @@ def update_cluster_authorized_networks(args, authorized_networks) -> int:
|
|
|
187
187
|
"""
|
|
188
188
|
command = (
|
|
189
189
|
'gcloud container clusters update'
|
|
190
|
-
f' {args.cluster} --project={args.project}'
|
|
191
|
-
f' --
|
|
192
|
-
' --enable-master-authorized-networks'
|
|
193
|
-
f' --master-authorized-networks={",".join(authorized_networks)}'
|
|
194
|
-
' --quiet'
|
|
190
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-master-authorized-networks'
|
|
191
|
+
f' --master-authorized-networks={",".join(authorized_networks)} --quiet'
|
|
195
192
|
)
|
|
196
193
|
|
|
197
194
|
return_code = run_command_with_updates(
|
|
198
|
-
command, 'GKE Cluster Update master authorized networks'
|
|
195
|
+
command, 'GKE Cluster Update master authorized networks'
|
|
199
196
|
)
|
|
200
197
|
|
|
201
198
|
if return_code != 0:
|