xpk 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/batch.py +27 -5
- xpk/commands/cluster.py +104 -80
- xpk/commands/cluster_gcluster.py +94 -10
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +8 -10
- xpk/commands/inspector.py +5 -11
- xpk/commands/job.py +9 -7
- xpk/commands/kind.py +34 -4
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +27 -7
- xpk/commands/storage.py +280 -0
- xpk/commands/version.py +6 -18
- xpk/commands/workload.py +381 -184
- xpk/core/blueprint/blueprint_definitions.py +1 -0
- xpk/core/blueprint/blueprint_generator.py +132 -76
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +6 -3
- xpk/core/commands.py +18 -14
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +20 -2
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +257 -18
- xpk/core/kueue.py +12 -6
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +32 -20
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +124 -45
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +38 -1
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +209 -1
- xpk/core/workload_decorators/rdma_decorator.py +25 -5
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
- xpk/main.py +3 -1
- xpk/parser/batch.py +10 -151
- xpk/parser/cluster.py +49 -8
- xpk/parser/common.py +189 -1
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +27 -1
- xpk/parser/info.py +2 -1
- xpk/parser/inspector.py +3 -3
- xpk/parser/job.py +25 -4
- xpk/parser/kind.py +3 -2
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +10 -1
- xpk/parser/storage.py +326 -0
- xpk/parser/validators.py +3 -3
- xpk/parser/workload.py +118 -76
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/objects.py +8 -5
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
- xpk-0.7.1.dist-info/RECORD +92 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
- xpk/core/core.py +0 -2824
- xpk-0.6.0.dist-info/RECORD +0 -57
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/core/cluster.py
ADDED
|
@@ -0,0 +1,564 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from google.api_core.exceptions import PermissionDenied
|
|
18
|
+
from google.cloud import resourcemanager_v3
|
|
19
|
+
from kubernetes import client as k8s_client
|
|
20
|
+
from kubernetes import config
|
|
21
|
+
from kubernetes.client.exceptions import ApiException
|
|
22
|
+
from .resources import get_cluster_system_characteristics
|
|
23
|
+
|
|
24
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
25
|
+
from .capacity import H100_DEVICE_TYPE
|
|
26
|
+
from .commands import (
|
|
27
|
+
run_command_for_value,
|
|
28
|
+
run_command_with_updates,
|
|
29
|
+
run_command_with_updates_retry,
|
|
30
|
+
)
|
|
31
|
+
from .gcloud_context import add_zone_and_project, get_gke_server_config, zone_to_region
|
|
32
|
+
from .nodepool import upgrade_gke_nodepools_version
|
|
33
|
+
from .system_characteristics import SystemCharacteristics
|
|
34
|
+
|
|
35
|
+
# Release tag of the JobSet manifests applied by set_jobset_on_cluster.
JOBSET_VERSION = 'v0.7.2'
# NCCL plugin installer manifests applied by install_nccl_on_cluster.
# NOTE(review): both URLs track the `master` branch rather than a pinned
# release, so the installed manifest can change over time.
INSTALLER_NCC_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
INSTALLER_NCC_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'

# Namespace in which create_xpk_k8s_service_account creates the account.
DEFAULT_NAMESPACE = 'default'
# Name of the Kubernetes service account created for xpk.
XPK_SA = 'xpk-sa'
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# TODO(vbarr): Remove this function when jobsets gets enabled by default on
|
|
44
|
+
# GKE clusters.
|
|
45
|
+
def set_jobset_on_cluster(args) -> int:
  """Apply the JobSet release manifests to the cluster with a server-side apply.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The return code of the kubectl command: 0 if successful, non-zero
    otherwise.
  """
  manifest_url = (
      'https://github.com/kubernetes-sigs/jobset/releases/download/'
      f'{JOBSET_VERSION}/manifests.yaml'
  )
  task = f'Install Jobset on {args.cluster}'
  status = run_command_with_updates_retry(
      f'kubectl apply --server-side -f {manifest_url}', task, args
  )
  if status == 0:
    return status

  # Failures here are most often permission problems, so point the user at
  # the troubleshooting section before handing the code back to the caller.
  permissions_hint = (
      "This LIKELY means you're missing Kubernetes Permissions, you can"
      ' validate this by checking if the error references permission problems'
      ' such as `requires one of ["container.*"] permission(s)`. Follow our'
      ' readme:'
      ' https://github.com/google/xpk/blob/main/README.md#troubleshooting for'
      ' instructions on how to fix these permissions.'
  )
  xpk_print(f'{task} returned with ERROR {status}.\n')
  xpk_print(permissions_hint)
  return status
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
  """Apply the NCCL plugin installer manifest that matches the device type.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics; its device_type selects the TCPX installer
      (H100_DEVICE_TYPE) or the TCPXO installer (anything else).

  Returns:
    0 if successful and 1 otherwise.
  """
  installer_url = (
      INSTALLER_NCC_TCPX
      if system.device_type == H100_DEVICE_TYPE
      else INSTALLER_NCC_TCPXO
  )
  task = 'Install NCCL Plugin On Cluster'
  exit_status = run_command_with_updates(
      f'kubectl apply -f {installer_url}', task, args
  )
  if exit_status == 0:
    return 0

  xpk_print(f'{task} request returned ERROR {exit_status}')
  return 1
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def get_cluster_network(args) -> str:
  """Return the name of the VPC network the cluster is attached to.

  Args:
    args: user provided arguments for running the command. Reads
      args.cluster, args.zone and args.project.

  Returns:
    The `network` value reported by `gcloud container clusters describe`, with
    surrounding whitespace removed. The process exits through xpk_exit when
    the command fails.
  """
  xpk_print("Getting cluster's VPC network...")
  # The location is converted with zone_to_region, so pass it through --region
  # like every other `gcloud container clusters` call in this module instead
  # of handing a region name to --zone.
  cluster_network_cmd = (
      'gcloud container clusters describe'
      f' {args.cluster} --region={zone_to_region(args.zone)}'
      f' --project={args.project} --format="value(network)"'
  )
  err_code, val = run_command_for_value(
      command=cluster_network_cmd,
      task='Get network cluster is in',
      global_args=args,
  )
  if err_code != 0:
    xpk_exit(err_code)
  return val.strip()
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def update_cluster_with_gcpfilestore_driver_if_necessary(args) -> int:
  """Enable the GCPFilestore CSI driver addon unless it is already enabled.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if the driver was already enabled or the update succeeded, otherwise the
    error code of the cluster update.
  """
  already_enabled = is_driver_enabled_on_cluster(
      args, driver='gcpFilestoreCsiDriver'
  )
  if not already_enabled:
    update_status = update_gke_cluster_with_addon(args, 'GcpFilestoreCsiDriver')
    if update_status > 0:
      xpk_print('Updating GKE cluster to enable GCPFilestore CSI driver failed!')
      return update_status
  return 0
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def is_driver_enabled_on_cluster(args, driver: str) -> bool:
  """Checks if the given addon driver is enabled on the cluster.

  Args:
    args: user provided arguments for running the command.
    driver (str): name of the driver as it appears in the cluster's
      addonsConfig, e.g. 'gcpFilestoreCsiDriver'; the value queried is
      `addonsConfig.<driver>Config.enabled`.

  Returns:
    True if driver is enabled on the cluster and False otherwise. The process
    exits through xpk_exit when the describe command fails.
  """
  command = (
      f'gcloud container clusters describe {args.cluster}'
      f' --project={args.project} --region={zone_to_region(args.zone)}'
      f' --format="value(addonsConfig.{driver}Config.enabled)"'
  )
  return_code, driver_enabled = run_command_for_value(
      command,
      f'Checks if {driver} driver is enabled in cluster describe.',
      args,
  )
  if return_code != 0:
    xpk_exit(return_code)
  # Command output may carry a trailing newline or padding; strip it before
  # comparing, as get_cluster_network does, so an enabled driver is not
  # misreported as disabled.
  if driver_enabled.strip().lower() == 'true':
    xpk_print(f'{driver} driver is enabled on the cluster, no update needed.')
    return True
  return False
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def update_gke_cluster_with_addon(args, addon: str) -> int:
  """Run `gcloud container clusters update` to enable the given addon.

  Args:
    args: user provided arguments for running the command.
    addon: addon name as accepted by `--update-addons`; it is set to ENABLED.

  Returns:
    0 if successful and 1 otherwise.
  """
  update_cmd = ' '.join([
      'gcloud container clusters update',
      args.cluster,
      f'--project={args.project}',
      f'--region={zone_to_region(args.zone)}',
      f'--update-addons {addon}=ENABLED',
      '--quiet',
  ])
  xpk_print(f'Updating GKE cluster to enable {addon}, may take a while!')
  status = run_command_with_updates(
      update_cmd, f'GKE Cluster Update to enable {addon}', args
  )
  if status == 0:
    return 0

  xpk_print(f'GKE Cluster Update request returned ERROR {status}')
  return 1
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def get_all_clusters_programmatic(args) -> tuple[list[str], int]:
  """List the names of the GKE clusters in the project / region.

  Args:
    args: user provided arguments for running the command.

  Returns:
    A (cluster names, return code) pair. On success the names are the lines of
    the gcloud output and the code is 0; on failure the list is empty and the
    code is the command's non-zero return code.
  """
  task = 'Find if Cluster Exists'
  list_cmd = ' '.join([
      'gcloud container clusters list',
      f'--project={args.project}',
      f'--region={zone_to_region(args.zone)}',
      '--format="csv[no-heading](name)"',
  ])
  status, listing = run_command_for_value(list_cmd, task, args)
  if status == 0:
    return listing.splitlines(), 0

  xpk_print(f'{task} returned ERROR {status}')
  return [], status
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def project_id_to_project_number(project_id: str) -> str:
  """Look up the project number for a project id via Resource Manager.

  Args:
    project_id: id of the project to translate.

  Returns:
    The part of the returned project resource name after the first '/'. The
    process exits through xpk_exit(1) when the lookup is denied.
  """
  projects_api = resourcemanager_v3.ProjectsClient()
  lookup = resourcemanager_v3.GetProjectRequest(name=f'projects/{project_id}')
  try:
    project = projects_api.get_project(request=lookup)
  except PermissionDenied as e:
    xpk_print(
        f"Couldn't translate project id: {project_id} to project number."
        f' Error: {e}'
    )
    xpk_exit(1)
  # The resource name has the form '<collection>/<number>'; keep the number.
  project_number = project.name.split('/', 1)[1]
  xpk_print(f'Project number for project: {project_id} is {project_number}')
  return str(project_number)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def setup_k8s_env(args) -> k8s_client.ApiClient:
  """Prepare kubectl credentials and return a Kubernetes API client.

  Unless args.kind_cluster is set and truthy, this fills in zone and project,
  fetches the cluster credentials and stores the project number on
  args.project_number. The kube config is loaded in either case.

  Args:
    args: user provided arguments for running the command.

  Returns:
    A new kubernetes ApiClient.
  """
  is_kind_cluster = getattr(args, 'kind_cluster', False)
  if not is_kind_cluster:
    add_zone_and_project(args)
    get_cluster_credentials(args)
    args.project_number = project_id_to_project_number(args.project)

  config.load_kube_config()
  api_client = k8s_client.ApiClient()
  return api_client  # pytype: disable=bad-return-type
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def get_gpu_type_from_cluster(args) -> str:
  """Return the device type of the cluster's system characteristics.

  Args:
    args: user provided arguments for running the command.

  Returns:
    system.device_type when get_cluster_system_characteristics returns a
    system, otherwise an empty string.
  """
  system = get_cluster_system_characteristics(args)
  return '' if system is None else system.device_type
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def create_xpk_k8s_service_account() -> None:
  """Create the xpk service account (XPK_SA) in DEFAULT_NAMESPACE.

  Creation is best-effort: an API error is reported through xpk_print and
  never raised. An HTTP 409 (Conflict) response is reported as "already
  exists"; any other API error is reported with its status and reason instead
  of being mislabelled as an existing account.
  """
  k8s_core_client = k8s_client.CoreV1Api()
  sa = k8s_client.V1ServiceAccount(
      metadata=k8s_client.V1ObjectMeta(name=XPK_SA)
  )

  xpk_print(f'Creating a new service account: {XPK_SA}')
  try:
    k8s_core_client.create_namespaced_service_account(
        DEFAULT_NAMESPACE, sa, pretty=True
    )
  except ApiException as e:
    # 409 Conflict is the API server's answer when the object already exists.
    if e.status == 409:
      xpk_print(
          f'Service account: {XPK_SA} already exists. Skipping its creation'
      )
    else:
      xpk_print(
          f'Creating service account: {XPK_SA} failed with status'
          f' {e.status}: {e.reason}'
      )
    return
  # Report the account by name; formatting the V1ServiceAccount object itself
  # would dump its whole representation into the message.
  xpk_print(f'Created a new service account: {XPK_SA} successfully')
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def update_gke_cluster_with_clouddns(args) -> int:
  """Run `gcloud container clusters update` to switch the cluster to Cloud DNS.

  The update uses VPC scope and a DNS domain named '<cluster>-domain'.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  update_cmd = ' '.join([
      'gcloud container clusters update',
      args.cluster,
      f'--project={args.project}',
      f'--region={zone_to_region(args.zone)}',
      '--cluster-dns=clouddns',
      '--cluster-dns-scope=vpc',
      f'--cluster-dns-domain={args.cluster}-domain',
      '--quiet',
  ])
  xpk_print('Updating GKE cluster to use Cloud DNS, may take a while!')
  status = run_command_with_updates(
      update_cmd, 'GKE Cluster Update to enable Cloud DNS', args
  )
  if status == 0:
    return 0

  xpk_print(f'GKE Cluster Update request returned ERROR {status}')
  return 1
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def update_gke_cluster_with_workload_identity_enabled(args) -> int:
  """Run `gcloud container clusters update` to enable Workload Identity Federation.

  The workload pool is set to '<project>.svc.id.goog'.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  feature = 'Workload Identity Federation'
  update_cmd = ' '.join([
      'gcloud container clusters update',
      args.cluster,
      f'--project={args.project}',
      f'--region={zone_to_region(args.zone)}',
      f'--workload-pool={args.project}.svc.id.goog',
      '--quiet',
  ])
  xpk_print(f'Updating GKE cluster to enable {feature}, may take a while!')
  status = run_command_with_updates(
      update_cmd, f'GKE Cluster Update to enable {feature}', args
  )
  if status == 0:
    return 0

  xpk_print(f'GKE Cluster Update request returned ERROR {status}')
  return 1
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def update_gke_cluster_with_gcsfuse_driver_enabled(args) -> int:
  """Run `gcloud container clusters update` to enable the GCSFuse CSI driver addon.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  feature = 'GCSFuse CSI driver'
  update_cmd = ' '.join([
      'gcloud container clusters update',
      args.cluster,
      f'--project={args.project}',
      f'--region={zone_to_region(args.zone)}',
      '--update-addons GcsFuseCsiDriver=ENABLED',
      '--quiet',
  ])
  xpk_print(f'Updating GKE cluster to enable {feature}, may take a while!')
  status = run_command_with_updates(
      update_cmd, f'GKE Cluster Update to enable {feature}', args
  )
  if status == 0:
    return 0

  xpk_print(f'GKE Cluster Update request returned ERROR {status}')
  return 1
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int:
  """Upgrade the cluster's control plane (`--master`) to the given version.

  Args:
    args: user provided arguments for running the command.
    default_rapid_gke_version: version passed to `--cluster-version`.

  Returns:
    0 if successful and 1 otherwise.
  """
  upgrade_cmd = ' '.join([
      'gcloud container clusters upgrade',
      args.cluster,
      f'--project={args.project}',
      f'--region={zone_to_region(args.zone)}',
      f'--cluster-version={default_rapid_gke_version}',
      '--master',
      '--quiet',
  ])
  xpk_print("Updating GKE cluster's control plane version, may take a while!")
  status = run_command_with_updates(
      upgrade_cmd,
      'GKE Cluster control plane version update to enable Cloud DNS',
      args,
  )
  if status == 0:
    return 0

  xpk_print(
      f"GKE cluster's control plane version update request returned ERROR {status}"
  )
  return 1
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def is_cluster_using_clouddns(args) -> bool:
  """Report whether the cluster description contains 'clusterDns: CLOUD_DNS'.

  The describe output is piped through grep, so a zero return code means the
  marker line was found.

  Args:
    args: user provided arguments for running the command.

  Returns:
    True if cluster is using CloudDNS and False otherwise.
  """
  describe_cmd = ' '.join([
      f'gcloud container clusters describe {args.cluster}',
      f'--project={args.project}',
      f'--region={zone_to_region(args.zone)}',
      '2> /dev/null | grep "clusterDns: CLOUD_DNS"',
  ])
  status, _ = run_command_for_value(
      describe_cmd,
      'Check if Cloud DNS is enabled in cluster describe.',
      args,
  )
  uses_clouddns = status == 0
  if uses_clouddns:
    xpk_print('Cloud DNS is enabled on the cluster, no update needed.')
  return uses_clouddns
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def is_workload_identity_enabled_on_cluster(args) -> bool:
  """Checks if Workload Identity Federation is enabled on the cluster.

  The cluster counts as enabled when its workloadIdentityConfig.workloadPool
  equals '<project>.svc.id.goog'.

  Args:
    args: user provided arguments for running the command.

  Returns:
    True if Workload Identity Federation is enabled on the cluster and False
    otherwise. The process exits through xpk_exit when the describe command
    fails.
  """
  command = (
      f'gcloud container clusters describe {args.cluster}'
      f' --project={args.project} --region={zone_to_region(args.zone)}'
      ' --format="value(workloadIdentityConfig.workloadPool)"'
  )
  return_code, workload_pool = run_command_for_value(
      command,
      'Checks if Workload Identity Federation is enabled in cluster describe.',
      args,
  )
  if return_code != 0:
    xpk_exit(return_code)
  # Command output may carry a trailing newline or padding; strip it before
  # the exact comparison, as get_cluster_network does, so an enabled cluster
  # is not misreported as disabled.
  if workload_pool.strip() == f'{args.project}.svc.id.goog':
    xpk_print(
        'Workload Identity Federation is enabled on the cluster, no update'
        ' needed.'
    )
    return True
  return False
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
  """Checks if GCSFuse CSI driver is enabled on the cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    True if GCSFuse CSI driver is enabled on the cluster and False otherwise.
    The process exits through xpk_exit when the describe command fails.
  """
  command = (
      f'gcloud container clusters describe {args.cluster}'
      f' --project={args.project} --region={zone_to_region(args.zone)}'
      ' --format="value(addonsConfig.gcsFuseCsiDriverConfig.enabled)"'
  )
  return_code, gcsfuse_driver_enabled = run_command_for_value(
      command,
      'Checks if GCSFuse CSI driver is enabled in cluster describe.',
      args,
  )
  if return_code != 0:
    xpk_exit(return_code)
  # Command output may carry a trailing newline or padding; strip it before
  # comparing, as get_cluster_network does, so an enabled driver is not
  # misreported as disabled.
  if gcsfuse_driver_enabled.strip().lower() == 'true':
    xpk_print('GCSFuse CSI driver is enabled on the cluster, no update needed.')
    return True
  return False
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def update_cluster_with_clouddns_if_necessary(args) -> int:
  """Switch an existing cluster to CloudDNS, then upgrade it, if needed.

  Nothing is changed when the cluster is not in the cluster listing or already
  uses CloudDNS. Otherwise the cluster is updated to CloudDNS, its control
  plane is upgraded to the default rapid GKE version, and then its nodepools
  are upgraded to the same version.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and error code otherwise. The process exits through
    xpk_exit when the GKE server config cannot be fetched.
  """
  all_clusters, list_status = get_all_clusters_programmatic(args)
  if list_status > 0:
    xpk_print('Listing all clusters failed!')
    return 1

  # Unknown cluster, or CloudDNS already in use: no update necessary.
  if args.cluster not in all_clusters or is_cluster_using_clouddns(args):
    return 0

  dns_status = update_gke_cluster_with_clouddns(args)
  if dns_status > 0:
    xpk_print('Updating GKE cluster to use CloudDNS failed!')
    return dns_status

  # Find default rapid control plane version and update the control plane to
  # the same.
  config_status, gke_server_config = get_gke_server_config(args)
  if config_status != 0:
    xpk_exit(config_status)
  target_version = (
      gke_server_config.default_rapid_gke_version  # pytype: disable=attribute-error
  )

  master_status = upgrade_gke_control_plane_version(args, target_version)
  if master_status > 0:
    xpk_print("Updating GKE cluster's control plane upgrade failed!")
    return master_status

  # Nodepools are upgraded only after the control plane upgrade.
  nodepool_status = upgrade_gke_nodepools_version(args, target_version)
  if nodepool_status > 0:
    xpk_print('Upgrading nodepools version failed!')
    return nodepool_status
  return 0
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def update_cluster_with_workload_identity_if_necessary(args) -> int:
  """Enable Workload Identity Federation unless it is already enabled.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if it was already enabled or the update succeeded, otherwise the error
    code of the cluster update.
  """
  if not is_workload_identity_enabled_on_cluster(args):
    update_status = update_gke_cluster_with_workload_identity_enabled(args)
    if update_status > 0:
      xpk_print(
          'Updating GKE cluster to enable Workload Identity Federation failed!'
      )
      return update_status
  return 0
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int:
  """Enable the GCSFuse CSI driver unless it is already enabled.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if the driver was already enabled or the update succeeded, otherwise the
    error code of the cluster update.
  """
  if not is_gcsfuse_driver_enabled_on_cluster(args):
    update_status = update_gke_cluster_with_gcsfuse_driver_enabled(args)
    if update_status > 0:
      xpk_print('Updating GKE cluster to enable GCSFuse CSI driver failed!')
      return update_status
  return 0
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def get_cluster_credentials(args) -> None:
  """Fetch cluster credentials and point kubectl at the default namespace.

  Runs `gcloud container clusters get-credentials`, then `kubectl config view`
  and `kubectl config set-context --current --namespace=default`, chained
  with `&&`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. The process exits through xpk_exit with the command's return code
    when the command fails.
  """
  credentials_cmd = ' && '.join([
      (
          f'gcloud container clusters get-credentials {args.cluster}'
          f' --region={zone_to_region(args.zone)} --project={args.project}'
      ),
      'kubectl config view',
      'kubectl config set-context --current --namespace=default',
  ])
  task = f'get-credentials to cluster {args.cluster}'
  status = run_command_with_updates_retry(
      credentials_cmd, task, args, verbose=False
  )
  if status == 0:
    return

  xpk_print(f'{task} returned ERROR {status}')
  xpk_exit(status)
|
xpk/core/cluster_private.py
CHANGED
|
@@ -14,11 +14,14 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from .core import zone_to_region
|
|
18
|
-
from .commands import run_command_for_value, run_command_with_updates
|
|
19
17
|
from ..utils.console import xpk_exit, xpk_print
|
|
20
|
-
from ..utils.network import
|
|
18
|
+
from ..utils.network import (
|
|
19
|
+
add_current_machine_to_networks,
|
|
20
|
+
is_current_machine_in_any_network,
|
|
21
|
+
)
|
|
21
22
|
from ..utils.objects import is_text_true
|
|
23
|
+
from .commands import run_command_for_value, run_command_with_updates
|
|
24
|
+
from .gcloud_context import zone_to_region
|
|
22
25
|
|
|
23
26
|
|
|
24
27
|
def authorize_private_cluster_access_if_necessary(args) -> int:
|
xpk/core/commands.py
CHANGED
|
@@ -194,7 +194,7 @@ def run_command_with_updates(command, task, global_args, verbose=True) -> int:
|
|
|
194
194
|
while True:
|
|
195
195
|
return_code = child.poll()
|
|
196
196
|
if return_code is None:
|
|
197
|
-
xpk_print(f'Waiting for `{task}`, for {i} seconds')
|
|
197
|
+
xpk_print(f'Waiting for `{task}`, for {i} seconds...', end='\r')
|
|
198
198
|
time.sleep(1)
|
|
199
199
|
i += 1
|
|
200
200
|
else:
|
|
@@ -246,7 +246,7 @@ def run_command_for_value(
|
|
|
246
246
|
int: return_code, default is 0
|
|
247
247
|
str: return_val, default is '0'
|
|
248
248
|
"""
|
|
249
|
-
if global_args.dry_run:
|
|
249
|
+
if global_args is not None and global_args.dry_run:
|
|
250
250
|
xpk_print(
|
|
251
251
|
f'Task: `{task}` is implemented by the following command'
|
|
252
252
|
' not running since it is a dry run.'
|
|
@@ -268,7 +268,7 @@ def run_command_for_value(
|
|
|
268
268
|
return_code = child.poll()
|
|
269
269
|
if return_code is None:
|
|
270
270
|
if not quiet:
|
|
271
|
-
xpk_print(f'Waiting for `{task}`, for {i} seconds')
|
|
271
|
+
xpk_print(f'Waiting for `{task}`, for {i} seconds...', end='\r')
|
|
272
272
|
time.sleep(1)
|
|
273
273
|
i += 1
|
|
274
274
|
else:
|
|
@@ -303,7 +303,7 @@ def run_command_with_full_controls(
|
|
|
303
303
|
command: str,
|
|
304
304
|
task: str,
|
|
305
305
|
global_args: Namespace,
|
|
306
|
-
instructions: str = None,
|
|
306
|
+
instructions: str | None = None,
|
|
307
307
|
) -> int:
|
|
308
308
|
"""Run command in current shell with system out, in and error handles. Wait
|
|
309
309
|
until it exits.
|
|
@@ -333,16 +333,20 @@ def run_command_with_full_controls(
|
|
|
333
333
|
if instructions is not None:
|
|
334
334
|
xpk_print(instructions)
|
|
335
335
|
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
336
|
+
try:
|
|
337
|
+
with subprocess.Popen(
|
|
338
|
+
command,
|
|
339
|
+
stdout=sys.stdout,
|
|
340
|
+
stderr=sys.stderr,
|
|
341
|
+
stdin=sys.stdin,
|
|
342
|
+
shell=True,
|
|
343
|
+
) as child:
|
|
344
|
+
return_code = child.wait()
|
|
345
|
+
xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
|
|
346
|
+
except KeyboardInterrupt:
|
|
347
|
+
return_code = 0
|
|
348
|
+
|
|
349
|
+
return return_code
|
|
346
350
|
|
|
347
351
|
|
|
348
352
|
def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int:
|