xpk 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +109 -0
- xpk/commands/cluster.py +784 -0
- xpk/commands/cluster_gcluster.py +185 -0
- xpk/commands/info.py +245 -0
- xpk/commands/inspector.py +363 -0
- xpk/commands/job.py +197 -0
- xpk/commands/kind.py +253 -0
- xpk/commands/shell.py +120 -0
- xpk/commands/version.py +39 -0
- xpk/commands/workload.py +692 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +61 -0
- xpk/core/blueprint/blueprint_generator.py +652 -0
- xpk/core/cluster_private.py +197 -0
- xpk/core/commands.py +352 -0
- xpk/core/core.py +2824 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/gcluster_manager.py +158 -0
- xpk/core/kjob.py +205 -0
- xpk/core/kueue.py +352 -0
- xpk/core/nap.py +349 -0
- xpk/core/pathways.py +298 -0
- xpk/core/ray.py +222 -0
- xpk/core/system_characteristics.py +1395 -0
- xpk/core/workload.py +133 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +109 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
- xpk/main.py +73 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +184 -0
- xpk/parser/cluster.py +621 -0
- xpk/parser/common.py +71 -0
- xpk/parser/core.py +109 -0
- xpk/parser/info.py +63 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +126 -0
- xpk/parser/kind.py +94 -0
- xpk/parser/shell.py +50 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +684 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +85 -0
- xpk/utils/yaml.py +30 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/METADATA +307 -38
- xpk-0.6.0.dist-info/RECORD +57 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
- xpk-0.6.0.dist-info/entry_points.txt +2 -0
- xpk-0.4.0.dist-info/RECORD +0 -7
- xpk-0.4.0.dist-info/entry_points.txt +0 -2
- xpk.py +0 -7218
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py
ADDED
|
@@ -0,0 +1,784 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..core.commands import (
|
|
18
|
+
run_command_for_value,
|
|
19
|
+
run_command_with_updates,
|
|
20
|
+
run_command_with_updates_retry,
|
|
21
|
+
)
|
|
22
|
+
from ..core.core import (
|
|
23
|
+
VERTEX_TENSORBOARD_FEATURE_FLAG,
|
|
24
|
+
add_zone_and_project,
|
|
25
|
+
create_cluster_configmaps,
|
|
26
|
+
create_cluster_network_config,
|
|
27
|
+
create_vertex_tensorboard,
|
|
28
|
+
delete_cluster_subnets,
|
|
29
|
+
get_all_clusters_programmatic,
|
|
30
|
+
get_gke_control_plane_version,
|
|
31
|
+
get_gke_node_pool_version,
|
|
32
|
+
get_gke_server_config,
|
|
33
|
+
h100_device_type,
|
|
34
|
+
install_nccl_on_cluster,
|
|
35
|
+
run_gke_node_pool_create_command,
|
|
36
|
+
set_jobset_on_cluster,
|
|
37
|
+
set_up_cluster_network_for_gpu,
|
|
38
|
+
zone_to_region,
|
|
39
|
+
get_user_input,
|
|
40
|
+
)
|
|
41
|
+
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
|
|
42
|
+
from ..core.kjob import (
|
|
43
|
+
verify_kjob_installed,
|
|
44
|
+
prepare_kjob,
|
|
45
|
+
apply_kjob_crds,
|
|
46
|
+
)
|
|
47
|
+
from ..core.kueue import (
|
|
48
|
+
cluster_preheat_yml,
|
|
49
|
+
install_kueue_crs,
|
|
50
|
+
install_kueue_on_cluster,
|
|
51
|
+
wait_for_kueue_available,
|
|
52
|
+
)
|
|
53
|
+
from ..core.nap import enable_autoprovisioning_on_cluster
|
|
54
|
+
from ..core.ray import install_ray_cluster
|
|
55
|
+
from ..core.system_characteristics import (
|
|
56
|
+
AcceleratorType,
|
|
57
|
+
AcceleratorTypeToAcceleratorCharacteristics,
|
|
58
|
+
SystemCharacteristics,
|
|
59
|
+
get_system_characteristics,
|
|
60
|
+
)
|
|
61
|
+
from ..core.workload import get_workload_list
|
|
62
|
+
from ..utils.file import write_tmp_file
|
|
63
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
64
|
+
from . import cluster_gcluster
|
|
65
|
+
|
|
66
|
+
from tabulate import tabulate
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def cluster_create(args) -> None:
  """Create a GKE cluster (if needed) and install its supporting components.

  The steps run strictly in order; the first step that reports a failure
  hands its return code to `xpk_exit` and nothing after it is attempted.
  Device types listed in `cluster_gcluster.supported_device_types` are
  delegated entirely to the Cluster Toolkit path.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. Every path through this function ends in a call to `xpk_exit`
    (presumably terminating the process): with the failing step's code on
    error, or with 0 once all steps have completed.
  """
  system, return_code = get_system_characteristics(args)

  if return_code > 0:
    xpk_print('Fetching system characteristics failed!')
    xpk_exit(return_code)

  xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
  add_zone_and_project(args)

  # Some device types are provisioned through Cluster Toolkit instead of the
  # gcloud-based flow below.
  if system.device_type in cluster_gcluster.supported_device_types:
    xpk_print(
        'Creating the cluster using Cluster Toolkit. Machine Type:'
        f' {system.gce_machine_type} ...'
    )
    cluster_gcluster.cluster_create(args)
    xpk_exit(0)

  return_code, gke_server_config = get_gke_server_config(args)
  if return_code != 0:
    xpk_exit(return_code)

  return_code, gke_control_plane_version = get_gke_control_plane_version(
      args, gke_server_config
  )
  if return_code != 0:
    xpk_exit(return_code)

  create_cluster_command_code = create_cluster_if_necessary(
      args, gke_control_plane_version, system
  )
  if create_cluster_command_code != 0:
    xpk_exit(create_cluster_command_code)

  authorize_private_cluster_access_command_code = (
      authorize_private_cluster_access_if_necessary(args)
  )
  if authorize_private_cluster_access_command_code != 0:
    xpk_exit(authorize_private_cluster_access_command_code)

  # ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.

  # Point kubectl at the cluster; the kubectl-based steps below rely on it.
  set_cluster_command_code = set_cluster_command(args)
  if set_cluster_command_code != 0:
    xpk_exit(set_cluster_command_code)

  # Create Vertex Tensorboard for new and existing clusters if
  # create-vertex-tensorboard is set.
  tensorboard_config = {}
  if VERTEX_TENSORBOARD_FEATURE_FLAG and args.create_vertex_tensorboard:
    tensorboard_config = create_vertex_tensorboard(args)
    # An empty config means Tensorboard creation in Vertex AI failed.
    if not tensorboard_config:
      xpk_exit(1)

  if system.accelerator_type == AcceleratorType['GPU']:
    xpk_print('Setting up Network for cluster')
    set_up_cluster_network_code = set_up_cluster_network_for_gpu(args, system)
    if set_up_cluster_network_code != 0:
      xpk_exit(set_up_cluster_network_code)

  if system.device_type == h100_device_type:
    xpk_print('Creating Network Config for cluster')
    create_cluster_network_config_code = create_cluster_network_config(args)
    if create_cluster_network_config_code != 0:
      xpk_exit(create_cluster_network_config_code)

  # Check the control plane version of the cluster and determine the node pool
  # version to use.
  return_code, gke_node_pool_version = get_gke_node_pool_version(
      args, gke_server_config
  )
  if return_code != 0:
    xpk_exit(return_code)

  run_gke_node_pool_create_command_code = run_gke_node_pool_create_command(
      args, system, gke_node_pool_version
  )
  if run_gke_node_pool_create_command_code != 0:
    xpk_exit(run_gke_node_pool_create_command_code)

  xpk_print(
      'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
      ' globally available'
  )
  set_jobset_on_cluster_code = set_jobset_on_cluster(args)
  if set_jobset_on_cluster_code != 0:
    xpk_exit(set_jobset_on_cluster_code)

  xpk_print('Enabling Kueue on the cluster')
  install_kueue_on_cluster_code = install_kueue_on_cluster(args)
  if install_kueue_on_cluster_code != 0:
    xpk_exit(install_kueue_on_cluster_code)

  xpk_print('Verifying kjob installation')
  err_code = verify_kjob_installed(args)
  if err_code > 0:
    xpk_exit(err_code)

  xpk_print('Applying kjob CRDs')
  err_code = apply_kjob_crds(args)
  if err_code > 0:
    xpk_exit(err_code)

  xpk_print('Preparing kjob')
  err_code = prepare_kjob(args)
  if err_code > 0:
    xpk_exit(err_code)

  # Provision node pools dynamically based on incoming workloads:
  # Currently autoprovisioning is not supported with Pathways.
  autoprovisioning_config = None
  if not args.enable_pathways and args.enable_autoprovisioning:
    xpk_print('Enabling Autoprovisioning')
    autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
        args, system
    )
    if return_code != 0:
      xpk_exit(return_code)

  # Kueue was installed above; its custom resources are only applied once it
  # reports as available.
  xpk_print('Wait for Kueue to be fully available')
  wait_for_kueue_available_code = wait_for_kueue_available(args)
  if wait_for_kueue_available_code != 0:
    xpk_exit(wait_for_kueue_available_code)

  xpk_print('Install Kueue Custom Resources')
  enable_kueue_credentials_code = install_kueue_crs(
      args, system, autoprovisioning_config
  )
  if enable_kueue_credentials_code != 0:
    xpk_exit(enable_kueue_credentials_code)

  if system.accelerator_type == AcceleratorType['GPU']:
    xpk_print('Installing NCCL Plugin for cluster')
    install_nccl_code = install_nccl_on_cluster(args, system)
    if install_nccl_code != 0:
      xpk_exit(install_nccl_code)

  xpk_print('Creating ConfigMap for cluster')
  create_cluster_configmaps_code = create_cluster_configmaps(
      args, system, tensorboard_config, autoprovisioning_config
  )
  if create_cluster_configmaps_code != 0:
    xpk_exit(create_cluster_configmaps_code)

  if args.enable_ray_cluster:
    return_code = install_ray_cluster(args, system)
    if return_code != 0:
      xpk_print('Installation of RayCluster failed.')
      xpk_exit(return_code)

  xpk_print('GKE commands done! Resources are created.')
  xpk_print(
      'See your GKE Cluster here:'
      # pylint: disable=line-too-long
      f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
  )
  xpk_exit(0)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def cluster_delete(args) -> None:
  """Delete a cluster, via Cluster Toolkit when it was created that way.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. The function always finishes through `xpk_exit`: 0 on success,
    otherwise the return code of the failed GKE delete request.
  """
  xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
  add_zone_and_project(args)

  # Clusters provisioned by Cluster Toolkit are torn down by it as well.
  if cluster_gcluster.created_by_gcluster(args):
    xpk_print(f'Deleting {args.cluster} cluster using Cluster Toolkit...')
    cluster_gcluster.cluster_delete(args)
    xpk_exit(0)

  delete_code = run_gke_cluster_delete_command(args)
  if delete_code == 0:
    xpk_print(f'GKE commands done! Cluster {args.cluster} deleted.\n')
  xpk_exit(delete_code)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def cluster_cacheimage(args) -> None:
  """Re-create the image pre-heat resources on a cluster.

  Renders `cluster_preheat_yml` for the requested image, deletes any
  previously applied copy of it and then applies the fresh manifest.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. The function always finishes through `xpk_exit`: 0 on success,
    otherwise the return code of the first step that failed.
  """
  xpk_print(
      f'Starting cluster cacheimage for cluster: {args.cluster}', flush=True
  )
  add_zone_and_project(args)

  credentials_code = set_cluster_command(args)
  if credentials_code != 0:
    xpk_exit(credentials_code)

  system, characteristics_code = get_system_characteristics(args)
  if characteristics_code > 0:
    xpk_print('Fetching system characteristics failed!')
    xpk_exit(characteristics_code)

  accelerator = AcceleratorTypeToAcceleratorCharacteristics[
      system.accelerator_type
  ]
  manifest = cluster_preheat_yml.format(
      cachekey=args.cache_key,
      image_name=args.docker_image,
      nodeSelectorKey=accelerator.accelerator_label,
  )
  # Keep a reference to the temp-file object for the rest of the function;
  # presumably the file only lives as long as this object does.
  manifest_file = write_tmp_file(manifest)
  manifest_path = str(manifest_file.file.name)

  # Delete first (ignoring a missing resource), then apply the new manifest.
  steps = (
      (
          f'kubectl delete -f {manifest_path} --ignore-not-found=true',
          'Deleting Cached Image',
          'Delete',
      ),
      (
          f'kubectl apply -f {manifest_path}',
          'Creating Cached Image',
          'Create',
      ),
  )
  for command, task, verb in steps:
    step_code = run_command_with_updates(command, task, args)
    if step_code != 0:
      xpk_print(f'{verb} Cached Image returned ERROR {step_code}')
      xpk_exit(step_code)
  xpk_exit(0)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def cluster_describe(args) -> None:
  """Print the node pool table and TPU node / running pod counts of a cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. The function always finishes through `xpk_exit`: with the return
    code of a failed command, with 1 if a count cannot be parsed from a
    command's output, and with 0 otherwise.
  """
  xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
  add_zone_and_project(args)

  set_cluster_command_code = set_cluster_command(args)
  if set_cluster_command_code != 0:
    xpk_exit(set_cluster_command_code)

  return_code, data_table = nodepools_build_table(args)
  if return_code != 0:
    xpk_exit(return_code)

  # Row 0 is the header, so a table with a single row carries no data.
  if len(data_table) > 1:
    xpk_print(
        'Nodepools info:\n',
        tabulate(data_table, headers='firstrow', tablefmt='plain'),
    )
  else:
    xpk_print('No nodepools info found')

  return_code_node_output, node_output = run_command_for_value(
      r'kubectl get node --no-headers=true'
      r" --selector='cloud.google.com/gke-tpu-accelerator' | wc -l",
      'Count TPU Nodes',
      args,
  )
  if return_code_node_output != 0:
    xpk_exit(return_code_node_output)
  try:
    number_tpu_vms_in_cluster = _parse_trailing_count(node_output)
  except ValueError:
    xpk_print(f'Count TPU Nodes returned unexpected output: {node_output!r}')
    xpk_exit(1)

  return_code_pod_output, pod_output = run_command_for_value(
      "kubectl get pod -o=custom-columns='Status:.status.phase' | grep -i"
      ' Running | wc -l',
      'Count TPU Pods',
      args,
  )
  if return_code_pod_output != 0:
    xpk_exit(return_code_pod_output)
  try:
    # Parsed the same way as the node count above: only the final line is the
    # `wc -l` result, anything captured before it must not break the parse.
    number_tpu_pods_in_cluster = _parse_trailing_count(pod_output)
  except ValueError:
    xpk_print(f'Count TPU Pods returned unexpected output: {pod_output!r}')
    xpk_exit(1)

  xpk_print(
      f'The cluster contains {number_tpu_vms_in_cluster} TPUVMs of which'
      f' {number_tpu_pods_in_cluster} are in use.'
  )

  xpk_print('GKE commands done!\n')
  xpk_exit(0)


def _parse_trailing_count(output: str) -> int:
  """Return the integer found on the last non-blank line of `output`.

  Args:
    output: captured text of a shell pipeline ending in `wc -l`.

  Returns:
    The parsed count.

  Raises:
    ValueError: if `output` has no non-blank line or the last one is not an
      integer.
  """
  lines = [line for line in output.splitlines() if line.strip()]
  if not lines:
    raise ValueError('no count found in empty command output')
  return int(lines[-1])
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def nodepools_build_table(args) -> tuple[int, list[list]]:
  """Collect per-node-pool statistics into a table ready for `tabulate`.

  Each column comes from its own kubectl query. The rows are assembled per
  column rather than by appending in order, so a node pool that is missing
  from one query's output (for example a pool with no Ready node, which the
  actual-healthy query filters out) keeps its remaining values in the right
  columns instead of having them shift left.

  Args:
    args: user provided arguments for running the command.

  Returns:
    A tuple `(return_code, table)`. `table[0]` is the header row and every
    following row describes one node pool. `return_code` is always 0: a
    failing query is reported through `xpk_print` and its cells are filled
    with 'N/A' rather than aborting the whole table.
  """
  header = [
      'NODEPOOL_NAME',
      'SLICE',
      'TYPE',
      'EXPECTED_HEALTHY_NODES',
      'ACTUAL_HEALTHY_NODES',
      'TOTAL_NODES',
  ]
  missing = 'N/A'

  nodepools, return_code = get_node_pools_name(args)
  if return_code != 0:
    xpk_print(f'Get node pools name returned ERROR {return_code}')

  slices, return_code = get_slice_node_pool_size(args)
  if return_code != 0:
    xpk_print(f'Get slice node pool size returned ERROR {return_code}')

  type_nodepool, return_code = get_node_pool_instance_type(args)
  if return_code != 0:
    xpk_print(f'Get node pool instance type returned ERROR {return_code}')

  expected_healthy_nodes, return_code = get_expected_healthy_nodes(args)
  if return_code != 0:
    xpk_print(f'Get expected healthy nodes returned ERROR {return_code}')

  actual_healthy_nodes, actual_return_code = get_actual_healthy_nodes(args)
  if actual_return_code != 0:
    xpk_print(f'Get actual healthy nodes returned ERROR {actual_return_code}')

  total_nodes, return_code = get_total_nodes_per_node_pool(args)
  if return_code != 0:
    xpk_print(f'Get total nodes per node pool returned ERROR {return_code}')

  # One (values-by-pool, default) pair per data column, in header order.
  # `uniq -c` style output is "<count> <pool>"; the type query prints
  # "<pool> <type>".
  columns = [
      (_index_by_nodepool(slices, name_field=1, value_field=0), missing),
      (_index_by_nodepool(type_nodepool, name_field=0, value_field=1), missing),
      (
          _index_by_nodepool(
              expected_healthy_nodes, name_field=1, value_field=0
          ),
          missing,
      ),
      (
          _index_by_nodepool(
              actual_healthy_nodes, name_field=1, value_field=0
          ),
          # When the query succeeded, an absent pool simply has no Ready node.
          '0' if actual_return_code == 0 else missing,
      ),
      (_index_by_nodepool(total_nodes, name_field=1, value_field=0), missing),
  ]

  # Listed pools first, in listing order; then any pool that only shows up in
  # one of the column queries, so its data is shown instead of raising.
  names = dict.fromkeys(name.strip() for name in nodepools if name.strip())
  for values, _ in columns:
    names.update(dict.fromkeys(values))

  table = [header]
  for name in names:
    table.append(
        [name] + [values.get(name, default) for values, default in columns]
    )

  return 0, table


def _index_by_nodepool(
    lines: list[str], name_field: int, value_field: int
) -> dict[str, str]:
  """Map node pool name to value for whitespace-separated two-field lines.

  Args:
    lines: output lines of one of the per-node-pool kubectl queries.
    name_field: index of the field holding the node pool name.
    value_field: index of the field holding the value for that pool.

  Returns:
    A dict from node pool name to the value string. Lines with fewer than
    two fields are skipped.
  """
  indexed = {}
  for line in lines:
    fields = line.split()
    if len(fields) < 2:
      continue
    indexed[fields[name_field]] = fields[value_field]
  return indexed
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def get_node_pools_name(args) -> tuple[list[str], int]:
  """List the distinct `cloud.google.com/gke-nodepool` labels of the nodes.

  Args:
    args: user provided arguments for running the command.

  Returns:
    `(lines, 0)` with one output line per node pool on success, or
    `([], return_code)` when the command fails.
  """
  stages = (
      'kubectl get node --no-headers=true -o'
      " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool'",
      "grep -v 'none'",
      'sort',
      'uniq',
  )
  status, output = run_command_for_value(
      ' | '.join(stages), 'Nodepool list', args
  )
  return (output.splitlines(), 0) if status == 0 else ([], status)
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def get_slice_node_pool_size(args) -> tuple[list[str], int]:
  """Count the nodes carrying each `cloud.google.com/gke-nodepool` label.

  Args:
    args: user provided arguments for running the command.

  Returns:
    `(lines, 0)` with the `uniq -c` output lines ("<count> <nodepool>") on
    success, or `([], return_code)` when the command fails.
  """
  stages = (
      'kubectl get node --no-headers=true -o'
      " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'",
      "grep -v 'none'",
      'sort',
      'uniq -c',
  )
  status, output = run_command_for_value(
      ' | '.join(stages), 'Count nodes per nodepool slice', args
  )
  return (output.splitlines(), 0) if status == 0 else ([], status)
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def get_node_pool_instance_type(args) -> tuple[list[str], int]:
  """List the distinct (node pool, instance type) label pairs of the nodes.

  Args:
    args: user provided arguments for running the command.

  Returns:
    `(lines, 0)` with one "<nodepool> <instance-type>" line per distinct
    pair on success, or `([], return_code)` when the command fails.
  """
  stages = (
      'kubectl get node --no-headers=true -o'
      " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool,"
      " TYPE:.metadata.labels.node\\.kubernetes\\.io/instance-type'",
      "grep -v 'none'",
      'sort',
      'uniq',
  )
  status, output = run_command_for_value(
      ' | '.join(stages), 'Instance type of nodepools', args
  )
  return (output.splitlines(), 0) if status == 0 else ([], status)
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
  """Count the nodes registered under each node pool label.

  The pipeline is the same one `get_slice_node_pool_size` runs; only the
  task description passed along with it differs.

  Args:
    args: user provided arguments for running the command.

  Returns:
    `(lines, 0)` with the `uniq -c` output lines ("<count> <nodepool>") on
    success, or `([], return_code)` when the command fails.
  """
  stages = (
      'kubectl get node --no-headers=true -o'
      " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'",
      "grep -v 'none'",
      'sort',
      'uniq -c',
  )
  status, output = run_command_for_value(
      ' | '.join(stages),
      'Count expected healthy nodes per nodepool',
      args,
  )
  return (output.splitlines(), 0) if status == 0 else ([], status)
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
  """Count, per node pool, the nodes whose Ready condition is True.

  Args:
    args: user provided arguments for running the command.

  Returns:
    `(lines, 0)` with the `uniq -c` output lines ("<count> <nodepool>") on
    success, or `([], return_code)` when the command fails. Node pools
    without any Ready node do not appear in the output.
  """
  stages = (
      'kubectl get node --no-headers=true -o'
      " custom-columns='NODE_NAME:metadata.name,"
      ' READY_STATUS:.status.conditions[?(@.type=="Ready")].status,'
      # The trailing space is kept so the command string stays unchanged.
      " NODEPOOL:metadata.labels.cloud\\.google\\.com/gke-nodepool' ",
      'grep -w True',
      "grep -v 'none'",
      "awk {'print $3'}",
      'sort',
      'uniq -c',
  )
  status, output = run_command_for_value(
      ' | '.join(stages), 'Count actual healthy nodes per nodepool', args
  )
  return (output.splitlines(), 0) if status == 0 else ([], status)
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
  """Count all nodes per node pool, regardless of their Ready status.

  Args:
    args: user provided arguments for running the command.

  Returns:
    `(lines, 0)` with the `uniq -c` output lines ("<count> <nodepool>") on
    success, or `([], return_code)` when the command fails.
  """
  stages = (
      'kubectl get node --no-headers=true -o'
      " custom-columns='NODE_NAME:metadata.name,"
      ' READY_STATUS:.status.conditions[?(@.type=="Ready")].status,'
      " NODEPOOL:metadata.labels.cloud\\.google\\.com/gke-nodepool'",
      "grep -v 'none'",
      "awk {'print $3'}",
      'sort',
      'uniq -c',
  )
  status, output = run_command_for_value(
      ' | '.join(stages), 'Count total nodes per nodepool', args
  )
  return (output.splitlines(), 0) if status == 0 else ([], status)
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def cluster_list(args) -> None:
  """List the GKE clusters of the configured project and zone's region.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. The function finishes through `xpk_exit`: 1 if the list command
    reported a failure, 0 otherwise.
  """
  add_zone_and_project(args)
  xpk_print(f'For project {args.project} and zone {args.zone}:', flush=True)
  list_failed = run_gke_clusters_list_command(args)
  xpk_exit(1 if list_failed else 0)
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def cluster_create_pathways(args) -> None:
  """Create a cluster with Pathways enabled and RayCluster disabled.

  Thin wrapper that fixes the two feature switches on `args` and then
  delegates to `cluster_create`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None; `cluster_create` ends through `xpk_exit`.
  """
  args.enable_ray_cluster = False
  args.enable_pathways = True
  cluster_create(args)
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
def cluster_create_ray_cluster(args) -> None:
  """Create a cluster with RayCluster enabled and autoprovisioning disabled.

  Thin wrapper that fixes the two feature switches on `args` and then
  delegates to `cluster_create`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None
  """
  args.enable_autoprovisioning = False
  args.enable_ray_cluster = True
  cluster_create(args)
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def create_cluster_if_necessary(
    args, gke_control_plane_version: str, system: SystemCharacteristics
) -> int:
  """Create the cluster unless one with the same name is already listed.

  Args:
    args: user provided arguments for running the command.
    gke_control_plane_version: version used if creating the cluster.
    system: system characteristics.

  Returns:
    0 if the cluster already exists or was created, 1 if listing the
    existing clusters failed, otherwise the result of the create command.
  """
  existing_clusters, list_code = get_all_clusters_programmatic(args)
  if list_code > 0:
    xpk_print('Listing all clusters failed!')
    return 1

  if args.cluster not in existing_clusters:
    return run_gke_cluster_create_command(
        args, gke_control_plane_version, system
    )

  xpk_print('Skipping cluster creation since it already exists.')
  return 0
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def run_gke_cluster_delete_command(args) -> int:
  """Run the Delete GKE Cluster request, then remove the cluster subnets.

  Unless `args.force` is set, the workloads on the cluster are listed first
  and the user has to confirm the deletion if there are any.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful (or if the user declined the deletion), a non-zero
    return code otherwise.
  """
  if not args.force:
    xpk_print('Get the name of the workloads in the cluster.')
    args.filter_by_status = 'EVERYTHING'
    list_code, listing = get_workload_list(args)
    if list_code != 0:
      xpk_print(f'List Job request returned ERROR {list_code}')
      return list_code

    if len(listing) > 1:
      # The first line holds the column names, not a workload.
      workloads = [row.split(' ')[0] for row in listing.splitlines()[1:]]
      if workloads:
        confirmed = get_user_input(
            f'Planning to delete {len(workloads)} workloads in the cluster'
            f' {args.cluster} including {workloads}. \nDo you wish to delete: y'
            ' (yes) / n (no):\n'
        )
        if not confirmed:
          # NOTE(review): 0 is also what a completed delete returns, so the
          # caller cannot tell a declined delete from a finished one - confirm
          # that this is intended.
          xpk_print('Skipping delete command.')
          return 0

  delete_command = (
      'gcloud beta container clusters delete'
      f' {args.cluster} --project={args.project}'
      f' --region={zone_to_region(args.zone)} --quiet'
  )
  delete_code = run_command_with_updates(delete_command, 'Cluster Delete', args)
  if delete_code != 0:
    xpk_print(f'Cluster delete request returned ERROR {delete_code}')
    return 1

  # The subnet cleanup result is the overall result from here on.
  return delete_cluster_subnets(args)
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def run_gke_clusters_list_command(args) -> int:
  """List GKE Clusters within the project and location.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  list_command = (
      'gcloud container clusters list'
      f' --project={args.project} --region={zone_to_region(args.zone)}'
  )
  list_code = run_command_with_updates(list_command, 'Cluster List', args)
  if list_code == 0:
    return 0

  xpk_print(f'Cluster list request returned ERROR {list_code}')
  return 1
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
def run_gke_cluster_create_command(
    args, gke_control_plane_version: str, system: SystemCharacteristics
) -> int:
  """Run the Create GKE Cluster request.

  Builds the `gcloud beta container clusters create` command line from the
  user arguments and the accelerator type, then runs it.

  Args:
    args: user provided arguments for running the command.
    gke_control_plane_version: version used if creating the cluster.
    system: system characteristics.

  Returns:
    0 if successful and 1 otherwise.
  """
  machine_type = args.default_pool_cpu_machine_type
  if args.cluster_cpu_machine_type != '':
    # The deprecated flag still takes precedence when it is set. The message
    # is one concatenated string so it prints as a single sentence.
    xpk_print(
        'Warning: Note that cluster-cpu-machine-type is soon to be'
        ' deprecated. Please use --default-pool-cpu-machine-type instead,'
        ' to denote the machine type of the default cpu node pool. Set'
        ' the machine type of other cpu nodepools using `--device-type`.'
    )
    machine_type = args.cluster_cpu_machine_type

  # Create the regional cluster with `num-nodes` CPU nodes in the same zone as
  # TPUs. This has been tested with clusters of 300 VMs. Larger clusters will
  # benefit from a larger initial `--num-nodes`. After the cluster is created,
  # the auto-scaler can reduce/increase the nodes based on the load.

  # If the user passes in the gke version then we use that directly instead of the rapid release.
  # This allows users to directly pass a specified gke version without release channel constraints.
  # NOTE(review): the condition below adds the rapid release channel exactly
  # when a gke version IS passed, which reads as the opposite of the comment
  # above. Behavior is left unchanged here - confirm which one is intended.
  rapid_release_cmd = ''
  if args.gke_version is not None:
    rapid_release_cmd = ' --release-channel rapid'

  command = (
      'gcloud beta container clusters create'
      f' {args.cluster} --project={args.project}'
      f' --region={zone_to_region(args.zone)}'
      f' --node-locations={args.zone}'
      f' --cluster-version={gke_control_plane_version}'
      f' --machine-type={machine_type}'
      ' --enable-autoscaling'
      ' --total-min-nodes 1 --total-max-nodes 1000'
      f' --num-nodes {args.default_pool_cpu_num_nodes}'
      f' {args.custom_cluster_arguments}'
      f' {rapid_release_cmd}'
  )

  # Several features below need --enable-ip-alias; collect the requirement
  # and append the flag once.
  enable_ip_alias = False

  if args.private or args.authorized_networks is not None:
    enable_ip_alias = True
    command += ' --enable-master-authorized-networks --enable-private-nodes'

  if system.accelerator_type == AcceleratorType['GPU']:
    enable_ip_alias = True
    command += (
        ' --enable-dataplane-v2'
        ' --enable-multi-networking --no-enable-autoupgrade'
    )
  else:
    command += ' --location-policy=BALANCED --scopes=storage-full,gke-default'

  if args.enable_pathways:
    enable_ip_alias = True

  if enable_ip_alias:
    command += ' --enable-ip-alias'

  if args.enable_ray_cluster:
    command += ' --addons RayOperator'

  return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
  if return_code != 0:
    xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
    return 1
  return 0
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
def set_cluster_command(args) -> int:
  """Fetch cluster credentials and point the current kubectl context at it.

  Runs, chained with `&&`: `gcloud container clusters get-credentials`,
  `kubectl config view`, and `kubectl config set-context` selecting the
  `default` namespace.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful, otherwise the non-zero return code of the command.
  """
  steps = (
      'gcloud container clusters get-credentials'
      f' {args.cluster} --region={zone_to_region(args.zone)}'
      f' --project={args.project}',
      'kubectl config view',
      'kubectl config set-context --current --namespace=default',
  )
  task = f'get-credentials to cluster {args.cluster}'
  status = run_command_with_updates_retry(
      ' && '.join(steps), task, args, verbose=False
  )
  if status != 0:
    xpk_print(f'{task} returned ERROR {status}')
  return status
|