xpk 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/METADATA +456 -32
  88. xpk-0.7.0.dist-info/RECORD +92 -0
  89. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/WHEEL +1 -1
  90. xpk-0.7.0.dist-info/entry_points.txt +2 -0
  91. xpk-0.5.0.dist-info/RECORD +0 -7
  92. xpk-0.5.0.dist-info/entry_points.txt +0 -2
  93. xpk.py +0 -7282
  94. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/LICENSE +0 -0
  95. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py
@@ -0,0 +1,808 @@
+ """
+ Copyright 2024 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ from tabulate import tabulate
+
+ from ..core.capacity import H100_DEVICE_TYPE
+ from ..core.cluster import (
+     get_all_clusters_programmatic,
+     get_cluster_credentials,
+     install_nccl_on_cluster,
+     set_jobset_on_cluster,
+     setup_k8s_env,
+     update_cluster_with_gcsfuse_driver_if_necessary,
+     update_cluster_with_workload_identity_if_necessary,
+ )
+ from ..core.cluster_private import authorize_private_cluster_access_if_necessary
+ from ..core.commands import run_command_for_value, run_command_with_updates
+ from ..core.config import VERTEX_TENSORBOARD_FEATURE_FLAG
+ from ..core.gcloud_context import (
+     add_zone_and_project,
+     get_gke_control_plane_version,
+     get_gke_server_config,
+     zone_to_region,
+ )
+ from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
+ from ..core.kueue import (
+     cluster_preheat_yml,
+     install_kueue_crs,
+     install_kueue_on_cluster,
+     wait_for_kueue_available,
+ )
+ from ..core.nap import enable_autoprovisioning_on_cluster
+ from ..core.network import (
+     create_cluster_network_config,
+     delete_cluster_subnets,
+     set_up_cluster_network_for_gpu,
+ )
+ from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command
+ from ..core.ray import install_ray_cluster
+ from ..core.resources import create_cluster_configmaps
+ from ..core.storage import install_storage_crd
+ from ..core.system_characteristics import (
+     AcceleratorType,
+     AcceleratorTypeToAcceleratorCharacteristics,
+     SystemCharacteristics,
+     get_system_characteristics,
+ )
+ from ..core.vertex import create_vertex_tensorboard
+ from ..core.workload import get_workload_list
+ from ..utils.console import get_user_input, xpk_exit, xpk_print
+ from ..utils.file import write_tmp_file
+ from . import cluster_gcluster
+ from .common import set_cluster_command
+ from ..core.cluster import update_cluster_with_gcpfilestore_driver_if_necessary
+
+
+ def cluster_create(args) -> None:
+   """Function around cluster creation.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   system, return_code = get_system_characteristics(args)
+
+   if return_code > 0:
+     xpk_print('Fetching system characteristics failed!')
+     xpk_exit(return_code)
+
+   xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
+   add_zone_and_project(args)
+
+   if system.device_type in cluster_gcluster.supported_device_types:
+     xpk_print(
+         'Creating the cluster using Cluster Toolkit. Machine Type:'
+         f' {system.gce_machine_type} ...'
+     )
+     cluster_gcluster.cluster_create(args)
+     xpk_exit(0)
+
+   return_code, gke_server_config = get_gke_server_config(args)
+   if return_code != 0:
+     xpk_exit(return_code)
+
+   return_code, gke_control_plane_version = get_gke_control_plane_version(
+       args, gke_server_config
+   )
+   if return_code != 0:
+     xpk_exit(return_code)
+
+   create_cluster_command_code = create_cluster_if_necessary(
+       args, gke_control_plane_version, system
+   )
+   if create_cluster_command_code != 0:
+     xpk_exit(create_cluster_command_code)
+
+   authorize_private_cluster_access_command_code = (
+       authorize_private_cluster_access_if_necessary(args)
+   )
+   if authorize_private_cluster_access_command_code != 0:
+     xpk_exit(authorize_private_cluster_access_command_code)
+
+   # ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
+   # Enable WorkloadIdentity if not enabled already.
+   if (
+       args.enable_workload_identity
+       or args.enable_gcsfuse_csi_driver
+       or args.enable_gcpfilestore_csi_driver
+   ):
+     update_cluster_command_code = (
+         update_cluster_with_workload_identity_if_necessary(args)
+     )
+     if update_cluster_command_code != 0:
+       xpk_exit(update_cluster_command_code)
+
+   # Enable GCSFuse CSI Driver if not enabled already.
+   if args.enable_gcsfuse_csi_driver:
+     update_cluster_command_code = (
+         update_cluster_with_gcsfuse_driver_if_necessary(args)
+     )
+     if update_cluster_command_code != 0:
+       xpk_exit(update_cluster_command_code)
+
+   if args.enable_gcpfilestore_csi_driver:
+     update_cluster_command_code = (
+         update_cluster_with_gcpfilestore_driver_if_necessary(args)
+     )
+     if update_cluster_command_code != 0:
+       xpk_exit(update_cluster_command_code)
+
+   # Update Pathways clusters with CloudDNS if not enabled already.
+
+   get_cluster_credentials(args)
+
+   # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
+   tensorboard_config = {}
+   if VERTEX_TENSORBOARD_FEATURE_FLAG and args.create_vertex_tensorboard:
+     tensorboard_config = create_vertex_tensorboard(args)
+     # exit if failed to create Tensorboard in Vertex AI
+     if not tensorboard_config:
+       xpk_exit(1)
+
+   if system.accelerator_type == AcceleratorType['GPU']:
+     xpk_print('Setting up Network for cluster')
+     set_up_cluster_network_code = set_up_cluster_network_for_gpu(args, system)
+     if set_up_cluster_network_code != 0:
+       xpk_exit(set_up_cluster_network_code)
+
+     if system.device_type == H100_DEVICE_TYPE:
+       xpk_print('Creating Network Config for cluster')
+       create_cluster_network_config_code = create_cluster_network_config(args)
+       if create_cluster_network_config_code != 0:
+         xpk_exit(create_cluster_network_config_code)
+
+   # Check the control plane version of the cluster and determine the node pool
+   # version to use.
+   return_code, gke_node_pool_version = get_gke_node_pool_version(
+       args, gke_server_config
+   )
+   if return_code != 0:
+     xpk_exit(return_code)
+
+   run_gke_node_pool_create_command_code = run_gke_node_pool_create_command(
+       args, system, gke_node_pool_version
+   )
+   if run_gke_node_pool_create_command_code != 0:
+     xpk_exit(run_gke_node_pool_create_command_code)
+
+   # Provision node pools dynamically based on incoming workloads:
+   # Currently autoprovisioning is not supported with Pathways.
+   autoprovisioning_config = None
+   if not args.enable_pathways and args.enable_autoprovisioning:
+     xpk_print('Enabling Autoprovisioning')
+     autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
+         args, system
+     )
+     if return_code != 0:
+       xpk_exit(return_code)
+
+   xpk_print('Creating ConfigMap for cluster')
+   create_cluster_configmaps_code = create_cluster_configmaps(
+       args, system, tensorboard_config, autoprovisioning_config
+   )
+   if create_cluster_configmaps_code != 0:
+     xpk_exit(create_cluster_configmaps_code)
+
+   xpk_print(
+       'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
+       ' globally available'
+   )
+   set_jobset_on_cluster_code = set_jobset_on_cluster(args)
+   if set_jobset_on_cluster_code != 0:
+     xpk_exit(set_jobset_on_cluster_code)
+
+   xpk_print('Enabling Kueue on the cluster')
+   install_kueue_on_cluster_code = install_kueue_on_cluster(args)
+   if install_kueue_on_cluster_code != 0:
+     xpk_exit(install_kueue_on_cluster_code)
+
+   xpk_print('Verifying kjob installation')
+   err_code = verify_kjob_installed(args)
+   if err_code > 0:
+     xpk_exit(err_code)
+
+   xpk_print('Applying kjob CRDs')
+   err_code = apply_kjob_crds(args)
+   if err_code > 0:
+     xpk_exit(err_code)
+
+   err_code = prepare_kjob(args)
+   if err_code > 0:
+     xpk_exit(err_code)
+
+   k8s_client = setup_k8s_env(args)
+   install_storage_crd(k8s_client)
+
+   xpk_print('Wait for Kueue to be fully available')
+   wait_for_kueue_available_code = wait_for_kueue_available(args)
+   if wait_for_kueue_available_code != 0:
+     xpk_exit(wait_for_kueue_available_code)
+
+   xpk_print('Install Kueue Custom Resources')
+   enable_kueue_credentials_code = install_kueue_crs(
+       args, system, autoprovisioning_config
+   )
+   if enable_kueue_credentials_code != 0:
+     xpk_exit(enable_kueue_credentials_code)
+
+   if system.accelerator_type == AcceleratorType['GPU']:
+     xpk_print('Installing NCCL Plugin for cluster')
+     install_nccl_code = install_nccl_on_cluster(args, system)
+     if install_nccl_code != 0:
+       xpk_exit(install_nccl_code)
+
+   if args.enable_ray_cluster:
+     return_code = install_ray_cluster(args, system)
+     if return_code != 0:
+       xpk_print('Installation of RayCluster failed.')
+       xpk_exit(return_code)
+
+   xpk_print('GKE commands done! Resources are created.')
+   xpk_print(
+       'See your GKE Cluster here:'
+       # pylint: disable=line-too-long
+       f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+   )
+   xpk_exit(0)
+
+
+ def cluster_delete(args) -> None:
+   """Function around cluster delete.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
+   add_zone_and_project(args)
+
+   if cluster_gcluster.created_by_gcluster(args):
+     xpk_print(f'Deleting {args.cluster} cluster using Cluster Toolkit...')
+     cluster_gcluster.cluster_delete(args)
+     xpk_exit(0)
+
+   set_cluster_command_code = set_cluster_command(args)
+   if set_cluster_command_code != 0:
+     xpk_exit(set_cluster_command_code)
+
+   run_gke_cluster_delete_command_code = run_gke_cluster_delete_command(args)
+
+   if run_gke_cluster_delete_command_code != 0:
+     xpk_exit(run_gke_cluster_delete_command_code)
+   xpk_print(f'GKE commands done! Cluster {args.cluster} deleted.\n')
+   xpk_exit(0)
+
+
+ def cluster_cacheimage(args) -> None:
+   """Function around cluster cacheimage.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   xpk_print(
+       f'Starting cluster cacheimage for cluster: {args.cluster}', flush=True
+   )
+   add_zone_and_project(args)
+
+   get_cluster_credentials(args)
+   system, return_code = get_system_characteristics(args)
+
+   if return_code > 0:
+     xpk_print('Fetching system characteristics failed!')
+     xpk_exit(return_code)
+
+   node_selector_key = AcceleratorTypeToAcceleratorCharacteristics[
+       system.accelerator_type
+   ].accelerator_label
+   yml_string = cluster_preheat_yml.format(
+       cachekey=args.cache_key,
+       image_name=args.docker_image,
+       nodeSelectorKey=node_selector_key,
+   )
+   tmp = write_tmp_file(yml_string)
+   command_apply = f'kubectl apply -f {str(tmp.file.name)}'
+   command_delete = (
+       f'kubectl delete -f {str(tmp.file.name)} --ignore-not-found=true'
+   )
+
+   return_code = run_command_with_updates(
+       command_delete, 'Deleting Cached Image', args
+   )
+   if return_code != 0:
+     xpk_print(f'Delete Cached Image returned ERROR {return_code}')
+     xpk_exit(return_code)
+
+   return_code = run_command_with_updates(
+       command_apply, 'Creating Cached Image', args
+   )
+   if return_code != 0:
+     xpk_print(f'Create Cached Image returned ERROR {return_code}')
+     xpk_exit(return_code)
+   xpk_exit(0)
+
+
+ def cluster_describe(args) -> None:
+   """Function around cluster describe.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
+   add_zone_and_project(args)
+
+   get_cluster_credentials(args)
+
+   return_code, data_table = nodepools_build_table(args)
+   if return_code != 0:
+     xpk_exit(return_code)
+
+   if len(data_table) > 1:
+     xpk_print(
+         'Nodepools info:\n',
+         tabulate(data_table, headers='firstrow', tablefmt='plain'),
+     )
+   else:
+     xpk_print('No nodepools info found')
+
+   return_code_node_output, node_output = run_command_for_value(
+       r'kubectl get node --no-headers=true'
+       r" --selector='cloud.google.com/gke-tpu-accelerator' | wc -l",
+       'Count TPU Nodes',
+       args,
+   )
+   if return_code_node_output != 0:
+     xpk_exit(return_code_node_output)
+   node_output = node_output.splitlines()[-1]
+   number_tpu_vms_in_cluster = int(node_output)
+
+   return_code_pod_output, pod_output = run_command_for_value(
+       "kubectl get pod -o=custom-columns='Status:.status.phase' | grep -i"
+       ' Running | wc -l',
+       'Count TPU Pods',
+       args,
+   )
+   if return_code_pod_output != 0:
+     xpk_exit(return_code_pod_output)
+   number_tpu_pods_in_cluster = int(pod_output)
+
+   xpk_print(
+       f'The cluster contains {number_tpu_vms_in_cluster} TPUVMs of which'
+       f' {number_tpu_pods_in_cluster} are in use.'
+   )
+
+   xpk_print('GKE commands done!\n')
+   xpk_exit(0)
+
+
+ def nodepools_build_table(args) -> tuple[int, list[list]]:
+   table = [[
+       'NODEPOOL_NAME',
+       'SLICE',
+       'TYPE',
+       'EXPECTED_HEALTHY_NODES',
+       'ACTUAL_HEALTHY_NODES',
+       'TOTAL_NODES',
+   ]]
+
+   nodepools_data = {}
+
+   nodepools, return_code = get_node_pools_name(args)
+   if return_code != 0:
+     xpk_print(f'Get node pools name returned ERROR {return_code}')
+
+   for name in nodepools:
+     nodepools_data[name] = [name]
+
+   slices, return_code = get_slice_node_pool_size(args)
+   if return_code != 0:
+     xpk_print(f'Get slice node pool size returned ERROR {return_code}')
+
+   for line in slices:
+     s = line.split()
+     count, nodepool_name = s[0], s[1]
+     nodepools_data[nodepool_name].append(count)
+
+   type_nodepool, return_code = get_node_pool_instance_type(args)
+   if return_code != 0:
+     xpk_print(f'Get node pool instance type returned ERROR {return_code}')
+
+   for line in type_nodepool:
+     tn = line.split()
+     nodepool_name, instance_type = tn[0], tn[1]
+     nodepools_data[nodepool_name].append(instance_type)
+
+   expected_healthy_nodes, return_code = get_expected_healthy_nodes(args)
+   if return_code != 0:
+     xpk_print(f'Get expected healthy nodes returned ERROR {return_code}')
+
+   for line in expected_healthy_nodes:
+     ehn = line.split()
+     count, nodepool_name = ehn[0], ehn[1]
+     nodepools_data[nodepool_name].append(count)
+
+   actual_healthy_nodes, return_code = get_actual_healthy_nodes(args)
+   if return_code != 0:
+     xpk_print(f'Get actual healthy nodes returned ERROR {return_code}')
+
+   for line in actual_healthy_nodes:
+     ahn = line.split()
+     count, nodepool_name = ahn[0], ahn[1]
+     nodepools_data[nodepool_name].append(count)
+
+   total_nodes, return_code = get_total_nodes_per_node_pool(args)
+   if return_code != 0:
+     xpk_print(f'Get total nodes per node pool returned ERROR {return_code}')
+
+   for line in total_nodes:
+     tn = line.split()
+     count, nodepool_name = tn[0], tn[1]
+     nodepools_data[nodepool_name].append(count)
+
+   for _, np_data in nodepools_data.items():
+     table.append(np_data)
+
+   return 0, table
+
+
+ def get_node_pools_name(args) -> tuple[list[str], int]:
+   cmd_nodepools = (
+       'kubectl get node --no-headers=true -o'
+       " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool'"
+       " | grep -v 'none' | sort | uniq"
+   )
+   return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list', args)
+   if return_code != 0:
+     return [], return_code
+
+   return out.splitlines(), 0
+
+
+ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
+   cmd_slices = (
+       'kubectl get node --no-headers=true -o'
+       " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
+       " | grep -v 'none'"
+       ' | sort'
+       ' | uniq -c'
+   )
+   return_code, out = run_command_for_value(
+       cmd_slices, 'Count nodes per nodepool slice', args
+   )
+   if return_code != 0:
+     return [], return_code
+
+   return out.splitlines(), 0
+
+
+ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
+   cmd_type_nodepool = (
+       'kubectl get node --no-headers=true -o'
+       " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool,"
+       " TYPE:.metadata.labels.node\\.kubernetes\\.io/instance-type' | grep -v"
+       " 'none' | sort | uniq"
+   )
+   return_code, out = run_command_for_value(
+       cmd_type_nodepool, 'Instance type of nodepools', args
+   )
+   if return_code != 0:
+     return [], return_code
+
+   return out.splitlines(), 0
+
+
+ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
+   cmd_expected_healthy_nodes = (
+       'kubectl get node --no-headers=true -o'
+       " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
+       " | grep -v 'none'"
+       ' | sort'
+       ' | uniq -c'
+   )
+   return_code, out = run_command_for_value(
+       cmd_expected_healthy_nodes,
+       'Count expected healthy nodes per nodepool',
+       args,
+   )
+   if return_code != 0:
+     return [], return_code
+
+   return out.splitlines(), 0
+
+
+ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
+   cmd_actual_healthy_nodes = (
+       'kubectl get node --no-headers=true -o'
+       " custom-columns='NODE_NAME:metadata.name,"
+       ' READY_STATUS:.status.conditions[?(@.type=="Ready")].status,'
+       " NODEPOOL:metadata.labels.cloud\\.google\\.com/gke-nodepool' "
+       ' | grep -w True'
+       " | grep -v 'none'"
+       " | awk {'print $3'}"
+       ' | sort'
+       ' | uniq -c'
+   )
+   return_code, out = run_command_for_value(
+       cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool', args
+   )
+   if return_code != 0:
+     return [], return_code
+
+   return out.splitlines(), 0
+
+
+ def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
+   cmd_total_nodes = (
+       'kubectl get node --no-headers=true -o'
+       " custom-columns='NODE_NAME:metadata.name,"
+       ' READY_STATUS:.status.conditions[?(@.type=="Ready")].status,'
+       " NODEPOOL:metadata.labels.cloud\\.google\\.com/gke-nodepool'"
+       " | grep -v 'none'"
+       " | awk {'print $3'}"
+       ' | sort'
+       ' | uniq -c'
+   )
+   return_code, out = run_command_for_value(
+       cmd_total_nodes, 'Count total nodes per nodepool', args
+   )
+   if return_code != 0:
+     return [], return_code
+
+   return out.splitlines(), 0
+
+
+ def cluster_list(args) -> None:
+   """Function around cluster list.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   add_zone_and_project(args)
+   xpk_print(f'For project {args.project} and zone {args.zone}:', flush=True)
+   if run_gke_clusters_list_command(args):
+     xpk_exit(1)
+   xpk_exit(0)
+
+
+ def cluster_create_pathways(args) -> None:
+   """Function around cluster creation for Pathways.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   args.enable_pathways = True
+   args.enable_ray_cluster = False
+   cluster_create(args)
+
+
+ def cluster_create_ray_cluster(args) -> None:
+   """Function around cluster creation for RayCluster.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     None
+   """
+   args.enable_ray_cluster = True
+   args.enable_autoprovisioning = False
+   cluster_create(args)
+
+
+ def create_cluster_if_necessary(
+     args, gke_control_plane_version: str, system: SystemCharacteristics
+ ) -> int:
+   """Creates cluster if not present in the project.
+
+   Args:
+     args: user provided arguments for running the command.
+     gke_control_plane_version: version used if creating the cluster.
+     system: system characteristics.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   all_clusters, return_code = get_all_clusters_programmatic(args)
+   if return_code > 0:
+     xpk_print('Listing all clusters failed!')
+     return 1
+   if args.cluster in all_clusters:
+     xpk_print('Skipping cluster creation since it already exists.')
+     return 0
+   else:
+     return run_gke_cluster_create_command(
+         args, gke_control_plane_version, system
+     )
+
+
+ def run_gke_cluster_delete_command(args) -> int:
+   """Run the Delete GKE Cluster request.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   if not args.force:
+     xpk_print('Get the name of the workloads in the cluster.')
+     args.filter_by_status = 'EVERYTHING'
+     return_code, return_value = get_workload_list(args)
+     if return_code != 0:
+       xpk_print(f'List Job request returned ERROR {return_code}')
+       return return_code
+
+     # Ignore Column Names line.
+     if len(return_value) > 1:
+       workloads = [x.split(' ')[0] for x in return_value.splitlines()][1:]
+       if workloads and not get_user_input(
+           f'Planning to delete {len(workloads)} workloads in the cluster'
+           f' {args.cluster} including {workloads}. \nDo you wish to delete: y'
+           ' (yes) / n (no):\n'
+       ):
+         xpk_print('Skipping delete command.')
+         return 0
+
+   command = (
+       'gcloud beta container clusters delete'
+       f' {args.cluster} --project={args.project}'
+       f' --region={zone_to_region(args.zone)} --quiet'
+   )
+
+   return_code = run_command_with_updates(command, 'Cluster Delete', args)
+   if return_code != 0:
+     xpk_print(f'Cluster delete request returned ERROR {return_code}')
+     return 1
+
+   return_code = delete_cluster_subnets(args)
+   if return_code != 0:
+     return return_code
+
+   return 0
+
+
+ def run_gke_clusters_list_command(args) -> int:
+   """List GKE Clusters within the project and location.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   command = (
+       'gcloud container clusters list'
+       f' --project={args.project} --region={zone_to_region(args.zone)}'
+   )
+   return_code = run_command_with_updates(command, 'Cluster List', args)
+   if return_code != 0:
+     xpk_print(f'Cluster list request returned ERROR {return_code}')
+     return 1
+
+   return 0
+
+
+ def run_gke_cluster_create_command(
+     args, gke_control_plane_version: str, system: SystemCharacteristics
+ ) -> int:
+   """Run the Create GKE Cluster request.
+
+   Args:
+     args: user provided arguments for running the command.
+     gke_control_plane_version: version used if creating the cluster.
+     system: system characteristics.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   machine_type = args.default_pool_cpu_machine_type
+   if args.cluster_cpu_machine_type != '':
+     xpk_print(
+         'Warning: Note that cluster-cpu-machine-type is soon to be',
+         ' deprecated. Please use --default-pool-cpu-machine-type instead,'
+         ' to denote the machine type of the default cpu node pool. Set'
+         ' the machine type of other cpu nodepools using `--device-type`.',
+     )
+     machine_type = args.cluster_cpu_machine_type
+
+   # Create the regional cluster with `num-nodes` CPU nodes in the same zone as
+   # TPUs. This has been tested with clusters of 300 VMs. Larger clusters will
+   # benefit from a larger initial `--num-nodes`. After the cluster is created,
+   # the auto-scaler can reduce/increase the nodes based on the load.
+
+   # If the user passes in the gke version then we use that directly instead of the rapid release.
+   # This allows users to directly pass a specified gke version without release channel constraints.
+   rapid_release_cmd = ''
+   if args.gke_version is not None:
+     rapid_release_cmd = ' --release-channel rapid'
+
+   command = (
+       'gcloud beta container clusters create'
+       f' {args.cluster} --project={args.project}'
+       f' --region={zone_to_region(args.zone)}'
+       f' --node-locations={args.zone}'
+       f' --cluster-version={gke_control_plane_version}'
+       f' --machine-type={machine_type}'
+       ' --enable-autoscaling'
+       ' --total-min-nodes 1 --total-max-nodes 1000'
+       f' --num-nodes {args.default_pool_cpu_num_nodes}'
+       f' {args.custom_cluster_arguments}'
+       f' {rapid_release_cmd}'
+   )
+
+   enable_ip_alias = False
+
+   if args.private or args.authorized_networks is not None:
+     enable_ip_alias = True
+     command += ' --enable-master-authorized-networks --enable-private-nodes'
+
+   if system.accelerator_type == AcceleratorType['GPU']:
+     enable_ip_alias = True
+     command += (
+         ' --enable-dataplane-v2'
+         ' --enable-multi-networking --no-enable-autoupgrade'
+     )
+   else:
+     command += ' --location-policy=BALANCED --scopes=storage-full,gke-default'
+
+   if args.enable_pathways:
+     enable_ip_alias = True
+
+   if enable_ip_alias:
+     command += ' --enable-ip-alias'
+
+   if args.enable_ray_cluster:
+     command += ' --addons RayOperator'
+
+   if (
+       args.enable_workload_identity
+       or args.enable_gcsfuse_csi_driver
+       or args.enable_gcpfilestore_csi_driver
+   ):
+     command += f' --workload-pool={args.project}.svc.id.goog'
+
+   addons = []
+   if args.enable_gcsfuse_csi_driver:
+     addons.append('GcsFuseCsiDriver')
+
+   if args.enable_gcpfilestore_csi_driver:
+     addons.append('GcpFilestoreCsiDriver')
+
+   if len(addons) > 0:
+     addons_str = ','.join(addons)
+     command += f' --addons={addons_str}'
+
+   return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
+   if return_code != 0:
+     xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
+     return 1
+   return 0
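
Note on using the restructured package: 0.7.0 removes the monolithic xpk.py (-7282 lines) in favor of the xpk/ package diffed above, with a console-script entry point recorded in entry_points.txt. Each command function in xpk/commands/cluster.py takes a single argparse-style namespace and terminates through xpk_exit() rather than returning, so every setup step (cluster create, JobSet, Kueue, kjob, optional NCCL/RayCluster) either succeeds or aborts the process with that step's return code. Below is a minimal sketch of driving the packaged CLI in-process; it assumes the entry point target is xpk.main:main (the entry_points.txt contents are not shown in this diff), the project/zone values are placeholders, --cluster/--project/--zone mirror the args.cluster/args.project/args.zone attributes read above, and the --tpu-type/--num-slices pair follows xpk's README examples. Running it for real shells out to gcloud and kubectl with your active credentials.

# sketch.py -- illustrative only, not a documented xpk API
import sys

from xpk.main import main  # assumed console-script target (xpk.main:main)

# Equivalent to the CLI invocation:
#   xpk cluster create --cluster=demo-v5e --tpu-type=v5litepod-16 --num-slices=2 ...
sys.argv = [
    'xpk', 'cluster', 'create',
    '--cluster=demo-v5e',        # becomes args.cluster in cluster_create()
    '--project=my-gcp-project',  # placeholder; add_zone_and_project() can
    '--zone=us-west4-a',         #   also pick these up from gcloud config
    '--tpu-type=v5litepod-16',   # resolved by get_system_characteristics()
    '--num-slices=2',
]

try:
  main()
except SystemExit as exit_status:
  # xpk commands never fall off the end: xpk_exit() raises SystemExit with
  # 0 on success or the failing step's return code.
  print(f'xpk finished with exit code {exit_status.code}')

The same exit-code funnel explains the control flow above: helpers return int codes, command functions convert any nonzero code into an immediate xpk_exit(), and only run_gke_cluster_delete_command() pauses for confirmation (skippable with --force).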