xpk 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +128 -115
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +10 -28
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +43 -22
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py
CHANGED
```diff
@@ -41,17 +41,12 @@ from ..core.gcloud_context import (
     add_zone_and_project,
     get_gke_control_plane_version,
     get_gke_server_config,
+    get_cluster_location,
     zone_to_region,
 )
 from ..core.jobset import update_jobset_resources_if_necessary
 from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
-from ..core.kueue import (
-    cluster_preheat_yml,
-    install_kueue_crs,
-    install_kueue_on_cluster,
-    wait_for_kueue_available,
-    update_kueue_resources_if_necessary,
-)
+from ..core.kueue_manager import (KueueConfig, KueueManager)
 from ..core.nap import enable_autoprovisioning_on_cluster
 from ..core.network import (
     create_cluster_network_config,
@@ -65,6 +60,7 @@ from ..core.nodepool import (
 from ..core.ray import install_ray_cluster
 from ..core.mtc import install_mtc_on_cluster
 from ..core.resources import create_cluster_configmaps
+from ..core.scheduling import get_total_chips_requested_from_args
 from ..core.storage import install_storage_crd
 from ..core.system_characteristics import (
     AcceleratorType,
@@ -77,11 +73,16 @@ from ..core.workload import get_workload_list
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from ..utils.execution_context import is_dry_run
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from . import cluster_gcluster
 from .common import set_cluster_command
+from jinja2 import Environment, FileSystemLoader
+from ..utils.templates import TEMPLATE_PATH
 import shutil
 import os
 
+CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
+
 
 def cluster_adapt(args) -> None:
   """Function that performs cluster adaptation.
```
```diff
@@ -89,6 +90,12 @@ def cluster_adapt(args) -> None:
   Args:
     args: user provided arguments for running the command.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
   args.enable_pathways = False
 
   system, return_code = get_system_characteristics(args)
```
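Every cluster subcommand now opens with the same dependency gate; the pattern above recurs in the `cluster_create`, `cluster_delete`, `cluster_cacheimage`, `cluster_describe`, and `cluster_list` hunks below, varying only the dependency list. Only the names and call sites come from this diff; what follows is a hypothetical sketch of what the helpers in `xpk/utils/validation.py` might look like, for orientation:

```python
import shutil
from enum import Enum


class SystemDependency(Enum):
  # Member names match the diff; the values are illustrative binary names
  # (kjob ships as a kubectl plugin, so its binary may be 'kubectl-kjob').
  KUBECTL = 'kubectl'
  KJOB = 'kubectl-kjob'
  GCLOUD = 'gcloud'


def validate_dependencies_list(deps: list[SystemDependency]) -> None:
  # Hypothetical: fail fast when a required CLI is missing from PATH.
  missing = [dep.value for dep in deps if shutil.which(dep.value) is None]
  if missing:
    raise RuntimeError(f'Missing required system dependencies: {missing}')
```

Gating on `should_validate_dependencies(args)` keeps the check skippable while still failing fast before the first gcloud or kubectl call is attempted.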
```diff
@@ -109,7 +116,7 @@ def cluster_adapt(args) -> None:
         'Argument --num-nodes was not provided, trying to determine number of'
         ' nodes based on the available nodes in the cluster...'
     )
-    args.num_nodes = count_nodes_on_cluster(args, system)
+    args.num_nodes = count_nodes_on_cluster(system)
     if args.num_nodes == 0:
       xpk_print(
           'Found unexpected number of nodes. Is the --device-type correct?'
@@ -176,7 +183,7 @@ def cluster_adapt(args) -> None:
 
   install_kjob(args)
   if system.accelerator_type == AcceleratorType['GPU']:
-    prepare_gpus(args, system)
+    prepare_gpus(system)
 
   if args.enable_ray_cluster:
     return_code = install_ray_cluster(args, system)
```
```diff
@@ -188,7 +195,7 @@ def cluster_adapt(args) -> None:
   xpk_print(
       'See your GKE Cluster here:'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+      f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
   )
   xpk_exit(0)
 
```
```diff
@@ -199,6 +206,12 @@ def cluster_create(args) -> None:
   Args:
     args: user provided arguments for running the command.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
   system, return_code = get_system_characteristics(args)
 
   if return_code > 0 or system is None:
@@ -249,7 +262,7 @@ def cluster_create(args) -> None:
 
   get_cluster_credentials(args)
 
-  update_coredns_command_code = update_coredns_if_necessary(args)
+  update_coredns_command_code = update_coredns_if_necessary()
   if update_coredns_command_code != 0:
     xpk_exit(update_cluster_command_code)
 
@@ -317,7 +330,7 @@ def cluster_create(args) -> None:
   set_jobset_on_cluster_code = set_jobset_on_cluster(args)
   if set_jobset_on_cluster_code != 0:
     xpk_exit(set_jobset_on_cluster_code)
-  update_jobset_resources_code = update_jobset_resources_if_necessary(args)
+  update_jobset_resources_code = update_jobset_resources_if_necessary()
   if update_jobset_resources_code != 0:
     xpk_exit(update_jobset_resources_code)
 
@@ -330,7 +343,7 @@ def cluster_create(args) -> None:
   install_kjob(args)
 
   if system.accelerator_type == AcceleratorType['GPU']:
-    prepare_gpus(args, system)
+    prepare_gpus(system)
 
   if args.enable_ray_cluster:
     return_code = install_ray_cluster(args, system)
```
```diff
@@ -348,7 +361,7 @@ def cluster_create(args) -> None:
   xpk_print(
       'See your GKE Cluster here:'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+      f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
   )
   xpk_exit(0)
 
```
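Both "See your GKE Cluster here:" URLs, and the gcloud delete command later in this file, now derive the cluster's location from the new `get_cluster_location(project, cluster, zone)` helper in `xpk/core/gcloud_context.py` (+26 lines, plus a new test file) rather than mapping the zone to a region. The diff shows only the call signature; a hypothetical sketch of such a lookup, reusing this module's own command helpers (`run_command_for_value` and `zone_to_region` come from the surrounding codebase; the body is illustrative, not the real implementation):

```python
def get_cluster_location(project: str, cluster: str, zone: str) -> str:
  # Hypothetical: ask gcloud where the cluster actually lives, so regional
  # clusters resolve correctly, falling back to the zone-derived region.
  command = (
      'gcloud container clusters list'
      f' --project={project} --filter=name={cluster}'
      " --format='value(location)'"
  )
  return_code, out = run_command_for_value(command, 'Get cluster location')
  if return_code == 0 and out.strip():
    return out.strip()
  return zone_to_region(zone)
```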
```diff
@@ -362,6 +375,8 @@ def cluster_delete(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([SystemDependency.GCLOUD])
   xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
   add_zone_and_project(args)
 
@@ -391,6 +406,10 @@ def cluster_cacheimage(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print(
       f'Starting cluster cacheimage for cluster: {args.cluster}', flush=True
   )
@@ -406,25 +425,26 @@ def cluster_cacheimage(args) -> None:
   node_selector_key = AcceleratorTypeToAcceleratorCharacteristics[
       system.accelerator_type
   ].accelerator_label
-  yml_string = cluster_preheat_yml.format(
+
+  template_env = Environment(loader=FileSystemLoader(TEMPLATE_PATH))
+  cluster_preheat_yaml = template_env.get_template(CLUSTER_PREHEAT_JINJA_FILE)
+  rendered_yaml = cluster_preheat_yaml.render(
       cachekey=args.cache_key,
       image_name=args.docker_image,
       nodeSelectorKey=node_selector_key,
   )
-  tmp = write_tmp_file(yml_string)
+  tmp = write_tmp_file(rendered_yaml)
   command_apply = f'kubectl apply -f {str(tmp)}'
   command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'
 
   return_code = run_command_with_updates(
-      command_delete, 'Deleting Cached Image', args
+      command_delete, 'Deleting Cached Image'
   )
   if return_code != 0:
     xpk_print(f'Delete Cached Image returned ERROR {return_code}')
     xpk_exit(return_code)
 
-  return_code = run_command_with_updates(
-      command_apply, 'Creating Cached Image', args
-  )
+  return_code = run_command_with_updates(command_apply, 'Creating Cached Image')
   if return_code != 0:
     xpk_print(f'Create Cached Image returned ERROR {return_code}')
     xpk_exit(return_code)
```
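The cacheimage manifest moves from a Python string constant (`cluster_preheat_yml`, previously imported from the deleted `xpk/core/kueue.py`) to a packaged Jinja2 template, `cluster_preheat.yaml.j2`, loaded from `TEMPLATE_PATH`. The rendering calls are standard Jinja2; a self-contained sketch of the same pattern, with a placeholder template directory:

```python
from jinja2 import Environment, FileSystemLoader

# Same pattern as the hunk above: load templates from a directory, pick one by
# name, and render it with keyword arguments. The directory here is a
# placeholder; xpk points the loader at its own TEMPLATE_PATH.
template_env = Environment(loader=FileSystemLoader('/path/to/templates'))
template = template_env.get_template('cluster_preheat.yaml.j2')
rendered_yaml = template.render(
    cachekey='my-cache-key',
    image_name='gcr.io/my-project/my-image:latest',
    nodeSelectorKey='cloud.google.com/gke-tpu-accelerator',
)
print(rendered_yaml)
```

Rendering with `render()` instead of `str.format()` avoids escaping every literal `{` in the YAML and lets the template grow conditionals without touching Python code.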
```diff
@@ -440,12 +460,16 @@ def cluster_describe(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
   add_zone_and_project(args)
 
   get_cluster_credentials(args)
 
-  return_code, data_table = nodepools_build_table(args)
+  return_code, data_table = nodepools_build_table()
   if return_code != 0:
     xpk_exit(return_code)
 
@@ -461,7 +485,6 @@ def cluster_describe(args) -> None:
       r'kubectl get node --no-headers=true'
       r" --selector='cloud.google.com/gke-tpu-accelerator' | wc -l",
       'Count TPU Nodes',
-      args,
   )
   if return_code_node_output != 0:
     xpk_exit(return_code_node_output)
@@ -472,7 +495,6 @@ def cluster_describe(args) -> None:
       "kubectl get pod -o=custom-columns='Status:.status.phase' | grep -i"
       ' Running | wc -l',
       'Count TPU Pods',
-      args,
   )
   if return_code_pod_output != 0:
     xpk_exit(return_code_pod_output)
@@ -487,7 +509,7 @@ def cluster_describe(args) -> None:
   xpk_exit(0)
 
 
-def nodepools_build_table(args) -> tuple[int, list[list]]:
+def nodepools_build_table() -> tuple[int, list[list]]:
   table = [[
       'NODEPOOL_NAME',
       'SLICE',
@@ -499,14 +521,14 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
 
   nodepools_data = {}
 
-  nodepools, return_code = get_node_pools_name(args)
+  nodepools, return_code = get_node_pools_name()
   if return_code != 0:
     xpk_print(f'Get node pools name returned ERROR {return_code}')
 
   for name in nodepools:
     nodepools_data[name] = [name]
 
-  slices, return_code = get_slice_node_pool_size(args)
+  slices, return_code = get_slice_node_pool_size()
   if return_code != 0:
     xpk_print(f'Get slice node pool size returned ERROR {return_code}')
 
@@ -515,7 +537,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = s[0], s[1]
     nodepools_data[nodepool_name].append(count)
 
-  type_nodepool, return_code = get_node_pool_instance_type(args)
+  type_nodepool, return_code = get_node_pool_instance_type()
   if return_code != 0:
     xpk_print(f'Get node pool instance type returned ERROR {return_code}')
 
@@ -524,7 +546,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     nodepool_name, instance_type = tn[0], tn[1]
     nodepools_data[nodepool_name].append(instance_type)
 
-  expected_healthy_nodes, return_code = get_expected_healthy_nodes(args)
+  expected_healthy_nodes, return_code = get_expected_healthy_nodes()
   if return_code != 0:
     xpk_print(f'Get expected healthy nodes returned ERROR {return_code}')
 
@@ -533,7 +555,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = ehn[0], ehn[1]
     nodepools_data[nodepool_name].append(count)
 
-  actual_healthy_nodes, return_code = get_actual_healthy_nodes(args)
+  actual_healthy_nodes, return_code = get_actual_healthy_nodes()
   if return_code != 0:
     xpk_print(f'Get actual healthy nodes returned ERROR {return_code}')
 
@@ -542,7 +564,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = ahn[0], ahn[1]
     nodepools_data[nodepool_name].append(count)
 
-  total_nodes, return_code = get_total_nodes_per_node_pool(args)
+  total_nodes, return_code = get_total_nodes_per_node_pool()
   if return_code != 0:
     xpk_print(f'Get total nodes per node pool returned ERROR {return_code}')
 
@@ -557,20 +579,20 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
   return 0, table
 
 
-def get_node_pools_name(args) -> tuple[list[str], int]:
+def get_node_pools_name() -> tuple[list[str], int]:
   cmd_nodepools = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool'"
       " | grep -v 'none' | sort | uniq"
   )
-  return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list', args)
+  return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list')
   if return_code != 0:
     return [], return_code
 
   return out.splitlines(), 0
 
 
-def get_slice_node_pool_size(args) -> tuple[list[str], int]:
+def get_slice_node_pool_size() -> tuple[list[str], int]:
   cmd_slices = (
       'kubectl get node --no-headers=true -o'
       " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
```
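After the refactor, the nodepool helpers take no arguments and all share one shape: run a `kubectl get node ... | sort | uniq -c`-style pipeline via `run_command_for_value`, return `(lines, return_code)`, and let `nodepools_build_table` unpack each line as `s[0], s[1]` (count, then nodepool name). A small sketch of that parsing step with made-up input; only the unpacking convention comes from the diff, the `split()` is assumed:

```python
def parse_uniq_c_lines(lines: list[str]) -> dict[str, str]:
  # `uniq -c` emits '  <count> <value>' per line; a whitespace split gives
  # s[0] == count and s[1] == nodepool name, matching the unpacking above.
  counts = {}
  for line in lines:
    s = line.split()
    count, nodepool_name = s[0], s[1]
    counts[nodepool_name] = count
  return counts


print(parse_uniq_c_lines(['      4 pool-0', '      2 pool-1']))
# -> {'pool-0': '4', 'pool-1': '2'}
```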
```diff
@@ -579,7 +601,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_slices, 'Count nodes per nodepool slice', args
+      cmd_slices, 'Count nodes per nodepool slice'
   )
   if return_code != 0:
     return [], return_code
@@ -587,7 +609,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_node_pool_instance_type(args) -> tuple[list[str], int]:
+def get_node_pool_instance_type() -> tuple[list[str], int]:
   cmd_type_nodepool = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool,"
@@ -595,7 +617,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
       " 'none' | sort | uniq"
   )
   return_code, out = run_command_for_value(
-      cmd_type_nodepool, 'Instance type of nodepools', args
+      cmd_type_nodepool, 'Instance type of nodepools'
   )
   if return_code != 0:
     return [], return_code
@@ -603,7 +625,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
+def get_expected_healthy_nodes() -> tuple[list[str], int]:
   cmd_expected_healthy_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
@@ -614,7 +636,6 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
   return_code, out = run_command_for_value(
       cmd_expected_healthy_nodes,
       'Count expected healthy nodes per nodepool',
-      args,
   )
   if return_code != 0:
     return [], return_code
@@ -622,7 +643,7 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
+def get_actual_healthy_nodes() -> tuple[list[str], int]:
   cmd_actual_healthy_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODE_NAME:metadata.name,"
@@ -635,7 +656,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool', args
+      cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool'
   )
   if return_code != 0:
     return [], return_code
@@ -643,7 +664,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
+def get_total_nodes_per_node_pool() -> tuple[list[str], int]:
  cmd_total_nodes = (
      'kubectl get node --no-headers=true -o'
      " custom-columns='NODE_NAME:metadata.name,"
@@ -655,7 +676,7 @@ def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_total_nodes, 'Count total nodes per nodepool', args
+      cmd_total_nodes, 'Count total nodes per nodepool'
   )
   if return_code != 0:
     return [], return_code
@@ -672,6 +693,8 @@ def cluster_list(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([SystemDependency.GCLOUD])
   add_zone_and_project(args)
   xpk_print(f'For project {args.project} and zone {args.zone}:', flush=True)
   if run_gke_clusters_list_command(args):
@@ -707,20 +730,20 @@ def cluster_create_ray_cluster(args) -> None:
   cluster_create(args)
 
 
-def install_jq(args):
+def install_jq():
   """Installs 'jq' utility."""
   if shutil.which('jq'):
     xpk_print("Task: 'Install jq' skipped, jq already installed.")
     return
   command_jq_install = 'sudo apt install jq -y'
   xpk_print("Task: 'Install jq' in progress.")
-  return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+  return_code = run_command_with_updates(command_jq_install, 'Install jq')
   if return_code != 0:
     xpk_print(f'Install jq error {return_code}')
     xpk_exit(return_code)
 
 
-def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+def clone_coredns_deployment_repo(coredns_repo_full_path: str):
   """Clones the CoreDNS deployment repository if it doesn't exist."""
   if os.path.exists(coredns_repo_full_path):
     xpk_print(
@@ -735,15 +758,13 @@ def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
       "Task: 'Clone deployment' in progress, Target"
       f' directory:{coredns_repo_full_path}.'
   )
-  return_code = run_command_with_updates(
-      command_git_clone, 'Clone deployment', args
-  )
+  return_code = run_command_with_updates(command_git_clone, 'Clone deployment')
   if return_code != 0:
     xpk_print(f'Clone deployment error {return_code}')
     xpk_exit(return_code)
 
 
-def deploy_coredns_manifests(args, coredns_k8s_path: str):
+def deploy_coredns_manifests(coredns_k8s_path: str):
   """Deploys CoreDNS manifests to the cluster."""
   if not os.path.isdir(coredns_k8s_path):
     xpk_print(
@@ -761,7 +782,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
       f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
   )
   return_code = run_command_with_updates(
-      command_deploy_coredns, 'Deploy CoreDNS', args
+      command_deploy_coredns, 'Deploy CoreDNS'
   )
   if return_code != 0:
     xpk_print(f'Deploy CoreDNS error {return_code}')
```
```diff
@@ -773,9 +794,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
     xpk_exit(return_code)
 
 
-def scale_down_deployment(
-    args, deployment_name: str, namespace: str = 'kube-system'
-):
+def scale_down_deployment(deployment_name: str, namespace: str = 'kube-system'):
   """Scales down a specified Kubernetes deployment to 0 replicas."""
   command = (
       f'kubectl scale deployment {deployment_name} --replicas=0'
@@ -783,29 +802,27 @@ def scale_down_deployment(
   )
   xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
   return_code = run_command_with_updates(
-      command, f'Scale down {deployment_name}', args
+      command, f'Scale down {deployment_name}'
   )
   if return_code != 0:
     xpk_print(f'Scale down {deployment_name} error {return_code}')
     xpk_exit(return_code)
-  xpk_print(f'
+  xpk_print(f'{deployment_name} has been scaled down.')
 
 
-def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+def scale_up_coredns(replicas: int = 15, namespace: str = 'kube-system'):
   """Scales up the CoreDNS deployment to a specified number of replicas."""
   command_coredns_scale = (
       f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
   )
   xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
-  return_code = run_command_with_updates(
-      command_coredns_scale, 'Scale CoreDNS', args
-  )
+  return_code = run_command_with_updates(command_coredns_scale, 'Scale CoreDNS')
   if return_code != 0:
     xpk_print(f'Scale CoreDNS error {return_code}')
     xpk_exit(return_code)
 
 
-def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+def check_deployment_exists(deployment_name: str, namespace: str) -> bool:
   """Check for the existence of a specific Deployment in a given namespace."""
   # TODO: rewrite this to be more obvious, check if it is correct
   command = (
@@ -813,17 +830,17 @@ def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
       f' {namespace} --ignore-not-found'
   )
   result = run_command_with_updates(
-      command, 'Waiting for kubeDNS to be checked.', args
+      command, 'Waiting for kubeDNS to be checked.'
   )
   return result != 0
 
 
 def verify_coredns_readiness(
-    args, timeout: int = 240, namespace: str = 'kube-system'
+    timeout: int = 240, namespace: str = 'kube-system'
 ):
   """Verifies CoreDNS readiness using kubectl wait commands."""
   xpk_print('Now verifying CoreDNS readiness...')
-  kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+  kube_dns_exists = check_deployment_exists('kube-dns', namespace)
   if kube_dns_exists:
     # Wait for kube-dns to be fully scaled down
     command_kube_dns_wait_scaled_down = (
@@ -833,7 +850,7 @@ def verify_coredns_readiness(
     )
     xpk_print('Verifying if kube-dns has scaled down...')
     return_code_kube_dns = run_command_with_updates(
-        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down'
     )
     if return_code_kube_dns != 0:
       xpk_print('kube-dns did not scale down successfully within the timeout.')
@@ -849,7 +866,7 @@ def verify_coredns_readiness(
   )
   xpk_print('Verifying if CoreDNS is available...')
   return_code_coredns = run_command_with_updates(
-      command_coredns_wait_available, 'Wait for coredns available', args
+      command_coredns_wait_available, 'Wait for coredns available'
   )
   if return_code_coredns != 0:
     xpk_print(
```
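`verify_coredns_readiness` keeps its behavior (wait for kube-dns to drain, then for the coredns deployment to become available) and only loses the `args` parameter. The wait commands themselves sit outside these hunks; a plausible shape for them, assuming standard `kubectl wait` flags rather than xpk's exact strings:

```python
# Hypothetical command strings in the style of this module; the real ones are
# built inside verify_coredns_readiness and are not visible in these hunks.
namespace, timeout = 'kube-system', 240
command_kube_dns_wait_scaled_down = (
    f'kubectl wait deployment/kube-dns -n {namespace}'
    f" --for=jsonpath='{{.spec.replicas}}'=0 --timeout={timeout}s"
)
command_coredns_wait_available = (
    f'kubectl wait deployment/coredns -n {namespace}'
    f' --for=condition=Available --timeout={timeout}s'
)
```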
```diff
@@ -874,12 +891,9 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
     xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
 
 
-def update_coredns(args) -> int:
+def update_coredns() -> int:
   """Updates and deploys CoreDNS within a cluster.
 
-  Args:
-    args: user provided arguments for running the command.
-
   Returns:
     0 if successful and 1 otherwise.
   """
@@ -888,23 +902,23 @@ def update_coredns(args) -> int:
   coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
   coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
   # 1. Install jq
-  install_jq(args)
+  install_jq()
 
   # 2. Clone CoreDNS deployment repository
-  clone_coredns_deployment_repo(args, coredns_repo_full_path)
+  clone_coredns_deployment_repo(coredns_repo_full_path)
 
   # 3. Deploy CoreDNS to the cluster
-  deploy_coredns_manifests(args, coredns_k8s_path)
+  deploy_coredns_manifests(coredns_k8s_path)
 
   # 4. Scale down kube-dns-autoscaler
-  scale_down_deployment(args, 'kube-dns-autoscaler')
+  scale_down_deployment('kube-dns-autoscaler')
 
   # 5. Scale down kube-dns
-  scale_down_deployment(args, 'kube-dns')
+  scale_down_deployment('kube-dns')
 
   # 6. Scale up coredns and verify readiness
-  scale_up_coredns(args, replicas=15)
-  verify_coredns_readiness(args, timeout=120)
+  scale_up_coredns(replicas=15)
+  verify_coredns_readiness(timeout=120)
 
   xpk_print('The CoreDNS setup process has been completed.')
 
@@ -914,7 +928,7 @@ def update_coredns(args) -> int:
   return 0
 
 
-def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+def coredns_deployment_exists(namespace: str = 'kube-system') -> bool:
   """Checks if the CoreDNS deployment exists in the given namespace.
 
   Args:
@@ -929,10 +943,10 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
       f' namespace: {namespace}'
   )
   return_code = run_command_with_updates(
-      command, f'Check CoreDNS deployment in {namespace}', args
+      command, f'Check CoreDNS deployment in {namespace}'
   )
   if return_code == 0:
-    verify_coredns_readiness(args)
+    verify_coredns_readiness()
     xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
     return True
   else:
@@ -943,25 +957,22 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
     return False
 
 
-def update_coredns_if_necessary(args) -> int:
+def update_coredns_if_necessary() -> int:
   """Updates and deploys CoreDNS within the cluster if it's not already present.
 
   This function checks for the existence of the CoreDNS deployment.
   If it's not found, it proceeds to deploy and configure CoreDNS.
 
-  Args:
-    args: User-provided arguments for running the command.
-
   Returns:
     0 if successful (CoreDNS was already present or successfully deployed),
     and 1 otherwise.
   """
-  if coredns_deployment_exists(args, namespace='kube-system'):
+  if coredns_deployment_exists(namespace='kube-system'):
     xpk_print('Skipping CoreDNS deployment since it already exists.')
     return 0
   else:
     xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
-    return update_coredns(args)
+    return update_coredns()
 
 
 def create_cluster_if_necessary(
```
```diff
@@ -1021,10 +1032,10 @@ def run_gke_cluster_delete_command(args) -> int:
   command = (
       'gcloud beta container clusters delete'
       f' {args.cluster} --project={args.project}'
-      f' --region={zone_to_region(args.zone)} --quiet'
+      f' --location={get_cluster_location(args.project, args.cluster, args.zone)} --quiet'
   )
 
-  return_code = run_command_with_updates(command, 'Cluster Delete', args)
+  return_code = run_command_with_updates(command, 'Cluster Delete')
   if return_code != 0:
     xpk_print(f'Cluster delete request returned ERROR {return_code}')
     return 1
@@ -1047,9 +1058,9 @@ def run_gke_clusters_list_command(args) -> int:
   """
   command = (
       'gcloud container clusters list'
-      f' --project={args.project} --region={zone_to_region(args.zone)}'
+      f' --project={args.project} --filter=location~"{zone_to_region(args.zone)}.*"'
   )
-  return_code = run_command_with_updates(command, 'Cluster List', args)
+  return_code = run_command_with_updates(command, 'Cluster List')
   if return_code != 0:
     xpk_print(f'Cluster list request returned ERROR {return_code}')
     return 1
```
```diff
@@ -1105,6 +1116,7 @@ def run_gke_cluster_create_command(
       f' {rapid_release_cmd}'
       ' --enable-dns-access'
       ' --autoscaling-profile=optimize-utilization'
+      ' --labels=gke_product_type=xpk'
   )
 
   enable_ip_alias = False
@@ -1158,7 +1170,7 @@ def run_gke_cluster_create_command(
   addons_str = ','.join(addons)
   command += f' --addons={addons_str}'
 
-  return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
+  return_code = run_command_with_updates(command, 'GKE Cluster Create')
   if return_code != 0:
     xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
     return 1
@@ -1204,12 +1216,12 @@ def install_storage_csis(args):
 
 def install_kjob(args):
   xpk_print('Verifying kjob installation')
-  err_code = verify_kjob_installed(args)
+  err_code = verify_kjob_installed()
   if err_code > 0:
     xpk_exit(err_code)
 
   xpk_print('Applying kjob CDRs')
-  err_code = apply_kjob_crds(args)
+  err_code = apply_kjob_crds()
   if err_code > 0:
     xpk_exit(err_code)
 
```
```diff
@@ -1220,42 +1232,43 @@ def install_kjob(args):
 
 def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
   xpk_print('Enabling Kueue on the cluster')
-
-  if
-  ...
+  autoprovisioning_enabled = False
+  if autoprovisioning_config:
+    # Determine total resources available based on autoprovisioning max chips.
+    autoprovisioning_enabled = True
+    total_chips = autoprovisioning_config.maximum_chips
+  else:
+    # Determine total chips based on user specified topology.
+    total_chips = get_total_chips_requested_from_args(args, system)
+  kueue_manager = KueueManager()
+  kueue_manager.install_or_upgrade(
+      KueueConfig(
+          system,
+          total_chips=total_chips,
+          autoprovisioning_enabled=autoprovisioning_enabled,
+          num_slices=args.num_slices,
+          flex=args.flex,
+          memory_limit=args.memory_limit,
+          cpu_limit=args.cpu_limit,
+          is_pathways_cluster=args.enable_pathways,
+      ),
   )
-  if enable_kueue_credentials_code != 0:
-    xpk_exit(enable_kueue_credentials_code)
-
-  xpk_print('Update Kueue Controller Manager resources')
-  update_kueue_resources_code = update_kueue_resources_if_necessary(args)
-  if update_kueue_resources_code != 0:
-    xpk_exit(update_kueue_resources_code)
 
 
-def prepare_gpus(args, system: SystemCharacteristics):
+def prepare_gpus(system: SystemCharacteristics):
   xpk_print('Installing NCCL Plugin for cluster')
-  install_nccl_code = install_nccl_on_cluster(args, system)
+  install_nccl_code = install_nccl_on_cluster(system)
   if install_nccl_code != 0:
     xpk_exit(install_nccl_code)
 
   if system.device_type == H100_DEVICE_TYPE:
     xpk_print('Installing NRI device injector for cluster')
-    install_nri_code = install_nri_on_cluster(args)
+    install_nri_code = install_nri_on_cluster()
     if install_nri_code != 0:
       xpk_exit(install_nri_code)
 
   if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
     xpk_print('Disabling MGLRU')
-    err_code = disable_mglru_on_cluster(args)
+    err_code = disable_mglru_on_cluster()
     if err_code > 0:
       xpk_exit(err_code)
```
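The removed block in `install_kueue` was the old multi-step Kueue flow (`install_kueue_on_cluster`, `wait_for_kueue_available`, `install_kueue_crs`, `update_kueue_resources_if_necessary`, all imported from the deleted `xpk/core/kueue.py`); its middle lines did not survive extraction, but the surviving removals show the per-step return-code checks it needed. It is replaced by one facade call into the new `xpk/core/kueue_manager.py` (+383 lines). A minimal usage sketch mirroring the call site, with placeholder values for everything the hunk reads from `args`:

```python
from xpk.core.kueue_manager import KueueConfig, KueueManager

# Mirrors the install_kueue call site above; every literal is a placeholder,
# and `system` stands in for a SystemCharacteristics instance.
kueue_manager = KueueManager()
kueue_manager.install_or_upgrade(
    KueueConfig(
        system,
        total_chips=256,              # autoprovisioning max chips, or from topology
        autoprovisioning_enabled=False,
        num_slices=2,
        flex=False,
        memory_limit='64Gi',
        cpu_limit=100,
        is_pathways_cluster=False,
    )
)
```

Folding install, readiness wait, custom-resource creation, and resource tuning behind `install_or_upgrade` means the command layer no longer sequences Kueue steps or checks their individual return codes.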