xpk 0.12.0-py3-none-any.whl → 0.14.0-py3-none-any.whl
- xpk/commands/batch.py +17 -10
- xpk/commands/cluster.py +137 -123
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +13 -27
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +22 -11
- xpk/commands/job.py +53 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +26 -2
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +58 -30
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +96 -195
- xpk/core/cluster_private.py +9 -12
- xpk/core/commands.py +21 -25
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +17 -9
- xpk/core/docker_resources.py +9 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +5 -8
- xpk/core/kjob.py +19 -29
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +11 -16
- xpk/core/network.py +18 -19
- xpk/core/nodepool.py +65 -71
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +9 -5
- xpk/core/ray.py +11 -15
- xpk/core/resources.py +15 -10
- xpk/core/scheduling.py +23 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +3 -2
- xpk/parser/cluster.py +50 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/execution_context.py +28 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/kueue.py +20 -0
- xpk/utils/network.py +4 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -545
- xpk-0.12.0.dist-info/RECORD +0 -100
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
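Reading the file list, the headline change is that Kueue installation moves out of the deleted `xpk/core/kueue.py` (-545 lines) into the new `xpk/core/kueue_manager.py` (+383, with a dedicated test file), several utility modules are added (`xpk/utils/execution_context.py`, `xpk/utils/feature_flags.py`, `xpk/utils/topology.py`, `xpk/utils/kueue.py`), and `xpk/core/system_characteristics.py` is heavily reworked. The `cluster.py` diff below also shows a broad refactor in which helper functions stop taking the `args` namespace once they no longer need it.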
xpk/commands/cluster.py
CHANGED
@@ -41,17 +41,12 @@ from ..core.gcloud_context import (
     add_zone_and_project,
     get_gke_control_plane_version,
     get_gke_server_config,
+    get_cluster_location,
     zone_to_region,
 )
 from ..core.jobset import update_jobset_resources_if_necessary
 from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
-from ..core.kueue import (
-    cluster_preheat_yml,
-    install_kueue_crs,
-    install_kueue_on_cluster,
-    wait_for_kueue_available,
-    update_kueue_resources_if_necessary,
-)
+from ..core.kueue_manager import (KueueConfig, KueueManager)
 from ..core.nap import enable_autoprovisioning_on_cluster
 from ..core.network import (
     create_cluster_network_config,

@@ -65,6 +60,7 @@ from ..core.nodepool import (
 from ..core.ray import install_ray_cluster
 from ..core.mtc import install_mtc_on_cluster
 from ..core.resources import create_cluster_configmaps
+from ..core.scheduling import get_total_chips_requested_from_args
 from ..core.storage import install_storage_crd
 from ..core.system_characteristics import (
     AcceleratorType,

@@ -76,11 +72,17 @@ from ..core.vertex import create_vertex_tensorboard
 from ..core.workload import get_workload_list
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from ..utils.execution_context import is_dry_run
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from . import cluster_gcluster
 from .common import set_cluster_command
+from jinja2 import Environment, FileSystemLoader
+from ..utils.templates import TEMPLATE_PATH
 import shutil
 import os
 
+CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
+
 
 def cluster_adapt(args) -> None:
   """Function that performs cluster adaptation.

@@ -88,6 +90,12 @@ def cluster_adapt(args) -> None:
   Args:
     args: user provided arguments for running the command.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
   args.enable_pathways = False
 
   system, return_code = get_system_characteristics(args)
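The new `should_validate_dependencies`/`validate_dependencies_list` calls come from `xpk/utils/validation.py`, whose implementation is not shown in this diff. A minimal sketch of what such helpers could look like, with the `kubectl-kjob` binary name being an assumption:

```python
# Hypothetical sketch only; the real helpers live in xpk/utils/validation.py.
import enum
import shutil


class SystemDependency(enum.Enum):
  """CLI tools that xpk commands shell out to."""

  KUBECTL = 'kubectl'
  GCLOUD = 'gcloud'
  KJOB = 'kubectl-kjob'  # assumed binary name for the kjob kubectl plugin


def validate_dependencies_list(deps: list[SystemDependency]) -> None:
  """Fails fast if any required binary is missing from PATH."""
  missing = [dep.value for dep in deps if shutil.which(dep.value) is None]
  if missing:
    raise SystemExit(f"missing required dependencies: {', '.join(missing)}")
```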
@@ -108,7 +116,7 @@ def cluster_adapt(args) -> None:
         'Argument --num-nodes was not provided, trying to determine number of'
         ' nodes based on the available nodes in the cluster...'
     )
-    args.num_nodes = count_nodes_on_cluster(args, system)
+    args.num_nodes = count_nodes_on_cluster(system)
     if args.num_nodes == 0:
       xpk_print(
           'Found unexpected number of nodes. Is the --device-type correct?'

@@ -128,9 +136,10 @@ def cluster_adapt(args) -> None:
 
   get_cluster_credentials(args)
 
-  k8s_client = setup_k8s_env(args)
+  if not is_dry_run():
+    k8s_client = setup_k8s_env(args)
+    install_storage_crd(k8s_client)
 
-  install_storage_crd(k8s_client)
   install_storage_csis(args)
 
   # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
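`is_dry_run` is imported from the new `xpk/utils/execution_context.py` (+28 lines), which this diff does not show. A plausible minimal shape is a module-level flag set once at CLI startup; everything here other than `is_dry_run` itself is an assumption:

```python
# Sketch of an execution-context module; names other than is_dry_run are
# assumptions, not the actual xpk API.
_dry_run = False


def set_dry_run(value: bool) -> None:
  """Records whether this invocation should avoid side effects."""
  global _dry_run
  _dry_run = value


def is_dry_run() -> bool:
  """True when cluster-mutating calls (e.g. installing CRDs) must be skipped."""
  return _dry_run
```

Gating `setup_k8s_env` behind `is_dry_run()` means a dry run no longer needs a reachable cluster just to build a Kubernetes client.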
@@ -174,7 +183,7 @@ def cluster_adapt(args) -> None:
 
   install_kjob(args)
   if system.accelerator_type == AcceleratorType['GPU']:
-    prepare_gpus(args, system)
+    prepare_gpus(system)
 
   if args.enable_ray_cluster:
     return_code = install_ray_cluster(args, system)

@@ -186,7 +195,7 @@ def cluster_adapt(args) -> None:
   xpk_print(
       'See your GKE Cluster here:'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/kubernetes/clusters/details/{…
+      f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
   )
   xpk_exit(0)
 

@@ -197,6 +206,12 @@ def cluster_create(args) -> None:
   Args:
     args: user provided arguments for running the command.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
   system, return_code = get_system_characteristics(args)
 
   if return_code > 0 or system is None:

@@ -247,13 +262,14 @@ def cluster_create(args) -> None:
 
   get_cluster_credentials(args)
 
-  update_coredns_command_code = update_coredns_if_necessary(args)
+  update_coredns_command_code = update_coredns_if_necessary()
   if update_coredns_command_code != 0:
     xpk_exit(update_cluster_command_code)
 
-  k8s_client = setup_k8s_env(args)
+  if not is_dry_run():
+    k8s_client = setup_k8s_env(args)
+    install_storage_crd(k8s_client)
 
-  install_storage_crd(k8s_client)
   install_storage_csis(args)
 
   # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set

@@ -314,7 +330,7 @@ def cluster_create(args) -> None:
   set_jobset_on_cluster_code = set_jobset_on_cluster(args)
   if set_jobset_on_cluster_code != 0:
     xpk_exit(set_jobset_on_cluster_code)
-  update_jobset_resources_code = update_jobset_resources_if_necessary(args)
+  update_jobset_resources_code = update_jobset_resources_if_necessary()
   if update_jobset_resources_code != 0:
     xpk_exit(update_jobset_resources_code)
 

@@ -327,7 +343,7 @@ def cluster_create(args) -> None:
   install_kjob(args)
 
   if system.accelerator_type == AcceleratorType['GPU']:
-    prepare_gpus(args, system)
+    prepare_gpus(system)
 
   if args.enable_ray_cluster:
     return_code = install_ray_cluster(args, system)

@@ -345,7 +361,7 @@ def cluster_create(args) -> None:
   xpk_print(
       'See your GKE Cluster here:'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/kubernetes/clusters/details/{…
+      f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
   )
   xpk_exit(0)
 

@@ -359,6 +375,8 @@ def cluster_delete(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([SystemDependency.GCLOUD])
   xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
   add_zone_and_project(args)
 

@@ -388,6 +406,10 @@ def cluster_cacheimage(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print(
       f'Starting cluster cacheimage for cluster: {args.cluster}', flush=True
   )

@@ -403,27 +425,26 @@ def cluster_cacheimage(args) -> None:
   node_selector_key = AcceleratorTypeToAcceleratorCharacteristics[
       system.accelerator_type
   ].accelerator_label
-  yml_string = cluster_preheat_yml.format(
+
+  template_env = Environment(loader=FileSystemLoader(TEMPLATE_PATH))
+  cluster_preheat_yaml = template_env.get_template(CLUSTER_PREHEAT_JINJA_FILE)
+  rendered_yaml = cluster_preheat_yaml.render(
       cachekey=args.cache_key,
       image_name=args.docker_image,
       nodeSelectorKey=node_selector_key,
   )
-  tmp = write_tmp_file(yml_string)
-  command_apply = f'kubectl apply -f {str(tmp.file.name)}'
-  command_delete = (
-      f'kubectl delete -f {str(tmp.file.name)} --ignore-not-found=true'
-  )
+  tmp = write_tmp_file(rendered_yaml)
+  command_apply = f'kubectl apply -f {str(tmp)}'
+  command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'
 
   return_code = run_command_with_updates(
-      command_delete, 'Deleting Cached Image', args
+      command_delete, 'Deleting Cached Image'
   )
   if return_code != 0:
     xpk_print(f'Delete Cached Image returned ERROR {return_code}')
     xpk_exit(return_code)
 
-  return_code = run_command_with_updates(
-      command_apply, 'Creating Cached Image', args
-  )
+  return_code = run_command_with_updates(command_apply, 'Creating Cached Image')
   if return_code != 0:
     xpk_print(f'Create Cached Image returned ERROR {return_code}')
     xpk_exit(return_code)
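The cacheimage flow replaces a `str.format` template string (`cluster_preheat_yml`, formerly in `xpk/core/kueue.py`) with a Jinja2 template rendered from `TEMPLATE_PATH`. A self-contained sketch of the same pattern; the template directory and values here are illustrative, not xpk's actual ones:

```python
import tempfile

from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader('templates'))  # stand-in for TEMPLATE_PATH
preheat = env.get_template('cluster_preheat.yaml.j2')
rendered = preheat.render(
    cachekey='my-cache-key',
    image_name='gcr.io/my-project/my-image:latest',
    nodeSelectorKey='cloud.google.com/gke-tpu-accelerator',
)

# xpk's write_tmp_file now returns something whose str() is the file path;
# tempfile is used here only to keep the sketch self-contained.
with tempfile.NamedTemporaryFile('w', suffix='.yaml', delete=False) as tmp:
  tmp.write(rendered)
print(f'kubectl apply -f {tmp.name}')
```

Note the companion change in the hunk above: the old code read the temp path as `tmp.file.name`, while the new code relies on `str(tmp)`, consistent with the `xpk/utils/file.py` changes in the file list.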
@@ -439,12 +460,16 @@ def cluster_describe(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
   add_zone_and_project(args)
 
   get_cluster_credentials(args)
 
-  return_code, data_table = nodepools_build_table(args)
+  return_code, data_table = nodepools_build_table()
   if return_code != 0:
     xpk_exit(return_code)
 

@@ -460,7 +485,6 @@ def cluster_describe(args) -> None:
       r'kubectl get node --no-headers=true'
       r" --selector='cloud.google.com/gke-tpu-accelerator' | wc -l",
       'Count TPU Nodes',
-      args,
   )
   if return_code_node_output != 0:
     xpk_exit(return_code_node_output)

@@ -471,7 +495,6 @@ def cluster_describe(args) -> None:
       "kubectl get pod -o=custom-columns='Status:.status.phase' | grep -i"
       ' Running | wc -l',
       'Count TPU Pods',
-      args,
   )
   if return_code_pod_output != 0:
     xpk_exit(return_code_pod_output)

@@ -486,7 +509,7 @@ def cluster_describe(args) -> None:
   xpk_exit(0)
 
 
-def nodepools_build_table(args) -> tuple[int, list[list]]:
+def nodepools_build_table() -> tuple[int, list[list]]:
   table = [[
       'NODEPOOL_NAME',
       'SLICE',
@@ -498,14 +521,14 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
 
   nodepools_data = {}
 
-  nodepools, return_code = get_node_pools_name(args)
+  nodepools, return_code = get_node_pools_name()
   if return_code != 0:
     xpk_print(f'Get node pools name returned ERROR {return_code}')
 
   for name in nodepools:
     nodepools_data[name] = [name]
 
-  slices, return_code = get_slice_node_pool_size(args)
+  slices, return_code = get_slice_node_pool_size()
   if return_code != 0:
     xpk_print(f'Get slice node pool size returned ERROR {return_code}')
 

@@ -514,7 +537,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = s[0], s[1]
     nodepools_data[nodepool_name].append(count)
 
-  type_nodepool, return_code = get_node_pool_instance_type(args)
+  type_nodepool, return_code = get_node_pool_instance_type()
   if return_code != 0:
     xpk_print(f'Get node pool instance type returned ERROR {return_code}')
 

@@ -523,7 +546,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     nodepool_name, instance_type = tn[0], tn[1]
     nodepools_data[nodepool_name].append(instance_type)
 
-  expected_healthy_nodes, return_code = get_expected_healthy_nodes(args)
+  expected_healthy_nodes, return_code = get_expected_healthy_nodes()
   if return_code != 0:
     xpk_print(f'Get expected healthy nodes returned ERROR {return_code}')
 

@@ -532,7 +555,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = ehn[0], ehn[1]
     nodepools_data[nodepool_name].append(count)
 
-  actual_healthy_nodes, return_code = get_actual_healthy_nodes(args)
+  actual_healthy_nodes, return_code = get_actual_healthy_nodes()
   if return_code != 0:
     xpk_print(f'Get actual healthy nodes returned ERROR {return_code}')
 

@@ -541,7 +564,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = ahn[0], ahn[1]
     nodepools_data[nodepool_name].append(count)
 
-  total_nodes, return_code = get_total_nodes_per_node_pool(args)
+  total_nodes, return_code = get_total_nodes_per_node_pool()
   if return_code != 0:
     xpk_print(f'Get total nodes per node pool returned ERROR {return_code}')
 
@@ -556,20 +579,20 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
   return 0, table
 
 
-def get_node_pools_name(args) -> tuple[list[str], int]:
+def get_node_pools_name() -> tuple[list[str], int]:
   cmd_nodepools = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool'"
       " | grep -v 'none' | sort | uniq"
   )
-  return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list', args)
+  return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list')
   if return_code != 0:
     return [], return_code
 
   return out.splitlines(), 0
 
 
-def get_slice_node_pool_size(args) -> tuple[list[str], int]:
+def get_slice_node_pool_size() -> tuple[list[str], int]:
   cmd_slices = (
       'kubectl get node --no-headers=true -o'
       " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"

@@ -578,7 +601,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_slices, 'Count nodes per nodepool slice', args
+      cmd_slices, 'Count nodes per nodepool slice'
   )
   if return_code != 0:
     return [], return_code

@@ -586,7 +609,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_node_pool_instance_type(args) -> tuple[list[str], int]:
+def get_node_pool_instance_type() -> tuple[list[str], int]:
   cmd_type_nodepool = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool,"

@@ -594,7 +617,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
       " 'none' | sort | uniq"
   )
   return_code, out = run_command_for_value(
-      cmd_type_nodepool, 'Instance type of nodepools', args
+      cmd_type_nodepool, 'Instance type of nodepools'
   )
   if return_code != 0:
     return [], return_code

@@ -602,7 +625,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
+def get_expected_healthy_nodes() -> tuple[list[str], int]:
   cmd_expected_healthy_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"

@@ -613,7 +636,6 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
   return_code, out = run_command_for_value(
       cmd_expected_healthy_nodes,
       'Count expected healthy nodes per nodepool',
-      args,
   )
   if return_code != 0:
     return [], return_code

@@ -621,7 +643,7 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
+def get_actual_healthy_nodes() -> tuple[list[str], int]:
   cmd_actual_healthy_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODE_NAME:metadata.name,"

@@ -634,7 +656,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool', args
+      cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool'
   )
   if return_code != 0:
     return [], return_code

@@ -642,7 +664,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
+def get_total_nodes_per_node_pool() -> tuple[list[str], int]:
   cmd_total_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODE_NAME:metadata.name,"

@@ -654,7 +676,7 @@ def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_total_nodes, 'Count total nodes per nodepool', args
+      cmd_total_nodes, 'Count total nodes per nodepool'
   )
   if return_code != 0:
     return [], return_code
@@ -671,6 +693,8 @@ def cluster_list(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([SystemDependency.GCLOUD])
   add_zone_and_project(args)
   xpk_print(f'For project {args.project} and zone {args.zone}:', flush=True)
   if run_gke_clusters_list_command(args):

@@ -706,20 +730,20 @@ def cluster_create_ray_cluster(args) -> None:
   cluster_create(args)
 
 
-def install_jq(args):
+def install_jq():
   """Installs 'jq' utility."""
   if shutil.which('jq'):
     xpk_print("Task: 'Install jq' skipped, jq already installed.")
     return
   command_jq_install = 'sudo apt install jq -y'
   xpk_print("Task: 'Install jq' in progress.")
-  return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+  return_code = run_command_with_updates(command_jq_install, 'Install jq')
   if return_code != 0:
     xpk_print(f'Install jq error {return_code}')
     xpk_exit(return_code)
 
 
-def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+def clone_coredns_deployment_repo(coredns_repo_full_path: str):
   """Clones the CoreDNS deployment repository if it doesn't exist."""
   if os.path.exists(coredns_repo_full_path):
     xpk_print(

@@ -734,15 +758,13 @@ def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
       "Task: 'Clone deployment' in progress, Target"
       f' directory:{coredns_repo_full_path}.'
   )
-  return_code = run_command_with_updates(
-      command_git_clone, 'Clone deployment', args
-  )
+  return_code = run_command_with_updates(command_git_clone, 'Clone deployment')
   if return_code != 0:
     xpk_print(f'Clone deployment error {return_code}')
     xpk_exit(return_code)
 
 
-def deploy_coredns_manifests(args, coredns_k8s_path: str):
+def deploy_coredns_manifests(coredns_k8s_path: str):
   """Deploys CoreDNS manifests to the cluster."""
   if not os.path.isdir(coredns_k8s_path):
     xpk_print(

@@ -760,7 +782,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
       f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
   )
   return_code = run_command_with_updates(
-      command_deploy_coredns, 'Deploy CoreDNS', args
+      command_deploy_coredns, 'Deploy CoreDNS'
   )
   if return_code != 0:
     xpk_print(f'Deploy CoreDNS error {return_code}')

@@ -772,9 +794,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
     xpk_exit(return_code)
 
 
-def scale_down_deployment(
-    args, deployment_name: str, namespace: str = 'kube-system'
-):
+def scale_down_deployment(deployment_name: str, namespace: str = 'kube-system'):
   """Scales down a specified Kubernetes deployment to 0 replicas."""
   command = (
       f'kubectl scale deployment {deployment_name} --replicas=0'
@@ -782,29 +802,27 @@ def scale_down_deployment(
   )
   xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
   return_code = run_command_with_updates(
-      command, f'Scale down {deployment_name}', args
+      command, f'Scale down {deployment_name}'
   )
   if return_code != 0:
     xpk_print(f'Scale down {deployment_name} error {return_code}')
     xpk_exit(return_code)
-  xpk_print(f'…
+  xpk_print(f'{deployment_name} has been scaled down.')
 
 
-def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+def scale_up_coredns(replicas: int = 15, namespace: str = 'kube-system'):
   """Scales up the CoreDNS deployment to a specified number of replicas."""
   command_coredns_scale = (
       f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
   )
   xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
-  return_code = run_command_with_updates(
-      command_coredns_scale, 'Scale CoreDNS', args
-  )
+  return_code = run_command_with_updates(command_coredns_scale, 'Scale CoreDNS')
   if return_code != 0:
     xpk_print(f'Scale CoreDNS error {return_code}')
     xpk_exit(return_code)
 
 
-def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+def check_deployment_exists(deployment_name: str, namespace: str) -> bool:
   """Check for the existence of a specific Deployment in a given namespace."""
   # TODO: rewrite this to be more obvious, check if it is correct
   command = (

@@ -812,17 +830,17 @@ def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
       f' {namespace} --ignore-not-found'
   )
   result = run_command_with_updates(
-      command, 'Waiting for kubeDNS to be checked.', args
+      command, 'Waiting for kubeDNS to be checked.'
   )
   return result != 0
 
 
 def verify_coredns_readiness(
-    args, timeout: int = 240, namespace: str = 'kube-system'
+    timeout: int = 240, namespace: str = 'kube-system'
 ):
   """Verifies CoreDNS readiness using kubectl wait commands."""
   xpk_print('Now verifying CoreDNS readiness...')
-  kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+  kube_dns_exists = check_deployment_exists('kube-dns', namespace)
   if kube_dns_exists:
     # Wait for kube-dns to be fully scaled down
     command_kube_dns_wait_scaled_down = (

@@ -832,7 +850,7 @@ def verify_coredns_readiness(
     )
     xpk_print('Verifying if kube-dns has scaled down...')
     return_code_kube_dns = run_command_with_updates(
-        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down'
     )
     if return_code_kube_dns != 0:
       xpk_print('kube-dns did not scale down successfully within the timeout.')

@@ -848,7 +866,7 @@ def verify_coredns_readiness(
   )
   xpk_print('Verifying if CoreDNS is available...')
   return_code_coredns = run_command_with_updates(
-      command_coredns_wait_available, 'Wait for coredns available', args
+      command_coredns_wait_available, 'Wait for coredns available'
   )
   if return_code_coredns != 0:
     xpk_print(
@@ -873,12 +891,9 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
     xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
 
 
-def update_coredns(args) -> int:
+def update_coredns() -> int:
   """Updates and deploys CoreDNS within a cluster.
 
-  Args:
-    args: user provided arguments for running the command.
-
   Returns:
     0 if successful and 1 otherwise.
   """

@@ -887,23 +902,23 @@ def update_coredns(args) -> int:
   coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
   coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
   # 1. Install jq
-  install_jq(args)
+  install_jq()
 
   # 2. Clone CoreDNS deployment repository
-  clone_coredns_deployment_repo(args, coredns_repo_full_path)
+  clone_coredns_deployment_repo(coredns_repo_full_path)
 
   # 3. Deploy CoreDNS to the cluster
-  deploy_coredns_manifests(args, coredns_k8s_path)
+  deploy_coredns_manifests(coredns_k8s_path)
 
   # 4. Scale down kube-dns-autoscaler
-  scale_down_deployment(args, 'kube-dns-autoscaler')
+  scale_down_deployment('kube-dns-autoscaler')
 
   # 5. Scale down kube-dns
-  scale_down_deployment(args, 'kube-dns')
+  scale_down_deployment('kube-dns')
 
   # 6. Scale up coredns and verify readiness
-  scale_up_coredns(args, replicas=15)
-  verify_coredns_readiness(args, timeout=120)
+  scale_up_coredns(replicas=15)
+  verify_coredns_readiness(timeout=120)
 
   xpk_print('The CoreDNS setup process has been completed.')
 

@@ -913,7 +928,7 @@ def update_coredns(args) -> int:
   return 0
 
 
-def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+def coredns_deployment_exists(namespace: str = 'kube-system') -> bool:
   """Checks if the CoreDNS deployment exists in the given namespace.
 
   Args:

@@ -928,10 +943,10 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
       f' namespace: {namespace}'
   )
   return_code = run_command_with_updates(
-      command, f'Check CoreDNS deployment in {namespace}', args
+      command, f'Check CoreDNS deployment in {namespace}'
   )
   if return_code == 0:
-    verify_coredns_readiness(args)
+    verify_coredns_readiness()
     xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
     return True
   else:

@@ -942,25 +957,22 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
     return False
 
 
-def update_coredns_if_necessary(args) -> int:
+def update_coredns_if_necessary() -> int:
   """Updates and deploys CoreDNS within the cluster if it's not already present.
 
   This function checks for the existence of the CoreDNS deployment.
   If it's not found, it proceeds to deploy and configure CoreDNS.
 
-  Args:
-    args: User-provided arguments for running the command.
-
   Returns:
     0 if successful (CoreDNS was already present or successfully deployed),
     and 1 otherwise.
   """
-  if coredns_deployment_exists(args):
+  if coredns_deployment_exists(namespace='kube-system'):
     xpk_print('Skipping CoreDNS deployment since it already exists.')
     return 0
   else:
     xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
-    return update_coredns(args)
+    return update_coredns()
 
 
 def create_cluster_if_necessary(
@@ -1020,10 +1032,10 @@ def run_gke_cluster_delete_command(args) -> int:
   command = (
       'gcloud beta container clusters delete'
       f' {args.cluster} --project={args.project}'
-      f' --…
+      f' --location={get_cluster_location(args.project, args.cluster, args.zone)} --quiet'
   )
 
-  return_code = run_command_with_updates(command, 'Cluster Delete', args)
+  return_code = run_command_with_updates(command, 'Cluster Delete')
   if return_code != 0:
     xpk_print(f'Cluster delete request returned ERROR {return_code}')
     return 1
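`get_cluster_location` (new in `xpk/core/gcloud_context.py`, covered by the new `gcloud_context_test.py`) lets deletion and the console URLs work for both zonal and regional clusters. Its implementation is not in this diff; a hedged sketch of one way such a lookup can work:

```python
# Hypothetical sketch; the real get_cluster_location lives in
# xpk/core/gcloud_context.py and may differ.
import subprocess


def zone_to_region(zone: str) -> str:
  """'us-central1-a' -> 'us-central1'."""
  return zone.rsplit('-', 1)[0]


def get_cluster_location(project: str, cluster: str, zone: str) -> str:
  """Asks gcloud where the cluster actually lives, falling back to the region."""
  result = subprocess.run(
      [
          'gcloud', 'container', 'clusters', 'list',
          f'--project={project}',
          f'--filter=name={cluster}',
          '--format=value(location)',
      ],
      capture_output=True,
      text=True,
      check=False,
  )
  location = result.stdout.strip()
  return location or zone_to_region(zone)
```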
@@ -1046,9 +1058,9 @@ def run_gke_clusters_list_command(args) -> int:
   """
   command = (
       'gcloud container clusters list'
-      f' --project={args.project} --…
+      f' --project={args.project} --filter=location~"{zone_to_region(args.zone)}.*"'
   )
-  return_code = run_command_with_updates(command, 'Cluster List', args)
+  return_code = run_command_with_updates(command, 'Cluster List')
   if return_code != 0:
     xpk_print(f'Cluster list request returned ERROR {return_code}')
     return 1

@@ -1104,6 +1116,7 @@ def run_gke_cluster_create_command(
       f' {rapid_release_cmd}'
       ' --enable-dns-access'
       ' --autoscaling-profile=optimize-utilization'
+      ' --labels=gke_product_type=xpk'
   )
 
   enable_ip_alias = False

@@ -1157,7 +1170,7 @@ def run_gke_cluster_create_command(
   addons_str = ','.join(addons)
   command += f' --addons={addons_str}'
 
-  return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
+  return_code = run_command_with_updates(command, 'GKE Cluster Create')
   if return_code != 0:
     xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
     return 1

@@ -1203,12 +1216,12 @@ def install_storage_csis(args):
 
 def install_kjob(args):
   xpk_print('Verifying kjob installation')
-  err_code = verify_kjob_installed(args)
+  err_code = verify_kjob_installed()
   if err_code > 0:
     xpk_exit(err_code)
 
   xpk_print('Applying kjob CDRs')
-  err_code = apply_kjob_crds(args)
+  err_code = apply_kjob_crds()
   if err_code > 0:
     xpk_exit(err_code)
 

@@ -1219,42 +1232,43 @@ def install_kjob(args):
 
 def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
   xpk_print('Enabling Kueue on the cluster')
-
-  if …
-  …
+  autoprovisioning_enabled = False
+  if autoprovisioning_config:
+    # Determine total resources available based on autoprovisioning max chips.
+    autoprovisioning_enabled = True
+    total_chips = autoprovisioning_config.maximum_chips
+  else:
+    # Determine total chips based on user specified topology.
+    total_chips = get_total_chips_requested_from_args(args, system)
+  kueue_manager = KueueManager()
+  kueue_manager.install_or_upgrade(
+      KueueConfig(
+          system,
+          total_chips=total_chips,
+          autoprovisioning_enabled=autoprovisioning_enabled,
+          num_slices=args.num_slices,
+          flex=args.flex,
+          memory_limit=args.memory_limit,
+          cpu_limit=args.cpu_limit,
+          is_pathways_cluster=args.enable_pathways,
+      ),
   )
-  if enable_kueue_credentials_code != 0:
-    xpk_exit(enable_kueue_credentials_code)
-
-  xpk_print('Update Kueue Controller Manager resources')
-  update_kueue_resources_code = update_kueue_resources_if_necessary(args)
-  if update_kueue_resources_code != 0:
-    xpk_exit(update_kueue_resources_code)
 
 
-def prepare_gpus(args, system: SystemCharacteristics):
+def prepare_gpus(system: SystemCharacteristics):
   xpk_print('Installing NCCL Plugin for cluster')
-  install_nccl_code = install_nccl_on_cluster(args, system)
+  install_nccl_code = install_nccl_on_cluster(system)
   if install_nccl_code != 0:
     xpk_exit(install_nccl_code)
 
   if system.device_type == H100_DEVICE_TYPE:
     xpk_print('Installing NRI device injector for cluster')
-    install_nri_code = install_nri_on_cluster(args)
+    install_nri_code = install_nri_on_cluster()
     if install_nri_code != 0:
       xpk_exit(install_nri_code)
 
   if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
     xpk_print('Disabling MGLRU')
-    err_code = disable_mglru_on_cluster(args)
+    err_code = disable_mglru_on_cluster()
     if err_code > 0:
       xpk_exit(err_code)