xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/__init__.py +15 -0
- integration/docker_manager_test.py +102 -0
- integration/gcluster_a3mega_test.py +204 -0
- integration/gcluster_a3ultra_test.py +176 -0
- integration/gcluster_a4_test.py +176 -0
- integration/gcluster_test.py +107 -0
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +143 -117
- xpk/commands/cluster_gcluster.py +81 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/cluster_test.py +92 -0
- xpk/commands/common.py +14 -26
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +39 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +84 -29
- xpk/commands/workload_test.py +81 -0
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/blueprint/testing/__init__.py +15 -0
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +423 -0
- xpk/core/kueue_manager_test.py +574 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +350 -232
- xpk/core/system_characteristics_test.py +73 -0
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/templates/cluster_preheat.yaml.j2 +31 -0
- xpk/templates/filestore-pv.yaml +17 -0
- xpk/templates/filestore-pvc.yaml +11 -0
- xpk/templates/filestore-sc.yaml +10 -0
- xpk/templates/fuse-pv.yaml +17 -0
- xpk/templates/fuse-pvc.yaml +13 -0
- xpk/templates/kueue_config.yaml.j2 +95 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
- xpk/templates/mtc-cpc.yaml +15 -0
- xpk/templates/volume_bundle.yaml +7 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +15 -0
- xpk/utils/topology.py +46 -0
- xpk/utils/topology_test.py +63 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
- xpk-0.14.1.dist-info/RECORD +133 -0
- xpk-0.14.1.dist-info/top_level.txt +2 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- xpk-0.13.0.dist-info/top_level.txt +0 -1
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
xpk/commands/cluster.py
CHANGED
@@ -16,6 +16,7 @@ limitations under the License.
 
 from tabulate import tabulate
 
+from ..utils.feature_flags import FeatureFlags
 from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
 from ..core.cluster import (
     get_all_clusters_programmatic,
@@ -41,17 +42,12 @@ from ..core.gcloud_context import (
     add_zone_and_project,
     get_gke_control_plane_version,
     get_gke_server_config,
+    get_cluster_location,
     zone_to_region,
 )
 from ..core.jobset import update_jobset_resources_if_necessary
 from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
-from ..core.kueue import (
-    cluster_preheat_yml,
-    install_kueue_crs,
-    install_kueue_on_cluster,
-    wait_for_kueue_available,
-    update_kueue_resources_if_necessary,
-)
+from ..core.kueue_manager import (KueueConfig, KueueManager)
 from ..core.nap import enable_autoprovisioning_on_cluster
 from ..core.network import (
     create_cluster_network_config,
@@ -65,6 +61,7 @@ from ..core.nodepool import (
 from ..core.ray import install_ray_cluster
 from ..core.mtc import install_mtc_on_cluster
 from ..core.resources import create_cluster_configmaps
+from ..core.scheduling import get_total_chips_requested_from_args
 from ..core.storage import install_storage_crd
 from ..core.system_characteristics import (
     AcceleratorType,
@@ -77,11 +74,16 @@ from ..core.workload import get_workload_list
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from ..utils.execution_context import is_dry_run
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from . import cluster_gcluster
-from .common import set_cluster_command
+from .common import set_cluster_command, validate_sub_slicing_system
+from jinja2 import Environment, FileSystemLoader
+from ..utils.templates import get_templates_absolute_path
 import shutil
 import os
 
+CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
+
 
 def cluster_adapt(args) -> None:
   """Function that performs cluster adaptation.
@@ -89,6 +91,12 @@ def cluster_adapt(args) -> None:
   Args:
     args: user provided arguments for running the command.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
   args.enable_pathways = False
 
   system, return_code = get_system_characteristics(args)
@@ -109,7 +117,7 @@ def cluster_adapt(args) -> None:
         'Argument --num-nodes was not provided, trying to determine number of'
         ' nodes based on the available nodes in the cluster...'
     )
-    args.num_nodes = count_nodes_on_cluster(args, system)
+    args.num_nodes = count_nodes_on_cluster(system)
     if args.num_nodes == 0:
       xpk_print(
           'Found unexpected number of nodes. Is the --device-type correct?'
@@ -176,7 +184,7 @@ def cluster_adapt(args) -> None:
 
   install_kjob(args)
   if system.accelerator_type == AcceleratorType['GPU']:
-    prepare_gpus(args, system)
+    prepare_gpus(system)
 
   if args.enable_ray_cluster:
     return_code = install_ray_cluster(args, system)
@@ -188,23 +196,36 @@ def cluster_adapt(args) -> None:
   xpk_print(
       'See your GKE Cluster here:'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+      f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
   )
   xpk_exit(0)
 
 
+def _validate_cluster_create_args(args, system: SystemCharacteristics):
+  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
+    validate_sub_slicing_system(system)
+
+
 def cluster_create(args) -> None:
   """Function around cluster creation.
 
   Args:
     args: user provided arguments for running the command.
   """
-
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
 
+  system, return_code = get_system_characteristics(args)
   if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
+  _validate_cluster_create_args(args, system)
+
   xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
   add_zone_and_project(args)
 
@@ -249,7 +270,7 @@ def cluster_create(args) -> None:
 
   get_cluster_credentials(args)
 
-  update_coredns_command_code = update_coredns_if_necessary(args)
+  update_coredns_command_code = update_coredns_if_necessary()
   if update_coredns_command_code != 0:
     xpk_exit(update_cluster_command_code)
 
@@ -317,7 +338,7 @@ def cluster_create(args) -> None:
   set_jobset_on_cluster_code = set_jobset_on_cluster(args)
   if set_jobset_on_cluster_code != 0:
     xpk_exit(set_jobset_on_cluster_code)
-  update_jobset_resources_code = update_jobset_resources_if_necessary(args)
+  update_jobset_resources_code = update_jobset_resources_if_necessary()
   if update_jobset_resources_code != 0:
     xpk_exit(update_jobset_resources_code)
 
@@ -330,7 +351,7 @@ def cluster_create(args) -> None:
   install_kjob(args)
 
   if system.accelerator_type == AcceleratorType['GPU']:
-    prepare_gpus(args, system)
+    prepare_gpus(system)
 
   if args.enable_ray_cluster:
     return_code = install_ray_cluster(args, system)
@@ -348,7 +369,7 @@ def cluster_create(args) -> None:
   xpk_print(
       'See your GKE Cluster here:'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+      f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
   )
   xpk_exit(0)
 
@@ -362,6 +383,8 @@ def cluster_delete(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([SystemDependency.GCLOUD])
   xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
   add_zone_and_project(args)
 
@@ -391,6 +414,10 @@ def cluster_cacheimage(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print(
       f'Starting cluster cacheimage for cluster: {args.cluster}', flush=True
   )
@@ -406,25 +433,28 @@ def cluster_cacheimage(args) -> None:
   node_selector_key = AcceleratorTypeToAcceleratorCharacteristics[
       system.accelerator_type
   ].accelerator_label
-  yml_string = cluster_preheat_yml.format(
+
+  template_env = Environment(
+      loader=FileSystemLoader(searchpath=get_templates_absolute_path())
+  )
+  cluster_preheat_yaml = template_env.get_template(CLUSTER_PREHEAT_JINJA_FILE)
+  rendered_yaml = cluster_preheat_yaml.render(
       cachekey=args.cache_key,
       image_name=args.docker_image,
       nodeSelectorKey=node_selector_key,
   )
-  tmp = write_tmp_file(yml_string)
+  tmp = write_tmp_file(rendered_yaml)
   command_apply = f'kubectl apply -f {str(tmp)}'
   command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'
 
   return_code = run_command_with_updates(
-      command_delete, 'Deleting Cached Image', args
+      command_delete, 'Deleting Cached Image'
   )
   if return_code != 0:
     xpk_print(f'Delete Cached Image returned ERROR {return_code}')
     xpk_exit(return_code)
 
-  return_code = run_command_with_updates(
-      command_apply, 'Creating Cached Image', args
-  )
+  return_code = run_command_with_updates(command_apply, 'Creating Cached Image')
   if return_code != 0:
     xpk_print(f'Create Cached Image returned ERROR {return_code}')
     xpk_exit(return_code)
@@ -440,12 +470,16 @@ def cluster_describe(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
   add_zone_and_project(args)
 
   get_cluster_credentials(args)
 
-  return_code, data_table = nodepools_build_table(args)
+  return_code, data_table = nodepools_build_table()
   if return_code != 0:
     xpk_exit(return_code)
 
@@ -461,7 +495,6 @@ def cluster_describe(args) -> None:
       r'kubectl get node --no-headers=true'
       r" --selector='cloud.google.com/gke-tpu-accelerator' | wc -l",
       'Count TPU Nodes',
-      args,
   )
   if return_code_node_output != 0:
     xpk_exit(return_code_node_output)
@@ -472,7 +505,6 @@ def cluster_describe(args) -> None:
       "kubectl get pod -o=custom-columns='Status:.status.phase' | grep -i"
       ' Running | wc -l',
       'Count TPU Pods',
-      args,
   )
   if return_code_pod_output != 0:
     xpk_exit(return_code_pod_output)
@@ -487,7 +519,7 @@ def cluster_describe(args) -> None:
   xpk_exit(0)
 
 
-def nodepools_build_table(args) -> tuple[int, list[list]]:
+def nodepools_build_table() -> tuple[int, list[list]]:
   table = [[
       'NODEPOOL_NAME',
       'SLICE',
@@ -499,14 +531,14 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
 
   nodepools_data = {}
 
-  nodepools, return_code = get_node_pools_name(args)
+  nodepools, return_code = get_node_pools_name()
   if return_code != 0:
     xpk_print(f'Get node pools name returned ERROR {return_code}')
 
   for name in nodepools:
     nodepools_data[name] = [name]
 
-  slices, return_code = get_slice_node_pool_size(args)
+  slices, return_code = get_slice_node_pool_size()
   if return_code != 0:
     xpk_print(f'Get slice node pool size returned ERROR {return_code}')
 
@@ -515,7 +547,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = s[0], s[1]
     nodepools_data[nodepool_name].append(count)
 
-  type_nodepool, return_code = get_node_pool_instance_type(args)
+  type_nodepool, return_code = get_node_pool_instance_type()
   if return_code != 0:
     xpk_print(f'Get node pool instance type returned ERROR {return_code}')
 
@@ -524,7 +556,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     nodepool_name, instance_type = tn[0], tn[1]
     nodepools_data[nodepool_name].append(instance_type)
 
-  expected_healthy_nodes, return_code = get_expected_healthy_nodes(args)
+  expected_healthy_nodes, return_code = get_expected_healthy_nodes()
   if return_code != 0:
     xpk_print(f'Get expected healthy nodes returned ERROR {return_code}')
 
@@ -533,7 +565,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = ehn[0], ehn[1]
     nodepools_data[nodepool_name].append(count)
 
-  actual_healthy_nodes, return_code = get_actual_healthy_nodes(args)
+  actual_healthy_nodes, return_code = get_actual_healthy_nodes()
   if return_code != 0:
     xpk_print(f'Get actual healthy nodes returned ERROR {return_code}')
 
@@ -542,7 +574,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = ahn[0], ahn[1]
     nodepools_data[nodepool_name].append(count)
 
-  total_nodes, return_code = get_total_nodes_per_node_pool(args)
+  total_nodes, return_code = get_total_nodes_per_node_pool()
   if return_code != 0:
     xpk_print(f'Get total nodes per node pool returned ERROR {return_code}')
 
@@ -557,20 +589,20 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
   return 0, table
 
 
-def get_node_pools_name(args) -> tuple[list[str], int]:
+def get_node_pools_name() -> tuple[list[str], int]:
   cmd_nodepools = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool'"
      " | grep -v 'none' | sort | uniq"
   )
-  return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list', args)
+  return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list')
   if return_code != 0:
     return [], return_code
 
   return out.splitlines(), 0
 
 
-def get_slice_node_pool_size(args) -> tuple[list[str], int]:
+def get_slice_node_pool_size() -> tuple[list[str], int]:
   cmd_slices = (
       'kubectl get node --no-headers=true -o'
       " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
@@ -579,7 +611,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_slices, 'Count nodes per nodepool slice', args
+      cmd_slices, 'Count nodes per nodepool slice'
   )
   if return_code != 0:
     return [], return_code
@@ -587,7 +619,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_node_pool_instance_type(args) -> tuple[list[str], int]:
+def get_node_pool_instance_type() -> tuple[list[str], int]:
   cmd_type_nodepool = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool,"
@@ -595,7 +627,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
       " 'none' | sort | uniq"
   )
   return_code, out = run_command_for_value(
-      cmd_type_nodepool, 'Instance type of nodepools', args
+      cmd_type_nodepool, 'Instance type of nodepools'
   )
   if return_code != 0:
     return [], return_code
@@ -603,7 +635,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
+def get_expected_healthy_nodes() -> tuple[list[str], int]:
   cmd_expected_healthy_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
@@ -614,7 +646,6 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
   return_code, out = run_command_for_value(
       cmd_expected_healthy_nodes,
       'Count expected healthy nodes per nodepool',
-      args,
   )
   if return_code != 0:
     return [], return_code
@@ -622,7 +653,7 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
+def get_actual_healthy_nodes() -> tuple[list[str], int]:
  cmd_actual_healthy_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODE_NAME:metadata.name,"
@@ -635,7 +666,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool', args
+      cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool'
   )
   if return_code != 0:
     return [], return_code
@@ -643,7 +674,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
+def get_total_nodes_per_node_pool() -> tuple[list[str], int]:
   cmd_total_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODE_NAME:metadata.name,"
@@ -655,7 +686,7 @@ def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_total_nodes, 'Count total nodes per nodepool', args
+      cmd_total_nodes, 'Count total nodes per nodepool'
   )
   if return_code != 0:
     return [], return_code
@@ -672,6 +703,8 @@ def cluster_list(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([SystemDependency.GCLOUD])
   add_zone_and_project(args)
   xpk_print(f'For project {args.project} and zone {args.zone}:', flush=True)
   if run_gke_clusters_list_command(args):
@@ -707,20 +740,20 @@ def cluster_create_ray_cluster(args) -> None:
   cluster_create(args)
 
 
-def install_jq(args):
+def install_jq():
   """Installs 'jq' utility."""
   if shutil.which('jq'):
     xpk_print("Task: 'Install jq' skipped, jq already installed.")
     return
   command_jq_install = 'sudo apt install jq -y'
   xpk_print("Task: 'Install jq' in progress.")
-  return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+  return_code = run_command_with_updates(command_jq_install, 'Install jq')
   if return_code != 0:
     xpk_print(f'Install jq error {return_code}')
     xpk_exit(return_code)
 
 
-def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+def clone_coredns_deployment_repo(coredns_repo_full_path: str):
   """Clones the CoreDNS deployment repository if it doesn't exist."""
   if os.path.exists(coredns_repo_full_path):
     xpk_print(
@@ -735,15 +768,13 @@ def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
       "Task: 'Clone deployment' in progress, Target"
       f' directory:{coredns_repo_full_path}.'
   )
-  return_code = run_command_with_updates(
-      command_git_clone, 'Clone deployment', args
-  )
+  return_code = run_command_with_updates(command_git_clone, 'Clone deployment')
   if return_code != 0:
     xpk_print(f'Clone deployment error {return_code}')
     xpk_exit(return_code)
 
 
-def deploy_coredns_manifests(args, coredns_k8s_path: str):
+def deploy_coredns_manifests(coredns_k8s_path: str):
   """Deploys CoreDNS manifests to the cluster."""
   if not os.path.isdir(coredns_k8s_path):
     xpk_print(
@@ -761,7 +792,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
     f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
   )
   return_code = run_command_with_updates(
-      command_deploy_coredns, 'Deploy CoreDNS', args
+      command_deploy_coredns, 'Deploy CoreDNS'
   )
   if return_code != 0:
     xpk_print(f'Deploy CoreDNS error {return_code}')
@@ -773,9 +804,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
     xpk_exit(return_code)
 
 
-def scale_down_deployment(
-    args, deployment_name: str, namespace: str = 'kube-system'
-):
+def scale_down_deployment(deployment_name: str, namespace: str = 'kube-system'):
   """Scales down a specified Kubernetes deployment to 0 replicas."""
   command = (
       f'kubectl scale deployment {deployment_name} --replicas=0'
@@ -783,29 +812,27 @@ def scale_down_deployment(
   )
   xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
   return_code = run_command_with_updates(
-      command, f'Scale down {deployment_name}', args
+      command, f'Scale down {deployment_name}'
   )
   if return_code != 0:
     xpk_print(f'Scale down {deployment_name} error {return_code}')
     xpk_exit(return_code)
-  xpk_print(f'
+  xpk_print(f'{deployment_name} has been scaled down.')
 
 
-def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+def scale_up_coredns(replicas: int = 15, namespace: str = 'kube-system'):
   """Scales up the CoreDNS deployment to a specified number of replicas."""
   command_coredns_scale = (
       f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
   )
   xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
-  return_code = run_command_with_updates(
-      command_coredns_scale, 'Scale CoreDNS', args
-  )
+  return_code = run_command_with_updates(command_coredns_scale, 'Scale CoreDNS')
   if return_code != 0:
     xpk_print(f'Scale CoreDNS error {return_code}')
     xpk_exit(return_code)
 
 
-def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+def check_deployment_exists(deployment_name: str, namespace: str) -> bool:
   """Check for the existence of a specific Deployment in a given namespace."""
   # TODO: rewrite this to be more obvious, check if it is correct
   command = (
@@ -813,17 +840,17 @@ def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
       f' {namespace} --ignore-not-found'
   )
   result = run_command_with_updates(
-      command, 'Waiting for kubeDNS to be checked.', args
+      command, 'Waiting for kubeDNS to be checked.'
   )
   return result != 0
 
 
 def verify_coredns_readiness(
-    args, timeout: int = 240, namespace: str = 'kube-system'
+    timeout: int = 240, namespace: str = 'kube-system'
 ):
   """Verifies CoreDNS readiness using kubectl wait commands."""
   xpk_print('Now verifying CoreDNS readiness...')
-  kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+  kube_dns_exists = check_deployment_exists('kube-dns', namespace)
   if kube_dns_exists:
     # Wait for kube-dns to be fully scaled down
     command_kube_dns_wait_scaled_down = (
@@ -833,7 +860,7 @@ def verify_coredns_readiness(
     )
     xpk_print('Verifying if kube-dns has scaled down...')
     return_code_kube_dns = run_command_with_updates(
-        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down'
     )
     if return_code_kube_dns != 0:
       xpk_print('kube-dns did not scale down successfully within the timeout.')
@@ -849,7 +876,7 @@ def verify_coredns_readiness(
   )
   xpk_print('Verifying if CoreDNS is available...')
   return_code_coredns = run_command_with_updates(
-      command_coredns_wait_available, 'Wait for coredns available', args
+      command_coredns_wait_available, 'Wait for coredns available'
   )
   if return_code_coredns != 0:
     xpk_print(
@@ -874,12 +901,9 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
     xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
 
 
-def update_coredns(args) -> int:
+def update_coredns() -> int:
   """Updates and deploys CoreDNS within a cluster.
 
-  Args:
-    args: user provided arguments for running the command.
-
   Returns:
     0 if successful and 1 otherwise.
   """
@@ -888,23 +912,23 @@ def update_coredns(args) -> int:
   coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
   coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
   # 1. Install jq
-  install_jq(args)
+  install_jq()
 
   # 2. Clone CoreDNS deployment repository
-  clone_coredns_deployment_repo(args, coredns_repo_full_path)
+  clone_coredns_deployment_repo(coredns_repo_full_path)
 
   # 3. Deploy CoreDNS to the cluster
-  deploy_coredns_manifests(args, coredns_k8s_path)
+  deploy_coredns_manifests(coredns_k8s_path)
 
   # 4. Scale down kube-dns-autoscaler
-  scale_down_deployment(args, 'kube-dns-autoscaler')
+  scale_down_deployment('kube-dns-autoscaler')
 
   # 5. Scale down kube-dns
-  scale_down_deployment(args, 'kube-dns')
+  scale_down_deployment('kube-dns')
 
   # 6. Scale up coredns and verify readiness
-  scale_up_coredns(args, replicas=15)
-  verify_coredns_readiness(args, timeout=120)
+  scale_up_coredns(replicas=15)
+  verify_coredns_readiness(timeout=120)
 
   xpk_print('The CoreDNS setup process has been completed.')
 
@@ -914,7 +938,7 @@ def update_coredns(args) -> int:
   return 0
 
 
-def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+def coredns_deployment_exists(namespace: str = 'kube-system') -> bool:
   """Checks if the CoreDNS deployment exists in the given namespace.
 
   Args:
@@ -929,10 +953,10 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
       f' namespace: {namespace}'
   )
   return_code = run_command_with_updates(
-      command, f'Check CoreDNS deployment in {namespace}', args
+      command, f'Check CoreDNS deployment in {namespace}'
   )
   if return_code == 0:
-    verify_coredns_readiness(args)
+    verify_coredns_readiness()
     xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
     return True
   else:
@@ -943,25 +967,22 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
     return False
 
 
-def update_coredns_if_necessary(args) -> int:
+def update_coredns_if_necessary() -> int:
   """Updates and deploys CoreDNS within the cluster if it's not already present.
 
   This function checks for the existence of the CoreDNS deployment.
   If it's not found, it proceeds to deploy and configure CoreDNS.
 
-  Args:
-    args: User-provided arguments for running the command.
-
   Returns:
     0 if successful (CoreDNS was already present or successfully deployed),
     and 1 otherwise.
   """
-  if coredns_deployment_exists(args, namespace='kube-system'):
+  if coredns_deployment_exists(namespace='kube-system'):
    xpk_print('Skipping CoreDNS deployment since it already exists.')
     return 0
   else:
     xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
-    return update_coredns(args)
+    return update_coredns()
 
 
 def create_cluster_if_necessary(
@@ -1021,10 +1042,10 @@ def run_gke_cluster_delete_command(args) -> int:
   command = (
       'gcloud beta container clusters delete'
       f' {args.cluster} --project={args.project}'
-      f' --region={zone_to_region(args.zone)} --quiet'
+      f' --location={get_cluster_location(args.project, args.cluster, args.zone)} --quiet'
   )
 
-  return_code = run_command_with_updates(command, 'Cluster Delete', args)
+  return_code = run_command_with_updates(command, 'Cluster Delete')
   if return_code != 0:
     xpk_print(f'Cluster delete request returned ERROR {return_code}')
     return 1
@@ -1047,9 +1068,9 @@ def run_gke_clusters_list_command(args) -> int:
   """
   command = (
       'gcloud container clusters list'
-      f' --project={args.project} --region={zone_to_region(args.zone)}'
+      f' --project={args.project} --filter=location~"{zone_to_region(args.zone)}.*"'
   )
-  return_code = run_command_with_updates(command, 'Cluster List', args)
+  return_code = run_command_with_updates(command, 'Cluster List')
   if return_code != 0:
     xpk_print(f'Cluster list request returned ERROR {return_code}')
     return 1
@@ -1105,6 +1126,7 @@ def run_gke_cluster_create_command(
       f' {rapid_release_cmd}'
       ' --enable-dns-access'
       ' --autoscaling-profile=optimize-utilization'
+      ' --labels=gke_product_type=xpk'
   )
 
   enable_ip_alias = False
@@ -1158,7 +1180,7 @@ def run_gke_cluster_create_command(
   addons_str = ','.join(addons)
   command += f' --addons={addons_str}'
 
-  return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
+  return_code = run_command_with_updates(command, 'GKE Cluster Create')
   if return_code != 0:
     xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
     return 1
@@ -1204,12 +1226,12 @@ def install_storage_csis(args):
 
 def install_kjob(args):
   xpk_print('Verifying kjob installation')
-  err_code = verify_kjob_installed(args)
+  err_code = verify_kjob_installed()
   if err_code > 0:
     xpk_exit(err_code)
 
   xpk_print('Applying kjob CDRs')
-  err_code = apply_kjob_crds(args)
+  err_code = apply_kjob_crds()
   if err_code > 0:
     xpk_exit(err_code)
 
@@ -1220,42 +1242,46 @@ def install_kjob(args):
 
 def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
   xpk_print('Enabling Kueue on the cluster')
-
-  if
-
-
-
-
-
-
-
-
-
+  autoprovisioning_enabled = False
+  if autoprovisioning_config:
+    # Determine total resources available based on autoprovisioning max chips.
+    autoprovisioning_enabled = True
+    total_chips = autoprovisioning_config.maximum_chips
+  else:
+    # Determine total chips based on user specified topology.
+    total_chips = get_total_chips_requested_from_args(args, system)
+  kueue_manager = KueueManager()
+  kueue_manager.install_or_upgrade(
+      KueueConfig(
+          system,
+          total_chips=total_chips,
+          autoprovisioning_enabled=autoprovisioning_enabled,
+          num_slices=args.num_slices,
+          flex=args.flex,
+          memory_limit=args.memory_limit,
+          cpu_limit=args.cpu_limit,
+          is_pathways_cluster=args.enable_pathways,
+          configure_sub_slicing=(
+              FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
+          ),
+      ),
   )
-  if enable_kueue_credentials_code != 0:
-    xpk_exit(enable_kueue_credentials_code)
-
-  xpk_print('Update Kueue Controller Manager resources')
-  update_kueue_resources_code = update_kueue_resources_if_necessary(args)
-  if update_kueue_resources_code != 0:
-    xpk_exit(update_kueue_resources_code)
 
 
-def prepare_gpus(args, system: SystemCharacteristics):
+def prepare_gpus(system: SystemCharacteristics):
   xpk_print('Installing NCCL Plugin for cluster')
-  install_nccl_code = install_nccl_on_cluster(args, system)
+  install_nccl_code = install_nccl_on_cluster(system)
   if install_nccl_code != 0:
     xpk_exit(install_nccl_code)
 
   if system.device_type == H100_DEVICE_TYPE:
     xpk_print('Installing NRI device injector for cluster')
-    install_nri_code = install_nri_on_cluster(args)
+    install_nri_code = install_nri_on_cluster()
    if install_nri_code != 0:
       xpk_exit(install_nri_code)
 
   if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
     xpk_print('Disabling MGLRU')
-    err_code = disable_mglru_on_cluster(args)
+    err_code = disable_mglru_on_cluster()
     if err_code > 0:
       xpk_exit(err_code)