xpk 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +33 -12
- xpk/commands/cluster_gcluster_test.py +5 -1
- xpk/commands/cluster_test.py +125 -0
- xpk/commands/config.py +3 -3
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +2 -0
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/workload.py +125 -139
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +3 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +2 -0
- xpk/core/cluster.py +18 -47
- xpk/core/cluster_test.py +76 -1
- xpk/core/config.py +81 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/kjob.py +17 -16
- xpk/core/kueue_manager.py +13 -19
- xpk/core/kueue_manager_test.py +27 -1
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +17 -15
- xpk/core/nodepool_test.py +25 -4
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +128 -132
- xpk/core/scheduling_test.py +215 -2
- xpk/core/system_characteristics.py +179 -0
- xpk/core/system_characteristics_test.py +49 -1
- xpk/core/telemetry.py +4 -4
- xpk/core/telemetry_test.py +9 -9
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +2 -0
- xpk/parser/cluster.py +22 -88
- xpk/parser/cluster_test.py +41 -0
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -41
- xpk/parser/workload_test.py +2 -48
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/utils/feature_flags.py +3 -0
- xpk/utils/validation.py +2 -2
- xpk-0.16.1.dist-info/METADATA +127 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/RECORD +67 -48
- xpk-0.15.0.dist-info/METADATA +0 -1666
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/WHEEL +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py
CHANGED
|
@@ -16,9 +16,7 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from ..core.blueprint.blueprint_generator import (
|
|
18
18
|
a3high_device_type,
|
|
19
|
-
|
|
20
|
-
a3ultra_device_type,
|
|
21
|
-
a4_device_type,
|
|
19
|
+
a4x_device_types,
|
|
22
20
|
)
|
|
23
21
|
from ..core.cluster import (
|
|
24
22
|
XPK_SA,
|
|
@@ -32,7 +30,7 @@ from ..core.docker_container import (
|
|
|
32
30
|
get_main_container_docker_image,
|
|
33
31
|
get_user_workload_container,
|
|
34
32
|
)
|
|
35
|
-
from ..core.kueue_manager import
|
|
33
|
+
from ..core.kueue_manager import LOCAL_QUEUE_NAME
|
|
36
34
|
from ..core.docker_resources import get_volumes, parse_env_config
|
|
37
35
|
from ..core.gcloud_context import add_zone_and_project
|
|
38
36
|
from ..core.monitoring import get_gke_outlier_dashboard
|
|
@@ -52,18 +50,19 @@ from ..core.pathways import (
|
|
|
52
50
|
get_user_workload_for_pathways,
|
|
53
51
|
try_to_delete_pathwaysjob_first,
|
|
54
52
|
)
|
|
55
|
-
from ..core.resources import get_cluster_capacity_type,
|
|
56
|
-
from ..core.resources import
|
|
53
|
+
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics_from_config_map
|
|
54
|
+
from ..core.resources import ConfigMapType, get_cluster_configmap
|
|
55
|
+
from ..core.nodepool import ensure_resource_policy_exists
|
|
57
56
|
from ..core.scheduling import (
|
|
57
|
+
WorkloadScheduling,
|
|
58
58
|
check_if_workload_can_schedule,
|
|
59
|
-
create_accelerator_label,
|
|
60
|
-
create_machine_label,
|
|
61
59
|
create_tpu_machine_type,
|
|
62
60
|
create_tpu_topology,
|
|
63
61
|
get_cpu_affinity,
|
|
64
62
|
get_gpu_scheduler,
|
|
65
63
|
create_sub_slicing_annotations,
|
|
66
64
|
create_placement_policy_label,
|
|
65
|
+
get_placement_policy_name,
|
|
67
66
|
is_placement_policy_supported,
|
|
68
67
|
)
|
|
69
68
|
from ..core.storage import (
|
|
@@ -78,10 +77,10 @@ from ..core.storage import (
|
|
|
78
77
|
get_storages_to_mount,
|
|
79
78
|
)
|
|
80
79
|
from ..core.system_characteristics import (
|
|
81
|
-
SUB_SLICING_TOPOLOGIES,
|
|
82
80
|
AcceleratorType,
|
|
81
|
+
create_accelerator_label,
|
|
82
|
+
create_machine_label,
|
|
83
83
|
get_system_characteristics,
|
|
84
|
-
compute_vms_per_slice,
|
|
85
84
|
)
|
|
86
85
|
from ..core.vertex import create_vertex_experiment
|
|
87
86
|
from ..core.workload import (
|
|
@@ -92,20 +91,16 @@ from ..core.workload import (
|
|
|
92
91
|
get_cluster_location,
|
|
93
92
|
)
|
|
94
93
|
from ..core.workload_decorators import (
|
|
95
|
-
rdma_decorator,
|
|
96
94
|
storage_decorator,
|
|
97
|
-
tcpx_decorator,
|
|
98
|
-
tcpxo_decorator,
|
|
99
95
|
)
|
|
100
96
|
from ..utils.console import ask_for_user_consent, xpk_exit, xpk_print
|
|
101
|
-
from packaging.version import Version
|
|
102
97
|
from ..utils.file import write_tmp_file
|
|
103
98
|
from ..utils.execution_context import is_dry_run
|
|
104
99
|
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
105
100
|
from . import cluster_gcluster
|
|
106
|
-
from .common import is_TAS_possible
|
|
107
|
-
from
|
|
108
|
-
from ..utils.
|
|
101
|
+
from .common import is_TAS_possible
|
|
102
|
+
from jinja2 import Environment, FileSystemLoader
|
|
103
|
+
from ..utils.templates import get_templates_absolute_path
|
|
109
104
|
|
|
110
105
|
WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
|
|
111
106
|
kind: JobSet
|
|
@@ -292,7 +287,7 @@ PW_WORKLOAD_CREATE_YAML = """
|
|
|
292
287
|
{user_workload}
|
|
293
288
|
"""
|
|
294
289
|
|
|
295
|
-
|
|
290
|
+
ARM_GPU_WORKLOAD_CREATE_JINJA_FILE = 'arm_gpu_workload_crate.yaml.j2'
|
|
296
291
|
|
|
297
292
|
|
|
298
293
|
def workload_create_pathways(args) -> None:
|
|
@@ -344,26 +339,35 @@ def workload_create(args) -> None:
|
|
|
344
339
|
)
|
|
345
340
|
xpk_exit(1)
|
|
346
341
|
|
|
347
|
-
|
|
348
|
-
if return_code > 0 or
|
|
342
|
+
workload_system, return_code = get_system_characteristics(args)
|
|
343
|
+
if return_code > 0 or workload_system is None:
|
|
349
344
|
xpk_print('Fetching system characteristics failed!')
|
|
350
345
|
xpk_exit(return_code)
|
|
351
346
|
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
347
|
+
resources_config_map = get_cluster_configmap(
|
|
348
|
+
args.cluster, ConfigMapType.RESOURCES
|
|
349
|
+
)
|
|
350
|
+
cluster_system = get_cluster_system_characteristics_from_config_map(
|
|
351
|
+
resources_config_map
|
|
352
|
+
)
|
|
353
|
+
workload_scheduling = check_if_workload_can_schedule(
|
|
354
|
+
args=args,
|
|
355
|
+
workload_system=workload_system,
|
|
356
|
+
cluster_system=cluster_system,
|
|
357
|
+
resources_config_map=resources_config_map,
|
|
358
|
+
)
|
|
359
|
+
if workload_scheduling == WorkloadScheduling.UNAVAILABLE:
|
|
357
360
|
xpk_exit(1)
|
|
358
361
|
|
|
359
362
|
xpk_print('Starting workload create', flush=True)
|
|
360
363
|
|
|
361
|
-
|
|
362
|
-
|
|
364
|
+
cluster_config_map = get_cluster_configmap(
|
|
365
|
+
args.cluster, ConfigMapType.METADATA
|
|
366
|
+
)
|
|
363
367
|
cluster_xpk_version = None
|
|
364
368
|
if cluster_config_map is None:
|
|
365
369
|
xpk_print(
|
|
366
|
-
|
|
370
|
+
'Warning: Unable to find ConfigMap for the'
|
|
367
371
|
' cluster. We recommend to upgrade your cluster by running `xpk'
|
|
368
372
|
' cluster create`.'
|
|
369
373
|
)
|
|
@@ -395,7 +399,7 @@ def workload_create(args) -> None:
|
|
|
395
399
|
|
|
396
400
|
autoprovisioning_args = ''
|
|
397
401
|
autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
|
|
398
|
-
args,
|
|
402
|
+
args, workload_system
|
|
399
403
|
)
|
|
400
404
|
if return_code != 0:
|
|
401
405
|
xpk_exit(return_code)
|
|
@@ -488,28 +492,36 @@ def workload_create(args) -> None:
|
|
|
488
492
|
podFailurePolicy:
|
|
489
493
|
rules:
|
|
490
494
|
- action: FailJob
|
|
495
|
+
onPodConditions: []
|
|
491
496
|
onExitCodes:
|
|
492
|
-
containerName: {get_main_container_docker_image(args,
|
|
497
|
+
containerName: {get_main_container_docker_image(args, workload_system)}
|
|
493
498
|
operator: NotIn
|
|
494
499
|
values: [{restart_on_exit_codes}]"""
|
|
495
500
|
|
|
501
|
+
if is_placement_policy_supported(workload_system):
|
|
502
|
+
ensure_resource_policy_exists(
|
|
503
|
+
resource_policy_name=get_placement_policy_name(workload_system),
|
|
504
|
+
project=args.project,
|
|
505
|
+
zone=args.zone,
|
|
506
|
+
topology=workload_system.topology,
|
|
507
|
+
)
|
|
508
|
+
|
|
496
509
|
placement_policy_label = (
|
|
497
|
-
create_placement_policy_label(
|
|
498
|
-
if is_placement_policy_supported(
|
|
510
|
+
create_placement_policy_label(workload_system)
|
|
511
|
+
if is_placement_policy_supported(workload_system)
|
|
499
512
|
else ''
|
|
500
513
|
)
|
|
501
514
|
|
|
502
515
|
# Create the workload file based on accelerator type or workload type.
|
|
503
|
-
if
|
|
516
|
+
if workload_system.accelerator_type == AcceleratorType.GPU:
|
|
504
517
|
container, debugging_dashboard_id = get_user_workload_container(
|
|
505
|
-
args,
|
|
518
|
+
args, workload_system
|
|
506
519
|
)
|
|
507
520
|
gpu_scheduler, return_code = get_gpu_scheduler(
|
|
508
|
-
args,
|
|
521
|
+
args, workload_system, autoprovisioning_args
|
|
509
522
|
)
|
|
510
523
|
if return_code != 0:
|
|
511
524
|
xpk_exit(return_code)
|
|
512
|
-
system_characteristics = get_cluster_system_characteristics(args)
|
|
513
525
|
capacity_type = get_cluster_capacity_type(args)
|
|
514
526
|
|
|
515
527
|
annotations = (
|
|
@@ -517,31 +529,55 @@ def workload_create(args) -> None:
|
|
|
517
529
|
'kueue.x-k8s.io/podset-preferred-topology:'
|
|
518
530
|
' "cloud.google.com/gce-topology-host"'
|
|
519
531
|
)
|
|
520
|
-
if is_TAS_possible(
|
|
532
|
+
if is_TAS_possible(cluster_system, capacity_type)
|
|
521
533
|
else ''
|
|
522
534
|
)
|
|
523
535
|
|
|
524
536
|
if (
|
|
525
|
-
|
|
526
|
-
or
|
|
537
|
+
workload_system.device_type in cluster_gcluster.supported_device_types
|
|
538
|
+
or workload_system.device_type == a3high_device_type
|
|
539
|
+
or workload_system.device_type in a4x_device_types
|
|
527
540
|
):
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
541
|
+
if workload_system.device_type in a4x_device_types:
|
|
542
|
+
template_env = Environment(
|
|
543
|
+
loader=FileSystemLoader(searchpath=get_templates_absolute_path())
|
|
544
|
+
)
|
|
545
|
+
workload_create_yaml = template_env.get_template(
|
|
546
|
+
ARM_GPU_WORKLOAD_CREATE_JINJA_FILE
|
|
547
|
+
)
|
|
548
|
+
yml_string = workload_create_yaml.render(
|
|
549
|
+
workload=args.workload,
|
|
550
|
+
num_nodes=args.num_nodes,
|
|
551
|
+
ttl_seconds_after_finished=args.ttl_seconds_after_finished,
|
|
552
|
+
max_restarts=args.max_restarts,
|
|
553
|
+
priority=args.priority,
|
|
554
|
+
termination_grace_period_seconds=args.termination_grace_period_seconds,
|
|
555
|
+
docker_image_pull_secret=args.docker_image_pull_secret,
|
|
556
|
+
container=container,
|
|
557
|
+
service_account=XPK_SA,
|
|
558
|
+
failure_policy_rules=failure_policy_rules,
|
|
559
|
+
pod_failure_policy=pod_failure_policy,
|
|
560
|
+
annotations=annotations,
|
|
561
|
+
placement_policy_label=placement_policy_label,
|
|
562
|
+
)
|
|
563
|
+
else:
|
|
564
|
+
yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
|
|
565
|
+
args=args,
|
|
566
|
+
container=container,
|
|
567
|
+
service_account=XPK_SA,
|
|
568
|
+
failure_policy_rules=failure_policy_rules,
|
|
569
|
+
pod_failure_policy=pod_failure_policy,
|
|
570
|
+
annotations=annotations,
|
|
571
|
+
placement_policy_label=placement_policy_label,
|
|
572
|
+
)
|
|
537
573
|
|
|
538
574
|
sub_networks = get_cluster_subnetworks()
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
yml_string =
|
|
575
|
+
|
|
576
|
+
if workload_system.gpu_config and callable(
|
|
577
|
+
workload_system.gpu_config.jobset_decorator_fn
|
|
578
|
+
):
|
|
579
|
+
decorator_fn = workload_system.gpu_config.jobset_decorator_fn
|
|
580
|
+
yml_string = decorator_fn(yml_string, sub_networks)
|
|
545
581
|
|
|
546
582
|
if all_storages:
|
|
547
583
|
yml_string = storage_decorator.decorate_jobset(yml_string, all_storages)
|
|
@@ -550,7 +586,7 @@ def workload_create(args) -> None:
|
|
|
550
586
|
args=args,
|
|
551
587
|
container=container,
|
|
552
588
|
gpu_scheduler=gpu_scheduler,
|
|
553
|
-
volumes=get_volumes(args,
|
|
589
|
+
volumes=get_volumes(args, workload_system),
|
|
554
590
|
storage_annotations=('\n' + (' ' * 12)).join(
|
|
555
591
|
get_storage_annotations(all_storages)
|
|
556
592
|
),
|
|
@@ -561,53 +597,53 @@ def workload_create(args) -> None:
|
|
|
561
597
|
)
|
|
562
598
|
|
|
563
599
|
elif args.use_pathways and ensure_pathways_workload_prerequisites(
|
|
564
|
-
args,
|
|
600
|
+
args, workload_system
|
|
565
601
|
):
|
|
566
602
|
yml_string = PW_WORKLOAD_CREATE_YAML.format(
|
|
567
603
|
args=args,
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
machine_type=create_tpu_machine_type(system.accelerator_type, system),
|
|
604
|
+
topology=create_tpu_topology(workload_system),
|
|
605
|
+
machine_type=create_tpu_machine_type(workload_system),
|
|
571
606
|
custom_pathways_proxy_server=append_custom_pathways_proxy_server(args),
|
|
572
607
|
custom_pathways_server=append_custom_pathways_server(args),
|
|
573
608
|
custom_pathways_worker=append_custom_pathways_worker(args),
|
|
574
609
|
colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
|
|
575
|
-
user_workload=get_user_workload_for_pathways(args,
|
|
610
|
+
user_workload=get_user_workload_for_pathways(args, workload_system),
|
|
576
611
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
577
612
|
autoprovisioning_args=autoprovisioning_args,
|
|
578
613
|
placement_policy_label=placement_policy_label,
|
|
579
614
|
)
|
|
580
615
|
else:
|
|
616
|
+
use_sub_slicing = (
|
|
617
|
+
workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
|
|
618
|
+
)
|
|
619
|
+
if use_sub_slicing:
|
|
620
|
+
xpk_print('Workload will be scheduled using the Sub-slicing feature.')
|
|
621
|
+
|
|
581
622
|
container, debugging_dashboard_id = get_user_workload_container(
|
|
582
|
-
args,
|
|
623
|
+
args, workload_system
|
|
583
624
|
)
|
|
584
625
|
yml_string = WORKLOAD_CREATE_YAML.format(
|
|
585
626
|
args=args,
|
|
586
627
|
container=container,
|
|
587
|
-
vms_per_slice=
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
and FeatureFlags.SUB_SLICING_ENABLED
|
|
591
|
-
and args.sub_slicing_topology is not None
|
|
592
|
-
else system.vms_per_slice
|
|
593
|
-
),
|
|
594
|
-
affinity=get_cpu_affinity(system.accelerator_type),
|
|
595
|
-
accelerator_label=create_accelerator_label(
|
|
596
|
-
system.accelerator_type, system
|
|
597
|
-
),
|
|
628
|
+
vms_per_slice=workload_system.vms_per_slice,
|
|
629
|
+
affinity=get_cpu_affinity(workload_system.accelerator_type),
|
|
630
|
+
accelerator_label=create_accelerator_label(workload_system),
|
|
598
631
|
sub_slicing_annotations=(
|
|
599
|
-
''
|
|
600
|
-
|
|
601
|
-
or args.sub_slicing_topology is None
|
|
602
|
-
else ('\n' + (' ' * 16)).join(
|
|
603
|
-
create_sub_slicing_annotations(args.sub_slicing_topology)
|
|
632
|
+
('\n' + (' ' * 16)).join(
|
|
633
|
+
create_sub_slicing_annotations(workload_system.topology)
|
|
604
634
|
)
|
|
635
|
+
if use_sub_slicing
|
|
636
|
+
else ''
|
|
605
637
|
),
|
|
606
638
|
placement_policy_label=placement_policy_label,
|
|
607
|
-
machine_label=
|
|
639
|
+
machine_label=(
|
|
640
|
+
create_machine_label(cluster_system)
|
|
641
|
+
if use_sub_slicing and cluster_system
|
|
642
|
+
else create_machine_label(workload_system)
|
|
643
|
+
),
|
|
608
644
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
609
645
|
autoprovisioning_args=autoprovisioning_args,
|
|
610
|
-
volumes=get_volumes(args,
|
|
646
|
+
volumes=get_volumes(args, workload_system),
|
|
611
647
|
storage_annotations=('\n' + (' ' * 16)).join(
|
|
612
648
|
get_storage_annotations(all_storages)
|
|
613
649
|
),
|
|
@@ -615,10 +651,18 @@ def workload_create(args) -> None:
|
|
|
615
651
|
tpu_toleration="""
|
|
616
652
|
- operator: "Exists"
|
|
617
653
|
key: google.com/tpu
|
|
618
|
-
""" if
|
|
654
|
+
""" if workload_system.accelerator_type == AcceleratorType.TPU else '',
|
|
619
655
|
failure_policy_rules=failure_policy_rules,
|
|
620
656
|
pod_failure_policy=pod_failure_policy,
|
|
621
657
|
)
|
|
658
|
+
if args.output_manifest_file:
|
|
659
|
+
with open(args.output_manifest_file, 'w', encoding='utf-8') as f:
|
|
660
|
+
f.write(yml_string)
|
|
661
|
+
xpk_print(
|
|
662
|
+
f'Workload {args.workload} manifest written to'
|
|
663
|
+
f' {args.output_manifest_file}'
|
|
664
|
+
)
|
|
665
|
+
|
|
622
666
|
tmp = write_tmp_file(yml_string)
|
|
623
667
|
command = f'kubectl apply -f {str(tmp)}'
|
|
624
668
|
return_code = run_command_with_updates(command, 'Creating Workload')
|
|
@@ -632,7 +676,7 @@ def workload_create(args) -> None:
|
|
|
632
676
|
|
|
633
677
|
# Get GKE outlier dashboard for TPU
|
|
634
678
|
outlier_dashboard_id = None
|
|
635
|
-
if
|
|
679
|
+
if workload_system.accelerator_type == AcceleratorType.TPU:
|
|
636
680
|
outlier_dashboard_id = get_gke_outlier_dashboard(args)
|
|
637
681
|
|
|
638
682
|
# Outlier and debugging dashboards
|
|
@@ -699,64 +743,6 @@ def workload_create(args) -> None:
|
|
|
699
743
|
xpk_exit(0)
|
|
700
744
|
|
|
701
745
|
|
|
702
|
-
def _validate_sub_slicing_availability():
|
|
703
|
-
return_code, sub_slicing_enabled = has_sub_slicing_enabled()
|
|
704
|
-
if return_code != 0:
|
|
705
|
-
xpk_print(
|
|
706
|
-
'Error: Unable to validate sub-slicing support on a given cluster.'
|
|
707
|
-
)
|
|
708
|
-
xpk_exit(1)
|
|
709
|
-
|
|
710
|
-
if not sub_slicing_enabled:
|
|
711
|
-
xpk_print(
|
|
712
|
-
'Error: Cluster has not been not set up for Sub-slicing. Please enable'
|
|
713
|
-
' --sub-slicing in "cluster create" command first.'
|
|
714
|
-
)
|
|
715
|
-
xpk_exit(1)
|
|
716
|
-
|
|
717
|
-
return_code, current_version = get_installed_kueue_version(
|
|
718
|
-
dry_run_version=Version('0.13')
|
|
719
|
-
)
|
|
720
|
-
if return_code != 0 or not current_version:
|
|
721
|
-
xpk_print(
|
|
722
|
-
'Error: Unable to validate sub-slicing support on a given cluster.'
|
|
723
|
-
)
|
|
724
|
-
xpk_exit(1)
|
|
725
|
-
|
|
726
|
-
if current_version < SUB_SLICING_MINIMUM_KUEUE_VERSION:
|
|
727
|
-
xpk_print(
|
|
728
|
-
"Error: Current Kueue version ({current_version}) doesn't support"
|
|
729
|
-
' Sub-slicing. The minimal required version is'
|
|
730
|
-
' v{SUB_SLICING_MINIMUM_KUEUE_VERSION}. Please either update Kueue'
|
|
731
|
-
' manually, or run "cluster create --sub-slicing" on the existing'
|
|
732
|
-
' cluster.'
|
|
733
|
-
)
|
|
734
|
-
xpk_exit(1)
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
def _validate_sub_slicing_topology(
|
|
738
|
-
system_characteristics: SystemCharacteristics, sub_slicing_topology: str
|
|
739
|
-
) -> None:
|
|
740
|
-
if sub_slicing_topology not in SUB_SLICING_TOPOLOGIES:
|
|
741
|
-
xpk_print(
|
|
742
|
-
f'Error: --sub-slicing-topology={sub_slicing_topology} shape is'
|
|
743
|
-
f' invalid. It has to be one of: {", ".join(SUB_SLICING_TOPOLOGIES)}.'
|
|
744
|
-
)
|
|
745
|
-
xpk_exit(1)
|
|
746
|
-
|
|
747
|
-
if not is_topology_contained(
|
|
748
|
-
contained=sub_slicing_topology, container=system_characteristics.topology
|
|
749
|
-
):
|
|
750
|
-
xpk_print(
|
|
751
|
-
f'Error: --sub-slicing-topology={sub_slicing_topology} shape is too'
|
|
752
|
-
' large. The shape cannot be bigger than'
|
|
753
|
-
f' {system_characteristics.topology}.'
|
|
754
|
-
)
|
|
755
|
-
xpk_exit(1)
|
|
756
|
-
|
|
757
|
-
validate_sub_slicing_system(system_characteristics)
|
|
758
|
-
|
|
759
|
-
|
|
760
746
|
def get_restart_exit_codes(args) -> list:
|
|
761
747
|
exit_codes = [42]
|
|
762
748
|
exit_codes.extend(range(127, 256, 1))
|