xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +33 -12
- xpk/commands/cluster_gcluster_test.py +5 -1
- xpk/commands/cluster_test.py +125 -0
- xpk/commands/config.py +3 -3
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +2 -0
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/workload.py +124 -139
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +3 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +2 -0
- xpk/core/cluster.py +18 -47
- xpk/core/cluster_test.py +76 -1
- xpk/core/config.py +81 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/kjob.py +17 -16
- xpk/core/kueue_manager.py +13 -19
- xpk/core/kueue_manager_test.py +27 -1
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +17 -15
- xpk/core/nodepool_test.py +25 -4
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +128 -132
- xpk/core/scheduling_test.py +215 -2
- xpk/core/system_characteristics.py +179 -0
- xpk/core/system_characteristics_test.py +49 -1
- xpk/core/telemetry.py +4 -4
- xpk/core/telemetry_test.py +9 -9
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +2 -0
- xpk/parser/cluster.py +22 -88
- xpk/parser/cluster_test.py +41 -0
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -41
- xpk/parser/workload_test.py +2 -48
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/utils/feature_flags.py +3 -0
- xpk/utils/validation.py +2 -2
- xpk-0.16.0.dist-info/METADATA +127 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
- xpk-0.15.0.dist-info/METADATA +0 -1666
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py
CHANGED
|
@@ -16,9 +16,7 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from ..core.blueprint.blueprint_generator import (
|
|
18
18
|
a3high_device_type,
|
|
19
|
-
|
|
20
|
-
a3ultra_device_type,
|
|
21
|
-
a4_device_type,
|
|
19
|
+
a4x_device_types,
|
|
22
20
|
)
|
|
23
21
|
from ..core.cluster import (
|
|
24
22
|
XPK_SA,
|
|
@@ -32,7 +30,7 @@ from ..core.docker_container import (
|
|
|
32
30
|
get_main_container_docker_image,
|
|
33
31
|
get_user_workload_container,
|
|
34
32
|
)
|
|
35
|
-
from ..core.kueue_manager import
|
|
33
|
+
from ..core.kueue_manager import LOCAL_QUEUE_NAME
|
|
36
34
|
from ..core.docker_resources import get_volumes, parse_env_config
|
|
37
35
|
from ..core.gcloud_context import add_zone_and_project
|
|
38
36
|
from ..core.monitoring import get_gke_outlier_dashboard
|
|
@@ -52,18 +50,19 @@ from ..core.pathways import (
|
|
|
52
50
|
get_user_workload_for_pathways,
|
|
53
51
|
try_to_delete_pathwaysjob_first,
|
|
54
52
|
)
|
|
55
|
-
from ..core.resources import get_cluster_capacity_type,
|
|
56
|
-
from ..core.resources import
|
|
53
|
+
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics_from_config_map
|
|
54
|
+
from ..core.resources import ConfigMapType, get_cluster_configmap
|
|
55
|
+
from ..core.nodepool import ensure_resource_policy_exists
|
|
57
56
|
from ..core.scheduling import (
|
|
57
|
+
WorkloadScheduling,
|
|
58
58
|
check_if_workload_can_schedule,
|
|
59
|
-
create_accelerator_label,
|
|
60
|
-
create_machine_label,
|
|
61
59
|
create_tpu_machine_type,
|
|
62
60
|
create_tpu_topology,
|
|
63
61
|
get_cpu_affinity,
|
|
64
62
|
get_gpu_scheduler,
|
|
65
63
|
create_sub_slicing_annotations,
|
|
66
64
|
create_placement_policy_label,
|
|
65
|
+
get_placement_policy_name,
|
|
67
66
|
is_placement_policy_supported,
|
|
68
67
|
)
|
|
69
68
|
from ..core.storage import (
|
|
@@ -78,10 +77,10 @@ from ..core.storage import (
|
|
|
78
77
|
get_storages_to_mount,
|
|
79
78
|
)
|
|
80
79
|
from ..core.system_characteristics import (
|
|
81
|
-
SUB_SLICING_TOPOLOGIES,
|
|
82
80
|
AcceleratorType,
|
|
81
|
+
create_accelerator_label,
|
|
82
|
+
create_machine_label,
|
|
83
83
|
get_system_characteristics,
|
|
84
|
-
compute_vms_per_slice,
|
|
85
84
|
)
|
|
86
85
|
from ..core.vertex import create_vertex_experiment
|
|
87
86
|
from ..core.workload import (
|
|
@@ -92,20 +91,16 @@ from ..core.workload import (
|
|
|
92
91
|
get_cluster_location,
|
|
93
92
|
)
|
|
94
93
|
from ..core.workload_decorators import (
|
|
95
|
-
rdma_decorator,
|
|
96
94
|
storage_decorator,
|
|
97
|
-
tcpx_decorator,
|
|
98
|
-
tcpxo_decorator,
|
|
99
95
|
)
|
|
100
96
|
from ..utils.console import ask_for_user_consent, xpk_exit, xpk_print
|
|
101
|
-
from packaging.version import Version
|
|
102
97
|
from ..utils.file import write_tmp_file
|
|
103
98
|
from ..utils.execution_context import is_dry_run
|
|
104
99
|
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
105
100
|
from . import cluster_gcluster
|
|
106
|
-
from .common import is_TAS_possible
|
|
107
|
-
from
|
|
108
|
-
from ..utils.
|
|
101
|
+
from .common import is_TAS_possible
|
|
102
|
+
from jinja2 import Environment, FileSystemLoader
|
|
103
|
+
from ..utils.templates import get_templates_absolute_path
|
|
109
104
|
|
|
110
105
|
WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
|
|
111
106
|
kind: JobSet
|
|
@@ -292,7 +287,7 @@ PW_WORKLOAD_CREATE_YAML = """
|
|
|
292
287
|
{user_workload}
|
|
293
288
|
"""
|
|
294
289
|
|
|
295
|
-
|
|
290
|
+
ARM_GPU_WORKLOAD_CREATE_JINJA_FILE = 'arm_gpu_workload_crate.yaml.j2'
|
|
296
291
|
|
|
297
292
|
|
|
298
293
|
def workload_create_pathways(args) -> None:
|
|
@@ -344,26 +339,35 @@ def workload_create(args) -> None:
|
|
|
344
339
|
)
|
|
345
340
|
xpk_exit(1)
|
|
346
341
|
|
|
347
|
-
|
|
348
|
-
if return_code > 0 or
|
|
342
|
+
workload_system, return_code = get_system_characteristics(args)
|
|
343
|
+
if return_code > 0 or workload_system is None:
|
|
349
344
|
xpk_print('Fetching system characteristics failed!')
|
|
350
345
|
xpk_exit(return_code)
|
|
351
346
|
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
347
|
+
resources_config_map = get_cluster_configmap(
|
|
348
|
+
args.cluster, ConfigMapType.RESOURCES
|
|
349
|
+
)
|
|
350
|
+
cluster_system = get_cluster_system_characteristics_from_config_map(
|
|
351
|
+
resources_config_map
|
|
352
|
+
)
|
|
353
|
+
workload_scheduling = check_if_workload_can_schedule(
|
|
354
|
+
args=args,
|
|
355
|
+
workload_system=workload_system,
|
|
356
|
+
cluster_system=cluster_system,
|
|
357
|
+
resources_config_map=resources_config_map,
|
|
358
|
+
)
|
|
359
|
+
if workload_scheduling == WorkloadScheduling.UNAVAILABLE:
|
|
357
360
|
xpk_exit(1)
|
|
358
361
|
|
|
359
362
|
xpk_print('Starting workload create', flush=True)
|
|
360
363
|
|
|
361
|
-
|
|
362
|
-
|
|
364
|
+
cluster_config_map = get_cluster_configmap(
|
|
365
|
+
args.cluster, ConfigMapType.METADATA
|
|
366
|
+
)
|
|
363
367
|
cluster_xpk_version = None
|
|
364
368
|
if cluster_config_map is None:
|
|
365
369
|
xpk_print(
|
|
366
|
-
|
|
370
|
+
'Warning: Unable to find ConfigMap for the'
|
|
367
371
|
' cluster. We recommend to upgrade your cluster by running `xpk'
|
|
368
372
|
' cluster create`.'
|
|
369
373
|
)
|
|
@@ -395,7 +399,7 @@ def workload_create(args) -> None:
|
|
|
395
399
|
|
|
396
400
|
autoprovisioning_args = ''
|
|
397
401
|
autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
|
|
398
|
-
args,
|
|
402
|
+
args, workload_system
|
|
399
403
|
)
|
|
400
404
|
if return_code != 0:
|
|
401
405
|
xpk_exit(return_code)
|
|
@@ -489,27 +493,34 @@ def workload_create(args) -> None:
|
|
|
489
493
|
rules:
|
|
490
494
|
- action: FailJob
|
|
491
495
|
onExitCodes:
|
|
492
|
-
containerName: {get_main_container_docker_image(args,
|
|
496
|
+
containerName: {get_main_container_docker_image(args, workload_system)}
|
|
493
497
|
operator: NotIn
|
|
494
498
|
values: [{restart_on_exit_codes}]"""
|
|
495
499
|
|
|
500
|
+
if is_placement_policy_supported(workload_system):
|
|
501
|
+
ensure_resource_policy_exists(
|
|
502
|
+
resource_policy_name=get_placement_policy_name(workload_system),
|
|
503
|
+
project=args.project,
|
|
504
|
+
zone=args.zone,
|
|
505
|
+
topology=workload_system.topology,
|
|
506
|
+
)
|
|
507
|
+
|
|
496
508
|
placement_policy_label = (
|
|
497
|
-
create_placement_policy_label(
|
|
498
|
-
if is_placement_policy_supported(
|
|
509
|
+
create_placement_policy_label(workload_system)
|
|
510
|
+
if is_placement_policy_supported(workload_system)
|
|
499
511
|
else ''
|
|
500
512
|
)
|
|
501
513
|
|
|
502
514
|
# Create the workload file based on accelerator type or workload type.
|
|
503
|
-
if
|
|
515
|
+
if workload_system.accelerator_type == AcceleratorType.GPU:
|
|
504
516
|
container, debugging_dashboard_id = get_user_workload_container(
|
|
505
|
-
args,
|
|
517
|
+
args, workload_system
|
|
506
518
|
)
|
|
507
519
|
gpu_scheduler, return_code = get_gpu_scheduler(
|
|
508
|
-
args,
|
|
520
|
+
args, workload_system, autoprovisioning_args
|
|
509
521
|
)
|
|
510
522
|
if return_code != 0:
|
|
511
523
|
xpk_exit(return_code)
|
|
512
|
-
system_characteristics = get_cluster_system_characteristics(args)
|
|
513
524
|
capacity_type = get_cluster_capacity_type(args)
|
|
514
525
|
|
|
515
526
|
annotations = (
|
|
@@ -517,31 +528,55 @@ def workload_create(args) -> None:
|
|
|
517
528
|
'kueue.x-k8s.io/podset-preferred-topology:'
|
|
518
529
|
' "cloud.google.com/gce-topology-host"'
|
|
519
530
|
)
|
|
520
|
-
if is_TAS_possible(
|
|
531
|
+
if is_TAS_possible(cluster_system, capacity_type)
|
|
521
532
|
else ''
|
|
522
533
|
)
|
|
523
534
|
|
|
524
535
|
if (
|
|
525
|
-
|
|
526
|
-
or
|
|
536
|
+
workload_system.device_type in cluster_gcluster.supported_device_types
|
|
537
|
+
or workload_system.device_type == a3high_device_type
|
|
538
|
+
or workload_system.device_type in a4x_device_types
|
|
527
539
|
):
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
540
|
+
if workload_system.device_type in a4x_device_types:
|
|
541
|
+
template_env = Environment(
|
|
542
|
+
loader=FileSystemLoader(searchpath=get_templates_absolute_path())
|
|
543
|
+
)
|
|
544
|
+
workload_create_yaml = template_env.get_template(
|
|
545
|
+
ARM_GPU_WORKLOAD_CREATE_JINJA_FILE
|
|
546
|
+
)
|
|
547
|
+
yml_string = workload_create_yaml.render(
|
|
548
|
+
workload=args.workload,
|
|
549
|
+
num_nodes=args.num_nodes,
|
|
550
|
+
ttl_seconds_after_finished=args.ttl_seconds_after_finished,
|
|
551
|
+
max_restarts=args.max_restarts,
|
|
552
|
+
priority=args.priority,
|
|
553
|
+
termination_grace_period_seconds=args.termination_grace_period_seconds,
|
|
554
|
+
docker_image_pull_secret=args.docker_image_pull_secret,
|
|
555
|
+
container=container,
|
|
556
|
+
service_account=XPK_SA,
|
|
557
|
+
failure_policy_rules=failure_policy_rules,
|
|
558
|
+
pod_failure_policy=pod_failure_policy,
|
|
559
|
+
annotations=annotations,
|
|
560
|
+
placement_policy_label=placement_policy_label,
|
|
561
|
+
)
|
|
562
|
+
else:
|
|
563
|
+
yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
|
|
564
|
+
args=args,
|
|
565
|
+
container=container,
|
|
566
|
+
service_account=XPK_SA,
|
|
567
|
+
failure_policy_rules=failure_policy_rules,
|
|
568
|
+
pod_failure_policy=pod_failure_policy,
|
|
569
|
+
annotations=annotations,
|
|
570
|
+
placement_policy_label=placement_policy_label,
|
|
571
|
+
)
|
|
537
572
|
|
|
538
573
|
sub_networks = get_cluster_subnetworks()
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
yml_string =
|
|
574
|
+
|
|
575
|
+
if workload_system.gpu_config and callable(
|
|
576
|
+
workload_system.gpu_config.jobset_decorator_fn
|
|
577
|
+
):
|
|
578
|
+
decorator_fn = workload_system.gpu_config.jobset_decorator_fn
|
|
579
|
+
yml_string = decorator_fn(yml_string, sub_networks)
|
|
545
580
|
|
|
546
581
|
if all_storages:
|
|
547
582
|
yml_string = storage_decorator.decorate_jobset(yml_string, all_storages)
|
|
@@ -550,7 +585,7 @@ def workload_create(args) -> None:
|
|
|
550
585
|
args=args,
|
|
551
586
|
container=container,
|
|
552
587
|
gpu_scheduler=gpu_scheduler,
|
|
553
|
-
volumes=get_volumes(args,
|
|
588
|
+
volumes=get_volumes(args, workload_system),
|
|
554
589
|
storage_annotations=('\n' + (' ' * 12)).join(
|
|
555
590
|
get_storage_annotations(all_storages)
|
|
556
591
|
),
|
|
@@ -561,53 +596,53 @@ def workload_create(args) -> None:
|
|
|
561
596
|
)
|
|
562
597
|
|
|
563
598
|
elif args.use_pathways and ensure_pathways_workload_prerequisites(
|
|
564
|
-
args,
|
|
599
|
+
args, workload_system
|
|
565
600
|
):
|
|
566
601
|
yml_string = PW_WORKLOAD_CREATE_YAML.format(
|
|
567
602
|
args=args,
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
machine_type=create_tpu_machine_type(system.accelerator_type, system),
|
|
603
|
+
topology=create_tpu_topology(workload_system),
|
|
604
|
+
machine_type=create_tpu_machine_type(workload_system),
|
|
571
605
|
custom_pathways_proxy_server=append_custom_pathways_proxy_server(args),
|
|
572
606
|
custom_pathways_server=append_custom_pathways_server(args),
|
|
573
607
|
custom_pathways_worker=append_custom_pathways_worker(args),
|
|
574
608
|
colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
|
|
575
|
-
user_workload=get_user_workload_for_pathways(args,
|
|
609
|
+
user_workload=get_user_workload_for_pathways(args, workload_system),
|
|
576
610
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
577
611
|
autoprovisioning_args=autoprovisioning_args,
|
|
578
612
|
placement_policy_label=placement_policy_label,
|
|
579
613
|
)
|
|
580
614
|
else:
|
|
615
|
+
use_sub_slicing = (
|
|
616
|
+
workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
|
|
617
|
+
)
|
|
618
|
+
if use_sub_slicing:
|
|
619
|
+
xpk_print('Workload will be scheduled using the Sub-slicing feature.')
|
|
620
|
+
|
|
581
621
|
container, debugging_dashboard_id = get_user_workload_container(
|
|
582
|
-
args,
|
|
622
|
+
args, workload_system
|
|
583
623
|
)
|
|
584
624
|
yml_string = WORKLOAD_CREATE_YAML.format(
|
|
585
625
|
args=args,
|
|
586
626
|
container=container,
|
|
587
|
-
vms_per_slice=
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
and FeatureFlags.SUB_SLICING_ENABLED
|
|
591
|
-
and args.sub_slicing_topology is not None
|
|
592
|
-
else system.vms_per_slice
|
|
593
|
-
),
|
|
594
|
-
affinity=get_cpu_affinity(system.accelerator_type),
|
|
595
|
-
accelerator_label=create_accelerator_label(
|
|
596
|
-
system.accelerator_type, system
|
|
597
|
-
),
|
|
627
|
+
vms_per_slice=workload_system.vms_per_slice,
|
|
628
|
+
affinity=get_cpu_affinity(workload_system.accelerator_type),
|
|
629
|
+
accelerator_label=create_accelerator_label(workload_system),
|
|
598
630
|
sub_slicing_annotations=(
|
|
599
|
-
''
|
|
600
|
-
|
|
601
|
-
or args.sub_slicing_topology is None
|
|
602
|
-
else ('\n' + (' ' * 16)).join(
|
|
603
|
-
create_sub_slicing_annotations(args.sub_slicing_topology)
|
|
631
|
+
('\n' + (' ' * 16)).join(
|
|
632
|
+
create_sub_slicing_annotations(workload_system.topology)
|
|
604
633
|
)
|
|
634
|
+
if use_sub_slicing
|
|
635
|
+
else ''
|
|
605
636
|
),
|
|
606
637
|
placement_policy_label=placement_policy_label,
|
|
607
|
-
machine_label=
|
|
638
|
+
machine_label=(
|
|
639
|
+
create_machine_label(cluster_system)
|
|
640
|
+
if use_sub_slicing and cluster_system
|
|
641
|
+
else create_machine_label(workload_system)
|
|
642
|
+
),
|
|
608
643
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
609
644
|
autoprovisioning_args=autoprovisioning_args,
|
|
610
|
-
volumes=get_volumes(args,
|
|
645
|
+
volumes=get_volumes(args, workload_system),
|
|
611
646
|
storage_annotations=('\n' + (' ' * 16)).join(
|
|
612
647
|
get_storage_annotations(all_storages)
|
|
613
648
|
),
|
|
@@ -615,10 +650,18 @@ def workload_create(args) -> None:
|
|
|
615
650
|
tpu_toleration="""
|
|
616
651
|
- operator: "Exists"
|
|
617
652
|
key: google.com/tpu
|
|
618
|
-
""" if
|
|
653
|
+
""" if workload_system.accelerator_type == AcceleratorType.TPU else '',
|
|
619
654
|
failure_policy_rules=failure_policy_rules,
|
|
620
655
|
pod_failure_policy=pod_failure_policy,
|
|
621
656
|
)
|
|
657
|
+
if args.output_manifest_file:
|
|
658
|
+
with open(args.output_manifest_file, 'w', encoding='utf-8') as f:
|
|
659
|
+
f.write(yml_string)
|
|
660
|
+
xpk_print(
|
|
661
|
+
f'Workload {args.workload} manifest written to'
|
|
662
|
+
f' {args.output_manifest_file}'
|
|
663
|
+
)
|
|
664
|
+
|
|
622
665
|
tmp = write_tmp_file(yml_string)
|
|
623
666
|
command = f'kubectl apply -f {str(tmp)}'
|
|
624
667
|
return_code = run_command_with_updates(command, 'Creating Workload')
|
|
@@ -632,7 +675,7 @@ def workload_create(args) -> None:
|
|
|
632
675
|
|
|
633
676
|
# Get GKE outlier dashboard for TPU
|
|
634
677
|
outlier_dashboard_id = None
|
|
635
|
-
if
|
|
678
|
+
if workload_system.accelerator_type == AcceleratorType.TPU:
|
|
636
679
|
outlier_dashboard_id = get_gke_outlier_dashboard(args)
|
|
637
680
|
|
|
638
681
|
# Outlier and debugging dashboards
|
|
@@ -699,64 +742,6 @@ def workload_create(args) -> None:
|
|
|
699
742
|
xpk_exit(0)
|
|
700
743
|
|
|
701
744
|
|
|
702
|
-
def _validate_sub_slicing_availability():
|
|
703
|
-
return_code, sub_slicing_enabled = has_sub_slicing_enabled()
|
|
704
|
-
if return_code != 0:
|
|
705
|
-
xpk_print(
|
|
706
|
-
'Error: Unable to validate sub-slicing support on a given cluster.'
|
|
707
|
-
)
|
|
708
|
-
xpk_exit(1)
|
|
709
|
-
|
|
710
|
-
if not sub_slicing_enabled:
|
|
711
|
-
xpk_print(
|
|
712
|
-
'Error: Cluster has not been not set up for Sub-slicing. Please enable'
|
|
713
|
-
' --sub-slicing in "cluster create" command first.'
|
|
714
|
-
)
|
|
715
|
-
xpk_exit(1)
|
|
716
|
-
|
|
717
|
-
return_code, current_version = get_installed_kueue_version(
|
|
718
|
-
dry_run_version=Version('0.13')
|
|
719
|
-
)
|
|
720
|
-
if return_code != 0 or not current_version:
|
|
721
|
-
xpk_print(
|
|
722
|
-
'Error: Unable to validate sub-slicing support on a given cluster.'
|
|
723
|
-
)
|
|
724
|
-
xpk_exit(1)
|
|
725
|
-
|
|
726
|
-
if current_version < SUB_SLICING_MINIMUM_KUEUE_VERSION:
|
|
727
|
-
xpk_print(
|
|
728
|
-
"Error: Current Kueue version ({current_version}) doesn't support"
|
|
729
|
-
' Sub-slicing. The minimal required version is'
|
|
730
|
-
' v{SUB_SLICING_MINIMUM_KUEUE_VERSION}. Please either update Kueue'
|
|
731
|
-
' manually, or run "cluster create --sub-slicing" on the existing'
|
|
732
|
-
' cluster.'
|
|
733
|
-
)
|
|
734
|
-
xpk_exit(1)
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
def _validate_sub_slicing_topology(
|
|
738
|
-
system_characteristics: SystemCharacteristics, sub_slicing_topology: str
|
|
739
|
-
) -> None:
|
|
740
|
-
if sub_slicing_topology not in SUB_SLICING_TOPOLOGIES:
|
|
741
|
-
xpk_print(
|
|
742
|
-
f'Error: --sub-slicing-topology={sub_slicing_topology} shape is'
|
|
743
|
-
f' invalid. It has to be one of: {", ".join(SUB_SLICING_TOPOLOGIES)}.'
|
|
744
|
-
)
|
|
745
|
-
xpk_exit(1)
|
|
746
|
-
|
|
747
|
-
if not is_topology_contained(
|
|
748
|
-
contained=sub_slicing_topology, container=system_characteristics.topology
|
|
749
|
-
):
|
|
750
|
-
xpk_print(
|
|
751
|
-
f'Error: --sub-slicing-topology={sub_slicing_topology} shape is too'
|
|
752
|
-
' large. The shape cannot be bigger than'
|
|
753
|
-
f' {system_characteristics.topology}.'
|
|
754
|
-
)
|
|
755
|
-
xpk_exit(1)
|
|
756
|
-
|
|
757
|
-
validate_sub_slicing_system(system_characteristics)
|
|
758
|
-
|
|
759
|
-
|
|
760
745
|
def get_restart_exit_codes(args) -> list:
|
|
761
746
|
exit_codes = [42]
|
|
762
747
|
exit_codes.extend(range(127, 256, 1))
|