xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +89 -32
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +16 -3
- xpk/commands/cluster_test.py +353 -7
- xpk/commands/config.py +3 -5
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +3 -1
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +143 -142
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +73 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +48 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +55 -104
- xpk/core/cluster_test.py +170 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +88 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +17 -19
- xpk/core/kueue_manager.py +205 -51
- xpk/core/kueue_manager_test.py +158 -4
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +37 -43
- xpk/core/nodepool_test.py +42 -19
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +144 -133
- xpk/core/scheduling_test.py +298 -6
- xpk/core/system_characteristics.py +256 -19
- xpk/core/system_characteristics_test.py +128 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +33 -13
- xpk/parser/cluster.py +40 -67
- xpk/parser/cluster_test.py +83 -3
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -29
- xpk/parser/workload_test.py +3 -49
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +10 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +2 -13
- xpk/utils/versions.py +31 -0
- xpk-0.16.0.dist-info/METADATA +127 -0
- xpk-0.16.0.dist-info/RECORD +168 -0
- xpk-0.14.4.dist-info/METADATA +0 -1645
- xpk-0.14.4.dist-info/RECORD +0 -139
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py
CHANGED
@@ -16,9 +16,7 @@ limitations under the License.
 
 from ..core.blueprint.blueprint_generator import (
     a3high_device_type,
-
-    a3ultra_device_type,
-    a4_device_type,
+    a4x_device_types,
 )
 from ..core.cluster import (
     XPK_SA,
@@ -27,15 +25,14 @@ from ..core.cluster import (
     setup_k8s_env,
 )
 from ..core.commands import run_command_with_updates, run_commands
-from ..core.kueue_manager import KueueManager, has_sub_slicing_enabled
 from ..core.config import (VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION)
 from ..core.docker_container import (
     get_main_container_docker_image,
     get_user_workload_container,
 )
+from ..core.kueue_manager import LOCAL_QUEUE_NAME
 from ..core.docker_resources import get_volumes, parse_env_config
 from ..core.gcloud_context import add_zone_and_project
-from ..core.kueue_manager import LOCAL_QUEUE_NAME
 from ..core.monitoring import get_gke_outlier_dashboard
 from ..core.nap import (
     get_autoprovisioning_node_selector_args,
@@ -53,17 +50,20 @@ from ..core.pathways import (
     get_user_workload_for_pathways,
     try_to_delete_pathwaysjob_first,
 )
-from ..core.resources import get_cluster_capacity_type,
-from ..core.resources import
+from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics_from_config_map
+from ..core.resources import ConfigMapType, get_cluster_configmap
+from ..core.nodepool import ensure_resource_policy_exists
 from ..core.scheduling import (
+    WorkloadScheduling,
     check_if_workload_can_schedule,
-    create_accelerator_label,
-    create_machine_label,
     create_tpu_machine_type,
     create_tpu_topology,
     get_cpu_affinity,
     get_gpu_scheduler,
     create_sub_slicing_annotations,
+    create_placement_policy_label,
+    get_placement_policy_name,
+    is_placement_policy_supported,
 )
 from ..core.storage import (
     GCE_PD_TYPE,
@@ -78,8 +78,9 @@ from ..core.storage import (
 )
 from ..core.system_characteristics import (
     AcceleratorType,
+    create_accelerator_label,
+    create_machine_label,
     get_system_characteristics,
-    compute_vms_per_slice,
 )
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
@@ -90,20 +91,16 @@ from ..core.workload import (
     get_cluster_location,
 )
 from ..core.workload_decorators import (
-    rdma_decorator,
     storage_decorator,
-    tcpx_decorator,
-    tcpxo_decorator,
 )
-from ..utils.console import
-from packaging.version import Version
+from ..utils.console import ask_for_user_consent, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from ..utils.execution_context import is_dry_run
 from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from . import cluster_gcluster
-from .common import is_TAS_possible
-from
-from ..utils.
+from .common import is_TAS_possible
+from jinja2 import Environment, FileSystemLoader
+from ..utils.templates import get_templates_absolute_path
 
 WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
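The dropped `rdma_decorator`/`tcpx_decorator`/`tcpxo_decorator` imports are not re-added anywhere in this file; per-device JobSet decoration is instead driven by a callable carried on the system characteristics (see the `gpu_config.jobset_decorator_fn` hunk further down). A minimal sketch of that dispatch shape — the dataclass layout here is assumed, not copied from `xpk/core/system_characteristics.py`:

```python
from dataclasses import dataclass
from typing import Callable, List, Optional


@dataclass
class GpuConfig:
  # Assumed layout; the real GpuConfig lives in xpk/core/system_characteristics.py.
  jobset_decorator_fn: Optional[Callable[[str, List[str]], str]] = None


def decorate_jobset(
    yml_string: str, sub_networks: List[str], gpu_config: Optional[GpuConfig]
) -> str:
  # Data-driven dispatch: no if/elif ladder over device types and no
  # per-device decorator imports at the top of workload.py.
  if gpu_config and callable(gpu_config.jobset_decorator_fn):
    return gpu_config.jobset_decorator_fn(yml_string, sub_networks)
  return yml_string
```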
@@ -144,6 +141,7 @@ spec:
               nodeSelector:
                 {accelerator_label}
                 {machine_label}
+                {placement_policy_label}
                 {autoprovisioning_args}
               priorityClassName: {args.priority}
               hostNetwork: true
@@ -193,6 +191,8 @@ spec:
           {gpu_scheduler}
           priorityClassName: {args.priority}
           restartPolicy: Never
+          nodeSelector:
+            {placement_policy_label}
           imagePullSecrets:
           - name: {args.docker_image_pull_secret}
           hostNetwork: true
@@ -238,6 +238,8 @@ spec:
         spec:
           priorityClassName: {args.priority}
           restartPolicy: Never
+          nodeSelector:
+            {placement_policy_label}
           imagePullSecrets:
           - name: {args.docker_image_pull_secret}
           dnsPolicy: ClusterFirstWithHostNet
@@ -273,6 +275,7 @@ PW_WORKLOAD_CREATE_YAML = """
       terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
       priorityClassName: {args.priority}
       nodeSelector:
+        {placement_policy_label}
         {autoprovisioning_args}
       pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
       controller:
@@ -284,8 +287,7 @@ PW_WORKLOAD_CREATE_YAML = """
   {user_workload}
 """
 
-
-SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
+ARM_GPU_WORKLOAD_CREATE_JINJA_FILE = 'arm_gpu_workload_crate.yaml.j2'
 
 
 def workload_create_pathways(args) -> None:
@@ -337,26 +339,35 @@ def workload_create(args) -> None:
     )
     xpk_exit(1)
 
-
-  if return_code > 0 or
+  workload_system, return_code = get_system_characteristics(args)
+  if return_code > 0 or workload_system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
-
-
-
-
-
+  resources_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )
+  cluster_system = get_cluster_system_characteristics_from_config_map(
+      resources_config_map
+  )
+  workload_scheduling = check_if_workload_can_schedule(
+      args=args,
+      workload_system=workload_system,
+      cluster_system=cluster_system,
+      resources_config_map=resources_config_map,
+  )
+  if workload_scheduling == WorkloadScheduling.UNAVAILABLE:
     xpk_exit(1)
 
   xpk_print('Starting workload create', flush=True)
 
-
-
+  cluster_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.METADATA
+  )
   cluster_xpk_version = None
   if cluster_config_map is None:
     xpk_print(
-
+        'Warning: Unable to find ConfigMap for the'
         ' cluster. We recommend to upgrade your cluster by running `xpk'
         ' cluster create`.'
     )
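`check_if_workload_can_schedule` now returns a `WorkloadScheduling` verdict instead of a plain boolean. Only `UNAVAILABLE` and `SUB_SLICING_AVAILABLE` appear in this diff, so the third member below is an assumption; this is a sketch of the tri-state gate, not the shipped enum from `xpk/core/scheduling.py`:

```python
import enum


class WorkloadScheduling(enum.Enum):
  AVAILABLE = 'available'  # assumed name for the plain-success verdict
  SUB_SLICING_AVAILABLE = 'sub_slicing_available'
  UNAVAILABLE = 'unavailable'


def may_proceed(verdict: WorkloadScheduling) -> bool:
  # workload_create aborts only on UNAVAILABLE; SUB_SLICING_AVAILABLE keeps
  # going and later switches the YAML rendering to sub-slicing annotations.
  return verdict != WorkloadScheduling.UNAVAILABLE
```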
@@ -388,7 +399,7 @@ def workload_create(args) -> None:
 
   autoprovisioning_args = ''
   autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
-      args,
+      args, workload_system
   )
   if return_code != 0:
     xpk_exit(return_code)
@@ -481,23 +492,35 @@ def workload_create(args) -> None:
       podFailurePolicy:
         rules:
         - action: FailJob
-          onPodConditions: []
           onExitCodes:
-            containerName: {get_main_container_docker_image(args,
+            containerName: {get_main_container_docker_image(args, workload_system)}
             operator: NotIn
             values: [{restart_on_exit_codes}]"""
 
+  if is_placement_policy_supported(workload_system):
+    ensure_resource_policy_exists(
+        resource_policy_name=get_placement_policy_name(workload_system),
+        project=args.project,
+        zone=args.zone,
+        topology=workload_system.topology,
+    )
+
+  placement_policy_label = (
+      create_placement_policy_label(workload_system)
+      if is_placement_policy_supported(workload_system)
+      else ''
+  )
+
   # Create the workload file based on accelerator type or workload type.
-  if
+  if workload_system.accelerator_type == AcceleratorType.GPU:
     container, debugging_dashboard_id = get_user_workload_container(
-        args,
+        args, workload_system
     )
     gpu_scheduler, return_code = get_gpu_scheduler(
-        args,
+        args, workload_system, autoprovisioning_args
     )
     if return_code != 0:
       xpk_exit(return_code)
-    system_characteristics = get_cluster_system_characteristics(args)
     capacity_type = get_cluster_capacity_type(args)
 
     annotations = (
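`ensure_resource_policy_exists` is new in `xpk/core/nodepool.py` and its body is not shown here. One plausible describe-then-create shape is sketched below, so that repeated `workload create` runs stay idempotent; the gcloud spellings are my reconstruction, not code from this release, and the shipped version presumably also threads `topology` into the create call:

```python
import subprocess


def ensure_resource_policy_exists(resource_policy_name, project, zone, topology):
  region = '-'.join(zone.split('-')[:2])  # e.g. us-central1-a -> us-central1
  describe = (
      f'gcloud compute resource-policies describe {resource_policy_name}'
      f' --project={project} --region={region}'
  )
  result = subprocess.run(describe.split(), capture_output=True, check=False)
  if result.returncode == 0:
    return  # policy already exists; creating it again would fail
  # `topology` is accepted above but unused here: the exact create flag it
  # maps to is not visible in this diff.
  create = (
      'gcloud compute resource-policies create group-placement'
      f' {resource_policy_name} --project={project} --region={region}'
      ' --collocation=collocated'
  )
  subprocess.run(create.split(), check=True)
```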
@@ -505,30 +528,55 @@ def workload_create(args) -> None:
             'kueue.x-k8s.io/podset-preferred-topology:'
             ' "cloud.google.com/gce-topology-host"'
         )
-        if is_TAS_possible(
+        if is_TAS_possible(cluster_system, capacity_type)
         else ''
     )
 
     if (
-
-        or
+        workload_system.device_type in cluster_gcluster.supported_device_types
+        or workload_system.device_type == a3high_device_type
+        or workload_system.device_type in a4x_device_types
     ):
-
-
-
-
-
-
-
-
+      if workload_system.device_type in a4x_device_types:
+        template_env = Environment(
+            loader=FileSystemLoader(searchpath=get_templates_absolute_path())
+        )
+        workload_create_yaml = template_env.get_template(
+            ARM_GPU_WORKLOAD_CREATE_JINJA_FILE
+        )
+        yml_string = workload_create_yaml.render(
+            workload=args.workload,
+            num_nodes=args.num_nodes,
+            ttl_seconds_after_finished=args.ttl_seconds_after_finished,
+            max_restarts=args.max_restarts,
+            priority=args.priority,
+            termination_grace_period_seconds=args.termination_grace_period_seconds,
+            docker_image_pull_secret=args.docker_image_pull_secret,
+            container=container,
+            service_account=XPK_SA,
+            failure_policy_rules=failure_policy_rules,
+            pod_failure_policy=pod_failure_policy,
+            annotations=annotations,
+            placement_policy_label=placement_policy_label,
+        )
+      else:
+        yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
+            args=args,
+            container=container,
+            service_account=XPK_SA,
+            failure_policy_rules=failure_policy_rules,
+            pod_failure_policy=pod_failure_policy,
+            annotations=annotations,
+            placement_policy_label=placement_policy_label,
+        )
 
       sub_networks = get_cluster_subnetworks()
-
-
-
-
-
-      yml_string =
+
+      if workload_system.gpu_config and callable(
+          workload_system.gpu_config.jobset_decorator_fn
+      ):
+        decorator_fn = workload_system.gpu_config.jobset_decorator_fn
+        yml_string = decorator_fn(yml_string, sub_networks)
 
       if all_storages:
         yml_string = storage_decorator.decorate_jobset(yml_string, all_storages)
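The a4x branch above is the first place this file renders a workload manifest through Jinja2 instead of `str.format`, which keeps conditionals and loops inside the template rather than in Python. A self-contained demo of the same `Environment`/`FileSystemLoader`/`get_template`/`render` chain — the template body is a stand-in, not xpk's `arm_gpu_workload_crate.yaml.j2`:

```python
import pathlib
import tempfile

from jinja2 import Environment, FileSystemLoader

# Write a throwaway template so the example runs anywhere.
tmpdir = pathlib.Path(tempfile.mkdtemp())
(tmpdir / 'demo.yaml.j2').write_text(
    'metadata:\n  name: {{ workload }}\nspec:\n  replicas: {{ num_nodes }}\n'
)

template_env = Environment(loader=FileSystemLoader(searchpath=str(tmpdir)))
yml_string = template_env.get_template('demo.yaml.j2').render(
    workload='my-job', num_nodes=2
)
print(yml_string)
```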
@@ -537,61 +585,64 @@ def workload_create(args) -> None:
         args=args,
         container=container,
         gpu_scheduler=gpu_scheduler,
-        volumes=get_volumes(args,
+        volumes=get_volumes(args, workload_system),
         storage_annotations=('\n' + (' ' * 12)).join(
             get_storage_annotations(all_storages)
         ),
         service_account=service_account,
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
+        placement_policy_label=placement_policy_label,
     )
 
   elif args.use_pathways and ensure_pathways_workload_prerequisites(
-      args,
+      args, workload_system
   ):
     yml_string = PW_WORKLOAD_CREATE_YAML.format(
         args=args,
-
-
-        machine_type=create_tpu_machine_type(system.accelerator_type, system),
+        topology=create_tpu_topology(workload_system),
+        machine_type=create_tpu_machine_type(workload_system),
         custom_pathways_proxy_server=append_custom_pathways_proxy_server(args),
         custom_pathways_server=append_custom_pathways_server(args),
         custom_pathways_worker=append_custom_pathways_worker(args),
         colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
-        user_workload=get_user_workload_for_pathways(args,
+        user_workload=get_user_workload_for_pathways(args, workload_system),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
+        placement_policy_label=placement_policy_label,
     )
   else:
+    use_sub_slicing = (
+        workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
+    )
+    if use_sub_slicing:
+      xpk_print('Workload will be scheduled using the Sub-slicing feature.')
+
     container, debugging_dashboard_id = get_user_workload_container(
-        args,
+        args, workload_system
     )
     yml_string = WORKLOAD_CREATE_YAML.format(
         args=args,
         container=container,
-        vms_per_slice=
-
-
-        and FeatureFlags.SUB_SLICING_ENABLED
-        and args.sub_slicing_topology is not None
-        else system.vms_per_slice
-        ),
-        affinity=get_cpu_affinity(system.accelerator_type),
-        accelerator_label=create_accelerator_label(
-            system.accelerator_type, system
-        ),
+        vms_per_slice=workload_system.vms_per_slice,
+        affinity=get_cpu_affinity(workload_system.accelerator_type),
+        accelerator_label=create_accelerator_label(workload_system),
         sub_slicing_annotations=(
-            ''
-
-            or args.sub_slicing_topology is None
-            else ('\n' + (' ' * 16)).join(
-                create_sub_slicing_annotations(args.sub_slicing_topology)
+            ('\n' + (' ' * 16)).join(
+                create_sub_slicing_annotations(workload_system.topology)
             )
+            if use_sub_slicing
+            else ''
+        ),
+        placement_policy_label=placement_policy_label,
+        machine_label=(
+            create_machine_label(cluster_system)
+            if use_sub_slicing and cluster_system
+            else create_machine_label(workload_system)
         ),
-        machine_label=create_machine_label(system.accelerator_type, system),
         local_queue_name=LOCAL_QUEUE_NAME,
        autoprovisioning_args=autoprovisioning_args,
-        volumes=get_volumes(args,
+        volumes=get_volumes(args, workload_system),
         storage_annotations=('\n' + (' ' * 16)).join(
             get_storage_annotations(all_storages)
         ),
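Both the `storage_annotations` joins and the new `sub_slicing_annotations` expression splice a multi-line list into a string template whose slot sits at a fixed indent, hence the `('\n' + (' ' * 16)).join(...)` idiom: the template supplies the indent for the first line, and the join re-indents every following line so the YAML block stays aligned. A tiny reproduction:

```python
annotations = ['alpha.example.com/x: "1"', 'beta.example.com/y: "2"']
# The YAML template contributes the 16-space indent for the first entry;
# the join supplies it for each subsequent entry.
block = ('\n' + (' ' * 16)).join(annotations)
print('              annotations:\n' + (' ' * 16) + block)
```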
@@ -599,10 +650,18 @@ def workload_create(args) -> None:
         tpu_toleration="""
                 - operator: "Exists"
                   key: google.com/tpu
-      """ if
+      """ if workload_system.accelerator_type == AcceleratorType.TPU else '',
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
     )
+  if args.output_manifest_file:
+    with open(args.output_manifest_file, 'w', encoding='utf-8') as f:
+      f.write(yml_string)
+    xpk_print(
+        f'Workload {args.workload} manifest written to'
+        f' {args.output_manifest_file}'
+    )
+
   tmp = write_tmp_file(yml_string)
   command = f'kubectl apply -f {str(tmp)}'
   return_code = run_command_with_updates(command, 'Creating Workload')
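The new `args.output_manifest_file` block writes the fully rendered JobSet YAML to disk before `kubectl apply` runs, which makes the manifest reviewable or committable. Only the attribute name is visible in this diff, so the flag spelling below is inferred:

```python
from typing import Optional

# Hypothetical invocation (flag spelling inferred from args.output_manifest_file):
#   xpk workload create --cluster=my-cluster --workload=my-job ... \
#       --output-manifest-file=jobset.yaml


def save_manifest(yml_string: str, path: Optional[str]) -> None:
  # The block above amounts to this: persist the same bytes that
  # kubectl apply will receive.
  if path:
    with open(path, 'w', encoding='utf-8') as f:
      f.write(yml_string)
```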
@@ -616,7 +675,7 @@ def workload_create(args) -> None:
 
   # Get GKE outlier dashboard for TPU
   outlier_dashboard_id = None
-  if
+  if workload_system.accelerator_type == AcceleratorType.TPU:
     outlier_dashboard_id = get_gke_outlier_dashboard(args)
 
   # Outlier and debugging dashboards
@@ -683,63 +742,6 @@ def workload_create(args) -> None:
   xpk_exit(0)
 
 
-def _validate_sub_slicing_availability():
-  return_code, sub_slicing_enabled = has_sub_slicing_enabled()
-  if return_code != 0:
-    xpk_print(
-        'Error: Unable to validate sub-slicing support on a given cluster.'
-    )
-    xpk_exit(1)
-
-  if not sub_slicing_enabled:
-    xpk_print(
-        'Error: Cluster has not been not set up for Sub-slicing. Please enable'
-        ' --sub-slicing in "cluster create" command first.'
-    )
-    xpk_exit(1)
-
-  kueue_manager = KueueManager()
-  return_code, current_version = kueue_manager.get_installed_kueue_version()
-  if return_code != 0:
-    xpk_print(
-        'Error: Unable to validate sub-slicing support on a given cluster.'
-    )
-    xpk_exit(1)
-
-  if current_version < SUB_SLICING_MINIMUM_KUEUE_VERSION:
-    xpk_print(
-        "Error: Current Kueue version ({current_version}) doesn't support"
-        ' Sub-slicing. The minimal required version is'
-        ' v{SUB_SLICING_MINIMUM_KUEUE_VERSION}. Please either update Kueue'
-        ' manually, or run "cluster create --sub-slicing" on the existing'
-        ' cluster.'
-    )
-    xpk_exit(1)
-
-
-def _validate_sub_slicing_topology(
-    system_characteristics: SystemCharacteristics, sub_slicing_topology: str
-) -> None:
-  if sub_slicing_topology not in SUB_SLICING_TOPOLOGIES:
-    xpk_print(
-        f'Error: --sub-slicing-topology={sub_slicing_topology} shape is'
-        f' invalid. It has to be one of: {", ".join(SUB_SLICING_TOPOLOGIES)}.'
-    )
-    xpk_exit(1)
-
-  if not is_topology_contained(
-      contained=sub_slicing_topology, container=system_characteristics.topology
-  ):
-    xpk_print(
-        f'Error: --sub-slicing-topology={sub_slicing_topology} shape is too'
-        ' large. The shape cannot be bigger than'
-        f' {system_characteristics.topology}.'
-    )
-    xpk_exit(1)
-
-  validate_sub_slicing_system(system_characteristics)
-
-
 def get_restart_exit_codes(args) -> list:
   exit_codes = [42]
   exit_codes.extend(range(127, 256, 1))
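The deleted `_validate_sub_slicing_availability` carried this file's only use of `packaging.version`, which is why the `Version` import disappears in the hunk at the top; judging by this release's file list, the Kueue and sub-slicing checks presumably now live in `xpk/core/kueue_manager.py` and `xpk/core/scheduling.py`. The version comparison it performed, reproduced standalone:

```python
from packaging.version import Version

SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')

# Version objects compare semantically, so pre-releases and patch levels
# order correctly ('0.14.0rc1' > '0.13.0' > '0.12.1').
for installed in ('0.12.1', '0.13.0', '0.14.0rc1'):
  supported = Version(installed) >= SUB_SLICING_MINIMUM_KUEUE_VERSION
  print(installed, '->', 'ok' if supported else 'too old for Sub-slicing')
```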
@@ -785,11 +787,10 @@ def workload_delete(args) -> None:
     xpk_exit(return_code)
   # Skip the header
   workloads = [x.split(' ')[0] for x in return_value.splitlines()][1:]
-  if workloads
-    will_delete =
+  if workloads:
+    will_delete = ask_for_user_consent(
         f'Planning to delete {len(workloads)} workloads in the cluster'
-        f' {args.cluster} including {workloads}. \nDo you wish to delete
-        ' (yes) / n (no):\n'
+        f' {args.cluster} including {workloads}. \nDo you wish to delete?'
     )
   else:
     workloads = [args.workload]
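`ask_for_user_consent` (imported from `..utils.console` in the hunk at the top, with tests added under `xpk/utils/user_input_test.py` in this release) replaces the inline `(yes) / n (no)` prompt string. Its implementation is not shown in this diff; a guess at its minimal shape:

```python
def ask_for_user_consent(question: str) -> bool:
  # Loop until the user gives an unambiguous answer; the shipped version
  # may differ (e.g. a dry-run or non-interactive bypass).
  while True:
    answer = input(f'{question} (y/n): ').strip().lower()
    if answer in ('y', 'yes'):
      return True
    if answer in ('n', 'no'):
      return False
```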