xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/__init__.py +15 -0
- integration/docker_manager_test.py +102 -0
- integration/gcluster_a3mega_test.py +204 -0
- integration/gcluster_a3ultra_test.py +176 -0
- integration/gcluster_a4_test.py +176 -0
- integration/gcluster_test.py +107 -0
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +143 -117
- xpk/commands/cluster_gcluster.py +81 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/cluster_test.py +92 -0
- xpk/commands/common.py +14 -26
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +39 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +84 -29
- xpk/commands/workload_test.py +81 -0
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/blueprint/testing/__init__.py +15 -0
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +423 -0
- xpk/core/kueue_manager_test.py +574 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +350 -232
- xpk/core/system_characteristics_test.py +73 -0
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/templates/cluster_preheat.yaml.j2 +31 -0
- xpk/templates/filestore-pv.yaml +17 -0
- xpk/templates/filestore-pvc.yaml +11 -0
- xpk/templates/filestore-sc.yaml +10 -0
- xpk/templates/fuse-pv.yaml +17 -0
- xpk/templates/fuse-pvc.yaml +13 -0
- xpk/templates/kueue_config.yaml.j2 +95 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
- xpk/templates/mtc-cpc.yaml +15 -0
- xpk/templates/volume_bundle.yaml +7 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +15 -0
- xpk/utils/topology.py +46 -0
- xpk/utils/topology_test.py +63 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
- xpk-0.14.1.dist-info/RECORD +133 -0
- xpk-0.14.1.dist-info/top_level.txt +2 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- xpk-0.13.0.dist-info/top_level.txt +0 -1
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
xpk/commands/workload.py
CHANGED
|
@@ -34,7 +34,7 @@ from ..core.docker_container import (
|
|
|
34
34
|
)
|
|
35
35
|
from ..core.docker_resources import get_volumes, parse_env_config
|
|
36
36
|
from ..core.gcloud_context import add_zone_and_project
|
|
37
|
-
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
37
|
+
from ..core.kueue_manager import LOCAL_QUEUE_NAME
|
|
38
38
|
from ..core.monitoring import get_gke_outlier_dashboard
|
|
39
39
|
from ..core.nap import (
|
|
40
40
|
get_autoprovisioning_node_selector_args,
|
|
@@ -52,10 +52,7 @@ from ..core.pathways import (
|
|
|
52
52
|
get_user_workload_for_pathways,
|
|
53
53
|
try_to_delete_pathwaysjob_first,
|
|
54
54
|
)
|
|
55
|
-
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
|
|
56
|
-
from ..core.capacity import (
|
|
57
|
-
CapacityType,
|
|
58
|
-
)
|
|
55
|
+
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics, SystemCharacteristics
|
|
59
56
|
from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
|
|
60
57
|
from ..core.scheduling import (
|
|
61
58
|
check_if_workload_can_schedule,
|
|
@@ -65,6 +62,7 @@ from ..core.scheduling import (
|
|
|
65
62
|
create_tpu_topology,
|
|
66
63
|
get_cpu_affinity,
|
|
67
64
|
get_gpu_scheduler,
|
|
65
|
+
create_sub_slicing_annotations,
|
|
68
66
|
)
|
|
69
67
|
from ..core.storage import (
|
|
70
68
|
GCE_PD_TYPE,
|
|
@@ -80,6 +78,7 @@ from ..core.storage import (
|
|
|
80
78
|
from ..core.system_characteristics import (
|
|
81
79
|
AcceleratorType,
|
|
82
80
|
get_system_characteristics,
|
|
81
|
+
compute_vms_per_slice,
|
|
83
82
|
)
|
|
84
83
|
from ..core.vertex import create_vertex_experiment
|
|
85
84
|
from ..core.workload import (
|
|
@@ -87,7 +86,7 @@ from ..core.workload import (
|
|
|
87
86
|
get_jobsets_list_gcp_link,
|
|
88
87
|
get_workload_list,
|
|
89
88
|
wait_for_job_completion,
|
|
90
|
-
|
|
89
|
+
get_cluster_location,
|
|
91
90
|
)
|
|
92
91
|
from ..core.workload_decorators import (
|
|
93
92
|
rdma_decorator,
|
|
@@ -98,8 +97,11 @@ from ..core.workload_decorators import (
|
|
|
98
97
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
99
98
|
from ..utils.file import write_tmp_file
|
|
100
99
|
from ..utils.execution_context import is_dry_run
|
|
100
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
101
101
|
from . import cluster_gcluster
|
|
102
|
-
from .common import is_TAS_possible
|
|
102
|
+
from .common import is_TAS_possible, validate_sub_slicing_system
|
|
103
|
+
from ..utils.topology import is_topology_contained
|
|
104
|
+
from ..utils.feature_flags import FeatureFlags
|
|
103
105
|
|
|
104
106
|
WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
|
|
105
107
|
kind: JobSet
|
|
@@ -120,8 +122,8 @@ spec:
|
|
|
120
122
|
replicas: {args.num_slices}
|
|
121
123
|
template:
|
|
122
124
|
spec:
|
|
123
|
-
parallelism: {
|
|
124
|
-
completions: {
|
|
125
|
+
parallelism: {vms_per_slice} # Equal to the number of VMs per slice (or sub-slice).
|
|
126
|
+
completions: {vms_per_slice} # Same as the above.
|
|
125
127
|
backoffLimit: 0 # When any pod fails, the job is failed
|
|
126
128
|
{pod_failure_policy}
|
|
127
129
|
template:
|
|
@@ -130,6 +132,7 @@ spec:
|
|
|
130
132
|
xpk.google.com/workload: {args.workload}
|
|
131
133
|
annotations:
|
|
132
134
|
{storage_annotations}
|
|
135
|
+
{sub_slicing_annotations}
|
|
133
136
|
spec:
|
|
134
137
|
schedulerName: {args.scheduler}
|
|
135
138
|
imagePullSecrets:
|
|
@@ -267,6 +270,8 @@ PW_WORKLOAD_CREATE_YAML = """
|
|
|
267
270
|
maxSliceRestarts: {args.max_slice_restarts}
|
|
268
271
|
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
|
|
269
272
|
priorityClassName: {args.priority}
|
|
273
|
+
nodeSelector:
|
|
274
|
+
{autoprovisioning_args}
|
|
270
275
|
pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
|
|
271
276
|
controller:
|
|
272
277
|
# #Pod template for training, default mode.
|
|
@@ -277,6 +282,8 @@ PW_WORKLOAD_CREATE_YAML = """
|
|
|
277
282
|
{user_workload}
|
|
278
283
|
"""
|
|
279
284
|
|
|
285
|
+
SUB_SLICING_TOPOLOGIES = ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
|
|
286
|
+
|
|
280
287
|
|
|
281
288
|
def workload_create_pathways(args) -> None:
|
|
282
289
|
"""Run jobset apply command for a file, specifically for Pathways.
|
|
@@ -307,6 +314,12 @@ def workload_create(args) -> None:
|
|
|
307
314
|
Returns:
|
|
308
315
|
0 if successful and 1 otherwise.
|
|
309
316
|
"""
|
|
317
|
+
if should_validate_dependencies(args):
|
|
318
|
+
validate_dependencies_list([
|
|
319
|
+
SystemDependency.KUBECTL,
|
|
320
|
+
SystemDependency.GCLOUD,
|
|
321
|
+
SystemDependency.DOCKER,
|
|
322
|
+
])
|
|
310
323
|
k8s_api_client = None
|
|
311
324
|
if not is_dry_run():
|
|
312
325
|
k8s_api_client = setup_k8s_env(args)
|
|
@@ -321,20 +334,21 @@ def workload_create(args) -> None:
|
|
|
321
334
|
)
|
|
322
335
|
xpk_exit(1)
|
|
323
336
|
|
|
324
|
-
xpk_print('Starting workload create', flush=True)
|
|
325
337
|
system, return_code = get_system_characteristics(args)
|
|
326
|
-
|
|
327
338
|
if return_code > 0 or system is None:
|
|
328
339
|
xpk_print('Fetching system characteristics failed!')
|
|
329
340
|
xpk_exit(return_code)
|
|
330
341
|
|
|
342
|
+
if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None:
|
|
343
|
+
_validate_sub_slicing_topology(system, args.sub_slicing_topology)
|
|
344
|
+
|
|
331
345
|
if not check_if_workload_can_schedule(args, system):
|
|
332
346
|
xpk_exit(1)
|
|
333
347
|
|
|
334
348
|
xpk_print('Starting workload create', flush=True)
|
|
335
349
|
|
|
336
350
|
metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
|
|
337
|
-
cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
|
|
351
|
+
cluster_config_map = get_cluster_configmap(metadata_configmap_name)
|
|
338
352
|
cluster_xpk_version = None
|
|
339
353
|
if cluster_config_map is None:
|
|
340
354
|
xpk_print(
|
|
@@ -482,16 +496,12 @@ def workload_create(args) -> None:
|
|
|
482
496
|
capacity_type = get_cluster_capacity_type(args)
|
|
483
497
|
|
|
484
498
|
annotations = (
|
|
485
|
-
|
|
486
|
-
if not is_TAS_possible(
|
|
487
|
-
system_characteristics,
|
|
488
|
-
capacity_type,
|
|
489
|
-
flex=True if capacity_type == CapacityType.FLEX_START else False,
|
|
490
|
-
)
|
|
491
|
-
else (
|
|
499
|
+
(
|
|
492
500
|
'kueue.x-k8s.io/podset-preferred-topology:'
|
|
493
501
|
' "cloud.google.com/gce-topology-host"'
|
|
494
502
|
)
|
|
503
|
+
if is_TAS_possible(system_characteristics, capacity_type)
|
|
504
|
+
else ''
|
|
495
505
|
)
|
|
496
506
|
|
|
497
507
|
if (
|
|
@@ -507,7 +517,7 @@ def workload_create(args) -> None:
|
|
|
507
517
|
annotations=annotations,
|
|
508
518
|
)
|
|
509
519
|
|
|
510
|
-
sub_networks = get_cluster_subnetworks(args)
|
|
520
|
+
sub_networks = get_cluster_subnetworks()
|
|
511
521
|
if args.device_type == a3high_device_type:
|
|
512
522
|
yml_string = tcpx_decorator.decorate_jobset(yml_string)
|
|
513
523
|
elif args.device_type == a3mega_device_type:
|
|
@@ -545,6 +555,7 @@ def workload_create(args) -> None:
|
|
|
545
555
|
colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
|
|
546
556
|
user_workload=get_user_workload_for_pathways(args, system),
|
|
547
557
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
558
|
+
autoprovisioning_args=autoprovisioning_args,
|
|
548
559
|
)
|
|
549
560
|
else:
|
|
550
561
|
container, debugging_dashboard_id = get_user_workload_container(
|
|
@@ -552,12 +563,26 @@ def workload_create(args) -> None:
|
|
|
552
563
|
)
|
|
553
564
|
yml_string = WORKLOAD_CREATE_YAML.format(
|
|
554
565
|
args=args,
|
|
555
|
-
system=system,
|
|
556
566
|
container=container,
|
|
567
|
+
vms_per_slice=(
|
|
568
|
+
compute_vms_per_slice(args.sub_slicing_topology)
|
|
569
|
+
if system.accelerator_type == AcceleratorType['TPU']
|
|
570
|
+
and FeatureFlags.SUB_SLICING_ENABLED
|
|
571
|
+
and args.sub_slicing_topology is not None
|
|
572
|
+
else system.vms_per_slice
|
|
573
|
+
),
|
|
557
574
|
affinity=get_cpu_affinity(system.accelerator_type),
|
|
558
575
|
accelerator_label=create_accelerator_label(
|
|
559
576
|
system.accelerator_type, system
|
|
560
577
|
),
|
|
578
|
+
sub_slicing_annotations=(
|
|
579
|
+
''
|
|
580
|
+
if not FeatureFlags.SUB_SLICING_ENABLED
|
|
581
|
+
or args.sub_slicing_topology is None
|
|
582
|
+
else ('\n' + (' ' * 16)).join(
|
|
583
|
+
create_sub_slicing_annotations(args.sub_slicing_topology)
|
|
584
|
+
)
|
|
585
|
+
),
|
|
561
586
|
machine_label=create_machine_label(system.accelerator_type, system),
|
|
562
587
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
563
588
|
autoprovisioning_args=autoprovisioning_args,
|
|
@@ -575,7 +600,7 @@ def workload_create(args) -> None:
|
|
|
575
600
|
)
|
|
576
601
|
tmp = write_tmp_file(yml_string)
|
|
577
602
|
command = f'kubectl apply -f {str(tmp)}'
|
|
578
|
-
return_code = run_command_with_updates(command, 'Creating Workload', args)
|
|
603
|
+
return_code = run_command_with_updates(command, 'Creating Workload')
|
|
579
604
|
|
|
580
605
|
if return_code != 0:
|
|
581
606
|
xpk_print(f'Create Workload request returned ERROR {return_code}')
|
|
@@ -622,7 +647,9 @@ def workload_create(args) -> None:
|
|
|
622
647
|
' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'
|
|
623
648
|
" python -c 'import pathwaysutils; import jax; print(jax.devices())'"
|
|
624
649
|
)
|
|
625
|
-
pathways_proxy_link =
|
|
650
|
+
pathways_proxy_link = (
|
|
651
|
+
f'https://console.cloud.google.com/kubernetes/job/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
|
|
652
|
+
)
|
|
626
653
|
xpk_print(
|
|
627
654
|
'Follow the proxy here:'
|
|
628
655
|
# pylint: disable=line-too-long)
|
|
@@ -636,7 +663,7 @@ def workload_create(args) -> None:
|
|
|
636
663
|
xpk_print(
|
|
637
664
|
'Follow your workload here:'
|
|
638
665
|
# pylint: disable=line-too-long
|
|
639
|
-
f' https://console.cloud.google.com/kubernetes/service/{
|
|
666
|
+
f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
|
|
640
667
|
)
|
|
641
668
|
duration_of_logs = 'P1D' # Past 1 Day
|
|
642
669
|
xpk_print(
|
|
@@ -645,12 +672,35 @@ def workload_create(args) -> None:
|
|
|
645
672
|
' ([prefix]-slice-job-[slice_number]-[worker_number])'
|
|
646
673
|
' after clicking the url if you want other worker logs.'
|
|
647
674
|
# pylint: disable=line-too-long
|
|
648
|
-
f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{
|
|
675
|
+
f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{get_cluster_location(args.project, args.cluster, args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
|
|
649
676
|
)
|
|
650
677
|
|
|
651
678
|
xpk_exit(0)
|
|
652
679
|
|
|
653
680
|
|
|
681
|
+
def _validate_sub_slicing_topology(
|
|
682
|
+
system_characteristics: SystemCharacteristics, sub_slicing_topology: str
|
|
683
|
+
) -> None:
|
|
684
|
+
if sub_slicing_topology not in SUB_SLICING_TOPOLOGIES:
|
|
685
|
+
xpk_print(
|
|
686
|
+
f'Error: --sub-slicing-topology={sub_slicing_topology} shape is'
|
|
687
|
+
f' invalid. It has to be one of: {", ".join(SUB_SLICING_TOPOLOGIES)}.'
|
|
688
|
+
)
|
|
689
|
+
xpk_exit(1)
|
|
690
|
+
|
|
691
|
+
if not is_topology_contained(
|
|
692
|
+
contained=sub_slicing_topology, container=system_characteristics.topology
|
|
693
|
+
):
|
|
694
|
+
xpk_print(
|
|
695
|
+
f'Error: --sub-slicing-topology={sub_slicing_topology} shape is too'
|
|
696
|
+
' large. The shape cannot be bigger than'
|
|
697
|
+
f' {system_characteristics.topology}.'
|
|
698
|
+
)
|
|
699
|
+
xpk_exit(1)
|
|
700
|
+
|
|
701
|
+
validate_sub_slicing_system(system_characteristics)
|
|
702
|
+
|
|
703
|
+
|
|
654
704
|
def get_restart_exit_codes(args) -> list:
|
|
655
705
|
exit_codes = [42]
|
|
656
706
|
exit_codes.extend(range(127, 256, 1))
|
|
@@ -678,6 +728,10 @@ def workload_delete(args) -> None:
|
|
|
678
728
|
Returns:
|
|
679
729
|
0 if successful and 1 otherwise.
|
|
680
730
|
"""
|
|
731
|
+
if should_validate_dependencies(args):
|
|
732
|
+
validate_dependencies_list(
|
|
733
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
734
|
+
)
|
|
681
735
|
xpk_print('Starting Workload delete', flush=True)
|
|
682
736
|
add_zone_and_project(args)
|
|
683
737
|
get_cluster_credentials(args)
|
|
@@ -725,16 +779,13 @@ def workload_delete(args) -> None:
|
|
|
725
779
|
|
|
726
780
|
# Not batching deletion for single workload
|
|
727
781
|
if len(workloads) == 1:
|
|
728
|
-
return_code = run_command_with_updates(
|
|
729
|
-
commands[0], 'Delete Workload', args
|
|
730
|
-
)
|
|
782
|
+
return_code = run_command_with_updates(commands[0], 'Delete Workload')
|
|
731
783
|
else:
|
|
732
784
|
return_code = run_commands(
|
|
733
785
|
commands,
|
|
734
786
|
'Delete Workload',
|
|
735
787
|
task_names,
|
|
736
788
|
batch=100,
|
|
737
|
-
dry_run=args.dry_run,
|
|
738
789
|
)
|
|
739
790
|
|
|
740
791
|
if return_code != 0:
|
|
@@ -752,6 +803,10 @@ def workload_list(args) -> None:
|
|
|
752
803
|
Returns:
|
|
753
804
|
0 if successful and 1 otherwise.
|
|
754
805
|
"""
|
|
806
|
+
if should_validate_dependencies(args):
|
|
807
|
+
validate_dependencies_list(
|
|
808
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
809
|
+
)
|
|
755
810
|
xpk_print('Starting workload list', flush=True)
|
|
756
811
|
add_zone_and_project(args)
|
|
757
812
|
get_cluster_credentials(args)
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import dataclasses
|
|
18
|
+
from unittest.mock import MagicMock, patch
|
|
19
|
+
import pytest
|
|
20
|
+
from ..core.system_characteristics import SystemCharacteristics
|
|
21
|
+
from .workload import _validate_sub_slicing_topology
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
SYSTEM_CHARACTERISTICS = SystemCharacteristics(
|
|
25
|
+
topology='8x8',
|
|
26
|
+
vms_per_slice=1,
|
|
27
|
+
gke_accelerator='nvidia-l4',
|
|
28
|
+
gce_machine_type='g2-standard-12',
|
|
29
|
+
chips_per_vm=1,
|
|
30
|
+
accelerator_type=1,
|
|
31
|
+
device_type='l4-1',
|
|
32
|
+
supports_sub_slicing=True,
|
|
33
|
+
requires_workload_policy=False,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@pytest.fixture(autouse=True)
|
|
38
|
+
def xpk_print(mocker):
|
|
39
|
+
return mocker.patch('xpk.commands.workload.xpk_print')
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
|
|
43
|
+
xpk_print,
|
|
44
|
+
):
|
|
45
|
+
with pytest.raises(SystemExit):
|
|
46
|
+
_validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '2x1')
|
|
47
|
+
|
|
48
|
+
assert (
|
|
49
|
+
'shape is invalid. It has to be one of' in xpk_print.mock_calls[0].args[0]
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_validate_sub_slicing_topology_exits_for_too_large_topology(xpk_print):
|
|
54
|
+
with pytest.raises(SystemExit):
|
|
55
|
+
_validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '16x16')
|
|
56
|
+
|
|
57
|
+
assert (
|
|
58
|
+
'shape is too large. The shape cannot be'
|
|
59
|
+
in xpk_print.mock_calls[0].args[0]
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_validate_sub_slicing_topology_does_nothing_for_supported_topology():
|
|
64
|
+
_validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '4x4')
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@patch('xpk.commands.common.xpk_print')
|
|
68
|
+
def test_validate_sub_slicing_topology_fails_for_unsupported_system(
|
|
69
|
+
common_xpk_print: MagicMock,
|
|
70
|
+
):
|
|
71
|
+
unsupported_system = dataclasses.replace(
|
|
72
|
+
SYSTEM_CHARACTERISTICS, supports_sub_slicing=False
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
with pytest.raises(SystemExit):
|
|
76
|
+
_validate_sub_slicing_topology(unsupported_system, '4x4')
|
|
77
|
+
|
|
78
|
+
assert (
|
|
79
|
+
'l4-1 does not support Sub-slicing.'
|
|
80
|
+
in common_xpk_print.mock_calls[0].args[0]
|
|
81
|
+
)
|
|
@@ -32,7 +32,6 @@ from ..capacity import (
|
|
|
32
32
|
)
|
|
33
33
|
from ..system_characteristics import get_system_characteristics_by_device_type
|
|
34
34
|
from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
|
|
35
|
-
from ..kueue import KUEUE_VERSION
|
|
36
35
|
|
|
37
36
|
yaml_parser = yaml.YAML()
|
|
38
37
|
|
|
@@ -53,6 +52,7 @@ blueprint_dependencies_dir = {
|
|
|
53
52
|
|
|
54
53
|
cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
|
|
55
54
|
cluster_toolkit_version = "v1.62.2"
|
|
55
|
+
common_cluster_labels = {"gke_product_type": "xpk"}
|
|
56
56
|
|
|
57
57
|
|
|
58
58
|
class BlueprintGeneratorOutput:
|
|
@@ -216,26 +216,11 @@ class BlueprintGenerator:
|
|
|
216
216
|
a3_megagpu_pool_0.settings.update({"static_node_count": num_nodes})
|
|
217
217
|
|
|
218
218
|
set_placement_policy = capacity_type != CapacityType.SPOT
|
|
219
|
-
num_chips = num_nodes * system.chips_per_vm
|
|
220
219
|
workload = DeploymentModule(
|
|
221
220
|
id="workload_component_install",
|
|
222
221
|
source="modules/management/kubectl-apply",
|
|
223
222
|
use=["gke_cluster"],
|
|
224
223
|
settings={
|
|
225
|
-
"kueue": {
|
|
226
|
-
"install": True,
|
|
227
|
-
"version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
|
|
228
|
-
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
229
|
-
"config_template_vars": {
|
|
230
|
-
"num_chips": num_chips,
|
|
231
|
-
"reservation": (
|
|
232
|
-
1 if capacity_type == CapacityType.RESERVATION else 0
|
|
233
|
-
),
|
|
234
|
-
"flex_start": (
|
|
235
|
-
1 if capacity_type == CapacityType.FLEX_START else 0
|
|
236
|
-
),
|
|
237
|
-
},
|
|
238
|
-
},
|
|
239
224
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
240
225
|
"apply_manifests": [{
|
|
241
226
|
"source": f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
|
|
@@ -298,6 +283,7 @@ class BlueprintGenerator:
|
|
|
298
283
|
"deployment_name": blueprint_name,
|
|
299
284
|
"region": region,
|
|
300
285
|
"zone": zone,
|
|
286
|
+
"labels": common_cluster_labels,
|
|
301
287
|
},
|
|
302
288
|
)
|
|
303
289
|
|
|
@@ -598,24 +584,12 @@ class BlueprintGenerator:
|
|
|
598
584
|
else:
|
|
599
585
|
gpu_pool.settings.update({"static_node_count": num_nodes})
|
|
600
586
|
|
|
601
|
-
num_chips = num_nodes * system.chips_per_vm
|
|
602
587
|
workload_manager_install_id = "workload-manager-install"
|
|
603
588
|
workload_manager_install = DeploymentModule(
|
|
604
589
|
id=workload_manager_install_id,
|
|
605
590
|
source="modules/management/kubectl-apply",
|
|
606
591
|
use=[cluster_id],
|
|
607
592
|
settings={
|
|
608
|
-
"kueue": {
|
|
609
|
-
"install": True,
|
|
610
|
-
"version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
|
|
611
|
-
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
612
|
-
"config_template_vars": {
|
|
613
|
-
"num_chips": num_chips,
|
|
614
|
-
"flex_start": (
|
|
615
|
-
1 if capacity_type == CapacityType.FLEX_START else 0
|
|
616
|
-
),
|
|
617
|
-
},
|
|
618
|
-
},
|
|
619
593
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
620
594
|
"apply_manifests": [
|
|
621
595
|
{"source": nccl_installer_path},
|
|
@@ -676,6 +650,7 @@ class BlueprintGenerator:
|
|
|
676
650
|
"deployment_name": blueprint_name,
|
|
677
651
|
"region": region,
|
|
678
652
|
"zone": zone,
|
|
653
|
+
"labels": common_cluster_labels,
|
|
679
654
|
},
|
|
680
655
|
)
|
|
681
656
|
|
|
@@ -884,24 +859,12 @@ class BlueprintGenerator:
|
|
|
884
859
|
else:
|
|
885
860
|
gpu_pool.settings.update({"static_node_count": num_nodes})
|
|
886
861
|
|
|
887
|
-
num_chips = num_nodes * system.chips_per_vm
|
|
888
862
|
workload_manager_install_id = "workload-manager-install"
|
|
889
863
|
workload_manager_install = DeploymentModule(
|
|
890
864
|
id=workload_manager_install_id,
|
|
891
865
|
source="modules/management/kubectl-apply",
|
|
892
866
|
use=[cluster_id],
|
|
893
867
|
settings={
|
|
894
|
-
"kueue": {
|
|
895
|
-
"install": True,
|
|
896
|
-
"version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
|
|
897
|
-
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
898
|
-
"config_template_vars": {
|
|
899
|
-
"num_chips": num_chips,
|
|
900
|
-
"flex_start": (
|
|
901
|
-
1 if capacity_type == CapacityType.FLEX_START else 0
|
|
902
|
-
),
|
|
903
|
-
},
|
|
904
|
-
},
|
|
905
868
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
906
869
|
"apply_manifests": [
|
|
907
870
|
{"source": nccl_installer_path},
|
|
@@ -962,6 +925,7 @@ class BlueprintGenerator:
|
|
|
962
925
|
"deployment_name": blueprint_name,
|
|
963
926
|
"region": region,
|
|
964
927
|
"zone": zone,
|
|
928
|
+
"labels": common_cluster_labels,
|
|
965
929
|
},
|
|
966
930
|
)
|
|
967
931
|
|
|
@@ -32,7 +32,6 @@ a3_spot_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml"
|
|
|
32
32
|
a3_ultra_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_ultra.yaml"
|
|
33
33
|
a4_yaml_test_path = "src/xpk/core/blueprint/testing/data/a4.yaml"
|
|
34
34
|
config_map_filename = "config-map.yaml.tftpl"
|
|
35
|
-
kueue_conf_filename = "kueue-xpk-configuration.yaml.tftpl"
|
|
36
35
|
tmp_test_dir = "/tmp/xpk_test"
|
|
37
36
|
|
|
38
37
|
|
|
@@ -82,11 +81,6 @@ def test_generate_a3_mega_blueprint():
|
|
|
82
81
|
tmp_test_dir, "prefix", blueprint_name, config_map_filename
|
|
83
82
|
)
|
|
84
83
|
)
|
|
85
|
-
assert os.path.exists(
|
|
86
|
-
os.path.join(
|
|
87
|
-
tmp_test_dir, "prefix", blueprint_name, kueue_conf_filename
|
|
88
|
-
)
|
|
89
|
-
)
|
|
90
84
|
|
|
91
85
|
shutil.rmtree(tmp_test_dir)
|
|
92
86
|
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
xpk/core/capacity.py
CHANGED
|
@@ -17,6 +17,7 @@ limitations under the License.
|
|
|
17
17
|
import enum
|
|
18
18
|
|
|
19
19
|
from ..utils.console import xpk_print, xpk_exit
|
|
20
|
+
from ..utils.kueue import is_queued_cluster
|
|
20
21
|
from .commands import run_command_with_updates, run_command_for_value
|
|
21
22
|
|
|
22
23
|
AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
|
|
@@ -50,7 +51,7 @@ def print_reservations(args) -> int:
|
|
|
50
51
|
"""
|
|
51
52
|
command = f'gcloud beta compute reservations list --project={args.project}'
|
|
52
53
|
return_code = run_command_with_updates(
|
|
53
|
-
command, 'Get all reservations in the project', args
|
|
54
|
+
command, 'Get all reservations in the project'
|
|
54
55
|
)
|
|
55
56
|
if return_code != 0:
|
|
56
57
|
xpk_print(f'Get all reservations returned ERROR {return_code}')
|
|
@@ -119,7 +120,7 @@ def get_reservation_maintenance_interval(
|
|
|
119
120
|
f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
|
|
120
121
|
)
|
|
121
122
|
return_code, output = run_command_for_value(
|
|
122
|
-
command, 'Get reservation maintenance interval', args
|
|
123
|
+
command, 'Get reservation maintenance interval'
|
|
123
124
|
)
|
|
124
125
|
if return_code != 0:
|
|
125
126
|
xpk_print(f'Get reservation maintenance interval ERROR {return_code}')
|
|
@@ -143,7 +144,7 @@ def get_reservation_placement_policy(
|
|
|
143
144
|
f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
|
|
144
145
|
)
|
|
145
146
|
return_code, output = run_command_for_value(
|
|
146
|
-
command, 'Get reservation placement policy', args
|
|
147
|
+
command, 'Get reservation placement policy'
|
|
147
148
|
)
|
|
148
149
|
if return_code != 0:
|
|
149
150
|
xpk_print(f'Get reservation placement policy ERROR {return_code}')
|
|
@@ -164,7 +165,7 @@ def verify_reservation_exists(args) -> int:
|
|
|
164
165
|
f'gcloud beta compute reservations describe {args.reservation}'
|
|
165
166
|
f' --project={args.project} --zone={args.zone}'
|
|
166
167
|
)
|
|
167
|
-
return_code = run_command_with_updates(command, 'Describe reservation', args)
|
|
168
|
+
return_code = run_command_with_updates(command, 'Describe reservation')
|
|
168
169
|
if return_code != 0:
|
|
169
170
|
xpk_print(f'Describe reservation returned ERROR {return_code}')
|
|
170
171
|
xpk_print('Please confirm that your reservation name is correct.')
|
|
@@ -199,7 +200,7 @@ def get_capacity_arguments_from_capacity_type(
|
|
|
199
200
|
' --location-policy=ANY --reservation-affinity=none'
|
|
200
201
|
f' --no-enable-autorepair --max-nodes={max_nodes}'
|
|
201
202
|
)
|
|
202
|
-
if args.num_slices
|
|
203
|
+
if is_queued_cluster(args.num_slices):
|
|
203
204
|
capacity_args += ' --enable-queued-provisioning'
|
|
204
205
|
case CapacityType.RESERVATION:
|
|
205
206
|
capacity_args = (
|