xpk 0.16.1__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +48 -5
- xpk/commands/cluster_gcluster.py +3 -0
- xpk/commands/cluster_gcluster_test.py +2 -0
- xpk/commands/cluster_test.py +203 -0
- xpk/commands/common.py +6 -0
- xpk/commands/kind.py +2 -0
- xpk/commands/workload.py +35 -15
- xpk/commands/workload_test.py +1 -0
- xpk/core/capacity.py +83 -46
- xpk/core/capacity_test.py +82 -28
- xpk/core/commands.py +39 -12
- xpk/core/kueue_manager.py +42 -11
- xpk/core/kueue_manager_test.py +83 -3
- xpk/core/nap.py +5 -4
- xpk/core/nodepool.py +57 -20
- xpk/core/nodepool_test.py +152 -23
- xpk/core/pathways.py +2 -1
- xpk/core/resources.py +3 -3
- xpk/core/scheduling.py +54 -10
- xpk/core/scheduling_test.py +118 -13
- xpk/core/system_characteristics.py +41 -24
- xpk/core/system_characteristics_test.py +37 -4
- xpk/core/telemetry.py +5 -0
- xpk/core/telemetry_test.py +19 -2
- xpk/core/updates.py +1 -1
- xpk/main.py +2 -1
- xpk/parser/cluster.py +34 -2
- xpk/parser/cluster_test.py +117 -0
- xpk/parser/common.py +32 -0
- xpk/parser/common_test.py +49 -0
- xpk/templates/kueue_config.yaml.j2 +21 -5
- xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
- xpk/utils/kueue.py +6 -2
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/METADATA +2 -1
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/RECORD +39 -37
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/WHEEL +0 -0
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py
CHANGED
|
@@ -19,7 +19,7 @@ from tabulate import tabulate
|
|
|
19
19
|
from ..utils.feature_flags import FeatureFlags
|
|
20
20
|
from ..utils.versions import ReleaseChannel
|
|
21
21
|
from ..core.pathways import get_pathways_machine_types
|
|
22
|
-
from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type
|
|
22
|
+
from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type, parse_reservation
|
|
23
23
|
from ..core.cluster import (
|
|
24
24
|
get_all_clusters_programmatic,
|
|
25
25
|
get_cluster_credentials,
|
|
@@ -79,7 +79,7 @@ from ..utils.file import write_tmp_file
|
|
|
79
79
|
from ..utils.execution_context import is_dry_run, is_quiet
|
|
80
80
|
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
81
81
|
from . import cluster_gcluster
|
|
82
|
-
from .common import set_cluster_command, validate_sub_slicing_system
|
|
82
|
+
from .common import set_cluster_command, validate_sub_slicing_system, validate_super_slicing_system
|
|
83
83
|
from jinja2 import Environment, FileSystemLoader
|
|
84
84
|
from ..utils.templates import get_templates_absolute_path
|
|
85
85
|
import shutil
|
|
@@ -211,6 +211,11 @@ def _validate_cluster_create_args(args, system: SystemCharacteristics):
|
|
|
211
211
|
if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
|
|
212
212
|
validate_sub_slicing_system(system)
|
|
213
213
|
_validate_sub_slicing_reservation(args)
|
|
214
|
+
if FeatureFlags.SUPER_SLICING_ENABLED:
|
|
215
|
+
_validate_num_slices_and_set_default(args)
|
|
216
|
+
if args.super_slicing:
|
|
217
|
+
validate_super_slicing_system(system)
|
|
218
|
+
_validate_super_slicing_reservation(args)
|
|
214
219
|
if args.enable_pathways:
|
|
215
220
|
_validate_pathways_machine(args)
|
|
216
221
|
|
|
@@ -233,15 +238,30 @@ def _validate_pathways_machine(args):
|
|
|
233
238
|
|
|
234
239
|
|
|
235
240
|
def _validate_sub_slicing_reservation(args):
|
|
241
|
+
_validate_gsc_reservation(args, 'Sub-slicing')
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _validate_super_slicing_reservation(args):
|
|
245
|
+
_validate_gsc_reservation(args, 'Super-slicing')
|
|
246
|
+
reservation = parse_reservation(args.reservation, args.project)
|
|
247
|
+
if reservation.block_name is None:
|
|
248
|
+
xpk_print(
|
|
249
|
+
'Error: Validation failed: Super-slicing cluster creation'
|
|
250
|
+
' requires a block or sub-block reservation.'
|
|
251
|
+
)
|
|
252
|
+
xpk_exit(1)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _validate_gsc_reservation(args, creation_description: str):
|
|
236
256
|
if args.reservation is None:
|
|
237
257
|
xpk_print(
|
|
238
|
-
'Error: Validation failed:
|
|
239
|
-
' Cluster Director reservation to be specified.'
|
|
258
|
+
f'Error: Validation failed: {creation_description} cluster creation'
|
|
259
|
+
' requires Cluster Director reservation to be specified.'
|
|
240
260
|
)
|
|
241
261
|
xpk_exit(1)
|
|
242
262
|
|
|
243
263
|
deployment_type = get_reservation_deployment_type(
|
|
244
|
-
|
|
264
|
+
reservation_path=args.reservation, project=args.project, zone=args.zone
|
|
245
265
|
)
|
|
246
266
|
if deployment_type != 'DENSE':
|
|
247
267
|
xpk_print(
|
|
@@ -263,6 +283,22 @@ def _validate_sub_slicing_reservation(args):
|
|
|
263
283
|
xpk_exit(1)
|
|
264
284
|
|
|
265
285
|
|
|
286
|
+
def _validate_num_slices_and_set_default(args):
|
|
287
|
+
if args.num_cubes is not None and not args.super_slicing:
|
|
288
|
+
xpk_print('--num-cubes can only be used with --super-slicing')
|
|
289
|
+
xpk_exit(1)
|
|
290
|
+
|
|
291
|
+
if (
|
|
292
|
+
args.num_cubes is not None
|
|
293
|
+
and args.num_slices is not None
|
|
294
|
+
and args.num_cubes != args.num_slices
|
|
295
|
+
):
|
|
296
|
+
xpk_print('--num-cubes must not be different from --num-slices')
|
|
297
|
+
xpk_exit(1)
|
|
298
|
+
|
|
299
|
+
args.num_slices = args.num_slices or args.num_cubes or 1
|
|
300
|
+
|
|
301
|
+
|
|
266
302
|
def cluster_create(args) -> None:
|
|
267
303
|
"""Function around cluster creation.
|
|
268
304
|
|
|
@@ -374,6 +410,7 @@ def cluster_create(args) -> None:
|
|
|
374
410
|
)
|
|
375
411
|
if return_code != 0:
|
|
376
412
|
xpk_exit(return_code)
|
|
413
|
+
assert gke_node_pool_version
|
|
377
414
|
|
|
378
415
|
run_gke_node_pool_create_command_code = run_gke_node_pool_create_command(
|
|
379
416
|
args, system, gke_node_pool_version
|
|
@@ -1256,6 +1293,9 @@ def run_gke_cluster_create_command(
|
|
|
1256
1293
|
addons_str = ','.join(addons)
|
|
1257
1294
|
command += f' --addons={addons_str}'
|
|
1258
1295
|
|
|
1296
|
+
if FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing:
|
|
1297
|
+
command += ' --enable-slice-controller'
|
|
1298
|
+
|
|
1259
1299
|
if args.custom_cluster_arguments:
|
|
1260
1300
|
command += f' {args.custom_cluster_arguments}'
|
|
1261
1301
|
|
|
@@ -1347,6 +1387,9 @@ def _install_kueue(
|
|
|
1347
1387
|
configure_sub_slicing=(
|
|
1348
1388
|
FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
|
|
1349
1389
|
),
|
|
1390
|
+
configure_super_slicing=(
|
|
1391
|
+
FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing
|
|
1392
|
+
),
|
|
1350
1393
|
)
|
|
1351
1394
|
)
|
|
1352
1395
|
|
xpk/commands/cluster_gcluster.py
CHANGED
|
@@ -173,6 +173,9 @@ def __install_kueue(args) -> int:
|
|
|
173
173
|
configure_sub_slicing=(
|
|
174
174
|
FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
|
|
175
175
|
),
|
|
176
|
+
configure_super_slicing=(
|
|
177
|
+
FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing
|
|
178
|
+
),
|
|
176
179
|
),
|
|
177
180
|
tolerations=tolerations,
|
|
178
181
|
)
|
|
@@ -97,6 +97,7 @@ def test_install_kueue_standard(
|
|
|
97
97
|
accelerator_type=AcceleratorType.GPU,
|
|
98
98
|
device_type="h100-mega-80gb-8",
|
|
99
99
|
supports_sub_slicing=False,
|
|
100
|
+
supports_super_slicing=False,
|
|
100
101
|
docker_platform=DockerPlatform.ARM,
|
|
101
102
|
gpu_config=GpuConfig(requires_topology=True),
|
|
102
103
|
)
|
|
@@ -150,6 +151,7 @@ def test_install_kueue_with_autoprovisioning(
|
|
|
150
151
|
accelerator_type=AcceleratorType.GPU,
|
|
151
152
|
device_type="h100-mega-80gb-8",
|
|
152
153
|
supports_sub_slicing=False,
|
|
154
|
+
supports_super_slicing=False,
|
|
153
155
|
docker_platform=DockerPlatform.ARM,
|
|
154
156
|
gpu_config=GpuConfig(requires_topology=True),
|
|
155
157
|
)
|
xpk/commands/cluster_test.py
CHANGED
|
@@ -110,6 +110,7 @@ def construct_args(**kwargs: Any) -> Namespace:
|
|
|
110
110
|
cluster='test-cluster',
|
|
111
111
|
default_pool_cpu_num_nodes='100',
|
|
112
112
|
sub_slicing=False,
|
|
113
|
+
super_slicing=False,
|
|
113
114
|
gke_version='',
|
|
114
115
|
private=False,
|
|
115
116
|
authorized_networks=None,
|
|
@@ -226,6 +227,9 @@ GPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
|
|
|
226
227
|
SUB_SLICING_SYSTEM: SystemCharacteristics = (
|
|
227
228
|
UserFacingNameToSystemCharacteristics['v6e-4x4']
|
|
228
229
|
)
|
|
230
|
+
SUPER_SLICING_SYSTEM: SystemCharacteristics = (
|
|
231
|
+
UserFacingNameToSystemCharacteristics['tpu7x-4x4x4']
|
|
232
|
+
)
|
|
229
233
|
TPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
|
|
230
234
|
'v6e-4x4'
|
|
231
235
|
]
|
|
@@ -608,3 +612,202 @@ def test_cluster_create_calls_run_command_with_correct_channel_and_version(
|
|
|
608
612
|
]
|
|
609
613
|
|
|
610
614
|
mocks.commands_tester.assert_command_run(*expected_command_parts)
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def test_run_gke_cluster_create_command_with_super_slicing_enables_slice_controller(
|
|
618
|
+
mocks: _Mocks,
|
|
619
|
+
):
|
|
620
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
621
|
+
result = run_gke_cluster_create_command(
|
|
622
|
+
args=construct_args(gke_version='1.2.3', super_slicing=True),
|
|
623
|
+
gke_control_plane_version='1.2.3',
|
|
624
|
+
system=SUPER_SLICING_SYSTEM,
|
|
625
|
+
release_channel=ReleaseChannel.REGULAR,
|
|
626
|
+
)
|
|
627
|
+
|
|
628
|
+
assert result == 0
|
|
629
|
+
mocks.commands_tester.assert_command_run(
|
|
630
|
+
'clusters create', '--enable-slice-controller'
|
|
631
|
+
)
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def test_validate_cluster_create_args_for_correct_super_slicing_args_pass(
|
|
635
|
+
mocks: _Mocks,
|
|
636
|
+
):
|
|
637
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
638
|
+
args = construct_args(
|
|
639
|
+
super_slicing=True,
|
|
640
|
+
reservation='test-reservation/reservationBlocks/block',
|
|
641
|
+
num_cubes=None,
|
|
642
|
+
num_slices=None,
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
|
|
646
|
+
args = construct_args(
|
|
647
|
+
super_slicing=True,
|
|
648
|
+
reservation='test-reservation/reservationBlocks/block/reservationSubBlocks/subblock',
|
|
649
|
+
num_cubes=None,
|
|
650
|
+
num_slices=None,
|
|
651
|
+
)
|
|
652
|
+
_validate_cluster_create_args(
|
|
653
|
+
args, UserFacingNameToSystemCharacteristics['tpu7x-128']
|
|
654
|
+
)
|
|
655
|
+
|
|
656
|
+
assert mocks.common_print_mock.call_count == 0
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
def test_validate_cluster_create_args_for_super_slicing_system_not_supported_throws(
|
|
660
|
+
mocks: _Mocks,
|
|
661
|
+
):
|
|
662
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
663
|
+
args = construct_args(
|
|
664
|
+
super_slicing=True,
|
|
665
|
+
reservation='test-reservation/reservationBlocks/block',
|
|
666
|
+
num_cubes=None,
|
|
667
|
+
num_slices=None,
|
|
668
|
+
)
|
|
669
|
+
|
|
670
|
+
with pytest.raises(SystemExit):
|
|
671
|
+
_validate_cluster_create_args(
|
|
672
|
+
args, UserFacingNameToSystemCharacteristics['tpu7x-4x4x8']
|
|
673
|
+
)
|
|
674
|
+
|
|
675
|
+
assert mocks.common_print_mock.call_count == 1
|
|
676
|
+
assert (
|
|
677
|
+
mocks.common_print_mock.call_args[0][0]
|
|
678
|
+
== 'Error: tpu7x-256 does not support Super-slicing.'
|
|
679
|
+
)
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
def test_validate_cluster_create_args_for_super_slicing_missing_reservation(
|
|
683
|
+
mocks: _Mocks,
|
|
684
|
+
):
|
|
685
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
686
|
+
args = construct_args(
|
|
687
|
+
super_slicing=True,
|
|
688
|
+
reservation=None,
|
|
689
|
+
num_cubes=None,
|
|
690
|
+
num_slices=None,
|
|
691
|
+
)
|
|
692
|
+
|
|
693
|
+
with pytest.raises(SystemExit):
|
|
694
|
+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
|
|
695
|
+
|
|
696
|
+
assert mocks.commands_print_mock.call_count == 1
|
|
697
|
+
assert (
|
|
698
|
+
'Validation failed: Super-slicing cluster creation requires'
|
|
699
|
+
in mocks.commands_print_mock.call_args[0][0]
|
|
700
|
+
)
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
def test_validate_cluster_create_args_for_super_slicing_reservation_no_blocks(
|
|
704
|
+
mocks: _Mocks,
|
|
705
|
+
):
|
|
706
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
707
|
+
args = construct_args(
|
|
708
|
+
super_slicing=True,
|
|
709
|
+
reservation='reservation',
|
|
710
|
+
num_cubes=None,
|
|
711
|
+
num_slices=None,
|
|
712
|
+
)
|
|
713
|
+
|
|
714
|
+
with pytest.raises(SystemExit):
|
|
715
|
+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
|
|
716
|
+
|
|
717
|
+
assert mocks.commands_print_mock.call_count == 1
|
|
718
|
+
assert (
|
|
719
|
+
'requires a block or sub-block reservation'
|
|
720
|
+
in mocks.commands_print_mock.call_args[0][0]
|
|
721
|
+
)
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def test_validate_cluster_create_args_for_super_slicing_sparse_deployment_type_reservation(
|
|
725
|
+
mocks: _Mocks,
|
|
726
|
+
):
|
|
727
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
728
|
+
args = construct_args(
|
|
729
|
+
super_slicing=True,
|
|
730
|
+
reservation='test-reservation/reservationBlocks/block',
|
|
731
|
+
num_cubes=None,
|
|
732
|
+
num_slices=None,
|
|
733
|
+
)
|
|
734
|
+
mocks.commands_get_reservation_deployment_type.return_value = 'SPARSE'
|
|
735
|
+
|
|
736
|
+
with pytest.raises(SystemExit):
|
|
737
|
+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
|
|
738
|
+
|
|
739
|
+
assert mocks.commands_print_mock.call_count == 5
|
|
740
|
+
assert (
|
|
741
|
+
'Refer to the documentation for more information on creating Cluster'
|
|
742
|
+
in mocks.commands_print_mock.call_args[0][0]
|
|
743
|
+
)
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def test_validate_cluster_create_args_forbids_num_cubes_without_superslicing(
|
|
747
|
+
mocks: _Mocks,
|
|
748
|
+
):
|
|
749
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True # enable the feature
|
|
750
|
+
args = construct_args(
|
|
751
|
+
super_slicing=False, # but disable the flag
|
|
752
|
+
reservation='test-reservation/reservationBlocks/block',
|
|
753
|
+
num_cubes=1,
|
|
754
|
+
num_slices=None,
|
|
755
|
+
)
|
|
756
|
+
|
|
757
|
+
with pytest.raises(SystemExit):
|
|
758
|
+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
|
|
759
|
+
|
|
760
|
+
assert mocks.commands_print_mock.call_count == 1
|
|
761
|
+
assert (
|
|
762
|
+
'--num-cubes can only be used with --super-slicing'
|
|
763
|
+
in mocks.commands_print_mock.call_args[0][0]
|
|
764
|
+
)
|
|
765
|
+
|
|
766
|
+
|
|
767
|
+
def test_validate_cluster_create_args_forbids_num_cubes_different_from_num_slices(
|
|
768
|
+
mocks: _Mocks,
|
|
769
|
+
):
|
|
770
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
771
|
+
args = construct_args(
|
|
772
|
+
super_slicing=True,
|
|
773
|
+
reservation='test-reservation/reservationBlocks/block',
|
|
774
|
+
num_cubes=1,
|
|
775
|
+
num_slices=2,
|
|
776
|
+
)
|
|
777
|
+
|
|
778
|
+
with pytest.raises(SystemExit):
|
|
779
|
+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
|
|
780
|
+
|
|
781
|
+
assert mocks.commands_print_mock.call_count == 1
|
|
782
|
+
assert (
|
|
783
|
+
'--num-cubes must not be different from --num-slices'
|
|
784
|
+
in mocks.commands_print_mock.call_args[0][0]
|
|
785
|
+
)
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
@pytest.mark.parametrize(
|
|
789
|
+
'num_cubes, num_slices, expected',
|
|
790
|
+
[
|
|
791
|
+
(None, None, 1),
|
|
792
|
+
(3, None, 3),
|
|
793
|
+
(None, 3, 3),
|
|
794
|
+
(3, 3, 3),
|
|
795
|
+
],
|
|
796
|
+
)
|
|
797
|
+
def test_validate_cluster_create_args_sets_correct_num_slices(
|
|
798
|
+
mocks: _Mocks,
|
|
799
|
+
num_cubes: int | None,
|
|
800
|
+
num_slices: int | None,
|
|
801
|
+
expected: int,
|
|
802
|
+
):
|
|
803
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
804
|
+
args = construct_args(
|
|
805
|
+
super_slicing=True,
|
|
806
|
+
reservation='test-reservation/reservationBlocks/block',
|
|
807
|
+
num_cubes=num_cubes,
|
|
808
|
+
num_slices=num_slices,
|
|
809
|
+
)
|
|
810
|
+
|
|
811
|
+
_validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
|
|
812
|
+
|
|
813
|
+
assert args.num_slices == expected
|
xpk/commands/common.py
CHANGED
|
@@ -73,3 +73,9 @@ def validate_sub_slicing_system(system: SystemCharacteristics):
|
|
|
73
73
|
if not system.supports_sub_slicing:
|
|
74
74
|
xpk_print(f'Error: {system.device_type} does not support Sub-slicing.')
|
|
75
75
|
xpk_exit(1)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def validate_super_slicing_system(system: SystemCharacteristics):
|
|
79
|
+
if not system.supports_super_slicing:
|
|
80
|
+
xpk_print(f'Error: {system.device_type} does not support Super-slicing.')
|
|
81
|
+
xpk_exit(1)
|
xpk/commands/kind.py
CHANGED
|
@@ -98,6 +98,7 @@ def cluster_create(args) -> None:
|
|
|
98
98
|
AcceleratorType.CPU,
|
|
99
99
|
'kind',
|
|
100
100
|
supports_sub_slicing=False,
|
|
101
|
+
supports_super_slicing=False,
|
|
101
102
|
docker_platform=DockerPlatform.ARM,
|
|
102
103
|
)
|
|
103
104
|
|
|
@@ -113,6 +114,7 @@ def cluster_create(args) -> None:
|
|
|
113
114
|
is_pathways_cluster=False,
|
|
114
115
|
flex=False,
|
|
115
116
|
configure_sub_slicing=False,
|
|
117
|
+
configure_super_slicing=False,
|
|
116
118
|
),
|
|
117
119
|
)
|
|
118
120
|
|
xpk/commands/workload.py
CHANGED
|
@@ -132,6 +132,7 @@ spec:
|
|
|
132
132
|
annotations:
|
|
133
133
|
{storage_annotations}
|
|
134
134
|
{sub_slicing_annotations}
|
|
135
|
+
{annotations_machine_label}
|
|
135
136
|
spec:
|
|
136
137
|
schedulerName: {args.scheduler}
|
|
137
138
|
imagePullSecrets:
|
|
@@ -140,7 +141,7 @@ spec:
|
|
|
140
141
|
{affinity}
|
|
141
142
|
nodeSelector:
|
|
142
143
|
{accelerator_label}
|
|
143
|
-
{
|
|
144
|
+
{node_selector_machine_label}
|
|
144
145
|
{placement_policy_label}
|
|
145
146
|
{autoprovisioning_args}
|
|
146
147
|
priorityClassName: {args.priority}
|
|
@@ -498,19 +499,24 @@ def workload_create(args) -> None:
|
|
|
498
499
|
operator: NotIn
|
|
499
500
|
values: [{restart_on_exit_codes}]"""
|
|
500
501
|
|
|
501
|
-
|
|
502
|
+
placement_policy_label = ''
|
|
503
|
+
if (
|
|
504
|
+
# Don't bother with placement for sub/super-slicing workloads:
|
|
505
|
+
workload_scheduling == WorkloadScheduling.AVAILABLE
|
|
506
|
+
and is_placement_policy_supported(workload_system)
|
|
507
|
+
):
|
|
502
508
|
ensure_resource_policy_exists(
|
|
503
|
-
resource_policy_name=get_placement_policy_name(
|
|
509
|
+
resource_policy_name=get_placement_policy_name(
|
|
510
|
+
workload_system, super_slicing=False
|
|
511
|
+
),
|
|
504
512
|
project=args.project,
|
|
505
513
|
zone=args.zone,
|
|
506
514
|
topology=workload_system.topology,
|
|
515
|
+
super_slicing=False,
|
|
516
|
+
)
|
|
517
|
+
placement_policy_label = create_placement_policy_label(
|
|
518
|
+
workload_system, super_slicing=False
|
|
507
519
|
)
|
|
508
|
-
|
|
509
|
-
placement_policy_label = (
|
|
510
|
-
create_placement_policy_label(workload_system)
|
|
511
|
-
if is_placement_policy_supported(workload_system)
|
|
512
|
-
else ''
|
|
513
|
-
)
|
|
514
520
|
|
|
515
521
|
# Create the workload file based on accelerator type or workload type.
|
|
516
522
|
if workload_system.accelerator_type == AcceleratorType.GPU:
|
|
@@ -616,12 +622,26 @@ def workload_create(args) -> None:
|
|
|
616
622
|
use_sub_slicing = (
|
|
617
623
|
workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
|
|
618
624
|
)
|
|
625
|
+
use_super_slicing = (
|
|
626
|
+
workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
|
|
627
|
+
)
|
|
619
628
|
if use_sub_slicing:
|
|
620
629
|
xpk_print('Workload will be scheduled using the Sub-slicing feature.')
|
|
630
|
+
if use_super_slicing:
|
|
631
|
+
xpk_print('Workload will be scheduled using the Super-slicing feature.')
|
|
621
632
|
|
|
622
633
|
container, debugging_dashboard_id = get_user_workload_container(
|
|
623
634
|
args, workload_system
|
|
624
635
|
)
|
|
636
|
+
|
|
637
|
+
machine_label = (
|
|
638
|
+
create_machine_label(cluster_system)
|
|
639
|
+
if use_sub_slicing and cluster_system
|
|
640
|
+
else create_machine_label(workload_system)
|
|
641
|
+
)
|
|
642
|
+
node_selector_machine_label = machine_label if not use_super_slicing else ''
|
|
643
|
+
annotations_machine_label = machine_label if use_super_slicing else ''
|
|
644
|
+
|
|
625
645
|
yml_string = WORKLOAD_CREATE_YAML.format(
|
|
626
646
|
args=args,
|
|
627
647
|
container=container,
|
|
@@ -636,11 +656,8 @@ def workload_create(args) -> None:
|
|
|
636
656
|
else ''
|
|
637
657
|
),
|
|
638
658
|
placement_policy_label=placement_policy_label,
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
if use_sub_slicing and cluster_system
|
|
642
|
-
else create_machine_label(workload_system)
|
|
643
|
-
),
|
|
659
|
+
node_selector_machine_label=node_selector_machine_label,
|
|
660
|
+
annotations_machine_label=annotations_machine_label,
|
|
644
661
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
645
662
|
autoprovisioning_args=autoprovisioning_args,
|
|
646
663
|
volumes=get_volumes(args, workload_system),
|
|
@@ -822,12 +839,15 @@ def workload_delete(args) -> None:
|
|
|
822
839
|
if len(workloads) == 1:
|
|
823
840
|
return_code = run_command_with_updates(commands[0], 'Delete Workload')
|
|
824
841
|
else:
|
|
825
|
-
|
|
842
|
+
maybe_failure = run_commands(
|
|
826
843
|
commands,
|
|
827
844
|
'Delete Workload',
|
|
828
845
|
task_names,
|
|
829
846
|
batch=100,
|
|
830
847
|
)
|
|
848
|
+
return_code = (
|
|
849
|
+
maybe_failure.return_code if maybe_failure is not None else 0
|
|
850
|
+
)
|
|
831
851
|
|
|
832
852
|
if return_code != 0:
|
|
833
853
|
xpk_print(f'Delete Workload request returned ERROR {return_code}')
|
xpk/commands/workload_test.py
CHANGED