xpk 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +29 -30
- xpk/commands/cluster_gcluster.py +19 -14
- xpk/commands/cluster_test.py +1 -21
- xpk/commands/common.py +39 -6
- xpk/commands/common_test.py +170 -0
- xpk/commands/info.py +9 -5
- xpk/commands/inspector.py +33 -4
- xpk/commands/inspector_test.py +142 -0
- xpk/commands/workload.py +22 -8
- xpk/commands/workload_test.py +70 -3
- xpk/core/blueprint/blueprint_generator.py +19 -8
- xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
- xpk/core/blueprint/testing/data/a4.yaml +3 -1
- xpk/core/capacity.py +37 -17
- xpk/core/capacity_test.py +66 -1
- xpk/core/cluster.py +10 -10
- xpk/core/cluster_private.py +3 -3
- xpk/core/cluster_test.py +29 -2
- xpk/core/docker_container.py +31 -24
- xpk/core/docker_manager.py +4 -4
- xpk/core/docker_resources.py +4 -1
- xpk/core/kueue_manager.py +6 -8
- xpk/core/kueue_manager_test.py +4 -5
- xpk/core/nap.py +14 -3
- xpk/core/nodepool.py +46 -13
- xpk/core/nodepool_test.py +143 -8
- xpk/core/remote_state/fuse_remote_state.py +1 -1
- xpk/core/scheduling.py +4 -1
- xpk/core/scheduling_test.py +1 -1
- xpk/core/system_characteristics.py +6 -0
- xpk/core/telemetry.py +11 -1
- xpk/core/telemetry_test.py +39 -0
- xpk/core/testing/commands_tester.py +26 -0
- xpk/core/testing/commands_tester_test.py +20 -1
- xpk/core/workload_decorators/rdma_decorator.py +9 -0
- xpk/parser/cluster.py +11 -1
- xpk/parser/cluster_test.py +59 -1
- xpk/parser/common.py +11 -0
- xpk/parser/storage.py +3 -3
- xpk/utils/console.py +1 -1
- xpk/utils/feature_flags.py +7 -3
- {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/METADATA +37 -21
- {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/RECORD +47 -54
- xpk-1.1.0.dist-info/top_level.txt +1 -0
- integration/README.md +0 -19
- integration/__init__.py +0 -15
- integration/docker_manager_test.py +0 -102
- integration/gcluster_a3mega_test.py +0 -215
- integration/gcluster_a3ultra_test.py +0 -187
- integration/gcluster_a4_test.py +0 -187
- integration/gcluster_test.py +0 -107
- xpk/utils/user_input.py +0 -48
- xpk/utils/user_input_test.py +0 -92
- xpk-1.0.0.dist-info/top_level.txt +0 -2
- {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/WHEEL +0 -0
- {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/entry_points.txt +0 -0
- {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/licenses/LICENSE +0 -0
xpk/commands/cluster.py
CHANGED
@@ -19,7 +19,6 @@ from tabulate import tabulate
 from ..utils.feature_flags import FeatureFlags
 from ..utils.versions import ReleaseChannel
 from ..core.pathways import get_pathways_machine_types
-from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type, parse_reservation
 from ..core.cluster import (
     get_all_clusters_programmatic,
     get_cluster_credentials,
@@ -40,7 +39,12 @@ from ..core.cluster_private import authorize_private_cluster_access_if_necessary
 from ..core.commands import run_command_for_value, run_command_with_updates
 from ..core.config import VERTEX_TENSORBOARD_FEATURE_FLAG
 from ..core.telemetry import MetricsCollector, MetricsEventMetadataKey
-from ..core.capacity import
+from ..core.capacity import (
+    H100_DEVICE_TYPE,
+    get_capacity_type,
+    get_reservations_list,
+    get_reservation_deployment_type,
+)
 from ..core.gcloud_context import (
     add_zone_and_project,
     get_gke_control_plane_version,
@@ -240,13 +244,6 @@ def _validate_sub_slicing_reservation(args):
 
 def _validate_super_slicing_reservation(args):
   _validate_gsc_reservation(args, 'Super-slicing')
-  reservation = parse_reservation(args.reservation, args.project)
-  if reservation.block_name is None:
-    xpk_print(
-        'Error: Validation failed: Super-slicing cluster creation'
-        ' requires a block or sub-block reservation.'
-    )
-    xpk_exit(1)
 
 
 def _validate_gsc_reservation(args, creation_description: str):
@@ -257,27 +254,29 @@ def _validate_gsc_reservation(args, creation_description: str):
     )
     xpk_exit(1)
 
-
-
-
-  if deployment_type != 'DENSE':
-    xpk_print(
-        'Error: Validation failed: The specified reservation'
-        f' "{args.reservation}" is not a Cluster Director reservation.'
-    )
-    xpk_print(
-        'Please provide a reservation created for Cluster Director to proceed.'
-    )
-    xpk_print('To list valid Cluster Director reservations, run:')
-    xpk_print(
-        ' gcloud compute reservations list --filter="deploymentType=DENSE"'
-    )
-    xpk_print(
-        'Refer to the documentation for more information on creating Cluster'
-        ' Director reservations:'
-        ' https://cloud.google.com/cluster-director/docs/reserve-capacity'
+  for reservation in get_reservations_list(args):
+    deployment_type = get_reservation_deployment_type(
+        reservation_path=reservation, project=args.project, zone=args.zone
     )
-
+    if deployment_type != 'DENSE':
+      xpk_print(
+          'Error: Validation failed: The specified reservation'
+          f' "{reservation}" is not a Cluster Director reservation.'
+      )
+      xpk_print(
+          'Please provide a reservation created for Cluster Director to'
+          ' proceed.'
+      )
+      xpk_print('To list valid Cluster Director reservations, run:')
+      xpk_print(
+          ' gcloud compute reservations list --filter="deploymentType=DENSE"'
+      )
+      xpk_print(
+          'Refer to the documentation for more information on creating Cluster'
+          ' Director reservations:'
+          ' https://cloud.google.com/cluster-director/docs/reserve-capacity'
+      )
+      xpk_exit(1)
 
 
 def _validate_num_slices_and_set_default(args):
@@ -372,7 +371,7 @@ def cluster_create(args) -> None:
 
   update_coredns_command_code = update_coredns_if_necessary()
   if update_coredns_command_code != 0:
-    xpk_exit(
+    xpk_exit(update_coredns_command_code)
 
   if not is_dry_run():
     k8s_client = setup_k8s_env(args)
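Note: the _validate_gsc_reservation change above replaces the single args.reservation check with a loop over get_reservations_list(args), rejecting any reservation whose deployment type is not DENSE. Below is a minimal standalone sketch of that per-reservation check; the helper name, the injected lookup callable, and the reservation strings are illustrative stand-ins, not xpk's implementation.

# Illustrative sketch only: mirrors the validation loop in _validate_gsc_reservation above.
# `deployment_type_of` stands in for xpk's get_reservation_deployment_type call.
from typing import Callable, Sequence


def find_non_dense_reservations(
    reservations: Sequence[str],
    deployment_type_of: Callable[[str], str],
) -> list[str]:
  """Returns the reservations that are not Cluster Director (DENSE) reservations."""
  return [r for r in reservations if deployment_type_of(r) != 'DENSE']


if __name__ == '__main__':
  fake_types = {
      'projects/p/reservations/res-a': 'DENSE',
      'projects/p/reservations/res-b': 'SPARSE',
  }
  offending = find_non_dense_reservations(
      list(fake_types), fake_types.__getitem__
  )
  print(offending)  # ['projects/p/reservations/res-b'] -> xpk prints an error and exits

Passing the lookup as a callable keeps the sketch runnable without gcloud; in xpk the lookup is the get_reservation_deployment_type call shown in the diff.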
xpk/commands/cluster_gcluster.py
CHANGED
@@ -32,7 +32,7 @@ from ..core.blueprint.blueprint_generator import (
     a4_device_type,
     supported_device_types,
 )
-from ..core.capacity import get_capacity_type
+from ..core.capacity import get_capacity_type, get_reservations_list
 from ..core.cluster import get_cluster_credentials
 from ..core.commands import run_command_for_value
 from ..core.docker_manager import DockerManager
@@ -304,22 +304,29 @@ def generate_blueprint(
   if args.cluster_state_gcs_bucket is not None:
     validate_state_gcs_bucket(args)
 
+  num_nodes = 2 if args.num_nodes is None else args.num_nodes
+
+  reservations = get_reservations_list(args)
+  if len(reservations) > 1:
+    xpk_print(
+        'Error: Cluster Toolkit based clusters only support a single'
+        ' reservation.'
+    )
+    xpk_exit(1)
+  reservation = reservations[0] if len(reservations) > 0 else None
+
   if args.device_type in supported_device_types:
     if args.device_type == a3mega_device_type:
-      num_nodes = args.num_nodes if not args.num_nodes is None else 2
-
       maintenance_interval = (
           get_reservation_maintenance_interval(
-
+              reservation, args.zone, args.project
           )
-          if
+          if reservation is not None
           else 'PERIODIC'
       )
       placement_policy_name = (
-          get_reservation_placement_policy(
-
-          )
-          if args.reservation is not None
+          get_reservation_placement_policy(reservation, args.zone, args.project)
+          if reservation is not None
           else None
       )
       placement_policy = (
@@ -342,7 +349,7 @@ def generate_blueprint(
           num_nodes=num_nodes,
           reservation_maintenance_interval=maintenance_interval,
           reservation_placement_policy=placement_policy,
-          reservation=
+          reservation=reservation,
           capacity_type=capacity_type,
           system_node_pool_machine_type=args.default_pool_cpu_machine_type,
           system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
@@ -351,7 +358,6 @@ def generate_blueprint(
           release_channel=release_channel,
       )
     if args.device_type == a3ultra_device_type:
-      num_nodes = args.num_nodes if not args.num_nodes is None else 2
       return bpg.generate_a3_ultra_blueprint(
           blueprint_name=blueprint_name,
           prefix=prefix,
@@ -361,7 +367,7 @@ def generate_blueprint(
           zone=args.zone,
           auth_cidr=all_IPs_cidr,
           num_nodes=num_nodes,
-          reservation=
+          reservation=reservation,
           enable_filestore_csi_driver=args.enable_gcpfilestore_csi_driver,
           capacity_type=capacity_type,
           system_node_pool_machine_type=args.default_pool_cpu_machine_type,
@@ -371,7 +377,6 @@ def generate_blueprint(
           release_channel=release_channel,
       )
     if args.device_type == a4_device_type:
-      num_nodes = args.num_nodes if not args.num_nodes is None else 2
       return bpg.generate_a4_blueprint(
           blueprint_name=blueprint_name,
           prefix=prefix,
@@ -381,7 +386,7 @@ def generate_blueprint(
           zone=args.zone,
           auth_cidr=all_IPs_cidr,
           num_nodes=num_nodes,
-          reservation=
+          reservation=reservation,
           capacity_type=capacity_type,
           system_node_pool_machine_type=args.default_pool_cpu_machine_type,
           system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
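For the Cluster Toolkit (gcluster) path above, the new logic resolves reservations once before blueprint generation: more than one reservation is an error, zero reservations fall back to None, and num_nodes defaults to 2. A hedged sketch of those two rules follows, with hypothetical helper names and a plain exception in place of xpk's xpk_print/xpk_exit.

# Illustrative sketch of the single-reservation rule and the num_nodes default
# introduced in generate_blueprint above; not xpk's actual code.
from typing import Optional, Sequence


def pick_single_reservation(reservations: Sequence[str]) -> Optional[str]:
  """Cluster Toolkit based clusters only support a single reservation."""
  if len(reservations) > 1:
    raise ValueError(
        'Cluster Toolkit based clusters only support a single reservation.'
    )
  return reservations[0] if reservations else None


def default_num_nodes(num_nodes: Optional[int]) -> int:
  """num_nodes falls back to 2 when the flag is not set."""
  return 2 if num_nodes is None else num_nodes


if __name__ == '__main__':
  print(pick_single_reservation(['res-a']))  # res-a
  print(pick_single_reservation([]))         # None
  print(default_num_nodes(None))             # 2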
xpk/commands/cluster_test.py
CHANGED
@@ -146,6 +146,7 @@ def construct_args(**kwargs: Any) -> Namespace:
       docker_image_pull_secret='',
       managed_mldiagnostics=False,
       output_manifest_file='',
+      num_cubes=None,
   )
   args_dict.update(kwargs)
   return Namespace(**args_dict)
@@ -696,27 +697,6 @@ def test_validate_cluster_create_args_for_super_slicing_missing_reservation(
   )
 
 
-def test_validate_cluster_create_args_for_super_slicing_reservation_no_blocks(
-    mocks: _Mocks,
-):
-  FeatureFlags.SUPER_SLICING_ENABLED = True
-  args = construct_args(
-      super_slicing=True,
-      reservation='reservation',
-      num_cubes=None,
-      num_slices=None,
-  )
-
-  with pytest.raises(SystemExit):
-    _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
-
-  assert mocks.commands_print_mock.call_count == 1
-  assert (
-      'requires a block or sub-block reservation'
-      in mocks.commands_print_mock.call_args[0][0]
-  )
-
-
 def test_validate_cluster_create_args_for_super_slicing_sparse_deployment_type_reservation(
     mocks: _Mocks,
 ):
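The construct_args helper these tests use builds an argparse Namespace from a dict of defaults that individual tests override through keyword arguments; the first hunk above simply adds num_cubes=None to those defaults. A condensed sketch of the pattern, showing only a few of the real defaults:

# Condensed sketch of the construct_args(**kwargs) test helper pattern; only a
# handful of the actual defaults from cluster_test.py are reproduced here.
from argparse import Namespace
from typing import Any


def construct_args(**kwargs: Any) -> Namespace:
  args_dict: dict[str, Any] = dict(
      docker_image_pull_secret='',
      managed_mldiagnostics=False,
      output_manifest_file='',
      num_cubes=None,  # default added in 1.1.0
  )
  args_dict.update(kwargs)
  return Namespace(**args_dict)


print(construct_args(num_cubes=4).num_cubes)  # 4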
xpk/commands/common.py
CHANGED
@@ -14,8 +14,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-from ..core.commands import run_command_with_updates_retry
-from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
+from ..core.commands import run_command_with_updates_retry, run_command_for_value
+from ..core.capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, CapacityType
 from ..core.gcloud_context import get_cluster_location
 from ..utils.console import xpk_print, xpk_exit
 from ..utils.execution_context import is_dry_run
@@ -46,9 +46,12 @@ def set_cluster_command(args) -> int:
   return return_code
 
 
-def is_TAS_possible(
+def is_GPU_TAS_possible(
     system_characteristics: SystemCharacteristics | None,
     capacity_type: CapacityType | None,
+    cluster_name: str,
+    zone: str,
+    project: str,
 ) -> bool:
   """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible"""
 
@@ -63,10 +66,40 @@ def is_TAS_possible(
     xpk_print('capacity_type data was not found in configmaps.')
     xpk_exit(1)
 
-
-
-
+  # For A3-High and A3-Mega TAS is supported only for Flex and Reservation
+  if (
+      system_characteristics.device_type == H100_DEVICE_TYPE
+      or system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
+  ) and (
+      capacity_type != CapacityType.FLEX_START
+      and capacity_type != CapacityType.RESERVATION
+  ):
+    return False
+
+  # COMPACT placement and Flex don't work together, Flex is enough to support TAS for A3-High or newer
+  if capacity_type == CapacityType.FLEX_START:
+    return True
+
+  # For A3-Ultra or newer, all capacity types support TAS as long as COMPACT placement is used
+  command = (
+      'gcloud container node-pools list'
+      f' --cluster {cluster_name}'
+      f' --location={zone}'
+      f' --project={project}'
+      ' --filter="placementPolicy.type=COMPACT"'
+      ' --format="value(name)"'
   )
+  return_code, compact_placement_nps = run_command_for_value(
+      command=command,
+      task=(
+          'Check if there is a COMPACT placement policy nodepool in this'
+          ' cluster'
+      ),
+  )
+  if return_code != 0:
+    xpk_print('Node pool retrieval failed, assuming TAS is not possible')
+    return False
+  return compact_placement_nps.splitlines() != []
 
 
 def validate_sub_slicing_system(system: SystemCharacteristics):
xpk/commands/common_test.py
ADDED
@@ -0,0 +1,170 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from unittest.mock import MagicMock, patch
+
+from xpk.commands.common import is_GPU_TAS_possible
+from xpk.core.capacity import (
+    H100_DEVICE_TYPE,
+    H100_MEGA_DEVICE_TYPE,
+    CapacityType,
+)
+from xpk.core.system_characteristics import SystemCharacteristics
+
+
+class CommonCommandsTest(unittest.TestCase):
+
+  @patch("xpk.commands.common.run_command_for_value")
+  @patch("xpk.commands.common.xpk_exit")
+  @patch("xpk.commands.common.xpk_print")
+  @patch("xpk.commands.common.is_dry_run")
+  def test_is_GPU_TAS_possible_dry_run(
+      self, mock_is_dry_run, mock_xpk_print, mock_xpk_exit, mock_run_command
+  ):
+    """Test is_GPU_TAS_possible returns True in dry_run mode."""
+    mock_is_dry_run.return_value = True
+    self.assertTrue(
+        is_GPU_TAS_possible(None, None, "cluster", "zone", "project")
+    )
+    mock_is_dry_run.assert_called_once()
+    mock_xpk_print.assert_not_called()
+    mock_xpk_exit.assert_not_called()
+    mock_run_command.assert_not_called()
+
+  @patch("xpk.commands.common.is_dry_run", return_value=False)
+  @patch("xpk.commands.common.xpk_exit")
+  @patch("xpk.commands.common.xpk_print")
+  def test_is_GPU_TAS_possible_no_system_characteristics(
+      self, mock_xpk_print, mock_xpk_exit, mock_is_dry_run
+  ):
+    """Test is_GPU_TAS_possible exits if system_characteristics is None."""
+    mock_xpk_exit.side_effect = SystemExit(1)
+    with self.assertRaises(SystemExit):
+      is_GPU_TAS_possible(None, MagicMock(), "cluster", "zone", "project")
+    mock_xpk_print.assert_called_with(
+        "system_characteristics data was not found in configmaps."
+    )
+    mock_xpk_exit.assert_called_with(1)
+
+  @patch("xpk.commands.common.is_dry_run", return_value=False)
+  @patch("xpk.commands.common.xpk_exit")
+  @patch("xpk.commands.common.xpk_print")
+  def test_is_GPU_TAS_possible_no_capacity_type(
+      self, mock_xpk_print, mock_xpk_exit, mock_is_dry_run
+  ):
+    """Test is_GPU_TAS_possible exits if capacity_type is None."""
+    mock_xpk_exit.side_effect = SystemExit(1)
+    with self.assertRaises(SystemExit):
+      is_GPU_TAS_possible(MagicMock(), None, "cluster", "zone", "project")
+    mock_xpk_print.assert_called_with(
+        "capacity_type data was not found in configmaps."
+    )
+    mock_xpk_exit.assert_called_with(1)
+
+  @patch("xpk.commands.common.is_dry_run", return_value=False)
+  def test_is_GPU_TAS_possible_h100_unsupported_capacity(self, mock_is_dry_run):
+    """Test is_GPU_TAS_possible for H100 with unsupported capacity type."""
+    mock_system = MagicMock(spec=SystemCharacteristics)
+    mock_system.device_type = H100_DEVICE_TYPE
+    self.assertFalse(
+        is_GPU_TAS_possible(
+            mock_system, CapacityType.ON_DEMAND, "cluster", "zone", "project"
+        )
+    )
+
+    mock_system.device_type = H100_MEGA_DEVICE_TYPE
+    self.assertFalse(
+        is_GPU_TAS_possible(
+            mock_system, CapacityType.ON_DEMAND, "cluster", "zone", "project"
+        )
+    )
+
+  @patch("xpk.commands.common.is_dry_run", return_value=False)
+  def test_is_GPU_TAS_possible_flex_start_capacity(self, mock_is_dry_run):
+    """Test is_GPU_TAS_possible returns True for FLEX_START capacity."""
+    mock_system = MagicMock(spec=SystemCharacteristics)
+    mock_system.device_type = "some-device"
+    self.assertTrue(
+        is_GPU_TAS_possible(
+            mock_system, CapacityType.FLEX_START, "cluster", "zone", "project"
+        )
+    )
+
+  @patch("xpk.commands.common.run_command_for_value")
+  @patch("xpk.commands.common.is_dry_run", return_value=False)
+  def test_is_GPU_TAS_possible_compact_placement_exists(
+      self, mock_is_dry_run, mock_run_command
+  ):
+    """Test is_GPU_TAS_possible with COMPACT placement returns True."""
+    mock_system = MagicMock(spec=SystemCharacteristics)
+    mock_system.device_type = "a3-ultra"
+    mock_run_command.return_value = (0, "some-nodepool\nsome-other-nodepool\n")
+    self.assertTrue(
+        is_GPU_TAS_possible(
+            mock_system,
+            CapacityType.RESERVATION,
+            "cluster",
+            "zone",
+            "project",
+        )
+    )
+
+  @patch("xpk.commands.common.run_command_for_value")
+  @patch("xpk.commands.common.is_dry_run", return_value=False)
+  def test_is_GPU_TAS_possible_no_compact_placement(
+      self, mock_is_dry_run, mock_run_command
+  ):
+    """Test is_GPU_TAS_possible without COMPACT placement returns False."""
+    mock_system = MagicMock(spec=SystemCharacteristics)
+    mock_system.device_type = "a3-ultra"
+    mock_run_command.return_value = (0, "")
+    self.assertFalse(
+        is_GPU_TAS_possible(
+            mock_system,
+            CapacityType.RESERVATION,
+            "cluster",
+            "zone",
+            "project",
+        )
+    )
+
+  @patch("xpk.commands.common.xpk_print")
+  @patch("xpk.commands.common.run_command_for_value")
+  @patch("xpk.commands.common.is_dry_run", return_value=False)
+  def test_is_GPU_TAS_possible_command_fails(
+      self, mock_is_dry_run, mock_run_command, mock_xpk_print
+  ):
+    """Test is_GPU_TAS_possible when gcloud command fails."""
+    mock_system = MagicMock(spec=SystemCharacteristics)
+    mock_system.device_type = "a3-ultra"
+    mock_run_command.return_value = (1, "Error")
+    self.assertFalse(
+        is_GPU_TAS_possible(
+            mock_system,
+            CapacityType.RESERVATION,
+            "cluster",
+            "zone",
+            "project",
+        )
+    )
+    mock_xpk_print.assert_called_with(
+        "Node pool retrieval failed, assuming TAS is not possible"
+    )
+
+
+if __name__ == "__main__":
+  unittest.main()
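Read together, is_GPU_TAS_possible and its new tests describe a three-step decision for topology-aware scheduling (TAS): A3-High and A3-Mega only qualify with Flex Start or reservation capacity, Flex Start alone is sufficient, and every other case needs at least one node pool with a COMPACT placement policy. The sketch below mirrors that decision in isolation; the device-type strings, enum values, and the injected node-pool check are assumptions for illustration, not xpk's actual constants or gcloud call.

# Standalone sketch of the TAS decision implemented by is_GPU_TAS_possible above.
# The gcloud node-pool lookup is replaced by an injected callable so the logic
# can be read and run in isolation.
from enum import Enum
from typing import Callable


class CapacityType(Enum):
  # Values are illustrative; xpk defines its own CapacityType in xpk.core.capacity.
  ON_DEMAND = 'on_demand'
  RESERVATION = 'reservation'
  FLEX_START = 'flex_start'


# Illustrative stand-ins for H100_DEVICE_TYPE / H100_MEGA_DEVICE_TYPE.
A3_DEVICE_TYPES = {'h100-80gb-8', 'h100-mega-80gb-8'}


def gpu_tas_possible(
    device_type: str,
    capacity_type: CapacityType,
    has_compact_placement_nodepool: Callable[[], bool],
) -> bool:
  # A3-High / A3-Mega: TAS only with Flex Start or a reservation.
  if device_type in A3_DEVICE_TYPES and capacity_type not in (
      CapacityType.FLEX_START,
      CapacityType.RESERVATION,
  ):
    return False
  # Flex Start never uses COMPACT placement, but is sufficient on its own.
  if capacity_type == CapacityType.FLEX_START:
    return True
  # Otherwise TAS needs at least one node pool with a COMPACT placement policy.
  return has_compact_placement_nodepool()


print(gpu_tas_possible('h100-mega-80gb-8', CapacityType.ON_DEMAND, lambda: True))  # False
print(gpu_tas_possible('h100-80gb-8', CapacityType.RESERVATION, lambda: True))     # True
print(gpu_tas_possible('a3-ultragpu-8g', CapacityType.ON_DEMAND, lambda: False))   # False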
xpk/commands/info.py
CHANGED
@@ -76,7 +76,7 @@ def get_nominal_quotas(cqs: str) -> dict[str, dict[str, str]]:
   try:
     cq_list = json.loads(cqs)['items']
   except ValueError:
-    xpk_print('Incorrect
+    xpk_print('Incorrect response from list clusterqueue')
     xpk_print(cqs)
     xpk_exit(1)
 
@@ -98,7 +98,7 @@ def print_formatted_cqs(cqs: str, nominalQuotas) -> None:
   try:
     cq_list = json.loads(cqs)['items']
   except ValueError:
-    xpk_print('Incorrect
+    xpk_print('Incorrect response from list clusterqueue')
     xpk_print(cqs)
     xpk_exit(1)
 
@@ -114,7 +114,7 @@ def print_formatted_lqs(lqs: str, nominalQuotas) -> None:
   try:
     lq_list = json.loads(lqs)['items']
   except ValueError:
-    xpk_print('Incorrect
+    xpk_print('Incorrect response from list localqueue')
     xpk_print(lqs)
     xpk_exit(1)
 
@@ -219,7 +219,9 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
   command = 'kubectl kueue list localqueue -o json'
   if args.namespace != '':
     command += f' --namespace {args.namespace}'
-  return_code, val = run_command_for_value(
+  return_code, val = run_command_for_value(
+      command, 'list localqueue', hide_error=True
+  )
 
   if return_code != 0:
     xpk_print(f'Cluster info request returned ERROR {return_code}')
@@ -235,7 +237,9 @@ def run_kueuectl_list_clusterqueue() -> str:
   """
   command = 'kubectl kueue list clusterqueue -o json'
 
-  return_code, val = run_command_for_value(
+  return_code, val = run_command_for_value(
+      command, 'list clusterqueue', hide_error=True
+  )
 
   if return_code != 0:
     xpk_print(f'Cluster info request returned ERROR {return_code}')
xpk/commands/inspector.py
CHANGED
@@ -23,7 +23,7 @@ from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import append_tmp_file, write_tmp_file
 from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from .workload import get_workload_list
-from ..core.kueue_manager import has_sub_slicing_enabled
+from ..core.kueue_manager import has_sub_slicing_enabled, has_super_slicing_enabled
 
 
 _SPACER = '========================================================'
@@ -89,12 +89,40 @@ def inspector_run_sub_slicing_helper(args, file: str):
   if return_code != 0:
     xpk_exit(return_code)
   if result:
-    output = f'Sub-slicing topology set up.\n{_SPACER}'
+    output = f'Sub-slicing topology set up.\n{_SPACER}\n'
     append_tmp_file(output, file)
     if args.print_to_terminal:
       xpk_print(output)
 
 
+def inspector_run_slice_controller_helper(args, file: str):
+  return_code, result = has_super_slicing_enabled()
+  if return_code != 0:
+    xpk_exit(return_code)
+
+  if not result:
+    return
+
+  output = f'Super-slicing topology set up.\n{_SPACER}\n'
+  append_tmp_file(output, file)
+  if args.print_to_terminal:
+    xpk_print(output)
+
+  command = (
+      'kubectl describe deployment slice-controller-controller-manager -n'
+      ' slice-controller-system'
+  )
+  command_description = 'Slice Controller Deployment Details'
+  inspector_run_command_helper(args, command, command_description, file)
+
+  command = (
+      'kubectl logs deployment slice-controller-controller-manager -n'
+      ' slice-controller-system --tail=100 --prefix=True'
+  )
+  command_description = 'Slice Controller Logs'
+  inspector_run_command_helper(args, command, command_description, file)
+
+
 def inspector_output_link_helper(args, link, link_description, file) -> int:
   """Outputs a link for xpk inspector to the output file.
 
@@ -257,6 +285,9 @@ def inspector(args) -> None:
         f' {description} return code: {return_code}'
     )
 
+  inspector_run_sub_slicing_helper(args, inspector_file)
+  inspector_run_slice_controller_helper(args, inspector_file)
+
   # Workload list views:
   filter_by_statuses = ['EVERYTHING', 'QUEUED', 'RUNNING']
   for filter_by_status in filter_by_statuses:
@@ -321,8 +352,6 @@ def inspector(args) -> None:
         f' {command_description} return code: {return_code}'
    )
 
-  inspector_run_sub_slicing_helper(args, inspector_file)
-
   # Cloud Console Links:
   workload_links = []
   if args.workload: