xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/__init__.py +15 -0
- integration/docker_manager_test.py +102 -0
- integration/gcluster_a3mega_test.py +204 -0
- integration/gcluster_a3ultra_test.py +176 -0
- integration/gcluster_a4_test.py +176 -0
- integration/gcluster_test.py +107 -0
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +143 -117
- xpk/commands/cluster_gcluster.py +81 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/cluster_test.py +92 -0
- xpk/commands/common.py +14 -26
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +39 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +84 -29
- xpk/commands/workload_test.py +81 -0
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/blueprint/testing/__init__.py +15 -0
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +423 -0
- xpk/core/kueue_manager_test.py +574 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +350 -232
- xpk/core/system_characteristics_test.py +73 -0
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/templates/cluster_preheat.yaml.j2 +31 -0
- xpk/templates/filestore-pv.yaml +17 -0
- xpk/templates/filestore-pvc.yaml +11 -0
- xpk/templates/filestore-sc.yaml +10 -0
- xpk/templates/fuse-pv.yaml +17 -0
- xpk/templates/fuse-pvc.yaml +13 -0
- xpk/templates/kueue_config.yaml.j2 +95 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
- xpk/templates/mtc-cpc.yaml +15 -0
- xpk/templates/volume_bundle.yaml +7 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +15 -0
- xpk/utils/topology.py +46 -0
- xpk/utils/topology_test.py +63 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
- xpk-0.14.1.dist-info/RECORD +133 -0
- xpk-0.14.1.dist-info/top_level.txt +2 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- xpk-0.13.0.dist-info/top_level.txt +0 -1
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
xpk/commands/cluster_gcluster.py
CHANGED
|
@@ -16,6 +16,13 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import os
|
|
18
18
|
|
|
19
|
+
from ..utils.feature_flags import FeatureFlags
|
|
20
|
+
from ..utils.execution_context import is_dry_run
|
|
21
|
+
from ..core.kueue_manager import KueueConfig, KueueManager
|
|
22
|
+
from ..core.nap import enable_autoprovisioning_on_cluster
|
|
23
|
+
from ..core.scheduling import get_total_chips_requested_from_args
|
|
24
|
+
from ..core.system_characteristics import get_system_characteristics
|
|
25
|
+
|
|
19
26
|
from ..core.blueprint.blueprint_generator import (
|
|
20
27
|
BlueprintGenerator,
|
|
21
28
|
BlueprintGeneratorOutput,
|
|
@@ -75,22 +82,29 @@ def cluster_create(args) -> None:
|
|
|
75
82
|
bp = generate_blueprint(blueprint_name=unique_name, args=args, prefix=prefix)
|
|
76
83
|
|
|
77
84
|
# staging: sending the blueprint file(s) to gcluster's working directory
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
85
|
+
if is_dry_run():
|
|
86
|
+
xpk_print(f'Blueprint file: {bp.blueprint_file}')
|
|
87
|
+
else:
|
|
88
|
+
bp_staged_path = gcm.stage_files(
|
|
89
|
+
blueprint_file=bp.blueprint_file,
|
|
90
|
+
blueprint_dependencies=bp.blueprint_dependencies,
|
|
91
|
+
prefix=prefix,
|
|
92
|
+
)
|
|
93
|
+
gcm.deploy(
|
|
94
|
+
blueprint_path=bp_staged_path,
|
|
95
|
+
deployment_name=unique_name,
|
|
96
|
+
prefix=prefix,
|
|
97
|
+
)
|
|
98
|
+
if args.cluster_state_gcs_bucket is not None:
|
|
99
|
+
gcm.upload_state()
|
|
90
100
|
|
|
91
101
|
get_cluster_credentials(args)
|
|
92
102
|
|
|
93
|
-
err_code =
|
|
103
|
+
err_code = __install_kueue(args)
|
|
104
|
+
if err_code > 0:
|
|
105
|
+
xpk_exit(err_code)
|
|
106
|
+
|
|
107
|
+
err_code = apply_kjob_crds()
|
|
94
108
|
if err_code > 0:
|
|
95
109
|
xpk_exit(err_code)
|
|
96
110
|
|
|
@@ -101,6 +115,60 @@ def cluster_create(args) -> None:
|
|
|
101
115
|
xpk_exit(0)
|
|
102
116
|
|
|
103
117
|
|
|
118
|
+
def __install_kueue(args) -> int:
  """Installs or upgrades Kueue on a cluster created through gcluster.

  Looks up the system characteristics for the requested device, optionally
  enables autoprovisioning, derives the total chip count from whichever of the
  two applies, and passes the resulting KueueConfig to KueueManager.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and a positive error code otherwise.
  """
  system, return_code = get_system_characteristics(args)

  if return_code > 0 or system is None:
    xpk_print('Fetching system characteristics failed!')
    # The caller only aborts on a positive code, so a missing system must never
    # be reported as 0, even if the lookup itself claimed success.
    return return_code if return_code > 0 else 1

  # Provision node pools dynamically based on incoming workloads:
  # Currently autoprovisioning is not supported with Pathways.
  autoprovisioning_config = None
  if args.enable_autoprovisioning:
    xpk_print('Enabling Autoprovisioning')
    autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
        args, system
    )
    if return_code != 0:
      return return_code

  autoprovisioning_enabled = False
  if autoprovisioning_config:
    # Determine total resources available based on autoprovisioning max chips.
    autoprovisioning_enabled = True
    total_chips = autoprovisioning_config.maximum_chips
  else:
    # Determine total chips based on user specified topology.
    total_chips = get_total_chips_requested_from_args(args, system)
  kueue_manager = KueueManager()

  # Toleration handed to the Kueue installation for the
  # 'components.gke.io/gke-managed-components' NoSchedule taint.
  tolerations = [{
      'key': 'components.gke.io/gke-managed-components',
      'operator': 'Equal',
      'value': 'true',
      'effect': 'NoSchedule',
  }]

  # NOTE(review): the result of install_or_upgrade is not inspected here —
  # confirm whether it reports failures through its return value.
  kueue_manager.install_or_upgrade(
      KueueConfig(
          system,
          total_chips=total_chips,
          autoprovisioning_enabled=autoprovisioning_enabled,
          num_slices=args.num_slices,
          memory_limit=args.memory_limit,
          cpu_limit=args.cpu_limit,
          is_pathways_cluster=args.enable_pathways,
          flex=args.flex,
          configure_sub_slicing=(
              FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
          ),
      ),
      tolerations=tolerations,
  )
  return 0
|
|
170
|
+
|
|
171
|
+
|
|
104
172
|
def cluster_delete(args) -> None:
|
|
105
173
|
"""Function around cluster delete for the clusters created by Cluster toolkit.
|
|
106
174
|
|
|
@@ -213,7 +281,6 @@ def validate_state_gcs_bucket(args):
|
|
|
213
281
|
err_code, _ = run_command_for_value(
|
|
214
282
|
bucket_validate_cmd,
|
|
215
283
|
'Validate remote state bucket existence.',
|
|
216
|
-
global_args=args,
|
|
217
284
|
)
|
|
218
285
|
if err_code != 0:
|
|
219
286
|
xpk_exit(err_code)
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from unittest.mock import MagicMock, patch
|
|
18
|
+
|
|
19
|
+
import pytest
|
|
20
|
+
|
|
21
|
+
from xpk.commands.cluster_gcluster import cluster_create
|
|
22
|
+
from xpk.core.kueue_manager import KueueConfig
|
|
23
|
+
from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.fixture
def mock_args():
  """Builds a MagicMock standing in for the cluster-create arguments."""
  settings = {
      "enable_autoprovisioning": False,
      "num_slices": 1,
      "memory_limit": "200G",
      "cpu_limit": "50",
      "enable_pathways": False,
      "flex": False,
      "project": "test-project",
      "cluster": "test-cluster",
      "zone": "us-central1-c",
      "cluster_state_gcs_bucket": None,
  }
  fake_args = MagicMock()
  fake_args.configure_mock(**settings)
  return fake_args
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@pytest.fixture
def mock_cluster_create_deps(request):
  """Patches the collaborators of cluster_create and yields them by name."""
  module_path = "xpk.commands.cluster_gcluster"
  # Patched in this order; the yielded dict is keyed by the same names.
  target_names = (
      "xpk_exit",
      "prepare_kjob",
      "apply_kjob_crds",
      "get_cluster_credentials",
      "generate_blueprint",
      "prepare_gcluster_manager",
      "prepare_directories",
      "check_gcloud_authenticated",
      "get_system_characteristics",
      "KueueManager",
  )
  active_patchers = []
  mocks_by_name = {}
  try:
    for target_name in target_names:
      patcher = patch(f"{module_path}.{target_name}")
      mocks_by_name[target_name] = patcher.start()
      active_patchers.append(patcher)
    yield mocks_by_name
  finally:
    # Undo the patches in reverse order, mirroring nested context managers.
    for patcher in reversed(active_patchers):
      patcher.stop()
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@patch("xpk.commands.cluster_gcluster.get_total_chips_requested_from_args")
def test_install_kueue_standard(
    mock_get_total_chips, mock_args, mock_cluster_create_deps
):
  """Verifies cluster_create installs Kueue sized from the requested chips."""
  deps = mock_cluster_create_deps
  # Both kjob helpers are mocked to report success.
  deps["prepare_kjob"].return_value = 0
  deps["apply_kjob_crds"].return_value = 0
  mock_get_total_chips.return_value = 16

  a3_mega_system = SystemCharacteristics(
      topology="N/A",
      vms_per_slice=1,
      gke_accelerator="nvidia-h100-mega-80gb",
      gce_machine_type="a3-megagpu-8g",
      chips_per_vm=8,
      accelerator_type=AcceleratorType["GPU"],
      device_type="h100-mega-80gb-8",
      supports_sub_slicing=False,
  )
  deps["get_system_characteristics"].return_value = (a3_mega_system, 0)

  cluster_create(mock_args)

  deps["xpk_exit"].assert_called_with(0)
  install_or_upgrade = deps["KueueManager"].return_value.install_or_upgrade
  install_or_upgrade.assert_called_once()
  positional, keyword = install_or_upgrade.call_args
  kueue_config: KueueConfig = positional[0]

  assert kueue_config.system == a3_mega_system
  assert kueue_config.total_chips == 16
  assert not kueue_config.autoprovisioning_enabled
  assert "tolerations" in keyword
  managed_component_tolerations = [
      toleration
      for toleration in keyword["tolerations"]
      if toleration.get("key") == "components.gke.io/gke-managed-components"
      and toleration.get("effect") == "NoSchedule"
  ]
  assert managed_component_tolerations
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@patch("xpk.commands.cluster_gcluster.enable_autoprovisioning_on_cluster")
def test_install_kueue_with_autoprovisioning(
    mock_enable_autoprovisioning, mock_args, mock_cluster_create_deps
):
  """Verifies Kueue is sized from the autoprovisioning maximum chip count."""
  deps = mock_cluster_create_deps
  # Both kjob helpers are mocked to report success.
  deps["prepare_kjob"].return_value = 0
  deps["apply_kjob_crds"].return_value = 0
  mock_args.enable_autoprovisioning = True

  a3_mega_system = SystemCharacteristics(
      topology="N/A",
      vms_per_slice=1,
      gke_accelerator="nvidia-h100-mega-80gb",
      gce_machine_type="a3-megagpu-8g",
      chips_per_vm=8,
      accelerator_type=AcceleratorType["GPU"],
      device_type="h100-mega-80gb-8",
      supports_sub_slicing=False,
  )
  deps["get_system_characteristics"].return_value = (a3_mega_system, 0)

  # Autoprovisioning succeeds and advertises 128 chips as its maximum.
  autoprovisioning_config = MagicMock()
  autoprovisioning_config.maximum_chips = 128
  mock_enable_autoprovisioning.return_value = (autoprovisioning_config, 0)

  cluster_create(mock_args)

  deps["xpk_exit"].assert_called_with(0)
  mock_enable_autoprovisioning.assert_called_once_with(
      mock_args, a3_mega_system
  )
  install_or_upgrade = deps["KueueManager"].return_value.install_or_upgrade
  install_or_upgrade.assert_called_once()

  positional, keyword = install_or_upgrade.call_args
  kueue_config: KueueConfig = positional[0]

  assert kueue_config.system == a3_mega_system
  assert kueue_config.total_chips == 128
  assert kueue_config.autoprovisioning_enabled
  assert "tolerations" in keyword
  managed_component_tolerations = [
      toleration
      for toleration in keyword["tolerations"]
      if toleration.get("key") == "components.gke.io/gke-managed-components"
      and toleration.get("effect") == "NoSchedule"
  ]
  assert managed_component_tolerations
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from argparse import Namespace
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from unittest.mock import MagicMock
|
|
20
|
+
import pytest
|
|
21
|
+
|
|
22
|
+
from xpk.commands.cluster import _validate_cluster_create_args
|
|
23
|
+
from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
|
|
24
|
+
from xpk.utils.feature_flags import FeatureFlags
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class _Mocks:
  """Pair of mocks handed to tests by the mock_common_print_and_exit fixture."""

  # Mock installed over 'xpk.commands.common.xpk_print'.
  common_print_mock: MagicMock
  # Mock installed over 'xpk.commands.common.xpk_exit'.
  common_exit_mock: MagicMock
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@pytest.fixture
def mock_common_print_and_exit(mocker):
  """Replaces xpk_print and xpk_exit in xpk.commands.common with mocks.

  Both replacements return None, so a failed validation neither prints nor
  terminates the test process; the calls are only recorded.
  """
  return _Mocks(
      common_print_mock=mocker.patch(
          'xpk.commands.common.xpk_print', return_value=None
      ),
      common_exit_mock=mocker.patch(
          'xpk.commands.common.xpk_exit', return_value=None
      ),
  )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# System the tests below expect to be rejected when sub-slicing is requested.
DEFAULT_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
    'l4-1'
]
# System the tests below expect to be accepted when sub-slicing is requested.
SUB_SLICING_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
    'v6e-4x4'
]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_validate_cluster_create_args_for_correct_args_pass(
    mock_common_print_and_exit: _Mocks,
):
  """Validation of an empty argument namespace neither prints nor exits."""
  _validate_cluster_create_args(Namespace(), DEFAULT_TEST_SYSTEM)

  mocks = mock_common_print_and_exit
  mocks.common_print_mock.assert_not_called()
  mocks.common_exit_mock.assert_not_called()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_validate_cluster_create_args_for_correct_sub_slicing_args_pass(
    mock_common_print_and_exit: _Mocks,
    mocker,
):
  """Sub-slicing on a system that supports it neither prints nor exits."""
  # Patch the flag through mocker so it is restored after the test instead of
  # leaking the enabled state into every test that runs afterwards.
  mocker.patch.object(FeatureFlags, 'SUB_SLICING_ENABLED', True)
  args = Namespace(sub_slicing=True)

  _validate_cluster_create_args(args, SUB_SLICING_SYSTEM)

  assert mock_common_print_and_exit.common_print_mock.call_count == 0
  assert mock_common_print_and_exit.common_exit_mock.call_count == 0
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_validate_cluster_create_args_for_not_supported_system_throws(
    mock_common_print_and_exit: _Mocks,
    mocker,
):
  """Sub-slicing on an unsupported system prints one error and exits once."""
  # Patch the flag through mocker so it is restored after the test instead of
  # leaking the enabled state into every test that runs afterwards.
  mocker.patch.object(FeatureFlags, 'SUB_SLICING_ENABLED', True)
  args = Namespace(sub_slicing=True)

  _validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM)

  assert mock_common_print_and_exit.common_print_mock.call_count == 1
  assert (
      mock_common_print_and_exit.common_print_mock.call_args[0][0]
      == 'Error: l4-1 does not support Sub-slicing.'
  )
  assert mock_common_print_and_exit.common_exit_mock.call_count == 1
|
xpk/commands/common.py
CHANGED
|
@@ -16,7 +16,7 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from ..core.commands import run_command_with_updates_retry
|
|
18
18
|
from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
|
|
19
|
-
from ..core.gcloud_context import
|
|
19
|
+
from ..core.gcloud_context import get_cluster_location
|
|
20
20
|
from ..utils.console import xpk_print, xpk_exit
|
|
21
21
|
from ..utils.execution_context import is_dry_run
|
|
22
22
|
from ..core.system_characteristics import (
|
|
@@ -35,16 +35,12 @@ def set_cluster_command(args) -> int:
|
|
|
35
35
|
"""
|
|
36
36
|
command = (
|
|
37
37
|
'gcloud container clusters get-credentials'
|
|
38
|
-
f' {args.cluster} --
|
|
39
|
-
' --
|
|
40
|
-
|
|
41
|
-
' kubectl config view && kubectl config set-context --current'
|
|
42
|
-
' --namespace=default'
|
|
38
|
+
f' {args.cluster} --location={get_cluster_location(args.project, args.cluster, args.zone)} --dns-endpoint'
|
|
39
|
+
f' --project={args.project} && kubectl config view && kubectl config'
|
|
40
|
+
' set-context --current --namespace=default'
|
|
43
41
|
)
|
|
44
42
|
task = f'get-credentials to cluster {args.cluster}'
|
|
45
|
-
return_code = run_command_with_updates_retry(
|
|
46
|
-
command, task, args, verbose=False
|
|
47
|
-
)
|
|
43
|
+
return_code = run_command_with_updates_retry(command, task, verbose=False)
|
|
48
44
|
if return_code != 0:
|
|
49
45
|
xpk_print(f'{task} returned ERROR {return_code}')
|
|
50
46
|
return return_code
|
|
@@ -53,16 +49,8 @@ def set_cluster_command(args) -> int:
|
|
|
53
49
|
def is_TAS_possible(
|
|
54
50
|
system_characteristics: SystemCharacteristics | None,
|
|
55
51
|
capacity_type: CapacityType | None,
|
|
56
|
-
flex: bool,
|
|
57
52
|
) -> bool:
|
|
58
|
-
"""Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
|
|
59
|
-
|
|
60
|
-
Args:
|
|
61
|
-
args: user provided arguments for running the command.
|
|
62
|
-
|
|
63
|
-
Returns:
|
|
64
|
-
True if possible and False otherwise.
|
|
65
|
-
"""
|
|
53
|
+
"""Check cluster's machine_type and capacity type to determine if Kueue TAS is possible"""
|
|
66
54
|
|
|
67
55
|
if is_dry_run():
|
|
68
56
|
return True
|
|
@@ -75,13 +63,13 @@ def is_TAS_possible(
|
|
|
75
63
|
xpk_print('capacity_type data was not found in configmaps.')
|
|
76
64
|
xpk_exit(1)
|
|
77
65
|
|
|
78
|
-
|
|
79
|
-
|
|
66
|
+
return (
|
|
67
|
+
system_characteristics.device_type != H100_MEGA_DEVICE_TYPE
|
|
68
|
+
or capacity_type == CapacityType.RESERVATION
|
|
69
|
+
)
|
|
80
70
|
|
|
81
|
-
if (
|
|
82
|
-
system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
|
|
83
|
-
and capacity_type != CapacityType.RESERVATION
|
|
84
|
-
):
|
|
85
|
-
return False
|
|
86
71
|
|
|
87
|
-
|
|
72
|
+
def validate_sub_slicing_system(system: SystemCharacteristics):
  """Exits with code 1 when the given system cannot be used with sub-slicing.

  Args:
    system: characteristics of the system to validate; only its
      supports_sub_slicing and device_type attributes are read.
  """
  if system.supports_sub_slicing:
    return

  xpk_print(f'Error: {system.device_type} does not support Sub-slicing.')
  xpk_exit(1)
|
xpk/commands/info.py
CHANGED
|
@@ -22,8 +22,8 @@ from tabulate import tabulate
|
|
|
22
22
|
from ..core.commands import run_command_for_value
|
|
23
23
|
from ..core.cluster import get_cluster_credentials
|
|
24
24
|
from ..core.gcloud_context import add_zone_and_project
|
|
25
|
-
from ..core.kueue import verify_kueuectl
|
|
26
25
|
from ..utils.console import xpk_exit, xpk_print
|
|
26
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
27
27
|
|
|
28
28
|
table_fmt = 'plain'
|
|
29
29
|
|
|
@@ -36,10 +36,15 @@ def info(args: Namespace) -> None:
|
|
|
36
36
|
Returns:
|
|
37
37
|
None
|
|
38
38
|
"""
|
|
39
|
+
if should_validate_dependencies(args):
|
|
40
|
+
validate_dependencies_list([
|
|
41
|
+
SystemDependency.KUBECTL,
|
|
42
|
+
SystemDependency.GCLOUD,
|
|
43
|
+
SystemDependency.KUEUECTL,
|
|
44
|
+
])
|
|
39
45
|
add_zone_and_project(args)
|
|
40
46
|
get_cluster_credentials(args)
|
|
41
47
|
|
|
42
|
-
verify_kueuectl(args)
|
|
43
48
|
lq, cq = bool(args.localqueue), bool(args.clusterqueue)
|
|
44
49
|
if not lq and not cq:
|
|
45
50
|
lq, cq = True, True
|
|
@@ -48,7 +53,7 @@ def info(args: Namespace) -> None:
|
|
|
48
53
|
if lq:
|
|
49
54
|
lqs = run_kueuectl_list_localqueue(args)
|
|
50
55
|
|
|
51
|
-
cqs = run_kueuectl_list_clusterqueue(
|
|
56
|
+
cqs = run_kueuectl_list_clusterqueue()
|
|
52
57
|
quotas = get_nominal_quotas(cqs)
|
|
53
58
|
|
|
54
59
|
if lq and lqs is not None:
|
|
@@ -214,7 +219,7 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
|
|
|
214
219
|
command = 'kubectl kueue list localqueue -o json'
|
|
215
220
|
if args.namespace != '':
|
|
216
221
|
command += f' --namespace {args.namespace}'
|
|
217
|
-
return_code, val = run_command_for_value(command, 'list localqueue'
|
|
222
|
+
return_code, val = run_command_for_value(command, 'list localqueue')
|
|
218
223
|
|
|
219
224
|
if return_code != 0:
|
|
220
225
|
xpk_print(f'Cluster info request returned ERROR {return_code}')
|
|
@@ -222,18 +227,15 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
|
|
|
222
227
|
return val
|
|
223
228
|
|
|
224
229
|
|
|
225
|
-
def run_kueuectl_list_clusterqueue(
|
|
230
|
+
def run_kueuectl_list_clusterqueue() -> str:
|
|
226
231
|
"""Run the kueuectl list clusterqueue command.
|
|
227
232
|
|
|
228
|
-
Args:
|
|
229
|
-
args: user provided arguments for running the command.
|
|
230
|
-
|
|
231
233
|
Returns:
|
|
232
234
|
kueuectl list clusterqueue formatted as json string
|
|
233
235
|
"""
|
|
234
236
|
command = 'kubectl kueue list clusterqueue -o json'
|
|
235
237
|
|
|
236
|
-
return_code, val = run_command_for_value(command, 'list clusterqueue'
|
|
238
|
+
return_code, val = run_command_for_value(command, 'list clusterqueue')
|
|
237
239
|
|
|
238
240
|
if return_code != 0:
|
|
239
241
|
xpk_print(f'Cluster info request returned ERROR {return_code}')
|
xpk/commands/inspector.py
CHANGED
|
@@ -16,11 +16,12 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from ..core.cluster import get_cluster_credentials
|
|
18
18
|
from ..core.commands import run_command_for_value
|
|
19
|
-
from ..core.gcloud_context import add_zone_and_project,
|
|
20
|
-
from ..core.
|
|
19
|
+
from ..core.gcloud_context import add_zone_and_project, get_cluster_location
|
|
20
|
+
from ..core.kueue_manager import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
|
|
21
21
|
from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP
|
|
22
22
|
from ..utils.console import xpk_exit, xpk_print
|
|
23
23
|
from ..utils.file import append_tmp_file, write_tmp_file
|
|
24
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
24
25
|
from .workload import get_workload_list
|
|
25
26
|
|
|
26
27
|
|
|
@@ -41,7 +42,7 @@ def inspector_run_command_helper(
|
|
|
41
42
|
prefix = f'Command: {command}\nCommand Description: {command_description}\n'
|
|
42
43
|
postfix = '========================================================'
|
|
43
44
|
return_code, command_output = run_command_for_value(
|
|
44
|
-
command, f'{command_description}'
|
|
45
|
+
command, f'{command_description}'
|
|
45
46
|
)
|
|
46
47
|
|
|
47
48
|
if return_code != 0:
|
|
@@ -116,7 +117,10 @@ def inspector(args) -> None:
|
|
|
116
117
|
# Future Improvements for inspector:
|
|
117
118
|
# 2. List what is next in Queue.
|
|
118
119
|
# 3. Split inspector into different subcommands to parse info easier.
|
|
119
|
-
|
|
120
|
+
if should_validate_dependencies(args):
|
|
121
|
+
validate_dependencies_list(
|
|
122
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
123
|
+
)
|
|
120
124
|
final_return_code = 0
|
|
121
125
|
xpk_print(args)
|
|
122
126
|
|
|
@@ -138,8 +142,9 @@ def inspector(args) -> None:
|
|
|
138
142
|
(
|
|
139
143
|
(
|
|
140
144
|
'gcloud beta container clusters list --project'
|
|
141
|
-
f' {args.project} --
|
|
142
|
-
f'
|
|
145
|
+
f' {args.project} --location'
|
|
146
|
+
f' {get_cluster_location(args.project, args.cluster, args.zone)} |'
|
|
147
|
+
f' grep -e NAME -e {args.cluster}'
|
|
143
148
|
),
|
|
144
149
|
'GKE: Cluster Details',
|
|
145
150
|
),
|
|
@@ -160,7 +165,7 @@ def inspector(args) -> None:
|
|
|
160
165
|
(
|
|
161
166
|
(
|
|
162
167
|
f'gcloud beta container node-pools list --cluster {args.cluster} '
|
|
163
|
-
f' --project={args.project} --
|
|
168
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
164
169
|
),
|
|
165
170
|
'GKE: Node pool Details',
|
|
166
171
|
),
|
|
@@ -309,19 +314,25 @@ def inspector(args) -> None:
|
|
|
309
314
|
workload_links = [(
|
|
310
315
|
f'Cloud Console for the workload {args.workload}',
|
|
311
316
|
# pylint: disable=line-too-long
|
|
312
|
-
|
|
317
|
+
(
|
|
318
|
+
f'https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
|
|
319
|
+
),
|
|
313
320
|
)]
|
|
314
321
|
|
|
315
322
|
links = [
|
|
316
323
|
(
|
|
317
324
|
'Cloud Console for the GKE Cluster',
|
|
318
325
|
# pylint: disable=line-too-long
|
|
319
|
-
|
|
326
|
+
(
|
|
327
|
+
f'https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
|
|
328
|
+
),
|
|
320
329
|
),
|
|
321
330
|
(
|
|
322
331
|
'Cloud Console for all workloads in GKE Cluster',
|
|
323
332
|
# pylint: disable=line-too-long
|
|
324
|
-
|
|
333
|
+
(
|
|
334
|
+
f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{get_cluster_location(args.project, args.cluster, args.zone)}%2F{args.cluster}))'
|
|
335
|
+
),
|
|
325
336
|
),
|
|
326
337
|
(
|
|
327
338
|
'Cloud Console for IAM Permissions',
|