xpk 0.14.3__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/commands/cluster.py +57 -21
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +11 -2
- xpk/commands/cluster_test.py +233 -12
- xpk/commands/config.py +3 -5
- xpk/commands/kind.py +1 -1
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +28 -11
- xpk/commands/workload_test.py +3 -3
- xpk/core/blueprint/blueprint_generator.py +70 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/capacity.py +46 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +37 -57
- xpk/core/cluster_test.py +95 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +9 -2
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +6 -9
- xpk/core/kueue_manager.py +192 -32
- xpk/core/kueue_manager_test.py +132 -4
- xpk/core/nodepool.py +21 -29
- xpk/core/nodepool_test.py +17 -15
- xpk/core/scheduling.py +16 -1
- xpk/core/scheduling_test.py +85 -6
- xpk/core/system_characteristics.py +77 -19
- xpk/core/system_characteristics_test.py +80 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/main.py +31 -13
- xpk/parser/cluster.py +48 -9
- xpk/parser/cluster_test.py +42 -3
- xpk/parser/workload.py +12 -0
- xpk/parser/workload_test.py +4 -4
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +7 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +0 -11
- xpk/utils/versions.py +31 -0
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/METADATA +113 -92
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/RECORD +58 -48
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/top_level.txt +0 -0
xpk/core/gcloud_context.py
CHANGED
|
@@ -19,6 +19,7 @@ import sys
|
|
|
19
19
|
from dataclasses import dataclass
|
|
20
20
|
|
|
21
21
|
from ..utils.console import xpk_print, xpk_exit
|
|
22
|
+
from ..utils.versions import ReleaseChannel
|
|
22
23
|
from .commands import run_command_for_value
|
|
23
24
|
from functools import lru_cache
|
|
24
25
|
|
|
@@ -117,15 +118,18 @@ def get_cluster_location(project: str, name: str, zone: str) -> str:
|
|
|
117
118
|
class GkeServerConfig:
|
|
118
119
|
"""Stores the valid gke versions based on gcloud recommendations."""
|
|
119
120
|
|
|
120
|
-
|
|
121
|
+
default_gke_version: str
|
|
121
122
|
valid_versions: set[str]
|
|
122
123
|
|
|
123
124
|
|
|
124
|
-
def get_gke_server_config(
|
|
125
|
+
def get_gke_server_config(
|
|
126
|
+
args, release_channel: ReleaseChannel
|
|
127
|
+
) -> tuple[int, GkeServerConfig | None]:
|
|
125
128
|
"""Determine the GKE versions supported by gcloud currently.
|
|
126
129
|
|
|
127
130
|
Args:
|
|
128
131
|
args: user provided arguments for running the command.
|
|
132
|
+
release_channel: the release channel to use.
|
|
129
133
|
|
|
130
134
|
Returns:
|
|
131
135
|
Tuple of
|
|
@@ -136,22 +140,24 @@ def get_gke_server_config(args) -> tuple[int, GkeServerConfig | None]:
|
|
|
136
140
|
'gcloud container get-server-config'
|
|
137
141
|
f' --project={args.project} --region={zone_to_region(args.zone)}'
|
|
138
142
|
)
|
|
139
|
-
|
|
143
|
+
default_gke_version_cmd = (
|
|
140
144
|
base_command
|
|
141
|
-
+ ' --flatten="channels"
|
|
145
|
+
+ ' --flatten="channels"'
|
|
146
|
+
f' --filter="channels.channel={release_channel.value}"'
|
|
142
147
|
' --format="value(channels.defaultVersion)"'
|
|
143
148
|
)
|
|
144
149
|
valid_versions_cmd = (
|
|
145
150
|
base_command
|
|
146
|
-
+ ' --flatten="channels"
|
|
151
|
+
+ ' --flatten="channels"'
|
|
152
|
+
f' --filter="channels.channel={release_channel.value}"'
|
|
147
153
|
' --format="value(channels.validVersions)"'
|
|
148
154
|
)
|
|
149
155
|
base_command_description = 'Determine server supported GKE versions for '
|
|
150
156
|
|
|
151
157
|
server_config_commands_and_descriptions = [
|
|
152
158
|
(
|
|
153
|
-
|
|
154
|
-
base_command_description + 'default
|
|
159
|
+
default_gke_version_cmd,
|
|
160
|
+
base_command_description + 'default gke version',
|
|
155
161
|
),
|
|
156
162
|
(
|
|
157
163
|
valid_versions_cmd,
|
|
@@ -172,8 +178,8 @@ def get_gke_server_config(args) -> tuple[int, GkeServerConfig | None]:
|
|
|
172
178
|
command_outputs.append(cmd_output)
|
|
173
179
|
|
|
174
180
|
return 0, GkeServerConfig(
|
|
175
|
-
|
|
176
|
-
valid_versions=set(command_outputs[1].split(';')),
|
|
181
|
+
default_gke_version=command_outputs[0].strip(),
|
|
182
|
+
valid_versions=set([s.strip() for s in command_outputs[1].split(';')]),
|
|
177
183
|
)
|
|
178
184
|
|
|
179
185
|
|
|
@@ -196,7 +202,7 @@ def get_gke_control_plane_version(
|
|
|
196
202
|
if args.gke_version is not None:
|
|
197
203
|
master_gke_version = args.gke_version
|
|
198
204
|
else:
|
|
199
|
-
master_gke_version = gke_server_config.
|
|
205
|
+
master_gke_version = gke_server_config.default_gke_version
|
|
200
206
|
|
|
201
207
|
is_valid_version = master_gke_version in gke_server_config.valid_versions
|
|
202
208
|
|
|
@@ -204,7 +210,7 @@ def get_gke_control_plane_version(
|
|
|
204
210
|
xpk_print(
|
|
205
211
|
f'Planned GKE Version: {master_gke_version}\n Valid Versions:'
|
|
206
212
|
f'\n{gke_server_config.valid_versions}\nRecommended / Default GKE'
|
|
207
|
-
f' Version: {gke_server_config.
|
|
213
|
+
f' Version: {gke_server_config.default_gke_version}'
|
|
208
214
|
)
|
|
209
215
|
xpk_print(
|
|
210
216
|
f'Error: Planned GKE Version {master_gke_version} is not valid.'
|
|
@@ -213,7 +219,7 @@ def get_gke_control_plane_version(
|
|
|
213
219
|
xpk_print(
|
|
214
220
|
'Please select a gke version from the above list using --gke-version=x'
|
|
215
221
|
' argument or rely on the default gke version:'
|
|
216
|
-
f' {gke_server_config.
|
|
222
|
+
f' {gke_server_config.default_gke_version}'
|
|
217
223
|
)
|
|
218
224
|
return 1, None
|
|
219
225
|
|
xpk/core/gcloud_context_test.py
CHANGED
|
@@ -15,7 +15,20 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
import pytest
|
|
18
|
-
from .
|
|
18
|
+
from unittest.mock import MagicMock
|
|
19
|
+
from .gcloud_context import (
|
|
20
|
+
get_cluster_location,
|
|
21
|
+
get_gke_control_plane_version,
|
|
22
|
+
get_gke_server_config,
|
|
23
|
+
GkeServerConfig,
|
|
24
|
+
zone_to_region,
|
|
25
|
+
)
|
|
26
|
+
from ..utils.versions import ReleaseChannel
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.fixture(autouse=True)
|
|
30
|
+
def xpk_print(mocker):
|
|
31
|
+
return mocker.patch("xpk.core.gcloud_context.xpk_print")
|
|
19
32
|
|
|
20
33
|
|
|
21
34
|
def test_zone_to_region_raises_when_zone_is_invalid():
|
|
@@ -94,3 +107,100 @@ def test_get_cluster_location_invokes_command_for_different_input_args(mocker):
|
|
|
94
107
|
get_cluster_location(project="project6", name="name6", zone="us-central1-a")
|
|
95
108
|
|
|
96
109
|
assert mock.call_count == 2
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def test_get_gke_server_config_success(mocker):
|
|
113
|
+
mock_run_command = mocker.patch(
|
|
114
|
+
"xpk.core.gcloud_context.run_command_for_value",
|
|
115
|
+
side_effect=[
|
|
116
|
+
(0, "1.2.3"),
|
|
117
|
+
(0, "1.2.3;1.2.4;1.3.0"),
|
|
118
|
+
],
|
|
119
|
+
)
|
|
120
|
+
args = mocker.Mock(project="test-project", zone="us-central1")
|
|
121
|
+
|
|
122
|
+
return_code, config = get_gke_server_config(args, ReleaseChannel.STABLE)
|
|
123
|
+
|
|
124
|
+
assert return_code == 0
|
|
125
|
+
assert isinstance(config, GkeServerConfig)
|
|
126
|
+
assert config.default_gke_version == "1.2.3"
|
|
127
|
+
assert config.valid_versions == {"1.2.3", "1.2.4", "1.3.0"}
|
|
128
|
+
assert mock_run_command.call_count == 2
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def test_get_gke_server_config_fails_on_default_version_command(mocker):
|
|
132
|
+
mocker.patch(
|
|
133
|
+
"xpk.core.gcloud_context.run_command_for_value",
|
|
134
|
+
return_value=(1, "error"),
|
|
135
|
+
)
|
|
136
|
+
args = mocker.Mock(project="test-project", zone="us-central1")
|
|
137
|
+
|
|
138
|
+
return_code, config = get_gke_server_config(args, ReleaseChannel.STABLE)
|
|
139
|
+
|
|
140
|
+
assert return_code == 1
|
|
141
|
+
assert config is None
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def test_get_gke_server_config_fails_on_valid_versions_command(mocker):
|
|
145
|
+
mocker.patch(
|
|
146
|
+
"xpk.core.gcloud_context.run_command_for_value",
|
|
147
|
+
side_effect=[(0, "1.2.3"), (1, "error")],
|
|
148
|
+
)
|
|
149
|
+
args = mocker.Mock(project="test-project", zone="us-central1")
|
|
150
|
+
|
|
151
|
+
return_code, config = get_gke_server_config(args, ReleaseChannel.STABLE)
|
|
152
|
+
|
|
153
|
+
assert return_code == 1
|
|
154
|
+
assert config is None
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_get_gke_control_plane_version_uses_default_when_not_specified(mocker):
|
|
158
|
+
args = mocker.Mock(gke_version=None)
|
|
159
|
+
gke_server_config = GkeServerConfig(
|
|
160
|
+
default_gke_version="1.2.3", valid_versions={"1.2.3", "1.2.4"}
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
return_code, version = get_gke_control_plane_version(args, gke_server_config)
|
|
164
|
+
|
|
165
|
+
assert return_code == 0
|
|
166
|
+
assert version == "1.2.3"
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def test_get_gke_control_plane_version_uses_user_version_when_valid(mocker):
|
|
170
|
+
args = mocker.Mock(gke_version="1.2.4")
|
|
171
|
+
gke_server_config = GkeServerConfig(
|
|
172
|
+
default_gke_version="1.2.3", valid_versions={"1.2.3", "1.2.4"}
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
return_code, version = get_gke_control_plane_version(args, gke_server_config)
|
|
176
|
+
|
|
177
|
+
assert return_code == 0
|
|
178
|
+
assert version == "1.2.4"
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def test_get_gke_control_plane_version_fails_for_invalid_user_version(
|
|
182
|
+
mocker, xpk_print: MagicMock
|
|
183
|
+
):
|
|
184
|
+
args = mocker.Mock(gke_version="1.2.5")
|
|
185
|
+
gke_server_config = GkeServerConfig(
|
|
186
|
+
default_gke_version="1.2.3", valid_versions={"1.2.3", "1.2.4"}
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
return_code, version = get_gke_control_plane_version(args, gke_server_config)
|
|
190
|
+
|
|
191
|
+
assert return_code == 1
|
|
192
|
+
assert version is None
|
|
193
|
+
assert "Planned GKE Version: 1.2.5" in xpk_print.mock_calls[0].args[0]
|
|
194
|
+
assert (
|
|
195
|
+
"Recommended / Default GKE Version: 1.2.3"
|
|
196
|
+
in xpk_print.mock_calls[0].args[0]
|
|
197
|
+
)
|
|
198
|
+
assert (
|
|
199
|
+
"Error: Planned GKE Version 1.2.5 is not valid."
|
|
200
|
+
in xpk_print.mock_calls[1].args[0]
|
|
201
|
+
)
|
|
202
|
+
assert (
|
|
203
|
+
"Please select a gke version from the above list using --gke-version=x"
|
|
204
|
+
" argument or rely on the default gke version: 1.2.3"
|
|
205
|
+
in xpk_print.mock_calls[2].args[0]
|
|
206
|
+
)
|
xpk/core/kjob.py
CHANGED
|
@@ -38,7 +38,7 @@ from .config import (
|
|
|
38
38
|
KJOB_SHELL_IMAGE,
|
|
39
39
|
KJOB_SHELL_INTERACTIVE_COMMAND,
|
|
40
40
|
KJOB_SHELL_WORKING_DIRECTORY,
|
|
41
|
-
|
|
41
|
+
xpk_config,
|
|
42
42
|
)
|
|
43
43
|
from .network import get_cluster_subnetworks
|
|
44
44
|
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
@@ -234,8 +234,7 @@ def get_pod_template_interactive_command() -> str:
|
|
|
234
234
|
Returns:
|
|
235
235
|
str - PodTemplate's interactive command
|
|
236
236
|
"""
|
|
237
|
-
|
|
238
|
-
pod_command = config.get(KJOB_SHELL_INTERACTIVE_COMMAND)
|
|
237
|
+
pod_command = xpk_config.get(KJOB_SHELL_INTERACTIVE_COMMAND)
|
|
239
238
|
if pod_command is None or len(pod_command) == 0:
|
|
240
239
|
pod_command = PodTemplateDefaults.INTERACTIVE_COMMAND.value
|
|
241
240
|
|
|
@@ -287,11 +286,10 @@ def create_job_template_instance(
|
|
|
287
286
|
Returns:
|
|
288
287
|
exit_code > 0 if creating JobTemplate fails, 0 otherwise
|
|
289
288
|
"""
|
|
290
|
-
|
|
291
|
-
job_image = config.get(KJOB_BATCH_IMAGE)
|
|
289
|
+
job_image = xpk_config.get(KJOB_BATCH_IMAGE)
|
|
292
290
|
if job_image is None or len(job_image) == 0:
|
|
293
291
|
job_image = JobTemplateDefaults.IMAGE.value
|
|
294
|
-
working_directory =
|
|
292
|
+
working_directory = xpk_config.get(KJOB_BATCH_WORKING_DIRECTORY)
|
|
295
293
|
if working_directory is None or len(working_directory) == 0:
|
|
296
294
|
working_directory = JobTemplateDefaults.WORKING_DIRECTORY.value
|
|
297
295
|
resources = (
|
|
@@ -332,11 +330,10 @@ def create_pod_template_instance(service_account: str) -> int:
|
|
|
332
330
|
Returns:
|
|
333
331
|
exit_code > 0 if creating PodTemplate fails, 0 otherwise
|
|
334
332
|
"""
|
|
335
|
-
|
|
336
|
-
pod_image = config.get(KJOB_SHELL_IMAGE)
|
|
333
|
+
pod_image = xpk_config.get(KJOB_SHELL_IMAGE)
|
|
337
334
|
if pod_image is None or len(pod_image) == 0:
|
|
338
335
|
pod_image = PodTemplateDefaults.IMAGE.value
|
|
339
|
-
working_directory =
|
|
336
|
+
working_directory = xpk_config.get(KJOB_SHELL_WORKING_DIRECTORY)
|
|
340
337
|
if working_directory is None or len(working_directory) == 0:
|
|
341
338
|
working_directory = PodTemplateDefaults.WORKING_DIRECTORY.value
|
|
342
339
|
|
xpk/core/kueue_manager.py
CHANGED
|
@@ -20,15 +20,17 @@ from dataclasses import dataclass
|
|
|
20
20
|
from typing import Optional, List, Dict, Any
|
|
21
21
|
import json
|
|
22
22
|
from jinja2 import Environment, FileSystemLoader
|
|
23
|
-
from ..utils.execution_context import is_dry_run
|
|
24
|
-
from ..utils.kueue import is_queued_cluster
|
|
25
23
|
|
|
24
|
+
from ..utils.topology import get_slice_topology_level, get_topology_product, is_topology_contained
|
|
25
|
+
from ..utils.kueue import is_queued_cluster
|
|
26
|
+
from kubernetes.utils import parse_quantity
|
|
26
27
|
from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
|
|
27
28
|
from .scheduling import (
|
|
28
29
|
create_accelerator_label,
|
|
29
30
|
create_machine_label,
|
|
30
31
|
)
|
|
31
32
|
from .system_characteristics import (
|
|
33
|
+
SUB_SLICING_TOPOLOGIES,
|
|
32
34
|
AcceleratorTypeToAcceleratorCharacteristics,
|
|
33
35
|
SystemCharacteristics,
|
|
34
36
|
)
|
|
@@ -38,10 +40,12 @@ from ..core.commands import (
|
|
|
38
40
|
run_command_with_updates_retry,
|
|
39
41
|
)
|
|
40
42
|
from ..utils.file import write_tmp_file
|
|
41
|
-
from ..utils.console import xpk_print, xpk_exit
|
|
43
|
+
from ..utils.console import xpk_print, xpk_exit, ask_for_user_consent
|
|
42
44
|
from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
|
|
43
45
|
from packaging.version import Version
|
|
44
46
|
|
|
47
|
+
KUEUE_VERSION = Version("v0.14.3")
|
|
48
|
+
LATEST_BREAKING_VERSION = Version("v0.14.0")
|
|
45
49
|
WAIT_FOR_KUEUE_TIMEOUT = "10m"
|
|
46
50
|
CLUSTER_QUEUE_NAME = "cluster-queue"
|
|
47
51
|
LOCAL_QUEUE_NAME = "multislice-queue"
|
|
@@ -52,10 +56,9 @@ KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
|
|
|
52
56
|
KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2"
|
|
53
57
|
MEMORY_SIZE_PER_VM = 1.2
|
|
54
58
|
MIN_MEMORY_LIMIT_SIZE = 4096
|
|
55
|
-
KUEUE_VERSION = Version("v0.12.2")
|
|
56
59
|
|
|
57
60
|
|
|
58
|
-
@dataclass
|
|
61
|
+
@dataclass(frozen=True)
|
|
59
62
|
class KueueConfig:
|
|
60
63
|
system: SystemCharacteristics
|
|
61
64
|
total_chips: int
|
|
@@ -68,7 +71,7 @@ class KueueConfig:
|
|
|
68
71
|
num_slices: int = 1
|
|
69
72
|
|
|
70
73
|
|
|
71
|
-
@dataclass
|
|
74
|
+
@dataclass(frozen=True)
|
|
72
75
|
class _NameAndYaml:
|
|
73
76
|
name: str
|
|
74
77
|
yaml: str
|
|
@@ -79,9 +82,13 @@ class KueueManager:
|
|
|
79
82
|
|
|
80
83
|
def __init__(
|
|
81
84
|
self,
|
|
85
|
+
project: str,
|
|
86
|
+
zone: str,
|
|
82
87
|
kueue_version: Version = KUEUE_VERSION,
|
|
83
88
|
template_path=TEMPLATE_PATH,
|
|
84
89
|
):
|
|
90
|
+
self.project = project
|
|
91
|
+
self.zone = zone
|
|
85
92
|
self.kueue_version = kueue_version
|
|
86
93
|
|
|
87
94
|
self.template_env = Environment(
|
|
@@ -102,10 +109,10 @@ class KueueManager:
|
|
|
102
109
|
Args:
|
|
103
110
|
tolerations: An optional list of tolerations to apply to the kueue-controller-manager.
|
|
104
111
|
"""
|
|
105
|
-
return_code, installed_version =
|
|
112
|
+
return_code, installed_version = get_installed_kueue_version()
|
|
106
113
|
|
|
107
|
-
if return_code == 0:
|
|
108
|
-
if installed_version
|
|
114
|
+
if return_code == 0 and installed_version:
|
|
115
|
+
if installed_version > self.kueue_version:
|
|
109
116
|
xpk_print(
|
|
110
117
|
f"Cluster has a newer Kueue version, {installed_version}. Skipping"
|
|
111
118
|
" installation."
|
|
@@ -113,6 +120,10 @@ class KueueManager:
|
|
|
113
120
|
return 0
|
|
114
121
|
else:
|
|
115
122
|
xpk_print(f"Upgrading Kueue to version v{self.kueue_version}...")
|
|
123
|
+
assert installed_version
|
|
124
|
+
prepare_code = self.__prepare_for_upgrade(installed_version)
|
|
125
|
+
if prepare_code != 0:
|
|
126
|
+
return prepare_code
|
|
116
127
|
else:
|
|
117
128
|
xpk_print(f"Installing Kueue version v{self.kueue_version}...")
|
|
118
129
|
|
|
@@ -122,24 +133,6 @@ class KueueManager:
|
|
|
122
133
|
|
|
123
134
|
return self.__configure(kueue_config)
|
|
124
135
|
|
|
125
|
-
def get_installed_kueue_version(self) -> tuple[int, Version | None]:
|
|
126
|
-
command = (
|
|
127
|
-
"kubectl get deployment kueue-controller-manager -n kueue-system -o"
|
|
128
|
-
" jsonpath='{.spec.template.spec.containers[0].image}'"
|
|
129
|
-
)
|
|
130
|
-
task = "Get kueue version on server"
|
|
131
|
-
return_code, val = run_command_for_value(
|
|
132
|
-
command,
|
|
133
|
-
task,
|
|
134
|
-
dry_run_return_val="",
|
|
135
|
-
)
|
|
136
|
-
if return_code != 0:
|
|
137
|
-
return return_code, None
|
|
138
|
-
version_tag = val.split(":")
|
|
139
|
-
if len(version_tag) == 1:
|
|
140
|
-
return 1, None
|
|
141
|
-
return return_code, Version(version_tag[-1])
|
|
142
|
-
|
|
143
136
|
def __install(
|
|
144
137
|
self,
|
|
145
138
|
tolerations: Optional[List[Dict[str, Any]]] = None,
|
|
@@ -161,6 +154,60 @@ class KueueManager:
|
|
|
161
154
|
|
|
162
155
|
return self.__wait_for_kueue_available()
|
|
163
156
|
|
|
157
|
+
def __prepare_for_upgrade(self, installed_version: Version) -> int:
|
|
158
|
+
if installed_version >= LATEST_BREAKING_VERSION:
|
|
159
|
+
return 0
|
|
160
|
+
|
|
161
|
+
xpk_print(
|
|
162
|
+
f"Currently installed Kueue version v{installed_version} is"
|
|
163
|
+
f" incompatible with the newer v{self.kueue_version}."
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
changelog_link = f"https://github.com/kubernetes-sigs/kueue/blob/main/CHANGELOG/CHANGELOG-{self.kueue_version.major}.{self.kueue_version.minor}.md"
|
|
167
|
+
agreed = ask_for_user_consent(
|
|
168
|
+
"Do you want to allow XPK to update Kueue automatically? This will"
|
|
169
|
+
" delete all existing Kueue resources and create new ones. If you"
|
|
170
|
+
" decline, you will need to upgrade the Kueue manually (see"
|
|
171
|
+
f" {changelog_link} for help)."
|
|
172
|
+
)
|
|
173
|
+
if not agreed:
|
|
174
|
+
return 1
|
|
175
|
+
|
|
176
|
+
return self.__delete_all_kueue_resources()
|
|
177
|
+
|
|
178
|
+
def __delete_all_kueue_resources(self) -> int:
|
|
179
|
+
return_code, kueue_crds_string = run_command_for_value(
|
|
180
|
+
"kubectl get crd -o name | grep .kueue.x-k8s.io", "Get Kueue CRDs"
|
|
181
|
+
)
|
|
182
|
+
if return_code != 0:
|
|
183
|
+
return return_code
|
|
184
|
+
|
|
185
|
+
kueue_crds = [
|
|
186
|
+
line.strip().removeprefix(
|
|
187
|
+
"customresourcedefinition.apiextensions.k8s.io/"
|
|
188
|
+
)
|
|
189
|
+
for line in kueue_crds_string.strip().split("\n")
|
|
190
|
+
]
|
|
191
|
+
|
|
192
|
+
for crd in kueue_crds:
|
|
193
|
+
return_code = run_command_with_updates(
|
|
194
|
+
f"kubectl delete {crd} --all", f"Delete all resources of type {crd}"
|
|
195
|
+
)
|
|
196
|
+
if return_code != 0:
|
|
197
|
+
return return_code
|
|
198
|
+
|
|
199
|
+
for crd in kueue_crds:
|
|
200
|
+
return_code = run_command_with_updates(
|
|
201
|
+
f"kubectl delete crd {crd}", f"Delete CRD {crd}"
|
|
202
|
+
)
|
|
203
|
+
if return_code != 0:
|
|
204
|
+
return return_code
|
|
205
|
+
|
|
206
|
+
return run_command_with_updates(
|
|
207
|
+
"kubectl delete deployment kueue-controller-manager -n kueue-system",
|
|
208
|
+
"Delete Kueue Controller Manager deployment",
|
|
209
|
+
)
|
|
210
|
+
|
|
164
211
|
def __install_kueue_crs(self) -> int:
|
|
165
212
|
manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/v{self.kueue_version}/manifests.yaml"
|
|
166
213
|
install_command = (
|
|
@@ -228,6 +275,7 @@ class KueueManager:
|
|
|
228
275
|
topology_name = (
|
|
229
276
|
topology_name_and_yaml.name if topology_name_and_yaml else None
|
|
230
277
|
)
|
|
278
|
+
cpu_limit, memory_limit = self.__autocorrect_resource_limits(kueue_config)
|
|
231
279
|
|
|
232
280
|
# The manager builds the context internally based on its opinionated logic
|
|
233
281
|
context = self.__build_template_context(
|
|
@@ -237,8 +285,8 @@ class KueueManager:
|
|
|
237
285
|
autoprovisioning=kueue_config.autoprovisioning_enabled,
|
|
238
286
|
flex=kueue_config.flex,
|
|
239
287
|
num_slices=kueue_config.num_slices,
|
|
240
|
-
cpu_limit=
|
|
241
|
-
memory_limit=
|
|
288
|
+
cpu_limit=cpu_limit,
|
|
289
|
+
memory_limit=memory_limit,
|
|
242
290
|
topology_name=topology_name,
|
|
243
291
|
)
|
|
244
292
|
|
|
@@ -364,12 +412,25 @@ class KueueManager:
|
|
|
364
412
|
).render(),
|
|
365
413
|
)
|
|
366
414
|
elif configure_sub_slicing:
|
|
415
|
+
sorted_topologies = sorted(
|
|
416
|
+
SUB_SLICING_TOPOLOGIES, key=get_topology_product, reverse=True
|
|
417
|
+
)
|
|
418
|
+
levels = [
|
|
419
|
+
get_slice_topology_level(topology)
|
|
420
|
+
for topology in sorted_topologies
|
|
421
|
+
if is_topology_contained(
|
|
422
|
+
contained=topology, container=system.topology
|
|
423
|
+
)
|
|
424
|
+
]
|
|
425
|
+
levels.append("kubernetes.io/hostname")
|
|
426
|
+
|
|
367
427
|
return _NameAndYaml(
|
|
368
428
|
name=SUB_SLICE_TOPOLOGY_NAME,
|
|
369
429
|
yaml=self.template_env.get_template(
|
|
370
430
|
KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE
|
|
371
431
|
).render({
|
|
372
432
|
"sub_slice_topology_name": SUB_SLICE_TOPOLOGY_NAME,
|
|
433
|
+
"levels": levels,
|
|
373
434
|
}),
|
|
374
435
|
)
|
|
375
436
|
else:
|
|
@@ -377,8 +438,6 @@ class KueueManager:
|
|
|
377
438
|
|
|
378
439
|
def __apply_manifest(self, manifest: str) -> int:
|
|
379
440
|
task = "Applying Kueue Custom Resources"
|
|
380
|
-
if is_dry_run():
|
|
381
|
-
xpk_print(f"Applying following Kueue resources:{manifest}")
|
|
382
441
|
tmp_file = write_tmp_file(manifest)
|
|
383
442
|
command = f"kubectl apply -f {tmp_file}"
|
|
384
443
|
return run_command_with_updates(command, task)
|
|
@@ -422,13 +481,114 @@ class KueueManager:
|
|
|
422
481
|
xpk_print(f"{task} returned ERROR {return_code}")
|
|
423
482
|
return return_code
|
|
424
483
|
|
|
484
|
+
def __autocorrect_resource_limits(
|
|
485
|
+
self, kueue_config: KueueConfig
|
|
486
|
+
) -> tuple[int, str]:
|
|
487
|
+
"""Verify specified CPU and memory limits against machine type."""
|
|
488
|
+
|
|
489
|
+
cpu_limit = kueue_config.cpu_limit
|
|
490
|
+
memory_limit_str = kueue_config.memory_limit
|
|
491
|
+
if not cpu_limit and not memory_limit_str:
|
|
492
|
+
return cpu_limit, memory_limit_str
|
|
493
|
+
|
|
494
|
+
# Get CPU and memory capacity from machine type
|
|
495
|
+
command = (
|
|
496
|
+
"gcloud compute machine-types describe"
|
|
497
|
+
f" {kueue_config.system.gce_machine_type} "
|
|
498
|
+
f" --project={self.project} --zone={self.zone}"
|
|
499
|
+
" --format='value(guestCpus,memoryMb)'"
|
|
500
|
+
)
|
|
501
|
+
return_code, out = run_command_for_value(
|
|
502
|
+
command,
|
|
503
|
+
"Get vCPU and memory capacity for machine type",
|
|
504
|
+
dry_run_return_val="10 10",
|
|
505
|
+
)
|
|
506
|
+
if return_code != 0:
|
|
507
|
+
xpk_print(
|
|
508
|
+
"Unable to verify vCPU and memory capacity for machine type."
|
|
509
|
+
" XPK will proceed with using user-defined limits."
|
|
510
|
+
)
|
|
511
|
+
return cpu_limit, memory_limit_str
|
|
512
|
+
|
|
513
|
+
cpu_capacity_str, memory_capacity_MB_str = out.split()
|
|
514
|
+
if cpu_limit:
|
|
515
|
+
cpu_limit = _autocorrect_cpu_limit(cpu_limit, int(cpu_capacity_str))
|
|
516
|
+
if memory_limit_str:
|
|
517
|
+
memory_limit_str = _autocorrect_memory_limit(
|
|
518
|
+
memory_limit_str, memory_capacity_MB_str
|
|
519
|
+
)
|
|
520
|
+
return cpu_limit, memory_limit_str
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def get_installed_kueue_version(
|
|
524
|
+
dry_run_version: Version | None = None,
|
|
525
|
+
) -> tuple[int, Version | None]:
|
|
526
|
+
command = (
|
|
527
|
+
"kubectl get deployment kueue-controller-manager -n kueue-system -o"
|
|
528
|
+
" jsonpath='{.spec.template.spec.containers[0].image}'"
|
|
529
|
+
)
|
|
530
|
+
task = "Get kueue version on server"
|
|
531
|
+
return_code, val = run_command_for_value(
|
|
532
|
+
command,
|
|
533
|
+
task,
|
|
534
|
+
dry_run_return_val=(
|
|
535
|
+
f"registry.k8s.io/kueue/kueue:v{dry_run_version}"
|
|
536
|
+
if dry_run_version
|
|
537
|
+
else ""
|
|
538
|
+
),
|
|
539
|
+
)
|
|
540
|
+
if return_code != 0:
|
|
541
|
+
return return_code, None
|
|
542
|
+
version_tag = val.split(":")
|
|
543
|
+
if len(version_tag) == 1:
|
|
544
|
+
return 1, None
|
|
545
|
+
return return_code, Version(version_tag[-1])
|
|
546
|
+
|
|
425
547
|
|
|
426
548
|
def has_sub_slicing_enabled() -> tuple[int, bool | None]:
|
|
427
549
|
return_code, value = run_command_for_value(
|
|
428
|
-
command="kubectl get topology",
|
|
550
|
+
command="kubectl get topology",
|
|
551
|
+
task="Get defined topologies",
|
|
552
|
+
dry_run_return_val=SUB_SLICE_TOPOLOGY_NAME,
|
|
429
553
|
)
|
|
430
554
|
|
|
431
555
|
if return_code != 0:
|
|
432
556
|
return return_code, None
|
|
433
557
|
|
|
434
558
|
return return_code, SUB_SLICE_TOPOLOGY_NAME in value
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def _autocorrect_cpu_limit(cpu_limit: int, cpu_capacity: int) -> int:
|
|
562
|
+
if cpu_limit > cpu_capacity:
|
|
563
|
+
xpk_print(
|
|
564
|
+
"The CPU limit is above the available capacity."
|
|
565
|
+
f" We will set CPU limit to {cpu_capacity}."
|
|
566
|
+
)
|
|
567
|
+
elif cpu_limit < cpu_capacity:
|
|
568
|
+
xpk_print(
|
|
569
|
+
"The CPU limit is below the available capacity, which would lead"
|
|
570
|
+
f" to underutilization. We will set CPU limit to {cpu_capacity}."
|
|
571
|
+
)
|
|
572
|
+
return cpu_capacity
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def _autocorrect_memory_limit(
|
|
576
|
+
memory_limit_str: str, memory_capacity_MB_str: str
|
|
577
|
+
) -> str:
|
|
578
|
+
memory_limit_bytes = parse_quantity(memory_limit_str)
|
|
579
|
+
memory_capacity_bytes = int(memory_capacity_MB_str) << 20
|
|
580
|
+
if memory_limit_bytes == memory_capacity_bytes:
|
|
581
|
+
return memory_limit_str
|
|
582
|
+
memory_limit_str = memory_capacity_MB_str + "Mi"
|
|
583
|
+
if memory_limit_bytes > memory_capacity_bytes:
|
|
584
|
+
xpk_print(
|
|
585
|
+
"The memory limit is above the available capacity. We will set"
|
|
586
|
+
f" memory limit to {memory_limit_str}."
|
|
587
|
+
)
|
|
588
|
+
else:
|
|
589
|
+
xpk_print(
|
|
590
|
+
"The memory limit is below the available capacity, which would"
|
|
591
|
+
" lead to underutilization. We will set the memory limit to"
|
|
592
|
+
f" {memory_limit_str}."
|
|
593
|
+
)
|
|
594
|
+
return memory_limit_str
|