xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +17 -10
- xpk/commands/cluster.py +137 -123
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +13 -27
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +22 -11
- xpk/commands/job.py +53 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +26 -2
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +58 -30
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +96 -195
- xpk/core/cluster_private.py +9 -12
- xpk/core/commands.py +21 -25
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +17 -9
- xpk/core/docker_resources.py +9 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +5 -8
- xpk/core/kjob.py +19 -29
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +11 -16
- xpk/core/network.py +18 -19
- xpk/core/nodepool.py +65 -71
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +9 -5
- xpk/core/ray.py +11 -15
- xpk/core/resources.py +15 -10
- xpk/core/scheduling.py +23 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +3 -2
- xpk/parser/cluster.py +50 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/execution_context.py +28 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/kueue.py +20 -0
- xpk/utils/network.py +4 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -545
- xpk-0.12.0.dist-info/RECORD +0 -100
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import math
|
|
18
|
+
import textwrap
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import Optional, List, Dict, Any
|
|
21
|
+
import json
|
|
22
|
+
from jinja2 import Environment, FileSystemLoader
|
|
23
|
+
from ..utils.execution_context import is_dry_run
|
|
24
|
+
from ..utils.kueue import is_queued_cluster
|
|
25
|
+
|
|
26
|
+
from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
|
|
27
|
+
from .scheduling import (
|
|
28
|
+
create_accelerator_label,
|
|
29
|
+
create_machine_label,
|
|
30
|
+
)
|
|
31
|
+
from .system_characteristics import (
|
|
32
|
+
AcceleratorTypeToAcceleratorCharacteristics,
|
|
33
|
+
SystemCharacteristics,
|
|
34
|
+
)
|
|
35
|
+
from ..core.commands import (
|
|
36
|
+
run_command_for_value,
|
|
37
|
+
run_command_with_updates,
|
|
38
|
+
run_command_with_updates_retry,
|
|
39
|
+
)
|
|
40
|
+
from ..utils.file import write_tmp_file
|
|
41
|
+
from ..utils.console import xpk_print, xpk_exit
|
|
42
|
+
from ..utils.templates import TEMPLATE_PATH
|
|
43
|
+
|
|
44
|
+
# Value passed as --timeout to `kubectl wait` while waiting for the
# kueue-controller-manager deployment to become available.
WAIT_FOR_KUEUE_TIMEOUT = "10m"
# Queue names handed to the Kueue config template as cluster_queue_name and
# local_queue_name.
CLUSTER_QUEUE_NAME = "cluster-queue"
LOCAL_QUEUE_NAME = "multislice-queue"
# Jinja2 template file names, resolved relative to the manager's template path.
KUEUE_CONFIG_JINJA_FILE = "kueue_config.yaml.j2"
KUEUE_TOPOLOGY_JINJA_FILE = "kueue_topology.yaml.j2"
# NOTE(review): not referenced anywhere in this module; presumably consumed by
# other modules -- confirm before removing.
KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
# Multiplier applied to the cluster node count to size the controller manager
# memory limit; the result is expressed with a "Mi" suffix.
MEMORY_SIZE_PER_VM = 1.2
# Lower bound for that memory limit, in the same "Mi" unit.
MIN_MEMORY_LIMIT_SIZE = 4096
# Default Kueue release tag installed by KueueManager; also used to build the
# release manifest download URL.
KUEUE_VERSION = "v0.14.1"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class KueueConfig:
  """Inputs KueueManager uses to render and apply the Kueue configuration."""

  # System whose device_type / accelerator_type drive the flavor name, node
  # labels, managed resource and topology handling.
  system: SystemCharacteristics
  # nominalQuota of the managed accelerator resource.
  total_chips: int
  # nominalQuota for "cpu"; a falsy value leaves cpu out of the covered
  # resources.
  cpu_limit: int
  # nominalQuota for "memory"; a falsy value leaves memory out of the covered
  # resources.
  memory_limit: str
  # When True, an extra "cpu-user" flavor and resource group are added.
  is_pathways_cluster: bool = False
  # Forwarded to create_machine_label and to the template context.
  autoprovisioning_enabled: bool = False
  # Together with is_queued_cluster(num_slices), enables the "dws-prov"
  # admission check.
  flex: bool = False
  # Prefix of the main flavor name ("{num_slices}x{device_type}").
  num_slices: int = 1
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class KueueManager:
  """Manages the installation and configuration of Kueue on an XPK cluster.

  Every step shells out to kubectl through the command helpers imported by
  this module and reports success as a return code of 0.
  """

  # Device types that get the Kueue topology manifest prepended and a
  # "gke-default" topologyName on their resource flavor.
  _TOPOLOGY_DEVICE_TYPES = (
      H100_MEGA_DEVICE_TYPE,
      H200_DEVICE_TYPE,
      B200_DEVICE_TYPE,
  )

  def __init__(
      self,
      kueue_version: str = KUEUE_VERSION,
      template_path=TEMPLATE_PATH,
  ):
    """Stores the target Kueue version and sets up the template environment.

    Args:
      kueue_version: Kueue release tag to install (for example "v0.14.1").
      template_path: directory the Jinja2 templates are loaded from.
    """
    self.kueue_version = kueue_version
    self.template_env = Environment(loader=FileSystemLoader(template_path))

  @staticmethod
  def _parse_version(version: str) -> Optional[tuple[int, ...]]:
    """Turns a tag such as "v0.14.1" into a tuple of ints, e.g. (0, 14, 1).

    A leading "v" and any pre-release / build suffix ("-rc.1", "+meta") are
    ignored.

    Returns:
      The numeric components, or None when the tag is not a dotted sequence
      of numbers.
    """
    core = version.strip()
    if core[:1] in ("v", "V"):
      core = core[1:]
    for separator in ("-", "+"):
      core = core.split(separator, 1)[0]
    pieces = core.split(".")
    if not all(piece.isascii() and piece.isdigit() for piece in pieces):
      return None
    return tuple(int(piece) for piece in pieces)

  @staticmethod
  def _is_newer_version(installed: str, target: str) -> bool:
    """Returns True when `installed` is a strictly newer release than `target`.

    Versions are compared numerically, component by component. Comparing the
    raw strings would order "v0.9.0" after "v0.14.1" because "9" > "1".
    """
    installed_key = KueueManager._parse_version(installed)
    target_key = KueueManager._parse_version(target)
    if installed_key is None or target_key is None:
      # Unparseable tag: keep the historical plain string comparison.
      return installed > target
    return installed_key > target_key

  def install_or_upgrade(
      self,
      kueue_config: KueueConfig,
      tolerations: Optional[List[Dict[str, Any]]] = None,
  ) -> int:
    """
    Ensures the correct version of Kueue is installed. Upgrades if the installed
    version is older or non-existent.

    Installation is skipped only when the cluster already runs a strictly
    newer version; an equal version is re-applied and re-configured.

    Args:
      kueue_config: The KueueConfig object containing all configuration parameters.
      tolerations: An optional list of tolerations to apply to the kueue-controller-manager.

    Returns:
      0 if successful, otherwise the non-zero return code of the failing step.
    """
    return_code, installed_version = self.__get_installed_kueue_version()

    if return_code == 0:
      if installed_version and self._is_newer_version(
          installed_version, self.kueue_version
      ):
        xpk_print(
            f"Cluster has a newer Kueue version, {installed_version}. Skipping"
            " installation."
        )
        return 0
      else:
        xpk_print(f"Upgrading Kueue to version {self.kueue_version}...")
    else:
      xpk_print(f"Installing Kueue version {self.kueue_version}...")

    install_return_code = self.__install(tolerations)
    if install_return_code != 0:
      return install_return_code

    return self.__configure(kueue_config)

  def __get_installed_kueue_version(self) -> tuple[int, str | None]:
    """Reads the Kueue version from the controller manager's container image.

    Returns:
      (0, tag) when the deployment exists and its image carries a tag;
      (non-zero code, None) when the lookup fails or the image has no ":".
    """
    command = (
        "kubectl get deployment kueue-controller-manager -n kueue-system -o"
        " jsonpath='{.spec.template.spec.containers[0].image}'"
    )
    task = "Get kueue version on server"
    # NOTE(review): the dry-run placeholder carries no ":" so the tag parse
    # below reports "not installed" in dry runs -- confirm this is intended.
    return_code, val = run_command_for_value(
        command,
        task,
        dry_run_return_val="""
        v0.14.1""",
    )
    if return_code != 0:
      return return_code, None
    version_tag = val.split(":")
    if len(version_tag) == 1:
      # No "image:tag" separator, so no version can be extracted.
      return 1, None
    return return_code, version_tag[-1]

  def __install(
      self,
      tolerations: Optional[List[Dict[str, Any]]] = None,
  ) -> int:
    """
    Installs Kueue from the official manifest and then applies any necessary patches.

    Args:
      tolerations: An optional list of tolerations to apply to the kueue-controller-manager.

    Returns:
      0 if successful, otherwise the non-zero return code of the failing step.
    """
    return_code = self.__install_kueue_crs()
    if return_code != 0:
      return return_code

    if tolerations:
      return_code = self.__patch_tolerations(tolerations)
      if return_code != 0:
        return return_code

    return self.__wait_for_kueue_available()

  def __install_kueue_crs(self) -> int:
    """Applies the release manifest of the configured Kueue version.

    Returns:
      The return code of the kubectl apply command (0 on success).
    """
    manifest_url = (
        "https://github.com/kubernetes-sigs/kueue/releases/download/"
        f"{self.kueue_version}/manifests.yaml"
    )
    install_command = (
        f"kubectl apply --server-side --force-conflicts -f {manifest_url}"
    )
    # `task` only labels the error message; the command itself runs under the
    # shorter "Install Kueue" label.
    task = "Installing Kueue Custom Resources"
    return_code = run_command_with_updates_retry(
        install_command, "Install Kueue"
    )
    if return_code != 0:
      xpk_print(f"{task} returned ERROR {return_code}")
    return return_code

  def __patch_tolerations(self, tolerations: List[Dict[str, Any]]) -> int:
    """Sets the pod tolerations of the kueue-controller-manager deployment.

    Args:
      tolerations: toleration dicts serialized as JSON into a strategic patch.

    Returns:
      The return code of the kubectl patch command (0 on success).
    """
    patch = {"spec": {"template": {"spec": {"tolerations": tolerations}}}}
    patch_str = json.dumps(patch)
    patch_command = (
        "kubectl patch deployment kueue-controller-manager -n kueue-system"
        f" --type='strategic' --patch='{patch_str}'"
    )
    task = "Patch Kueue Tolerations"
    return_code = run_command_with_updates_retry(patch_command, task)
    if return_code != 0:
      xpk_print(f"{task} returned ERROR {return_code}")
    return return_code

  def __wait_for_kueue_available(self) -> int:
    """Wait for Kueue to be fully available.

    Blocks on `kubectl wait` until the kueue-controller-manager deployment
    reports the "available" condition or WAIT_FOR_KUEUE_TIMEOUT elapses.

    Returns:
      The return code of the kubectl wait command (0 on success).
    """
    command = (
        "kubectl wait deploy/kueue-controller-manager -nkueue-system"
        f" --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}"
    )
    task = "Wait for Kueue to be available"
    return_code = run_command_with_updates(command, task)
    if return_code != 0:
      xpk_print(f"{task} returned ERROR {return_code}")
    return return_code

  def __configure(
      self,
      kueue_config: KueueConfig,
  ) -> int:
    """
    Configures Kueue with opinionated defaults for XPK.

    Args:
      kueue_config: The KueueConfig object containing all configuration parameters.
    Returns:
      0 if successful, otherwise the non-zero return code of the failing step.
    """
    template = self.template_env.get_template(KUEUE_CONFIG_JINJA_FILE)

    # The manager builds the context internally based on its opinionated logic
    context = self.__build_template_context(
        system=kueue_config.system,
        total_chips=kueue_config.total_chips,
        is_pathways=kueue_config.is_pathways_cluster,
        autoprovisioning=kueue_config.autoprovisioning_enabled,
        flex=kueue_config.flex,
        num_slices=kueue_config.num_slices,
        cpu_limit=kueue_config.cpu_limit,
        memory_limit=kueue_config.memory_limit,
    )

    rendered_manifest = template.render(context)

    if kueue_config.system.device_type in self._TOPOLOGY_DEVICE_TYPES:
      # The topology manifest goes first, ahead of the config that names it.
      topology_yaml = self.template_env.get_template(KUEUE_TOPOLOGY_JINJA_FILE)
      rendered_manifest = topology_yaml.render() + rendered_manifest

    return_code = self.__apply_manifest(rendered_manifest)
    if return_code != 0:
      return return_code

    return self.__update_kueue_resources_if_necessary()

  def __build_template_context(
      self,
      system: SystemCharacteristics,
      total_chips: int,
      is_pathways: bool,
      autoprovisioning: bool,
      flex: bool,
      num_slices: int,
      cpu_limit: int,
      memory_limit: str,
  ) -> Dict[str, Any]:
    """Prepares the context for the Jinja2 template.

    Args:
      system: system whose device/accelerator type define the main flavor.
      total_chips: nominal quota of the managed accelerator resource.
      is_pathways: adds the "cpu-user" flavor and resource group when True.
      autoprovisioning: forwarded to the machine label and to the context.
      flex: with a queued cluster, enables the "dws-prov" admission check.
      num_slices: prefix of the main flavor name.
      cpu_limit: nominal cpu quota; omitted from the flavor when falsy.
      memory_limit: nominal memory quota; omitted from the flavor when falsy.

    Returns:
      The mapping rendered into the Kueue config template.
    """
    # Main accelerator flavor
    device_type_str = system.device_type.replace("_", "-")
    main_flavor_name = f"{num_slices}x{device_type_str}"

    # Labels arrive as "key: value" strings; empty labels are skipped.
    node_labels_dict = {}
    accelerator_label = create_accelerator_label(
        system.accelerator_type, system
    )
    if accelerator_label:
      key, value = accelerator_label.split(":", 1)
      node_labels_dict[key] = value.strip()

    machine_label = create_machine_label(
        system.accelerator_type, system, autoprovisioning
    )
    if machine_label:
      key, value = machine_label.split(":", 1)
      node_labels_dict[key] = value.strip()

    topology_label = ""
    if system.device_type in self._TOPOLOGY_DEVICE_TYPES:
      topology_label = 'topologyName: "gke-default"'

    flavors = [{
        "name": main_flavor_name,
        "nodeLabels": node_labels_dict,
        "topologyLabel": topology_label,
    }]

    managed_resource = AcceleratorTypeToAcceleratorCharacteristics[
        system.accelerator_type
    ].resource_type

    covered_resources = [managed_resource]
    resources = [{"name": managed_resource, "nominalQuota": total_chips}]

    if cpu_limit:
      covered_resources.append("cpu")
      resources.append({"name": "cpu", "nominalQuota": cpu_limit})
    if memory_limit:
      covered_resources.append("memory")
      resources.append({"name": "memory", "nominalQuota": memory_limit})

    resource_groups = [{
        "coveredResources": covered_resources,
        "flavors": [{"name": main_flavor_name, "resources": resources}],
    }]

    # Add Pathway-specific resources if needed
    if is_pathways:
      flavors.append({
          "name": "cpu-user",
          "nodeLabels": {"cloud.google.com/gke-nodepool": "cpu-np"},
      })
      resource_groups.append({
          "coveredResources": ["cpu", "memory"],
          "flavors": [{
              "name": "cpu-user",
              "resources": [
                  {"name": "cpu", "nominalQuota": 480},
                  {"name": "memory", "nominalQuota": "2000G"},
              ],
          }],
      })

    if flex and is_queued_cluster(num_slices):
      admission_checks = textwrap.dedent("""
        admissionChecks:
        - dws-prov
        """)
    else:
      admission_checks = ""

    return {
        "flavors": flavors,
        "resource_groups": resource_groups,
        "autoprovisioning_enabled": autoprovisioning,
        "managed_resource": managed_resource,
        "cluster_queue_name": CLUSTER_QUEUE_NAME,
        "local_queue_name": LOCAL_QUEUE_NAME,
        "admission_checks": admission_checks,
    }

  def __apply_manifest(self, manifest: str) -> int:
    """Writes the manifest to a temporary file and applies it with kubectl.

    In dry-run mode the manifest is also printed.

    Returns:
      The return code of the kubectl apply command (0 on success).
    """
    task = "Applying Kueue Custom Resources"
    if is_dry_run():
      xpk_print(f"Applying following Kueue resources:{manifest}")
    tmp_file = write_tmp_file(manifest)
    command = f"kubectl apply -f {tmp_file}"
    return run_command_with_updates(command, task)

  def __update_kueue_resources_if_necessary(self) -> int:
    """Patch memory size limit if necessary.

    Sizes the memory limit of the "manager" container from the current node
    count. Exits through xpk_exit(1) when the node count cannot be obtained.

    Returns:
      The return code of the kubectl patch command (0 on success).
    """
    # Get total number of nodes
    cmd_total_node_num = "kubectl get node --no-headers | wc -l"
    return_code, out = run_command_for_value(
        cmd_total_node_num, "Count total nodes"
    )
    if return_code != 0:
      xpk_exit(1)
    # 1.2MiB per VM or 4GiB (whichever is greater).
    memory_mib = max(
        math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE
    )
    new_memory_limit = f"{memory_mib}Mi"
    patch = {
        "spec": {
            "template": {
                "spec": {
                    "containers": [{
                        "name": "manager",
                        "resources": {"limits": {"memory": new_memory_limit}},
                    }]
                }
            }
        }
    }
    patch_str = json.dumps(patch)
    patch_command = (
        "kubectl patch deployment kueue-controller-manager -n kueue-system"
        f" --type='strategic' --patch='{patch_str}'"
    )
    task = "Updating Kueue Controller Manager resources"
    return_code = run_command_with_updates_retry(
        patch_command,
        task,
    )
    if return_code != 0:
      xpk_print(f"{task} returned ERROR {return_code}")
    return return_code
|