xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- integration/README.md +19 -0
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +89 -32
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +16 -3
- xpk/commands/cluster_test.py +353 -7
- xpk/commands/config.py +3 -5
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +3 -1
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +143 -142
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +73 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +48 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +55 -104
- xpk/core/cluster_test.py +170 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +88 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +17 -19
- xpk/core/kueue_manager.py +205 -51
- xpk/core/kueue_manager_test.py +158 -4
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +37 -43
- xpk/core/nodepool_test.py +42 -19
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +144 -133
- xpk/core/scheduling_test.py +298 -6
- xpk/core/system_characteristics.py +256 -19
- xpk/core/system_characteristics_test.py +128 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +33 -13
- xpk/parser/cluster.py +40 -67
- xpk/parser/cluster_test.py +83 -3
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -29
- xpk/parser/workload_test.py +3 -49
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +10 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +2 -13
- xpk/utils/versions.py +31 -0
- xpk-0.16.0.dist-info/METADATA +127 -0
- xpk-0.16.0.dist-info/RECORD +168 -0
- xpk-0.14.4.dist-info/METADATA +0 -1645
- xpk-0.14.4.dist-info/RECORD +0 -139
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/core/kueue_manager.py
CHANGED
@@ -20,17 +20,17 @@ from dataclasses import dataclass
 from typing import Optional, List, Dict, Any
 import json
 from jinja2 import Environment, FileSystemLoader
-from ..utils.execution_context import is_dry_run
-from ..utils.kueue import is_queued_cluster

-from .
-from .
-
-    create_machine_label,
-)
+from ..utils.topology import get_slice_topology_level, get_topology_product, is_topology_contained
+from ..utils.kueue import is_queued_cluster
+from kubernetes.utils import parse_quantity
 from .system_characteristics import (
+    SUB_SLICING_TOPOLOGIES,
+    AcceleratorType,
     AcceleratorTypeToAcceleratorCharacteristics,
     SystemCharacteristics,
+    create_accelerator_label,
+    create_machine_label,
 )
 from ..core.commands import (
     run_command_for_value,
@@ -38,10 +38,12 @@ from ..core.commands import (
     run_command_with_updates_retry,
 )
 from ..utils.file import write_tmp_file
-from ..utils.console import xpk_print, xpk_exit
+from ..utils.console import xpk_print, xpk_exit, ask_for_user_consent
 from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
 from packaging.version import Version

+KUEUE_VERSION = Version("v0.14.3")
+LATEST_BREAKING_VERSION = Version("v0.14.0")
 WAIT_FOR_KUEUE_TIMEOUT = "10m"
 CLUSTER_QUEUE_NAME = "cluster-queue"
 LOCAL_QUEUE_NAME = "multislice-queue"
@@ -52,10 +54,9 @@ KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
 KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2"
 MEMORY_SIZE_PER_VM = 1.2
 MIN_MEMORY_LIMIT_SIZE = 4096
-KUEUE_VERSION = Version("v0.12.2")


-@dataclass
+@dataclass(frozen=True)
 class KueueConfig:
   system: SystemCharacteristics
   total_chips: int
@@ -68,7 +69,7 @@ class KueueConfig:
   num_slices: int = 1


-@dataclass
+@dataclass(frozen=True)
 class _NameAndYaml:
   name: str
   yaml: str
@@ -79,9 +80,13 @@ class KueueManager:

   def __init__(
       self,
+      project: str,
+      zone: str,
       kueue_version: Version = KUEUE_VERSION,
       template_path=TEMPLATE_PATH,
   ):
+    self.project = project
+    self.zone = zone
     self.kueue_version = kueue_version

     self.template_env = Environment(
@@ -102,10 +107,10 @@ class KueueManager:
     Args:
       tolerations: An optional list of tolerations to apply to the kueue-controller-manager.
     """
-    return_code, installed_version =
+    return_code, installed_version = get_installed_kueue_version()

-    if return_code == 0:
-      if installed_version
+    if return_code == 0 and installed_version:
+      if installed_version > self.kueue_version:
         xpk_print(
             f"Cluster has a newer Kueue version, {installed_version}. Skipping"
             " installation."
@@ -113,6 +118,10 @@ class KueueManager:
         return 0
       else:
         xpk_print(f"Upgrading Kueue to version v{self.kueue_version}...")
+        assert installed_version
+        prepare_code = self.__prepare_for_upgrade(installed_version)
+        if prepare_code != 0:
+          return prepare_code
     else:
       xpk_print(f"Installing Kueue version v{self.kueue_version}...")

@@ -122,24 +131,6 @@

     return self.__configure(kueue_config)

-  def get_installed_kueue_version(self) -> tuple[int, Version | None]:
-    command = (
-        "kubectl get deployment kueue-controller-manager -n kueue-system -o"
-        " jsonpath='{.spec.template.spec.containers[0].image}'"
-    )
-    task = "Get kueue version on server"
-    return_code, val = run_command_for_value(
-        command,
-        task,
-        dry_run_return_val="",
-    )
-    if return_code != 0:
-      return return_code, None
-    version_tag = val.split(":")
-    if len(version_tag) == 1:
-      return 1, None
-    return return_code, Version(version_tag[-1])
-
   def __install(
       self,
       tolerations: Optional[List[Dict[str, Any]]] = None,
@@ -161,6 +152,60 @@

     return self.__wait_for_kueue_available()

+  def __prepare_for_upgrade(self, installed_version: Version) -> int:
+    if installed_version >= LATEST_BREAKING_VERSION:
+      return 0
+
+    xpk_print(
+        f"Currently installed Kueue version v{installed_version} is"
+        f" incompatible with the newer v{self.kueue_version}."
+    )
+
+    changelog_link = f"https://github.com/kubernetes-sigs/kueue/blob/main/CHANGELOG/CHANGELOG-{self.kueue_version.major}.{self.kueue_version.minor}.md"
+    agreed = ask_for_user_consent(
+        "Do you want to allow XPK to update Kueue automatically? This will"
+        " delete all existing Kueue resources and create new ones. If you"
+        " decline, you will need to upgrade the Kueue manually (see"
+        f" {changelog_link} for help)."
+    )
+    if not agreed:
+      return 1
+
+    return self.__delete_all_kueue_resources()
+
+  def __delete_all_kueue_resources(self) -> int:
+    return_code, kueue_crds_string = run_command_for_value(
+        "kubectl get crd -o name | grep .kueue.x-k8s.io", "Get Kueue CRDs"
+    )
+    if return_code != 0:
+      return return_code
+
+    kueue_crds = [
+        line.strip().removeprefix(
+            "customresourcedefinition.apiextensions.k8s.io/"
+        )
+        for line in kueue_crds_string.strip().split("\n")
+    ]
+
+    for crd in kueue_crds:
+      return_code = run_command_with_updates(
+          f"kubectl delete {crd} --all", f"Delete all resources of type {crd}"
+      )
+      if return_code != 0:
+        return return_code
+
+    for crd in kueue_crds:
+      return_code = run_command_with_updates(
+          f"kubectl delete crd {crd}", f"Delete CRD {crd}"
+      )
+      if return_code != 0:
+        return return_code
+
+    return run_command_with_updates(
+        "kubectl delete deployment kueue-controller-manager -n kueue-system",
+        "Delete Kueue Controller Manager deployment",
+    )
+
   def __install_kueue_crs(self) -> int:
     manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/v{self.kueue_version}/manifests.yaml"
     install_command = (
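The upgrade path added above is gated on packaging.version comparisons: a cluster whose installed Kueue release predates LATEST_BREAKING_VERSION must be wiped (with user consent) before the new manifests are applied, while anything at or above it upgrades in place. A minimal standalone sketch of that gate, using the two constants from this diff:

from packaging.version import Version

KUEUE_VERSION = Version("v0.14.3")
LATEST_BREAKING_VERSION = Version("v0.14.0")

def needs_destructive_upgrade(installed: Version) -> bool:
    # At or above the breaking release: safe to upgrade in place.
    # Below it: existing Kueue CRs, CRDs, and the controller are deleted first.
    return installed < LATEST_BREAKING_VERSION

assert needs_destructive_upgrade(Version("v0.11.0"))       # consent prompt + CRD wipe
assert not needs_destructive_upgrade(Version("v0.14.0"))   # in-place upgrade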
@@ -228,6 +273,7 @@
     topology_name = (
         topology_name_and_yaml.name if topology_name_and_yaml else None
     )
+    cpu_limit, memory_limit = self.__autocorrect_resource_limits(kueue_config)

     # The manager builds the context internally based on its opinionated logic
     context = self.__build_template_context(
@@ -237,8 +283,8 @@
         autoprovisioning=kueue_config.autoprovisioning_enabled,
         flex=kueue_config.flex,
         num_slices=kueue_config.num_slices,
-        cpu_limit=
-        memory_limit=
+        cpu_limit=cpu_limit,
+        memory_limit=memory_limit,
         topology_name=topology_name,
     )

@@ -273,19 +319,16 @@
     main_flavor_name = f"{num_slices}x{device_type_str}"

     node_labels_dict = {}
-    accelerator_label = create_accelerator_label(
-        system.accelerator_type, system
-    )
+    accelerator_label = create_accelerator_label(system)
     if accelerator_label:
       key, value = accelerator_label.split(":", 1)
       node_labels_dict[key] = value.strip()

-
-
-
-
-
-        node_labels_dict[key] = value.strip()
+    if not autoprovisioning:
+      machine_label = create_machine_label(system)
+      if machine_label:
+        key, value = machine_label.split(":", 1)
+        node_labels_dict[key] = value.strip()

     topology_label = f"topologyName: {topology_name}" if topology_name else ""

@@ -352,11 +395,10 @@
   def __get_topology_name_and_yaml(
       self, system: SystemCharacteristics, configure_sub_slicing: bool
   ) -> _NameAndYaml | None:
-    if
-
-
-
-    ]:
+    if (
+        system.accelerator_type == AcceleratorType["GPU"]
+        and system.gpu_requires_topology
+    ):
      return _NameAndYaml(
          name="gke-default",
          yaml=self.template_env.get_template(
@@ -364,12 +406,25 @@
          ).render(),
      )
     elif configure_sub_slicing:
+      sorted_topologies = sorted(
+          SUB_SLICING_TOPOLOGIES, key=get_topology_product, reverse=True
+      )
+      levels = [
+          get_slice_topology_level(topology)
+          for topology in sorted_topologies
+          if is_topology_contained(
+              contained=topology, container=system.topology
+          )
+      ]
+      levels.append("kubernetes.io/hostname")
+
       return _NameAndYaml(
           name=SUB_SLICE_TOPOLOGY_NAME,
           yaml=self.template_env.get_template(
               KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE
           ).render({
               "sub_slice_topology_name": SUB_SLICE_TOPOLOGY_NAME,
+              "levels": levels,
           }),
       )
     else:
@@ -377,8 +432,6 @@

   def __apply_manifest(self, manifest: str) -> int:
     task = "Applying Kueue Custom Resources"
-    if is_dry_run():
-      xpk_print(f"Applying following Kueue resources:{manifest}")
     tmp_file = write_tmp_file(manifest)
     command = f"kubectl apply -f {tmp_file}"
     return run_command_with_updates(command, task)
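The sub-slicing branch above derives the Kueue Topology levels by keeping only the sub-slice shapes contained in the system's own topology, ordered from largest to smallest chip count, with the hostname level appended last. The real helpers live in xpk/utils/topology.py and are not part of this diff; a hypothetical stand-in for the ordering step, assuming "RxC"-style topology strings and sample values:

import math

def get_topology_product(topology: str) -> int:
    # Illustrative re-implementation only: "4x8" -> 32 chips.
    return math.prod(int(dim) for dim in topology.split("x"))

sample_topologies = ["2x4", "4x4", "4x8", "8x8"]  # assumed sample values

# Mirrors sorted(SUB_SLICING_TOPOLOGIES, key=get_topology_product, reverse=True)
ordered = sorted(sample_topologies, key=get_topology_product, reverse=True)
print(ordered)  # ['8x8', '4x8', '4x4', '2x4'] -- the level order the new test asserts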
@@ -422,13 +475,114 @@
       xpk_print(f"{task} returned ERROR {return_code}")
       return return_code

+  def __autocorrect_resource_limits(
+      self, kueue_config: KueueConfig
+  ) -> tuple[int, str]:
+    """Verify specified CPU and memory limits against machine type."""
+
+    cpu_limit = kueue_config.cpu_limit
+    memory_limit_str = kueue_config.memory_limit
+    if not cpu_limit and not memory_limit_str:
+      return cpu_limit, memory_limit_str
+
+    # Get CPU and memory capacity from machine type
+    command = (
+        "gcloud compute machine-types describe"
+        f" {kueue_config.system.gce_machine_type} "
+        f" --project={self.project} --zone={self.zone}"
+        " --format='value(guestCpus,memoryMb)'"
+    )
+    return_code, out = run_command_for_value(
+        command,
+        "Get vCPU and memory capacity for machine type",
+        dry_run_return_val="10 10",
+    )
+    if return_code != 0:
+      xpk_print(
+          "Unable to verify vCPU and memory capacity for machine type."
+          " XPK will proceed with using user-defined limits."
+      )
+      return cpu_limit, memory_limit_str
+
+    cpu_capacity_str, memory_capacity_MB_str = out.split()
+    if cpu_limit:
+      cpu_limit = _autocorrect_cpu_limit(cpu_limit, int(cpu_capacity_str))
+    if memory_limit_str:
+      memory_limit_str = _autocorrect_memory_limit(
+          memory_limit_str, memory_capacity_MB_str
+      )
+    return cpu_limit, memory_limit_str
+
+
+def get_installed_kueue_version(
+    dry_run_version: Version | None = None,
+) -> tuple[int, Version | None]:
+  command = (
+      "kubectl get deployment kueue-controller-manager -n kueue-system -o"
+      " jsonpath='{.spec.template.spec.containers[0].image}'"
+  )
+  task = "Get kueue version on server"
+  return_code, val = run_command_for_value(
+      command,
+      task,
+      dry_run_return_val=(
+          f"registry.k8s.io/kueue/kueue:v{dry_run_version}"
+          if dry_run_version
+          else ""
+      ),
+  )
+  if return_code != 0:
+    return return_code, None
+  version_tag = val.split(":")
+  if len(version_tag) == 1:
+    return 1, None
+  return return_code, Version(version_tag[-1])
+

 def has_sub_slicing_enabled() -> tuple[int, bool | None]:
   return_code, value = run_command_for_value(
-      command="kubectl get topology",
+      command="kubectl get topology",
+      task="Get defined topologies",
+      dry_run_return_val=SUB_SLICE_TOPOLOGY_NAME,
   )

   if return_code != 0:
     return return_code, None

   return return_code, SUB_SLICE_TOPOLOGY_NAME in value
+
+
+def _autocorrect_cpu_limit(cpu_limit: int, cpu_capacity: int) -> int:
+  if cpu_limit > cpu_capacity:
+    xpk_print(
+        "The CPU limit is above the available capacity."
+        f" We will set CPU limit to {cpu_capacity}."
+    )
+  elif cpu_limit < cpu_capacity:
+    xpk_print(
+        "The CPU limit is below the available capacity, which would lead"
+        f" to underutilization. We will set CPU limit to {cpu_capacity}."
+    )
+  return cpu_capacity
+
+
+def _autocorrect_memory_limit(
+    memory_limit_str: str, memory_capacity_MB_str: str
+) -> str:
+  memory_limit_bytes = parse_quantity(memory_limit_str)
+  memory_capacity_bytes = int(memory_capacity_MB_str) << 20
+  if memory_limit_bytes == memory_capacity_bytes:
+    return memory_limit_str
+  memory_limit_str = memory_capacity_MB_str + "Mi"
+  if memory_limit_bytes > memory_capacity_bytes:
+    xpk_print(
+        "The memory limit is above the available capacity. We will set"
+        f" memory limit to {memory_limit_str}."
+    )
+  else:
+    xpk_print(
+        "The memory limit is below the available capacity, which would"
+        " lead to underutilization. We will set the memory limit to"
+        f" {memory_limit_str}."
+    )
+  return memory_limit_str
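The memory comparison in _autocorrect_memory_limit normalizes both sides to bytes: parse_quantity (from the official kubernetes Python client) converts a quantity string such as "100Gi" into a byte count, while the machine type's memoryMb value is shifted left by 20 bits, i.e. treated as MiB and multiplied by 2^20. A worked check using the 50-vCPU / 204800-MiB machine from the new test:

from kubernetes.utils import parse_quantity

limit_bytes = parse_quantity("100Gi")  # 100 * 2**30 = 107_374_182_400 bytes
capacity_bytes = 204800 << 20          # 204800 MiB = 214_748_364_800 bytes
assert limit_bytes < capacity_bytes    # below capacity, so the limit is raised to "204800Mi"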
xpk/core/kueue_manager_test.py
CHANGED
@@ -22,7 +22,7 @@ import yaml
 from unittest.mock import MagicMock, patch

 from xpk.core.kueue_manager import KueueConfig, KueueManager, has_sub_slicing_enabled
-from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
+from xpk.core.system_characteristics import GpuConfig, DockerPlatform, AcceleratorType, SystemCharacteristics, UserFacingNameToSystemCharacteristics
 from xpk.core.testing.commands_tester import CommandsTester
 from packaging.version import Version

@@ -35,6 +35,7 @@ TPU_SYSTEM: SystemCharacteristics = SystemCharacteristics(
     accelerator_type=AcceleratorType.TPU,
     device_type="v5p-8",
     supports_sub_slicing=False,
+    docker_platform=DockerPlatform.ARM,
 )

 KUEUE_CONFIG: KueueConfig = KueueConfig(
@@ -61,6 +62,13 @@ def set_installed_kueue_version(
   )


+@pytest.fixture(autouse=True)
+def mock_ask_for_user_consent(mocker: MockerFixture) -> MagicMock:
+  return mocker.patch(
+      "xpk.core.kueue_manager.ask_for_user_consent", return_value=True
+  )
+
+
 @pytest.fixture(autouse=True)
 def mock_commands(mocker: MockerFixture) -> CommandsTester:
   return CommandsTester(
@@ -78,7 +86,7 @@ def mock_commands(mocker: MockerFixture) -> CommandsTester:
 @pytest.fixture(autouse=True)
 @patch("jinja2.Environment", return_value=MagicMock())
 def kueue_manager(mock_env: MagicMock) -> KueueManager:
-  return KueueManager()
+  return KueueManager("test-project", "test-zone")


 def test_install_or_upgrade_when_newer_version_already_installed(
@@ -102,7 +110,7 @@ def test_install_or_upgrade_when_outdated(
   result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)

   assert result == 0
-  mock_commands.assert_command_run("kubectl apply", "v0.
+  mock_commands.assert_command_run("kubectl apply", "v0.14.3/manifests.yaml")
   mock_commands.assert_command_run("kubectl apply -f", "/tmp/")


@@ -115,10 +123,84 @@ def test_install_or_upgrade_when_not_installed(
   result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)

   assert result == 0
-  mock_commands.assert_command_run("kubectl apply", "v0.
+  mock_commands.assert_command_run("kubectl apply", "v0.14.3/manifests.yaml")
   mock_commands.assert_command_run("kubectl apply -f", "/tmp/")


+def test_upgrade_when_no_breaking_changes_between_versions_no_preparation_needed(
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+    mock_ask_for_user_consent: MagicMock,
+):
+  set_installed_kueue_version(mock_commands, Version("0.14.0"))
+
+  kueue_manager.install_or_upgrade(KUEUE_CONFIG)
+
+  mock_ask_for_user_consent.assert_not_called()
+
+
+def test_upgrade_with_breaking_changes_between_versions_runs_preparation(
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+    mock_ask_for_user_consent: MagicMock,
+):
+  set_installed_kueue_version(mock_commands, Version("0.11.0"))
+  fake_crds = (
+      "customresourcedefinition.apiextensions.k8s.io/kueue-crd-1.kueue.x-k8s.io\n"
+      "customresourcedefinition.apiextensions.k8s.io/kueue-crd-2.kueue.x-k8s.io"
+  )
+  mock_commands.set_result_for_command(
+      (0, fake_crds), "kubectl get crd -o name"
+  )
+  mock_ask_for_user_consent.return_value = True
+
+  result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
+
+  assert result == 0
+  mock_ask_for_user_consent.assert_called_once()
+  assert (
+      "CHANGELOG/CHANGELOG-0.14.md"
+      in mock_ask_for_user_consent.mock_calls[0].args[0]
+  )
+  mock_commands.assert_command_run(
+      "kubectl delete kueue-crd-1.kueue.x-k8s.io --all"
+  )
+  mock_commands.assert_command_run(
+      "kubectl delete kueue-crd-2.kueue.x-k8s.io --all"
+  )
+  mock_commands.assert_command_run(
+      "kubectl delete crd kueue-crd-1.kueue.x-k8s.io"
+  )
+  mock_commands.assert_command_run(
+      "kubectl delete crd kueue-crd-2.kueue.x-k8s.io"
+  )
+  mock_commands.assert_command_run(
+      "kubectl delete deployment kueue-controller-manager"
+  )
+
+
+def test_upgrade_with_breaking_changes_between_versions_does_not_run_preparation_without_consent(
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+    mock_ask_for_user_consent: MagicMock,
+):
+  set_installed_kueue_version(mock_commands, Version("0.11.0"))
+  mock_commands.set_result_for_command(
+      (
+          0,
+          "customresourcedefinition.apiextensions.k8s.io/kueue-crd-1.kueue.x-k8s.io",
+      ),
+      "kubectl get crd -o name",
+  )
+  mock_ask_for_user_consent.return_value = False
+
+  result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
+
+  assert result == 1
+  # Assert there was no command run for the Kueue crd:
+  mock_commands.assert_command_not_run("kueue-crd-1.kueue.x-k8s.io")
+
+
 def test_installation_with_tolerations(
     mock_commands: CommandsTester, kueue_manager: KueueManager
 ):
@@ -199,6 +281,10 @@ def test_configure_generates_correct_manifest_for_tpu(
 ):
   """Test that __configure generates the correct manifest content for TPUs."""
   set_installed_kueue_version(mock_commands, None)
+  mock_commands.set_result_for_command(
+      (0, "100 102400"), "gcloud compute machine-types describe"
+  )
+
   tpu_kueue_config = dataclasses.replace(
       KUEUE_CONFIG, system=TPU_SYSTEM, num_slices=2
   )
@@ -239,6 +325,39 @@
   )


+@patch("xpk.core.kueue_manager.write_tmp_file")
+def test_install_autocorrects_resource_limits(
+    write_tmp_file_mock: MagicMock,
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+):
+  """Test that installation auto-corrects the specified resource limits."""
+  set_installed_kueue_version(mock_commands, None)
+  # set 50 vCPU, 200Gi memory
+  mock_commands.set_result_for_command(
+      (0, "50 204800"), "gcloud compute machine-types describe"
+  )
+
+  kueue_config = dataclasses.replace(
+      KUEUE_CONFIG, cpu_limit=100, memory_limit="100Gi"
+  )
+
+  kueue_manager.install_or_upgrade(kueue_config)
+
+  rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
+  manifest_docs = list(yaml.safe_load_all(rendered_manifest))
+  cluster_queue = _first(
+      doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
+  )
+  resources = cluster_queue["spec"]["resourceGroups"][0]["flavors"][0][
+      "resources"
+  ]
+  cpu_resource = _first(r for r in resources if r["name"] == "cpu")
+  memory_resource = _first(r for r in resources if r["name"] == "memory")
+  assert cpu_resource["nominalQuota"] == 50
+  assert memory_resource["nominalQuota"] == "204800Mi"
+
+
 @patch("xpk.core.kueue_manager.write_tmp_file")
 def test_configure_generates_manifest_with_admission_checks_for_flex_single_slice(
     write_tmp_file_mock: MagicMock,
@@ -287,6 +406,8 @@ def test_configure_generates_correct_manifest_with_gke_default_topology(
         accelerator_type=AcceleratorType.GPU,
         device_type="h100-mega-80gb-8",
         supports_sub_slicing=False,
+        docker_platform=DockerPlatform.ARM,
+        gpu_config=GpuConfig(requires_topology=True),
     ),
 )

@@ -317,6 +438,7 @@ def test_configure_generates_correct_manifest_with_sub_slicing(
   kueue_config = dataclasses.replace(
       KUEUE_CONFIG,
       configure_sub_slicing=True,
+      system=UserFacingNameToSystemCharacteristics["v6e-8x8"],
   )

   kueue_manager.install_or_upgrade(kueue_config)
@@ -329,6 +451,15 @@
   assert resource_flavor["spec"]["topologyName"] == "sub-slice-topology"
   topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
   assert topology["metadata"]["name"] == "sub-slice-topology"
+  expected_levels = [
+      "cloud.google.com/gke-tpu-slice-8x8-id",
+      "cloud.google.com/gke-tpu-slice-4x8-id",
+      "cloud.google.com/gke-tpu-slice-4x4-id",
+      "cloud.google.com/gke-tpu-slice-2x4-id",
+      "kubernetes.io/hostname",
+  ]
+  actual_levels = [level["nodeLabel"] for level in topology["spec"]["levels"]]
+  assert actual_levels == expected_levels


 @patch("xpk.core.kueue_manager.write_tmp_file")
@@ -373,6 +504,29 @@ def test_configure_generates_correct_manifest_with_pathways(
   assert pathways_rg["flavors"][0]["resources"][1]["nominalQuota"] == "2000G"


+@patch("xpk.core.kueue_manager.write_tmp_file")
+def test_configure_generates_correct_manifest_for_a4x(
+    write_tmp_file_mock: MagicMock,
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+):
+  """Test that __configure generates correct manifest for a4x GPUs."""
+  set_installed_kueue_version(mock_commands, None)
+  kueue_config = dataclasses.replace(
+      KUEUE_CONFIG,
+      system=UserFacingNameToSystemCharacteristics["gb200-4"],
+  )
+
+  kueue_manager.install_or_upgrade(kueue_config)
+
+  rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
+  manifest_docs = list(yaml.safe_load_all(rendered_manifest))
+
+  # Check that the gke-default topology is present for a4x.
+  topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
+  assert topology["metadata"]["name"] == "gke-default"
+
+
 def test_has_sub_slicing_enabled_returns_exit_code_when_command_fails(
     mock_commands: CommandsTester,
 ):
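A note on the fixture pattern used throughout these tests: because mock_ask_for_user_consent is declared autouse=True, every test in the module gets the consent prompt auto-answered with True, and individual tests flip return_value to exercise the decline path. A minimal standalone illustration of that pytest-mock idiom (the patched module path here is hypothetical):

import pytest
from unittest.mock import MagicMock


@pytest.fixture(autouse=True)
def consent(mocker) -> MagicMock:
  # Patch the prompt for every test so nothing blocks on stdin.
  return mocker.patch("mypkg.console.ask_for_user_consent", return_value=True)


def test_decline_path(consent: MagicMock):
  consent.return_value = False  # override the module-wide default for this test
  assert consent() is False     # the patched prompt now declines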
xpk/core/nap.py
CHANGED
@@ -30,9 +30,8 @@ from .commands import run_command_with_updates, run_commands
 from .gcloud_context import get_cluster_location
 from .nodepool import get_all_nodepools_programmatic
 from .resources import (
-    CLUSTER_METADATA_CONFIGMAP,
-    CLUSTER_RESOURCES_CONFIGMAP,
     AutoprovisioningConfig,
+    ConfigMapType,
     get_cluster_configmap,
 )
 from .scheduling import get_total_chips_requested_from_args
@@ -266,14 +265,12 @@ def is_autoprovisioning_enabled(
     int of 0 if successful and 1 otherwise.
   """

-
-
+  cluster_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )

   if cluster_config_map is None:
-    xpk_print(
-        f'Unable to find config map: {resources_configmap_name}.'
-        ' Autoprovisioning is not enabled.'
-    )
+    xpk_print('Unable to find config map. Autoprovisioning is not enabled.')
     return False, 0

   return_code, autoprovisioning_value = get_value_from_map(
@@ -281,8 +278,8 @@
   )
   if return_code != 0:
     xpk_print(
-        'gke_accelerator type not found in config map
-
+        'gke_accelerator type not found in config map. Autoprovisioning is not'
+        ' enabled.'
     )
     return False, 0

@@ -319,8 +316,9 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:

   if capacity_type_str == CapacityType.UNKNOWN.name:
     # Use default settings from cluster creation.
-
-
+    cluster_config_map = get_cluster_configmap(
+        args.cluster, ConfigMapType.METADATA
+    )

     # Error out if the metadata config map doesn't exist, and is attempting to use
     # autoprovisioning.
@@ -363,8 +361,9 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:


 def get_cluster_provisioner(args) -> str:
-
-
+  cluster_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.METADATA
+  )
   cluster_provisioner = 'gcloud'
   if not cluster_config_map is None:
     provisioner = cluster_config_map.get('provisioner')