xpk 0.14.0__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/__init__.py +15 -0
- integration/docker_manager_test.py +102 -0
- integration/gcluster_a3mega_test.py +204 -0
- integration/gcluster_a3ultra_test.py +176 -0
- integration/gcluster_a4_test.py +176 -0
- integration/gcluster_test.py +107 -0
- xpk/commands/cluster.py +17 -4
- xpk/commands/cluster_gcluster.py +4 -0
- xpk/commands/cluster_test.py +92 -0
- xpk/commands/common.py +6 -0
- xpk/commands/kind.py +1 -0
- xpk/commands/workload.py +41 -7
- xpk/commands/workload_test.py +81 -0
- xpk/core/blueprint/testing/__init__.py +15 -0
- xpk/core/cluster.py +1 -1
- xpk/core/config.py +1 -1
- xpk/core/kueue_manager.py +62 -22
- xpk/core/kueue_manager_test.py +53 -21
- xpk/core/system_characteristics.py +16 -4
- xpk/core/system_characteristics_test.py +73 -0
- xpk/templates/cluster_preheat.yaml.j2 +31 -0
- xpk/templates/filestore-pv.yaml +17 -0
- xpk/templates/filestore-pvc.yaml +11 -0
- xpk/templates/filestore-sc.yaml +10 -0
- xpk/templates/fuse-pv.yaml +17 -0
- xpk/templates/fuse-pvc.yaml +13 -0
- xpk/templates/kueue_config.yaml.j2 +95 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
- xpk/templates/mtc-cpc.yaml +15 -0
- xpk/templates/volume_bundle.yaml +7 -0
- xpk/utils/templates.py +14 -1
- xpk/utils/topology.py +9 -0
- xpk/utils/topology_test.py +21 -1
- {xpk-0.14.0.dist-info → xpk-0.14.2.dist-info}/METADATA +1 -1
- {xpk-0.14.0.dist-info → xpk-0.14.2.dist-info}/RECORD +40 -19
- xpk-0.14.2.dist-info/top_level.txt +2 -0
- xpk-0.14.0.dist-info/top_level.txt +0 -1
- {xpk-0.14.0.dist-info → xpk-0.14.2.dist-info}/WHEEL +0 -0
- {xpk-0.14.0.dist-info → xpk-0.14.2.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.0.dist-info → xpk-0.14.2.dist-info}/licenses/LICENSE +0 -0
xpk/core/kueue_manager.py
CHANGED
|
@@ -39,17 +39,19 @@ from ..core.commands import (
|
|
|
39
39
|
)
|
|
40
40
|
from ..utils.file import write_tmp_file
|
|
41
41
|
from ..utils.console import xpk_print, xpk_exit
|
|
42
|
-
from ..utils.templates import TEMPLATE_PATH
|
|
42
|
+
from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
|
|
43
43
|
|
|
44
44
|
WAIT_FOR_KUEUE_TIMEOUT = "10m"
|
|
45
45
|
CLUSTER_QUEUE_NAME = "cluster-queue"
|
|
46
46
|
LOCAL_QUEUE_NAME = "multislice-queue"
|
|
47
|
+
SUB_SLICE_TOPOLOGY_NAME = "sub-slice-topology"
|
|
47
48
|
KUEUE_CONFIG_JINJA_FILE = "kueue_config.yaml.j2"
|
|
48
|
-
|
|
49
|
+
KUEUE_GKE_DEFAULT_TOPOLOGY_JINJA_FILE = "kueue_gke_default_topology.yaml.j2"
|
|
49
50
|
KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
|
|
51
|
+
KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2"
|
|
50
52
|
MEMORY_SIZE_PER_VM = 1.2
|
|
51
53
|
MIN_MEMORY_LIMIT_SIZE = 4096
|
|
52
|
-
KUEUE_VERSION = "v0.
|
|
54
|
+
KUEUE_VERSION = "v0.12.2"
|
|
53
55
|
|
|
54
56
|
|
|
55
57
|
@dataclass
|
|
@@ -58,12 +60,19 @@ class KueueConfig:
|
|
|
58
60
|
total_chips: int
|
|
59
61
|
cpu_limit: int
|
|
60
62
|
memory_limit: str
|
|
63
|
+
configure_sub_slicing: bool
|
|
61
64
|
is_pathways_cluster: bool = False
|
|
62
65
|
autoprovisioning_enabled: bool = False
|
|
63
66
|
flex: bool = False
|
|
64
67
|
num_slices: int = 1
|
|
65
68
|
|
|
66
69
|
|
|
70
|
+
@dataclass
|
|
71
|
+
class _NameAndYaml:
|
|
72
|
+
name: str
|
|
73
|
+
yaml: str
|
|
74
|
+
|
|
75
|
+
|
|
67
76
|
class KueueManager:
|
|
68
77
|
"""Manages the installation and configuration of Kueue on an XPK cluster."""
|
|
69
78
|
|
|
@@ -73,7 +82,12 @@ class KueueManager:
|
|
|
73
82
|
template_path=TEMPLATE_PATH,
|
|
74
83
|
):
|
|
75
84
|
self.kueue_version = kueue_version
|
|
76
|
-
|
|
85
|
+
|
|
86
|
+
self.template_env = Environment(
|
|
87
|
+
loader=FileSystemLoader(
|
|
88
|
+
searchpath=get_templates_absolute_path(template_path)
|
|
89
|
+
)
|
|
90
|
+
)
|
|
77
91
|
|
|
78
92
|
def install_or_upgrade(
|
|
79
93
|
self,
|
|
@@ -87,7 +101,7 @@ class KueueManager:
|
|
|
87
101
|
Args:
|
|
88
102
|
tolerations: An optional list of tolerations to apply to the kueue-controller-manager.
|
|
89
103
|
"""
|
|
90
|
-
return_code, installed_version = self.
|
|
104
|
+
return_code, installed_version = self.get_installed_kueue_version()
|
|
91
105
|
|
|
92
106
|
if return_code == 0:
|
|
93
107
|
if installed_version and installed_version > self.kueue_version:
|
|
@@ -107,7 +121,7 @@ class KueueManager:
|
|
|
107
121
|
|
|
108
122
|
return self.__configure(kueue_config)
|
|
109
123
|
|
|
110
|
-
def
|
|
124
|
+
def get_installed_kueue_version(self) -> tuple[int, str | None]:
|
|
111
125
|
command = (
|
|
112
126
|
"kubectl get deployment kueue-controller-manager -n kueue-system -o"
|
|
113
127
|
" jsonpath='{.spec.template.spec.containers[0].image}'"
|
|
@@ -117,7 +131,7 @@ class KueueManager:
|
|
|
117
131
|
command,
|
|
118
132
|
task,
|
|
119
133
|
dry_run_return_val="""
|
|
120
|
-
v0.
|
|
134
|
+
v0.12.1""",
|
|
121
135
|
)
|
|
122
136
|
if return_code != 0:
|
|
123
137
|
return return_code, None
|
|
@@ -208,6 +222,13 @@ class KueueManager:
|
|
|
208
222
|
"""
|
|
209
223
|
template = self.template_env.get_template(KUEUE_CONFIG_JINJA_FILE)
|
|
210
224
|
|
|
225
|
+
topology_name_and_yaml = self.__get_topology_name_and_yaml(
|
|
226
|
+
kueue_config.system, kueue_config.configure_sub_slicing
|
|
227
|
+
)
|
|
228
|
+
topology_name = (
|
|
229
|
+
topology_name_and_yaml.name if topology_name_and_yaml else None
|
|
230
|
+
)
|
|
231
|
+
|
|
211
232
|
# The manager builds the context internally based on its opinionated logic
|
|
212
233
|
context = self.__build_template_context(
|
|
213
234
|
system=kueue_config.system,
|
|
@@ -218,18 +239,16 @@ class KueueManager:
|
|
|
218
239
|
num_slices=kueue_config.num_slices,
|
|
219
240
|
cpu_limit=kueue_config.cpu_limit,
|
|
220
241
|
memory_limit=kueue_config.memory_limit,
|
|
242
|
+
topology_name=topology_name,
|
|
221
243
|
)
|
|
222
244
|
|
|
223
|
-
|
|
245
|
+
config_yaml = template.render(context)
|
|
246
|
+
yamls = [config_yaml]
|
|
224
247
|
|
|
225
|
-
if
|
|
226
|
-
|
|
227
|
-
H200_DEVICE_TYPE,
|
|
228
|
-
B200_DEVICE_TYPE,
|
|
229
|
-
]:
|
|
230
|
-
topology_yaml = self.template_env.get_template(KUEUE_TOPOLOGY_JINJA_FILE)
|
|
231
|
-
rendered_manifest = topology_yaml.render() + rendered_manifest
|
|
248
|
+
if topology_name_and_yaml:
|
|
249
|
+
yamls.append(topology_name_and_yaml.yaml)
|
|
232
250
|
|
|
251
|
+
rendered_manifest = "\n---\n".join(yamls)
|
|
233
252
|
return_code = self.__apply_manifest(rendered_manifest)
|
|
234
253
|
if return_code != 0:
|
|
235
254
|
return return_code
|
|
@@ -246,6 +265,7 @@ class KueueManager:
|
|
|
246
265
|
num_slices: int,
|
|
247
266
|
cpu_limit: int,
|
|
248
267
|
memory_limit: str,
|
|
268
|
+
topology_name: str | None,
|
|
249
269
|
) -> Dict[str, Any]:
|
|
250
270
|
"""Prepares the context for the Jinja2 template."""
|
|
251
271
|
# Main accelerator flavor
|
|
@@ -267,13 +287,7 @@ class KueueManager:
|
|
|
267
287
|
key, value = machine_label.split(":", 1)
|
|
268
288
|
node_labels_dict[key] = value.strip()
|
|
269
289
|
|
|
270
|
-
topology_label = ""
|
|
271
|
-
if system.device_type in [
|
|
272
|
-
H100_MEGA_DEVICE_TYPE,
|
|
273
|
-
H200_DEVICE_TYPE,
|
|
274
|
-
B200_DEVICE_TYPE,
|
|
275
|
-
]:
|
|
276
|
-
topology_label = 'topologyName: "gke-default"'
|
|
290
|
+
topology_label = f"topologyName: {topology_name}" if topology_name else ""
|
|
277
291
|
|
|
278
292
|
flavors = [{
|
|
279
293
|
"name": main_flavor_name,
|
|
@@ -335,6 +349,32 @@ class KueueManager:
|
|
|
335
349
|
"admission_checks": admission_checks,
|
|
336
350
|
}
|
|
337
351
|
|
|
352
|
+
def __get_topology_name_and_yaml(
|
|
353
|
+
self, system: SystemCharacteristics, configure_sub_slicing: bool
|
|
354
|
+
) -> _NameAndYaml | None:
|
|
355
|
+
if system.device_type in [
|
|
356
|
+
H100_MEGA_DEVICE_TYPE,
|
|
357
|
+
H200_DEVICE_TYPE,
|
|
358
|
+
B200_DEVICE_TYPE,
|
|
359
|
+
]:
|
|
360
|
+
return _NameAndYaml(
|
|
361
|
+
name="gke-default",
|
|
362
|
+
yaml=self.template_env.get_template(
|
|
363
|
+
KUEUE_GKE_DEFAULT_TOPOLOGY_JINJA_FILE
|
|
364
|
+
).render(),
|
|
365
|
+
)
|
|
366
|
+
elif configure_sub_slicing:
|
|
367
|
+
return _NameAndYaml(
|
|
368
|
+
name=SUB_SLICE_TOPOLOGY_NAME,
|
|
369
|
+
yaml=self.template_env.get_template(
|
|
370
|
+
KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE
|
|
371
|
+
).render({
|
|
372
|
+
"sub_slice_topology_name": SUB_SLICE_TOPOLOGY_NAME,
|
|
373
|
+
}),
|
|
374
|
+
)
|
|
375
|
+
else:
|
|
376
|
+
return None
|
|
377
|
+
|
|
338
378
|
def __apply_manifest(self, manifest: str) -> int:
|
|
339
379
|
task = "Applying Kueue Custom Resources"
|
|
340
380
|
if is_dry_run():
|
xpk/core/kueue_manager_test.py
CHANGED
|
@@ -76,16 +76,14 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
76
76
|
mock_install.assert_called_once()
|
|
77
77
|
mock_configure.assert_called_once()
|
|
78
78
|
|
|
79
|
-
@patch(
|
|
80
|
-
"xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
|
|
81
|
-
)
|
|
79
|
+
@patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version")
|
|
82
80
|
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
|
|
83
81
|
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__configure")
|
|
84
82
|
def test_install_or_upgrade_when_newer_version_already_installed(
|
|
85
83
|
self, mock_configure, mock_install, mock_get_version
|
|
86
84
|
):
|
|
87
85
|
"""Test install_or_upgrade when Kueue is already up to date."""
|
|
88
|
-
mock_get_version.return_value = (0, "v0.
|
|
86
|
+
mock_get_version.return_value = (0, "v0.12.3")
|
|
89
87
|
kueue_config = MagicMock(spec=KueueConfig)
|
|
90
88
|
|
|
91
89
|
result = self.kueue_manager.install_or_upgrade(kueue_config)
|
|
@@ -95,9 +93,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
95
93
|
mock_install.assert_not_called()
|
|
96
94
|
mock_configure.assert_not_called()
|
|
97
95
|
|
|
98
|
-
@patch(
|
|
99
|
-
"xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
|
|
100
|
-
)
|
|
96
|
+
@patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version")
|
|
101
97
|
def test_install_or_upgrade_when_outdated(
|
|
102
98
|
self,
|
|
103
99
|
mock_get_version,
|
|
@@ -121,9 +117,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
121
117
|
mock_install.assert_called_once()
|
|
122
118
|
mock_configure.assert_called_once()
|
|
123
119
|
|
|
124
|
-
@patch(
|
|
125
|
-
"xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
|
|
126
|
-
)
|
|
120
|
+
@patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version")
|
|
127
121
|
def test_install_or_upgrade_when_not_installed(
|
|
128
122
|
self,
|
|
129
123
|
mock_get_version,
|
|
@@ -155,7 +149,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
155
149
|
return_value=0,
|
|
156
150
|
) as mock_run_retry,
|
|
157
151
|
patch(
|
|
158
|
-
"xpk.core.kueue_manager.KueueManager.
|
|
152
|
+
"xpk.core.kueue_manager.KueueManager.get_installed_kueue_version",
|
|
159
153
|
return_value=(1, None),
|
|
160
154
|
),
|
|
161
155
|
patch(
|
|
@@ -199,7 +193,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
199
193
|
return_value=0,
|
|
200
194
|
) as mock_run_retry,
|
|
201
195
|
patch(
|
|
202
|
-
"xpk.core.kueue_manager.KueueManager.
|
|
196
|
+
"xpk.core.kueue_manager.KueueManager.get_installed_kueue_version",
|
|
203
197
|
return_value=(1, None),
|
|
204
198
|
),
|
|
205
199
|
patch(
|
|
@@ -224,9 +218,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
224
218
|
self.assertEqual(result, 0)
|
|
225
219
|
self.assertEqual(mock_run_retry.call_count, 0)
|
|
226
220
|
|
|
227
|
-
@patch(
|
|
228
|
-
"xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
|
|
229
|
-
)
|
|
221
|
+
@patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version")
|
|
230
222
|
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__apply_manifest")
|
|
231
223
|
def test_configuration_updates_resources(
|
|
232
224
|
self, mock_apply_manifest, mock_get_version
|
|
@@ -240,6 +232,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
240
232
|
total_chips=8,
|
|
241
233
|
cpu_limit=100,
|
|
242
234
|
memory_limit="100Gi",
|
|
235
|
+
configure_sub_slicing=False,
|
|
243
236
|
)
|
|
244
237
|
|
|
245
238
|
with (
|
|
@@ -265,6 +258,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
265
258
|
total_chips=8,
|
|
266
259
|
cpu_limit=100,
|
|
267
260
|
memory_limit="100Gi",
|
|
261
|
+
configure_sub_slicing=False,
|
|
268
262
|
)
|
|
269
263
|
|
|
270
264
|
with (
|
|
@@ -274,7 +268,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
274
268
|
),
|
|
275
269
|
patch.object(
|
|
276
270
|
self.kueue_manager,
|
|
277
|
-
"
|
|
271
|
+
"get_installed_kueue_version",
|
|
278
272
|
return_value=(1, None),
|
|
279
273
|
),
|
|
280
274
|
patch.object(
|
|
@@ -307,6 +301,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
307
301
|
total_chips=8,
|
|
308
302
|
cpu_limit=100,
|
|
309
303
|
memory_limit="100Gi",
|
|
304
|
+
configure_sub_slicing=False,
|
|
310
305
|
)
|
|
311
306
|
|
|
312
307
|
with (
|
|
@@ -316,7 +311,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
316
311
|
),
|
|
317
312
|
patch.object(
|
|
318
313
|
self.kueue_manager,
|
|
319
|
-
"
|
|
314
|
+
"get_installed_kueue_version",
|
|
320
315
|
return_value=(1, None),
|
|
321
316
|
),
|
|
322
317
|
patch.object(
|
|
@@ -344,7 +339,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
344
339
|
@patch(
|
|
345
340
|
"xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
|
|
346
341
|
)
|
|
347
|
-
def
|
|
342
|
+
def test_configure_generates_correct_manifest_for_tpu(
|
|
348
343
|
self, mock_update_resources, mock_install
|
|
349
344
|
):
|
|
350
345
|
"""Test that __configure generates the correct manifest content for TPUs."""
|
|
@@ -357,6 +352,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
357
352
|
memory_limit="100Gi",
|
|
358
353
|
autoprovisioning_enabled=False,
|
|
359
354
|
num_slices=2,
|
|
355
|
+
configure_sub_slicing=False,
|
|
360
356
|
)
|
|
361
357
|
|
|
362
358
|
rendered_manifest = self._trigger_installation(kueue_config)
|
|
@@ -413,6 +409,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
413
409
|
autoprovisioning_enabled=False,
|
|
414
410
|
num_slices=1,
|
|
415
411
|
flex=True,
|
|
412
|
+
configure_sub_slicing=False,
|
|
416
413
|
)
|
|
417
414
|
|
|
418
415
|
rendered_manifest = self._trigger_installation(kueue_config)
|
|
@@ -432,7 +429,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
432
429
|
@patch(
|
|
433
430
|
"xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
|
|
434
431
|
)
|
|
435
|
-
def
|
|
432
|
+
def test_configure_generates_correct_manifest_with_gke_default_topology(
|
|
436
433
|
self, mock_update_resources, mock_install
|
|
437
434
|
):
|
|
438
435
|
"""Test that __configure generates correct manifest for GPUs."""
|
|
@@ -444,11 +441,11 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
444
441
|
cpu_limit=100,
|
|
445
442
|
memory_limit="100Gi",
|
|
446
443
|
num_slices=2,
|
|
444
|
+
configure_sub_slicing=False,
|
|
447
445
|
)
|
|
448
446
|
|
|
449
447
|
rendered_manifest = self._trigger_installation(kueue_config)
|
|
450
448
|
|
|
451
|
-
self.assertIn("kind: Topology", rendered_manifest)
|
|
452
449
|
manifest_docs = list(yaml.safe_load_all(rendered_manifest))
|
|
453
450
|
resource_flavor = _first(
|
|
454
451
|
doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
|
|
@@ -459,6 +456,40 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
459
456
|
],
|
|
460
457
|
"h100-mega-80gb-8",
|
|
461
458
|
)
|
|
459
|
+
self.assertEqual(resource_flavor["spec"]["topologyName"], "gke-default")
|
|
460
|
+
topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
|
|
461
|
+
self.assertEqual(topology["metadata"]["name"], "gke-default")
|
|
462
|
+
|
|
463
|
+
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
|
|
464
|
+
@patch(
|
|
465
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
|
|
466
|
+
)
|
|
467
|
+
def test_configure_generates_correct_manifest_with_sub_slicing(
|
|
468
|
+
self, mock_update_resources, mock_install
|
|
469
|
+
):
|
|
470
|
+
"""Test that __configure generates correct manifest with sub-slicing topology."""
|
|
471
|
+
mock_install.return_value = 0
|
|
472
|
+
mock_update_resources.return_value = 0
|
|
473
|
+
kueue_config = KueueConfig(
|
|
474
|
+
system=self.mock_system_chars,
|
|
475
|
+
total_chips=16,
|
|
476
|
+
cpu_limit=100,
|
|
477
|
+
memory_limit="100Gi",
|
|
478
|
+
num_slices=2,
|
|
479
|
+
configure_sub_slicing=True,
|
|
480
|
+
)
|
|
481
|
+
|
|
482
|
+
rendered_manifest = self._trigger_installation(kueue_config)
|
|
483
|
+
|
|
484
|
+
manifest_docs = list(yaml.safe_load_all(rendered_manifest))
|
|
485
|
+
resource_flavor = _first(
|
|
486
|
+
doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
|
|
487
|
+
)
|
|
488
|
+
self.assertEqual(
|
|
489
|
+
resource_flavor["spec"]["topologyName"], "sub-slice-topology"
|
|
490
|
+
)
|
|
491
|
+
topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
|
|
492
|
+
self.assertEqual(topology["metadata"]["name"], "sub-slice-topology")
|
|
462
493
|
|
|
463
494
|
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
|
|
464
495
|
@patch(
|
|
@@ -477,6 +508,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
477
508
|
memory_limit="100Gi",
|
|
478
509
|
is_pathways_cluster=True,
|
|
479
510
|
num_slices=2,
|
|
511
|
+
configure_sub_slicing=False,
|
|
480
512
|
)
|
|
481
513
|
|
|
482
514
|
rendered_manifest = self._trigger_installation(kueue_config)
|
|
@@ -513,7 +545,7 @@ class KueueManagerTest(unittest.TestCase):
|
|
|
513
545
|
"""Calls Kueue installation and returns the rendered manifest."""
|
|
514
546
|
with (
|
|
515
547
|
patch.object(
|
|
516
|
-
self.kueue_manager, "
|
|
548
|
+
self.kueue_manager, "get_installed_kueue_version"
|
|
517
549
|
) as mock_get_version,
|
|
518
550
|
patch.object(
|
|
519
551
|
self.kueue_manager, "_KueueManager__apply_manifest"
|
|
@@ -135,10 +135,9 @@ def get_tpu_system_characteristics_map(
|
|
|
135
135
|
) -> dict[str, SystemCharacteristics]:
|
|
136
136
|
system_characteristics_map = {}
|
|
137
137
|
for topology in supported_topologies:
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
vms_per_slice = total_chips // chips_per_vm
|
|
138
|
+
chips_per_vm = compute_chips_per_vm(topology)
|
|
139
|
+
vms_per_slice = compute_vms_per_slice(topology)
|
|
140
|
+
num_tensorcores = compute_num_tensorcores(tensorcores_per_chip, topology)
|
|
142
141
|
system = SystemCharacteristics(
|
|
143
142
|
topology=topology,
|
|
144
143
|
vms_per_slice=vms_per_slice,
|
|
@@ -156,6 +155,19 @@ def get_tpu_system_characteristics_map(
|
|
|
156
155
|
return system_characteristics_map
|
|
157
156
|
|
|
158
157
|
|
|
158
|
+
def compute_chips_per_vm(topology: str) -> int:
|
|
159
|
+
return 1 if get_topology_product(topology) == 1 else 4
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def compute_num_tensorcores(tensorcores_per_chip: int, topology: str) -> int:
|
|
163
|
+
return get_topology_product(topology) * tensorcores_per_chip
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def compute_vms_per_slice(topology: str) -> int:
|
|
167
|
+
chips_per_vm = compute_chips_per_vm(topology)
|
|
168
|
+
return get_topology_product(topology) // chips_per_vm
|
|
169
|
+
|
|
170
|
+
|
|
159
171
|
################### Subcommand Helper Functions #############################
|
|
160
172
|
""" !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
161
173
|
IF YOU MODIFY THE BELOW UserFacingNameToSystemCharacteristics MAP YOU SHOULD
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .system_characteristics import get_tpu_system_characteristics_map, SystemCharacteristics
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topology():
|
|
21
|
+
result = get_tpu_system_characteristics_map(
|
|
22
|
+
prefix="test",
|
|
23
|
+
tensorcores_per_chip=1,
|
|
24
|
+
gke_accelerator="test",
|
|
25
|
+
machine_type="test",
|
|
26
|
+
supported_topologies=["1x1"],
|
|
27
|
+
supports_sub_slicing=False,
|
|
28
|
+
requires_workload_policy=True,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
expected_system_characteristics = SystemCharacteristics(
|
|
32
|
+
topology="1x1",
|
|
33
|
+
vms_per_slice=1,
|
|
34
|
+
gke_accelerator="test",
|
|
35
|
+
gce_machine_type="test",
|
|
36
|
+
chips_per_vm=1,
|
|
37
|
+
accelerator_type=1,
|
|
38
|
+
device_type="test-1",
|
|
39
|
+
supports_sub_slicing=False,
|
|
40
|
+
requires_workload_policy=True,
|
|
41
|
+
)
|
|
42
|
+
assert result == {
|
|
43
|
+
"test-1": expected_system_characteristics,
|
|
44
|
+
"test-1x1": expected_system_characteristics,
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topology():
|
|
49
|
+
result = get_tpu_system_characteristics_map(
|
|
50
|
+
prefix="test",
|
|
51
|
+
tensorcores_per_chip=2,
|
|
52
|
+
gke_accelerator="test",
|
|
53
|
+
machine_type="test",
|
|
54
|
+
supported_topologies=["2x2"],
|
|
55
|
+
supports_sub_slicing=False,
|
|
56
|
+
requires_workload_policy=True,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
expected_system_characteristics = SystemCharacteristics(
|
|
60
|
+
topology="2x2",
|
|
61
|
+
vms_per_slice=1,
|
|
62
|
+
gke_accelerator="test",
|
|
63
|
+
gce_machine_type="test",
|
|
64
|
+
chips_per_vm=4,
|
|
65
|
+
accelerator_type=1,
|
|
66
|
+
device_type="test-8",
|
|
67
|
+
supports_sub_slicing=False,
|
|
68
|
+
requires_workload_policy=True,
|
|
69
|
+
)
|
|
70
|
+
assert result == {
|
|
71
|
+
"test-8": expected_system_characteristics,
|
|
72
|
+
"test-2x2": expected_system_characteristics,
|
|
73
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
apiVersion: apps/v1
|
|
2
|
+
kind: DaemonSet
|
|
3
|
+
metadata:
|
|
4
|
+
name: {{ cachekey }}
|
|
5
|
+
labels:
|
|
6
|
+
k8s-app: {{ cachekey }}
|
|
7
|
+
spec:
|
|
8
|
+
selector:
|
|
9
|
+
matchLabels:
|
|
10
|
+
k8s-app: {{ cachekey }}
|
|
11
|
+
updateStrategy:
|
|
12
|
+
type: RollingUpdate
|
|
13
|
+
template:
|
|
14
|
+
metadata:
|
|
15
|
+
labels:
|
|
16
|
+
name: {{ cachekey }}
|
|
17
|
+
k8s-app: {{ cachekey }}
|
|
18
|
+
spec:
|
|
19
|
+
affinity:
|
|
20
|
+
nodeAffinity:
|
|
21
|
+
requiredDuringSchedulingIgnoredDuringExecution:
|
|
22
|
+
nodeSelectorTerms:
|
|
23
|
+
- matchExpressions:
|
|
24
|
+
- key: {{ nodeSelectorKey }}
|
|
25
|
+
operator: Exists
|
|
26
|
+
tolerations:
|
|
27
|
+
- operator: "Exists"
|
|
28
|
+
containers:
|
|
29
|
+
- image: {{ image_name }}
|
|
30
|
+
name: {{ cachekey }}
|
|
31
|
+
command: [ "sleep", "inf" ]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
apiVersion: v1
|
|
2
|
+
kind: PersistentVolume
|
|
3
|
+
metadata:
|
|
4
|
+
name: xpk-filestore-pv
|
|
5
|
+
spec:
|
|
6
|
+
storageClassName:
|
|
7
|
+
capacity:
|
|
8
|
+
storage:
|
|
9
|
+
accessModes:
|
|
10
|
+
persistentVolumeReclaimPolicy: Retain
|
|
11
|
+
volumeMode: Filesystem
|
|
12
|
+
csi:
|
|
13
|
+
driver: filestore.csi.storage.gke.io
|
|
14
|
+
volumeHandle:
|
|
15
|
+
volumeAttributes:
|
|
16
|
+
ip:
|
|
17
|
+
volume:
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
apiVersion: v1
|
|
2
|
+
kind: PersistentVolume
|
|
3
|
+
metadata:
|
|
4
|
+
name:
|
|
5
|
+
spec:
|
|
6
|
+
accessModes:
|
|
7
|
+
- ReadWriteMany
|
|
8
|
+
capacity:
|
|
9
|
+
storage:
|
|
10
|
+
storageClassName: example-storage-class
|
|
11
|
+
mountOptions:
|
|
12
|
+
- implicit-dirs
|
|
13
|
+
csi:
|
|
14
|
+
driver: gcsfuse.csi.storage.gke.io
|
|
15
|
+
volumeHandle:
|
|
16
|
+
volumeAttributes:
|
|
17
|
+
gcsfuseLoggingSeverity: warning
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
{% for flavor in flavors %}
|
|
2
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
3
|
+
kind: ResourceFlavor
|
|
4
|
+
metadata:
|
|
5
|
+
name: "{{ flavor.name }}"
|
|
6
|
+
spec:
|
|
7
|
+
nodeLabels: {{ flavor.nodeLabels | tojson }}
|
|
8
|
+
{% if flavor.topologyLabel %}
|
|
9
|
+
{{ flavor.topologyLabel }}
|
|
10
|
+
{% endif %}
|
|
11
|
+
---
|
|
12
|
+
{% endfor %}
|
|
13
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
14
|
+
kind: AdmissionCheck
|
|
15
|
+
metadata:
|
|
16
|
+
name: dws-prov
|
|
17
|
+
spec:
|
|
18
|
+
controllerName: kueue.x-k8s.io/provisioning-request
|
|
19
|
+
parameters:
|
|
20
|
+
apiGroup: kueue.x-k8s.io
|
|
21
|
+
kind: ProvisioningRequestConfig
|
|
22
|
+
name: dws-config
|
|
23
|
+
---
|
|
24
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
25
|
+
kind: ProvisioningRequestConfig
|
|
26
|
+
metadata:
|
|
27
|
+
name: dws-config
|
|
28
|
+
spec:
|
|
29
|
+
provisioningClassName: queued-provisioning.gke.io
|
|
30
|
+
podSetUpdates:
|
|
31
|
+
nodeSelector:
|
|
32
|
+
- key: autoscaling.gke.io/provisioning-request
|
|
33
|
+
valueFromProvisioningClassDetail: ResizeRequestName
|
|
34
|
+
managedResources:
|
|
35
|
+
- {{ managed_resource }}
|
|
36
|
+
---
|
|
37
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
38
|
+
kind: ClusterQueue
|
|
39
|
+
metadata:
|
|
40
|
+
name: "{{ cluster_queue_name }}"
|
|
41
|
+
spec:
|
|
42
|
+
preemption:
|
|
43
|
+
reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
|
|
44
|
+
withinClusterQueue: LowerPriority
|
|
45
|
+
namespaceSelector: {} # match all.
|
|
46
|
+
resourceGroups: {{ resource_groups }}
|
|
47
|
+
{{ admission_checks | indent(2) }}
|
|
48
|
+
---
|
|
49
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
50
|
+
kind: LocalQueue
|
|
51
|
+
metadata:
|
|
52
|
+
namespace: default
|
|
53
|
+
name: {{ local_queue_name }}
|
|
54
|
+
spec:
|
|
55
|
+
clusterQueue: {{ cluster_queue_name }}
|
|
56
|
+
---
|
|
57
|
+
apiVersion: scheduling.k8s.io/v1
|
|
58
|
+
kind: PriorityClass
|
|
59
|
+
metadata:
|
|
60
|
+
name: very-low
|
|
61
|
+
value: 100
|
|
62
|
+
globalDefault: false
|
|
63
|
+
description: "Very Low"
|
|
64
|
+
---
|
|
65
|
+
apiVersion: scheduling.k8s.io/v1
|
|
66
|
+
kind: PriorityClass
|
|
67
|
+
metadata:
|
|
68
|
+
name: low
|
|
69
|
+
value: 250
|
|
70
|
+
globalDefault: false
|
|
71
|
+
description: "Low"
|
|
72
|
+
---
|
|
73
|
+
apiVersion: scheduling.k8s.io/v1
|
|
74
|
+
kind: PriorityClass
|
|
75
|
+
metadata:
|
|
76
|
+
name: medium
|
|
77
|
+
value: 500
|
|
78
|
+
globalDefault: false
|
|
79
|
+
description: "Medium"
|
|
80
|
+
---
|
|
81
|
+
apiVersion: scheduling.k8s.io/v1
|
|
82
|
+
kind: PriorityClass
|
|
83
|
+
metadata:
|
|
84
|
+
name: high
|
|
85
|
+
value: 750
|
|
86
|
+
globalDefault: false
|
|
87
|
+
description: "High"
|
|
88
|
+
---
|
|
89
|
+
apiVersion: scheduling.k8s.io/v1
|
|
90
|
+
kind: PriorityClass
|
|
91
|
+
metadata:
|
|
92
|
+
name: very-high
|
|
93
|
+
value: 1000
|
|
94
|
+
globalDefault: false
|
|
95
|
+
description: "Very High"
|