xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +17 -10
- xpk/commands/cluster.py +137 -123
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +13 -27
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +22 -11
- xpk/commands/job.py +53 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +26 -2
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +58 -30
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +96 -195
- xpk/core/cluster_private.py +9 -12
- xpk/core/commands.py +21 -25
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +17 -9
- xpk/core/docker_resources.py +9 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +5 -8
- xpk/core/kjob.py +19 -29
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +11 -16
- xpk/core/network.py +18 -19
- xpk/core/nodepool.py +65 -71
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +9 -5
- xpk/core/ray.py +11 -15
- xpk/core/resources.py +15 -10
- xpk/core/scheduling.py +23 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +3 -2
- xpk/parser/cluster.py +50 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/execution_context.py +28 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/kueue.py +20 -0
- xpk/utils/network.py +4 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -545
- xpk-0.12.0.dist-info/RECORD +0 -100
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,542 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from typing import Generator, TypeVar
|
|
18
|
+
import unittest
|
|
19
|
+
import yaml
|
|
20
|
+
from unittest.mock import MagicMock, patch
|
|
21
|
+
|
|
22
|
+
from xpk.core.kueue_manager import KueueConfig, KueueManager
|
|
23
|
+
from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class KueueManagerTest(unittest.TestCase):
|
|
27
|
+
"""Unit tests for the KueueManager class."""
|
|
28
|
+
|
|
29
|
+
def setUp(self):
|
|
30
|
+
"""Set up test environment."""
|
|
31
|
+
self.mock_system_chars_gpu = SystemCharacteristics(
|
|
32
|
+
topology="2x2x1",
|
|
33
|
+
vms_per_slice=1,
|
|
34
|
+
gke_accelerator="h100-mega-80gb-8",
|
|
35
|
+
gce_machine_type="a3-megagpu-8g",
|
|
36
|
+
chips_per_vm=8,
|
|
37
|
+
accelerator_type=AcceleratorType["GPU"],
|
|
38
|
+
device_type="h100-mega-80gb-8",
|
|
39
|
+
supports_sub_slicing=False,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
self.mock_system_chars = SystemCharacteristics(
|
|
43
|
+
topology="2x2x1",
|
|
44
|
+
vms_per_slice=1,
|
|
45
|
+
gke_accelerator="test-accelerator",
|
|
46
|
+
gce_machine_type="test-machine",
|
|
47
|
+
chips_per_vm=4,
|
|
48
|
+
accelerator_type=AcceleratorType["TPU"],
|
|
49
|
+
device_type="v5p-8",
|
|
50
|
+
supports_sub_slicing=False,
|
|
51
|
+
)
|
|
52
|
+
mock_env = MagicMock()
|
|
53
|
+
|
|
54
|
+
with patch("jinja2.Environment", return_value=mock_env):
|
|
55
|
+
self.kueue_manager = KueueManager()
|
|
56
|
+
|
|
57
|
+
@patch("xpk.core.kueue_manager.run_command_for_value")
|
|
58
|
+
def test_version_check_when_kueue_not_installed(self, mock_run_for_value):
|
|
59
|
+
mock_run_for_value.return_value = (
|
|
60
|
+
0,
|
|
61
|
+
"Kueue deployment does not exist error message",
|
|
62
|
+
)
|
|
63
|
+
kueue_config = MagicMock(spec=KueueConfig)
|
|
64
|
+
|
|
65
|
+
with (
|
|
66
|
+
patch.object(
|
|
67
|
+
self.kueue_manager, "_KueueManager__install", return_value=0
|
|
68
|
+
) as mock_install,
|
|
69
|
+
patch.object(
|
|
70
|
+
self.kueue_manager, "_KueueManager__configure", return_value=0
|
|
71
|
+
) as mock_configure,
|
|
72
|
+
):
|
|
73
|
+
result = self.kueue_manager.install_or_upgrade(kueue_config)
|
|
74
|
+
|
|
75
|
+
self.assertEqual(result, 0)
|
|
76
|
+
mock_install.assert_called_once()
|
|
77
|
+
mock_configure.assert_called_once()
|
|
78
|
+
|
|
79
|
+
@patch(
|
|
80
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
|
|
81
|
+
)
|
|
82
|
+
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
|
|
83
|
+
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__configure")
|
|
84
|
+
def test_install_or_upgrade_when_newer_version_already_installed(
|
|
85
|
+
self, mock_configure, mock_install, mock_get_version
|
|
86
|
+
):
|
|
87
|
+
"""Test install_or_upgrade when Kueue is already up to date."""
|
|
88
|
+
mock_get_version.return_value = (0, "v0.99.9")
|
|
89
|
+
kueue_config = MagicMock(spec=KueueConfig)
|
|
90
|
+
|
|
91
|
+
result = self.kueue_manager.install_or_upgrade(kueue_config)
|
|
92
|
+
|
|
93
|
+
self.assertEqual(result, 0)
|
|
94
|
+
mock_get_version.assert_called_once()
|
|
95
|
+
mock_install.assert_not_called()
|
|
96
|
+
mock_configure.assert_not_called()
|
|
97
|
+
|
|
98
|
+
@patch(
|
|
99
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
|
|
100
|
+
)
|
|
101
|
+
def test_install_or_upgrade_when_outdated(
|
|
102
|
+
self,
|
|
103
|
+
mock_get_version,
|
|
104
|
+
):
|
|
105
|
+
"""Test install_or_upgrade when an older version of Kueue is installed."""
|
|
106
|
+
mock_get_version.return_value = (0, "v0.11.0")
|
|
107
|
+
kueue_config = MagicMock(spec=KueueConfig)
|
|
108
|
+
|
|
109
|
+
with (
|
|
110
|
+
patch.object(
|
|
111
|
+
self.kueue_manager, "_KueueManager__install", return_value=0
|
|
112
|
+
) as mock_install,
|
|
113
|
+
patch.object(
|
|
114
|
+
self.kueue_manager, "_KueueManager__configure", return_value=0
|
|
115
|
+
) as mock_configure,
|
|
116
|
+
):
|
|
117
|
+
result = self.kueue_manager.install_or_upgrade(kueue_config)
|
|
118
|
+
|
|
119
|
+
self.assertEqual(result, 0)
|
|
120
|
+
mock_get_version.assert_called_once()
|
|
121
|
+
mock_install.assert_called_once()
|
|
122
|
+
mock_configure.assert_called_once()
|
|
123
|
+
|
|
124
|
+
@patch(
|
|
125
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
|
|
126
|
+
)
|
|
127
|
+
def test_install_or_upgrade_when_not_installed(
|
|
128
|
+
self,
|
|
129
|
+
mock_get_version,
|
|
130
|
+
):
|
|
131
|
+
"""Test install_or_upgrade when Kueue is not installed."""
|
|
132
|
+
mock_get_version.return_value = (1, None)
|
|
133
|
+
kueue_config = MagicMock(spec=KueueConfig)
|
|
134
|
+
|
|
135
|
+
with (
|
|
136
|
+
patch.object(
|
|
137
|
+
self.kueue_manager, "_KueueManager__install", return_value=0
|
|
138
|
+
) as mock_install,
|
|
139
|
+
patch.object(
|
|
140
|
+
self.kueue_manager, "_KueueManager__configure", return_value=0
|
|
141
|
+
) as mock_configure,
|
|
142
|
+
):
|
|
143
|
+
result = self.kueue_manager.install_or_upgrade(kueue_config)
|
|
144
|
+
|
|
145
|
+
self.assertEqual(result, 0)
|
|
146
|
+
mock_get_version.assert_called_once()
|
|
147
|
+
mock_install.assert_called_once()
|
|
148
|
+
mock_configure.assert_called_once()
|
|
149
|
+
|
|
150
|
+
def test_installation_with_tolerations(self):
|
|
151
|
+
"""Test that tolerations are patched during installation."""
|
|
152
|
+
with (
|
|
153
|
+
patch(
|
|
154
|
+
"xpk.core.kueue_manager.run_command_with_updates_retry",
|
|
155
|
+
return_value=0,
|
|
156
|
+
) as mock_run_retry,
|
|
157
|
+
patch(
|
|
158
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version",
|
|
159
|
+
return_value=(1, None),
|
|
160
|
+
),
|
|
161
|
+
patch(
|
|
162
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__install_kueue_crs",
|
|
163
|
+
return_value=0,
|
|
164
|
+
),
|
|
165
|
+
patch(
|
|
166
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__wait_for_kueue_available",
|
|
167
|
+
return_value=0,
|
|
168
|
+
),
|
|
169
|
+
patch(
|
|
170
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__configure",
|
|
171
|
+
return_value=0,
|
|
172
|
+
),
|
|
173
|
+
):
|
|
174
|
+
tolerations = [
|
|
175
|
+
{"key": "test", "operator": "Exists", "effect": "NoSchedule"}
|
|
176
|
+
]
|
|
177
|
+
kueue_config = MagicMock(spec=KueueConfig)
|
|
178
|
+
|
|
179
|
+
result = self.kueue_manager.install_or_upgrade(
|
|
180
|
+
kueue_config, tolerations=tolerations
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
self.assertEqual(result, 0)
|
|
184
|
+
self.assertEqual(mock_run_retry.call_count, 1)
|
|
185
|
+
patch_call = mock_run_retry.call_args_list[0]
|
|
186
|
+
self.assertIn(
|
|
187
|
+
"kubectl patch deployment kueue-controller-manager -n kueue-system"
|
|
188
|
+
' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
|
|
189
|
+
' {"tolerations": [{"key": "test", "operator": "Exists", "effect":'
|
|
190
|
+
' "NoSchedule"}]}}}}',
|
|
191
|
+
patch_call[0][0],
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
def test_installation_without_tolerations(self):
|
|
195
|
+
"""Test that tolerations are not patched when not provided."""
|
|
196
|
+
with (
|
|
197
|
+
patch(
|
|
198
|
+
"xpk.core.kueue_manager.run_command_with_updates_retry",
|
|
199
|
+
return_value=0,
|
|
200
|
+
) as mock_run_retry,
|
|
201
|
+
patch(
|
|
202
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version",
|
|
203
|
+
return_value=(1, None),
|
|
204
|
+
),
|
|
205
|
+
patch(
|
|
206
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__install_kueue_crs",
|
|
207
|
+
return_value=0,
|
|
208
|
+
),
|
|
209
|
+
patch(
|
|
210
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__wait_for_kueue_available",
|
|
211
|
+
return_value=0,
|
|
212
|
+
),
|
|
213
|
+
patch(
|
|
214
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__configure",
|
|
215
|
+
return_value=0,
|
|
216
|
+
),
|
|
217
|
+
):
|
|
218
|
+
kueue_config = MagicMock(spec=KueueConfig)
|
|
219
|
+
|
|
220
|
+
result = self.kueue_manager.install_or_upgrade(
|
|
221
|
+
kueue_config, tolerations=None
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
self.assertEqual(result, 0)
|
|
225
|
+
self.assertEqual(mock_run_retry.call_count, 0)
|
|
226
|
+
|
|
227
|
+
@patch(
|
|
228
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
|
|
229
|
+
)
|
|
230
|
+
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__apply_manifest")
|
|
231
|
+
def test_configuration_updates_resources(
|
|
232
|
+
self, mock_apply_manifest, mock_get_version
|
|
233
|
+
):
|
|
234
|
+
"""Test that configuration updates Kueue resources."""
|
|
235
|
+
mock_get_version.return_value = (1, None) # Trigger install
|
|
236
|
+
mock_apply_manifest.return_value = 0
|
|
237
|
+
|
|
238
|
+
kueue_config = KueueConfig(
|
|
239
|
+
system=self.mock_system_chars,
|
|
240
|
+
total_chips=8,
|
|
241
|
+
cpu_limit=100,
|
|
242
|
+
memory_limit="100Gi",
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
with (
|
|
246
|
+
patch.object(
|
|
247
|
+
self.kueue_manager, "_KueueManager__install", return_value=0
|
|
248
|
+
),
|
|
249
|
+
patch.object(
|
|
250
|
+
self.kueue_manager,
|
|
251
|
+
"_KueueManager__update_kueue_resources_if_necessary",
|
|
252
|
+
return_value=0,
|
|
253
|
+
) as mock_update_resources,
|
|
254
|
+
):
|
|
255
|
+
self.kueue_manager.install_or_upgrade(kueue_config)
|
|
256
|
+
mock_apply_manifest.assert_called()
|
|
257
|
+
mock_update_resources.assert_called_once()
|
|
258
|
+
|
|
259
|
+
@patch("xpk.core.kueue_manager.run_command_with_updates_retry")
|
|
260
|
+
def test_resource_update_for_small_cluster(self, mock_run_retry):
|
|
261
|
+
"""Test resource update logic for a small cluster."""
|
|
262
|
+
mock_run_retry.return_value = 0
|
|
263
|
+
kueue_config = KueueConfig(
|
|
264
|
+
system=self.mock_system_chars,
|
|
265
|
+
total_chips=8,
|
|
266
|
+
cpu_limit=100,
|
|
267
|
+
memory_limit="100Gi",
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
with (
|
|
271
|
+
patch(
|
|
272
|
+
"xpk.core.kueue_manager.run_command_for_value",
|
|
273
|
+
return_value=(0, "100"), # 100 nodes
|
|
274
|
+
),
|
|
275
|
+
patch.object(
|
|
276
|
+
self.kueue_manager,
|
|
277
|
+
"_KueueManager__get_installed_kueue_version",
|
|
278
|
+
return_value=(1, None),
|
|
279
|
+
),
|
|
280
|
+
patch.object(
|
|
281
|
+
self.kueue_manager, "_KueueManager__install", return_value=0
|
|
282
|
+
),
|
|
283
|
+
patch.object(
|
|
284
|
+
self.kueue_manager, "_KueueManager__apply_manifest", return_value=0
|
|
285
|
+
),
|
|
286
|
+
):
|
|
287
|
+
result = self.kueue_manager.install_or_upgrade(kueue_config)
|
|
288
|
+
|
|
289
|
+
self.assertEqual(result, 0)
|
|
290
|
+
mock_run_retry.assert_called_once()
|
|
291
|
+
patch_call = mock_run_retry.call_args_list[0]
|
|
292
|
+
# 100 * 1.2 = 120, which is less than 4096. So it should be 4096.
|
|
293
|
+
self.assertIn(
|
|
294
|
+
"kubectl patch deployment kueue-controller-manager -n kueue-system"
|
|
295
|
+
' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
|
|
296
|
+
' {"containers": [{"name": "manager", "resources": {"limits":'
|
|
297
|
+
' {"memory": "4096Mi"}}}]}}}}\'',
|
|
298
|
+
patch_call[0][0],
|
|
299
|
+
)
|
|
300
|
+
|
|
301
|
+
@patch("xpk.core.kueue_manager.run_command_with_updates_retry")
|
|
302
|
+
def test_resource_update_for_large_cluster(self, mock_run_retry):
|
|
303
|
+
"""Test resource update logic for a large cluster."""
|
|
304
|
+
mock_run_retry.return_value = 0
|
|
305
|
+
kueue_config = KueueConfig(
|
|
306
|
+
system=self.mock_system_chars,
|
|
307
|
+
total_chips=8,
|
|
308
|
+
cpu_limit=100,
|
|
309
|
+
memory_limit="100Gi",
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
with (
|
|
313
|
+
patch(
|
|
314
|
+
"xpk.core.kueue_manager.run_command_for_value",
|
|
315
|
+
return_value=(0, "5000"), # 5000 nodes
|
|
316
|
+
),
|
|
317
|
+
patch.object(
|
|
318
|
+
self.kueue_manager,
|
|
319
|
+
"_KueueManager__get_installed_kueue_version",
|
|
320
|
+
return_value=(1, None),
|
|
321
|
+
),
|
|
322
|
+
patch.object(
|
|
323
|
+
self.kueue_manager, "_KueueManager__install", return_value=0
|
|
324
|
+
),
|
|
325
|
+
patch.object(
|
|
326
|
+
self.kueue_manager, "_KueueManager__apply_manifest", return_value=0
|
|
327
|
+
),
|
|
328
|
+
):
|
|
329
|
+
result = self.kueue_manager.install_or_upgrade(kueue_config)
|
|
330
|
+
|
|
331
|
+
self.assertEqual(result, 0)
|
|
332
|
+
mock_run_retry.assert_called_once()
|
|
333
|
+
patch_call = mock_run_retry.call_args_list[0]
|
|
334
|
+
# 5000 * 1.2 = 6000, which is > 4096.
|
|
335
|
+
self.assertIn(
|
|
336
|
+
"kubectl patch deployment kueue-controller-manager -n kueue-system"
|
|
337
|
+
' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
|
|
338
|
+
' {"containers": [{"name": "manager", "resources": {"limits":'
|
|
339
|
+
' {"memory": "6000Mi"}}}]}}}}\'',
|
|
340
|
+
patch_call[0][0],
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
|
|
344
|
+
@patch(
|
|
345
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
|
|
346
|
+
)
|
|
347
|
+
def test_configure_generates_correct_manifest(
|
|
348
|
+
self, mock_update_resources, mock_install
|
|
349
|
+
):
|
|
350
|
+
"""Test that __configure generates the correct manifest content for TPUs."""
|
|
351
|
+
mock_install.return_value = 0
|
|
352
|
+
mock_update_resources.return_value = 0
|
|
353
|
+
kueue_config = KueueConfig(
|
|
354
|
+
system=self.mock_system_chars,
|
|
355
|
+
total_chips=8,
|
|
356
|
+
cpu_limit=100,
|
|
357
|
+
memory_limit="100Gi",
|
|
358
|
+
autoprovisioning_enabled=False,
|
|
359
|
+
num_slices=2,
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
rendered_manifest = self._trigger_installation(kueue_config)
|
|
363
|
+
|
|
364
|
+
self.assertNotIn("kind: Topology", rendered_manifest)
|
|
365
|
+
manifest_docs = list(yaml.safe_load_all(rendered_manifest))
|
|
366
|
+
cluster_queue = _first(
|
|
367
|
+
doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
|
|
368
|
+
)
|
|
369
|
+
self.assertEqual(
|
|
370
|
+
cluster_queue["spec"]["resourceGroups"][0]["flavors"][0]["name"],
|
|
371
|
+
"2xv5p-8",
|
|
372
|
+
)
|
|
373
|
+
resources = cluster_queue["spec"]["resourceGroups"][0]["flavors"][0][
|
|
374
|
+
"resources"
|
|
375
|
+
]
|
|
376
|
+
tpu_resource = _first(r for r in resources if r["name"] == "google.com/tpu")
|
|
377
|
+
cpu_resource = _first(r for r in resources if r["name"] == "cpu")
|
|
378
|
+
memory_resource = _first(r for r in resources if r["name"] == "memory")
|
|
379
|
+
self.assertEqual(tpu_resource["nominalQuota"], 8)
|
|
380
|
+
self.assertEqual(cpu_resource["nominalQuota"], 100)
|
|
381
|
+
self.assertEqual(memory_resource["nominalQuota"], "100Gi")
|
|
382
|
+
resource_flavor = _first(
|
|
383
|
+
doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
|
|
384
|
+
)
|
|
385
|
+
self.assertEqual(
|
|
386
|
+
resource_flavor["spec"]["nodeLabels"][
|
|
387
|
+
"cloud.google.com/gke-tpu-accelerator"
|
|
388
|
+
],
|
|
389
|
+
"test-accelerator",
|
|
390
|
+
)
|
|
391
|
+
self.assertEqual(
|
|
392
|
+
resource_flavor["spec"]["nodeLabels"][
|
|
393
|
+
"cloud.google.com/gke-tpu-topology"
|
|
394
|
+
],
|
|
395
|
+
"2x2x1",
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
|
|
399
|
+
@patch(
|
|
400
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
|
|
401
|
+
)
|
|
402
|
+
def test_configure_generates_manifest_with_admission_checks_for_flex_single_slice(
|
|
403
|
+
self, mock_update_resources, mock_install
|
|
404
|
+
):
|
|
405
|
+
"""Test that __configure generates the correct manifest with admission checks."""
|
|
406
|
+
mock_install.return_value = 0
|
|
407
|
+
mock_update_resources.return_value = 0
|
|
408
|
+
kueue_config = KueueConfig(
|
|
409
|
+
system=self.mock_system_chars,
|
|
410
|
+
total_chips=8,
|
|
411
|
+
cpu_limit=100,
|
|
412
|
+
memory_limit="100Gi",
|
|
413
|
+
autoprovisioning_enabled=False,
|
|
414
|
+
num_slices=1,
|
|
415
|
+
flex=True,
|
|
416
|
+
)
|
|
417
|
+
|
|
418
|
+
rendered_manifest = self._trigger_installation(kueue_config)
|
|
419
|
+
|
|
420
|
+
self.assertNotIn("kind: Topology", rendered_manifest)
|
|
421
|
+
manifest_docs = list(yaml.safe_load_all(rendered_manifest))
|
|
422
|
+
cluster_queue = _first(
|
|
423
|
+
doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
|
|
424
|
+
)
|
|
425
|
+
self.assertEqual(
|
|
426
|
+
cluster_queue["spec"]["resourceGroups"][0]["flavors"][0]["name"],
|
|
427
|
+
"1xv5p-8",
|
|
428
|
+
)
|
|
429
|
+
self.assertEqual(cluster_queue["spec"]["admissionChecks"][0], "dws-prov")
|
|
430
|
+
|
|
431
|
+
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
|
|
432
|
+
@patch(
|
|
433
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
|
|
434
|
+
)
|
|
435
|
+
def test_configure_generates_correct_manifest_with_topology(
|
|
436
|
+
self, mock_update_resources, mock_install
|
|
437
|
+
):
|
|
438
|
+
"""Test that __configure generates correct manifest for GPUs."""
|
|
439
|
+
mock_install.return_value = 0
|
|
440
|
+
mock_update_resources.return_value = 0
|
|
441
|
+
kueue_config = KueueConfig(
|
|
442
|
+
system=self.mock_system_chars_gpu,
|
|
443
|
+
total_chips=16,
|
|
444
|
+
cpu_limit=100,
|
|
445
|
+
memory_limit="100Gi",
|
|
446
|
+
num_slices=2,
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
rendered_manifest = self._trigger_installation(kueue_config)
|
|
450
|
+
|
|
451
|
+
self.assertIn("kind: Topology", rendered_manifest)
|
|
452
|
+
manifest_docs = list(yaml.safe_load_all(rendered_manifest))
|
|
453
|
+
resource_flavor = _first(
|
|
454
|
+
doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
|
|
455
|
+
)
|
|
456
|
+
self.assertEqual(
|
|
457
|
+
resource_flavor["spec"]["nodeLabels"][
|
|
458
|
+
"cloud.google.com/gke-accelerator"
|
|
459
|
+
],
|
|
460
|
+
"h100-mega-80gb-8",
|
|
461
|
+
)
|
|
462
|
+
|
|
463
|
+
@patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
|
|
464
|
+
@patch(
|
|
465
|
+
"xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
|
|
466
|
+
)
|
|
467
|
+
def test_configure_generates_correct_manifest_with_pathways(
|
|
468
|
+
self, mock_update_resources, mock_install
|
|
469
|
+
):
|
|
470
|
+
"""Test that __configure generates the correct manifest with pathways enabled."""
|
|
471
|
+
mock_install.return_value = 0
|
|
472
|
+
mock_update_resources.return_value = 0
|
|
473
|
+
kueue_config = KueueConfig(
|
|
474
|
+
system=self.mock_system_chars,
|
|
475
|
+
total_chips=8,
|
|
476
|
+
cpu_limit=100,
|
|
477
|
+
memory_limit="100Gi",
|
|
478
|
+
is_pathways_cluster=True,
|
|
479
|
+
num_slices=2,
|
|
480
|
+
)
|
|
481
|
+
|
|
482
|
+
rendered_manifest = self._trigger_installation(kueue_config)
|
|
483
|
+
manifest_docs = list(yaml.safe_load_all(rendered_manifest))
|
|
484
|
+
|
|
485
|
+
# Check for the new "cpu-user" ResourceFlavor
|
|
486
|
+
cpu_user_flavor = _first(
|
|
487
|
+
doc
|
|
488
|
+
for doc in manifest_docs
|
|
489
|
+
if doc["kind"] == "ResourceFlavor"
|
|
490
|
+
and doc["metadata"]["name"] == "cpu-user"
|
|
491
|
+
)
|
|
492
|
+
self.assertEqual(
|
|
493
|
+
cpu_user_flavor["spec"]["nodeLabels"]["cloud.google.com/gke-nodepool"],
|
|
494
|
+
"cpu-np",
|
|
495
|
+
)
|
|
496
|
+
|
|
497
|
+
# Check that the ClusterQueue has the new resource group for pathways
|
|
498
|
+
cluster_queue = _first(
|
|
499
|
+
doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
|
|
500
|
+
)
|
|
501
|
+
self.assertEqual(len(cluster_queue["spec"]["resourceGroups"]), 2)
|
|
502
|
+
pathways_rg = cluster_queue["spec"]["resourceGroups"][1]
|
|
503
|
+
self.assertEqual(pathways_rg["coveredResources"], ["cpu", "memory"])
|
|
504
|
+
self.assertEqual(pathways_rg["flavors"][0]["name"], "cpu-user")
|
|
505
|
+
self.assertEqual(
|
|
506
|
+
pathways_rg["flavors"][0]["resources"][0]["nominalQuota"], 480
|
|
507
|
+
)
|
|
508
|
+
self.assertEqual(
|
|
509
|
+
pathways_rg["flavors"][0]["resources"][1]["nominalQuota"], "2000G"
|
|
510
|
+
)
|
|
511
|
+
|
|
512
|
+
def _trigger_installation(self, kueue_config: KueueConfig) -> str:
|
|
513
|
+
"""Calls Kueue installation and returns the rendered manifest."""
|
|
514
|
+
with (
|
|
515
|
+
patch.object(
|
|
516
|
+
self.kueue_manager, "_KueueManager__get_installed_kueue_version"
|
|
517
|
+
) as mock_get_version,
|
|
518
|
+
patch.object(
|
|
519
|
+
self.kueue_manager, "_KueueManager__apply_manifest"
|
|
520
|
+
) as mock_apply_manifest,
|
|
521
|
+
):
|
|
522
|
+
mock_apply_manifest.return_value = 0
|
|
523
|
+
mock_get_version.return_value = (1, None)
|
|
524
|
+
self.kueue_manager.install_or_upgrade(kueue_config)
|
|
525
|
+
|
|
526
|
+
mock_apply_manifest.assert_called_once()
|
|
527
|
+
manifest = mock_apply_manifest.call_args[0][0]
|
|
528
|
+
assert isinstance(manifest, str)
|
|
529
|
+
return manifest
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
T = TypeVar("T")
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def _first(generator: Generator[T, None, None]) -> T:
|
|
536
|
+
result = next(generator, None)
|
|
537
|
+
assert result is not None
|
|
538
|
+
return result
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
if __name__ == "__main__":
|
|
542
|
+
unittest.main()
|
xpk/core/monitoring.py
CHANGED
xpk/core/nap.py
CHANGED
|
@@ -27,7 +27,7 @@ from .capacity import (
|
|
|
27
27
|
verify_reservation_exists,
|
|
28
28
|
)
|
|
29
29
|
from .commands import run_command_with_updates, run_commands
|
|
30
|
-
from .gcloud_context import zone_to_region
|
|
30
|
+
from .gcloud_context import get_cluster_location
|
|
31
31
|
from .nodepool import get_all_nodepools_programmatic
|
|
32
32
|
from .resources import (
|
|
33
33
|
CLUSTER_METADATA_CONFIGMAP,
|
|
@@ -98,13 +98,12 @@ def enable_autoprovisioning_on_cluster(
|
|
|
98
98
|
|
|
99
99
|
command = (
|
|
100
100
|
'gcloud container clusters update'
|
|
101
|
-
f' {args.cluster} --project={args.project}'
|
|
102
|
-
f' --region={zone_to_region(args.zone)} --enable-autoprovisioning'
|
|
101
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-autoprovisioning'
|
|
103
102
|
' --autoprovisioning-config-file'
|
|
104
103
|
f' {autoprovisioning_config.config_filename}'
|
|
105
104
|
)
|
|
106
105
|
task = 'Update cluster with autoprovisioning enabled'
|
|
107
|
-
return_code = run_command_with_updates(command, task, args)
|
|
106
|
+
return_code = run_command_with_updates(command, task)
|
|
108
107
|
if return_code != 0:
|
|
109
108
|
xpk_print(f'{task} request returned ERROR {return_code}')
|
|
110
109
|
return autoprovisioning_config, return_code
|
|
@@ -112,11 +111,11 @@ def enable_autoprovisioning_on_cluster(
|
|
|
112
111
|
command = (
|
|
113
112
|
'gcloud container clusters update'
|
|
114
113
|
f' {args.cluster} --project={args.project}'
|
|
115
|
-
f' --region={zone_to_region(args.zone)}'
|
|
114
|
+
f' --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
116
115
|
' --autoscaling-profile=optimize-utilization'
|
|
117
116
|
)
|
|
118
117
|
task = 'Update cluster with autoscaling-profile'
|
|
119
|
-
return_code = run_command_with_updates(command, task, args)
|
|
118
|
+
return_code = run_command_with_updates(command, task)
|
|
120
119
|
if return_code != 0:
|
|
121
120
|
xpk_print(f'{task} request returned ERROR {return_code}')
|
|
122
121
|
return autoprovisioning_config, return_code
|
|
@@ -138,11 +137,8 @@ def enable_autoprovisioning_on_cluster(
|
|
|
138
137
|
# Ignore node pools that are not created yet, and not of the accelerator type.
|
|
139
138
|
continue
|
|
140
139
|
commands.append(
|
|
141
|
-
f'gcloud container node-pools update {node_pool_name}'
|
|
142
|
-
f' --cluster {args.cluster}'
|
|
143
|
-
f' --project={args.project}'
|
|
144
|
-
f' --region={zone_to_region(args.zone)}'
|
|
145
|
-
' --enable-autoprovisioning'
|
|
140
|
+
f'gcloud container node-pools update {node_pool_name} --cluster'
|
|
141
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-autoprovisioning'
|
|
146
142
|
' --enable-autoscaling'
|
|
147
143
|
)
|
|
148
144
|
task_name = (
|
|
@@ -156,7 +152,6 @@ def enable_autoprovisioning_on_cluster(
|
|
|
156
152
|
commands,
|
|
157
153
|
'Update node pools with autoprovisioning support',
|
|
158
154
|
task_names,
|
|
159
|
-
dry_run=args.dry_run,
|
|
160
155
|
)
|
|
161
156
|
if max_return_code != 0:
|
|
162
157
|
xpk_print(
|
|
@@ -250,7 +245,7 @@ def create_autoprovisioning_config(
|
|
|
250
245
|
zones=f'- {args.zone}',
|
|
251
246
|
)
|
|
252
247
|
autoprovisioning_config = AutoprovisioningConfig(
|
|
253
|
-
config_filename=write_tmp_file(yml_string).name,
|
|
248
|
+
config_filename=write_tmp_file(yml_string),
|
|
254
249
|
minimum_chips=minimum,
|
|
255
250
|
maximum_chips=maximum,
|
|
256
251
|
)
|
|
@@ -272,7 +267,7 @@ def is_autoprovisioning_enabled(
|
|
|
272
267
|
"""
|
|
273
268
|
|
|
274
269
|
resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
|
|
275
|
-
cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
|
|
270
|
+
cluster_config_map = get_cluster_configmap(resources_configmap_name)
|
|
276
271
|
|
|
277
272
|
if cluster_config_map is None:
|
|
278
273
|
xpk_print(
|
|
@@ -325,7 +320,7 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
|
|
|
325
320
|
if capacity_type_str == CapacityType.UNKNOWN.name:
|
|
326
321
|
# Use default settings from cluster creation.
|
|
327
322
|
metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
|
|
328
|
-
cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
|
|
323
|
+
cluster_config_map = get_cluster_configmap(metadata_configmap_name)
|
|
329
324
|
|
|
330
325
|
# Error out if the metadata config map doesn't exist, and is attempting to use
|
|
331
326
|
# autoprovisioning.
|
|
@@ -369,7 +364,7 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
|
|
|
369
364
|
|
|
370
365
|
def get_cluster_provisioner(args) -> str:
|
|
371
366
|
metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
|
|
372
|
-
cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
|
|
367
|
+
cluster_config_map = get_cluster_configmap(metadata_configmap_name)
|
|
373
368
|
cluster_provisioner = 'gcloud'
|
|
374
369
|
if not cluster_config_map is None:
|
|
375
370
|
provisioner = cluster_config_map.get('provisioner')
|