xpk 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. xpk/commands/cluster.py +57 -22
  2. xpk/commands/cluster_gcluster_test.py +2 -2
  3. xpk/commands/cluster_test.py +197 -25
  4. xpk/commands/inspector.py +20 -7
  5. xpk/commands/kind.py +1 -1
  6. xpk/commands/workload.py +43 -4
  7. xpk/commands/workload_test.py +88 -5
  8. xpk/core/blueprint/blueprint_definitions.py +16 -1
  9. xpk/core/blueprint/blueprint_generator.py +11 -11
  10. xpk/core/capacity.py +17 -0
  11. xpk/core/capacity_test.py +50 -0
  12. xpk/core/config.py +1 -1
  13. xpk/core/docker_container.py +4 -4
  14. xpk/core/docker_resources.py +11 -11
  15. xpk/core/kjob.py +3 -5
  16. xpk/core/kueue_manager.py +21 -10
  17. xpk/core/kueue_manager_test.py +379 -536
  18. xpk/core/nap.py +1 -1
  19. xpk/core/nodepool.py +9 -9
  20. xpk/core/nodepool_test.py +4 -4
  21. xpk/core/pathways.py +1 -1
  22. xpk/core/resources.py +1 -1
  23. xpk/core/scheduling.py +7 -13
  24. xpk/core/system_characteristics.py +42 -35
  25. xpk/core/system_characteristics_test.py +3 -3
  26. xpk/core/testing/__init__.py +15 -0
  27. xpk/core/testing/commands_tester.py +131 -0
  28. xpk/core/testing/commands_tester_test.py +129 -0
  29. xpk/core/updates.py +57 -0
  30. xpk/core/updates_test.py +80 -0
  31. xpk/main.py +7 -4
  32. xpk/parser/common.py +8 -0
  33. xpk/utils/execution_context.py +20 -2
  34. {xpk-0.14.2.dist-info → xpk-0.14.4.dist-info}/METADATA +1 -3
  35. {xpk-0.14.2.dist-info → xpk-0.14.4.dist-info}/RECORD +39 -33
  36. {xpk-0.14.2.dist-info → xpk-0.14.4.dist-info}/WHEEL +0 -0
  37. {xpk-0.14.2.dist-info → xpk-0.14.4.dist-info}/entry_points.txt +0 -0
  38. {xpk-0.14.2.dist-info → xpk-0.14.4.dist-info}/licenses/LICENSE +0 -0
  39. {xpk-0.14.2.dist-info → xpk-0.14.4.dist-info}/top_level.txt +0 -0
@@ -14,551 +14,398 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ import dataclasses
17
18
  from typing import Generator, TypeVar
18
- import unittest
19
+ import pytest
20
+ from pytest_mock import MockerFixture
19
21
  import yaml
20
22
  from unittest.mock import MagicMock, patch
21
23
 
22
- from xpk.core.kueue_manager import KueueConfig, KueueManager
24
+ from xpk.core.kueue_manager import KueueConfig, KueueManager, has_sub_slicing_enabled
23
25
  from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
26
+ from xpk.core.testing.commands_tester import CommandsTester
27
+ from packaging.version import Version
28
+
29
+ TPU_SYSTEM: SystemCharacteristics = SystemCharacteristics(
30
+ topology="2x2x1",
31
+ vms_per_slice=1,
32
+ gke_accelerator="test-accelerator",
33
+ gce_machine_type="test-machine",
34
+ chips_per_vm=4,
35
+ accelerator_type=AcceleratorType.TPU,
36
+ device_type="v5p-8",
37
+ supports_sub_slicing=False,
38
+ )
39
+
40
+ KUEUE_CONFIG: KueueConfig = KueueConfig(
41
+ system=TPU_SYSTEM,
42
+ total_chips=8,
43
+ cpu_limit=100,
44
+ memory_limit="100Gi",
45
+ configure_sub_slicing=False,
46
+ )
47
+
48
+
49
+ def set_installed_kueue_version(
50
+ commands_tester: CommandsTester, version: Version | None
51
+ ):
52
+ result = (
53
+ (1, "")
54
+ if version is None
55
+ else (0, f"registry.k8s.io/kueue/kueue:v{version}")
56
+ )
57
+ commands_tester.set_result_for_command(
58
+ result,
59
+ "kubectl get deployment kueue-controller-manager",
60
+ "containers[0].image",
61
+ )
62
+
63
+
64
+ @pytest.fixture(autouse=True)
65
+ def mock_commands(mocker: MockerFixture) -> CommandsTester:
66
+ return CommandsTester(
67
+ mocker,
68
+ run_command_for_value_path="xpk.core.kueue_manager.run_command_for_value",
69
+ run_command_with_updates_path=(
70
+ "xpk.core.kueue_manager.run_command_with_updates"
71
+ ),
72
+ run_command_with_updates_retry_path=(
73
+ "xpk.core.kueue_manager.run_command_with_updates_retry"
74
+ ),
75
+ )
76
+
77
+
78
+ @pytest.fixture(autouse=True)
79
+ @patch("jinja2.Environment", return_value=MagicMock())
80
+ def kueue_manager(mock_env: MagicMock) -> KueueManager:
81
+ return KueueManager()
82
+
83
+
84
+ def test_install_or_upgrade_when_newer_version_already_installed(
85
+ mock_commands: CommandsTester, kueue_manager: KueueManager
86
+ ):
87
+ """Test install_or_upgrade when Kueue is already up to date."""
88
+ set_installed_kueue_version(mock_commands, Version("0.99.0"))
89
+
90
+ result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
91
+
92
+ assert result == 0
93
+ mock_commands.assert_command_not_run("kubectl apply")
94
+
95
+
96
+ def test_install_or_upgrade_when_outdated(
97
+ mock_commands: CommandsTester, kueue_manager: KueueManager
98
+ ):
99
+ """Test install_or_upgrade when an older version of Kueue is installed."""
100
+ set_installed_kueue_version(mock_commands, Version("0.11.0"))
101
+
102
+ result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
103
+
104
+ assert result == 0
105
+ mock_commands.assert_command_run("kubectl apply", "v0.12.2/manifests.yaml")
106
+ mock_commands.assert_command_run("kubectl apply -f", "/tmp/")
107
+
108
+
109
+ def test_install_or_upgrade_when_not_installed(
110
+ mock_commands: CommandsTester, kueue_manager: KueueManager
111
+ ):
112
+ """Test install_or_upgrade when Kueue is not installed."""
113
+ set_installed_kueue_version(mock_commands, None)
114
+
115
+ result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
116
+
117
+ assert result == 0
118
+ mock_commands.assert_command_run("kubectl apply", "v0.12.2/manifests.yaml")
119
+ mock_commands.assert_command_run("kubectl apply -f", "/tmp/")
120
+
121
+
122
+ def test_installation_with_tolerations(
123
+ mock_commands: CommandsTester, kueue_manager: KueueManager
124
+ ):
125
+ """Test that tolerations are patched during installation."""
126
+ set_installed_kueue_version(mock_commands, None)
127
+ tolerations = [{"key": "test", "operator": "Exists", "effect": "NoSchedule"}]
128
+
129
+ result = kueue_manager.install_or_upgrade(
130
+ KUEUE_CONFIG, tolerations=tolerations
131
+ )
132
+
133
+ assert result == 0
134
+ mock_commands.assert_command_run(
135
+ "kubectl patch deployment kueue-controller-manager -n kueue-system"
136
+ ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
137
+ ' {"tolerations": [{"key": "test", "operator": "Exists", "effect":'
138
+ ' "NoSchedule"}]}}}}\''
139
+ )
140
+
141
+
142
+ def test_installation_without_tolerations(
143
+ mock_commands: CommandsTester, kueue_manager: KueueManager
144
+ ):
145
+ """Test that tolerations are not patched when not provided."""
146
+ set_installed_kueue_version(mock_commands, None)
147
+
148
+ result = kueue_manager.install_or_upgrade(KUEUE_CONFIG, tolerations=None)
149
+
150
+ assert result == 0
151
+ mock_commands.assert_command_not_run(
152
+ "kubectl patch deployment kueue-controller-manager", "tolerations"
153
+ )
154
+
155
+
156
+ def test_resource_update_for_small_cluster(
157
+ mock_commands: CommandsTester, kueue_manager: KueueManager
158
+ ):
159
+ """Test resource update logic for a small cluster."""
160
+ set_installed_kueue_version(mock_commands, None)
161
+ mock_commands.set_result_for_command((0, "100"), "kubectl get node")
162
+
163
+ result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
164
+
165
+ assert result == 0
166
+ # 100 * 1.2 = 120, which is less than 4096. So it should be 4096.
167
+ mock_commands.assert_command_run(
168
+ "kubectl patch deployment kueue-controller-manager -n kueue-system"
169
+ ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
170
+ ' {"containers": [{"name": "manager", "resources": {"limits":'
171
+ ' {"memory": "4096Mi"}}}]}}}}\'',
172
+ )
173
+
174
+
175
+ def test_resource_update_for_large_cluster(
176
+ mock_commands: CommandsTester, kueue_manager: KueueManager
177
+ ):
178
+ """Test resource update logic for a large cluster."""
179
+ set_installed_kueue_version(mock_commands, None)
180
+ mock_commands.set_result_for_command((0, "5000"), "kubectl get node")
181
+
182
+ result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
183
+
184
+ assert result == 0
185
+ # 5000 * 1.2 = 6000, which is > 4096.
186
+ mock_commands.assert_command_run(
187
+ "kubectl patch deployment kueue-controller-manager -n kueue-system"
188
+ ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
189
+ ' {"containers": [{"name": "manager", "resources": {"limits":'
190
+ ' {"memory": "6000Mi"}}}]}}}}\'',
191
+ )
192
+
24
193
 
194
+ @patch("xpk.core.kueue_manager.write_tmp_file")
195
+ def test_configure_generates_correct_manifest_for_tpu(
196
+ write_tmp_file_mock: MagicMock,
197
+ mock_commands: CommandsTester,
198
+ kueue_manager: KueueManager,
199
+ ):
200
+ """Test that __configure generates the correct manifest content for TPUs."""
201
+ set_installed_kueue_version(mock_commands, None)
202
+ tpu_kueue_config = dataclasses.replace(
203
+ KUEUE_CONFIG, system=TPU_SYSTEM, num_slices=2
204
+ )
205
+
206
+ kueue_manager.install_or_upgrade(tpu_kueue_config)
25
207
 
26
- class KueueManagerTest(unittest.TestCase):
27
- """Unit tests for the KueueManager class."""
28
-
29
- def setUp(self):
30
- """Set up test environment."""
31
- self.mock_system_chars_gpu = SystemCharacteristics(
32
- topology="2x2x1",
33
- vms_per_slice=1,
34
- gke_accelerator="h100-mega-80gb-8",
35
- gce_machine_type="a3-megagpu-8g",
36
- chips_per_vm=8,
37
- accelerator_type=AcceleratorType["GPU"],
38
- device_type="h100-mega-80gb-8",
39
- supports_sub_slicing=False,
40
- )
41
-
42
- self.mock_system_chars = SystemCharacteristics(
43
- topology="2x2x1",
44
- vms_per_slice=1,
45
- gke_accelerator="test-accelerator",
46
- gce_machine_type="test-machine",
47
- chips_per_vm=4,
48
- accelerator_type=AcceleratorType["TPU"],
49
- device_type="v5p-8",
50
- supports_sub_slicing=False,
51
- )
52
- mock_env = MagicMock()
53
-
54
- with patch("jinja2.Environment", return_value=mock_env):
55
- self.kueue_manager = KueueManager()
56
-
57
- @patch("xpk.core.kueue_manager.run_command_for_value")
58
- def test_version_check_when_kueue_not_installed(self, mock_run_for_value):
59
- mock_run_for_value.return_value = (
60
- 0,
61
- "Kueue deployment does not exist error message",
62
- )
63
- kueue_config = MagicMock(spec=KueueConfig)
64
-
65
- with (
66
- patch.object(
67
- self.kueue_manager, "_KueueManager__install", return_value=0
68
- ) as mock_install,
69
- patch.object(
70
- self.kueue_manager, "_KueueManager__configure", return_value=0
71
- ) as mock_configure,
72
- ):
73
- result = self.kueue_manager.install_or_upgrade(kueue_config)
74
-
75
- self.assertEqual(result, 0)
76
- mock_install.assert_called_once()
77
- mock_configure.assert_called_once()
78
-
79
- @patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version")
80
- @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
81
- @patch("xpk.core.kueue_manager.KueueManager._KueueManager__configure")
82
- def test_install_or_upgrade_when_newer_version_already_installed(
83
- self, mock_configure, mock_install, mock_get_version
84
- ):
85
- """Test install_or_upgrade when Kueue is already up to date."""
86
- mock_get_version.return_value = (0, "v0.12.3")
87
- kueue_config = MagicMock(spec=KueueConfig)
88
-
89
- result = self.kueue_manager.install_or_upgrade(kueue_config)
90
-
91
- self.assertEqual(result, 0)
92
- mock_get_version.assert_called_once()
93
- mock_install.assert_not_called()
94
- mock_configure.assert_not_called()
95
-
96
- @patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version")
97
- def test_install_or_upgrade_when_outdated(
98
- self,
99
- mock_get_version,
100
- ):
101
- """Test install_or_upgrade when an older version of Kueue is installed."""
102
- mock_get_version.return_value = (0, "v0.11.0")
103
- kueue_config = MagicMock(spec=KueueConfig)
104
-
105
- with (
106
- patch.object(
107
- self.kueue_manager, "_KueueManager__install", return_value=0
108
- ) as mock_install,
109
- patch.object(
110
- self.kueue_manager, "_KueueManager__configure", return_value=0
111
- ) as mock_configure,
112
- ):
113
- result = self.kueue_manager.install_or_upgrade(kueue_config)
114
-
115
- self.assertEqual(result, 0)
116
- mock_get_version.assert_called_once()
117
- mock_install.assert_called_once()
118
- mock_configure.assert_called_once()
119
-
120
- @patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version")
121
- def test_install_or_upgrade_when_not_installed(
122
- self,
123
- mock_get_version,
124
- ):
125
- """Test install_or_upgrade when Kueue is not installed."""
126
- mock_get_version.return_value = (1, None)
127
- kueue_config = MagicMock(spec=KueueConfig)
128
-
129
- with (
130
- patch.object(
131
- self.kueue_manager, "_KueueManager__install", return_value=0
132
- ) as mock_install,
133
- patch.object(
134
- self.kueue_manager, "_KueueManager__configure", return_value=0
135
- ) as mock_configure,
136
- ):
137
- result = self.kueue_manager.install_or_upgrade(kueue_config)
138
-
139
- self.assertEqual(result, 0)
140
- mock_get_version.assert_called_once()
141
- mock_install.assert_called_once()
142
- mock_configure.assert_called_once()
143
-
144
- def test_installation_with_tolerations(self):
145
- """Test that tolerations are patched during installation."""
146
- with (
147
- patch(
148
- "xpk.core.kueue_manager.run_command_with_updates_retry",
149
- return_value=0,
150
- ) as mock_run_retry,
151
- patch(
152
- "xpk.core.kueue_manager.KueueManager.get_installed_kueue_version",
153
- return_value=(1, None),
154
- ),
155
- patch(
156
- "xpk.core.kueue_manager.KueueManager._KueueManager__install_kueue_crs",
157
- return_value=0,
158
- ),
159
- patch(
160
- "xpk.core.kueue_manager.KueueManager._KueueManager__wait_for_kueue_available",
161
- return_value=0,
162
- ),
163
- patch(
164
- "xpk.core.kueue_manager.KueueManager._KueueManager__configure",
165
- return_value=0,
166
- ),
167
- ):
168
- tolerations = [
169
- {"key": "test", "operator": "Exists", "effect": "NoSchedule"}
208
+ rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
209
+ assert "kind: Topology" not in rendered_manifest
210
+ manifest_docs = list(yaml.safe_load_all(rendered_manifest))
211
+ cluster_queue = _first(
212
+ doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
213
+ )
214
+ assert (
215
+ cluster_queue["spec"]["resourceGroups"][0]["flavors"][0]["name"]
216
+ == "2xv5p-8"
217
+ )
218
+ resources = cluster_queue["spec"]["resourceGroups"][0]["flavors"][0][
219
+ "resources"
220
+ ]
221
+ tpu_resource = _first(r for r in resources if r["name"] == "google.com/tpu")
222
+ cpu_resource = _first(r for r in resources if r["name"] == "cpu")
223
+ memory_resource = _first(r for r in resources if r["name"] == "memory")
224
+ assert tpu_resource["nominalQuota"] == 8
225
+ assert cpu_resource["nominalQuota"] == 100
226
+ assert memory_resource["nominalQuota"] == "100Gi"
227
+ resource_flavor = _first(
228
+ doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
229
+ )
230
+ assert (
231
+ resource_flavor["spec"]["nodeLabels"][
232
+ "cloud.google.com/gke-tpu-accelerator"
170
233
  ]
171
- kueue_config = MagicMock(spec=KueueConfig)
172
-
173
- result = self.kueue_manager.install_or_upgrade(
174
- kueue_config, tolerations=tolerations
175
- )
176
-
177
- self.assertEqual(result, 0)
178
- self.assertEqual(mock_run_retry.call_count, 1)
179
- patch_call = mock_run_retry.call_args_list[0]
180
- self.assertIn(
181
- "kubectl patch deployment kueue-controller-manager -n kueue-system"
182
- ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
183
- ' {"tolerations": [{"key": "test", "operator": "Exists", "effect":'
184
- ' "NoSchedule"}]}}}}',
185
- patch_call[0][0],
186
- )
187
-
188
- def test_installation_without_tolerations(self):
189
- """Test that tolerations are not patched when not provided."""
190
- with (
191
- patch(
192
- "xpk.core.kueue_manager.run_command_with_updates_retry",
193
- return_value=0,
194
- ) as mock_run_retry,
195
- patch(
196
- "xpk.core.kueue_manager.KueueManager.get_installed_kueue_version",
197
- return_value=(1, None),
198
- ),
199
- patch(
200
- "xpk.core.kueue_manager.KueueManager._KueueManager__install_kueue_crs",
201
- return_value=0,
202
- ),
203
- patch(
204
- "xpk.core.kueue_manager.KueueManager._KueueManager__wait_for_kueue_available",
205
- return_value=0,
206
- ),
207
- patch(
208
- "xpk.core.kueue_manager.KueueManager._KueueManager__configure",
209
- return_value=0,
210
- ),
211
- ):
212
- kueue_config = MagicMock(spec=KueueConfig)
213
-
214
- result = self.kueue_manager.install_or_upgrade(
215
- kueue_config, tolerations=None
216
- )
217
-
218
- self.assertEqual(result, 0)
219
- self.assertEqual(mock_run_retry.call_count, 0)
220
-
221
- @patch("xpk.core.kueue_manager.KueueManager.get_installed_kueue_version")
222
- @patch("xpk.core.kueue_manager.KueueManager._KueueManager__apply_manifest")
223
- def test_configuration_updates_resources(
224
- self, mock_apply_manifest, mock_get_version
225
- ):
226
- """Test that configuration updates Kueue resources."""
227
- mock_get_version.return_value = (1, None) # Trigger install
228
- mock_apply_manifest.return_value = 0
229
-
230
- kueue_config = KueueConfig(
231
- system=self.mock_system_chars,
232
- total_chips=8,
233
- cpu_limit=100,
234
- memory_limit="100Gi",
235
- configure_sub_slicing=False,
236
- )
237
-
238
- with (
239
- patch.object(
240
- self.kueue_manager, "_KueueManager__install", return_value=0
241
- ),
242
- patch.object(
243
- self.kueue_manager,
244
- "_KueueManager__update_kueue_resources_if_necessary",
245
- return_value=0,
246
- ) as mock_update_resources,
247
- ):
248
- self.kueue_manager.install_or_upgrade(kueue_config)
249
- mock_apply_manifest.assert_called()
250
- mock_update_resources.assert_called_once()
251
-
252
- @patch("xpk.core.kueue_manager.run_command_with_updates_retry")
253
- def test_resource_update_for_small_cluster(self, mock_run_retry):
254
- """Test resource update logic for a small cluster."""
255
- mock_run_retry.return_value = 0
256
- kueue_config = KueueConfig(
257
- system=self.mock_system_chars,
258
- total_chips=8,
259
- cpu_limit=100,
260
- memory_limit="100Gi",
261
- configure_sub_slicing=False,
262
- )
263
-
264
- with (
265
- patch(
266
- "xpk.core.kueue_manager.run_command_for_value",
267
- return_value=(0, "100"), # 100 nodes
268
- ),
269
- patch.object(
270
- self.kueue_manager,
271
- "get_installed_kueue_version",
272
- return_value=(1, None),
273
- ),
274
- patch.object(
275
- self.kueue_manager, "_KueueManager__install", return_value=0
276
- ),
277
- patch.object(
278
- self.kueue_manager, "_KueueManager__apply_manifest", return_value=0
279
- ),
280
- ):
281
- result = self.kueue_manager.install_or_upgrade(kueue_config)
282
-
283
- self.assertEqual(result, 0)
284
- mock_run_retry.assert_called_once()
285
- patch_call = mock_run_retry.call_args_list[0]
286
- # 100 * 1.2 = 120, which is less than 4096. So it should be 4096.
287
- self.assertIn(
288
- "kubectl patch deployment kueue-controller-manager -n kueue-system"
289
- ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
290
- ' {"containers": [{"name": "manager", "resources": {"limits":'
291
- ' {"memory": "4096Mi"}}}]}}}}\'',
292
- patch_call[0][0],
293
- )
294
-
295
- @patch("xpk.core.kueue_manager.run_command_with_updates_retry")
296
- def test_resource_update_for_large_cluster(self, mock_run_retry):
297
- """Test resource update logic for a large cluster."""
298
- mock_run_retry.return_value = 0
299
- kueue_config = KueueConfig(
300
- system=self.mock_system_chars,
301
- total_chips=8,
302
- cpu_limit=100,
303
- memory_limit="100Gi",
304
- configure_sub_slicing=False,
305
- )
306
-
307
- with (
308
- patch(
309
- "xpk.core.kueue_manager.run_command_for_value",
310
- return_value=(0, "5000"), # 5000 nodes
311
- ),
312
- patch.object(
313
- self.kueue_manager,
314
- "get_installed_kueue_version",
315
- return_value=(1, None),
316
- ),
317
- patch.object(
318
- self.kueue_manager, "_KueueManager__install", return_value=0
319
- ),
320
- patch.object(
321
- self.kueue_manager, "_KueueManager__apply_manifest", return_value=0
322
- ),
323
- ):
324
- result = self.kueue_manager.install_or_upgrade(kueue_config)
325
-
326
- self.assertEqual(result, 0)
327
- mock_run_retry.assert_called_once()
328
- patch_call = mock_run_retry.call_args_list[0]
329
- # 5000 * 1.2 = 6000, which is > 4096.
330
- self.assertIn(
331
- "kubectl patch deployment kueue-controller-manager -n kueue-system"
332
- ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
333
- ' {"containers": [{"name": "manager", "resources": {"limits":'
334
- ' {"memory": "6000Mi"}}}]}}}}\'',
335
- patch_call[0][0],
336
- )
337
-
338
- @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
339
- @patch(
340
- "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
234
+ == "test-accelerator"
341
235
  )
342
- def test_configure_generates_correct_manifest_for_tpu(
343
- self, mock_update_resources, mock_install
344
- ):
345
- """Test that __configure generates the correct manifest content for TPUs."""
346
- mock_install.return_value = 0
347
- mock_update_resources.return_value = 0
348
- kueue_config = KueueConfig(
349
- system=self.mock_system_chars,
350
- total_chips=8,
351
- cpu_limit=100,
352
- memory_limit="100Gi",
353
- autoprovisioning_enabled=False,
354
- num_slices=2,
355
- configure_sub_slicing=False,
356
- )
357
-
358
- rendered_manifest = self._trigger_installation(kueue_config)
359
-
360
- self.assertNotIn("kind: Topology", rendered_manifest)
361
- manifest_docs = list(yaml.safe_load_all(rendered_manifest))
362
- cluster_queue = _first(
363
- doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
364
- )
365
- self.assertEqual(
366
- cluster_queue["spec"]["resourceGroups"][0]["flavors"][0]["name"],
367
- "2xv5p-8",
368
- )
369
- resources = cluster_queue["spec"]["resourceGroups"][0]["flavors"][0][
370
- "resources"
371
- ]
372
- tpu_resource = _first(r for r in resources if r["name"] == "google.com/tpu")
373
- cpu_resource = _first(r for r in resources if r["name"] == "cpu")
374
- memory_resource = _first(r for r in resources if r["name"] == "memory")
375
- self.assertEqual(tpu_resource["nominalQuota"], 8)
376
- self.assertEqual(cpu_resource["nominalQuota"], 100)
377
- self.assertEqual(memory_resource["nominalQuota"], "100Gi")
378
- resource_flavor = _first(
379
- doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
380
- )
381
- self.assertEqual(
382
- resource_flavor["spec"]["nodeLabels"][
383
- "cloud.google.com/gke-tpu-accelerator"
384
- ],
385
- "test-accelerator",
386
- )
387
- self.assertEqual(
388
- resource_flavor["spec"]["nodeLabels"][
389
- "cloud.google.com/gke-tpu-topology"
390
- ],
391
- "2x2x1",
392
- )
393
-
394
- @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
395
- @patch(
396
- "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
236
+ assert (
237
+ resource_flavor["spec"]["nodeLabels"]["cloud.google.com/gke-tpu-topology"]
238
+ == "2x2x1"
397
239
  )
398
- def test_configure_generates_manifest_with_admission_checks_for_flex_single_slice(
399
- self, mock_update_resources, mock_install
400
- ):
401
- """Test that __configure generates the correct manifest with admission checks."""
402
- mock_install.return_value = 0
403
- mock_update_resources.return_value = 0
404
- kueue_config = KueueConfig(
405
- system=self.mock_system_chars,
406
- total_chips=8,
407
- cpu_limit=100,
408
- memory_limit="100Gi",
409
- autoprovisioning_enabled=False,
410
- num_slices=1,
411
- flex=True,
412
- configure_sub_slicing=False,
413
- )
414
-
415
- rendered_manifest = self._trigger_installation(kueue_config)
416
-
417
- self.assertNotIn("kind: Topology", rendered_manifest)
418
- manifest_docs = list(yaml.safe_load_all(rendered_manifest))
419
- cluster_queue = _first(
420
- doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
421
- )
422
- self.assertEqual(
423
- cluster_queue["spec"]["resourceGroups"][0]["flavors"][0]["name"],
424
- "1xv5p-8",
425
- )
426
- self.assertEqual(cluster_queue["spec"]["admissionChecks"][0], "dws-prov")
427
-
428
- @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
429
- @patch(
430
- "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
240
+
241
+
242
+ @patch("xpk.core.kueue_manager.write_tmp_file")
243
+ def test_configure_generates_manifest_with_admission_checks_for_flex_single_slice(
244
+ write_tmp_file_mock: MagicMock,
245
+ mock_commands: CommandsTester,
246
+ kueue_manager: KueueManager,
247
+ ):
248
+ """Test that __configure generates the correct manifest with admission checks."""
249
+ set_installed_kueue_version(mock_commands, None)
250
+ kueue_config = dataclasses.replace(
251
+ KUEUE_CONFIG,
252
+ num_slices=1,
253
+ flex=True,
431
254
  )
432
- def test_configure_generates_correct_manifest_with_gke_default_topology(
433
- self, mock_update_resources, mock_install
434
- ):
435
- """Test that __configure generates correct manifest for GPUs."""
436
- mock_install.return_value = 0
437
- mock_update_resources.return_value = 0
438
- kueue_config = KueueConfig(
439
- system=self.mock_system_chars_gpu,
440
- total_chips=16,
441
- cpu_limit=100,
442
- memory_limit="100Gi",
443
- num_slices=2,
444
- configure_sub_slicing=False,
445
- )
446
-
447
- rendered_manifest = self._trigger_installation(kueue_config)
448
-
449
- manifest_docs = list(yaml.safe_load_all(rendered_manifest))
450
- resource_flavor = _first(
451
- doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
452
- )
453
- self.assertEqual(
454
- resource_flavor["spec"]["nodeLabels"][
455
- "cloud.google.com/gke-accelerator"
456
- ],
457
- "h100-mega-80gb-8",
458
- )
459
- self.assertEqual(resource_flavor["spec"]["topologyName"], "gke-default")
460
- topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
461
- self.assertEqual(topology["metadata"]["name"], "gke-default")
462
-
463
- @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
464
- @patch(
465
- "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
255
+
256
+ kueue_manager.install_or_upgrade(kueue_config)
257
+
258
+ rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
259
+ assert "kind: Topology" not in rendered_manifest
260
+ manifest_docs = list(yaml.safe_load_all(rendered_manifest))
261
+ cluster_queue = _first(
262
+ doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
466
263
  )
467
- def test_configure_generates_correct_manifest_with_sub_slicing(
468
- self, mock_update_resources, mock_install
469
- ):
470
- """Test that __configure generates correct manifest with sub-slicing topology."""
471
- mock_install.return_value = 0
472
- mock_update_resources.return_value = 0
473
- kueue_config = KueueConfig(
474
- system=self.mock_system_chars,
475
- total_chips=16,
476
- cpu_limit=100,
477
- memory_limit="100Gi",
478
- num_slices=2,
479
- configure_sub_slicing=True,
480
- )
481
-
482
- rendered_manifest = self._trigger_installation(kueue_config)
483
-
484
- manifest_docs = list(yaml.safe_load_all(rendered_manifest))
485
- resource_flavor = _first(
486
- doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
487
- )
488
- self.assertEqual(
489
- resource_flavor["spec"]["topologyName"], "sub-slice-topology"
490
- )
491
- topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
492
- self.assertEqual(topology["metadata"]["name"], "sub-slice-topology")
493
-
494
- @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
495
- @patch(
496
- "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
264
+ assert (
265
+ cluster_queue["spec"]["resourceGroups"][0]["flavors"][0]["name"]
266
+ == "1xv5p-8"
267
+ )
268
+ assert cluster_queue["spec"]["admissionChecks"][0] == "dws-prov"
269
+
270
+
271
+ @patch("xpk.core.kueue_manager.write_tmp_file")
272
+ def test_configure_generates_correct_manifest_with_gke_default_topology(
273
+ write_tmp_file_mock: MagicMock,
274
+ mock_commands: CommandsTester,
275
+ kueue_manager: KueueManager,
276
+ ):
277
+ """Test that __configure generates correct manifest for GPUs."""
278
+ set_installed_kueue_version(mock_commands, None)
279
+ kueue_config = dataclasses.replace(
280
+ KUEUE_CONFIG,
281
+ system=SystemCharacteristics(
282
+ topology="2x2x1",
283
+ vms_per_slice=1,
284
+ gke_accelerator="h100-mega-80gb-8",
285
+ gce_machine_type="a3-megagpu-8g",
286
+ chips_per_vm=8,
287
+ accelerator_type=AcceleratorType.GPU,
288
+ device_type="h100-mega-80gb-8",
289
+ supports_sub_slicing=False,
290
+ ),
497
291
  )
498
- def test_configure_generates_correct_manifest_with_pathways(
499
- self, mock_update_resources, mock_install
500
- ):
501
- """Test that __configure generates the correct manifest with pathways enabled."""
502
- mock_install.return_value = 0
503
- mock_update_resources.return_value = 0
504
- kueue_config = KueueConfig(
505
- system=self.mock_system_chars,
506
- total_chips=8,
507
- cpu_limit=100,
508
- memory_limit="100Gi",
509
- is_pathways_cluster=True,
510
- num_slices=2,
511
- configure_sub_slicing=False,
512
- )
513
-
514
- rendered_manifest = self._trigger_installation(kueue_config)
515
- manifest_docs = list(yaml.safe_load_all(rendered_manifest))
516
-
517
- # Check for the new "cpu-user" ResourceFlavor
518
- cpu_user_flavor = _first(
519
- doc
520
- for doc in manifest_docs
521
- if doc["kind"] == "ResourceFlavor"
522
- and doc["metadata"]["name"] == "cpu-user"
523
- )
524
- self.assertEqual(
525
- cpu_user_flavor["spec"]["nodeLabels"]["cloud.google.com/gke-nodepool"],
526
- "cpu-np",
527
- )
528
-
529
- # Check that the ClusterQueue has the new resource group for pathways
530
- cluster_queue = _first(
531
- doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
532
- )
533
- self.assertEqual(len(cluster_queue["spec"]["resourceGroups"]), 2)
534
- pathways_rg = cluster_queue["spec"]["resourceGroups"][1]
535
- self.assertEqual(pathways_rg["coveredResources"], ["cpu", "memory"])
536
- self.assertEqual(pathways_rg["flavors"][0]["name"], "cpu-user")
537
- self.assertEqual(
538
- pathways_rg["flavors"][0]["resources"][0]["nominalQuota"], 480
539
- )
540
- self.assertEqual(
541
- pathways_rg["flavors"][0]["resources"][1]["nominalQuota"], "2000G"
542
- )
543
-
544
- def _trigger_installation(self, kueue_config: KueueConfig) -> str:
545
- """Calls Kueue installation and returns the rendered manifest."""
546
- with (
547
- patch.object(
548
- self.kueue_manager, "get_installed_kueue_version"
549
- ) as mock_get_version,
550
- patch.object(
551
- self.kueue_manager, "_KueueManager__apply_manifest"
552
- ) as mock_apply_manifest,
553
- ):
554
- mock_apply_manifest.return_value = 0
555
- mock_get_version.return_value = (1, None)
556
- self.kueue_manager.install_or_upgrade(kueue_config)
557
-
558
- mock_apply_manifest.assert_called_once()
559
- manifest = mock_apply_manifest.call_args[0][0]
560
- assert isinstance(manifest, str)
561
- return manifest
292
+
293
+ kueue_manager.install_or_upgrade(kueue_config)
294
+
295
+ rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
296
+ manifest_docs = list(yaml.safe_load_all(rendered_manifest))
297
+ resource_flavor = _first(
298
+ doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
299
+ )
300
+ assert (
301
+ resource_flavor["spec"]["nodeLabels"]["cloud.google.com/gke-accelerator"]
302
+ == "h100-mega-80gb-8"
303
+ )
304
+ assert resource_flavor["spec"]["topologyName"] == "gke-default"
305
+ topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
306
+ assert topology["metadata"]["name"] == "gke-default"
307
+
308
+
309
+ @patch("xpk.core.kueue_manager.write_tmp_file")
310
+ def test_configure_generates_correct_manifest_with_sub_slicing(
311
+ write_tmp_file_mock: MagicMock,
312
+ mock_commands: CommandsTester,
313
+ kueue_manager: KueueManager,
314
+ ):
315
+ """Test that __configure generates correct manifest with sub-slicing topology."""
316
+ set_installed_kueue_version(mock_commands, None)
317
+ kueue_config = dataclasses.replace(
318
+ KUEUE_CONFIG,
319
+ configure_sub_slicing=True,
320
+ )
321
+
322
+ kueue_manager.install_or_upgrade(kueue_config)
323
+
324
+ rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
325
+ manifest_docs = list(yaml.safe_load_all(rendered_manifest))
326
+ resource_flavor = _first(
327
+ doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
328
+ )
329
+ assert resource_flavor["spec"]["topologyName"] == "sub-slice-topology"
330
+ topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
331
+ assert topology["metadata"]["name"] == "sub-slice-topology"
332
+
333
+
334
+ @patch("xpk.core.kueue_manager.write_tmp_file")
335
+ def test_configure_generates_correct_manifest_with_pathways(
336
+ write_tmp_file_mock: MagicMock,
337
+ mock_commands: CommandsTester,
338
+ kueue_manager: KueueManager,
339
+ ):
340
+ """Test that __configure generates the correct manifest with pathways enabled."""
341
+ set_installed_kueue_version(mock_commands, None)
342
+ kueue_config = dataclasses.replace(
343
+ KUEUE_CONFIG,
344
+ is_pathways_cluster=True,
345
+ )
346
+
347
+ kueue_manager.install_or_upgrade(kueue_config)
348
+
349
+ rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
350
+ manifest_docs = list(yaml.safe_load_all(rendered_manifest))
351
+
352
+ # Check for the new "cpu-user" ResourceFlavor
353
+ cpu_user_flavor = _first(
354
+ doc
355
+ for doc in manifest_docs
356
+ if doc["kind"] == "ResourceFlavor"
357
+ and doc["metadata"]["name"] == "cpu-user"
358
+ )
359
+ assert (
360
+ cpu_user_flavor["spec"]["nodeLabels"]["cloud.google.com/gke-nodepool"]
361
+ == "cpu-np"
362
+ )
363
+
364
+ # Check that the ClusterQueue has the new resource group for pathways
365
+ cluster_queue = _first(
366
+ doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
367
+ )
368
+ assert len(cluster_queue["spec"]["resourceGroups"]) == 2
369
+ pathways_rg = cluster_queue["spec"]["resourceGroups"][1]
370
+ assert pathways_rg["coveredResources"] == ["cpu", "memory"]
371
+ assert pathways_rg["flavors"][0]["name"] == "cpu-user"
372
+ assert pathways_rg["flavors"][0]["resources"][0]["nominalQuota"] == 480
373
+ assert pathways_rg["flavors"][0]["resources"][1]["nominalQuota"] == "2000G"
374
+
375
+
376
+ def test_has_sub_slicing_enabled_returns_exit_code_when_command_fails(
377
+ mock_commands: CommandsTester,
378
+ ):
379
+ mock_commands.set_result_for_command((1, ""), "kubectl get topology")
380
+
381
+ return_code, result = has_sub_slicing_enabled()
382
+
383
+ assert return_code == 1
384
+ assert result is None
385
+
386
+
387
+ def test_has_sub_slicing_enabled_returns_false_when_sub_slicing_topology_is_not_present(
388
+ mock_commands: CommandsTester,
389
+ ):
390
+ mock_commands.set_result_for_command((0, ""), "kubectl get topology")
391
+
392
+ return_code, result = has_sub_slicing_enabled()
393
+
394
+ assert return_code == 0
395
+ assert result is False
396
+
397
+
398
+ def test_has_sub_slicing_enabled_returns_true_when_sub_slicing_topology_is_not_present(
399
+ mock_commands: CommandsTester,
400
+ ):
401
+ mock_commands.set_result_for_command(
402
+ (0, "sub-slice-topology"), "kubectl get topology"
403
+ )
404
+
405
+ return_code, result = has_sub_slicing_enabled()
406
+
407
+ assert return_code == 0
408
+ assert result is True
562
409
 
563
410
 
564
411
  T = TypeVar("T")
@@ -568,7 +415,3 @@ def _first(generator: Generator[T, None, None]) -> T:
568
415
  result = next(generator, None)
569
416
  assert result is not None
570
417
  return result
571
-
572
-
573
- if __name__ == "__main__":
574
- unittest.main()