xpk-0.13.0-py3-none-any.whl → xpk-0.14.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. xpk/commands/batch.py +9 -2
  2. xpk/commands/cluster.py +128 -115
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +10 -28
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +21 -10
  8. xpk/commands/job.py +25 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +21 -0
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +43 -22
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +91 -194
  20. xpk/core/cluster_private.py +6 -11
  21. xpk/core/commands.py +11 -18
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +3 -4
  24. xpk/core/gcloud_context.py +26 -2
  25. xpk/core/gcloud_context_test.py +96 -0
  26. xpk/core/gcluster_manager.py +0 -3
  27. xpk/core/jobset.py +4 -7
  28. xpk/core/kjob.py +14 -27
  29. xpk/core/kueue_manager.py +383 -0
  30. xpk/core/kueue_manager_test.py +542 -0
  31. xpk/core/monitoring.py +1 -1
  32. xpk/core/nap.py +10 -15
  33. xpk/core/network.py +17 -18
  34. xpk/core/nodepool.py +66 -77
  35. xpk/core/nodepool_test.py +198 -1
  36. xpk/core/pathways.py +5 -5
  37. xpk/core/ray.py +10 -14
  38. xpk/core/resources.py +6 -11
  39. xpk/core/scheduling.py +19 -1
  40. xpk/core/scheduling_test.py +31 -0
  41. xpk/core/system_characteristics.py +335 -229
  42. xpk/core/vertex.py +1 -1
  43. xpk/core/workload.py +7 -8
  44. xpk/main.py +2 -4
  45. xpk/parser/cluster.py +7 -0
  46. xpk/parser/cluster_test.py +66 -0
  47. xpk/parser/common.py +11 -0
  48. xpk/parser/workload.py +62 -25
  49. xpk/parser/workload_test.py +82 -0
  50. xpk/utils/feature_flags.py +28 -0
  51. xpk/utils/kueue.py +20 -0
  52. xpk/utils/templates.py +2 -0
  53. xpk/utils/topology.py +37 -0
  54. xpk/utils/topology_test.py +43 -0
  55. xpk/utils/validation.py +79 -55
  56. xpk/utils/validation_test.py +37 -0
  57. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  58. xpk-0.14.0.dist-info/RECORD +112 -0
  59. xpk/core/kueue.py +0 -561
  60. xpk-0.13.0.dist-info/RECORD +0 -101
  61. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  62. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  63. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  64. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/kueue_manager_test.py ADDED
@@ -0,0 +1,542 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from typing import Generator, TypeVar
18
+ import unittest
19
+ import yaml
20
+ from unittest.mock import MagicMock, patch
21
+
22
+ from xpk.core.kueue_manager import KueueConfig, KueueManager
23
+ from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
24
+
25
+
26
+ class KueueManagerTest(unittest.TestCase):
27
+ """Unit tests for the KueueManager class."""
28
+
29
+ def setUp(self):
30
+ """Set up test environment."""
31
+ self.mock_system_chars_gpu = SystemCharacteristics(
32
+ topology="2x2x1",
33
+ vms_per_slice=1,
34
+ gke_accelerator="h100-mega-80gb-8",
35
+ gce_machine_type="a3-megagpu-8g",
36
+ chips_per_vm=8,
37
+ accelerator_type=AcceleratorType["GPU"],
38
+ device_type="h100-mega-80gb-8",
39
+ supports_sub_slicing=False,
40
+ )
41
+
42
+ self.mock_system_chars = SystemCharacteristics(
43
+ topology="2x2x1",
44
+ vms_per_slice=1,
45
+ gke_accelerator="test-accelerator",
46
+ gce_machine_type="test-machine",
47
+ chips_per_vm=4,
48
+ accelerator_type=AcceleratorType["TPU"],
49
+ device_type="v5p-8",
50
+ supports_sub_slicing=False,
51
+ )
52
+ mock_env = MagicMock()
53
+
54
+ with patch("jinja2.Environment", return_value=mock_env):
55
+ self.kueue_manager = KueueManager()
56
+
57
+ @patch("xpk.core.kueue_manager.run_command_for_value")
58
+ def test_version_check_when_kueue_not_installed(self, mock_run_for_value):
59
+ mock_run_for_value.return_value = (
60
+ 0,
61
+ "Kueue deployment does not exist error message",
62
+ )
63
+ kueue_config = MagicMock(spec=KueueConfig)
64
+
65
+ with (
66
+ patch.object(
67
+ self.kueue_manager, "_KueueManager__install", return_value=0
68
+ ) as mock_install,
69
+ patch.object(
70
+ self.kueue_manager, "_KueueManager__configure", return_value=0
71
+ ) as mock_configure,
72
+ ):
73
+ result = self.kueue_manager.install_or_upgrade(kueue_config)
74
+
75
+ self.assertEqual(result, 0)
76
+ mock_install.assert_called_once()
77
+ mock_configure.assert_called_once()
78
+
79
+ @patch(
80
+ "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
81
+ )
82
+ @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
83
+ @patch("xpk.core.kueue_manager.KueueManager._KueueManager__configure")
84
+ def test_install_or_upgrade_when_newer_version_already_installed(
85
+ self, mock_configure, mock_install, mock_get_version
86
+ ):
87
+ """Test install_or_upgrade when Kueue is already up to date."""
88
+ mock_get_version.return_value = (0, "v0.99.9")
89
+ kueue_config = MagicMock(spec=KueueConfig)
90
+
91
+ result = self.kueue_manager.install_or_upgrade(kueue_config)
92
+
93
+ self.assertEqual(result, 0)
94
+ mock_get_version.assert_called_once()
95
+ mock_install.assert_not_called()
96
+ mock_configure.assert_not_called()
97
+
98
+ @patch(
99
+ "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
100
+ )
101
+ def test_install_or_upgrade_when_outdated(
102
+ self,
103
+ mock_get_version,
104
+ ):
105
+ """Test install_or_upgrade when an older version of Kueue is installed."""
106
+ mock_get_version.return_value = (0, "v0.11.0")
107
+ kueue_config = MagicMock(spec=KueueConfig)
108
+
109
+ with (
110
+ patch.object(
111
+ self.kueue_manager, "_KueueManager__install", return_value=0
112
+ ) as mock_install,
113
+ patch.object(
114
+ self.kueue_manager, "_KueueManager__configure", return_value=0
115
+ ) as mock_configure,
116
+ ):
117
+ result = self.kueue_manager.install_or_upgrade(kueue_config)
118
+
119
+ self.assertEqual(result, 0)
120
+ mock_get_version.assert_called_once()
121
+ mock_install.assert_called_once()
122
+ mock_configure.assert_called_once()
123
+
124
+ @patch(
125
+ "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
126
+ )
127
+ def test_install_or_upgrade_when_not_installed(
128
+ self,
129
+ mock_get_version,
130
+ ):
131
+ """Test install_or_upgrade when Kueue is not installed."""
132
+ mock_get_version.return_value = (1, None)
133
+ kueue_config = MagicMock(spec=KueueConfig)
134
+
135
+ with (
136
+ patch.object(
137
+ self.kueue_manager, "_KueueManager__install", return_value=0
138
+ ) as mock_install,
139
+ patch.object(
140
+ self.kueue_manager, "_KueueManager__configure", return_value=0
141
+ ) as mock_configure,
142
+ ):
143
+ result = self.kueue_manager.install_or_upgrade(kueue_config)
144
+
145
+ self.assertEqual(result, 0)
146
+ mock_get_version.assert_called_once()
147
+ mock_install.assert_called_once()
148
+ mock_configure.assert_called_once()
149
+
150
+ def test_installation_with_tolerations(self):
151
+ """Test that tolerations are patched during installation."""
152
+ with (
153
+ patch(
154
+ "xpk.core.kueue_manager.run_command_with_updates_retry",
155
+ return_value=0,
156
+ ) as mock_run_retry,
157
+ patch(
158
+ "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version",
159
+ return_value=(1, None),
160
+ ),
161
+ patch(
162
+ "xpk.core.kueue_manager.KueueManager._KueueManager__install_kueue_crs",
163
+ return_value=0,
164
+ ),
165
+ patch(
166
+ "xpk.core.kueue_manager.KueueManager._KueueManager__wait_for_kueue_available",
167
+ return_value=0,
168
+ ),
169
+ patch(
170
+ "xpk.core.kueue_manager.KueueManager._KueueManager__configure",
171
+ return_value=0,
172
+ ),
173
+ ):
174
+ tolerations = [
175
+ {"key": "test", "operator": "Exists", "effect": "NoSchedule"}
176
+ ]
177
+ kueue_config = MagicMock(spec=KueueConfig)
178
+
179
+ result = self.kueue_manager.install_or_upgrade(
180
+ kueue_config, tolerations=tolerations
181
+ )
182
+
183
+ self.assertEqual(result, 0)
184
+ self.assertEqual(mock_run_retry.call_count, 1)
185
+ patch_call = mock_run_retry.call_args_list[0]
186
+ self.assertIn(
187
+ "kubectl patch deployment kueue-controller-manager -n kueue-system"
188
+ ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
189
+ ' {"tolerations": [{"key": "test", "operator": "Exists", "effect":'
190
+ ' "NoSchedule"}]}}}}',
191
+ patch_call[0][0],
192
+ )
193
+
194
+ def test_installation_without_tolerations(self):
195
+ """Test that tolerations are not patched when not provided."""
196
+ with (
197
+ patch(
198
+ "xpk.core.kueue_manager.run_command_with_updates_retry",
199
+ return_value=0,
200
+ ) as mock_run_retry,
201
+ patch(
202
+ "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version",
203
+ return_value=(1, None),
204
+ ),
205
+ patch(
206
+ "xpk.core.kueue_manager.KueueManager._KueueManager__install_kueue_crs",
207
+ return_value=0,
208
+ ),
209
+ patch(
210
+ "xpk.core.kueue_manager.KueueManager._KueueManager__wait_for_kueue_available",
211
+ return_value=0,
212
+ ),
213
+ patch(
214
+ "xpk.core.kueue_manager.KueueManager._KueueManager__configure",
215
+ return_value=0,
216
+ ),
217
+ ):
218
+ kueue_config = MagicMock(spec=KueueConfig)
219
+
220
+ result = self.kueue_manager.install_or_upgrade(
221
+ kueue_config, tolerations=None
222
+ )
223
+
224
+ self.assertEqual(result, 0)
225
+ self.assertEqual(mock_run_retry.call_count, 0)
226
+
227
+ @patch(
228
+ "xpk.core.kueue_manager.KueueManager._KueueManager__get_installed_kueue_version"
229
+ )
230
+ @patch("xpk.core.kueue_manager.KueueManager._KueueManager__apply_manifest")
231
+ def test_configuration_updates_resources(
232
+ self, mock_apply_manifest, mock_get_version
233
+ ):
234
+ """Test that configuration updates Kueue resources."""
235
+ mock_get_version.return_value = (1, None) # Trigger install
236
+ mock_apply_manifest.return_value = 0
237
+
238
+ kueue_config = KueueConfig(
239
+ system=self.mock_system_chars,
240
+ total_chips=8,
241
+ cpu_limit=100,
242
+ memory_limit="100Gi",
243
+ )
244
+
245
+ with (
246
+ patch.object(
247
+ self.kueue_manager, "_KueueManager__install", return_value=0
248
+ ),
249
+ patch.object(
250
+ self.kueue_manager,
251
+ "_KueueManager__update_kueue_resources_if_necessary",
252
+ return_value=0,
253
+ ) as mock_update_resources,
254
+ ):
255
+ self.kueue_manager.install_or_upgrade(kueue_config)
256
+ mock_apply_manifest.assert_called()
257
+ mock_update_resources.assert_called_once()
258
+
259
+ @patch("xpk.core.kueue_manager.run_command_with_updates_retry")
260
+ def test_resource_update_for_small_cluster(self, mock_run_retry):
261
+ """Test resource update logic for a small cluster."""
262
+ mock_run_retry.return_value = 0
263
+ kueue_config = KueueConfig(
264
+ system=self.mock_system_chars,
265
+ total_chips=8,
266
+ cpu_limit=100,
267
+ memory_limit="100Gi",
268
+ )
269
+
270
+ with (
271
+ patch(
272
+ "xpk.core.kueue_manager.run_command_for_value",
273
+ return_value=(0, "100"), # 100 nodes
274
+ ),
275
+ patch.object(
276
+ self.kueue_manager,
277
+ "_KueueManager__get_installed_kueue_version",
278
+ return_value=(1, None),
279
+ ),
280
+ patch.object(
281
+ self.kueue_manager, "_KueueManager__install", return_value=0
282
+ ),
283
+ patch.object(
284
+ self.kueue_manager, "_KueueManager__apply_manifest", return_value=0
285
+ ),
286
+ ):
287
+ result = self.kueue_manager.install_or_upgrade(kueue_config)
288
+
289
+ self.assertEqual(result, 0)
290
+ mock_run_retry.assert_called_once()
291
+ patch_call = mock_run_retry.call_args_list[0]
292
+ # 100 * 1.2 = 120, which is less than 4096. So it should be 4096.
293
+ self.assertIn(
294
+ "kubectl patch deployment kueue-controller-manager -n kueue-system"
295
+ ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
296
+ ' {"containers": [{"name": "manager", "resources": {"limits":'
297
+ ' {"memory": "4096Mi"}}}]}}}}\'',
298
+ patch_call[0][0],
299
+ )
300
+
301
+ @patch("xpk.core.kueue_manager.run_command_with_updates_retry")
302
+ def test_resource_update_for_large_cluster(self, mock_run_retry):
303
+ """Test resource update logic for a large cluster."""
304
+ mock_run_retry.return_value = 0
305
+ kueue_config = KueueConfig(
306
+ system=self.mock_system_chars,
307
+ total_chips=8,
308
+ cpu_limit=100,
309
+ memory_limit="100Gi",
310
+ )
311
+
312
+ with (
313
+ patch(
314
+ "xpk.core.kueue_manager.run_command_for_value",
315
+ return_value=(0, "5000"), # 5000 nodes
316
+ ),
317
+ patch.object(
318
+ self.kueue_manager,
319
+ "_KueueManager__get_installed_kueue_version",
320
+ return_value=(1, None),
321
+ ),
322
+ patch.object(
323
+ self.kueue_manager, "_KueueManager__install", return_value=0
324
+ ),
325
+ patch.object(
326
+ self.kueue_manager, "_KueueManager__apply_manifest", return_value=0
327
+ ),
328
+ ):
329
+ result = self.kueue_manager.install_or_upgrade(kueue_config)
330
+
331
+ self.assertEqual(result, 0)
332
+ mock_run_retry.assert_called_once()
333
+ patch_call = mock_run_retry.call_args_list[0]
334
+ # 5000 * 1.2 = 6000, which is > 4096.
335
+ self.assertIn(
336
+ "kubectl patch deployment kueue-controller-manager -n kueue-system"
337
+ ' --type=\'strategic\' --patch=\'{"spec": {"template": {"spec":'
338
+ ' {"containers": [{"name": "manager", "resources": {"limits":'
339
+ ' {"memory": "6000Mi"}}}]}}}}\'',
340
+ patch_call[0][0],
341
+ )
342
+
343
+ @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
344
+ @patch(
345
+ "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
346
+ )
347
+ def test_configure_generates_correct_manifest(
348
+ self, mock_update_resources, mock_install
349
+ ):
350
+ """Test that __configure generates the correct manifest content for TPUs."""
351
+ mock_install.return_value = 0
352
+ mock_update_resources.return_value = 0
353
+ kueue_config = KueueConfig(
354
+ system=self.mock_system_chars,
355
+ total_chips=8,
356
+ cpu_limit=100,
357
+ memory_limit="100Gi",
358
+ autoprovisioning_enabled=False,
359
+ num_slices=2,
360
+ )
361
+
362
+ rendered_manifest = self._trigger_installation(kueue_config)
363
+
364
+ self.assertNotIn("kind: Topology", rendered_manifest)
365
+ manifest_docs = list(yaml.safe_load_all(rendered_manifest))
366
+ cluster_queue = _first(
367
+ doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
368
+ )
369
+ self.assertEqual(
370
+ cluster_queue["spec"]["resourceGroups"][0]["flavors"][0]["name"],
371
+ "2xv5p-8",
372
+ )
373
+ resources = cluster_queue["spec"]["resourceGroups"][0]["flavors"][0][
374
+ "resources"
375
+ ]
376
+ tpu_resource = _first(r for r in resources if r["name"] == "google.com/tpu")
377
+ cpu_resource = _first(r for r in resources if r["name"] == "cpu")
378
+ memory_resource = _first(r for r in resources if r["name"] == "memory")
379
+ self.assertEqual(tpu_resource["nominalQuota"], 8)
380
+ self.assertEqual(cpu_resource["nominalQuota"], 100)
381
+ self.assertEqual(memory_resource["nominalQuota"], "100Gi")
382
+ resource_flavor = _first(
383
+ doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
384
+ )
385
+ self.assertEqual(
386
+ resource_flavor["spec"]["nodeLabels"][
387
+ "cloud.google.com/gke-tpu-accelerator"
388
+ ],
389
+ "test-accelerator",
390
+ )
391
+ self.assertEqual(
392
+ resource_flavor["spec"]["nodeLabels"][
393
+ "cloud.google.com/gke-tpu-topology"
394
+ ],
395
+ "2x2x1",
396
+ )
397
+
398
+ @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
399
+ @patch(
400
+ "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
401
+ )
402
+ def test_configure_generates_manifest_with_admission_checks_for_flex_single_slice(
403
+ self, mock_update_resources, mock_install
404
+ ):
405
+ """Test that __configure generates the correct manifest with admission checks."""
406
+ mock_install.return_value = 0
407
+ mock_update_resources.return_value = 0
408
+ kueue_config = KueueConfig(
409
+ system=self.mock_system_chars,
410
+ total_chips=8,
411
+ cpu_limit=100,
412
+ memory_limit="100Gi",
413
+ autoprovisioning_enabled=False,
414
+ num_slices=1,
415
+ flex=True,
416
+ )
417
+
418
+ rendered_manifest = self._trigger_installation(kueue_config)
419
+
420
+ self.assertNotIn("kind: Topology", rendered_manifest)
421
+ manifest_docs = list(yaml.safe_load_all(rendered_manifest))
422
+ cluster_queue = _first(
423
+ doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
424
+ )
425
+ self.assertEqual(
426
+ cluster_queue["spec"]["resourceGroups"][0]["flavors"][0]["name"],
427
+ "1xv5p-8",
428
+ )
429
+ self.assertEqual(cluster_queue["spec"]["admissionChecks"][0], "dws-prov")
430
+
431
+ @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
432
+ @patch(
433
+ "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
434
+ )
435
+ def test_configure_generates_correct_manifest_with_topology(
436
+ self, mock_update_resources, mock_install
437
+ ):
438
+ """Test that __configure generates correct manifest for GPUs."""
439
+ mock_install.return_value = 0
440
+ mock_update_resources.return_value = 0
441
+ kueue_config = KueueConfig(
442
+ system=self.mock_system_chars_gpu,
443
+ total_chips=16,
444
+ cpu_limit=100,
445
+ memory_limit="100Gi",
446
+ num_slices=2,
447
+ )
448
+
449
+ rendered_manifest = self._trigger_installation(kueue_config)
450
+
451
+ self.assertIn("kind: Topology", rendered_manifest)
452
+ manifest_docs = list(yaml.safe_load_all(rendered_manifest))
453
+ resource_flavor = _first(
454
+ doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
455
+ )
456
+ self.assertEqual(
457
+ resource_flavor["spec"]["nodeLabels"][
458
+ "cloud.google.com/gke-accelerator"
459
+ ],
460
+ "h100-mega-80gb-8",
461
+ )
462
+
463
+ @patch("xpk.core.kueue_manager.KueueManager._KueueManager__install")
464
+ @patch(
465
+ "xpk.core.kueue_manager.KueueManager._KueueManager__update_kueue_resources_if_necessary"
466
+ )
467
+ def test_configure_generates_correct_manifest_with_pathways(
468
+ self, mock_update_resources, mock_install
469
+ ):
470
+ """Test that __configure generates the correct manifest with pathways enabled."""
471
+ mock_install.return_value = 0
472
+ mock_update_resources.return_value = 0
473
+ kueue_config = KueueConfig(
474
+ system=self.mock_system_chars,
475
+ total_chips=8,
476
+ cpu_limit=100,
477
+ memory_limit="100Gi",
478
+ is_pathways_cluster=True,
479
+ num_slices=2,
480
+ )
481
+
482
+ rendered_manifest = self._trigger_installation(kueue_config)
483
+ manifest_docs = list(yaml.safe_load_all(rendered_manifest))
484
+
485
+ # Check for the new "cpu-user" ResourceFlavor
486
+ cpu_user_flavor = _first(
487
+ doc
488
+ for doc in manifest_docs
489
+ if doc["kind"] == "ResourceFlavor"
490
+ and doc["metadata"]["name"] == "cpu-user"
491
+ )
492
+ self.assertEqual(
493
+ cpu_user_flavor["spec"]["nodeLabels"]["cloud.google.com/gke-nodepool"],
494
+ "cpu-np",
495
+ )
496
+
497
+ # Check that the ClusterQueue has the new resource group for pathways
498
+ cluster_queue = _first(
499
+ doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
500
+ )
501
+ self.assertEqual(len(cluster_queue["spec"]["resourceGroups"]), 2)
502
+ pathways_rg = cluster_queue["spec"]["resourceGroups"][1]
503
+ self.assertEqual(pathways_rg["coveredResources"], ["cpu", "memory"])
504
+ self.assertEqual(pathways_rg["flavors"][0]["name"], "cpu-user")
505
+ self.assertEqual(
506
+ pathways_rg["flavors"][0]["resources"][0]["nominalQuota"], 480
507
+ )
508
+ self.assertEqual(
509
+ pathways_rg["flavors"][0]["resources"][1]["nominalQuota"], "2000G"
510
+ )
511
+
512
+ def _trigger_installation(self, kueue_config: KueueConfig) -> str:
513
+ """Calls Kueue installation and returns the rendered manifest."""
514
+ with (
515
+ patch.object(
516
+ self.kueue_manager, "_KueueManager__get_installed_kueue_version"
517
+ ) as mock_get_version,
518
+ patch.object(
519
+ self.kueue_manager, "_KueueManager__apply_manifest"
520
+ ) as mock_apply_manifest,
521
+ ):
522
+ mock_apply_manifest.return_value = 0
523
+ mock_get_version.return_value = (1, None)
524
+ self.kueue_manager.install_or_upgrade(kueue_config)
525
+
526
+ mock_apply_manifest.assert_called_once()
527
+ manifest = mock_apply_manifest.call_args[0][0]
528
+ assert isinstance(manifest, str)
529
+ return manifest
530
+
531
+
532
+ T = TypeVar("T")
533
+
534
+
535
+ def _first(generator: Generator[T, None, None]) -> T:
536
+ result = next(generator, None)
537
+ assert result is not None
538
+ return result
539
+
540
+
541
+ if __name__ == "__main__":
542
+ unittest.main()
xpk/core/monitoring.py CHANGED
@@ -40,7 +40,7 @@ def get_gke_dashboard(args, dashboard_filter) -> tuple[bool, str | None]:
40
40
  )
41
41
 
42
42
  return_code, return_value = run_command_for_value(
43
- command, 'GKE Dashboard List', args
43
+ command, 'GKE Dashboard List'
44
44
  )
45
45
 
46
46
  if return_code != 0:
xpk/core/nap.py CHANGED
@@ -27,7 +27,7 @@ from .capacity import (
27
27
  verify_reservation_exists,
28
28
  )
29
29
  from .commands import run_command_with_updates, run_commands
30
- from .gcloud_context import zone_to_region
30
+ from .gcloud_context import get_cluster_location
31
31
  from .nodepool import get_all_nodepools_programmatic
32
32
  from .resources import (
33
33
  CLUSTER_METADATA_CONFIGMAP,
@@ -98,13 +98,12 @@ def enable_autoprovisioning_on_cluster(
98
98
 
99
99
  command = (
100
100
  'gcloud container clusters update'
101
- f' {args.cluster} --project={args.project}'
102
- f' --region={zone_to_region(args.zone)} --enable-autoprovisioning'
101
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-autoprovisioning'
103
102
  ' --autoprovisioning-config-file'
104
103
  f' {autoprovisioning_config.config_filename}'
105
104
  )
106
105
  task = 'Update cluster with autoprovisioning enabled'
107
- return_code = run_command_with_updates(command, task, args)
106
+ return_code = run_command_with_updates(command, task)
108
107
  if return_code != 0:
109
108
  xpk_print(f'{task} request returned ERROR {return_code}')
110
109
  return autoprovisioning_config, return_code
@@ -112,11 +111,11 @@ def enable_autoprovisioning_on_cluster(
112
111
  command = (
113
112
  'gcloud container clusters update'
114
113
  f' {args.cluster} --project={args.project}'
115
- f' --region={zone_to_region(args.zone)}'
114
+ f' --location={get_cluster_location(args.project, args.cluster, args.zone)}'
116
115
  ' --autoscaling-profile=optimize-utilization'
117
116
  )
118
117
  task = 'Update cluster with autoscaling-profile'
119
- return_code = run_command_with_updates(command, task, args)
118
+ return_code = run_command_with_updates(command, task)
120
119
  if return_code != 0:
121
120
  xpk_print(f'{task} request returned ERROR {return_code}')
122
121
  return autoprovisioning_config, return_code
@@ -138,11 +137,8 @@ def enable_autoprovisioning_on_cluster(
138
137
  # Ignore node pools that are not created yet, and not of the accelerator type.
139
138
  continue
140
139
  commands.append(
141
- f'gcloud container node-pools update {node_pool_name}'
142
- f' --cluster {args.cluster}'
143
- f' --project={args.project}'
144
- f' --region={zone_to_region(args.zone)}'
145
- ' --enable-autoprovisioning'
140
+ f'gcloud container node-pools update {node_pool_name} --cluster'
141
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-autoprovisioning'
146
142
  ' --enable-autoscaling'
147
143
  )
148
144
  task_name = (
@@ -156,7 +152,6 @@ def enable_autoprovisioning_on_cluster(
156
152
  commands,
157
153
  'Update node pools with autoprovisioning support',
158
154
  task_names,
159
- dry_run=args.dry_run,
160
155
  )
161
156
  if max_return_code != 0:
162
157
  xpk_print(
@@ -272,7 +267,7 @@ def is_autoprovisioning_enabled(
272
267
  """
273
268
 
274
269
  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
275
- cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
270
+ cluster_config_map = get_cluster_configmap(resources_configmap_name)
276
271
 
277
272
  if cluster_config_map is None:
278
273
  xpk_print(
@@ -325,7 +320,7 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
325
320
  if capacity_type_str == CapacityType.UNKNOWN.name:
326
321
  # Use default settings from cluster creation.
327
322
  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
328
- cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
323
+ cluster_config_map = get_cluster_configmap(metadata_configmap_name)
329
324
 
330
325
  # Error out if the metadata config map doesn't exist, and is attempting to use
331
326
  # autoprovisioning.
@@ -369,7 +364,7 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
369
364
 
370
365
  def get_cluster_provisioner(args) -> str:
371
366
  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
372
- cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
367
+ cluster_config_map = get_cluster_configmap(metadata_configmap_name)
373
368
  cluster_provisioner = 'gcloud'
374
369
  if not cluster_config_map is None:
375
370
  provisioner = cluster_config_map.get('provisioner')