xpk-0.12.0-py3-none-any.whl → xpk-0.14.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. xpk/commands/batch.py +17 -10
  2. xpk/commands/cluster.py +137 -123
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +13 -27
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +22 -11
  8. xpk/commands/job.py +53 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +26 -2
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +58 -30
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +96 -195
  20. xpk/core/cluster_private.py +9 -12
  21. xpk/core/commands.py +21 -25
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +17 -9
  24. xpk/core/docker_resources.py +9 -4
  25. xpk/core/gcloud_context.py +26 -2
  26. xpk/core/gcloud_context_test.py +96 -0
  27. xpk/core/gcluster_manager.py +0 -3
  28. xpk/core/jobset.py +5 -8
  29. xpk/core/kjob.py +19 -29
  30. xpk/core/kueue_manager.py +383 -0
  31. xpk/core/kueue_manager_test.py +542 -0
  32. xpk/core/monitoring.py +1 -1
  33. xpk/core/nap.py +11 -16
  34. xpk/core/network.py +18 -19
  35. xpk/core/nodepool.py +65 -71
  36. xpk/core/nodepool_test.py +198 -1
  37. xpk/core/pathways.py +9 -5
  38. xpk/core/ray.py +11 -15
  39. xpk/core/resources.py +15 -10
  40. xpk/core/scheduling.py +23 -1
  41. xpk/core/scheduling_test.py +31 -0
  42. xpk/core/system_characteristics.py +335 -229
  43. xpk/core/vertex.py +1 -1
  44. xpk/core/workload.py +7 -8
  45. xpk/main.py +3 -2
  46. xpk/parser/cluster.py +50 -0
  47. xpk/parser/cluster_test.py +66 -0
  48. xpk/parser/common.py +11 -0
  49. xpk/parser/workload.py +62 -25
  50. xpk/parser/workload_test.py +82 -0
  51. xpk/utils/execution_context.py +28 -0
  52. xpk/utils/feature_flags.py +28 -0
  53. xpk/utils/file.py +25 -10
  54. xpk/utils/kueue.py +20 -0
  55. xpk/utils/network.py +4 -0
  56. xpk/utils/templates.py +2 -0
  57. xpk/utils/topology.py +37 -0
  58. xpk/utils/topology_test.py +43 -0
  59. xpk/utils/validation.py +79 -55
  60. xpk/utils/validation_test.py +37 -0
  61. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  62. xpk-0.14.0.dist-info/RECORD +112 -0
  63. xpk/core/kueue.py +0 -545
  64. xpk-0.12.0.dist-info/RECORD +0 -100
  65. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/kueue_manager.py (new file, entry 30 above)
@@ -0,0 +1,383 @@
"""
Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import math
import textwrap
from dataclasses import dataclass
from typing import Optional, List, Dict, Any
import json
from jinja2 import Environment, FileSystemLoader
from ..utils.execution_context import is_dry_run
from ..utils.kueue import is_queued_cluster

from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
from .scheduling import (
    create_accelerator_label,
    create_machine_label,
)
from .system_characteristics import (
    AcceleratorTypeToAcceleratorCharacteristics,
    SystemCharacteristics,
)
from ..core.commands import (
    run_command_for_value,
    run_command_with_updates,
    run_command_with_updates_retry,
)
from ..utils.file import write_tmp_file
from ..utils.console import xpk_print, xpk_exit
from ..utils.templates import TEMPLATE_PATH

WAIT_FOR_KUEUE_TIMEOUT = "10m"
CLUSTER_QUEUE_NAME = "cluster-queue"
LOCAL_QUEUE_NAME = "multislice-queue"
KUEUE_CONFIG_JINJA_FILE = "kueue_config.yaml.j2"
KUEUE_TOPOLOGY_JINJA_FILE = "kueue_topology.yaml.j2"
KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
MEMORY_SIZE_PER_VM = 1.2
MIN_MEMORY_LIMIT_SIZE = 4096
KUEUE_VERSION = "v0.14.1"


@dataclass
class KueueConfig:
  system: SystemCharacteristics
  total_chips: int
  cpu_limit: int
  memory_limit: str
  is_pathways_cluster: bool = False
  autoprovisioning_enabled: bool = False
  flex: bool = False
  num_slices: int = 1


class KueueManager:
  """Manages the installation and configuration of Kueue on an XPK cluster."""

  def __init__(
      self,
      kueue_version: str = KUEUE_VERSION,
      template_path=TEMPLATE_PATH,
  ):
    self.kueue_version = kueue_version
    self.template_env = Environment(loader=FileSystemLoader(template_path))

  def install_or_upgrade(
      self,
      kueue_config: KueueConfig,
      tolerations: Optional[List[Dict[str, Any]]] = None,
  ) -> int:
    """
    Ensures the correct version of Kueue is installed. Upgrades if the
    installed version is older or non-existent.

    Args:
      tolerations: An optional list of tolerations to apply to the
        kueue-controller-manager.
    """
    return_code, installed_version = self.__get_installed_kueue_version()

    if return_code == 0:
      if installed_version and installed_version > self.kueue_version:
        xpk_print(
            f"Cluster has a newer Kueue version, {installed_version}. Skipping"
            " installation."
        )
        return 0
      else:
        xpk_print(f"Upgrading Kueue to version {self.kueue_version}...")
    else:
      xpk_print(f"Installing Kueue version {self.kueue_version}...")

    install_return_code = self.__install(tolerations)
    if install_return_code != 0:
      return install_return_code

    return self.__configure(kueue_config)

  def __get_installed_kueue_version(self) -> tuple[int, str | None]:
    command = (
        "kubectl get deployment kueue-controller-manager -n kueue-system -o"
        " jsonpath='{.spec.template.spec.containers[0].image}'"
    )
    task = "Get kueue version on server"
    return_code, val = run_command_for_value(
        command,
        task,
        dry_run_return_val="""
        v0.14.1""",
    )
    if return_code != 0:
      return return_code, None
    version_tag = val.split(":")
    if len(version_tag) == 1:
      return 1, None
    return return_code, version_tag[-1]

  def __install(
      self,
      tolerations: Optional[List[Dict[str, Any]]] = None,
  ) -> int:
    """
    Installs Kueue from the official manifest and then applies any
    necessary patches.

    Args:
      tolerations: An optional list of tolerations to apply to the
        kueue-controller-manager.
    """
    return_code = self.__install_kueue_crs()
    if return_code != 0:
      return return_code

    if tolerations:
      return_code = self.__patch_tolerations(tolerations)
      if return_code != 0:
        return return_code

    return self.__wait_for_kueue_available()

  def __install_kueue_crs(self) -> int:
    manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/{self.kueue_version}/manifests.yaml"
    install_command = (
        f"kubectl apply --server-side --force-conflicts -f {manifest_url}"
    )
    task = "Installing Kueue Custom Resources"
    return_code = run_command_with_updates_retry(
        install_command, "Install Kueue"
    )
    if return_code != 0:
      xpk_print(f"{task} returned ERROR {return_code}")
    return return_code

  def __patch_tolerations(self, tolerations: List[Dict[str, Any]]) -> int:
    patch = {"spec": {"template": {"spec": {"tolerations": tolerations}}}}
    patch_str = json.dumps(patch)
    patch_command = (
        "kubectl patch deployment kueue-controller-manager -n kueue-system"
        f" --type='strategic' --patch='{patch_str}'"
    )
    task = "Patch Kueue Tolerations"
    return_code = run_command_with_updates_retry(
        patch_command, "Patch Kueue Tolerations"
    )
    if return_code != 0:
      xpk_print(f"{task} returned ERROR {return_code}")
    return return_code

  def __wait_for_kueue_available(self) -> int:
    """Wait for Kueue to be fully available.

    Args:
      args: user provided arguments for running the command.

    Returns:
      0 if successful and 1 otherwise.
    """
    command = (
        "kubectl wait deploy/kueue-controller-manager -nkueue-system"
        f" --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}"
    )
    task = "Wait for Kueue to be available"
    return_code = run_command_with_updates(command, task)
    if return_code != 0:
      xpk_print(f"{task} returned ERROR {return_code}")
    return return_code

  def __configure(
      self,
      kueue_config: KueueConfig,
  ) -> int:
    """
    Configures Kueue with opinionated defaults for XPK.

    Args:
      kueue_config: The KueueConfig object containing all configuration
        parameters.

    Returns:
      0 if successful and 1 otherwise.
    """
    template = self.template_env.get_template(KUEUE_CONFIG_JINJA_FILE)

    # The manager builds the context internally based on its opinionated logic
    context = self.__build_template_context(
        system=kueue_config.system,
        total_chips=kueue_config.total_chips,
        is_pathways=kueue_config.is_pathways_cluster,
        autoprovisioning=kueue_config.autoprovisioning_enabled,
        flex=kueue_config.flex,
        num_slices=kueue_config.num_slices,
        cpu_limit=kueue_config.cpu_limit,
        memory_limit=kueue_config.memory_limit,
    )

    rendered_manifest = template.render(context)

    if kueue_config.system.device_type in [
        H100_MEGA_DEVICE_TYPE,
        H200_DEVICE_TYPE,
        B200_DEVICE_TYPE,
    ]:
      topology_yaml = self.template_env.get_template(KUEUE_TOPOLOGY_JINJA_FILE)
      rendered_manifest = topology_yaml.render() + rendered_manifest

    return_code = self.__apply_manifest(rendered_manifest)
    if return_code != 0:
      return return_code

    return self.__update_kueue_resources_if_necessary()

  def __build_template_context(
      self,
      system: SystemCharacteristics,
      total_chips: int,
      is_pathways: bool,
      autoprovisioning: bool,
      flex: bool,
      num_slices: int,
      cpu_limit: int,
      memory_limit: str,
  ) -> Dict[str, Any]:
    """Prepares the context for the Jinja2 template."""
    # Main accelerator flavor
    device_type_str = system.device_type.replace("_", "-")
    main_flavor_name = f"{num_slices}x{device_type_str}"

    node_labels_dict = {}
    accelerator_label = create_accelerator_label(
        system.accelerator_type, system
    )
    if accelerator_label:
      key, value = accelerator_label.split(":", 1)
      node_labels_dict[key] = value.strip()

    machine_label = create_machine_label(
        system.accelerator_type, system, autoprovisioning
    )
    if machine_label:
      key, value = machine_label.split(":", 1)
      node_labels_dict[key] = value.strip()

    topology_label = ""
    if system.device_type in [
        H100_MEGA_DEVICE_TYPE,
        H200_DEVICE_TYPE,
        B200_DEVICE_TYPE,
    ]:
      topology_label = 'topologyName: "gke-default"'

    flavors = [{
        "name": main_flavor_name,
        "nodeLabels": node_labels_dict,
        "topologyLabel": topology_label,
    }]

    managed_resource = AcceleratorTypeToAcceleratorCharacteristics[
        system.accelerator_type
    ].resource_type

    covered_resources = [managed_resource]
    resources = [{"name": managed_resource, "nominalQuota": total_chips}]

    if cpu_limit:
      covered_resources.append("cpu")
      resources.append({"name": "cpu", "nominalQuota": cpu_limit})
    if memory_limit:
      covered_resources.append("memory")
      resources.append({"name": "memory", "nominalQuota": memory_limit})

    resource_groups = [{
        "coveredResources": covered_resources,
        "flavors": [{"name": main_flavor_name, "resources": resources}],
    }]

    # Add Pathway-specific resources if needed
    if is_pathways:
      flavors.append({
          "name": "cpu-user",
          "nodeLabels": {"cloud.google.com/gke-nodepool": "cpu-np"},
      })
      resource_groups.append({
          "coveredResources": ["cpu", "memory"],
          "flavors": [{
              "name": "cpu-user",
              "resources": [
                  {"name": "cpu", "nominalQuota": 480},
                  {"name": "memory", "nominalQuota": "2000G"},
              ],
          }],
      })

    if flex and is_queued_cluster(num_slices):
      admission_checks = textwrap.dedent("""
          admissionChecks:
          - dws-prov
      """)
    else:
      admission_checks = ""

    return {
        "flavors": flavors,
        "resource_groups": resource_groups,
        "autoprovisioning_enabled": autoprovisioning,
        "managed_resource": managed_resource,
        "cluster_queue_name": CLUSTER_QUEUE_NAME,
        "local_queue_name": LOCAL_QUEUE_NAME,
        "admission_checks": admission_checks,
    }

  def __apply_manifest(self, manifest: str) -> int:
    task = "Applying Kueue Custom Resources"
    if is_dry_run():
      xpk_print(f"Applying following Kueue resources:{manifest}")
    tmp_file = write_tmp_file(manifest)
    command = f"kubectl apply -f {tmp_file}"
    return run_command_with_updates(command, task)

  def __update_kueue_resources_if_necessary(self) -> int:
    """Patch memory size limit if necessary."""
    # Get total number of nodes
    cmd_total_node_num = "kubectl get node --no-headers | wc -l"
    return_code, out = run_command_for_value(
        cmd_total_node_num, "Count total nodes"
    )
    if return_code != 0:
      xpk_exit(1)
    # 1.2MiB per VM or 4GiB (whichever is greater).
    new_memory_limit = (
        f"{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi"
    )
    patch = {
        "spec": {
            "template": {
                "spec": {
                    "containers": [{
                        "name": "manager",
                        "resources": {"limits": {"memory": new_memory_limit}},
                    }]
                }
            }
        }
    }
    patch_str = json.dumps(patch)
    patch_command = (
        "kubectl patch deployment kueue-controller-manager -n kueue-system"
        f" --type='strategic' --patch='{patch_str}'"
    )
    task = "Updating Kueue Controller Manager resources"
    return_code = run_command_with_updates_retry(
        patch_command,
        task,
    )
    if return_code != 0:
      xpk_print(f"{task} returned ERROR {return_code}")
    return return_code
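
The new KueueManager module above (file 30 in the list) replaces the removed xpk/core/kueue.py (file 63) and concentrates the Kueue lifecycle in one class: detect the installed version from the controller image tag, apply the upstream release manifest server-side, optionally patch tolerations onto kueue-controller-manager, render the ClusterQueue/LocalQueue configuration from a Jinja2 template, and scale the controller's memory limit with cluster size. For that last step, a 5,000-node cluster yields max(ceil(5000 * 1.2), 4096) = 6000, so the manager container's memory limit is patched to "6000Mi"; anything up to 3,413 nodes stays at the 4096Mi floor.

Below is a minimal sketch of how a caller might drive the class. It is illustrative, not code from the wheel: the UserFacingNameToSystemCharacteristics lookup table is assumed from xpk's system_characteristics module, and the device type, quotas, and toleration are placeholder values.

    # Hypothetical driver, not part of the xpk release; it exercises only the
    # surface shown in the diff (KueueConfig, KueueManager.install_or_upgrade).
    from xpk.core.kueue_manager import KueueConfig, KueueManager
    from xpk.core.system_characteristics import (
        UserFacingNameToSystemCharacteristics,  # assumed lookup-table name
    )

    # Illustrative values; real callers derive these from cluster arguments.
    system = UserFacingNameToSystemCharacteristics["v5p-8"]
    config = KueueConfig(
        system=system,
        total_chips=8,  # nominalQuota for the accelerator resource flavor
        cpu_limit=100,  # covered "cpu" quota in the ClusterQueue
        memory_limit="100Gi",  # covered "memory" quota in the ClusterQueue
        num_slices=1,
    )

    manager = KueueManager()  # defaults to KUEUE_VERSION = "v0.14.1"
    return_code = manager.install_or_upgrade(
        config,
        tolerations=[{  # optional; strategic-merge patched onto the deployment
            "key": "google.com/tpu",  # placeholder taint key
            "operator": "Exists",
            "effect": "NoSchedule",
        }],
    )
    if return_code != 0:
      raise SystemExit(return_code)

Like the run_command_* helpers it wraps, install_or_upgrade reports failure through shell-style return codes rather than exceptions, so callers propagate the code (here via SystemExit) instead of catching anything.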