xpk 0.13.0-py3-none-any.whl → 0.14.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. integration/__init__.py +15 -0
  2. integration/docker_manager_test.py +102 -0
  3. integration/gcluster_a3mega_test.py +204 -0
  4. integration/gcluster_a3ultra_test.py +176 -0
  5. integration/gcluster_a4_test.py +176 -0
  6. integration/gcluster_test.py +107 -0
  7. xpk/commands/batch.py +9 -2
  8. xpk/commands/cluster.py +143 -117
  9. xpk/commands/cluster_gcluster.py +81 -14
  10. xpk/commands/cluster_gcluster_test.py +177 -0
  11. xpk/commands/cluster_test.py +92 -0
  12. xpk/commands/common.py +14 -26
  13. xpk/commands/info.py +11 -9
  14. xpk/commands/inspector.py +21 -10
  15. xpk/commands/job.py +25 -9
  16. xpk/commands/kind.py +39 -40
  17. xpk/commands/kjob_common.py +4 -4
  18. xpk/commands/run.py +9 -2
  19. xpk/commands/shell.py +13 -10
  20. xpk/commands/storage.py +21 -0
  21. xpk/commands/version.py +0 -4
  22. xpk/commands/workload.py +84 -29
  23. xpk/commands/workload_test.py +81 -0
  24. xpk/core/blueprint/blueprint_generator.py +4 -40
  25. xpk/core/blueprint/blueprint_test.py +0 -6
  26. xpk/core/blueprint/testing/__init__.py +15 -0
  27. xpk/core/capacity.py +6 -5
  28. xpk/core/cluster.py +91 -194
  29. xpk/core/cluster_private.py +6 -11
  30. xpk/core/commands.py +11 -18
  31. xpk/core/config.py +1 -1
  32. xpk/core/docker_image.py +3 -4
  33. xpk/core/gcloud_context.py +26 -2
  34. xpk/core/gcloud_context_test.py +96 -0
  35. xpk/core/gcluster_manager.py +0 -3
  36. xpk/core/jobset.py +4 -7
  37. xpk/core/kjob.py +14 -27
  38. xpk/core/kueue_manager.py +423 -0
  39. xpk/core/kueue_manager_test.py +574 -0
  40. xpk/core/monitoring.py +1 -1
  41. xpk/core/nap.py +10 -15
  42. xpk/core/network.py +17 -18
  43. xpk/core/nodepool.py +66 -77
  44. xpk/core/nodepool_test.py +198 -1
  45. xpk/core/pathways.py +5 -5
  46. xpk/core/ray.py +10 -14
  47. xpk/core/resources.py +6 -11
  48. xpk/core/scheduling.py +19 -1
  49. xpk/core/scheduling_test.py +31 -0
  50. xpk/core/system_characteristics.py +350 -232
  51. xpk/core/system_characteristics_test.py +73 -0
  52. xpk/core/vertex.py +1 -1
  53. xpk/core/workload.py +7 -8
  54. xpk/main.py +2 -4
  55. xpk/parser/cluster.py +7 -0
  56. xpk/parser/cluster_test.py +66 -0
  57. xpk/parser/common.py +11 -0
  58. xpk/parser/workload.py +62 -25
  59. xpk/parser/workload_test.py +82 -0
  60. xpk/templates/cluster_preheat.yaml.j2 +31 -0
  61. xpk/templates/filestore-pv.yaml +17 -0
  62. xpk/templates/filestore-pvc.yaml +11 -0
  63. xpk/templates/filestore-sc.yaml +10 -0
  64. xpk/templates/fuse-pv.yaml +17 -0
  65. xpk/templates/fuse-pvc.yaml +13 -0
  66. xpk/templates/kueue_config.yaml.j2 +95 -0
  67. xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
  68. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
  69. xpk/templates/mtc-cpc.yaml +15 -0
  70. xpk/templates/volume_bundle.yaml +7 -0
  71. xpk/utils/feature_flags.py +28 -0
  72. xpk/utils/kueue.py +20 -0
  73. xpk/utils/templates.py +15 -0
  74. xpk/utils/topology.py +46 -0
  75. xpk/utils/topology_test.py +63 -0
  76. xpk/utils/validation.py +79 -55
  77. xpk/utils/validation_test.py +37 -0
  78. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
  79. xpk-0.14.1.dist-info/RECORD +133 -0
  80. xpk-0.14.1.dist-info/top_level.txt +2 -0
  81. xpk/core/kueue.py +0 -561
  82. xpk-0.13.0.dist-info/RECORD +0 -101
  83. xpk-0.13.0.dist-info/top_level.txt +0 -1
  84. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
  85. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
  86. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
xpk/core/kueue_manager.py (new file)
@@ -0,0 +1,423 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import textwrap
+from dataclasses import dataclass
+from typing import Optional, List, Dict, Any
+import json
+from jinja2 import Environment, FileSystemLoader
+from ..utils.execution_context import is_dry_run
+from ..utils.kueue import is_queued_cluster
+
+from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
+from .scheduling import (
+    create_accelerator_label,
+    create_machine_label,
+)
+from .system_characteristics import (
+    AcceleratorTypeToAcceleratorCharacteristics,
+    SystemCharacteristics,
+)
+from ..core.commands import (
+    run_command_for_value,
+    run_command_with_updates,
+    run_command_with_updates_retry,
+)
+from ..utils.file import write_tmp_file
+from ..utils.console import xpk_print, xpk_exit
+from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
+
+WAIT_FOR_KUEUE_TIMEOUT = "10m"
+CLUSTER_QUEUE_NAME = "cluster-queue"
+LOCAL_QUEUE_NAME = "multislice-queue"
+SUB_SLICE_TOPOLOGY_NAME = "sub-slice-topology"
+KUEUE_CONFIG_JINJA_FILE = "kueue_config.yaml.j2"
+KUEUE_GKE_DEFAULT_TOPOLOGY_JINJA_FILE = "kueue_gke_default_topology.yaml.j2"
+KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
+KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2"
+MEMORY_SIZE_PER_VM = 1.2
+MIN_MEMORY_LIMIT_SIZE = 4096
+KUEUE_VERSION = "v0.14.1"
+
+
+@dataclass
+class KueueConfig:
+  system: SystemCharacteristics
+  total_chips: int
+  cpu_limit: int
+  memory_limit: str
+  configure_sub_slicing: bool
+  is_pathways_cluster: bool = False
+  autoprovisioning_enabled: bool = False
+  flex: bool = False
+  num_slices: int = 1
+
+
+@dataclass
+class _NameAndYaml:
+  name: str
+  yaml: str
+
+
+class KueueManager:
+  """Manages the installation and configuration of Kueue on an XPK cluster."""
+
+  def __init__(
+      self,
+      kueue_version: str = KUEUE_VERSION,
+      template_path=TEMPLATE_PATH,
+  ):
+    self.kueue_version = kueue_version
+
+    self.template_env = Environment(
+        loader=FileSystemLoader(
+            searchpath=get_templates_absolute_path(template_path)
+        )
+    )
+
+  def install_or_upgrade(
+      self,
+      kueue_config: KueueConfig,
+      tolerations: Optional[List[Dict[str, Any]]] = None,
+  ) -> int:
+    """
+    Ensures the correct version of Kueue is installed. Upgrades if the installed
+    version is older or non-existent.
+
+    Args:
+      tolerations: An optional list of tolerations to apply to the kueue-controller-manager.
+    """
+    return_code, installed_version = self.get_installed_kueue_version()
+
+    if return_code == 0:
+      if installed_version and installed_version > self.kueue_version:
+        xpk_print(
+            f"Cluster has a newer Kueue version, {installed_version}. Skipping"
+            " installation."
+        )
+        return 0
+      else:
+        xpk_print(f"Upgrading Kueue to version {self.kueue_version}...")
+    else:
+      xpk_print(f"Installing Kueue version {self.kueue_version}...")
+
+    install_return_code = self.__install(tolerations)
+    if install_return_code != 0:
+      return install_return_code
+
+    return self.__configure(kueue_config)
+
+  def get_installed_kueue_version(self) -> tuple[int, str | None]:
+    command = (
+        "kubectl get deployment kueue-controller-manager -n kueue-system -o"
+        " jsonpath='{.spec.template.spec.containers[0].image}'"
+    )
+    task = "Get kueue version on server"
+    return_code, val = run_command_for_value(
+        command,
+        task,
+        dry_run_return_val="""
+        v0.14.1""",
+    )
+    if return_code != 0:
+      return return_code, None
+    version_tag = val.split(":")
+    if len(version_tag) == 1:
+      return 1, None
+    return return_code, version_tag[-1]
+
+  def __install(
+      self,
+      tolerations: Optional[List[Dict[str, Any]]] = None,
+  ) -> int:
+    """
+    Installs Kueue from the official manifest and then applies any necessary patches.
+
+    Args:
+      tolerations: An optional list of tolerations to apply to the kueue-controller-manager.
+    """
+    return_code = self.__install_kueue_crs()
+    if return_code != 0:
+      return return_code
+
+    if tolerations:
+      return_code = self.__patch_tolerations(tolerations)
+      if return_code != 0:
+        return return_code
+
+    return self.__wait_for_kueue_available()
+
+  def __install_kueue_crs(self) -> int:
+    manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/{self.kueue_version}/manifests.yaml"
+    install_command = (
+        f"kubectl apply --server-side --force-conflicts -f {manifest_url}"
+    )
+    task = "Installing Kueue Custom Resources"
+    return_code = run_command_with_updates_retry(
+        install_command, "Install Kueue"
+    )
+    if return_code != 0:
+      xpk_print(f"{task} returned ERROR {return_code}")
+    return return_code
+
+  def __patch_tolerations(self, tolerations: List[Dict[str, Any]]) -> int:
+    patch = {"spec": {"template": {"spec": {"tolerations": tolerations}}}}
+    patch_str = json.dumps(patch)
+    patch_command = (
+        "kubectl patch deployment kueue-controller-manager -n kueue-system"
+        f" --type='strategic' --patch='{patch_str}'"
+    )
+    task = "Patch Kueue Tolerations"
+    return_code = run_command_with_updates_retry(
+        patch_command, "Patch Kueue Tolerations"
+    )
+    if return_code != 0:
+      xpk_print(f"{task} returned ERROR {return_code}")
+    return return_code
+
+  def __wait_for_kueue_available(self) -> int:
+    """Wait for Kueue to be fully available.
+
+    Args:
+      args: user provided arguments for running the command.
+
+    Returns:
+      0 if successful and 1 otherwise.
+    """
+    command = (
+        "kubectl wait deploy/kueue-controller-manager -nkueue-system"
+        f" --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}"
+    )
+    task = "Wait for Kueue to be available"
+    return_code = run_command_with_updates(command, task)
+    if return_code != 0:
+      xpk_print(f"{task} returned ERROR {return_code}")
+    return return_code
+
+  def __configure(
+      self,
+      kueue_config: KueueConfig,
+  ) -> int:
+    """
+    Configures Kueue with opinionated defaults for XPK.
+
+    Args:
+      kueue_config: The KueueConfig object containing all configuration parameters.
+    Returns:
+      0 if successful and 1 otherwise.
+    """
+    template = self.template_env.get_template(KUEUE_CONFIG_JINJA_FILE)
+
+    topology_name_and_yaml = self.__get_topology_name_and_yaml(
+        kueue_config.system, kueue_config.configure_sub_slicing
+    )
+    topology_name = (
+        topology_name_and_yaml.name if topology_name_and_yaml else None
+    )
+
+    # The manager builds the context internally based on its opinionated logic
+    context = self.__build_template_context(
+        system=kueue_config.system,
+        total_chips=kueue_config.total_chips,
+        is_pathways=kueue_config.is_pathways_cluster,
+        autoprovisioning=kueue_config.autoprovisioning_enabled,
+        flex=kueue_config.flex,
+        num_slices=kueue_config.num_slices,
+        cpu_limit=kueue_config.cpu_limit,
+        memory_limit=kueue_config.memory_limit,
+        topology_name=topology_name,
+    )
+
+    config_yaml = template.render(context)
+    yamls = [config_yaml]
+
+    if topology_name_and_yaml:
+      yamls.append(topology_name_and_yaml.yaml)
+
+    rendered_manifest = "\n---\n".join(yamls)
+    return_code = self.__apply_manifest(rendered_manifest)
+    if return_code != 0:
+      return return_code
+
+    return self.__update_kueue_resources_if_necessary()
+
+  def __build_template_context(
+      self,
+      system: SystemCharacteristics,
+      total_chips: int,
+      is_pathways: bool,
+      autoprovisioning: bool,
+      flex: bool,
+      num_slices: int,
+      cpu_limit: int,
+      memory_limit: str,
+      topology_name: str | None,
+  ) -> Dict[str, Any]:
+    """Prepares the context for the Jinja2 template."""
+    # Main accelerator flavor
+    device_type_str = system.device_type.replace("_", "-")
+    main_flavor_name = f"{num_slices}x{device_type_str}"
+
+    node_labels_dict = {}
+    accelerator_label = create_accelerator_label(
+        system.accelerator_type, system
+    )
+    if accelerator_label:
+      key, value = accelerator_label.split(":", 1)
+      node_labels_dict[key] = value.strip()
+
+    machine_label = create_machine_label(
+        system.accelerator_type, system, autoprovisioning
+    )
+    if machine_label:
+      key, value = machine_label.split(":", 1)
+      node_labels_dict[key] = value.strip()
+
+    topology_label = f"topologyName: {topology_name}" if topology_name else ""
+
+    flavors = [{
+        "name": main_flavor_name,
+        "nodeLabels": node_labels_dict,
+        "topologyLabel": topology_label,
+    }]
+
+    managed_resource = AcceleratorTypeToAcceleratorCharacteristics[
+        system.accelerator_type
+    ].resource_type
+
+    covered_resources = [managed_resource]
+    resources = [{"name": managed_resource, "nominalQuota": total_chips}]
+
+    if cpu_limit:
+      covered_resources.append("cpu")
+      resources.append({"name": "cpu", "nominalQuota": cpu_limit})
+    if memory_limit:
+      covered_resources.append("memory")
+      resources.append({"name": "memory", "nominalQuota": memory_limit})
+
+    resource_groups = [{
+        "coveredResources": covered_resources,
+        "flavors": [{"name": main_flavor_name, "resources": resources}],
+    }]
+
+    # Add Pathway-specific resources if needed
+    if is_pathways:
+      flavors.append({
+          "name": "cpu-user",
+          "nodeLabels": {"cloud.google.com/gke-nodepool": "cpu-np"},
+      })
+      resource_groups.append({
+          "coveredResources": ["cpu", "memory"],
+          "flavors": [{
+              "name": "cpu-user",
+              "resources": [
+                  {"name": "cpu", "nominalQuota": 480},
+                  {"name": "memory", "nominalQuota": "2000G"},
+              ],
+          }],
+      })
+
+    if flex and is_queued_cluster(num_slices):
+      admission_checks = textwrap.dedent("""
+          admissionChecks:
+          - dws-prov
+      """)
+    else:
+      admission_checks = ""
+
+    return {
+        "flavors": flavors,
+        "resource_groups": resource_groups,
+        "autoprovisioning_enabled": autoprovisioning,
+        "managed_resource": managed_resource,
+        "cluster_queue_name": CLUSTER_QUEUE_NAME,
+        "local_queue_name": LOCAL_QUEUE_NAME,
+        "admission_checks": admission_checks,
+    }
+
+  def __get_topology_name_and_yaml(
+      self, system: SystemCharacteristics, configure_sub_slicing: bool
+  ) -> _NameAndYaml | None:
+    if system.device_type in [
+        H100_MEGA_DEVICE_TYPE,
+        H200_DEVICE_TYPE,
+        B200_DEVICE_TYPE,
+    ]:
+      return _NameAndYaml(
+          name="gke-default",
+          yaml=self.template_env.get_template(
+              KUEUE_GKE_DEFAULT_TOPOLOGY_JINJA_FILE
+          ).render(),
+      )
+    elif configure_sub_slicing:
+      return _NameAndYaml(
+          name=SUB_SLICE_TOPOLOGY_NAME,
+          yaml=self.template_env.get_template(
+              KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE
+          ).render({
+              "sub_slice_topology_name": SUB_SLICE_TOPOLOGY_NAME,
+          }),
+      )
+    else:
+      return None
+
+  def __apply_manifest(self, manifest: str) -> int:
+    task = "Applying Kueue Custom Resources"
+    if is_dry_run():
+      xpk_print(f"Applying following Kueue resources:{manifest}")
+    tmp_file = write_tmp_file(manifest)
+    command = f"kubectl apply -f {tmp_file}"
+    return run_command_with_updates(command, task)
+
+  def __update_kueue_resources_if_necessary(self) -> int:
+    """Patch memory size limit if necessary."""
+    # Get total number of nodes
+    cmd_total_node_num = "kubectl get node --no-headers | wc -l"
+    return_code, out = run_command_for_value(
+        cmd_total_node_num, "Count total nodes"
+    )
+    if return_code != 0:
+      xpk_exit(1)
+    # 1.2MiB per VM or 4GiB (whichever is greater).
+    new_memory_limit = (
+        f"{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi"
+    )
+    patch = {
+        "spec": {
+            "template": {
+                "spec": {
+                    "containers": [{
+                        "name": "manager",
+                        "resources": {"limits": {"memory": new_memory_limit}},
+                    }]
+                }
+            }
+        }
+    }
+    patch_str = json.dumps(patch)
+    patch_command = (
+        "kubectl patch deployment kueue-controller-manager -n kueue-system"
+        f" --type='strategic' --patch='{patch_str}'"
+    )
+    task = "Updating Kueue Controller Manager resources"
+    return_code = run_command_with_updates_retry(
+        patch_command,
+        task,
+    )
+    if return_code != 0:
+      xpk_print(f"{task} returned ERROR {return_code}")
+    return return_code
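
For orientation, here is a minimal caller sketch of the new module (this code is not part of the package): `KueueManager` replaces the free functions of the deleted xpk/core/kueue.py with a single `install_or_upgrade` entry point. It is based only on the signatures visible in the diff above; the `system` value and the toleration are placeholders, since resolving a `SystemCharacteristics` happens elsewhere in xpk and is not part of this file.

# Hypothetical usage sketch; `system` must be a SystemCharacteristics
# instance resolved elsewhere (its lookup is not part of this diff).
from xpk.core.kueue_manager import KueueConfig, KueueManager

config = KueueConfig(
    system=system,  # placeholder: a resolved SystemCharacteristics
    total_chips=256,  # nominalQuota for the accelerator resource type
    cpu_limit=480,  # truthy values add a "cpu" quota to the resource group
    memory_limit="2000G",  # truthy values add a "memory" quota
    configure_sub_slicing=False,  # True renders the sub-slicing topology template
    num_slices=2,  # flavor name: f"{num_slices}x{device_type}" (underscores -> hyphens)
)

manager = KueueManager()  # defaults to KUEUE_VERSION = "v0.14.1"
return_code = manager.install_or_upgrade(
    config,
    tolerations=[  # placeholder toleration patched onto kueue-controller-manager
        {"key": "example/taint", "operator": "Exists", "effect": "NoSchedule"}
    ],
)
if return_code != 0:
  raise SystemExit(return_code)

Two behavioral notes, both visible in the code above: the version check in install_or_upgrade compares image tags as plain strings, so ordering is lexicographic rather than semantic; and the final patch sizes the controller's memory limit as max(ceil(nodes * 1.2), 4096) MiB, e.g. 6000Mi for a 5,000-node cluster, with the 4096Mi floor applying to anything up to 3,413 nodes.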