xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. integration/README.md +19 -0
  2. integration/gcluster_a3mega_test.py +11 -0
  3. integration/gcluster_a3ultra_test.py +11 -0
  4. integration/gcluster_a4_test.py +11 -0
  5. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  6. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  7. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  8. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  9. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  10. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  11. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  12. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  13. xpk/blueprints/a4/storage_crd.yaml +52 -0
  14. xpk/commands/cluster.py +89 -32
  15. xpk/commands/cluster_gcluster.py +25 -5
  16. xpk/commands/cluster_gcluster_test.py +16 -3
  17. xpk/commands/cluster_test.py +353 -7
  18. xpk/commands/config.py +3 -5
  19. xpk/commands/inspector.py +5 -3
  20. xpk/commands/kind.py +3 -1
  21. xpk/commands/managed_ml_diagnostics.py +249 -0
  22. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  23. xpk/commands/storage.py +8 -10
  24. xpk/commands/workload.py +143 -142
  25. xpk/commands/workload_test.py +160 -118
  26. xpk/core/blueprint/blueprint_generator.py +73 -33
  27. xpk/core/blueprint/blueprint_test.py +9 -0
  28. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  29. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  30. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  31. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  32. xpk/core/capacity.py +48 -8
  33. xpk/core/capacity_test.py +32 -1
  34. xpk/core/cluster.py +55 -104
  35. xpk/core/cluster_test.py +170 -0
  36. xpk/core/commands.py +4 -10
  37. xpk/core/config.py +88 -7
  38. xpk/core/config_test.py +67 -11
  39. xpk/core/docker_container.py +3 -1
  40. xpk/core/docker_image.py +10 -6
  41. xpk/core/docker_resources.py +1 -10
  42. xpk/core/gcloud_context.py +18 -12
  43. xpk/core/gcloud_context_test.py +111 -1
  44. xpk/core/kjob.py +17 -19
  45. xpk/core/kueue_manager.py +205 -51
  46. xpk/core/kueue_manager_test.py +158 -4
  47. xpk/core/nap.py +13 -14
  48. xpk/core/nodepool.py +37 -43
  49. xpk/core/nodepool_test.py +42 -19
  50. xpk/core/pathways.py +23 -0
  51. xpk/core/pathways_test.py +57 -0
  52. xpk/core/resources.py +84 -27
  53. xpk/core/scheduling.py +144 -133
  54. xpk/core/scheduling_test.py +298 -6
  55. xpk/core/system_characteristics.py +256 -19
  56. xpk/core/system_characteristics_test.py +128 -5
  57. xpk/core/telemetry.py +263 -0
  58. xpk/core/telemetry_test.py +211 -0
  59. xpk/core/vertex.py +4 -3
  60. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  61. xpk/main.py +33 -13
  62. xpk/parser/cluster.py +40 -67
  63. xpk/parser/cluster_test.py +83 -3
  64. xpk/parser/common.py +84 -0
  65. xpk/parser/storage.py +10 -0
  66. xpk/parser/storage_test.py +47 -0
  67. xpk/parser/workload.py +14 -29
  68. xpk/parser/workload_test.py +3 -49
  69. xpk/telemetry_uploader.py +29 -0
  70. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  71. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  72. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  73. xpk/utils/console.py +41 -10
  74. xpk/utils/console_test.py +106 -0
  75. xpk/utils/feature_flags.py +10 -1
  76. xpk/utils/file.py +4 -1
  77. xpk/utils/topology.py +4 -0
  78. xpk/utils/user_agent.py +35 -0
  79. xpk/utils/user_agent_test.py +44 -0
  80. xpk/utils/user_input.py +48 -0
  81. xpk/utils/user_input_test.py +92 -0
  82. xpk/utils/validation.py +2 -13
  83. xpk/utils/versions.py +31 -0
  84. xpk-0.16.0.dist-info/METADATA +127 -0
  85. xpk-0.16.0.dist-info/RECORD +168 -0
  86. xpk-0.14.4.dist-info/METADATA +0 -1645
  87. xpk-0.14.4.dist-info/RECORD +0 -139
  88. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  89. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  90. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  91. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/core/kueue_manager.py CHANGED
@@ -20,17 +20,17 @@ from dataclasses import dataclass
 from typing import Optional, List, Dict, Any
 import json
 from jinja2 import Environment, FileSystemLoader
-from ..utils.execution_context import is_dry_run
-from ..utils.kueue import is_queued_cluster
 
-from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
-from .scheduling import (
-    create_accelerator_label,
-    create_machine_label,
-)
+from ..utils.topology import get_slice_topology_level, get_topology_product, is_topology_contained
+from ..utils.kueue import is_queued_cluster
+from kubernetes.utils import parse_quantity
 from .system_characteristics import (
+    SUB_SLICING_TOPOLOGIES,
+    AcceleratorType,
     AcceleratorTypeToAcceleratorCharacteristics,
     SystemCharacteristics,
+    create_accelerator_label,
+    create_machine_label,
 )
 from ..core.commands import (
     run_command_for_value,
@@ -38,10 +38,12 @@ from ..core.commands import (
     run_command_with_updates_retry,
 )
 from ..utils.file import write_tmp_file
-from ..utils.console import xpk_print, xpk_exit
+from ..utils.console import xpk_print, xpk_exit, ask_for_user_consent
 from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
 from packaging.version import Version
 
+KUEUE_VERSION = Version("v0.14.3")
+LATEST_BREAKING_VERSION = Version("v0.14.0")
 WAIT_FOR_KUEUE_TIMEOUT = "10m"
 CLUSTER_QUEUE_NAME = "cluster-queue"
 LOCAL_QUEUE_NAME = "multislice-queue"
@@ -52,10 +54,9 @@ KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
 KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2"
 MEMORY_SIZE_PER_VM = 1.2
 MIN_MEMORY_LIMIT_SIZE = 4096
-KUEUE_VERSION = Version("v0.12.2")
 
 
-@dataclass
+@dataclass(frozen=True)
 class KueueConfig:
   system: SystemCharacteristics
   total_chips: int
@@ -68,7 +69,7 @@ class KueueConfig:
   num_slices: int = 1
 
 
-@dataclass
+@dataclass(frozen=True)
 class _NameAndYaml:
   name: str
   yaml: str
@@ -79,9 +80,13 @@ class KueueManager:
 
   def __init__(
       self,
+      project: str,
+      zone: str,
       kueue_version: Version = KUEUE_VERSION,
       template_path=TEMPLATE_PATH,
   ):
+    self.project = project
+    self.zone = zone
     self.kueue_version = kueue_version
 
     self.template_env = Environment(
@@ -102,10 +107,10 @@ class KueueManager:
     Args:
       tolerations: An optional list of tolerations to apply to the kueue-controller-manager.
     """
-    return_code, installed_version = self.get_installed_kueue_version()
+    return_code, installed_version = get_installed_kueue_version()
 
-    if return_code == 0:
-      if installed_version and installed_version > self.kueue_version:
+    if return_code == 0 and installed_version:
+      if installed_version > self.kueue_version:
         xpk_print(
             f"Cluster has a newer Kueue version, {installed_version}. Skipping"
             " installation."
@@ -113,6 +118,10 @@ class KueueManager:
         return 0
       else:
         xpk_print(f"Upgrading Kueue to version v{self.kueue_version}...")
+        assert installed_version
+        prepare_code = self.__prepare_for_upgrade(installed_version)
+        if prepare_code != 0:
+          return prepare_code
     else:
       xpk_print(f"Installing Kueue version v{self.kueue_version}...")
 
@@ -122,24 +131,6 @@ class KueueManager:
 
     return self.__configure(kueue_config)
 
-  def get_installed_kueue_version(self) -> tuple[int, Version | None]:
-    command = (
-        "kubectl get deployment kueue-controller-manager -n kueue-system -o"
-        " jsonpath='{.spec.template.spec.containers[0].image}'"
-    )
-    task = "Get kueue version on server"
-    return_code, val = run_command_for_value(
-        command,
-        task,
-        dry_run_return_val="",
-    )
-    if return_code != 0:
-      return return_code, None
-    version_tag = val.split(":")
-    if len(version_tag) == 1:
-      return 1, None
-    return return_code, Version(version_tag[-1])
-
   def __install(
       self,
       tolerations: Optional[List[Dict[str, Any]]] = None,
@@ -161,6 +152,60 @@
 
     return self.__wait_for_kueue_available()
 
+  def __prepare_for_upgrade(self, installed_version: Version) -> int:
+    if installed_version >= LATEST_BREAKING_VERSION:
+      return 0
+
+    xpk_print(
+        f"Currently installed Kueue version v{installed_version} is"
+        f" incompatible with the newer v{self.kueue_version}."
+    )
+
+    changelog_link = f"https://github.com/kubernetes-sigs/kueue/blob/main/CHANGELOG/CHANGELOG-{self.kueue_version.major}.{self.kueue_version.minor}.md"
+    agreed = ask_for_user_consent(
+        "Do you want to allow XPK to update Kueue automatically? This will"
+        " delete all existing Kueue resources and create new ones. If you"
+        " decline, you will need to upgrade the Kueue manually (see"
+        f" {changelog_link} for help)."
+    )
+    if not agreed:
+      return 1
+
+    return self.__delete_all_kueue_resources()
+
+  def __delete_all_kueue_resources(self) -> int:
+    return_code, kueue_crds_string = run_command_for_value(
+        "kubectl get crd -o name | grep .kueue.x-k8s.io", "Get Kueue CRDs"
+    )
+    if return_code != 0:
+      return return_code
+
+    kueue_crds = [
+        line.strip().removeprefix(
+            "customresourcedefinition.apiextensions.k8s.io/"
+        )
+        for line in kueue_crds_string.strip().split("\n")
+    ]
+
+    for crd in kueue_crds:
+      return_code = run_command_with_updates(
+          f"kubectl delete {crd} --all", f"Delete all resources of type {crd}"
+      )
+      if return_code != 0:
+        return return_code
+
+    for crd in kueue_crds:
+      return_code = run_command_with_updates(
+          f"kubectl delete crd {crd}", f"Delete CRD {crd}"
+      )
+      if return_code != 0:
+        return return_code
+
+    return run_command_with_updates(
+        "kubectl delete deployment kueue-controller-manager -n kueue-system",
+        "Delete Kueue Controller Manager deployment",
+    )
+
   def __install_kueue_crs(self) -> int:
     manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/v{self.kueue_version}/manifests.yaml"
     install_command = (
@@ -228,6 +273,7 @@ class KueueManager:
     topology_name = (
         topology_name_and_yaml.name if topology_name_and_yaml else None
     )
+    cpu_limit, memory_limit = self.__autocorrect_resource_limits(kueue_config)
 
     # The manager builds the context internally based on its opinionated logic
     context = self.__build_template_context(
@@ -237,8 +283,8 @@ class KueueManager:
         autoprovisioning=kueue_config.autoprovisioning_enabled,
         flex=kueue_config.flex,
         num_slices=kueue_config.num_slices,
-        cpu_limit=kueue_config.cpu_limit,
-        memory_limit=kueue_config.memory_limit,
+        cpu_limit=cpu_limit,
+        memory_limit=memory_limit,
         topology_name=topology_name,
     )
 
@@ -273,19 +319,16 @@ class KueueManager:
     main_flavor_name = f"{num_slices}x{device_type_str}"
 
     node_labels_dict = {}
-    accelerator_label = create_accelerator_label(
-        system.accelerator_type, system
-    )
+    accelerator_label = create_accelerator_label(system)
     if accelerator_label:
       key, value = accelerator_label.split(":", 1)
       node_labels_dict[key] = value.strip()
 
-    machine_label = create_machine_label(
-        system.accelerator_type, system, autoprovisioning
-    )
-    if machine_label:
-      key, value = machine_label.split(":", 1)
-      node_labels_dict[key] = value.strip()
+    if not autoprovisioning:
+      machine_label = create_machine_label(system)
+      if machine_label:
+        key, value = machine_label.split(":", 1)
+        node_labels_dict[key] = value.strip()
 
     topology_label = f"topologyName: {topology_name}" if topology_name else ""
 
@@ -352,11 +395,10 @@ class KueueManager:
   def __get_topology_name_and_yaml(
       self, system: SystemCharacteristics, configure_sub_slicing: bool
   ) -> _NameAndYaml | None:
-    if system.device_type in [
-        H100_MEGA_DEVICE_TYPE,
-        H200_DEVICE_TYPE,
-        B200_DEVICE_TYPE,
-    ]:
+    if (
+        system.accelerator_type == AcceleratorType["GPU"]
+        and system.gpu_requires_topology
+    ):
       return _NameAndYaml(
           name="gke-default",
           yaml=self.template_env.get_template(
@@ -364,12 +406,25 @@
          ).render(),
       )
     elif configure_sub_slicing:
+      sorted_topologies = sorted(
+          SUB_SLICING_TOPOLOGIES, key=get_topology_product, reverse=True
+      )
+      levels = [
+          get_slice_topology_level(topology)
+          for topology in sorted_topologies
+          if is_topology_contained(
+              contained=topology, container=system.topology
+          )
+      ]
+      levels.append("kubernetes.io/hostname")
+
       return _NameAndYaml(
          name=SUB_SLICE_TOPOLOGY_NAME,
          yaml=self.template_env.get_template(
              KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE
          ).render({
              "sub_slice_topology_name": SUB_SLICE_TOPOLOGY_NAME,
+              "levels": levels,
          }),
      )
    else:
@@ -377,8 +432,6 @@ class KueueManager:
 
   def __apply_manifest(self, manifest: str) -> int:
     task = "Applying Kueue Custom Resources"
-    if is_dry_run():
-      xpk_print(f"Applying following Kueue resources:{manifest}")
     tmp_file = write_tmp_file(manifest)
     command = f"kubectl apply -f {tmp_file}"
     return run_command_with_updates(command, task)
@@ -422,13 +475,114 @@ class KueueManager:
       xpk_print(f"{task} returned ERROR {return_code}")
     return return_code
 
+  def __autocorrect_resource_limits(
+      self, kueue_config: KueueConfig
+  ) -> tuple[int, str]:
+    """Verify specified CPU and memory limits against machine type."""
+
+    cpu_limit = kueue_config.cpu_limit
+    memory_limit_str = kueue_config.memory_limit
+    if not cpu_limit and not memory_limit_str:
+      return cpu_limit, memory_limit_str
+
+    # Get CPU and memory capacity from machine type
+    command = (
+        "gcloud compute machine-types describe"
+        f" {kueue_config.system.gce_machine_type} "
+        f" --project={self.project} --zone={self.zone}"
+        " --format='value(guestCpus,memoryMb)'"
+    )
+    return_code, out = run_command_for_value(
+        command,
+        "Get vCPU and memory capacity for machine type",
+        dry_run_return_val="10 10",
+    )
+    if return_code != 0:
+      xpk_print(
+          "Unable to verify vCPU and memory capacity for machine type."
+          " XPK will proceed with using user-defined limits."
+      )
+      return cpu_limit, memory_limit_str
+
+    cpu_capacity_str, memory_capacity_MB_str = out.split()
+    if cpu_limit:
+      cpu_limit = _autocorrect_cpu_limit(cpu_limit, int(cpu_capacity_str))
+    if memory_limit_str:
+      memory_limit_str = _autocorrect_memory_limit(
+          memory_limit_str, memory_capacity_MB_str
+      )
+    return cpu_limit, memory_limit_str
+
+
+def get_installed_kueue_version(
+    dry_run_version: Version | None = None,
+) -> tuple[int, Version | None]:
+  command = (
+      "kubectl get deployment kueue-controller-manager -n kueue-system -o"
+      " jsonpath='{.spec.template.spec.containers[0].image}'"
+  )
+  task = "Get kueue version on server"
+  return_code, val = run_command_for_value(
+      command,
+      task,
+      dry_run_return_val=(
+          f"registry.k8s.io/kueue/kueue:v{dry_run_version}"
+          if dry_run_version
+          else ""
+      ),
+  )
+  if return_code != 0:
+    return return_code, None
+  version_tag = val.split(":")
+  if len(version_tag) == 1:
+    return 1, None
+  return return_code, Version(version_tag[-1])
+
 
 def has_sub_slicing_enabled() -> tuple[int, bool | None]:
   return_code, value = run_command_for_value(
-      command="kubectl get topology", task="Get defined topologies"
+      command="kubectl get topology",
+      task="Get defined topologies",
+      dry_run_return_val=SUB_SLICE_TOPOLOGY_NAME,
   )
 
   if return_code != 0:
     return return_code, None
 
   return return_code, SUB_SLICE_TOPOLOGY_NAME in value
+
+
+def _autocorrect_cpu_limit(cpu_limit: int, cpu_capacity: int) -> int:
+  if cpu_limit > cpu_capacity:
+    xpk_print(
+        "The CPU limit is above the available capacity."
+        f" We will set CPU limit to {cpu_capacity}."
+    )
+  elif cpu_limit < cpu_capacity:
+    xpk_print(
+        "The CPU limit is below the available capacity, which would lead"
+        f" to underutilization. We will set CPU limit to {cpu_capacity}."
+    )
+  return cpu_capacity
+
+
+def _autocorrect_memory_limit(
+    memory_limit_str: str, memory_capacity_MB_str: str
+) -> str:
  memory_limit_bytes = parse_quantity(memory_limit_str)
  memory_capacity_bytes = int(memory_capacity_MB_str) << 20
  if memory_limit_bytes == memory_capacity_bytes:
    return memory_limit_str
  memory_limit_str = memory_capacity_MB_str + "Mi"
  if memory_limit_bytes > memory_capacity_bytes:
    xpk_print(
        "The memory limit is above the available capacity. We will set"
        f" memory limit to {memory_limit_str}."
    )
  else:
    xpk_print(
        "The memory limit is below the available capacity, which would"
        " lead to underutilization. We will set the memory limit to"
        f" {memory_limit_str}."
    )
  return memory_limit_str
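
Note: the resource-limit autocorrection added above hinges on one unit conversion: `gcloud ... --format='value(guestCpus,memoryMb)'` reports memory in MB, which `_autocorrect_memory_limit` shifts into bytes with `<< 20` before comparing against the user's limit as parsed by `kubernetes.utils.parse_quantity`. A minimal standalone sketch of that arithmetic (the `parse_quantity` here is a simplified stand-in handling only binary suffixes; the real helper supports more formats):

from decimal import Decimal

def parse_quantity(quantity: str) -> Decimal:
  # Simplified stand-in for kubernetes.utils.parse_quantity.
  suffixes = {"Ki": 1 << 10, "Mi": 1 << 20, "Gi": 1 << 30}
  for suffix, factor in suffixes.items():
    if quantity.endswith(suffix):
      return Decimal(quantity[: -len(suffix)]) * factor
  return Decimal(quantity)

# A machine type reporting 204800 MB, converted to bytes as in the diff:
memory_capacity_bytes = 204800 << 20          # 214748364800
memory_limit_bytes = parse_quantity("100Gi")  # 107374182400

# 100Gi is below capacity, so the limit is rewritten to the full capacity in
# Mi, matching the "204800Mi" assertion in the new autocorrection test below.
assert memory_limit_bytes < memory_capacity_bytes
corrected_limit = "204800Mi"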
xpk/core/kueue_manager_test.py CHANGED
@@ -22,7 +22,7 @@ import yaml
 from unittest.mock import MagicMock, patch
 
 from xpk.core.kueue_manager import KueueConfig, KueueManager, has_sub_slicing_enabled
-from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
+from xpk.core.system_characteristics import GpuConfig, DockerPlatform, AcceleratorType, SystemCharacteristics, UserFacingNameToSystemCharacteristics
 from xpk.core.testing.commands_tester import CommandsTester
 from packaging.version import Version
 
@@ -35,6 +35,7 @@ TPU_SYSTEM: SystemCharacteristics = SystemCharacteristics(
     accelerator_type=AcceleratorType.TPU,
     device_type="v5p-8",
     supports_sub_slicing=False,
+    docker_platform=DockerPlatform.ARM,
 )
 
 KUEUE_CONFIG: KueueConfig = KueueConfig(
@@ -61,6 +62,13 @@ def set_installed_kueue_version(
   )
 
 
+@pytest.fixture(autouse=True)
+def mock_ask_for_user_consent(mocker: MockerFixture) -> MagicMock:
+  return mocker.patch(
+      "xpk.core.kueue_manager.ask_for_user_consent", return_value=True
+  )
+
+
 @pytest.fixture(autouse=True)
 def mock_commands(mocker: MockerFixture) -> CommandsTester:
   return CommandsTester(
@@ -78,7 +86,7 @@ def mock_commands(mocker: MockerFixture) -> CommandsTester:
 @pytest.fixture(autouse=True)
 @patch("jinja2.Environment", return_value=MagicMock())
 def kueue_manager(mock_env: MagicMock) -> KueueManager:
-  return KueueManager()
+  return KueueManager("test-project", "test-zone")
 
 
 def test_install_or_upgrade_when_newer_version_already_installed(
@@ -102,7 +110,7 @@ def test_install_or_upgrade_when_outdated(
   result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
 
   assert result == 0
-  mock_commands.assert_command_run("kubectl apply", "v0.12.2/manifests.yaml")
+  mock_commands.assert_command_run("kubectl apply", "v0.14.3/manifests.yaml")
   mock_commands.assert_command_run("kubectl apply -f", "/tmp/")
 
 
@@ -115,10 +123,84 @@ def test_install_or_upgrade_when_not_installed(
   result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
 
   assert result == 0
-  mock_commands.assert_command_run("kubectl apply", "v0.12.2/manifests.yaml")
+  mock_commands.assert_command_run("kubectl apply", "v0.14.3/manifests.yaml")
   mock_commands.assert_command_run("kubectl apply -f", "/tmp/")
 
 
+def test_upgrade_when_no_breaking_changes_between_versions_no_preparation_needed(
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+    mock_ask_for_user_consent: MagicMock,
+):
+  set_installed_kueue_version(mock_commands, Version("0.14.0"))
+
+  kueue_manager.install_or_upgrade(KUEUE_CONFIG)
+
+  mock_ask_for_user_consent.assert_not_called()
+
+
+def test_upgrade_with_breaking_changes_between_versions_runs_preparation(
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+    mock_ask_for_user_consent: MagicMock,
+):
+  set_installed_kueue_version(mock_commands, Version("0.11.0"))
+  fake_crds = (
+      "customresourcedefinition.apiextensions.k8s.io/kueue-crd-1.kueue.x-k8s.io\n"
+      "customresourcedefinition.apiextensions.k8s.io/kueue-crd-2.kueue.x-k8s.io"
+  )
+  mock_commands.set_result_for_command(
+      (0, fake_crds), "kubectl get crd -o name"
+  )
+  mock_ask_for_user_consent.return_value = True
+
+  result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
+
+  assert result == 0
+  mock_ask_for_user_consent.assert_called_once()
+  assert (
+      "CHANGELOG/CHANGELOG-0.14.md"
+      in mock_ask_for_user_consent.mock_calls[0].args[0]
+  )
+  mock_commands.assert_command_run(
+      "kubectl delete kueue-crd-1.kueue.x-k8s.io --all"
+  )
+  mock_commands.assert_command_run(
+      "kubectl delete kueue-crd-2.kueue.x-k8s.io --all"
+  )
+  mock_commands.assert_command_run(
+      "kubectl delete crd kueue-crd-1.kueue.x-k8s.io"
+  )
+  mock_commands.assert_command_run(
+      "kubectl delete crd kueue-crd-2.kueue.x-k8s.io"
+  )
+  mock_commands.assert_command_run(
+      "kubectl delete deployment kueue-controller-manager"
+  )
+
+
+def test_upgrade_with_breaking_changes_between_versions_does_not_run_preparation_without_consent(
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+    mock_ask_for_user_consent: MagicMock,
+):
+  set_installed_kueue_version(mock_commands, Version("0.11.0"))
+  mock_commands.set_result_for_command(
+      (
+          0,
+          "customresourcedefinition.apiextensions.k8s.io/kueue-crd-1.kueue.x-k8s.io",
+      ),
+      "kubectl get crd -o name",
+  )
+  mock_ask_for_user_consent.return_value = False
+
+  result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
+
+  assert result == 1
+  # Assert there was no command run for the Kueue crd:
+  mock_commands.assert_command_not_run("kueue-crd-1.kueue.x-k8s.io")
+
+
 def test_installation_with_tolerations(
     mock_commands: CommandsTester, kueue_manager: KueueManager
 ):
@@ -199,6 +281,10 @@ def test_configure_generates_correct_manifest_for_tpu(
 ):
   """Test that __configure generates the correct manifest content for TPUs."""
   set_installed_kueue_version(mock_commands, None)
+  mock_commands.set_result_for_command(
+      (0, "100 102400"), "gcloud compute machine-types describe"
+  )
+
   tpu_kueue_config = dataclasses.replace(
       KUEUE_CONFIG, system=TPU_SYSTEM, num_slices=2
   )
@@ -239,6 +325,39 @@ def test_configure_generates_correct_manifest_for_tpu(
   )
 
 
+@patch("xpk.core.kueue_manager.write_tmp_file")
+def test_install_autocorrects_resource_limits(
+    write_tmp_file_mock: MagicMock,
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+):
+  """Test that installation auto-corrects the specified resource limits."""
+  set_installed_kueue_version(mock_commands, None)
+  # set 50 vCPU, 200Gi memory
+  mock_commands.set_result_for_command(
+      (0, "50 204800"), "gcloud compute machine-types describe"
+  )
+
+  kueue_config = dataclasses.replace(
+      KUEUE_CONFIG, cpu_limit=100, memory_limit="100Gi"
+  )
+
+  kueue_manager.install_or_upgrade(kueue_config)
+
+  rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
+  manifest_docs = list(yaml.safe_load_all(rendered_manifest))
+  cluster_queue = _first(
+      doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
+  )
+  resources = cluster_queue["spec"]["resourceGroups"][0]["flavors"][0][
+      "resources"
+  ]
+  cpu_resource = _first(r for r in resources if r["name"] == "cpu")
+  memory_resource = _first(r for r in resources if r["name"] == "memory")
+  assert cpu_resource["nominalQuota"] == 50
+  assert memory_resource["nominalQuota"] == "204800Mi"
+
+
 @patch("xpk.core.kueue_manager.write_tmp_file")
 def test_configure_generates_manifest_with_admission_checks_for_flex_single_slice(
     write_tmp_file_mock: MagicMock,
@@ -287,6 +406,8 @@ def test_configure_generates_correct_manifest_with_gke_default_topology(
           accelerator_type=AcceleratorType.GPU,
           device_type="h100-mega-80gb-8",
           supports_sub_slicing=False,
+          docker_platform=DockerPlatform.ARM,
+          gpu_config=GpuConfig(requires_topology=True),
       ),
   )
 
@@ -317,6 +438,7 @@ def test_configure_generates_correct_manifest_with_sub_slicing(
   kueue_config = dataclasses.replace(
       KUEUE_CONFIG,
       configure_sub_slicing=True,
+      system=UserFacingNameToSystemCharacteristics["v6e-8x8"],
   )
 
   kueue_manager.install_or_upgrade(kueue_config)
@@ -329,6 +451,15 @@ def test_configure_generates_correct_manifest_with_sub_slicing(
   assert resource_flavor["spec"]["topologyName"] == "sub-slice-topology"
   topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
   assert topology["metadata"]["name"] == "sub-slice-topology"
+  expected_levels = [
+      "cloud.google.com/gke-tpu-slice-8x8-id",
+      "cloud.google.com/gke-tpu-slice-4x8-id",
+      "cloud.google.com/gke-tpu-slice-4x4-id",
+      "cloud.google.com/gke-tpu-slice-2x4-id",
+      "kubernetes.io/hostname",
+  ]
+  actual_levels = [level["nodeLabel"] for level in topology["spec"]["levels"]]
+  assert actual_levels == expected_levels
 
 
 @patch("xpk.core.kueue_manager.write_tmp_file")
@@ -373,6 +504,29 @@ def test_configure_generates_correct_manifest_with_pathways(
   assert pathways_rg["flavors"][0]["resources"][1]["nominalQuota"] == "2000G"
 
 
+@patch("xpk.core.kueue_manager.write_tmp_file")
+def test_configure_generates_correct_manifest_for_a4x(
+    write_tmp_file_mock: MagicMock,
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+):
+  """Test that __configure generates correct manifest for a4x GPUs."""
+  set_installed_kueue_version(mock_commands, None)
+  kueue_config = dataclasses.replace(
+      KUEUE_CONFIG,
+      system=UserFacingNameToSystemCharacteristics["gb200-4"],
+  )
+
+  kueue_manager.install_or_upgrade(kueue_config)
+
+  rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
+  manifest_docs = list(yaml.safe_load_all(rendered_manifest))
+
+  # Check that the gke-default topology is present for a4x.
+  topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
+  assert topology["metadata"]["name"] == "gke-default"
+
+
 def test_has_sub_slicing_enabled_returns_exit_code_when_command_fails(
     mock_commands: CommandsTester,
 ):
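
Note: the `expected_levels` assertion above mirrors the new level-derivation logic in `__get_topology_name_and_yaml`: sub-slicing topologies are sorted by chip count in descending order, filtered to those that fit inside the system's topology, and terminated with `kubernetes.io/hostname`. A sketch of that derivation for a `v6e-8x8` slice; the topology list and helper bodies here are illustrative stand-ins for `SUB_SLICING_TOPOLOGIES` and the `xpk.utils.topology` helpers, whose real definitions may differ:

SUB_SLICING_TOPOLOGIES = ["2x4", "4x4", "4x8", "8x8", "8x16", "16x16"]  # assumed

def get_topology_product(topology: str) -> int:
  # "4x8" -> 32 chips.
  product = 1
  for dim in topology.split("x"):
    product *= int(dim)
  return product

def is_topology_contained(contained: str, container: str) -> bool:
  # A sub-slice fits if every one of its dimensions fits in the full slice.
  a = sorted(int(d) for d in contained.split("x"))
  b = sorted(int(d) for d in container.split("x"))
  return len(a) == len(b) and all(x <= y for x, y in zip(a, b))

def get_slice_topology_level(topology: str) -> str:
  return f"cloud.google.com/gke-tpu-slice-{topology}-id"

levels = [
    get_slice_topology_level(t)
    for t in sorted(SUB_SLICING_TOPOLOGIES, key=get_topology_product, reverse=True)
    if is_topology_contained(contained=t, container="8x8")
]
levels.append("kubernetes.io/hostname")
# levels == ["cloud.google.com/gke-tpu-slice-8x8-id",
#            "cloud.google.com/gke-tpu-slice-4x8-id",
#            "cloud.google.com/gke-tpu-slice-4x4-id",
#            "cloud.google.com/gke-tpu-slice-2x4-id",
#            "kubernetes.io/hostname"]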
xpk/core/nap.py CHANGED
@@ -30,9 +30,8 @@ from .commands import run_command_with_updates, run_commands
 from .gcloud_context import get_cluster_location
 from .nodepool import get_all_nodepools_programmatic
 from .resources import (
-    CLUSTER_METADATA_CONFIGMAP,
-    CLUSTER_RESOURCES_CONFIGMAP,
     AutoprovisioningConfig,
+    ConfigMapType,
     get_cluster_configmap,
 )
 from .scheduling import get_total_chips_requested_from_args
@@ -266,14 +265,12 @@ def is_autoprovisioning_enabled(
     int of 0 if successful and 1 otherwise.
   """
 
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(resources_configmap_name)
+  cluster_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )
 
   if cluster_config_map is None:
-    xpk_print(
-        f'Unable to find config map: {resources_configmap_name}.'
-        ' Autoprovisioning is not enabled.'
-    )
+    xpk_print('Unable to find config map. Autoprovisioning is not enabled.')
     return False, 0
 
   return_code, autoprovisioning_value = get_value_from_map(
@@ -281,8 +278,8 @@ def is_autoprovisioning_enabled(
   )
   if return_code != 0:
     xpk_print(
-        'gke_accelerator type not found in config map:'
-        f' {resources_configmap_name}. Autoprovisioning is not enabled.'
+        'gke_accelerator type not found in config map. Autoprovisioning is not'
+        ' enabled.'
     )
     return False, 0
 
@@ -319,8 +316,9 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
 
   if capacity_type_str == CapacityType.UNKNOWN.name:
     # Use default settings from cluster creation.
-    metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-    cluster_config_map = get_cluster_configmap(metadata_configmap_name)
+    cluster_config_map = get_cluster_configmap(
+        args.cluster, ConfigMapType.METADATA
+    )
 
     # Error out if the metadata config map doesn't exist, and is attempting to use
    # autoprovisioning.
@@ -363,8 +361,9 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
 
 
 def get_cluster_provisioner(args) -> str:
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
+  cluster_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.METADATA
+  )
   cluster_provisioner = 'gcloud'
   if not cluster_config_map is None:
     provisioner = cluster_config_map.get('provisioner')
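
Note: every change in this file follows one pattern: config-map names are no longer formatted at the call sites (`f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'`); callers now pass the cluster name plus a `ConfigMapType` member, and `get_cluster_configmap` builds the name itself. A rough sketch of the shape this implies for the updated helper in `xpk/core/resources.py` (that file's diff is not shown here, so the enum values and lookup body are assumptions):

from enum import Enum

class ConfigMapType(Enum):
  # Suffixes assumed from the retired CLUSTER_METADATA_CONFIGMAP /
  # CLUSTER_RESOURCES_CONFIGMAP constants; the real values may differ.
  METADATA = "metadata-configmap"
  RESOURCES = "resources-configmap"

def get_cluster_configmap(cluster: str, kind: ConfigMapType) -> dict | None:
  # Name construction now lives here instead of at every call site; the
  # actual kubectl lookup (returning None when the map is missing) is elided.
  configmap_name = f"{cluster}-{kind.value}"
  ...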