xpk 0.14.3__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. integration/gcluster_a3mega_test.py +11 -0
  2. integration/gcluster_a3ultra_test.py +11 -0
  3. integration/gcluster_a4_test.py +11 -0
  4. xpk/commands/cluster.py +57 -21
  5. xpk/commands/cluster_gcluster.py +25 -5
  6. xpk/commands/cluster_gcluster_test.py +11 -2
  7. xpk/commands/cluster_test.py +233 -12
  8. xpk/commands/config.py +3 -5
  9. xpk/commands/kind.py +1 -1
  10. xpk/commands/storage.py +8 -10
  11. xpk/commands/workload.py +28 -11
  12. xpk/commands/workload_test.py +3 -3
  13. xpk/core/blueprint/blueprint_generator.py +70 -33
  14. xpk/core/blueprint/blueprint_test.py +9 -0
  15. xpk/core/capacity.py +46 -8
  16. xpk/core/capacity_test.py +32 -1
  17. xpk/core/cluster.py +37 -57
  18. xpk/core/cluster_test.py +95 -0
  19. xpk/core/commands.py +4 -10
  20. xpk/core/config.py +9 -2
  21. xpk/core/gcloud_context.py +18 -12
  22. xpk/core/gcloud_context_test.py +111 -1
  23. xpk/core/kjob.py +6 -9
  24. xpk/core/kueue_manager.py +192 -32
  25. xpk/core/kueue_manager_test.py +132 -4
  26. xpk/core/nodepool.py +21 -29
  27. xpk/core/nodepool_test.py +17 -15
  28. xpk/core/scheduling.py +16 -1
  29. xpk/core/scheduling_test.py +85 -6
  30. xpk/core/system_characteristics.py +77 -19
  31. xpk/core/system_characteristics_test.py +80 -5
  32. xpk/core/telemetry.py +263 -0
  33. xpk/core/telemetry_test.py +211 -0
  34. xpk/main.py +31 -13
  35. xpk/parser/cluster.py +48 -9
  36. xpk/parser/cluster_test.py +42 -3
  37. xpk/parser/workload.py +12 -0
  38. xpk/parser/workload_test.py +4 -4
  39. xpk/telemetry_uploader.py +29 -0
  40. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  41. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  42. xpk/utils/console.py +41 -10
  43. xpk/utils/console_test.py +106 -0
  44. xpk/utils/feature_flags.py +7 -1
  45. xpk/utils/file.py +4 -1
  46. xpk/utils/topology.py +4 -0
  47. xpk/utils/user_agent.py +35 -0
  48. xpk/utils/user_agent_test.py +44 -0
  49. xpk/utils/user_input.py +48 -0
  50. xpk/utils/user_input_test.py +92 -0
  51. xpk/utils/validation.py +0 -11
  52. xpk/utils/versions.py +31 -0
  53. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/METADATA +113 -92
  54. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/RECORD +58 -48
  55. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/WHEEL +0 -0
  56. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/entry_points.txt +0 -0
  57. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/licenses/LICENSE +0 -0
  58. {xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/top_level.txt +0 -0
@@ -19,6 +19,7 @@ import sys
19
19
  from dataclasses import dataclass
20
20
 
21
21
  from ..utils.console import xpk_print, xpk_exit
22
+ from ..utils.versions import ReleaseChannel
22
23
  from .commands import run_command_for_value
23
24
  from functools import lru_cache
24
25
 
@@ -117,15 +118,18 @@ def get_cluster_location(project: str, name: str, zone: str) -> str:
117
118
  class GkeServerConfig:
118
119
  """Stores the valid gke versions based on gcloud recommendations."""
119
120
 
120
- default_rapid_gke_version: str
121
+ default_gke_version: str
121
122
  valid_versions: set[str]
122
123
 
123
124
 
124
- def get_gke_server_config(args) -> tuple[int, GkeServerConfig | None]:
125
+ def get_gke_server_config(
126
+ args, release_channel: ReleaseChannel
127
+ ) -> tuple[int, GkeServerConfig | None]:
125
128
  """Determine the GKE versions supported by gcloud currently.
126
129
 
127
130
  Args:
128
131
  args: user provided arguments for running the command.
132
+ release_channel: the release channel to use.
129
133
 
130
134
  Returns:
131
135
  Tuple of
@@ -136,22 +140,24 @@ def get_gke_server_config(args) -> tuple[int, GkeServerConfig | None]:
136
140
  'gcloud container get-server-config'
137
141
  f' --project={args.project} --region={zone_to_region(args.zone)}'
138
142
  )
139
- default_rapid_gke_version_cmd = (
143
+ default_gke_version_cmd = (
140
144
  base_command
141
- + ' --flatten="channels" --filter="channels.channel=RAPID"'
145
+ + ' --flatten="channels"'
146
+ f' --filter="channels.channel={release_channel.value}"'
142
147
  ' --format="value(channels.defaultVersion)"'
143
148
  )
144
149
  valid_versions_cmd = (
145
150
  base_command
146
- + ' --flatten="channels" --filter="channels.channel=RAPID"'
151
+ + ' --flatten="channels"'
152
+ f' --filter="channels.channel={release_channel.value}"'
147
153
  ' --format="value(channels.validVersions)"'
148
154
  )
149
155
  base_command_description = 'Determine server supported GKE versions for '
150
156
 
151
157
  server_config_commands_and_descriptions = [
152
158
  (
153
- default_rapid_gke_version_cmd,
154
- base_command_description + 'default rapid gke version',
159
+ default_gke_version_cmd,
160
+ base_command_description + 'default gke version',
155
161
  ),
156
162
  (
157
163
  valid_versions_cmd,
@@ -172,8 +178,8 @@ def get_gke_server_config(args) -> tuple[int, GkeServerConfig | None]:
172
178
  command_outputs.append(cmd_output)
173
179
 
174
180
  return 0, GkeServerConfig(
175
- default_rapid_gke_version=command_outputs[0].strip(),
176
- valid_versions=set(command_outputs[1].split(';')),
181
+ default_gke_version=command_outputs[0].strip(),
182
+ valid_versions=set([s.strip() for s in command_outputs[1].split(';')]),
177
183
  )
178
184
 
179
185
 
@@ -196,7 +202,7 @@ def get_gke_control_plane_version(
196
202
  if args.gke_version is not None:
197
203
  master_gke_version = args.gke_version
198
204
  else:
199
- master_gke_version = gke_server_config.default_rapid_gke_version
205
+ master_gke_version = gke_server_config.default_gke_version
200
206
 
201
207
  is_valid_version = master_gke_version in gke_server_config.valid_versions
202
208
 
@@ -204,7 +210,7 @@ def get_gke_control_plane_version(
204
210
  xpk_print(
205
211
  f'Planned GKE Version: {master_gke_version}\n Valid Versions:'
206
212
  f'\n{gke_server_config.valid_versions}\nRecommended / Default GKE'
207
- f' Version: {gke_server_config.default_rapid_gke_version}'
213
+ f' Version: {gke_server_config.default_gke_version}'
208
214
  )
209
215
  xpk_print(
210
216
  f'Error: Planned GKE Version {master_gke_version} is not valid.'
@@ -213,7 +219,7 @@ def get_gke_control_plane_version(
213
219
  xpk_print(
214
220
  'Please select a gke version from the above list using --gke-version=x'
215
221
  ' argument or rely on the default gke version:'
216
- f' {gke_server_config.default_rapid_gke_version}'
222
+ f' {gke_server_config.default_gke_version}'
217
223
  )
218
224
  return 1, None
219
225
 
@@ -15,7 +15,20 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  import pytest
18
- from .gcloud_context import get_cluster_location, zone_to_region
18
+ from unittest.mock import MagicMock
19
+ from .gcloud_context import (
20
+ get_cluster_location,
21
+ get_gke_control_plane_version,
22
+ get_gke_server_config,
23
+ GkeServerConfig,
24
+ zone_to_region,
25
+ )
26
+ from ..utils.versions import ReleaseChannel
27
+
28
+
29
@pytest.fixture(autouse=True)
def xpk_print(mocker):
  """Patch `xpk_print` in `xpk.core.gcloud_context` for every test.

  Returns the patch object so tests can inspect the printed messages.
  """
  patched_print = mocker.patch("xpk.core.gcloud_context.xpk_print")
  return patched_print
19
32
 
20
33
 
21
34
  def test_zone_to_region_raises_when_zone_is_invalid():
@@ -94,3 +107,100 @@ def test_get_cluster_location_invokes_command_for_different_input_args(mocker):
94
107
  get_cluster_location(project="project6", name="name6", zone="us-central1-a")
95
108
 
96
109
  assert mock.call_count == 2
110
+
111
+
112
def test_get_gke_server_config_success(mocker):
  """Both gcloud queries succeed and their outputs populate the config."""
  command_results = [
      (0, "1.2.3"),
      (0, "1.2.3;1.2.4;1.3.0"),
  ]
  run_command = mocker.patch(
      "xpk.core.gcloud_context.run_command_for_value",
      side_effect=command_results,
  )
  cli_args = mocker.Mock(project="test-project", zone="us-central1")

  code, server_config = get_gke_server_config(cli_args, ReleaseChannel.STABLE)

  assert code == 0
  assert isinstance(server_config, GkeServerConfig)
  assert server_config.default_gke_version == "1.2.3"
  assert server_config.valid_versions == {"1.2.3", "1.2.4", "1.3.0"}
  # One call for the default version, one for the list of valid versions.
  assert run_command.call_count == 2
129
+
130
+
131
def test_get_gke_server_config_fails_on_default_version_command(mocker):
  """A failing first gcloud query yields return code 1 and no config."""
  mocker.patch(
      "xpk.core.gcloud_context.run_command_for_value",
      return_value=(1, "error"),
  )
  cli_args = mocker.Mock(project="test-project", zone="us-central1")

  code, server_config = get_gke_server_config(cli_args, ReleaseChannel.STABLE)

  assert code == 1
  assert server_config is None
142
+
143
+
144
def test_get_gke_server_config_fails_on_valid_versions_command(mocker):
  """A failing second gcloud query yields return code 1 and no config."""
  command_results = [(0, "1.2.3"), (1, "error")]
  mocker.patch(
      "xpk.core.gcloud_context.run_command_for_value",
      side_effect=command_results,
  )
  cli_args = mocker.Mock(project="test-project", zone="us-central1")

  code, server_config = get_gke_server_config(cli_args, ReleaseChannel.STABLE)

  assert code == 1
  assert server_config is None
155
+
156
+
157
def test_get_gke_control_plane_version_uses_default_when_not_specified(mocker):
  """Without --gke-version the channel's default version is selected."""
  cli_args = mocker.Mock(gke_version=None)
  server_config = GkeServerConfig(
      default_gke_version="1.2.3",
      valid_versions={"1.2.3", "1.2.4"},
  )

  code, selected = get_gke_control_plane_version(cli_args, server_config)

  assert code == 0
  assert selected == "1.2.3"
167
+
168
+
169
def test_get_gke_control_plane_version_uses_user_version_when_valid(mocker):
  """A user-supplied version that is in the valid set is returned as-is."""
  cli_args = mocker.Mock(gke_version="1.2.4")
  server_config = GkeServerConfig(
      default_gke_version="1.2.3",
      valid_versions={"1.2.3", "1.2.4"},
  )

  code, selected = get_gke_control_plane_version(cli_args, server_config)

  assert code == 0
  assert selected == "1.2.4"
179
+
180
+
181
def test_get_gke_control_plane_version_fails_for_invalid_user_version(
    mocker, xpk_print: MagicMock
):
  """A user-supplied version outside the valid set is rejected with guidance."""
  cli_args = mocker.Mock(gke_version="1.2.5")
  server_config = GkeServerConfig(
      default_gke_version="1.2.3",
      valid_versions={"1.2.3", "1.2.4"},
  )

  code, selected = get_gke_control_plane_version(cli_args, server_config)

  assert code == 1
  assert selected is None

  # The three messages are checked in the order they are printed.
  summary, error, hint = (
      xpk_print.mock_calls[index].args[0] for index in range(3)
  )
  assert "Planned GKE Version: 1.2.5" in summary
  assert "Recommended / Default GKE Version: 1.2.3" in summary
  assert "Error: Planned GKE Version 1.2.5 is not valid." in error
  assert (
      "Please select a gke version from the above list using --gke-version=x"
      " argument or rely on the default gke version: 1.2.3"
      in hint
  )
xpk/core/kjob.py CHANGED
@@ -38,7 +38,7 @@ from .config import (
38
38
  KJOB_SHELL_IMAGE,
39
39
  KJOB_SHELL_INTERACTIVE_COMMAND,
40
40
  KJOB_SHELL_WORKING_DIRECTORY,
41
- XpkConfig,
41
+ xpk_config,
42
42
  )
43
43
  from .network import get_cluster_subnetworks
44
44
  from .system_characteristics import AcceleratorType, SystemCharacteristics
@@ -234,8 +234,7 @@ def get_pod_template_interactive_command() -> str:
234
234
  Returns:
235
235
  str - PodTemplate's interactive command
236
236
  """
237
- config = XpkConfig()
238
- pod_command = config.get(KJOB_SHELL_INTERACTIVE_COMMAND)
237
+ pod_command = xpk_config.get(KJOB_SHELL_INTERACTIVE_COMMAND)
239
238
  if pod_command is None or len(pod_command) == 0:
240
239
  pod_command = PodTemplateDefaults.INTERACTIVE_COMMAND.value
241
240
 
@@ -287,11 +286,10 @@ def create_job_template_instance(
287
286
  Returns:
288
287
  exit_code > 0 if creating JobTemplate fails, 0 otherwise
289
288
  """
290
- config = XpkConfig()
291
- job_image = config.get(KJOB_BATCH_IMAGE)
289
+ job_image = xpk_config.get(KJOB_BATCH_IMAGE)
292
290
  if job_image is None or len(job_image) == 0:
293
291
  job_image = JobTemplateDefaults.IMAGE.value
294
- working_directory = config.get(KJOB_BATCH_WORKING_DIRECTORY)
292
+ working_directory = xpk_config.get(KJOB_BATCH_WORKING_DIRECTORY)
295
293
  if working_directory is None or len(working_directory) == 0:
296
294
  working_directory = JobTemplateDefaults.WORKING_DIRECTORY.value
297
295
  resources = (
@@ -332,11 +330,10 @@ def create_pod_template_instance(service_account: str) -> int:
332
330
  Returns:
333
331
  exit_code > 0 if creating PodTemplate fails, 0 otherwise
334
332
  """
335
- config = XpkConfig()
336
- pod_image = config.get(KJOB_SHELL_IMAGE)
333
+ pod_image = xpk_config.get(KJOB_SHELL_IMAGE)
337
334
  if pod_image is None or len(pod_image) == 0:
338
335
  pod_image = PodTemplateDefaults.IMAGE.value
339
- working_directory = config.get(KJOB_SHELL_WORKING_DIRECTORY)
336
+ working_directory = xpk_config.get(KJOB_SHELL_WORKING_DIRECTORY)
340
337
  if working_directory is None or len(working_directory) == 0:
341
338
  working_directory = PodTemplateDefaults.WORKING_DIRECTORY.value
342
339
 
xpk/core/kueue_manager.py CHANGED
@@ -20,15 +20,17 @@ from dataclasses import dataclass
20
20
  from typing import Optional, List, Dict, Any
21
21
  import json
22
22
  from jinja2 import Environment, FileSystemLoader
23
- from ..utils.execution_context import is_dry_run
24
- from ..utils.kueue import is_queued_cluster
25
23
 
24
+ from ..utils.topology import get_slice_topology_level, get_topology_product, is_topology_contained
25
+ from ..utils.kueue import is_queued_cluster
26
+ from kubernetes.utils import parse_quantity
26
27
  from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
27
28
  from .scheduling import (
28
29
  create_accelerator_label,
29
30
  create_machine_label,
30
31
  )
31
32
  from .system_characteristics import (
33
+ SUB_SLICING_TOPOLOGIES,
32
34
  AcceleratorTypeToAcceleratorCharacteristics,
33
35
  SystemCharacteristics,
34
36
  )
@@ -38,10 +40,12 @@ from ..core.commands import (
38
40
  run_command_with_updates_retry,
39
41
  )
40
42
  from ..utils.file import write_tmp_file
41
- from ..utils.console import xpk_print, xpk_exit
43
+ from ..utils.console import xpk_print, xpk_exit, ask_for_user_consent
42
44
  from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
43
45
  from packaging.version import Version
44
46
 
47
+ KUEUE_VERSION = Version("v0.14.3")
48
+ LATEST_BREAKING_VERSION = Version("v0.14.0")
45
49
  WAIT_FOR_KUEUE_TIMEOUT = "10m"
46
50
  CLUSTER_QUEUE_NAME = "cluster-queue"
47
51
  LOCAL_QUEUE_NAME = "multislice-queue"
@@ -52,10 +56,9 @@ KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
52
56
  KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2"
53
57
  MEMORY_SIZE_PER_VM = 1.2
54
58
  MIN_MEMORY_LIMIT_SIZE = 4096
55
- KUEUE_VERSION = Version("v0.12.2")
56
59
 
57
60
 
58
- @dataclass
61
+ @dataclass(frozen=True)
59
62
  class KueueConfig:
60
63
  system: SystemCharacteristics
61
64
  total_chips: int
@@ -68,7 +71,7 @@ class KueueConfig:
68
71
  num_slices: int = 1
69
72
 
70
73
 
71
- @dataclass
74
+ @dataclass(frozen=True)
72
75
  class _NameAndYaml:
73
76
  name: str
74
77
  yaml: str
@@ -79,9 +82,13 @@ class KueueManager:
79
82
 
80
83
  def __init__(
81
84
  self,
85
+ project: str,
86
+ zone: str,
82
87
  kueue_version: Version = KUEUE_VERSION,
83
88
  template_path=TEMPLATE_PATH,
84
89
  ):
90
+ self.project = project
91
+ self.zone = zone
85
92
  self.kueue_version = kueue_version
86
93
 
87
94
  self.template_env = Environment(
@@ -102,10 +109,10 @@ class KueueManager:
102
109
  Args:
103
110
  tolerations: An optional list of tolerations to apply to the kueue-controller-manager.
104
111
  """
105
- return_code, installed_version = self.get_installed_kueue_version()
112
+ return_code, installed_version = get_installed_kueue_version()
106
113
 
107
- if return_code == 0:
108
- if installed_version and installed_version > self.kueue_version:
114
+ if return_code == 0 and installed_version:
115
+ if installed_version > self.kueue_version:
109
116
  xpk_print(
110
117
  f"Cluster has a newer Kueue version, {installed_version}. Skipping"
111
118
  " installation."
@@ -113,6 +120,10 @@ class KueueManager:
113
120
  return 0
114
121
  else:
115
122
  xpk_print(f"Upgrading Kueue to version v{self.kueue_version}...")
123
+ assert installed_version
124
+ prepare_code = self.__prepare_for_upgrade(installed_version)
125
+ if prepare_code != 0:
126
+ return prepare_code
116
127
  else:
117
128
  xpk_print(f"Installing Kueue version v{self.kueue_version}...")
118
129
 
@@ -122,24 +133,6 @@ class KueueManager:
122
133
 
123
134
  return self.__configure(kueue_config)
124
135
 
125
- def get_installed_kueue_version(self) -> tuple[int, Version | None]:
126
- command = (
127
- "kubectl get deployment kueue-controller-manager -n kueue-system -o"
128
- " jsonpath='{.spec.template.spec.containers[0].image}'"
129
- )
130
- task = "Get kueue version on server"
131
- return_code, val = run_command_for_value(
132
- command,
133
- task,
134
- dry_run_return_val="",
135
- )
136
- if return_code != 0:
137
- return return_code, None
138
- version_tag = val.split(":")
139
- if len(version_tag) == 1:
140
- return 1, None
141
- return return_code, Version(version_tag[-1])
142
-
143
136
  def __install(
144
137
  self,
145
138
  tolerations: Optional[List[Dict[str, Any]]] = None,
@@ -161,6 +154,60 @@ class KueueManager:
161
154
 
162
155
  return self.__wait_for_kueue_available()
163
156
 
157
+ def __prepare_for_upgrade(self, installed_version: Version) -> int:
158
+ if installed_version >= LATEST_BREAKING_VERSION:
159
+ return 0
160
+
161
+ xpk_print(
162
+ f"Currently installed Kueue version v{installed_version} is"
163
+ f" incompatible with the newer v{self.kueue_version}."
164
+ )
165
+
166
+ changelog_link = f"https://github.com/kubernetes-sigs/kueue/blob/main/CHANGELOG/CHANGELOG-{self.kueue_version.major}.{self.kueue_version.minor}.md"
167
+ agreed = ask_for_user_consent(
168
+ "Do you want to allow XPK to update Kueue automatically? This will"
169
+ " delete all existing Kueue resources and create new ones. If you"
170
+ " decline, you will need to upgrade the Kueue manually (see"
171
+ f" {changelog_link} for help)."
172
+ )
173
+ if not agreed:
174
+ return 1
175
+
176
+ return self.__delete_all_kueue_resources()
177
+
178
+ def __delete_all_kueue_resources(self) -> int:
179
+ return_code, kueue_crds_string = run_command_for_value(
180
+ "kubectl get crd -o name | grep .kueue.x-k8s.io", "Get Kueue CRDs"
181
+ )
182
+ if return_code != 0:
183
+ return return_code
184
+
185
+ kueue_crds = [
186
+ line.strip().removeprefix(
187
+ "customresourcedefinition.apiextensions.k8s.io/"
188
+ )
189
+ for line in kueue_crds_string.strip().split("\n")
190
+ ]
191
+
192
+ for crd in kueue_crds:
193
+ return_code = run_command_with_updates(
194
+ f"kubectl delete {crd} --all", f"Delete all resources of type {crd}"
195
+ )
196
+ if return_code != 0:
197
+ return return_code
198
+
199
+ for crd in kueue_crds:
200
+ return_code = run_command_with_updates(
201
+ f"kubectl delete crd {crd}", f"Delete CRD {crd}"
202
+ )
203
+ if return_code != 0:
204
+ return return_code
205
+
206
+ return run_command_with_updates(
207
+ "kubectl delete deployment kueue-controller-manager -n kueue-system",
208
+ "Delete Kueue Controller Manager deployment",
209
+ )
210
+
164
211
  def __install_kueue_crs(self) -> int:
165
212
  manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/v{self.kueue_version}/manifests.yaml"
166
213
  install_command = (
@@ -228,6 +275,7 @@ class KueueManager:
228
275
  topology_name = (
229
276
  topology_name_and_yaml.name if topology_name_and_yaml else None
230
277
  )
278
+ cpu_limit, memory_limit = self.__autocorrect_resource_limits(kueue_config)
231
279
 
232
280
  # The manager builds the context internally based on its opinionated logic
233
281
  context = self.__build_template_context(
@@ -237,8 +285,8 @@ class KueueManager:
237
285
  autoprovisioning=kueue_config.autoprovisioning_enabled,
238
286
  flex=kueue_config.flex,
239
287
  num_slices=kueue_config.num_slices,
240
- cpu_limit=kueue_config.cpu_limit,
241
- memory_limit=kueue_config.memory_limit,
288
+ cpu_limit=cpu_limit,
289
+ memory_limit=memory_limit,
242
290
  topology_name=topology_name,
243
291
  )
244
292
 
@@ -364,12 +412,25 @@ class KueueManager:
364
412
  ).render(),
365
413
  )
366
414
  elif configure_sub_slicing:
415
+ sorted_topologies = sorted(
416
+ SUB_SLICING_TOPOLOGIES, key=get_topology_product, reverse=True
417
+ )
418
+ levels = [
419
+ get_slice_topology_level(topology)
420
+ for topology in sorted_topologies
421
+ if is_topology_contained(
422
+ contained=topology, container=system.topology
423
+ )
424
+ ]
425
+ levels.append("kubernetes.io/hostname")
426
+
367
427
  return _NameAndYaml(
368
428
  name=SUB_SLICE_TOPOLOGY_NAME,
369
429
  yaml=self.template_env.get_template(
370
430
  KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE
371
431
  ).render({
372
432
  "sub_slice_topology_name": SUB_SLICE_TOPOLOGY_NAME,
433
+ "levels": levels,
373
434
  }),
374
435
  )
375
436
  else:
@@ -377,8 +438,6 @@ class KueueManager:
377
438
 
378
439
  def __apply_manifest(self, manifest: str) -> int:
379
440
  task = "Applying Kueue Custom Resources"
380
- if is_dry_run():
381
- xpk_print(f"Applying following Kueue resources:{manifest}")
382
441
  tmp_file = write_tmp_file(manifest)
383
442
  command = f"kubectl apply -f {tmp_file}"
384
443
  return run_command_with_updates(command, task)
@@ -422,13 +481,114 @@ class KueueManager:
422
481
  xpk_print(f"{task} returned ERROR {return_code}")
423
482
  return return_code
424
483
 
484
+ def __autocorrect_resource_limits(
485
+ self, kueue_config: KueueConfig
486
+ ) -> tuple[int, str]:
487
+ """Verify specified CPU and memory limits against machine type."""
488
+
489
+ cpu_limit = kueue_config.cpu_limit
490
+ memory_limit_str = kueue_config.memory_limit
491
+ if not cpu_limit and not memory_limit_str:
492
+ return cpu_limit, memory_limit_str
493
+
494
+ # Get CPU and memory capacity from machine type
495
+ command = (
496
+ "gcloud compute machine-types describe"
497
+ f" {kueue_config.system.gce_machine_type} "
498
+ f" --project={self.project} --zone={self.zone}"
499
+ " --format='value(guestCpus,memoryMb)'"
500
+ )
501
+ return_code, out = run_command_for_value(
502
+ command,
503
+ "Get vCPU and memory capacity for machine type",
504
+ dry_run_return_val="10 10",
505
+ )
506
+ if return_code != 0:
507
+ xpk_print(
508
+ "Unable to verify vCPU and memory capacity for machine type."
509
+ " XPK will proceed with using user-defined limits."
510
+ )
511
+ return cpu_limit, memory_limit_str
512
+
513
+ cpu_capacity_str, memory_capacity_MB_str = out.split()
514
+ if cpu_limit:
515
+ cpu_limit = _autocorrect_cpu_limit(cpu_limit, int(cpu_capacity_str))
516
+ if memory_limit_str:
517
+ memory_limit_str = _autocorrect_memory_limit(
518
+ memory_limit_str, memory_capacity_MB_str
519
+ )
520
+ return cpu_limit, memory_limit_str
521
+
522
+
523
def get_installed_kueue_version(
    dry_run_version: Version | None = None,
) -> tuple[int, Version | None]:
  """Read the Kueue version from the controller manager's container image.

  Args:
    dry_run_version: version to report in dry-run mode; when omitted the
      dry-run output is empty, which is reported as "no version found".

  Returns:
    Tuple of (return code, version). The version is None when the kubectl
    command fails, when the image has no `:` separated tag, or when the text
    after the last `:` is not a valid version. The last two cases use return
    code 1.
  """
  # Local import keeps this block self-contained; `packaging` is already a
  # dependency of this module.
  from packaging.version import InvalidVersion

  command = (
      "kubectl get deployment kueue-controller-manager -n kueue-system -o"
      " jsonpath='{.spec.template.spec.containers[0].image}'"
  )
  task = "Get kueue version on server"
  return_code, val = run_command_for_value(
      command,
      task,
      dry_run_return_val=(
          f"registry.k8s.io/kueue/kueue:v{dry_run_version}"
          if dry_run_version
          else ""
      ),
  )
  if return_code != 0:
    return return_code, None
  version_tag = val.split(":")
  if len(version_tag) == 1:
    return 1, None
  try:
    return return_code, Version(version_tag[-1])
  except InvalidVersion:
    # The suffix is not a version (e.g. a digest, `latest`, or a registry
    # port with no tag); report it the same way as a missing tag.
    return 1, None
546
+
425
547
 
426
548
def has_sub_slicing_enabled() -> tuple[int, bool | None]:
  """Check whether the sub-slicing topology is defined on the cluster.

  Returns:
    Tuple of (return code, flag). The flag is None when `kubectl get topology`
    fails; otherwise it tells whether SUB_SLICE_TOPOLOGY_NAME appears in the
    command output.
  """
  return_code, topologies = run_command_for_value(
      command="kubectl get topology",
      task="Get defined topologies",
      dry_run_return_val=SUB_SLICE_TOPOLOGY_NAME,
  )

  if return_code == 0:
    enabled = SUB_SLICE_TOPOLOGY_NAME in topologies
  else:
    enabled = None
  return return_code, enabled
559
+
560
+
561
+ def _autocorrect_cpu_limit(cpu_limit: int, cpu_capacity: int) -> int:
562
+ if cpu_limit > cpu_capacity:
563
+ xpk_print(
564
+ "The CPU limit is above the available capacity."
565
+ f" We will set CPU limit to {cpu_capacity}."
566
+ )
567
+ elif cpu_limit < cpu_capacity:
568
+ xpk_print(
569
+ "The CPU limit is below the available capacity, which would lead"
570
+ f" to underutilization. We will set CPU limit to {cpu_capacity}."
571
+ )
572
+ return cpu_capacity
573
+
574
+
575
def _autocorrect_memory_limit(
    memory_limit_str: str, memory_capacity_MB_str: str
) -> str:
  """Return the memory limit to use, corrected to the machine capacity.

  Args:
    memory_limit_str: the user-defined memory limit, in a form accepted by
      `parse_quantity`.
    memory_capacity_MB_str: the machine memory capacity as an integer string;
      it is scaled by 2**20 for the comparison.

  Returns:
    `memory_limit_str` unchanged when it equals the capacity, otherwise the
    capacity expressed as "<memory_capacity_MB_str>Mi".
  """
  requested_bytes = parse_quantity(memory_limit_str)
  capacity_bytes = int(memory_capacity_MB_str) * (1 << 20)
  if requested_bytes == capacity_bytes:
    return memory_limit_str

  corrected_limit = f"{memory_capacity_MB_str}Mi"
  if requested_bytes > capacity_bytes:
    notice = (
        "The memory limit is above the available capacity. We will set"
        f" memory limit to {corrected_limit}."
    )
  else:
    notice = (
        "The memory limit is below the available capacity, which would"
        " lead to underutilization. We will set the memory limit to"
        f" {corrected_limit}."
    )
  xpk_print(notice)
  return corrected_limit