xpk 0.14.2__py3-none-any.whl → 0.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. xpk/commands/cluster.py +57 -22
  2. xpk/commands/cluster_gcluster_test.py +2 -2
  3. xpk/commands/cluster_test.py +197 -25
  4. xpk/commands/inspector.py +20 -7
  5. xpk/commands/kind.py +1 -1
  6. xpk/commands/workload.py +42 -4
  7. xpk/commands/workload_test.py +88 -5
  8. xpk/core/blueprint/blueprint_definitions.py +16 -1
  9. xpk/core/blueprint/blueprint_generator.py +11 -11
  10. xpk/core/capacity.py +17 -0
  11. xpk/core/capacity_test.py +50 -0
  12. xpk/core/config.py +1 -1
  13. xpk/core/docker_container.py +4 -4
  14. xpk/core/docker_resources.py +11 -11
  15. xpk/core/kjob.py +3 -5
  16. xpk/core/kueue_manager.py +21 -10
  17. xpk/core/kueue_manager_test.py +379 -536
  18. xpk/core/nap.py +1 -1
  19. xpk/core/nodepool.py +9 -9
  20. xpk/core/nodepool_test.py +4 -4
  21. xpk/core/pathways.py +1 -1
  22. xpk/core/resources.py +1 -1
  23. xpk/core/scheduling.py +7 -13
  24. xpk/core/system_characteristics.py +42 -35
  25. xpk/core/system_characteristics_test.py +3 -3
  26. xpk/core/testing/__init__.py +15 -0
  27. xpk/core/testing/commands_tester.py +131 -0
  28. xpk/core/testing/commands_tester_test.py +129 -0
  29. xpk/core/updates.py +57 -0
  30. xpk/core/updates_test.py +80 -0
  31. xpk/main.py +7 -4
  32. xpk/parser/common.py +8 -0
  33. xpk/utils/execution_context.py +20 -2
  34. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/METADATA +1 -3
  35. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/RECORD +39 -33
  36. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/WHEEL +0 -0
  37. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/entry_points.txt +0 -0
  38. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/licenses/LICENSE +0 -0
  39. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/top_level.txt +0 -0
@@ -17,8 +17,9 @@ limitations under the License.
 import dataclasses
 from unittest.mock import MagicMock, patch
 import pytest
-from ..core.system_characteristics import SystemCharacteristics
-from .workload import _validate_sub_slicing_topology
+from ..core.system_characteristics import SystemCharacteristics, AcceleratorType
+from .workload import _validate_sub_slicing_topology, _validate_sub_slicing_availability
+from packaging.version import Version
 
 
 SYSTEM_CHARACTERISTICS = SystemCharacteristics(
@@ -27,7 +28,7 @@ SYSTEM_CHARACTERISTICS = SystemCharacteristics(
     gke_accelerator='nvidia-l4',
     gce_machine_type='g2-standard-12',
     chips_per_vm=1,
-    accelerator_type=1,
+    accelerator_type=AcceleratorType.TPU,
     device_type='l4-1',
     supports_sub_slicing=True,
     requires_workload_policy=False,
@@ -40,7 +41,7 @@ def xpk_print(mocker):
 
 
 def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
-    xpk_print,
+    xpk_print: MagicMock,
 ):
   with pytest.raises(SystemExit):
     _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '2x1')
@@ -50,7 +51,9 @@ def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
   )
 
 
-def test_validate_sub_slicing_topology_exits_for_too_large_topology(xpk_print):
+def test_validate_sub_slicing_topology_exits_for_too_large_topology(
+    xpk_print: MagicMock,
+):
   with pytest.raises(SystemExit):
     _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '16x16')
 
@@ -64,6 +67,86 @@ def test_validate_sub_slicing_topology_does_nothing_for_supported_topology():
   _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '4x4')
 
 
+def test_validate_sub_slicing_availability_exits_when_getting_topologies_fails(
+    xpk_print: MagicMock, mocker
+):
+  mocker.patch(
+      'xpk.commands.workload.has_sub_slicing_enabled',
+      return_value=(1, None),
+  )
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_availability()
+
+  assert (
+      'Unable to validate sub-slicing support'
+      in xpk_print.mock_calls[0].args[0]
+  )
+
+
+def test_validate_sub_slicing_availability_exits_when_subslicing_topology_is_not_defined(
+    xpk_print: MagicMock, mocker
+):
+  mocker.patch(
+      'xpk.commands.workload.has_sub_slicing_enabled',
+      return_value=(0, False),
+  )
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_availability()
+
+  assert (
+      'Cluster has not been not set up for Sub-slicing.'
+      in xpk_print.mock_calls[0].args[0]
+  )
+
+
+def test_validate_sub_slicing_availability_exits_when_kueue_version_cannot_be_determined(
+    xpk_print: MagicMock, mocker
+):
+  mocker.patch(
+      'xpk.commands.workload.has_sub_slicing_enabled',
+      return_value=(0, True),
+  )
+  mocker.patch(
+      'xpk.commands.workload.KueueManager.get_installed_kueue_version',
+      return_value=(1, None),
+  )
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_availability()
+
+  assert 'Unable to validate sub-slicing' in xpk_print.mock_calls[0].args[0]
+
+
+def test_validate_sub_slicing_availability_exits_when_kueue_version_does_not_meet_minimum_requirements(
+    xpk_print: MagicMock, mocker
+):
+  mocker.patch(
+      'xpk.commands.workload.has_sub_slicing_enabled',
+      return_value=(0, True),
+  )
+  mocker.patch(
+      'xpk.commands.workload.KueueManager.get_installed_kueue_version',
+      return_value=(0, Version('0.0.0')),
+  )
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_availability()
+
+  assert 'The minimal required version is' in xpk_print.mock_calls[0].args[0]
+
+
+def test_validate_sub_slicing_availability_does_nothing_when_cluster_is_correctly_configured_for_subslicing(
+    mocker,
+):
+  mocker.patch(
+      'xpk.commands.workload.has_sub_slicing_enabled',
+      return_value=(0, True),
+  )
+  mocker.patch(
+      'xpk.commands.workload.KueueManager.get_installed_kueue_version',
+      return_value=(0, Version('0.13.0')),
+  )
+  _validate_sub_slicing_availability()
+
+
 @patch('xpk.commands.common.xpk_print')
 def test_validate_sub_slicing_topology_fails_for_unsupported_system(
     common_xpk_print: MagicMock,
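
Note: the new tests above drive _validate_sub_slicing_availability entirely through mocks, and the function body itself is not part of this diff. A minimal sketch of the flow the tests imply (the helper name, exit codes, and the 0.13.0 minimum are assumptions taken from the assertions and mock values, not from the shipped xpk/commands/workload.py):

    # Sketch only -- reconstructed from the tests above, not the actual implementation.
    from packaging.version import Version
    from xpk.core.kueue_manager import KueueManager, has_sub_slicing_enabled
    from xpk.utils.console import xpk_print, xpk_exit

    MIN_SUB_SLICING_KUEUE_VERSION = Version('0.13.0')  # assumed threshold; the passing test uses 0.13.0


    def _validate_sub_slicing_availability_sketch() -> None:
      # Is a sub-slicing topology defined in the cluster at all?
      return_code, enabled = has_sub_slicing_enabled()
      if return_code != 0:
        xpk_print('Unable to validate sub-slicing support.')
        xpk_exit(return_code)
      if not enabled:
        xpk_print('Cluster has not been not set up for Sub-slicing.')  # message copied from the test assertion
        xpk_exit(1)
      # Does the installed Kueue meet the minimum version?
      return_code, version = KueueManager().get_installed_kueue_version()
      if return_code != 0 or version is None:
        xpk_print('Unable to validate sub-slicing support.')
        xpk_exit(1)
      if version < MIN_SUB_SLICING_KUEUE_VERSION:
        xpk_print(f'The minimal required version is {MIN_SUB_SLICING_KUEUE_VERSION}.')
        xpk_exit(1)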
@@ -36,6 +36,21 @@ class DeploymentModule:
   settings: Optional[dict[str, Any]] = None
   use: Optional[list[str]] = None
 
+  def update_settings(self, additionalSettings: dict[str, Any]) -> None:
+    if self.settings is None:
+      self.settings = dict()
+    self.settings.update(additionalSettings)
+
+  def set_setting(self, key: str, value: Any) -> None:
+    if self.settings is None:
+      self.settings = dict()
+    self.settings[key] = value
+
+  def append_use(self, use: str) -> None:
+    if self.use is None:
+      self.use = list()
+    self.use.append(use)
+
 
 @dataclass
 class DeploymentGroup:
@@ -59,6 +74,6 @@ class Blueprint:
   blueprint_name: Optional[str]
   toolkit_modules_url: str
   toolkit_modules_version: str
-  vars: dict[str, str | list[str]] | None
+  vars: dict[str, str | list[str] | dict[str, str]] | None
   terraform_providers: Optional[dict[str, Any]] = None
   validators: Optional[list[Any]] = None
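
The three DeploymentModule helpers added above exist because settings and use default to None on the dataclass, so call sites previously had to guard against None before mutating them; the blueprint_generator.py hunks that follow switch to the helpers. Illustrative usage, where pool, policy and group_placement_0 stand for objects built elsewhere in the generator (they are placeholders, not part of this diff):

    # Illustrative only; 'pool' is any DeploymentModule whose settings/use are still None.
    pool.update_settings({'static_node_count': 4})   # creates the dict, then merges the mapping
    pool.set_setting('placement_policy', policy)     # creates the dict if needed, sets one key
    pool.append_use(group_placement_0.id)            # creates the list, then appends the module id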
@@ -211,9 +211,9 @@ class BlueprintGenerator:
         outputs=["instructions"],
     )
     if capacity_type == CapacityType.FLEX_START:
-      a3_megagpu_pool_0.settings.update(self.get_dws_flex_start())
+      a3_megagpu_pool_0.update_settings(self.get_dws_flex_start())
     else:
-      a3_megagpu_pool_0.settings.update({"static_node_count": num_nodes})
+      a3_megagpu_pool_0.update_settings({"static_node_count": num_nodes})
 
     set_placement_policy = capacity_type != CapacityType.SPOT
     workload = DeploymentModule(
@@ -252,8 +252,8 @@ class BlueprintGenerator:
 
     print(reservation_placement_policy)
     if reservation_placement_policy is not None:
-      a3_megagpu_pool_0.settings["placement_policy"] = (
-          reservation_placement_policy
+      a3_megagpu_pool_0.set_setting(
+          "placement_policy", reservation_placement_policy
       )
 
     primary_group = DeploymentGroup(
@@ -268,7 +268,7 @@ class BlueprintGenerator:
         ],
     )
     if set_placement_policy and reservation_placement_policy is None:
-      a3_megagpu_pool_0.use.append(group_placement_0.id)
+      a3_megagpu_pool_0.append_use(group_placement_0.id)
       primary_group.modules.append(group_placement_0)
     a3_mega_blueprint = Blueprint(
         terraform_backend_defaults=self._getblock_terraform_backend(
@@ -580,9 +580,9 @@ class BlueprintGenerator:
         outputs=["instructions"],
     )
     if capacity_type == CapacityType.FLEX_START:
-      gpu_pool.settings.update(self.get_dws_flex_start())
+      gpu_pool.update_settings(self.get_dws_flex_start())
     else:
-      gpu_pool.settings.update({"static_node_count": num_nodes})
+      gpu_pool.update_settings({"static_node_count": num_nodes})
 
     workload_manager_install_id = "workload-manager-install"
     workload_manager_install = DeploymentModule(
@@ -855,9 +855,9 @@ class BlueprintGenerator:
         outputs=["instructions"],
     )
     if capacity_type == CapacityType.FLEX_START:
-      gpu_pool.settings.update(self.get_dws_flex_start())
+      gpu_pool.update_settings(self.get_dws_flex_start())
     else:
-      gpu_pool.settings.update({"static_node_count": num_nodes})
+      gpu_pool.update_settings({"static_node_count": num_nodes})
 
     workload_manager_install_id = "workload-manager-install"
     workload_manager_install = DeploymentModule(
@@ -956,7 +956,7 @@ class BlueprintGenerator:
     )
 
   def _getblock_terraform_backend(
-      self, gcs_bucket: str, cluster_name: str, prefix: str = ""
+      self, gcs_bucket: str | None, cluster_name: str, prefix: str = ""
   ) -> dict | None:
     if gcs_bucket is None:
       return None
@@ -986,7 +986,7 @@ class BlueprintGenerator:
       yaml_parser.dump(xpk_blueprint, blueprint_file)
     return blueprint_path
 
-  def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
+  def _get_blueprint_path(self, blueprint_name, prefix: str = "") -> str:
     blueprint_path = os.path.join(
         self._get_storage_path(prefix), f"{blueprint_name}.yaml"
     )
xpk/core/capacity.py CHANGED
@@ -152,6 +152,23 @@ def get_reservation_placement_policy(
   return output.strip()
 
 
+def get_reservation_deployment_type(
+    reservation: str, zone: str, project: str
+) -> str:
+  """Get reservation deployment type."""
+  command = (
+      f'gcloud beta compute reservations describe {reservation}'
+      f' --project={project} --zone={zone} --format="value(deploymentType)"'
+  )
+  return_code, output = run_command_for_value(
+      command, 'Get reservation deployment type', dry_run_return_val='DENSE'
+  )
+  if return_code != 0:
+    xpk_print(f'Get reservation deployment type ERROR {return_code}')
+    xpk_exit(1)
+  return output.strip()
+
+
 def verify_reservation_exists(args) -> int:
   """Verify the reservation exists.
 
@@ -0,0 +1,50 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import pytest
+from unittest.mock import MagicMock, patch
+from .capacity import get_reservation_deployment_type
+
+
+@patch('xpk.core.capacity.xpk_print')
+def test_get_reservation_deployment_type_exits_with_command_fails(
+    xpk_print: MagicMock, mocker
+):
+  mocker.patch(
+      target='xpk.core.capacity.run_command_for_value', return_value=(1, '')
+  )
+  with pytest.raises(SystemExit):
+    get_reservation_deployment_type(
+        reservation='reservation', zone='zone', project='project'
+    )
+
+  assert (
+      'Get reservation deployment type ERROR 1'
+      in xpk_print.mock_calls[0].args[0]
+  )
+
+
+def test_get_reservation_deployment_type_returns_deployment_type_when_command_succeeds(
+    mocker,
+):
+  mocker.patch(
+      target='xpk.core.capacity.run_command_for_value',
+      return_value=(0, 'DENSE'),
+  )
+  result = get_reservation_deployment_type(
+      reservation='reservation', zone='zone', project='project'
+  )
+  assert result == 'DENSE'
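
The new capacity.py helper shown above shells out to gcloud and returns the reservation's deploymentType as plain text (the dry-run default is 'DENSE', as the second test confirms), exiting xpk on any non-zero gcloud return code. A hedged sketch of a caller follows; the attribute names on args and the branch are illustrative, since the cluster.py wiring is not included in this excerpt:

    # Hypothetical caller; only get_reservation_deployment_type itself is in this diff.
    deployment_type = get_reservation_deployment_type(
        reservation=args.reservation, zone=args.zone, project=args.project
    )
    if deployment_type == 'DENSE':
      ...  # e.g. pick reservation-specific node-pool settings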
xpk/core/config.py CHANGED
@@ -22,7 +22,7 @@ from ..utils import file
 from ..utils.console import xpk_print
 
 # This is the version for XPK PyPI package
-__version__ = 'v0.14.2'
+__version__ = 'v0.14.3'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
 
@@ -97,7 +97,7 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
   )
 
   gpu_workload_terminate_command = ''
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     gpu_workload_terminate_command = (
         'echo Main app is done > /usr/share/workload/workload_terminated; '
     )
@@ -105,7 +105,7 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
   tpu_stacktrace_terminate_command = ''
   if (
       not args.use_pathways
-      and system.accelerator_type == AcceleratorType['TPU']
+      and system.accelerator_type == AcceleratorType.TPU
       and args.deploy_stacktrace_sidecar
   ):
     tpu_stacktrace_terminate_command = (
@@ -193,7 +193,7 @@ def get_user_workload_container(args, system: SystemCharacteristics):
   ].resource_type
   if (
       not args.use_pathways
-      and system.accelerator_type == AcceleratorType['TPU']
+      and system.accelerator_type == AcceleratorType.TPU
       and args.deploy_stacktrace_sidecar
   ):
     xpk_print(
@@ -219,7 +219,7 @@ def get_main_container_docker_image(args, system: SystemCharacteristics) -> str:
     Workload docker image as a YAML string
   """
 
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     return 'gpu-image'
 
   return f'{args.docker_name}'
@@ -43,10 +43,10 @@ def get_main_container_resources(
     return resources_yaml
 
   gpu_resources_yaml = """nvidia.com/gpu: {system.chips_per_vm}"""
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     return gpu_resources_yaml.format(system=system)
 
-  if system.accelerator_type == AcceleratorType['CPU']:
+  if system.accelerator_type == AcceleratorType.CPU:
     # CPUs don't have chips, but have a subresource called vCPUs.
     # system.chips_per_vm is used as a proxy for vCPUs.
     # Some vCPUs get used in hosting system pods of the workloads,
@@ -67,10 +67,10 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
     str:
       YAML with the env config for the main container, as a YAML string.
   """
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     return get_gpu_env(args, system)
 
-  if system.accelerator_type == AcceleratorType['CPU']:
+  if system.accelerator_type == AcceleratorType.CPU:
     return get_cpu_env(args, system)
 
   return format_env_dict(args.env, system)
@@ -176,7 +176,7 @@ def get_cpu_env(args, system) -> str:
 
 
 def format_env_dict(env, system: SystemCharacteristics) -> str:
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     # For GPUs, it has two more spaces ahead of name and value respectively
     env_format = '''
                 - name: {key}
@@ -265,7 +265,7 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
           driver: {driver}"""
 
   if (
-      system.accelerator_type == AcceleratorType['TPU']
+      system.accelerator_type == AcceleratorType.TPU
       and args.deploy_stacktrace_sidecar
   ):
     volumes += """
@@ -317,7 +317,7 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
                   name: shared-tmp
 """
   elif (
-      system.accelerator_type == AcceleratorType['TPU']
+      system.accelerator_type == AcceleratorType.TPU
       and args.deploy_stacktrace_sidecar
   ):
     volume_mount_yaml += """- name: tpu-stack-trace
@@ -325,7 +325,7 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
               - name: shared-data
                 mountPath: /shared-volume
 """
-  elif system.accelerator_type == AcceleratorType['GPU']:
+  elif system.accelerator_type == AcceleratorType.GPU:
     volume_mount_yaml = ''
 
   storages: list[Storage] = (
@@ -379,7 +379,7 @@ def add_container_ports(args, system: SystemCharacteristics) -> str:
     return ''
 
   gpu_port_yaml = """- containerPort: 6002"""
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     return gpu_port_yaml
   return port_yaml
 
@@ -394,7 +394,7 @@ def add_jax_coordinator_port(system) -> str:
     str:
       jax coordinator port as a YAML string
   """
-  if system.accelerator_type == AcceleratorType['CPU']:
+  if system.accelerator_type == AcceleratorType.CPU:
     return '- containerPort: 1234'
   return ''
 
@@ -411,6 +411,6 @@ def add_image_pull_policy_for_pw_or_gpu(args, system: SystemCharacteristics):
   """
   yaml = """imagePullPolicy: Always"""
 
-  if args.use_pathways or system.accelerator_type == AcceleratorType['GPU']:
+  if args.use_pathways or system.accelerator_type == AcceleratorType.GPU:
     return yaml.format(args=args)
   return ''
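
All of the AcceleratorType['GPU'|'TPU'|'CPU'] → AcceleratorType.GPU/.TPU/.CPU edits in this release (docker_container.py and docker_resources.py above, kjob.py below) are behavior-preserving: assuming AcceleratorType is a standard Python Enum, as its usage suggests, lookup by member name and attribute access return the very same member object.

    # Demonstration with a stand-in enum (not xpk's actual AcceleratorType definition).
    from enum import Enum

    class AcceleratorTypeDemo(Enum):
      TPU = 1
      GPU = 2
      CPU = 3

    assert AcceleratorTypeDemo['GPU'] is AcceleratorTypeDemo.GPU  # name lookup == attribute access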
xpk/core/kjob.py CHANGED
@@ -296,15 +296,13 @@ def create_job_template_instance(
   working_directory = JobTemplateDefaults.WORKING_DIRECTORY.value
   resources = (
       job_resources_template.format(gpu_per_node=system.chips_per_vm)
-      if system is not None
-      and system.accelerator_type == AcceleratorType["GPU"]
+      if system is not None and system.accelerator_type == AcceleratorType.GPU
       else ""
   )
 
   node_selector = (
       job_node_selector_template.format(gpu_name=system.gke_accelerator)
-      if system is not None
-      and system.accelerator_type == AcceleratorType["GPU"]
+      if system is not None and system.accelerator_type == AcceleratorType.GPU
       else ""
   )
   yml_string = job_template_yaml.format(
@@ -319,7 +317,7 @@ def create_job_template_instance(
       priority=args.priority if hasattr(args, "priority") else "medium",
       service_account=service_account,
   )
-  if system is not None and system.accelerator_type == AcceleratorType["GPU"]:
+  if system is not None and system.accelerator_type == AcceleratorType.GPU:
     yml_string = decorate_job_template_with_gpu(yml_string, system.device_type)
 
   return run_kubectl_apply(
xpk/core/kueue_manager.py CHANGED
@@ -40,6 +40,7 @@ from ..core.commands import (
 from ..utils.file import write_tmp_file
 from ..utils.console import xpk_print, xpk_exit
 from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
+from packaging.version import Version
 
 WAIT_FOR_KUEUE_TIMEOUT = "10m"
 CLUSTER_QUEUE_NAME = "cluster-queue"
@@ -51,7 +52,7 @@ KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
 KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2"
 MEMORY_SIZE_PER_VM = 1.2
 MIN_MEMORY_LIMIT_SIZE = 4096
-KUEUE_VERSION = "v0.12.2"
+KUEUE_VERSION = Version("v0.12.2")
 
 
 @dataclass
@@ -78,7 +79,7 @@ class KueueManager:
 
   def __init__(
       self,
-      kueue_version: str = KUEUE_VERSION,
+      kueue_version: Version = KUEUE_VERSION,
      template_path=TEMPLATE_PATH,
   ):
     self.kueue_version = kueue_version
@@ -111,9 +112,9 @@ class KueueManager:
         )
         return 0
       else:
-        xpk_print(f"Upgrading Kueue to version {self.kueue_version}...")
+        xpk_print(f"Upgrading Kueue to version v{self.kueue_version}...")
     else:
-      xpk_print(f"Installing Kueue version {self.kueue_version}...")
+      xpk_print(f"Installing Kueue version v{self.kueue_version}...")
 
     install_return_code = self.__install(tolerations)
     if install_return_code != 0:
@@ -121,7 +122,7 @@ class KueueManager:
       return install_return_code
     return self.__configure(kueue_config)
 
-  def get_installed_kueue_version(self) -> tuple[int, str | None]:
+  def get_installed_kueue_version(self) -> tuple[int, Version | None]:
     command = (
         "kubectl get deployment kueue-controller-manager -n kueue-system -o"
         " jsonpath='{.spec.template.spec.containers[0].image}'"
@@ -130,15 +131,14 @@ class KueueManager:
     return_code, val = run_command_for_value(
         command,
         task,
-        dry_run_return_val="""
-        v0.12.1""",
+        dry_run_return_val="",
     )
     if return_code != 0:
       return return_code, None
     version_tag = val.split(":")
     if len(version_tag) == 1:
       return 1, None
-    return return_code, version_tag[-1]
+    return return_code, Version(version_tag[-1])
 
   def __install(
       self,
@@ -162,7 +162,7 @@ class KueueManager:
     return self.__wait_for_kueue_available()
 
   def __install_kueue_crs(self) -> int:
-    manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/{self.kueue_version}/manifests.yaml"
+    manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/v{self.kueue_version}/manifests.yaml"
     install_command = (
         f"kubectl apply --server-side --force-conflicts -f {manifest_url}"
     )
@@ -199,7 +199,7 @@ class KueueManager:
       0 if successful and 1 otherwise.
     """
     command = (
-        "kubectl wait deploy/kueue-controller-manager -nkueue-system"
+        "kubectl wait deploy/kueue-controller-manager -n kueue-system"
        f" --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}"
     )
     task = "Wait for Kueue to be available"
@@ -421,3 +421,14 @@
     if return_code != 0:
       xpk_print(f"{task} returned ERROR {return_code}")
     return return_code
+
+
+def has_sub_slicing_enabled() -> tuple[int, bool | None]:
+  return_code, value = run_command_for_value(
+      command="kubectl get topology", task="Get defined topologies"
+  )
+
+  if return_code != 0:
+    return return_code, None
+
+  return return_code, SUB_SLICE_TOPOLOGY_NAME in value
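
KUEUE_VERSION is now a packaging.version.Version rather than a raw string, which is also why the log messages and the manifest URL re-add the leading "v" by hand: PEP 440 allows a leading "v" but drops it during normalization, so it disappears when the version is rendered back to a string, while version comparisons (such as the sub-slicing minimum-version check exercised in workload_test.py above) become numeric rather than lexicographic. For example:

    from packaging.version import Version

    v = Version("v0.12.2")
    print(str(v))                  # "0.12.2" -- the leading "v" is normalized away
    print(v == Version("0.12.2"))  # True
    print(Version("0.13.0") >= v)  # True, numeric comparison
    print("0.9.0" < "0.12.2")      # False: plain string comparison gets versions wrong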