xpk 0.14.2__py3-none-any.whl → 0.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. xpk/commands/cluster.py +57 -22
  2. xpk/commands/cluster_gcluster_test.py +2 -2
  3. xpk/commands/cluster_test.py +197 -25
  4. xpk/commands/inspector.py +20 -7
  5. xpk/commands/kind.py +1 -1
  6. xpk/commands/workload.py +42 -4
  7. xpk/commands/workload_test.py +88 -5
  8. xpk/core/blueprint/blueprint_definitions.py +16 -1
  9. xpk/core/blueprint/blueprint_generator.py +11 -11
  10. xpk/core/capacity.py +17 -0
  11. xpk/core/capacity_test.py +50 -0
  12. xpk/core/config.py +1 -1
  13. xpk/core/docker_container.py +4 -4
  14. xpk/core/docker_resources.py +11 -11
  15. xpk/core/kjob.py +3 -5
  16. xpk/core/kueue_manager.py +21 -10
  17. xpk/core/kueue_manager_test.py +379 -536
  18. xpk/core/nap.py +1 -1
  19. xpk/core/nodepool.py +9 -9
  20. xpk/core/nodepool_test.py +4 -4
  21. xpk/core/pathways.py +1 -1
  22. xpk/core/resources.py +1 -1
  23. xpk/core/scheduling.py +7 -13
  24. xpk/core/system_characteristics.py +42 -35
  25. xpk/core/system_characteristics_test.py +3 -3
  26. xpk/core/testing/__init__.py +15 -0
  27. xpk/core/testing/commands_tester.py +131 -0
  28. xpk/core/testing/commands_tester_test.py +129 -0
  29. xpk/core/updates.py +57 -0
  30. xpk/core/updates_test.py +80 -0
  31. xpk/main.py +7 -4
  32. xpk/parser/common.py +8 -0
  33. xpk/utils/execution_context.py +20 -2
  34. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/METADATA +1 -3
  35. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/RECORD +39 -33
  36. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/WHEEL +0 -0
  37. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/entry_points.txt +0 -0
  38. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/licenses/LICENSE +0 -0
  39. {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
17
17
  from tabulate import tabulate
18
18
 
19
19
  from ..utils.feature_flags import FeatureFlags
20
- from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
20
+ from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE, get_reservation_deployment_type
21
21
  from ..core.cluster import (
22
22
  get_all_clusters_programmatic,
23
23
  get_cluster_credentials,
@@ -60,7 +60,7 @@ from ..core.nodepool import (
60
60
  )
61
61
  from ..core.ray import install_ray_cluster
62
62
  from ..core.mtc import install_mtc_on_cluster
63
- from ..core.resources import create_cluster_configmaps
63
+ from ..core.resources import AutoprovisioningConfig, create_cluster_configmaps
64
64
  from ..core.scheduling import get_total_chips_requested_from_args
65
65
  from ..core.storage import install_storage_crd
66
66
  from ..core.system_characteristics import (
@@ -110,7 +110,7 @@ def cluster_adapt(args) -> None:
110
110
  )
111
111
  add_zone_and_project(args)
112
112
 
113
- if system.accelerator_type == AcceleratorType['GPU'] and not getattr(
113
+ if system.accelerator_type == AcceleratorType.GPU and not getattr(
114
114
  args, 'num_nodes'
115
115
  ):
116
116
  xpk_print(
@@ -180,10 +180,12 @@ def cluster_adapt(args) -> None:
180
180
  # if set_pathways_job_on_cluster_code != 0:
181
181
  # xpk_exit(set_pathways_job_on_cluster_code)
182
182
 
183
- install_kueue(args, system, autoprovisioning_config)
183
+ install_kueue_code = _install_kueue(args, system, autoprovisioning_config)
184
+ if install_kueue_code != 0:
185
+ xpk_exit(install_kueue_code)
184
186
 
185
187
  install_kjob(args)
186
- if system.accelerator_type == AcceleratorType['GPU']:
188
+ if system.accelerator_type == AcceleratorType.GPU:
187
189
  prepare_gpus(system)
188
190
 
189
191
  if args.enable_ray_cluster:
@@ -204,6 +206,38 @@ def cluster_adapt(args) -> None:
204
206
  def _validate_cluster_create_args(args, system: SystemCharacteristics):
205
207
  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
206
208
  validate_sub_slicing_system(system)
209
+ _validate_sub_slicing_reservation(args)
210
+
211
+
212
+ def _validate_sub_slicing_reservation(args):
213
+ if args.reservation is None:
214
+ xpk_print(
215
+ 'Error: Validation failed: Sub-slicing cluster creation requires'
216
+ ' Cluster Director reservation to be specified.'
217
+ )
218
+ xpk_exit(1)
219
+
220
+ deployment_type = get_reservation_deployment_type(
221
+ reservation=args.reservation, project=args.project, zone=args.zone
222
+ )
223
+ if deployment_type != 'DENSE':
224
+ xpk_print(
225
+ 'Error: Validation failed: The specified reservation'
226
+ f' "{args.reservation}" is not a Cluster Director reservation.'
227
+ )
228
+ xpk_print(
229
+ 'Please provide a reservation created for Cluster Director to proceed.'
230
+ )
231
+ xpk_print('To list valid Cluster Director reservations, run:')
232
+ xpk_print(
233
+ ' gcloud compute reservations list --filter="deploymentType=DENSE"'
234
+ )
235
+ xpk_print(
236
+ 'Refer to the documentation for more information on creating Cluster'
237
+ ' Director reservations:'
238
+ ' https://cloud.google.com/cluster-director/docs/reserve-capacity'
239
+ )
240
+ xpk_exit(1)
207
241
 
208
242
 
209
243
  def cluster_create(args) -> None:
@@ -346,11 +380,13 @@ def cluster_create(args) -> None:
346
380
  if set_pathways_job_on_cluster_code != 0:
347
381
  xpk_exit(set_pathways_job_on_cluster_code)
348
382
 
349
- install_kueue(args, system, autoprovisioning_config)
383
+ install_kueue_code = _install_kueue(args, system, autoprovisioning_config)
384
+ if install_kueue_code != 0:
385
+ xpk_exit(install_kueue_code)
350
386
 
351
387
  install_kjob(args)
352
388
 
353
- if system.accelerator_type == AcceleratorType['GPU']:
389
+ if system.accelerator_type == AcceleratorType.GPU:
354
390
  prepare_gpus(system)
355
391
 
356
392
  if args.enable_ray_cluster:
@@ -1106,12 +1142,6 @@ def run_gke_cluster_create_command(
1106
1142
  # benefit from a larger initial `--num-nodes`. After the cluster is created,
1107
1143
  # the auto-scaler can reduce/increase the nodes based on the load.
1108
1144
 
1109
- # If the user passes in the gke version then we use that directly instead of the rapid release.
1110
- # This allows users to directly pass a specified gke version without release channel constraints.
1111
- rapid_release_cmd = ''
1112
- if args.gke_version is not None:
1113
- rapid_release_cmd = ' --release-channel rapid'
1114
-
1115
1145
  command = (
1116
1146
  'gcloud beta container clusters create'
1117
1147
  f' {args.cluster} --project={args.project}'
@@ -1122,25 +1152,23 @@ def run_gke_cluster_create_command(
1122
1152
  ' --enable-autoscaling'
1123
1153
  ' --total-min-nodes 1 --total-max-nodes 1000'
1124
1154
  f' --num-nodes {args.default_pool_cpu_num_nodes}'
1125
- f' {args.custom_cluster_arguments}'
1126
- f' {rapid_release_cmd}'
1127
1155
  ' --enable-dns-access'
1128
1156
  ' --autoscaling-profile=optimize-utilization'
1129
1157
  ' --labels=gke_product_type=xpk'
1130
1158
  )
1131
1159
 
1160
+ if args.gke_version or system.accelerator_type == AcceleratorType.GPU:
1161
+ command += ' --no-enable-autoupgrade'
1162
+
1132
1163
  enable_ip_alias = False
1133
1164
 
1134
1165
  if args.private or args.authorized_networks is not None:
1135
1166
  enable_ip_alias = True
1136
1167
  command += ' --enable-master-authorized-networks --enable-private-nodes'
1137
1168
 
1138
- if system.accelerator_type == AcceleratorType['GPU']:
1169
+ if system.accelerator_type == AcceleratorType.GPU:
1139
1170
  enable_ip_alias = True
1140
- command += (
1141
- ' --enable-dataplane-v2'
1142
- ' --enable-multi-networking --no-enable-autoupgrade'
1143
- )
1171
+ command += ' --enable-dataplane-v2 --enable-multi-networking'
1144
1172
  else:
1145
1173
  command += ' --location-policy=BALANCED --scopes=storage-full,gke-default'
1146
1174
 
@@ -1180,6 +1208,9 @@ def run_gke_cluster_create_command(
1180
1208
  addons_str = ','.join(addons)
1181
1209
  command += f' --addons={addons_str}'
1182
1210
 
1211
+ if args.custom_cluster_arguments:
1212
+ command += f' {args.custom_cluster_arguments}'
1213
+
1183
1214
  return_code = run_command_with_updates(command, 'GKE Cluster Create')
1184
1215
  if return_code != 0:
1185
1216
  xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
@@ -1240,7 +1271,11 @@ def install_kjob(args):
1240
1271
  xpk_exit(err_code)
1241
1272
 
1242
1273
 
1243
- def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
1274
+ def _install_kueue(
1275
+ args,
1276
+ system: SystemCharacteristics,
1277
+ autoprovisioning_config: AutoprovisioningConfig | None,
1278
+ ) -> int:
1244
1279
  xpk_print('Enabling Kueue on the cluster')
1245
1280
  autoprovisioning_enabled = False
1246
1281
  if autoprovisioning_config:
@@ -1251,7 +1286,7 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
1251
1286
  # Determine total chips based on user specified topology.
1252
1287
  total_chips = get_total_chips_requested_from_args(args, system)
1253
1288
  kueue_manager = KueueManager()
1254
- kueue_manager.install_or_upgrade(
1289
+ return kueue_manager.install_or_upgrade(
1255
1290
  KueueConfig(
1256
1291
  system,
1257
1292
  total_chips=total_chips,
@@ -93,7 +93,7 @@ def test_install_kueue_standard(
93
93
  gke_accelerator="nvidia-h100-mega-80gb",
94
94
  gce_machine_type="a3-megagpu-8g",
95
95
  chips_per_vm=8,
96
- accelerator_type=AcceleratorType["GPU"],
96
+ accelerator_type=AcceleratorType.GPU,
97
97
  device_type="h100-mega-80gb-8",
98
98
  supports_sub_slicing=False,
99
99
  )
@@ -140,7 +140,7 @@ def test_install_kueue_with_autoprovisioning(
140
140
  gke_accelerator="nvidia-h100-mega-80gb",
141
141
  gce_machine_type="a3-megagpu-8g",
142
142
  chips_per_vm=8,
143
- accelerator_type=AcceleratorType["GPU"],
143
+ accelerator_type=AcceleratorType.GPU,
144
144
  device_type="h100-mega-80gb-8",
145
145
  supports_sub_slicing=False,
146
146
  )
@@ -16,77 +16,249 @@ limitations under the License.
16
16
 
17
17
  from argparse import Namespace
18
18
  from dataclasses import dataclass
19
- from unittest.mock import MagicMock
19
+ from typing import Any
20
+ from unittest.mock import MagicMock, patch
20
21
  import pytest
21
22
 
22
- from xpk.commands.cluster import _validate_cluster_create_args
23
+ from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command
23
24
  from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
25
+ from xpk.core.testing.commands_tester import CommandsTester
24
26
  from xpk.utils.feature_flags import FeatureFlags
25
27
 
26
28
 
27
29
  @dataclass
28
30
  class _Mocks:
29
31
  common_print_mock: MagicMock
30
- common_exit_mock: MagicMock
32
+ commands_print_mock: MagicMock
33
+ commands_get_reservation_deployment_type: MagicMock
34
+ commands_tester: CommandsTester
31
35
 
32
36
 
33
37
  @pytest.fixture
34
- def mock_common_print_and_exit(mocker):
38
+ def mocks(mocker) -> _Mocks:
35
39
  common_print_mock = mocker.patch(
36
40
  'xpk.commands.common.xpk_print',
37
41
  return_value=None,
38
42
  )
39
- common_exit_mock = mocker.patch(
40
- 'xpk.commands.common.xpk_exit',
41
- return_value=None,
43
+ commands_print_mock = mocker.patch(
44
+ 'xpk.commands.cluster.xpk_print', return_value=None
45
+ )
46
+ commands_get_reservation_deployment_type = mocker.patch(
47
+ 'xpk.commands.cluster.get_reservation_deployment_type',
48
+ return_value='DENSE',
42
49
  )
43
50
  return _Mocks(
44
- common_print_mock=common_print_mock, common_exit_mock=common_exit_mock
51
+ common_print_mock=common_print_mock,
52
+ commands_get_reservation_deployment_type=commands_get_reservation_deployment_type,
53
+ commands_print_mock=commands_print_mock,
54
+ commands_tester=CommandsTester(
55
+ mocker,
56
+ run_command_with_updates_path=(
57
+ 'xpk.commands.cluster.run_command_with_updates'
58
+ ),
59
+ ),
45
60
  )
46
61
 
47
62
 
48
- DEFAULT_TEST_SYSTEM: SystemCharacteristics = (
49
- UserFacingNameToSystemCharacteristics['l4-1']
50
- )
63
+ def construct_args(**kwargs: Any) -> Namespace:
64
+ args_dict = dict(
65
+ project='project',
66
+ zone='us-central1-a',
67
+ reservation='',
68
+ default_pool_cpu_machine_type='test-machine-type',
69
+ cluster='test-cluster',
70
+ default_pool_cpu_num_nodes='100',
71
+ sub_slicing=False,
72
+ gke_version='',
73
+ private=False,
74
+ authorized_networks=None,
75
+ enable_pathways=False,
76
+ enable_ray_cluster=False,
77
+ enable_workload_identity=False,
78
+ enable_gcsfuse_csi_driver=False,
79
+ enable_gcpfilestore_csi_driver=False,
80
+ enable_parallelstore_csi_driver=False,
81
+ enable_pd_csi_driver=False,
82
+ enable_lustre_csi_driver=False,
83
+ custom_cluster_arguments='',
84
+ num_slices=1,
85
+ num_nodes=1,
86
+ flex=False,
87
+ memory_limit='100Gi',
88
+ cpu_limit=100,
89
+ cluster_cpu_machine_type='',
90
+ )
91
+ args_dict.update(kwargs)
92
+ return Namespace(**args_dict)
93
+
94
+
95
+ GPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
96
+ 'l4-1'
97
+ ]
51
98
  SUB_SLICING_SYSTEM: SystemCharacteristics = (
52
99
  UserFacingNameToSystemCharacteristics['v6e-4x4']
53
100
  )
101
+ TPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
102
+ 'v6e-4x4'
103
+ ]
54
104
 
55
105
 
56
106
  def test_validate_cluster_create_args_for_correct_args_pass(
57
- mock_common_print_and_exit: _Mocks,
107
+ mocks: _Mocks,
58
108
  ):
59
109
  args = Namespace()
60
110
 
61
- _validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM)
111
+ _validate_cluster_create_args(args, GPU_TEST_SYSTEM)
62
112
 
63
- assert mock_common_print_and_exit.common_print_mock.call_count == 0
64
- assert mock_common_print_and_exit.common_exit_mock.call_count == 0
113
+ assert mocks.common_print_mock.call_count == 0
65
114
 
66
115
 
67
116
  def test_validate_cluster_create_args_for_correct_sub_slicing_args_pass(
68
- mock_common_print_and_exit: _Mocks,
117
+ mocks: _Mocks,
69
118
  ):
70
119
  FeatureFlags.SUB_SLICING_ENABLED = True
71
- args = Namespace(sub_slicing=True)
120
+ args = construct_args(
121
+ sub_slicing=True,
122
+ reservation='test-reservation',
123
+ )
72
124
 
73
125
  _validate_cluster_create_args(args, SUB_SLICING_SYSTEM)
74
126
 
75
- assert mock_common_print_and_exit.common_print_mock.call_count == 0
76
- assert mock_common_print_and_exit.common_exit_mock.call_count == 0
127
+ assert mocks.common_print_mock.call_count == 0
77
128
 
78
129
 
79
130
  def test_validate_cluster_create_args_for_not_supported_system_throws(
80
- mock_common_print_and_exit: _Mocks,
131
+ mocks: _Mocks,
81
132
  ):
82
133
  FeatureFlags.SUB_SLICING_ENABLED = True
83
- args = Namespace(sub_slicing=True)
134
+ args = construct_args(
135
+ sub_slicing=True,
136
+ reservation='test-reservation',
137
+ )
84
138
 
85
- _validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM)
139
+ with pytest.raises(SystemExit):
140
+ _validate_cluster_create_args(args, GPU_TEST_SYSTEM)
86
141
 
87
- assert mock_common_print_and_exit.common_print_mock.call_count == 1
142
+ assert mocks.common_print_mock.call_count == 1
88
143
  assert (
89
- mock_common_print_and_exit.common_print_mock.call_args[0][0]
144
+ mocks.common_print_mock.call_args[0][0]
90
145
  == 'Error: l4-1 does not support Sub-slicing.'
91
146
  )
92
- assert mock_common_print_and_exit.common_exit_mock.call_count == 1
147
+
148
+
149
+ def test_validate_cluster_create_args_for_missing_reservation(
150
+ mocks: _Mocks,
151
+ ):
152
+ FeatureFlags.SUB_SLICING_ENABLED = True
153
+ args = construct_args(
154
+ sub_slicing=True,
155
+ reservation=None,
156
+ )
157
+
158
+ with pytest.raises(SystemExit):
159
+ _validate_cluster_create_args(args, SUB_SLICING_SYSTEM)
160
+
161
+ assert mocks.commands_print_mock.call_count == 1
162
+ assert (
163
+ 'Validation failed: Sub-slicing cluster creation requires'
164
+ in mocks.commands_print_mock.call_args[0][0]
165
+ )
166
+
167
+
168
+ def test_validate_cluster_create_args_for_invalid_reservation(
169
+ mocks: _Mocks,
170
+ ):
171
+ FeatureFlags.SUB_SLICING_ENABLED = True
172
+ args = construct_args(
173
+ sub_slicing=True,
174
+ reservation='test-reservation',
175
+ )
176
+ mocks.commands_get_reservation_deployment_type.return_value = 'SPARSE'
177
+
178
+ with pytest.raises(SystemExit):
179
+ _validate_cluster_create_args(args, SUB_SLICING_SYSTEM)
180
+
181
+ assert mocks.commands_print_mock.call_count == 5
182
+ assert (
183
+ 'Refer to the documentation for more information on creating Cluster'
184
+ in mocks.commands_print_mock.call_args[0][0]
185
+ )
186
+
187
+
188
+ @patch('xpk.commands.cluster.KueueManager.install_or_upgrade')
189
+ def test_install_kueue_returns_kueue_installation_code(
190
+ mock_kueue_manager_install: MagicMock,
191
+ ):
192
+ mock_kueue_manager_install.return_value = 17
193
+
194
+ code = _install_kueue(
195
+ args=construct_args(),
196
+ system=GPU_TEST_SYSTEM,
197
+ autoprovisioning_config=None,
198
+ )
199
+
200
+ assert code == 17
201
+
202
+
203
+ def test_run_gke_cluster_create_command_specifies_custom_cluster_arguments_last(
204
+ mocks: _Mocks,
205
+ ):
206
+ result = run_gke_cluster_create_command(
207
+ args=construct_args(
208
+ custom_cluster_arguments='--enable-autoscaling=False --foo=baz'
209
+ ),
210
+ gke_control_plane_version='1.2.3',
211
+ system=TPU_TEST_SYSTEM,
212
+ )
213
+
214
+ assert result == 0
215
+ mocks.commands_tester.assert_command_run(
216
+ 'clusters create',
217
+ ' --enable-autoscaling',
218
+ ' --enable-autoscaling=False --foo=baz',
219
+ )
220
+
221
+
222
+ def test_run_gke_cluster_create_command_without_gke_version_does_not_have_no_autoupgrade_flag(
223
+ mocks: _Mocks,
224
+ ):
225
+ result = run_gke_cluster_create_command(
226
+ args=construct_args(gke_version=''),
227
+ gke_control_plane_version='1.2.3',
228
+ system=TPU_TEST_SYSTEM,
229
+ )
230
+
231
+ assert result == 0
232
+ mocks.commands_tester.assert_command_not_run(
233
+ 'clusters create', ' --no-enable-autoupgrade'
234
+ )
235
+
236
+
237
+ def test_run_gke_cluster_create_command_with_gke_version_has_no_autoupgrade_flag(
238
+ mocks: _Mocks,
239
+ ):
240
+ result = run_gke_cluster_create_command(
241
+ args=construct_args(gke_version='1.2.3'),
242
+ gke_control_plane_version='1.2.3',
243
+ system=TPU_TEST_SYSTEM,
244
+ )
245
+
246
+ assert result == 0
247
+ mocks.commands_tester.assert_command_run(
248
+ 'clusters create', ' --no-enable-autoupgrade'
249
+ )
250
+
251
+
252
+ def test_run_gke_cluster_create_command_with_gpu_system_has_no_enable_autoupgrade(
253
+ mocks: _Mocks,
254
+ ):
255
+ result = run_gke_cluster_create_command(
256
+ args=construct_args(gke_version=''),
257
+ gke_control_plane_version='1.2.3',
258
+ system=GPU_TEST_SYSTEM,
259
+ )
260
+
261
+ assert result == 0
262
+ mocks.commands_tester.assert_command_run(
263
+ 'clusters create', ' --no-enable-autoupgrade'
264
+ )
xpk/commands/inspector.py CHANGED
@@ -23,6 +23,10 @@ from ..utils.console import xpk_exit, xpk_print
23
23
  from ..utils.file import append_tmp_file, write_tmp_file
24
24
  from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
25
25
  from .workload import get_workload_list
26
+ from ..core.kueue_manager import has_sub_slicing_enabled
27
+
28
+
29
+ _SPACER = '========================================================'
26
30
 
27
31
 
28
32
  def inspector_run_command_helper(
@@ -40,7 +44,6 @@ def inspector_run_command_helper(
40
44
  0 if successful and 1 otherwise.
41
45
  """
42
46
  prefix = f'Command: {command}\nCommand Description: {command_description}\n'
43
- postfix = '========================================================'
44
47
  return_code, command_output = run_command_for_value(
45
48
  command, f'{command_description}'
46
49
  )
@@ -51,7 +54,7 @@ def inspector_run_command_helper(
51
54
  )
52
55
  return 1
53
56
 
54
- inspector_command_output = f'{prefix} \n{command_output} \n{postfix} \n'
57
+ inspector_command_output = f'{prefix} \n{command_output} \n{_SPACER} \n'
55
58
  append_tmp_file(inspector_command_output, file)
56
59
 
57
60
  if args.print_to_terminal:
@@ -71,17 +74,27 @@ def inspector_run_workload_list_helper(args, command_description, file) -> int:
71
74
  0 if successful and 1 otherwise.
72
75
  """
73
76
  prefix = f'Command Description: {command_description}\n'
74
- postfix = '========================================================'
75
77
  return_code, command_output = get_workload_list(args)
76
78
  if return_code != 0:
77
79
  xpk_exit(return_code)
78
- inspector_command_output = f'{prefix} \n{command_output} \n{postfix} \n'
80
+ inspector_command_output = f'{prefix} \n{command_output} \n{_SPACER} \n'
79
81
  append_tmp_file(inspector_command_output, file)
80
82
  if args.print_to_terminal:
81
83
  xpk_print(inspector_command_output)
82
84
  return 0
83
85
 
84
86
 
87
+ def inspector_run_sub_slicing_helper(args, file: str):
88
+ return_code, result = has_sub_slicing_enabled()
89
+ if return_code != 0:
90
+ xpk_exit(return_code)
91
+ if result:
92
+ output = f'Sub-slicing topology set up.\n{_SPACER}'
93
+ append_tmp_file(output, file)
94
+ if args.print_to_terminal:
95
+ xpk_print(output)
96
+
97
+
85
98
  def inspector_output_link_helper(args, link, link_description, file) -> int:
86
99
  """Outputs a link for xpk inspector to the output file.
87
100
 
@@ -95,9 +108,7 @@ def inspector_output_link_helper(args, link, link_description, file) -> int:
95
108
  0 if successful and 1 otherwise.
96
109
  """
97
110
  inspector_link = (
98
- f'Link Description: {link_description}\n'
99
- f'Link: {link}\n'
100
- '========================================================'
111
+ f'Link Description: {link_description}\nLink: {link}\n{_SPACER}\n'
101
112
  )
102
113
  append_tmp_file(inspector_link, file)
103
114
  if args.print_to_terminal:
@@ -308,6 +319,8 @@ def inspector(args) -> None:
308
319
  f' {command_description} return code: {return_code}'
309
320
  )
310
321
 
322
+ inspector_run_sub_slicing_helper(args, inspector_file)
323
+
311
324
  # Cloud Console Links:
312
325
  workload_links = []
313
326
  if args.workload:
xpk/commands/kind.py CHANGED
@@ -94,7 +94,7 @@ def cluster_create(args) -> None:
94
94
  'N/A',
95
95
  'N/A',
96
96
  1,
97
- AcceleratorType['CPU'],
97
+ AcceleratorType.CPU,
98
98
  'kind',
99
99
  supports_sub_slicing=False,
100
100
  )
xpk/commands/workload.py CHANGED
@@ -27,6 +27,7 @@ from ..core.cluster import (
27
27
  setup_k8s_env,
28
28
  )
29
29
  from ..core.commands import run_command_with_updates, run_commands
30
+ from ..core.kueue_manager import KueueManager, has_sub_slicing_enabled
30
31
  from ..core.config import (VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION)
31
32
  from ..core.docker_container import (
32
33
  get_main_container_docker_image,
@@ -95,6 +96,7 @@ from ..core.workload_decorators import (
95
96
  tcpxo_decorator,
96
97
  )
97
98
  from ..utils.console import get_user_input, xpk_exit, xpk_print
99
+ from packaging.version import Version
98
100
  from ..utils.file import write_tmp_file
99
101
  from ..utils.execution_context import is_dry_run
100
102
  from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
@@ -283,6 +285,7 @@ PW_WORKLOAD_CREATE_YAML = """
283
285
  """
284
286
 
285
287
  SUB_SLICING_TOPOLOGIES = ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
288
+ SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
286
289
 
287
290
 
288
291
  def workload_create_pathways(args) -> None:
@@ -340,6 +343,7 @@ def workload_create(args) -> None:
340
343
  xpk_exit(return_code)
341
344
 
342
345
  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None:
346
+ _validate_sub_slicing_availability()
343
347
  _validate_sub_slicing_topology(system, args.sub_slicing_topology)
344
348
 
345
349
  if not check_if_workload_can_schedule(args, system):
@@ -483,7 +487,7 @@ def workload_create(args) -> None:
483
487
  values: [{restart_on_exit_codes}]"""
484
488
 
485
489
  # Create the workload file based on accelerator type or workload type.
486
- if system.accelerator_type == AcceleratorType['GPU']:
490
+ if system.accelerator_type == AcceleratorType.GPU:
487
491
  container, debugging_dashboard_id = get_user_workload_container(
488
492
  args, system
489
493
  )
@@ -566,7 +570,7 @@ def workload_create(args) -> None:
566
570
  container=container,
567
571
  vms_per_slice=(
568
572
  compute_vms_per_slice(args.sub_slicing_topology)
569
- if system.accelerator_type == AcceleratorType['TPU']
573
+ if system.accelerator_type == AcceleratorType.TPU
570
574
  and FeatureFlags.SUB_SLICING_ENABLED
571
575
  and args.sub_slicing_topology is not None
572
576
  else system.vms_per_slice
@@ -594,7 +598,7 @@ def workload_create(args) -> None:
594
598
  tpu_toleration="""
595
599
  - operator: "Exists"
596
600
  key: google.com/tpu
597
- """ if system.accelerator_type == AcceleratorType['TPU'] else '',
601
+ """ if system.accelerator_type == AcceleratorType.TPU else '',
598
602
  failure_policy_rules=failure_policy_rules,
599
603
  pod_failure_policy=pod_failure_policy,
600
604
  )
@@ -611,7 +615,7 @@ def workload_create(args) -> None:
611
615
 
612
616
  # Get GKE outlier dashboard for TPU
613
617
  outlier_dashboard_id = None
614
- if system.accelerator_type == AcceleratorType['TPU']:
618
+ if system.accelerator_type == AcceleratorType.TPU:
615
619
  outlier_dashboard_id = get_gke_outlier_dashboard(args)
616
620
 
617
621
  # Outlier and debugging dashboards
@@ -678,6 +682,40 @@ def workload_create(args) -> None:
678
682
  xpk_exit(0)
679
683
 
680
684
 
685
+ def _validate_sub_slicing_availability():
686
+ return_code, sub_slicing_enabled = has_sub_slicing_enabled()
687
+ if return_code != 0:
688
+ xpk_print(
689
+ 'Error: Unable to validate sub-slicing support on a given cluster.'
690
+ )
691
+ xpk_exit(1)
692
+
693
+ if not sub_slicing_enabled:
694
+ xpk_print(
695
+ 'Error: Cluster has not been set up for Sub-slicing. Please enable'
696
+ ' --sub-slicing in "cluster create" command first.'
697
+ )
698
+ xpk_exit(1)
699
+
700
+ kueue_manager = KueueManager()
701
+ return_code, current_version = kueue_manager.get_installed_kueue_version()
702
+ if return_code != 0:
703
+ xpk_print(
704
+ 'Error: Unable to validate sub-slicing support on a given cluster.'
705
+ )
706
+ xpk_exit(1)
707
+
708
+ if current_version < SUB_SLICING_MINIMUM_KUEUE_VERSION:
709
+ xpk_print(
710
+ f"Error: Current Kueue version ({current_version}) doesn't support"
711
+ ' Sub-slicing. The minimal required version is'
712
+ f' v{SUB_SLICING_MINIMUM_KUEUE_VERSION}. Please either update Kueue'
713
+ ' manually, or run "cluster create --sub-slicing" on the existing'
714
+ ' cluster.'
715
+ )
716
+ xpk_exit(1)
717
+
718
+
681
719
  def _validate_sub_slicing_topology(
682
720
  system_characteristics: SystemCharacteristics, sub_slicing_topology: str
683
721
  ) -> None: