xpk 0.17.3__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. xpk/commands/cluster.py +33 -43
  2. xpk/commands/cluster_gcluster.py +19 -14
  3. xpk/commands/cluster_gcluster_test.py +2 -0
  4. xpk/commands/cluster_test.py +1 -21
  5. xpk/commands/common.py +39 -6
  6. xpk/commands/common_test.py +170 -0
  7. xpk/commands/info.py +9 -5
  8. xpk/commands/inspector.py +33 -4
  9. xpk/commands/inspector_test.py +142 -0
  10. xpk/commands/workload.py +32 -11
  11. xpk/commands/workload_test.py +71 -3
  12. xpk/core/blueprint/blueprint_generator.py +19 -8
  13. xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
  14. xpk/core/blueprint/testing/data/a4.yaml +3 -1
  15. xpk/core/capacity.py +37 -17
  16. xpk/core/capacity_test.py +66 -1
  17. xpk/core/cluster.py +11 -10
  18. xpk/core/cluster_private.py +3 -3
  19. xpk/core/cluster_test.py +29 -2
  20. xpk/core/config.py +5 -2
  21. xpk/core/docker_container.py +31 -24
  22. xpk/core/docker_manager.py +4 -4
  23. xpk/core/docker_resources.py +4 -1
  24. xpk/core/kueue_manager.py +6 -8
  25. xpk/core/kueue_manager_test.py +6 -5
  26. xpk/core/nap.py +14 -3
  27. xpk/core/nodepool.py +52 -13
  28. xpk/core/nodepool_test.py +147 -8
  29. xpk/core/remote_state/fuse_remote_state.py +1 -1
  30. xpk/core/scheduling.py +32 -4
  31. xpk/core/scheduling_test.py +39 -2
  32. xpk/core/system_characteristics.py +44 -0
  33. xpk/core/system_characteristics_test.py +11 -0
  34. xpk/core/telemetry.py +11 -1
  35. xpk/core/telemetry_test.py +39 -0
  36. xpk/core/testing/commands_tester.py +26 -0
  37. xpk/core/testing/commands_tester_test.py +20 -1
  38. xpk/core/workload_decorators/rdma_decorator.py +9 -0
  39. xpk/parser/cluster.py +11 -1
  40. xpk/parser/cluster_test.py +59 -1
  41. xpk/parser/common.py +11 -17
  42. xpk/parser/core.py +0 -8
  43. xpk/parser/storage.py +3 -14
  44. xpk/utils/console.py +1 -1
  45. xpk/utils/feature_flags.py +8 -4
  46. {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/METADATA +50 -23
  47. {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/RECORD +51 -60
  48. xpk-1.1.0.dist-info/top_level.txt +1 -0
  49. integration/README.md +0 -19
  50. integration/__init__.py +0 -15
  51. integration/docker_manager_test.py +0 -102
  52. integration/gcluster_a3mega_test.py +0 -215
  53. integration/gcluster_a3ultra_test.py +0 -187
  54. integration/gcluster_a4_test.py +0 -187
  55. integration/gcluster_test.py +0 -107
  56. xpk/commands/kind.py +0 -265
  57. xpk/parser/kind.py +0 -95
  58. xpk/utils/user_input.py +0 -48
  59. xpk/utils/user_input_test.py +0 -92
  60. xpk-0.17.3.dist-info/top_level.txt +0 -2
  61. {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/WHEEL +0 -0
  62. {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/entry_points.txt +0 -0
  63. {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/licenses/LICENSE +0 -0
xpk/commands/cluster.py CHANGED
@@ -19,7 +19,6 @@ from tabulate import tabulate
  from ..utils.feature_flags import FeatureFlags
  from ..utils.versions import ReleaseChannel
  from ..core.pathways import get_pathways_machine_types
- from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type, parse_reservation
  from ..core.cluster import (
  get_all_clusters_programmatic,
  get_cluster_credentials,
@@ -40,7 +39,12 @@ from ..core.cluster_private import authorize_private_cluster_access_if_necessary
  from ..core.commands import run_command_for_value, run_command_with_updates
  from ..core.config import VERTEX_TENSORBOARD_FEATURE_FLAG
  from ..core.telemetry import MetricsCollector, MetricsEventMetadataKey
- from ..core.capacity import get_capacity_type
+ from ..core.capacity import (
+ H100_DEVICE_TYPE,
+ get_capacity_type,
+ get_reservations_list,
+ get_reservation_deployment_type,
+ )
  from ..core.gcloud_context import (
  add_zone_and_project,
  get_gke_control_plane_version,
@@ -240,13 +244,6 @@ def _validate_sub_slicing_reservation(args):

  def _validate_super_slicing_reservation(args):
  _validate_gsc_reservation(args, 'Super-slicing')
- reservation = parse_reservation(args.reservation, args.project)
- if reservation.block_name is None:
- xpk_print(
- 'Error: Validation failed: Super-slicing cluster creation'
- ' requires a block or sub-block reservation.'
- )
- xpk_exit(1)


  def _validate_gsc_reservation(args, creation_description: str):
@@ -257,27 +254,29 @@ def _validate_gsc_reservation(args, creation_description: str):
  )
  xpk_exit(1)

- deployment_type = get_reservation_deployment_type(
- reservation_path=args.reservation, project=args.project, zone=args.zone
- )
- if deployment_type != 'DENSE':
- xpk_print(
- 'Error: Validation failed: The specified reservation'
- f' "{args.reservation}" is not a Cluster Director reservation.'
- )
- xpk_print(
- 'Please provide a reservation created for Cluster Director to proceed.'
- )
- xpk_print('To list valid Cluster Director reservations, run:')
- xpk_print(
- ' gcloud compute reservations list --filter="deploymentType=DENSE"'
+ for reservation in get_reservations_list(args):
+ deployment_type = get_reservation_deployment_type(
+ reservation_path=reservation, project=args.project, zone=args.zone
  )
- xpk_print(
- 'Refer to the documentation for more information on creating Cluster'
- ' Director reservations:'
- ' https://cloud.google.com/cluster-director/docs/reserve-capacity'
- )
- xpk_exit(1)
+ if deployment_type != 'DENSE':
+ xpk_print(
+ 'Error: Validation failed: The specified reservation'
+ f' "{reservation}" is not a Cluster Director reservation.'
+ )
+ xpk_print(
+ 'Please provide a reservation created for Cluster Director to'
+ ' proceed.'
+ )
+ xpk_print('To list valid Cluster Director reservations, run:')
+ xpk_print(
+ ' gcloud compute reservations list --filter="deploymentType=DENSE"'
+ )
+ xpk_print(
+ 'Refer to the documentation for more information on creating Cluster'
+ ' Director reservations:'
+ ' https://cloud.google.com/cluster-director/docs/reserve-capacity'
+ )
+ xpk_exit(1)


  def _validate_num_slices_and_set_default(args):
@@ -372,7 +371,7 @@ def cluster_create(args) -> None:

  update_coredns_command_code = update_coredns_if_necessary()
  if update_coredns_command_code != 0:
- xpk_exit(update_cluster_command_code)
+ xpk_exit(update_coredns_command_code)

  if not is_dry_run():
  k8s_client = setup_k8s_env(args)
@@ -1233,29 +1232,20 @@ def run_gke_cluster_create_command(
  ' --autoscaling-profile=optimize-utilization'
  ' --labels=gke_product_type=xpk'
  f' --release-channel={release_channel.value.lower()}'
+ ' --enable-ip-alias'
+ ' --enable-dataplane-v2'
+ ' --enable-multi-networking'
  )

  if args.gke_version:
  command += ' --no-enable-autoupgrade'

- enable_ip_alias = False
-
  if args.private or args.authorized_networks is not None:
- enable_ip_alias = True
  command += ' --enable-master-authorized-networks --enable-private-nodes'

- if system.accelerator_type == AcceleratorType.GPU:
- enable_ip_alias = True
- command += ' --enable-dataplane-v2 --enable-multi-networking'
- else:
+ if system.accelerator_type != AcceleratorType.GPU:
  command += ' --location-policy=BALANCED --scopes=storage-full,gke-default'

- if args.enable_pathways:
- enable_ip_alias = True
-
- if enable_ip_alias:
- command += ' --enable-ip-alias'
-
  if args.enable_ray_cluster:
  command += ' --addons RayOperator'

xpk/commands/cluster_gcluster.py CHANGED
@@ -32,7 +32,7 @@ from ..core.blueprint.blueprint_generator import (
  a4_device_type,
  supported_device_types,
  )
- from ..core.capacity import get_capacity_type
+ from ..core.capacity import get_capacity_type, get_reservations_list
  from ..core.cluster import get_cluster_credentials
  from ..core.commands import run_command_for_value
  from ..core.docker_manager import DockerManager
@@ -304,22 +304,29 @@ def generate_blueprint(
  if args.cluster_state_gcs_bucket is not None:
  validate_state_gcs_bucket(args)

+ num_nodes = 2 if args.num_nodes is None else args.num_nodes
+
+ reservations = get_reservations_list(args)
+ if len(reservations) > 1:
+ xpk_print(
+ 'Error: Cluster Toolkit based clusters only support a single'
+ ' reservation.'
+ )
+ xpk_exit(1)
+ reservation = reservations[0] if len(reservations) > 0 else None
+
  if args.device_type in supported_device_types:
  if args.device_type == a3mega_device_type:
- num_nodes = args.num_nodes if not args.num_nodes is None else 2
-
  maintenance_interval = (
  get_reservation_maintenance_interval(
- args.reservation, args.zone, args.project
+ reservation, args.zone, args.project
  )
- if args.reservation is not None
+ if reservation is not None
  else 'PERIODIC'
  )
  placement_policy_name = (
- get_reservation_placement_policy(
- args.reservation, args.zone, args.project
- )
- if args.reservation is not None
+ get_reservation_placement_policy(reservation, args.zone, args.project)
+ if reservation is not None
  else None
  )
  placement_policy = (
@@ -342,7 +349,7 @@ def generate_blueprint(
  num_nodes=num_nodes,
  reservation_maintenance_interval=maintenance_interval,
  reservation_placement_policy=placement_policy,
- reservation=args.reservation if args.reservation else None,
+ reservation=reservation,
  capacity_type=capacity_type,
  system_node_pool_machine_type=args.default_pool_cpu_machine_type,
  system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
@@ -351,7 +358,6 @@ def generate_blueprint(
  release_channel=release_channel,
  )
  if args.device_type == a3ultra_device_type:
- num_nodes = args.num_nodes if not args.num_nodes is None else 2
  return bpg.generate_a3_ultra_blueprint(
  blueprint_name=blueprint_name,
  prefix=prefix,
@@ -361,7 +367,7 @@ def generate_blueprint(
  zone=args.zone,
  auth_cidr=all_IPs_cidr,
  num_nodes=num_nodes,
- reservation=args.reservation if args.reservation else None,
+ reservation=reservation,
  enable_filestore_csi_driver=args.enable_gcpfilestore_csi_driver,
  capacity_type=capacity_type,
  system_node_pool_machine_type=args.default_pool_cpu_machine_type,
@@ -371,7 +377,6 @@ def generate_blueprint(
  release_channel=release_channel,
  )
  if args.device_type == a4_device_type:
- num_nodes = args.num_nodes if not args.num_nodes is None else 2
  return bpg.generate_a4_blueprint(
  blueprint_name=blueprint_name,
  prefix=prefix,
@@ -381,7 +386,7 @@ def generate_blueprint(
  zone=args.zone,
  auth_cidr=all_IPs_cidr,
  num_nodes=num_nodes,
- reservation=args.reservation if args.reservation else None,
+ reservation=reservation,
  capacity_type=capacity_type,
  system_node_pool_machine_type=args.default_pool_cpu_machine_type,
  system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
xpk/commands/cluster_gcluster_test.py CHANGED
@@ -91,6 +91,7 @@ def test_install_kueue_standard(
  device_type="h100-mega-80gb-8",
  supports_sub_slicing=False,
  supports_super_slicing=False,
+ supports_accelerator_network_profile=True,
  docker_platform=DockerPlatform.ARM,
  gpu_config=GpuConfig(requires_topology=True),
  )
@@ -142,6 +143,7 @@ def test_install_kueue_with_autoprovisioning(
  device_type="h100-mega-80gb-8",
  supports_sub_slicing=False,
  supports_super_slicing=False,
+ supports_accelerator_network_profile=True,
  docker_platform=DockerPlatform.ARM,
  gpu_config=GpuConfig(requires_topology=True),
  )
xpk/commands/cluster_test.py CHANGED
@@ -146,6 +146,7 @@ def construct_args(**kwargs: Any) -> Namespace:
  docker_image_pull_secret='',
  managed_mldiagnostics=False,
  output_manifest_file='',
+ num_cubes=None,
  )
  args_dict.update(kwargs)
  return Namespace(**args_dict)
@@ -696,27 +697,6 @@ def test_validate_cluster_create_args_for_super_slicing_missing_reservation(
  )


- def test_validate_cluster_create_args_for_super_slicing_reservation_no_blocks(
- mocks: _Mocks,
- ):
- FeatureFlags.SUPER_SLICING_ENABLED = True
- args = construct_args(
- super_slicing=True,
- reservation='reservation',
- num_cubes=None,
- num_slices=None,
- )
-
- with pytest.raises(SystemExit):
- _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
-
- assert mocks.commands_print_mock.call_count == 1
- assert (
- 'requires a block or sub-block reservation'
- in mocks.commands_print_mock.call_args[0][0]
- )
-
-
  def test_validate_cluster_create_args_for_super_slicing_sparse_deployment_type_reservation(
  mocks: _Mocks,
  ):
xpk/commands/common.py CHANGED
@@ -14,8 +14,8 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

- from ..core.commands import run_command_with_updates_retry
- from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
+ from ..core.commands import run_command_with_updates_retry, run_command_for_value
+ from ..core.capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, CapacityType
  from ..core.gcloud_context import get_cluster_location
  from ..utils.console import xpk_print, xpk_exit
  from ..utils.execution_context import is_dry_run
@@ -46,9 +46,12 @@ def set_cluster_command(args) -> int:
  return return_code


- def is_TAS_possible(
+ def is_GPU_TAS_possible(
  system_characteristics: SystemCharacteristics | None,
  capacity_type: CapacityType | None,
+ cluster_name: str,
+ zone: str,
+ project: str,
  ) -> bool:
  """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible"""

@@ -63,10 +66,40 @@
  xpk_print('capacity_type data was not found in configmaps.')
  xpk_exit(1)

- return (
- system_characteristics.device_type != H100_MEGA_DEVICE_TYPE
- or capacity_type == CapacityType.RESERVATION
+ # For A3-High and A3-Mega TAS is supported only for Flex and Reservation
+ if (
+ system_characteristics.device_type == H100_DEVICE_TYPE
+ or system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
+ ) and (
+ capacity_type != CapacityType.FLEX_START
+ and capacity_type != CapacityType.RESERVATION
+ ):
+ return False
+
+ # COMPACT placement and Flex don't work together, Flex is enough to support TAS for A3-High or newer
+ if capacity_type == CapacityType.FLEX_START:
+ return True
+
+ # For A3-Ultra or newer, all capacity types support TAS as long as COMPACT placement is used
+ command = (
+ 'gcloud container node-pools list'
+ f' --cluster {cluster_name}'
+ f' --location={zone}'
+ f' --project={project}'
+ ' --filter="placementPolicy.type=COMPACT"'
+ ' --format="value(name)"'
  )
+ return_code, compact_placement_nps = run_command_for_value(
+ command=command,
+ task=(
+ 'Check if there is a COMPACT placement policy nodepool in this'
+ ' cluster'
+ ),
+ )
+ if return_code != 0:
+ xpk_print('Node pool retrieval failed, assuming TAS is not possible')
+ return False
+ return compact_placement_nps.splitlines() != []


  def validate_sub_slicing_system(system: SystemCharacteristics):
xpk/commands/common_test.py ADDED
@@ -0,0 +1,170 @@
+ """
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ import unittest
+ from unittest.mock import MagicMock, patch
+
+ from xpk.commands.common import is_GPU_TAS_possible
+ from xpk.core.capacity import (
+ H100_DEVICE_TYPE,
+ H100_MEGA_DEVICE_TYPE,
+ CapacityType,
+ )
+ from xpk.core.system_characteristics import SystemCharacteristics
+
+
+ class CommonCommandsTest(unittest.TestCase):
+
+ @patch("xpk.commands.common.run_command_for_value")
+ @patch("xpk.commands.common.xpk_exit")
+ @patch("xpk.commands.common.xpk_print")
+ @patch("xpk.commands.common.is_dry_run")
+ def test_is_GPU_TAS_possible_dry_run(
+ self, mock_is_dry_run, mock_xpk_print, mock_xpk_exit, mock_run_command
+ ):
+ """Test is_GPU_TAS_possible returns True in dry_run mode."""
+ mock_is_dry_run.return_value = True
+ self.assertTrue(
+ is_GPU_TAS_possible(None, None, "cluster", "zone", "project")
+ )
+ mock_is_dry_run.assert_called_once()
+ mock_xpk_print.assert_not_called()
+ mock_xpk_exit.assert_not_called()
+ mock_run_command.assert_not_called()
+
+ @patch("xpk.commands.common.is_dry_run", return_value=False)
+ @patch("xpk.commands.common.xpk_exit")
+ @patch("xpk.commands.common.xpk_print")
+ def test_is_GPU_TAS_possible_no_system_characteristics(
+ self, mock_xpk_print, mock_xpk_exit, mock_is_dry_run
+ ):
+ """Test is_GPU_TAS_possible exits if system_characteristics is None."""
+ mock_xpk_exit.side_effect = SystemExit(1)
+ with self.assertRaises(SystemExit):
+ is_GPU_TAS_possible(None, MagicMock(), "cluster", "zone", "project")
+ mock_xpk_print.assert_called_with(
+ "system_characteristics data was not found in configmaps."
+ )
+ mock_xpk_exit.assert_called_with(1)
+
+ @patch("xpk.commands.common.is_dry_run", return_value=False)
+ @patch("xpk.commands.common.xpk_exit")
+ @patch("xpk.commands.common.xpk_print")
+ def test_is_GPU_TAS_possible_no_capacity_type(
+ self, mock_xpk_print, mock_xpk_exit, mock_is_dry_run
+ ):
+ """Test is_GPU_TAS_possible exits if capacity_type is None."""
+ mock_xpk_exit.side_effect = SystemExit(1)
+ with self.assertRaises(SystemExit):
+ is_GPU_TAS_possible(MagicMock(), None, "cluster", "zone", "project")
+ mock_xpk_print.assert_called_with(
+ "capacity_type data was not found in configmaps."
+ )
+ mock_xpk_exit.assert_called_with(1)
+
+ @patch("xpk.commands.common.is_dry_run", return_value=False)
+ def test_is_GPU_TAS_possible_h100_unsupported_capacity(self, mock_is_dry_run):
+ """Test is_GPU_TAS_possible for H100 with unsupported capacity type."""
+ mock_system = MagicMock(spec=SystemCharacteristics)
+ mock_system.device_type = H100_DEVICE_TYPE
+ self.assertFalse(
+ is_GPU_TAS_possible(
+ mock_system, CapacityType.ON_DEMAND, "cluster", "zone", "project"
+ )
+ )
+
+ mock_system.device_type = H100_MEGA_DEVICE_TYPE
+ self.assertFalse(
+ is_GPU_TAS_possible(
+ mock_system, CapacityType.ON_DEMAND, "cluster", "zone", "project"
+ )
+ )
+
+ @patch("xpk.commands.common.is_dry_run", return_value=False)
+ def test_is_GPU_TAS_possible_flex_start_capacity(self, mock_is_dry_run):
+ """Test is_GPU_TAS_possible returns True for FLEX_START capacity."""
+ mock_system = MagicMock(spec=SystemCharacteristics)
+ mock_system.device_type = "some-device"
+ self.assertTrue(
+ is_GPU_TAS_possible(
+ mock_system, CapacityType.FLEX_START, "cluster", "zone", "project"
+ )
+ )
+
+ @patch("xpk.commands.common.run_command_for_value")
+ @patch("xpk.commands.common.is_dry_run", return_value=False)
+ def test_is_GPU_TAS_possible_compact_placement_exists(
+ self, mock_is_dry_run, mock_run_command
+ ):
+ """Test is_GPU_TAS_possible with COMPACT placement returns True."""
+ mock_system = MagicMock(spec=SystemCharacteristics)
+ mock_system.device_type = "a3-ultra"
+ mock_run_command.return_value = (0, "some-nodepool\nsome-other-nodepool\n")
+ self.assertTrue(
+ is_GPU_TAS_possible(
+ mock_system,
+ CapacityType.RESERVATION,
+ "cluster",
+ "zone",
+ "project",
+ )
+ )
+
+ @patch("xpk.commands.common.run_command_for_value")
+ @patch("xpk.commands.common.is_dry_run", return_value=False)
+ def test_is_GPU_TAS_possible_no_compact_placement(
+ self, mock_is_dry_run, mock_run_command
+ ):
+ """Test is_GPU_TAS_possible without COMPACT placement returns False."""
+ mock_system = MagicMock(spec=SystemCharacteristics)
+ mock_system.device_type = "a3-ultra"
+ mock_run_command.return_value = (0, "")
+ self.assertFalse(
+ is_GPU_TAS_possible(
+ mock_system,
+ CapacityType.RESERVATION,
+ "cluster",
+ "zone",
+ "project",
+ )
+ )
+
+ @patch("xpk.commands.common.xpk_print")
+ @patch("xpk.commands.common.run_command_for_value")
+ @patch("xpk.commands.common.is_dry_run", return_value=False)
+ def test_is_GPU_TAS_possible_command_fails(
+ self, mock_is_dry_run, mock_run_command, mock_xpk_print
+ ):
+ """Test is_GPU_TAS_possible when gcloud command fails."""
+ mock_system = MagicMock(spec=SystemCharacteristics)
+ mock_system.device_type = "a3-ultra"
+ mock_run_command.return_value = (1, "Error")
+ self.assertFalse(
+ is_GPU_TAS_possible(
+ mock_system,
+ CapacityType.RESERVATION,
+ "cluster",
+ "zone",
+ "project",
+ )
+ )
+ mock_xpk_print.assert_called_with(
+ "Node pool retrieval failed, assuming TAS is not possible"
+ )
+
+
+ if __name__ == "__main__":
+ unittest.main()
xpk/commands/info.py CHANGED
@@ -76,7 +76,7 @@ def get_nominal_quotas(cqs: str) -> dict[str, dict[str, str]]:
  try:
  cq_list = json.loads(cqs)['items']
  except ValueError:
- xpk_print('Incorrect respone from list clusterqueue')
+ xpk_print('Incorrect response from list clusterqueue')
  xpk_print(cqs)
  xpk_exit(1)

@@ -98,7 +98,7 @@ def print_formatted_cqs(cqs: str, nominalQuotas) -> None:
  try:
  cq_list = json.loads(cqs)['items']
  except ValueError:
- xpk_print('Incorrect respone from list clusterqueue')
+ xpk_print('Incorrect response from list clusterqueue')
  xpk_print(cqs)
  xpk_exit(1)

@@ -114,7 +114,7 @@ def print_formatted_lqs(lqs: str, nominalQuotas) -> None:
  try:
  lq_list = json.loads(lqs)['items']
  except ValueError:
- xpk_print('Incorrect respone from list localqueue')
+ xpk_print('Incorrect response from list localqueue')
  xpk_print(lqs)
  xpk_exit(1)

@@ -219,7 +219,9 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
  command = 'kubectl kueue list localqueue -o json'
  if args.namespace != '':
  command += f' --namespace {args.namespace}'
- return_code, val = run_command_for_value(command, 'list localqueue')
+ return_code, val = run_command_for_value(
+ command, 'list localqueue', hide_error=True
+ )

  if return_code != 0:
  xpk_print(f'Cluster info request returned ERROR {return_code}')
@@ -235,7 +237,9 @@ def run_kueuectl_list_clusterqueue() -> str:
  """
  command = 'kubectl kueue list clusterqueue -o json'

- return_code, val = run_command_for_value(command, 'list clusterqueue')
+ return_code, val = run_command_for_value(
+ command, 'list clusterqueue', hide_error=True
+ )

  if return_code != 0:
  xpk_print(f'Cluster info request returned ERROR {return_code}')
xpk/commands/inspector.py CHANGED
@@ -23,7 +23,7 @@ from ..utils.console import xpk_exit, xpk_print
  from ..utils.file import append_tmp_file, write_tmp_file
  from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
  from .workload import get_workload_list
- from ..core.kueue_manager import has_sub_slicing_enabled
+ from ..core.kueue_manager import has_sub_slicing_enabled, has_super_slicing_enabled


  _SPACER = '========================================================'
@@ -89,12 +89,40 @@ def inspector_run_sub_slicing_helper(args, file: str):
  if return_code != 0:
  xpk_exit(return_code)
  if result:
- output = f'Sub-slicing topology set up.\n{_SPACER}'
+ output = f'Sub-slicing topology set up.\n{_SPACER}\n'
  append_tmp_file(output, file)
  if args.print_to_terminal:
  xpk_print(output)


+ def inspector_run_slice_controller_helper(args, file: str):
+ return_code, result = has_super_slicing_enabled()
+ if return_code != 0:
+ xpk_exit(return_code)
+
+ if not result:
+ return
+
+ output = f'Super-slicing topology set up.\n{_SPACER}\n'
+ append_tmp_file(output, file)
+ if args.print_to_terminal:
+ xpk_print(output)
+
+ command = (
+ 'kubectl describe deployment slice-controller-controller-manager -n'
+ ' slice-controller-system'
+ )
+ command_description = 'Slice Controller Deployment Details'
+ inspector_run_command_helper(args, command, command_description, file)
+
+ command = (
+ 'kubectl logs deployment slice-controller-controller-manager -n'
+ ' slice-controller-system --tail=100 --prefix=True'
+ )
+ command_description = 'Slice Controller Logs'
+ inspector_run_command_helper(args, command, command_description, file)
+
+
  def inspector_output_link_helper(args, link, link_description, file) -> int:
  """Outputs a link for xpk inspector to the output file.

@@ -257,6 +285,9 @@ def inspector(args) -> None:
  f' {description} return code: {return_code}'
  )

+ inspector_run_sub_slicing_helper(args, inspector_file)
+ inspector_run_slice_controller_helper(args, inspector_file)
+
  # Workload list views:
  filter_by_statuses = ['EVERYTHING', 'QUEUED', 'RUNNING']
  for filter_by_status in filter_by_statuses:
@@ -321,8 +352,6 @@ def inspector(args) -> None:
  f' {command_description} return code: {return_code}'
  )

- inspector_run_sub_slicing_helper(args, inspector_file)
-
  # Cloud Console Links:
  workload_links = []
  if args.workload: