xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. integration/__init__.py +15 -0
  2. integration/docker_manager_test.py +102 -0
  3. integration/gcluster_a3mega_test.py +204 -0
  4. integration/gcluster_a3ultra_test.py +176 -0
  5. integration/gcluster_a4_test.py +176 -0
  6. integration/gcluster_test.py +107 -0
  7. xpk/commands/batch.py +9 -2
  8. xpk/commands/cluster.py +143 -117
  9. xpk/commands/cluster_gcluster.py +81 -14
  10. xpk/commands/cluster_gcluster_test.py +177 -0
  11. xpk/commands/cluster_test.py +92 -0
  12. xpk/commands/common.py +14 -26
  13. xpk/commands/info.py +11 -9
  14. xpk/commands/inspector.py +21 -10
  15. xpk/commands/job.py +25 -9
  16. xpk/commands/kind.py +39 -40
  17. xpk/commands/kjob_common.py +4 -4
  18. xpk/commands/run.py +9 -2
  19. xpk/commands/shell.py +13 -10
  20. xpk/commands/storage.py +21 -0
  21. xpk/commands/version.py +0 -4
  22. xpk/commands/workload.py +84 -29
  23. xpk/commands/workload_test.py +81 -0
  24. xpk/core/blueprint/blueprint_generator.py +4 -40
  25. xpk/core/blueprint/blueprint_test.py +0 -6
  26. xpk/core/blueprint/testing/__init__.py +15 -0
  27. xpk/core/capacity.py +6 -5
  28. xpk/core/cluster.py +91 -194
  29. xpk/core/cluster_private.py +6 -11
  30. xpk/core/commands.py +11 -18
  31. xpk/core/config.py +1 -1
  32. xpk/core/docker_image.py +3 -4
  33. xpk/core/gcloud_context.py +26 -2
  34. xpk/core/gcloud_context_test.py +96 -0
  35. xpk/core/gcluster_manager.py +0 -3
  36. xpk/core/jobset.py +4 -7
  37. xpk/core/kjob.py +14 -27
  38. xpk/core/kueue_manager.py +423 -0
  39. xpk/core/kueue_manager_test.py +574 -0
  40. xpk/core/monitoring.py +1 -1
  41. xpk/core/nap.py +10 -15
  42. xpk/core/network.py +17 -18
  43. xpk/core/nodepool.py +66 -77
  44. xpk/core/nodepool_test.py +198 -1
  45. xpk/core/pathways.py +5 -5
  46. xpk/core/ray.py +10 -14
  47. xpk/core/resources.py +6 -11
  48. xpk/core/scheduling.py +19 -1
  49. xpk/core/scheduling_test.py +31 -0
  50. xpk/core/system_characteristics.py +350 -232
  51. xpk/core/system_characteristics_test.py +73 -0
  52. xpk/core/vertex.py +1 -1
  53. xpk/core/workload.py +7 -8
  54. xpk/main.py +2 -4
  55. xpk/parser/cluster.py +7 -0
  56. xpk/parser/cluster_test.py +66 -0
  57. xpk/parser/common.py +11 -0
  58. xpk/parser/workload.py +62 -25
  59. xpk/parser/workload_test.py +82 -0
  60. xpk/templates/cluster_preheat.yaml.j2 +31 -0
  61. xpk/templates/filestore-pv.yaml +17 -0
  62. xpk/templates/filestore-pvc.yaml +11 -0
  63. xpk/templates/filestore-sc.yaml +10 -0
  64. xpk/templates/fuse-pv.yaml +17 -0
  65. xpk/templates/fuse-pvc.yaml +13 -0
  66. xpk/templates/kueue_config.yaml.j2 +95 -0
  67. xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
  68. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
  69. xpk/templates/mtc-cpc.yaml +15 -0
  70. xpk/templates/volume_bundle.yaml +7 -0
  71. xpk/utils/feature_flags.py +28 -0
  72. xpk/utils/kueue.py +20 -0
  73. xpk/utils/templates.py +15 -0
  74. xpk/utils/topology.py +46 -0
  75. xpk/utils/topology_test.py +63 -0
  76. xpk/utils/validation.py +79 -55
  77. xpk/utils/validation_test.py +37 -0
  78. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
  79. xpk-0.14.1.dist-info/RECORD +133 -0
  80. xpk-0.14.1.dist-info/top_level.txt +2 -0
  81. xpk/core/kueue.py +0 -561
  82. xpk-0.13.0.dist-info/RECORD +0 -101
  83. xpk-0.13.0.dist-info/top_level.txt +0 -1
  84. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
  85. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
  86. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
@@ -16,6 +16,13 @@ limitations under the License.
16
16
 
17
17
  import os
18
18
 
19
+ from ..utils.feature_flags import FeatureFlags
20
+ from ..utils.execution_context import is_dry_run
21
+ from ..core.kueue_manager import KueueConfig, KueueManager
22
+ from ..core.nap import enable_autoprovisioning_on_cluster
23
+ from ..core.scheduling import get_total_chips_requested_from_args
24
+ from ..core.system_characteristics import get_system_characteristics
25
+
19
26
  from ..core.blueprint.blueprint_generator import (
20
27
  BlueprintGenerator,
21
28
  BlueprintGeneratorOutput,
@@ -75,22 +82,29 @@ def cluster_create(args) -> None:
75
82
  bp = generate_blueprint(blueprint_name=unique_name, args=args, prefix=prefix)
76
83
 
77
84
  # staging: sending the blueprint file(s) to gcluster's working directory
78
- bp_staged_path = gcm.stage_files(
79
- blueprint_file=bp.blueprint_file,
80
- blueprint_dependencies=bp.blueprint_dependencies,
81
- prefix=prefix,
82
- )
83
- gcm.deploy(
84
- blueprint_path=bp_staged_path,
85
- deployment_name=unique_name,
86
- prefix=prefix,
87
- )
88
- if args.cluster_state_gcs_bucket is not None:
89
- gcm.upload_state()
85
+ if is_dry_run():
86
+ xpk_print(f'Blueprint file: {bp.blueprint_file}')
87
+ else:
88
+ bp_staged_path = gcm.stage_files(
89
+ blueprint_file=bp.blueprint_file,
90
+ blueprint_dependencies=bp.blueprint_dependencies,
91
+ prefix=prefix,
92
+ )
93
+ gcm.deploy(
94
+ blueprint_path=bp_staged_path,
95
+ deployment_name=unique_name,
96
+ prefix=prefix,
97
+ )
98
+ if args.cluster_state_gcs_bucket is not None:
99
+ gcm.upload_state()
90
100
 
91
101
  get_cluster_credentials(args)
92
102
 
93
- err_code = apply_kjob_crds(args)
103
+ err_code = __install_kueue(args)
104
+ if err_code > 0:
105
+ xpk_exit(err_code)
106
+
107
+ err_code = apply_kjob_crds()
94
108
  if err_code > 0:
95
109
  xpk_exit(err_code)
96
110
 
@@ -101,6 +115,60 @@ def cluster_create(args) -> None:
101
115
  xpk_exit(0)
102
116
 
103
117
 
118
def __install_kueue(args) -> int:
  """Installs or upgrades Kueue on the cluster targeted by `args`.

  Resolves the system characteristics, optionally enables autoprovisioning,
  derives the total chip count and passes the resulting `KueueConfig` to
  `KueueManager.install_or_upgrade`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful, otherwise a non-zero error code.
  """
  system, return_code = get_system_characteristics(args)

  if return_code > 0 or system is None:
    xpk_print('Fetching system characteristics failed!')
    # The caller aborts only on a positive code, so a missing system that
    # arrives with a non-positive code must still be reported as a failure.
    return return_code if return_code > 0 else 1

  # Provision node pools dynamically based on incoming workloads:
  # Currently autoprovisioning is not supported with Pathways.
  autoprovisioning_config = None
  if args.enable_autoprovisioning:
    xpk_print('Enabling Autoprovisioning')
    autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
        args, system
    )
    if return_code != 0:
      return return_code

  autoprovisioning_enabled = False
  if autoprovisioning_config:
    # Determine total resources available based on autoprovisioning max chips.
    autoprovisioning_enabled = True
    total_chips = autoprovisioning_config.maximum_chips
  else:
    # Determine total chips based on user specified topology.
    total_chips = get_total_chips_requested_from_args(args, system)
  kueue_manager = KueueManager()

  # Toleration for the 'components.gke.io/gke-managed-components' NoSchedule
  # taint, handed to the Kueue installation below.
  tolerations = [{
      'key': 'components.gke.io/gke-managed-components',
      'operator': 'Equal',
      'value': 'true',
      'effect': 'NoSchedule',
  }]

  # NOTE(review): the result of install_or_upgrade is not inspected here;
  # confirm whether it reports failures through a return code.
  kueue_manager.install_or_upgrade(
      KueueConfig(
          system,
          total_chips=total_chips,
          autoprovisioning_enabled=autoprovisioning_enabled,
          num_slices=args.num_slices,
          memory_limit=args.memory_limit,
          cpu_limit=args.cpu_limit,
          is_pathways_cluster=args.enable_pathways,
          flex=args.flex,
          # Sub-slicing is configured only when both the feature flag and the
          # user supplied argument are set.
          configure_sub_slicing=(
              FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
          ),
      ),
      tolerations=tolerations,
  )
  return 0
170
+
171
+
104
172
  def cluster_delete(args) -> None:
105
173
  """Function around cluster delete for the clusters created by Cluster toolkit.
106
174
 
@@ -213,7 +281,6 @@ def validate_state_gcs_bucket(args):
213
281
  err_code, _ = run_command_for_value(
214
282
  bucket_validate_cmd,
215
283
  'Validate remote state bucket existence.',
216
- global_args=args,
217
284
  )
218
285
  if err_code != 0:
219
286
  xpk_exit(err_code)
@@ -0,0 +1,177 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from unittest.mock import MagicMock, patch
18
+
19
+ import pytest
20
+
21
+ from xpk.commands.cluster_gcluster import cluster_create
22
+ from xpk.core.kueue_manager import KueueConfig
23
+ from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
24
+
25
+
26
@pytest.fixture
def mock_args():
  """Builds a MagicMock standing in for the parsed command line args."""
  attributes = {
      "enable_autoprovisioning": False,
      "num_slices": 1,
      "memory_limit": "200G",
      "cpu_limit": "50",
      "enable_pathways": False,
      "flex": False,
      "project": "test-project",
      "cluster": "test-cluster",
      "zone": "us-central1-c",
      "cluster_state_gcs_bucket": None,
  }
  args = MagicMock()
  for attribute, value in attributes.items():
    setattr(args, attribute, value)
  return args
41
+
42
+
43
@pytest.fixture
def mock_cluster_create_deps(request):
  """Patches the collaborators of cluster_create and yields them by name.

  Every listed name is replaced inside `xpk.commands.cluster_gcluster` for the
  duration of the test; the yielded dict maps each name to its mock.
  """
  patched_names = (
      "xpk_exit",
      "prepare_kjob",
      "apply_kjob_crds",
      "get_cluster_credentials",
      "generate_blueprint",
      "prepare_gcluster_manager",
      "prepare_directories",
      "check_gcloud_authenticated",
      "get_system_characteristics",
      "KueueManager",
  )
  active_patchers = []
  try:
    mocks_by_name = {}
    for name in patched_names:
      patcher = patch(f"xpk.commands.cluster_gcluster.{name}")
      mocks_by_name[name] = patcher.start()
      active_patchers.append(patcher)
    yield mocks_by_name
  finally:
    # Undo the patches in reverse order of activation.
    for patcher in reversed(active_patchers):
      patcher.stop()
80
+
81
+
82
@patch("xpk.commands.cluster_gcluster.get_total_chips_requested_from_args")
def test_install_kueue_standard(
    mock_get_total_chips, mock_args, mock_cluster_create_deps
):
  """Checks the Kueue config built by cluster_create without autoprovisioning."""
  deps = mock_cluster_create_deps
  for dependency in ("prepare_kjob", "apply_kjob_crds"):
    deps[dependency].return_value = 0

  expected_system = SystemCharacteristics(
      topology="N/A",
      vms_per_slice=1,
      gke_accelerator="nvidia-h100-mega-80gb",
      gce_machine_type="a3-megagpu-8g",
      chips_per_vm=8,
      accelerator_type=AcceleratorType["GPU"],
      device_type="h100-mega-80gb-8",
      supports_sub_slicing=False,
  )
  deps["get_system_characteristics"].return_value = (expected_system, 0)
  mock_get_total_chips.return_value = 16

  cluster_create(mock_args)

  deps["xpk_exit"].assert_called_with(0)
  install_or_upgrade = deps["KueueManager"].return_value.install_or_upgrade
  install_or_upgrade.assert_called_once()
  positional, keyword = install_or_upgrade.call_args
  kueue_config: KueueConfig = positional[0]

  assert kueue_config.system == expected_system
  assert kueue_config.total_chips == 16
  assert not kueue_config.autoprovisioning_enabled
  assert "tolerations" in keyword
  # At least one toleration must target the GKE managed components taint.
  matching_tolerations = [
      toleration
      for toleration in keyword["tolerations"]
      if toleration.get("key") == "components.gke.io/gke-managed-components"
      and toleration.get("effect") == "NoSchedule"
  ]
  assert matching_tolerations
126
+
127
+
128
@patch("xpk.commands.cluster_gcluster.enable_autoprovisioning_on_cluster")
def test_install_kueue_with_autoprovisioning(
    mock_enable_autoprovisioning, mock_args, mock_cluster_create_deps
):
  """Checks the Kueue config built by cluster_create with autoprovisioning."""
  deps = mock_cluster_create_deps
  for dependency in ("prepare_kjob", "apply_kjob_crds"):
    deps[dependency].return_value = 0

  mock_args.enable_autoprovisioning = True
  expected_system = SystemCharacteristics(
      topology="N/A",
      vms_per_slice=1,
      gke_accelerator="nvidia-h100-mega-80gb",
      gce_machine_type="a3-megagpu-8g",
      chips_per_vm=8,
      accelerator_type=AcceleratorType["GPU"],
      device_type="h100-mega-80gb-8",
      supports_sub_slicing=False,
  )
  deps["get_system_characteristics"].return_value = (expected_system, 0)

  autoprovisioning_config = MagicMock()
  autoprovisioning_config.maximum_chips = 128
  mock_enable_autoprovisioning.return_value = (autoprovisioning_config, 0)

  cluster_create(mock_args)

  deps["xpk_exit"].assert_called_with(0)
  mock_enable_autoprovisioning.assert_called_once_with(
      mock_args, expected_system
  )
  install_or_upgrade = deps["KueueManager"].return_value.install_or_upgrade
  install_or_upgrade.assert_called_once()

  positional, keyword = install_or_upgrade.call_args
  kueue_config: KueueConfig = positional[0]

  assert kueue_config.system == expected_system
  # The chip total comes from the autoprovisioning config's maximum_chips.
  assert kueue_config.total_chips == 128
  assert kueue_config.autoprovisioning_enabled
  assert "tolerations" in keyword
  # At least one toleration must target the GKE managed components taint.
  matching_tolerations = [
      toleration
      for toleration in keyword["tolerations"]
      if toleration.get("key") == "components.gke.io/gke-managed-components"
      and toleration.get("effect") == "NoSchedule"
  ]
  assert matching_tolerations
@@ -0,0 +1,92 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from argparse import Namespace
18
+ from dataclasses import dataclass
19
+ from unittest.mock import MagicMock
20
+ import pytest
21
+
22
+ from xpk.commands.cluster import _validate_cluster_create_args
23
+ from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
24
+ from xpk.utils.feature_flags import FeatureFlags
25
+
26
+
27
@dataclass
class _Mocks:
  """Bundles the patched console helpers handed to the tests in this file."""

  # Mock installed in place of `xpk.commands.common.xpk_print`.
  common_print_mock: MagicMock
  # Mock installed in place of `xpk.commands.common.xpk_exit`.
  common_exit_mock: MagicMock
31
+
32
+
33
@pytest.fixture
def mock_common_print_and_exit(mocker):
  """Patches xpk_print and xpk_exit in xpk.commands.common and returns both mocks."""
  return _Mocks(
      common_print_mock=mocker.patch(
          'xpk.commands.common.xpk_print', return_value=None
      ),
      common_exit_mock=mocker.patch(
          'xpk.commands.common.xpk_exit', return_value=None
      ),
  )
46
+
47
+
48
# System the tests below treat as lacking Sub-slicing support: validating it
# with sub_slicing=True is expected to print
# 'Error: l4-1 does not support Sub-slicing.' and exit.
DEFAULT_TEST_SYSTEM: SystemCharacteristics = (
    UserFacingNameToSystemCharacteristics['l4-1']
)
# System the tests below expect to pass validation with sub_slicing=True.
SUB_SLICING_SYSTEM: SystemCharacteristics = (
    UserFacingNameToSystemCharacteristics['v6e-4x4']
)
54
+
55
+
56
def test_validate_cluster_create_args_for_correct_args_pass(
    mock_common_print_and_exit: _Mocks,
):
  """Validation of empty args neither prints an error nor exits."""
  _validate_cluster_create_args(Namespace(), DEFAULT_TEST_SYSTEM)

  mocks = mock_common_print_and_exit
  assert mocks.common_print_mock.call_count == 0
  assert mocks.common_exit_mock.call_count == 0
65
+
66
+
67
def test_validate_cluster_create_args_for_correct_sub_slicing_args_pass(
    mock_common_print_and_exit: _Mocks,
):
  """Sub-slicing args pass validation for SUB_SLICING_SYSTEM."""
  # FeatureFlags is process-wide state: remember the current value and put it
  # back afterwards so the flag does not leak into tests that run later.
  previous_flag = FeatureFlags.SUB_SLICING_ENABLED
  FeatureFlags.SUB_SLICING_ENABLED = True
  try:
    args = Namespace(sub_slicing=True)

    _validate_cluster_create_args(args, SUB_SLICING_SYSTEM)

    assert mock_common_print_and_exit.common_print_mock.call_count == 0
    assert mock_common_print_and_exit.common_exit_mock.call_count == 0
  finally:
    FeatureFlags.SUB_SLICING_ENABLED = previous_flag
77
+
78
+
79
def test_validate_cluster_create_args_for_not_supported_system_throws(
    mock_common_print_and_exit: _Mocks,
):
  """Sub-slicing args for DEFAULT_TEST_SYSTEM print one error and exit once."""
  # FeatureFlags is process-wide state: remember the current value and put it
  # back afterwards so the flag does not leak into tests that run later.
  previous_flag = FeatureFlags.SUB_SLICING_ENABLED
  FeatureFlags.SUB_SLICING_ENABLED = True
  try:
    args = Namespace(sub_slicing=True)

    _validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM)

    assert mock_common_print_and_exit.common_print_mock.call_count == 1
    assert (
        mock_common_print_and_exit.common_print_mock.call_args[0][0]
        == 'Error: l4-1 does not support Sub-slicing.'
    )
    assert mock_common_print_and_exit.common_exit_mock.call_count == 1
  finally:
    FeatureFlags.SUB_SLICING_ENABLED = previous_flag
xpk/commands/common.py CHANGED
@@ -16,7 +16,7 @@ limitations under the License.
16
16
 
17
17
  from ..core.commands import run_command_with_updates_retry
18
18
  from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
19
- from ..core.gcloud_context import zone_to_region
19
+ from ..core.gcloud_context import get_cluster_location
20
20
  from ..utils.console import xpk_print, xpk_exit
21
21
  from ..utils.execution_context import is_dry_run
22
22
  from ..core.system_characteristics import (
@@ -35,16 +35,12 @@ def set_cluster_command(args) -> int:
35
35
  """
36
36
  command = (
37
37
  'gcloud container clusters get-credentials'
38
- f' {args.cluster} --region={zone_to_region(args.zone)}'
39
- ' --dns-endpoint'
40
- f' --project={args.project} &&'
41
- ' kubectl config view && kubectl config set-context --current'
42
- ' --namespace=default'
38
+ f' {args.cluster} --location={get_cluster_location(args.project, args.cluster, args.zone)} --dns-endpoint'
39
+ f' --project={args.project} && kubectl config view && kubectl config'
40
+ ' set-context --current --namespace=default'
43
41
  )
44
42
  task = f'get-credentials to cluster {args.cluster}'
45
- return_code = run_command_with_updates_retry(
46
- command, task, args, verbose=False
47
- )
43
+ return_code = run_command_with_updates_retry(command, task, verbose=False)
48
44
  if return_code != 0:
49
45
  xpk_print(f'{task} returned ERROR {return_code}')
50
46
  return return_code
@@ -53,16 +49,8 @@ def set_cluster_command(args) -> int:
53
49
  def is_TAS_possible(
54
50
  system_characteristics: SystemCharacteristics | None,
55
51
  capacity_type: CapacityType | None,
56
- flex: bool,
57
52
  ) -> bool:
58
- """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
59
-
60
- Args:
61
- args: user provided arguments for running the command.
62
-
63
- Returns:
64
- True if possible and False otherwise.
65
- """
53
+ """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible"""
66
54
 
67
55
  if is_dry_run():
68
56
  return True
@@ -75,13 +63,13 @@ def is_TAS_possible(
75
63
  xpk_print('capacity_type data was not found in configmaps.')
76
64
  xpk_exit(1)
77
65
 
78
- if not flex:
79
- return False
66
+ return (
67
+ system_characteristics.device_type != H100_MEGA_DEVICE_TYPE
68
+ or capacity_type == CapacityType.RESERVATION
69
+ )
80
70
 
81
- if (
82
- system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
83
- and capacity_type != CapacityType.RESERVATION
84
- ):
85
- return False
86
71
 
87
- return True
72
def validate_sub_slicing_system(system: SystemCharacteristics):
  """Prints an error and exits with code 1 if `system` lacks Sub-slicing support.

  Args:
    system: characteristics of the system to validate.
  """
  if system.supports_sub_slicing:
    return

  xpk_print(f'Error: {system.device_type} does not support Sub-slicing.')
  xpk_exit(1)
xpk/commands/info.py CHANGED
@@ -22,8 +22,8 @@ from tabulate import tabulate
22
22
  from ..core.commands import run_command_for_value
23
23
  from ..core.cluster import get_cluster_credentials
24
24
  from ..core.gcloud_context import add_zone_and_project
25
- from ..core.kueue import verify_kueuectl
26
25
  from ..utils.console import xpk_exit, xpk_print
26
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
27
27
 
28
28
  table_fmt = 'plain'
29
29
 
@@ -36,10 +36,15 @@ def info(args: Namespace) -> None:
36
36
  Returns:
37
37
  None
38
38
  """
39
+ if should_validate_dependencies(args):
40
+ validate_dependencies_list([
41
+ SystemDependency.KUBECTL,
42
+ SystemDependency.GCLOUD,
43
+ SystemDependency.KUEUECTL,
44
+ ])
39
45
  add_zone_and_project(args)
40
46
  get_cluster_credentials(args)
41
47
 
42
- verify_kueuectl(args)
43
48
  lq, cq = bool(args.localqueue), bool(args.clusterqueue)
44
49
  if not lq and not cq:
45
50
  lq, cq = True, True
@@ -48,7 +53,7 @@ def info(args: Namespace) -> None:
48
53
  if lq:
49
54
  lqs = run_kueuectl_list_localqueue(args)
50
55
 
51
- cqs = run_kueuectl_list_clusterqueue(args)
56
+ cqs = run_kueuectl_list_clusterqueue()
52
57
  quotas = get_nominal_quotas(cqs)
53
58
 
54
59
  if lq and lqs is not None:
@@ -214,7 +219,7 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
214
219
  command = 'kubectl kueue list localqueue -o json'
215
220
  if args.namespace != '':
216
221
  command += f' --namespace {args.namespace}'
217
- return_code, val = run_command_for_value(command, 'list localqueue', args)
222
+ return_code, val = run_command_for_value(command, 'list localqueue')
218
223
 
219
224
  if return_code != 0:
220
225
  xpk_print(f'Cluster info request returned ERROR {return_code}')
@@ -222,18 +227,15 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
222
227
  return val
223
228
 
224
229
 
225
- def run_kueuectl_list_clusterqueue(args: Namespace) -> str:
230
+ def run_kueuectl_list_clusterqueue() -> str:
226
231
  """Run the kueuectl list clusterqueue command.
227
232
 
228
- Args:
229
- args: user provided arguments for running the command.
230
-
231
233
  Returns:
232
234
  kueuectl list clusterqueue formatted as json string
233
235
  """
234
236
  command = 'kubectl kueue list clusterqueue -o json'
235
237
 
236
- return_code, val = run_command_for_value(command, 'list clusterqueue', args)
238
+ return_code, val = run_command_for_value(command, 'list clusterqueue')
237
239
 
238
240
  if return_code != 0:
239
241
  xpk_print(f'Cluster info request returned ERROR {return_code}')
xpk/commands/inspector.py CHANGED
@@ -16,11 +16,12 @@ limitations under the License.
16
16
 
17
17
  from ..core.cluster import get_cluster_credentials
18
18
  from ..core.commands import run_command_for_value
19
- from ..core.gcloud_context import add_zone_and_project, zone_to_region
20
- from ..core.kueue import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
19
+ from ..core.gcloud_context import add_zone_and_project, get_cluster_location
20
+ from ..core.kueue_manager import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
21
21
  from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP
22
22
  from ..utils.console import xpk_exit, xpk_print
23
23
  from ..utils.file import append_tmp_file, write_tmp_file
24
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
24
25
  from .workload import get_workload_list
25
26
 
26
27
 
@@ -41,7 +42,7 @@ def inspector_run_command_helper(
41
42
  prefix = f'Command: {command}\nCommand Description: {command_description}\n'
42
43
  postfix = '========================================================'
43
44
  return_code, command_output = run_command_for_value(
44
- command, f'{command_description}', args
45
+ command, f'{command_description}'
45
46
  )
46
47
 
47
48
  if return_code != 0:
@@ -116,7 +117,10 @@ def inspector(args) -> None:
116
117
  # Future Improvements for inspector:
117
118
  # 2. List what is next in Queue.
118
119
  # 3. Split inspector into different subcommands to parse info easier.
119
-
120
+ if should_validate_dependencies(args):
121
+ validate_dependencies_list(
122
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
123
+ )
120
124
  final_return_code = 0
121
125
  xpk_print(args)
122
126
 
@@ -138,8 +142,9 @@ def inspector(args) -> None:
138
142
  (
139
143
  (
140
144
  'gcloud beta container clusters list --project'
141
- f' {args.project} --region {zone_to_region(args.zone)} | grep -e'
142
- f' NAME -e {args.cluster}'
145
+ f' {args.project} --location'
146
+ f' {get_cluster_location(args.project, args.cluster, args.zone)} |'
147
+ f' grep -e NAME -e {args.cluster}'
143
148
  ),
144
149
  'GKE: Cluster Details',
145
150
  ),
@@ -160,7 +165,7 @@ def inspector(args) -> None:
160
165
  (
161
166
  (
162
167
  f'gcloud beta container node-pools list --cluster {args.cluster} '
163
- f' --project={args.project} --region={zone_to_region(args.zone)}'
168
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
164
169
  ),
165
170
  'GKE: Node pool Details',
166
171
  ),
@@ -309,19 +314,25 @@ def inspector(args) -> None:
309
314
  workload_links = [(
310
315
  f'Cloud Console for the workload {args.workload}',
311
316
  # pylint: disable=line-too-long
312
- f'https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}',
317
+ (
318
+ f'https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
319
+ ),
313
320
  )]
314
321
 
315
322
  links = [
316
323
  (
317
324
  'Cloud Console for the GKE Cluster',
318
325
  # pylint: disable=line-too-long
319
- f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
326
+ (
327
+ f'https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
328
+ ),
320
329
  ),
321
330
  (
322
331
  'Cloud Console for all workloads in GKE Cluster',
323
332
  # pylint: disable=line-too-long
324
- f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
333
+ (
334
+ f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{get_cluster_location(args.project, args.cluster, args.zone)}%2F{args.cluster}))'
335
+ ),
325
336
  ),
326
337
  (
327
338
  'Cloud Console for IAM Permissions',