xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. xpk/commands/batch.py +17 -10
  2. xpk/commands/cluster.py +137 -123
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +13 -27
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +22 -11
  8. xpk/commands/job.py +53 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +26 -2
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +58 -30
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +96 -195
  20. xpk/core/cluster_private.py +9 -12
  21. xpk/core/commands.py +21 -25
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +17 -9
  24. xpk/core/docker_resources.py +9 -4
  25. xpk/core/gcloud_context.py +26 -2
  26. xpk/core/gcloud_context_test.py +96 -0
  27. xpk/core/gcluster_manager.py +0 -3
  28. xpk/core/jobset.py +5 -8
  29. xpk/core/kjob.py +19 -29
  30. xpk/core/kueue_manager.py +383 -0
  31. xpk/core/kueue_manager_test.py +542 -0
  32. xpk/core/monitoring.py +1 -1
  33. xpk/core/nap.py +11 -16
  34. xpk/core/network.py +18 -19
  35. xpk/core/nodepool.py +65 -71
  36. xpk/core/nodepool_test.py +198 -1
  37. xpk/core/pathways.py +9 -5
  38. xpk/core/ray.py +11 -15
  39. xpk/core/resources.py +15 -10
  40. xpk/core/scheduling.py +23 -1
  41. xpk/core/scheduling_test.py +31 -0
  42. xpk/core/system_characteristics.py +335 -229
  43. xpk/core/vertex.py +1 -1
  44. xpk/core/workload.py +7 -8
  45. xpk/main.py +3 -2
  46. xpk/parser/cluster.py +50 -0
  47. xpk/parser/cluster_test.py +66 -0
  48. xpk/parser/common.py +11 -0
  49. xpk/parser/workload.py +62 -25
  50. xpk/parser/workload_test.py +82 -0
  51. xpk/utils/execution_context.py +28 -0
  52. xpk/utils/feature_flags.py +28 -0
  53. xpk/utils/file.py +25 -10
  54. xpk/utils/kueue.py +20 -0
  55. xpk/utils/network.py +4 -0
  56. xpk/utils/templates.py +2 -0
  57. xpk/utils/topology.py +37 -0
  58. xpk/utils/topology_test.py +43 -0
  59. xpk/utils/validation.py +79 -55
  60. xpk/utils/validation_test.py +37 -0
  61. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  62. xpk-0.14.0.dist-info/RECORD +112 -0
  63. xpk/core/kueue.py +0 -545
  64. xpk-0.12.0.dist-info/RECORD +0 -100
  65. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
@@ -16,6 +16,12 @@ limitations under the License.
16
16
 
17
17
  import os
18
18
 
19
+ from ..utils.execution_context import is_dry_run
20
+ from ..core.kueue_manager import KueueConfig, KueueManager
21
+ from ..core.nap import enable_autoprovisioning_on_cluster
22
+ from ..core.scheduling import get_total_chips_requested_from_args
23
+ from ..core.system_characteristics import get_system_characteristics
24
+
19
25
  from ..core.blueprint.blueprint_generator import (
20
26
  BlueprintGenerator,
21
27
  BlueprintGeneratorOutput,
@@ -75,22 +81,29 @@ def cluster_create(args) -> None:
75
81
  bp = generate_blueprint(blueprint_name=unique_name, args=args, prefix=prefix)
76
82
 
77
83
  # staging: sending the blueprint file(s) to gcluster's working directory
78
- bp_staged_path = gcm.stage_files(
79
- blueprint_file=bp.blueprint_file,
80
- blueprint_dependencies=bp.blueprint_dependencies,
81
- prefix=prefix,
82
- )
83
- gcm.deploy(
84
- blueprint_path=bp_staged_path,
85
- deployment_name=unique_name,
86
- prefix=prefix,
87
- )
88
- if args.cluster_state_gcs_bucket is not None:
89
- gcm.upload_state()
84
+ if is_dry_run():
85
+ xpk_print(f'Blueprint file: {bp.blueprint_file}')
86
+ else:
87
+ bp_staged_path = gcm.stage_files(
88
+ blueprint_file=bp.blueprint_file,
89
+ blueprint_dependencies=bp.blueprint_dependencies,
90
+ prefix=prefix,
91
+ )
92
+ gcm.deploy(
93
+ blueprint_path=bp_staged_path,
94
+ deployment_name=unique_name,
95
+ prefix=prefix,
96
+ )
97
+ if args.cluster_state_gcs_bucket is not None:
98
+ gcm.upload_state()
90
99
 
91
100
  get_cluster_credentials(args)
92
101
 
93
- err_code = apply_kjob_crds(args)
102
+ err_code = __install_kueue(args)
103
+ if err_code > 0:
104
+ xpk_exit(err_code)
105
+
106
+ err_code = apply_kjob_crds()
94
107
  if err_code > 0:
95
108
  xpk_exit(err_code)
96
109
 
@@ -101,6 +114,57 @@ def cluster_create(args) -> None:
101
114
  xpk_exit(0)
102
115
 
103
116
 
117
+ def __install_kueue(args) -> int:
118
+ system, return_code = get_system_characteristics(args)
119
+
120
+ if return_code > 0 or system is None:
121
+ xpk_print('Fetching system characteristics failed!')
122
+ return return_code
123
+
124
+ # Provision node pools dynamically based on incoming workloads:
125
+ # Currently autoprovisioning is not supported with Pathways.
126
+ autoprovisioning_config = None
127
+ if args.enable_autoprovisioning:
128
+ xpk_print('Enabling Autoprovisioning')
129
+ autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
130
+ args, system
131
+ )
132
+ if return_code != 0:
133
+ return return_code
134
+
135
+ autoprovisioning_enabled = False
136
+ if autoprovisioning_config:
137
+ # Determine total resources available based on autoprovisioning max chips.
138
+ autoprovisioning_enabled = True
139
+ total_chips = autoprovisioning_config.maximum_chips
140
+ else:
141
+ # Determine total chips based on user specified topology.
142
+ total_chips = get_total_chips_requested_from_args(args, system)
143
+ kueue_manager = KueueManager()
144
+
145
+ tolerations = [{
146
+ 'key': 'components.gke.io/gke-managed-components',
147
+ 'operator': 'Equal',
148
+ 'value': 'true',
149
+ 'effect': 'NoSchedule',
150
+ }]
151
+
152
+ kueue_manager.install_or_upgrade(
153
+ KueueConfig(
154
+ system,
155
+ total_chips=total_chips,
156
+ autoprovisioning_enabled=autoprovisioning_enabled,
157
+ num_slices=args.num_slices,
158
+ memory_limit=args.memory_limit,
159
+ cpu_limit=args.cpu_limit,
160
+ is_pathways_cluster=args.enable_pathways,
161
+ flex=args.flex,
162
+ ),
163
+ tolerations=tolerations,
164
+ )
165
+ return 0
166
+
167
+
104
168
  def cluster_delete(args) -> None:
105
169
  """Function around cluster delete for the clusters created by Cluster toolkit.
106
170
 
@@ -213,7 +277,6 @@ def validate_state_gcs_bucket(args):
213
277
  err_code, _ = run_command_for_value(
214
278
  bucket_validate_cmd,
215
279
  'Validate remote state bucket existence.',
216
- global_args=args,
217
280
  )
218
281
  if err_code != 0:
219
282
  xpk_exit(err_code)
@@ -0,0 +1,177 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from unittest.mock import MagicMock, patch
18
+
19
+ import pytest
20
+
21
+ from xpk.commands.cluster_gcluster import cluster_create
22
+ from xpk.core.kueue_manager import KueueConfig
23
+ from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
24
+
25
+
26
+ @pytest.fixture
27
+ def mock_args():
28
+ """Provides a mock for args."""
29
+ args = MagicMock()
30
+ args.enable_autoprovisioning = False
31
+ args.num_slices = 1
32
+ args.memory_limit = "200G"
33
+ args.cpu_limit = "50"
34
+ args.enable_pathways = False
35
+ args.flex = False
36
+ args.project = "test-project"
37
+ args.cluster = "test-cluster"
38
+ args.zone = "us-central1-c"
39
+ args.cluster_state_gcs_bucket = None
40
+ return args
41
+
42
+
43
+ @pytest.fixture
44
+ def mock_cluster_create_deps(request):
45
+ """Mocks dependencies for cluster_create."""
46
+ with (
47
+ patch("xpk.commands.cluster_gcluster.xpk_exit") as mock_exit,
48
+ patch("xpk.commands.cluster_gcluster.prepare_kjob") as mock_prep_kjob,
49
+ patch("xpk.commands.cluster_gcluster.apply_kjob_crds") as mock_apply_kjob,
50
+ patch(
51
+ "xpk.commands.cluster_gcluster.get_cluster_credentials"
52
+ ) as mock_get_creds,
53
+ patch("xpk.commands.cluster_gcluster.generate_blueprint") as mock_gen_bp,
54
+ patch(
55
+ "xpk.commands.cluster_gcluster.prepare_gcluster_manager"
56
+ ) as mock_prep_gcm,
57
+ patch(
58
+ "xpk.commands.cluster_gcluster.prepare_directories"
59
+ ) as mock_prep_dirs,
60
+ patch(
61
+ "xpk.commands.cluster_gcluster.check_gcloud_authenticated"
62
+ ) as mock_check_auth,
63
+ patch(
64
+ "xpk.commands.cluster_gcluster.get_system_characteristics"
65
+ ) as mock_get_sys_char,
66
+ patch("xpk.commands.cluster_gcluster.KueueManager") as mock_kueue_manager,
67
+ ):
68
+ yield {
69
+ "xpk_exit": mock_exit,
70
+ "prepare_kjob": mock_prep_kjob,
71
+ "apply_kjob_crds": mock_apply_kjob,
72
+ "get_cluster_credentials": mock_get_creds,
73
+ "generate_blueprint": mock_gen_bp,
74
+ "prepare_gcluster_manager": mock_prep_gcm,
75
+ "prepare_directories": mock_prep_dirs,
76
+ "check_gcloud_authenticated": mock_check_auth,
77
+ "get_system_characteristics": mock_get_sys_char,
78
+ "KueueManager": mock_kueue_manager,
79
+ }
80
+
81
+
82
+ @patch("xpk.commands.cluster_gcluster.get_total_chips_requested_from_args")
83
+ def test_install_kueue_standard(
84
+ mock_get_total_chips, mock_args, mock_cluster_create_deps
85
+ ):
86
+ """Tests __install_kueue for a standard installation."""
87
+ mock_cluster_create_deps["prepare_kjob"].return_value = 0
88
+ mock_cluster_create_deps["apply_kjob_crds"].return_value = 0
89
+
90
+ mock_system = SystemCharacteristics(
91
+ topology="N/A",
92
+ vms_per_slice=1,
93
+ gke_accelerator="nvidia-h100-mega-80gb",
94
+ gce_machine_type="a3-megagpu-8g",
95
+ chips_per_vm=8,
96
+ accelerator_type=AcceleratorType["GPU"],
97
+ device_type="h100-mega-80gb-8",
98
+ supports_sub_slicing=False,
99
+ )
100
+ mock_cluster_create_deps["get_system_characteristics"].return_value = (
101
+ mock_system,
102
+ 0,
103
+ )
104
+ mock_get_total_chips.return_value = 16
105
+
106
+ cluster_create(mock_args)
107
+
108
+ mock_cluster_create_deps["xpk_exit"].assert_called_with(0)
109
+ mock_kueue_manager = mock_cluster_create_deps["KueueManager"]
110
+ mock_kueue_manager.return_value.install_or_upgrade.assert_called_once()
111
+ call_args, call_kwargs = (
112
+ mock_kueue_manager.return_value.install_or_upgrade.call_args
113
+ )
114
+ kueue_config: KueueConfig = call_args[0]
115
+
116
+ assert kueue_config.system == mock_system
117
+ assert kueue_config.total_chips == 16
118
+ assert not kueue_config.autoprovisioning_enabled
119
+ assert "tolerations" in call_kwargs
120
+ tolerations = call_kwargs["tolerations"]
121
+ assert any(
122
+ t.get("key") == "components.gke.io/gke-managed-components"
123
+ and t.get("effect") == "NoSchedule"
124
+ for t in tolerations
125
+ )
126
+
127
+
128
+ @patch("xpk.commands.cluster_gcluster.enable_autoprovisioning_on_cluster")
129
+ def test_install_kueue_with_autoprovisioning(
130
+ mock_enable_autoprovisioning, mock_args, mock_cluster_create_deps
131
+ ):
132
+ """Tests __install_kueue with autoprovisioning enabled."""
133
+ mock_cluster_create_deps["prepare_kjob"].return_value = 0
134
+ mock_cluster_create_deps["apply_kjob_crds"].return_value = 0
135
+
136
+ mock_args.enable_autoprovisioning = True
137
+ mock_system = SystemCharacteristics(
138
+ topology="N/A",
139
+ vms_per_slice=1,
140
+ gke_accelerator="nvidia-h100-mega-80gb",
141
+ gce_machine_type="a3-megagpu-8g",
142
+ chips_per_vm=8,
143
+ accelerator_type=AcceleratorType["GPU"],
144
+ device_type="h100-mega-80gb-8",
145
+ supports_sub_slicing=False,
146
+ )
147
+ mock_cluster_create_deps["get_system_characteristics"].return_value = (
148
+ mock_system,
149
+ 0,
150
+ )
151
+
152
+ mock_autoprovisioning_config = MagicMock()
153
+ mock_autoprovisioning_config.maximum_chips = 128
154
+ mock_enable_autoprovisioning.return_value = (mock_autoprovisioning_config, 0)
155
+
156
+ cluster_create(mock_args)
157
+
158
+ mock_cluster_create_deps["xpk_exit"].assert_called_with(0)
159
+ mock_enable_autoprovisioning.assert_called_once_with(mock_args, mock_system)
160
+ mock_kueue_manager = mock_cluster_create_deps["KueueManager"]
161
+ mock_kueue_manager.return_value.install_or_upgrade.assert_called_once()
162
+
163
+ call_args, call_kwargs = (
164
+ mock_kueue_manager.return_value.install_or_upgrade.call_args
165
+ )
166
+ kueue_config: KueueConfig = call_args[0]
167
+
168
+ assert kueue_config.system == mock_system
169
+ assert kueue_config.total_chips == 128
170
+ assert kueue_config.autoprovisioning_enabled
171
+ assert "tolerations" in call_kwargs
172
+ tolerations = call_kwargs["tolerations"]
173
+ assert any(
174
+ t.get("key") == "components.gke.io/gke-managed-components"
175
+ and t.get("effect") == "NoSchedule"
176
+ for t in tolerations
177
+ )
xpk/commands/common.py CHANGED
@@ -16,8 +16,9 @@ limitations under the License.
16
16
 
17
17
  from ..core.commands import run_command_with_updates_retry
18
18
  from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
19
- from ..core.gcloud_context import zone_to_region
19
+ from ..core.gcloud_context import get_cluster_location
20
20
  from ..utils.console import xpk_print, xpk_exit
21
+ from ..utils.execution_context import is_dry_run
21
22
  from ..core.system_characteristics import (
22
23
  SystemCharacteristics,
23
24
  )
@@ -34,16 +35,12 @@ def set_cluster_command(args) -> int:
34
35
  """
35
36
  command = (
36
37
  'gcloud container clusters get-credentials'
37
- f' {args.cluster} --region={zone_to_region(args.zone)}'
38
- ' --dns-endpoint'
39
- f' --project={args.project} &&'
40
- ' kubectl config view && kubectl config set-context --current'
41
- ' --namespace=default'
38
+ f' {args.cluster} --location={get_cluster_location(args.project, args.cluster, args.zone)} --dns-endpoint'
39
+ f' --project={args.project} && kubectl config view && kubectl config'
40
+ ' set-context --current --namespace=default'
42
41
  )
43
42
  task = f'get-credentials to cluster {args.cluster}'
44
- return_code = run_command_with_updates_retry(
45
- command, task, args, verbose=False
46
- )
43
+ return_code = run_command_with_updates_retry(command, task, verbose=False)
47
44
  if return_code != 0:
48
45
  xpk_print(f'{task} returned ERROR {return_code}')
49
46
  return return_code
@@ -52,16 +49,11 @@ def set_cluster_command(args) -> int:
52
49
  def is_TAS_possible(
53
50
  system_characteristics: SystemCharacteristics | None,
54
51
  capacity_type: CapacityType | None,
55
- flex: bool,
56
52
  ) -> bool:
57
- """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
53
+ """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible"""
58
54
 
59
- Args:
60
- args: user provided arguments for running the command.
61
-
62
- Returns:
63
- True if possible and False otherwise.
64
- """
55
+ if is_dry_run():
56
+ return True
65
57
 
66
58
  if system_characteristics is None:
67
59
  xpk_print('system_characteristics data was not found in configmaps.')
@@ -71,13 +63,7 @@ def is_TAS_possible(
71
63
  xpk_print('capacity_type data was not found in configmaps.')
72
64
  xpk_exit(1)
73
65
 
74
- if not flex:
75
- return False
76
-
77
- if (
78
- system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
79
- and capacity_type != CapacityType.RESERVATION
80
- ):
81
- return False
82
-
83
- return True
66
+ return (
67
+ system_characteristics.device_type != H100_MEGA_DEVICE_TYPE
68
+ or capacity_type == CapacityType.RESERVATION
69
+ )
xpk/commands/info.py CHANGED
@@ -22,8 +22,8 @@ from tabulate import tabulate
22
22
  from ..core.commands import run_command_for_value
23
23
  from ..core.cluster import get_cluster_credentials
24
24
  from ..core.gcloud_context import add_zone_and_project
25
- from ..core.kueue import verify_kueuectl
26
25
  from ..utils.console import xpk_exit, xpk_print
26
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
27
27
 
28
28
  table_fmt = 'plain'
29
29
 
@@ -36,10 +36,15 @@ def info(args: Namespace) -> None:
36
36
  Returns:
37
37
  None
38
38
  """
39
+ if should_validate_dependencies(args):
40
+ validate_dependencies_list([
41
+ SystemDependency.KUBECTL,
42
+ SystemDependency.GCLOUD,
43
+ SystemDependency.KUEUECTL,
44
+ ])
39
45
  add_zone_and_project(args)
40
46
  get_cluster_credentials(args)
41
47
 
42
- verify_kueuectl(args)
43
48
  lq, cq = bool(args.localqueue), bool(args.clusterqueue)
44
49
  if not lq and not cq:
45
50
  lq, cq = True, True
@@ -48,7 +53,7 @@ def info(args: Namespace) -> None:
48
53
  if lq:
49
54
  lqs = run_kueuectl_list_localqueue(args)
50
55
 
51
- cqs = run_kueuectl_list_clusterqueue(args)
56
+ cqs = run_kueuectl_list_clusterqueue()
52
57
  quotas = get_nominal_quotas(cqs)
53
58
 
54
59
  if lq and lqs is not None:
@@ -214,7 +219,7 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
214
219
  command = 'kubectl kueue list localqueue -o json'
215
220
  if args.namespace != '':
216
221
  command += f' --namespace {args.namespace}'
217
- return_code, val = run_command_for_value(command, 'list localqueue', args)
222
+ return_code, val = run_command_for_value(command, 'list localqueue')
218
223
 
219
224
  if return_code != 0:
220
225
  xpk_print(f'Cluster info request returned ERROR {return_code}')
@@ -222,18 +227,15 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
222
227
  return val
223
228
 
224
229
 
225
- def run_kueuectl_list_clusterqueue(args: Namespace) -> str:
230
+ def run_kueuectl_list_clusterqueue() -> str:
226
231
  """Run the kueuectl list clusterqueue command.
227
232
 
228
- Args:
229
- args: user provided arguments for running the command.
230
-
231
233
  Returns:
232
234
  kueuectl list clusterqueue formatted as json string
233
235
  """
234
236
  command = 'kubectl kueue list clusterqueue -o json'
235
237
 
236
- return_code, val = run_command_for_value(command, 'list clusterqueue', args)
238
+ return_code, val = run_command_for_value(command, 'list clusterqueue')
237
239
 
238
240
  if return_code != 0:
239
241
  xpk_print(f'Cluster info request returned ERROR {return_code}')
xpk/commands/inspector.py CHANGED
@@ -16,11 +16,12 @@ limitations under the License.
16
16
 
17
17
  from ..core.cluster import get_cluster_credentials
18
18
  from ..core.commands import run_command_for_value
19
- from ..core.gcloud_context import add_zone_and_project, zone_to_region
20
- from ..core.kueue import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
19
+ from ..core.gcloud_context import add_zone_and_project, get_cluster_location
20
+ from ..core.kueue_manager import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
21
21
  from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP
22
22
  from ..utils.console import xpk_exit, xpk_print
23
23
  from ..utils.file import append_tmp_file, write_tmp_file
24
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
24
25
  from .workload import get_workload_list
25
26
 
26
27
 
@@ -41,7 +42,7 @@ def inspector_run_command_helper(
41
42
  prefix = f'Command: {command}\nCommand Description: {command_description}\n'
42
43
  postfix = '========================================================'
43
44
  return_code, command_output = run_command_for_value(
44
- command, f'{command_description}', args
45
+ command, f'{command_description}'
45
46
  )
46
47
 
47
48
  if return_code != 0:
@@ -116,7 +117,10 @@ def inspector(args) -> None:
116
117
  # Future Improvements for inspector:
117
118
  # 2. List what is next in Queue.
118
119
  # 3. Split inspector into different subcommands to parse info easier.
119
-
120
+ if should_validate_dependencies(args):
121
+ validate_dependencies_list(
122
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
123
+ )
120
124
  final_return_code = 0
121
125
  xpk_print(args)
122
126
 
@@ -138,8 +142,9 @@ def inspector(args) -> None:
138
142
  (
139
143
  (
140
144
  'gcloud beta container clusters list --project'
141
- f' {args.project} --region {zone_to_region(args.zone)} | grep -e'
142
- f' NAME -e {args.cluster}'
145
+ f' {args.project} --location'
146
+ f' {get_cluster_location(args.project, args.cluster, args.zone)} |'
147
+ f' grep -e NAME -e {args.cluster}'
143
148
  ),
144
149
  'GKE: Cluster Details',
145
150
  ),
@@ -160,7 +165,7 @@ def inspector(args) -> None:
160
165
  (
161
166
  (
162
167
  f'gcloud beta container node-pools list --cluster {args.cluster} '
163
- f' --project={args.project} --region={zone_to_region(args.zone)}'
168
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
164
169
  ),
165
170
  'GKE: Node pool Details',
166
171
  ),
@@ -309,19 +314,25 @@ def inspector(args) -> None:
309
314
  workload_links = [(
310
315
  f'Cloud Console for the workload {args.workload}',
311
316
  # pylint: disable=line-too-long
312
- f'https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}',
317
+ (
318
+ f'https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
319
+ ),
313
320
  )]
314
321
 
315
322
  links = [
316
323
  (
317
324
  'Cloud Console for the GKE Cluster',
318
325
  # pylint: disable=line-too-long
319
- f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
326
+ (
327
+ f'https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
328
+ ),
320
329
  ),
321
330
  (
322
331
  'Cloud Console for all workloads in GKE Cluster',
323
332
  # pylint: disable=line-too-long
324
- f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
333
+ (
334
+ f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{get_cluster_location(args.project, args.cluster, args.zone)}%2F{args.cluster}))'
335
+ ),
325
336
  ),
326
337
  (
327
338
  'Cloud Console for IAM Permissions',
@@ -346,7 +357,7 @@ def inspector(args) -> None:
346
357
  )
347
358
 
348
359
  # Summarize inspector:
349
- xpk_print(f'Find xpk inspector output file: {inspector_file.name}')
360
+ xpk_print(f'Find xpk inspector output file: {inspector_file}')
350
361
 
351
362
  if final_return_code != 0:
352
363
  xpk_print(
xpk/commands/job.py CHANGED
@@ -25,9 +25,32 @@ from ..core.cluster import get_cluster_credentials
25
25
  from ..core.gcloud_context import add_zone_and_project
26
26
  from ..core.kjob import AppProfileDefaults
27
27
  from ..utils.console import xpk_exit, xpk_print
28
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
28
29
  from .kind import set_local_cluster_command
29
30
 
30
31
 
32
+ JOBS_DRY_RUN_YAML = """
33
+ items:
34
+ - apiVersion: slurm.k8s.io/v1alpha1
35
+ kind: SlurmJob
36
+ metadata:
37
+ annotations:
38
+ kjobctl.x-k8s.io/script: echo hello
39
+ creationTimestamp: '2024-04-29T12:00:00Z'
40
+ labels:
41
+ kjobctl.x-k8s.io/app-profile: default
42
+ name: golden-job
43
+ namespace: default
44
+ spec:
45
+ script: echo hello
46
+ """
47
+
48
+ PODS_DRY_RUN_RESULT = """
49
+ foo-pod 2/2 Running 0 2d
50
+ bar-pod 1/1 Evicted 0 1d
51
+ """
52
+
53
+
31
54
  def job_info(args):
32
55
  """Run commands obtaining information about a job given by name.
33
56
 
@@ -37,12 +60,16 @@ def job_info(args):
37
60
  Returns:
38
61
  None
39
62
  """
63
+ if should_validate_dependencies(args):
64
+ validate_dependencies_list([
65
+ SystemDependency.KUBECTL,
66
+ SystemDependency.KJOB,
67
+ SystemDependency.GCLOUD,
68
+ ])
40
69
  job_name = args.name
41
70
 
42
71
  desc_command = f'kubectl-kjob describe slurm {job_name}'
43
- desc_code, desc_text = run_command_for_value(
44
- desc_command, 'Getting job data', args
45
- )
72
+ desc_code, desc_text = run_command_for_value(desc_command, 'Getting job data')
46
73
  if desc_code != 0:
47
74
  xpk_print(f'Data info request returned ERROR {desc_code}')
48
75
  xpk_exit(desc_code)
@@ -52,7 +79,9 @@ def job_info(args):
52
79
  f' metadata.name=={job_name}'
53
80
  )
54
81
  job_code, job_text = run_command_for_value(
55
- job_command, 'Getting job info', args
82
+ job_command,
83
+ 'Getting job info',
84
+ dry_run_return_val=JOBS_DRY_RUN_YAML,
56
85
  )
57
86
  if job_code != 0:
58
87
  xpk_print(f'Job info request returned ERROR {job_code}')
@@ -60,7 +89,9 @@ def job_info(args):
60
89
 
61
90
  pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers'
62
91
  pods_code, pods_text = run_command_for_value(
63
- pods_command, 'Getting pods list', args
92
+ pods_command,
93
+ 'Getting pods list',
94
+ dry_run_return_val=PODS_DRY_RUN_RESULT,
64
95
  )
65
96
  if pods_code != 0:
66
97
  xpk_print(f'Pods list request returned ERROR {pods_code}')
@@ -143,6 +174,12 @@ def job_list(args) -> None:
143
174
  Returns:
144
175
  None
145
176
  """
177
+ if should_validate_dependencies(args):
178
+ validate_dependencies_list([
179
+ SystemDependency.KUBECTL,
180
+ SystemDependency.KJOB,
181
+ SystemDependency.GCLOUD,
182
+ ])
146
183
  if not args.kind_cluster:
147
184
  add_zone_and_project(args)
148
185
  get_cluster_credentials(args)
@@ -155,14 +192,14 @@ def job_list(args) -> None:
155
192
 
156
193
  xpk_print(msg, flush=True)
157
194
 
158
- return_code = run_slurm_job_list_command(args)
195
+ return_code = run_slurm_job_list_command()
159
196
  xpk_exit(return_code)
160
197
 
161
198
 
162
- def run_slurm_job_list_command(args) -> int:
199
+ def run_slurm_job_list_command() -> int:
163
200
  cmd = f'kubectl-kjob list slurm --profile {AppProfileDefaults.NAME.value}'
164
201
 
165
- return_code = run_command_with_updates(cmd, 'list jobs', args)
202
+ return_code = run_command_with_updates(cmd, 'list jobs')
166
203
  if return_code != 0:
167
204
  xpk_print(f'Listing jobs returned ERROR {return_code}')
168
205
  return return_code
@@ -177,6 +214,13 @@ def job_cancel(args) -> None:
177
214
  Returns:
178
215
  None
179
216
  """
217
+ if should_validate_dependencies(args):
218
+ validate_dependencies_list([
219
+ SystemDependency.KUBECTL,
220
+ SystemDependency.KJOB,
221
+ SystemDependency.GCLOUD,
222
+ ])
223
+
180
224
  xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
181
225
  if not args.kind_cluster:
182
226
  add_zone_and_project(args)
@@ -194,7 +238,7 @@ def run_slurm_job_delete_command(args) -> int:
194
238
  list_of_jobs = ' '.join(args.name)
195
239
  cmd = f'kubectl-kjob delete slurm {list_of_jobs}'
196
240
 
197
- return_code = run_command_with_updates(cmd, 'delete job', args)
241
+ return_code = run_command_with_updates(cmd, 'delete job')
198
242
  if return_code != 0:
199
243
  xpk_print(f'Delete job request returned ERROR {return_code}')
200
244
  return return_code