xpk 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. xpk/commands/batch.py +9 -2
  2. xpk/commands/cluster.py +128 -115
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +10 -28
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +21 -10
  8. xpk/commands/job.py +25 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +21 -0
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +43 -22
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +91 -194
  20. xpk/core/cluster_private.py +6 -11
  21. xpk/core/commands.py +11 -18
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +3 -4
  24. xpk/core/gcloud_context.py +26 -2
  25. xpk/core/gcloud_context_test.py +96 -0
  26. xpk/core/gcluster_manager.py +0 -3
  27. xpk/core/jobset.py +4 -7
  28. xpk/core/kjob.py +14 -27
  29. xpk/core/kueue_manager.py +383 -0
  30. xpk/core/kueue_manager_test.py +542 -0
  31. xpk/core/monitoring.py +1 -1
  32. xpk/core/nap.py +10 -15
  33. xpk/core/network.py +17 -18
  34. xpk/core/nodepool.py +66 -77
  35. xpk/core/nodepool_test.py +198 -1
  36. xpk/core/pathways.py +5 -5
  37. xpk/core/ray.py +10 -14
  38. xpk/core/resources.py +6 -11
  39. xpk/core/scheduling.py +19 -1
  40. xpk/core/scheduling_test.py +31 -0
  41. xpk/core/system_characteristics.py +335 -229
  42. xpk/core/vertex.py +1 -1
  43. xpk/core/workload.py +7 -8
  44. xpk/main.py +2 -4
  45. xpk/parser/cluster.py +7 -0
  46. xpk/parser/cluster_test.py +66 -0
  47. xpk/parser/common.py +11 -0
  48. xpk/parser/workload.py +62 -25
  49. xpk/parser/workload_test.py +82 -0
  50. xpk/utils/feature_flags.py +28 -0
  51. xpk/utils/kueue.py +20 -0
  52. xpk/utils/templates.py +2 -0
  53. xpk/utils/topology.py +37 -0
  54. xpk/utils/topology_test.py +43 -0
  55. xpk/utils/validation.py +79 -55
  56. xpk/utils/validation_test.py +37 -0
  57. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  58. xpk-0.14.0.dist-info/RECORD +112 -0
  59. xpk/core/kueue.py +0 -561
  60. xpk-0.13.0.dist-info/RECORD +0 -101
  61. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  62. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  63. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  64. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
@@ -16,6 +16,12 @@ limitations under the License.
16
16
 
17
17
  import os
18
18
 
19
+ from ..utils.execution_context import is_dry_run
20
+ from ..core.kueue_manager import KueueConfig, KueueManager
21
+ from ..core.nap import enable_autoprovisioning_on_cluster
22
+ from ..core.scheduling import get_total_chips_requested_from_args
23
+ from ..core.system_characteristics import get_system_characteristics
24
+
19
25
  from ..core.blueprint.blueprint_generator import (
20
26
  BlueprintGenerator,
21
27
  BlueprintGeneratorOutput,
@@ -75,22 +81,29 @@ def cluster_create(args) -> None:
75
81
  bp = generate_blueprint(blueprint_name=unique_name, args=args, prefix=prefix)
76
82
 
77
83
  # staging: sending the blueprint file(s) to gcluster's working directory
78
- bp_staged_path = gcm.stage_files(
79
- blueprint_file=bp.blueprint_file,
80
- blueprint_dependencies=bp.blueprint_dependencies,
81
- prefix=prefix,
82
- )
83
- gcm.deploy(
84
- blueprint_path=bp_staged_path,
85
- deployment_name=unique_name,
86
- prefix=prefix,
87
- )
88
- if args.cluster_state_gcs_bucket is not None:
89
- gcm.upload_state()
84
+ if is_dry_run():
85
+ xpk_print(f'Blueprint file: {bp.blueprint_file}')
86
+ else:
87
+ bp_staged_path = gcm.stage_files(
88
+ blueprint_file=bp.blueprint_file,
89
+ blueprint_dependencies=bp.blueprint_dependencies,
90
+ prefix=prefix,
91
+ )
92
+ gcm.deploy(
93
+ blueprint_path=bp_staged_path,
94
+ deployment_name=unique_name,
95
+ prefix=prefix,
96
+ )
97
+ if args.cluster_state_gcs_bucket is not None:
98
+ gcm.upload_state()
90
99
 
91
100
  get_cluster_credentials(args)
92
101
 
93
- err_code = apply_kjob_crds(args)
102
+ err_code = __install_kueue(args)
103
+ if err_code > 0:
104
+ xpk_exit(err_code)
105
+
106
+ err_code = apply_kjob_crds()
94
107
  if err_code > 0:
95
108
  xpk_exit(err_code)
96
109
 
@@ -101,6 +114,57 @@ def cluster_create(args) -> None:
101
114
  xpk_exit(0)
102
115
 
103
116
 
117
def __install_kueue(args) -> int:
  """Installs or upgrades Kueue for the cluster described by `args`.

  Resolves the system characteristics, optionally enables autoprovisioning,
  works out how many chips Kueue should manage and hands the resulting
  configuration to `KueueManager.install_or_upgrade`.

  Args:
    args: user provided arguments for running the command. This function reads
      `enable_autoprovisioning`, `num_slices`, `memory_limit`, `cpu_limit`,
      `enable_pathways` and `flex` from it.

  Returns:
    0 if successful and a non-zero error code otherwise.
  """
  system, return_code = get_system_characteristics(args)

  if return_code > 0 or system is None:
    xpk_print('Fetching system characteristics failed!')
    # The caller only aborts on a code > 0. A missing system paired with a
    # zero return code must still be reported as a failure, otherwise cluster
    # creation would silently continue without Kueue being installed.
    return return_code if return_code > 0 else 1

  # Provision node pools dynamically based on incoming workloads:
  # Currently autoprovisioning is not supported with Pathways.
  autoprovisioning_config = None
  if args.enable_autoprovisioning:
    xpk_print('Enabling Autoprovisioning')
    autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
        args, system
    )
    if return_code != 0:
      return return_code

  autoprovisioning_enabled = False
  if autoprovisioning_config:
    # Determine total resources available based on autoprovisioning max chips.
    autoprovisioning_enabled = True
    total_chips = autoprovisioning_config.maximum_chips
  else:
    # Determine total chips based on user specified topology.
    total_chips = get_total_chips_requested_from_args(args, system)
  kueue_manager = KueueManager()

  # Toleration for the GKE managed-components taint, passed through to the
  # Kueue installation.
  tolerations = [{
      'key': 'components.gke.io/gke-managed-components',
      'operator': 'Equal',
      'value': 'true',
      'effect': 'NoSchedule',
  }]

  # NOTE(review): the result of install_or_upgrade is not inspected here —
  # confirm whether it reports an error code that should be propagated.
  kueue_manager.install_or_upgrade(
      KueueConfig(
          system,
          total_chips=total_chips,
          autoprovisioning_enabled=autoprovisioning_enabled,
          num_slices=args.num_slices,
          memory_limit=args.memory_limit,
          cpu_limit=args.cpu_limit,
          is_pathways_cluster=args.enable_pathways,
          flex=args.flex,
      ),
      tolerations=tolerations,
  )
  return 0
166
+
167
+
104
168
  def cluster_delete(args) -> None:
105
169
  """Function around cluster delete for the clusters created by Cluster toolkit.
106
170
 
@@ -213,7 +277,6 @@ def validate_state_gcs_bucket(args):
213
277
  err_code, _ = run_command_for_value(
214
278
  bucket_validate_cmd,
215
279
  'Validate remote state bucket existence.',
216
- global_args=args,
217
280
  )
218
281
  if err_code != 0:
219
282
  xpk_exit(err_code)
@@ -0,0 +1,177 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from unittest.mock import MagicMock, patch
18
+
19
+ import pytest
20
+
21
+ from xpk.commands.cluster_gcluster import cluster_create
22
+ from xpk.core.kueue_manager import KueueConfig
23
+ from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
24
+
25
+
26
@pytest.fixture
def mock_args():
  """Builds a MagicMock standing in for the parsed command-line arguments.

  Only the attributes listed below are pinned to concrete values; any other
  attribute access falls back to MagicMock's auto-created child mocks.
  """
  args = MagicMock()
  args.configure_mock(
      enable_autoprovisioning=False,
      num_slices=1,
      memory_limit="200G",
      cpu_limit="50",
      enable_pathways=False,
      flex=False,
      project="test-project",
      cluster="test-cluster",
      zone="us-central1-c",
      cluster_state_gcs_bucket=None,
  )
  return args
41
+
42
+
43
@pytest.fixture
def mock_cluster_create_deps(request):
  """Patches the collaborators that cluster_create looks up in its module.

  Every name below is replaced inside `xpk.commands.cluster_gcluster` for the
  duration of the test. The fixture value is a dict mapping each patched name
  to the mock that replaced it.
  """
  patched_names = (
      "xpk_exit",
      "prepare_kjob",
      "apply_kjob_crds",
      "get_cluster_credentials",
      "generate_blueprint",
      "prepare_gcluster_manager",
      "prepare_directories",
      "check_gcloud_authenticated",
      "get_system_characteristics",
      "KueueManager",
  )

  mocks = {}
  for name in patched_names:
    patcher = patch(f"xpk.commands.cluster_gcluster.{name}")
    mocks[name] = patcher.start()
    # Finalizers run in reverse registration order, so the patches are undone
    # last-in-first-out, just like leaving a multi-item `with` statement.
    request.addfinalizer(patcher.stop)
  return mocks
80
+
81
+
82
@patch("xpk.commands.cluster_gcluster.get_total_chips_requested_from_args")
def test_install_kueue_standard(
    mock_get_total_chips, mock_args, mock_cluster_create_deps
):
  """cluster_create installs Kueue using the chip count derived from args."""
  deps = mock_cluster_create_deps
  for step in ("prepare_kjob", "apply_kjob_crds"):
    deps[step].return_value = 0

  mock_system = SystemCharacteristics(
      topology="N/A",
      vms_per_slice=1,
      gke_accelerator="nvidia-h100-mega-80gb",
      gce_machine_type="a3-megagpu-8g",
      chips_per_vm=8,
      accelerator_type=AcceleratorType["GPU"],
      device_type="h100-mega-80gb-8",
      supports_sub_slicing=False,
  )
  deps["get_system_characteristics"].return_value = (mock_system, 0)
  mock_get_total_chips.return_value = 16

  cluster_create(mock_args)

  deps["xpk_exit"].assert_called_with(0)
  install_or_upgrade = deps["KueueManager"].return_value.install_or_upgrade
  install_or_upgrade.assert_called_once()
  positional, keywords = install_or_upgrade.call_args
  kueue_config: KueueConfig = positional[0]

  # Without autoprovisioning the chip total comes from the patched helper.
  assert kueue_config.system == mock_system
  assert kueue_config.total_chips == 16
  assert not kueue_config.autoprovisioning_enabled
  assert "tolerations" in keywords
  assert any(
      toleration.get("key") == "components.gke.io/gke-managed-components"
      and toleration.get("effect") == "NoSchedule"
      for toleration in keywords["tolerations"]
  )
126
+
127
+
128
@patch("xpk.commands.cluster_gcluster.enable_autoprovisioning_on_cluster")
def test_install_kueue_with_autoprovisioning(
    mock_enable_autoprovisioning, mock_args, mock_cluster_create_deps
):
  """cluster_create sizes Kueue from the autoprovisioning maximum chips."""
  deps = mock_cluster_create_deps
  for step in ("prepare_kjob", "apply_kjob_crds"):
    deps[step].return_value = 0

  mock_args.enable_autoprovisioning = True
  mock_system = SystemCharacteristics(
      topology="N/A",
      vms_per_slice=1,
      gke_accelerator="nvidia-h100-mega-80gb",
      gce_machine_type="a3-megagpu-8g",
      chips_per_vm=8,
      accelerator_type=AcceleratorType["GPU"],
      device_type="h100-mega-80gb-8",
      supports_sub_slicing=False,
  )
  deps["get_system_characteristics"].return_value = (mock_system, 0)

  autoprovisioning_config = MagicMock()
  autoprovisioning_config.maximum_chips = 128
  mock_enable_autoprovisioning.return_value = (autoprovisioning_config, 0)

  cluster_create(mock_args)

  deps["xpk_exit"].assert_called_with(0)
  mock_enable_autoprovisioning.assert_called_once_with(mock_args, mock_system)
  install_or_upgrade = deps["KueueManager"].return_value.install_or_upgrade
  install_or_upgrade.assert_called_once()

  positional, keywords = install_or_upgrade.call_args
  kueue_config: KueueConfig = positional[0]

  # With autoprovisioning the chip total is the configured maximum.
  assert kueue_config.system == mock_system
  assert kueue_config.total_chips == 128
  assert kueue_config.autoprovisioning_enabled
  assert "tolerations" in keywords
  assert any(
      toleration.get("key") == "components.gke.io/gke-managed-components"
      and toleration.get("effect") == "NoSchedule"
      for toleration in keywords["tolerations"]
  )
xpk/commands/common.py CHANGED
@@ -16,7 +16,7 @@ limitations under the License.
16
16
 
17
17
  from ..core.commands import run_command_with_updates_retry
18
18
  from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
19
- from ..core.gcloud_context import zone_to_region
19
+ from ..core.gcloud_context import get_cluster_location
20
20
  from ..utils.console import xpk_print, xpk_exit
21
21
  from ..utils.execution_context import is_dry_run
22
22
  from ..core.system_characteristics import (
@@ -35,16 +35,12 @@ def set_cluster_command(args) -> int:
35
35
  """
36
36
  command = (
37
37
  'gcloud container clusters get-credentials'
38
- f' {args.cluster} --region={zone_to_region(args.zone)}'
39
- ' --dns-endpoint'
40
- f' --project={args.project} &&'
41
- ' kubectl config view && kubectl config set-context --current'
42
- ' --namespace=default'
38
+ f' {args.cluster} --location={get_cluster_location(args.project, args.cluster, args.zone)} --dns-endpoint'
39
+ f' --project={args.project} && kubectl config view && kubectl config'
40
+ ' set-context --current --namespace=default'
43
41
  )
44
42
  task = f'get-credentials to cluster {args.cluster}'
45
- return_code = run_command_with_updates_retry(
46
- command, task, args, verbose=False
47
- )
43
+ return_code = run_command_with_updates_retry(command, task, verbose=False)
48
44
  if return_code != 0:
49
45
  xpk_print(f'{task} returned ERROR {return_code}')
50
46
  return return_code
@@ -53,16 +49,8 @@ def set_cluster_command(args) -> int:
53
49
  def is_TAS_possible(
54
50
  system_characteristics: SystemCharacteristics | None,
55
51
  capacity_type: CapacityType | None,
56
- flex: bool,
57
52
  ) -> bool:
58
- """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
59
-
60
- Args:
61
- args: user provided arguments for running the command.
62
-
63
- Returns:
64
- True if possible and False otherwise.
65
- """
53
+ """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible"""
66
54
 
67
55
  if is_dry_run():
68
56
  return True
@@ -75,13 +63,7 @@ def is_TAS_possible(
75
63
  xpk_print('capacity_type data was not found in configmaps.')
76
64
  xpk_exit(1)
77
65
 
78
- if not flex:
79
- return False
80
-
81
- if (
82
- system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
83
- and capacity_type != CapacityType.RESERVATION
84
- ):
85
- return False
86
-
87
- return True
66
+ return (
67
+ system_characteristics.device_type != H100_MEGA_DEVICE_TYPE
68
+ or capacity_type == CapacityType.RESERVATION
69
+ )
xpk/commands/info.py CHANGED
@@ -22,8 +22,8 @@ from tabulate import tabulate
22
22
  from ..core.commands import run_command_for_value
23
23
  from ..core.cluster import get_cluster_credentials
24
24
  from ..core.gcloud_context import add_zone_and_project
25
- from ..core.kueue import verify_kueuectl
26
25
  from ..utils.console import xpk_exit, xpk_print
26
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
27
27
 
28
28
  table_fmt = 'plain'
29
29
 
@@ -36,10 +36,15 @@ def info(args: Namespace) -> None:
36
36
  Returns:
37
37
  None
38
38
  """
39
+ if should_validate_dependencies(args):
40
+ validate_dependencies_list([
41
+ SystemDependency.KUBECTL,
42
+ SystemDependency.GCLOUD,
43
+ SystemDependency.KUEUECTL,
44
+ ])
39
45
  add_zone_and_project(args)
40
46
  get_cluster_credentials(args)
41
47
 
42
- verify_kueuectl(args)
43
48
  lq, cq = bool(args.localqueue), bool(args.clusterqueue)
44
49
  if not lq and not cq:
45
50
  lq, cq = True, True
@@ -48,7 +53,7 @@ def info(args: Namespace) -> None:
48
53
  if lq:
49
54
  lqs = run_kueuectl_list_localqueue(args)
50
55
 
51
- cqs = run_kueuectl_list_clusterqueue(args)
56
+ cqs = run_kueuectl_list_clusterqueue()
52
57
  quotas = get_nominal_quotas(cqs)
53
58
 
54
59
  if lq and lqs is not None:
@@ -214,7 +219,7 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
214
219
  command = 'kubectl kueue list localqueue -o json'
215
220
  if args.namespace != '':
216
221
  command += f' --namespace {args.namespace}'
217
- return_code, val = run_command_for_value(command, 'list localqueue', args)
222
+ return_code, val = run_command_for_value(command, 'list localqueue')
218
223
 
219
224
  if return_code != 0:
220
225
  xpk_print(f'Cluster info request returned ERROR {return_code}')
@@ -222,18 +227,15 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
222
227
  return val
223
228
 
224
229
 
225
- def run_kueuectl_list_clusterqueue(args: Namespace) -> str:
230
+ def run_kueuectl_list_clusterqueue() -> str:
226
231
  """Run the kueuectl list clusterqueue command.
227
232
 
228
- Args:
229
- args: user provided arguments for running the command.
230
-
231
233
  Returns:
232
234
  kueuectl list clusterqueue formatted as json string
233
235
  """
234
236
  command = 'kubectl kueue list clusterqueue -o json'
235
237
 
236
- return_code, val = run_command_for_value(command, 'list clusterqueue', args)
238
+ return_code, val = run_command_for_value(command, 'list clusterqueue')
237
239
 
238
240
  if return_code != 0:
239
241
  xpk_print(f'Cluster info request returned ERROR {return_code}')
xpk/commands/inspector.py CHANGED
@@ -16,11 +16,12 @@ limitations under the License.
16
16
 
17
17
  from ..core.cluster import get_cluster_credentials
18
18
  from ..core.commands import run_command_for_value
19
- from ..core.gcloud_context import add_zone_and_project, zone_to_region
20
- from ..core.kueue import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
19
+ from ..core.gcloud_context import add_zone_and_project, get_cluster_location
20
+ from ..core.kueue_manager import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
21
21
  from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP
22
22
  from ..utils.console import xpk_exit, xpk_print
23
23
  from ..utils.file import append_tmp_file, write_tmp_file
24
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
24
25
  from .workload import get_workload_list
25
26
 
26
27
 
@@ -41,7 +42,7 @@ def inspector_run_command_helper(
41
42
  prefix = f'Command: {command}\nCommand Description: {command_description}\n'
42
43
  postfix = '========================================================'
43
44
  return_code, command_output = run_command_for_value(
44
- command, f'{command_description}', args
45
+ command, f'{command_description}'
45
46
  )
46
47
 
47
48
  if return_code != 0:
@@ -116,7 +117,10 @@ def inspector(args) -> None:
116
117
  # Future Improvements for inspector:
117
118
  # 2. List what is next in Queue.
118
119
  # 3. Split inspector into different subcommands to parse info easier.
119
-
120
+ if should_validate_dependencies(args):
121
+ validate_dependencies_list(
122
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
123
+ )
120
124
  final_return_code = 0
121
125
  xpk_print(args)
122
126
 
@@ -138,8 +142,9 @@ def inspector(args) -> None:
138
142
  (
139
143
  (
140
144
  'gcloud beta container clusters list --project'
141
- f' {args.project} --region {zone_to_region(args.zone)} | grep -e'
142
- f' NAME -e {args.cluster}'
145
+ f' {args.project} --location'
146
+ f' {get_cluster_location(args.project, args.cluster, args.zone)} |'
147
+ f' grep -e NAME -e {args.cluster}'
143
148
  ),
144
149
  'GKE: Cluster Details',
145
150
  ),
@@ -160,7 +165,7 @@ def inspector(args) -> None:
160
165
  (
161
166
  (
162
167
  f'gcloud beta container node-pools list --cluster {args.cluster} '
163
- f' --project={args.project} --region={zone_to_region(args.zone)}'
168
+ f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
164
169
  ),
165
170
  'GKE: Node pool Details',
166
171
  ),
@@ -309,19 +314,25 @@ def inspector(args) -> None:
309
314
  workload_links = [(
310
315
  f'Cloud Console for the workload {args.workload}',
311
316
  # pylint: disable=line-too-long
312
- f'https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}',
317
+ (
318
+ f'https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
319
+ ),
313
320
  )]
314
321
 
315
322
  links = [
316
323
  (
317
324
  'Cloud Console for the GKE Cluster',
318
325
  # pylint: disable=line-too-long
319
- f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
326
+ (
327
+ f'https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
328
+ ),
320
329
  ),
321
330
  (
322
331
  'Cloud Console for all workloads in GKE Cluster',
323
332
  # pylint: disable=line-too-long
324
- f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
333
+ (
334
+ f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{get_cluster_location(args.project, args.cluster, args.zone)}%2F{args.cluster}))'
335
+ ),
325
336
  ),
326
337
  (
327
338
  'Cloud Console for IAM Permissions',
xpk/commands/job.py CHANGED
@@ -25,6 +25,7 @@ from ..core.cluster import get_cluster_credentials
25
25
  from ..core.gcloud_context import add_zone_and_project
26
26
  from ..core.kjob import AppProfileDefaults
27
27
  from ..utils.console import xpk_exit, xpk_print
28
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
28
29
  from .kind import set_local_cluster_command
29
30
 
30
31
 
@@ -59,12 +60,16 @@ def job_info(args):
59
60
  Returns:
60
61
  None
61
62
  """
63
+ if should_validate_dependencies(args):
64
+ validate_dependencies_list([
65
+ SystemDependency.KUBECTL,
66
+ SystemDependency.KJOB,
67
+ SystemDependency.GCLOUD,
68
+ ])
62
69
  job_name = args.name
63
70
 
64
71
  desc_command = f'kubectl-kjob describe slurm {job_name}'
65
- desc_code, desc_text = run_command_for_value(
66
- desc_command, 'Getting job data', args
67
- )
72
+ desc_code, desc_text = run_command_for_value(desc_command, 'Getting job data')
68
73
  if desc_code != 0:
69
74
  xpk_print(f'Data info request returned ERROR {desc_code}')
70
75
  xpk_exit(desc_code)
@@ -76,7 +81,6 @@ def job_info(args):
76
81
  job_code, job_text = run_command_for_value(
77
82
  job_command,
78
83
  'Getting job info',
79
- args,
80
84
  dry_run_return_val=JOBS_DRY_RUN_YAML,
81
85
  )
82
86
  if job_code != 0:
@@ -87,7 +91,6 @@ def job_info(args):
87
91
  pods_code, pods_text = run_command_for_value(
88
92
  pods_command,
89
93
  'Getting pods list',
90
- args,
91
94
  dry_run_return_val=PODS_DRY_RUN_RESULT,
92
95
  )
93
96
  if pods_code != 0:
@@ -171,6 +174,12 @@ def job_list(args) -> None:
171
174
  Returns:
172
175
  None
173
176
  """
177
+ if should_validate_dependencies(args):
178
+ validate_dependencies_list([
179
+ SystemDependency.KUBECTL,
180
+ SystemDependency.KJOB,
181
+ SystemDependency.GCLOUD,
182
+ ])
174
183
  if not args.kind_cluster:
175
184
  add_zone_and_project(args)
176
185
  get_cluster_credentials(args)
@@ -183,14 +192,14 @@ def job_list(args) -> None:
183
192
 
184
193
  xpk_print(msg, flush=True)
185
194
 
186
- return_code = run_slurm_job_list_command(args)
195
+ return_code = run_slurm_job_list_command()
187
196
  xpk_exit(return_code)
188
197
 
189
198
 
190
- def run_slurm_job_list_command(args) -> int:
199
+ def run_slurm_job_list_command() -> int:
191
200
  cmd = f'kubectl-kjob list slurm --profile {AppProfileDefaults.NAME.value}'
192
201
 
193
- return_code = run_command_with_updates(cmd, 'list jobs', args)
202
+ return_code = run_command_with_updates(cmd, 'list jobs')
194
203
  if return_code != 0:
195
204
  xpk_print(f'Listing jobs returned ERROR {return_code}')
196
205
  return return_code
@@ -205,6 +214,13 @@ def job_cancel(args) -> None:
205
214
  Returns:
206
215
  None
207
216
  """
217
+ if should_validate_dependencies(args):
218
+ validate_dependencies_list([
219
+ SystemDependency.KUBECTL,
220
+ SystemDependency.KJOB,
221
+ SystemDependency.GCLOUD,
222
+ ])
223
+
208
224
  xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
209
225
  if not args.kind_cluster:
210
226
  add_zone_and_project(args)
@@ -222,7 +238,7 @@ def run_slurm_job_delete_command(args) -> int:
222
238
  list_of_jobs = ' '.join(args.name)
223
239
  cmd = f'kubectl-kjob delete slurm {list_of_jobs}'
224
240
 
225
- return_code = run_command_with_updates(cmd, 'delete job', args)
241
+ return_code = run_command_with_updates(cmd, 'delete job')
226
242
  if return_code != 0:
227
243
  xpk_print(f'Delete job request returned ERROR {return_code}')
228
244
  return return_code