xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +17 -10
- xpk/commands/cluster.py +137 -123
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +13 -27
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +22 -11
- xpk/commands/job.py +53 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +26 -2
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +58 -30
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +96 -195
- xpk/core/cluster_private.py +9 -12
- xpk/core/commands.py +21 -25
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +17 -9
- xpk/core/docker_resources.py +9 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +5 -8
- xpk/core/kjob.py +19 -29
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +11 -16
- xpk/core/network.py +18 -19
- xpk/core/nodepool.py +65 -71
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +9 -5
- xpk/core/ray.py +11 -15
- xpk/core/resources.py +15 -10
- xpk/core/scheduling.py +23 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +3 -2
- xpk/parser/cluster.py +50 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/execution_context.py +28 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/kueue.py +20 -0
- xpk/utils/network.py +4 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -545
- xpk-0.12.0.dist-info/RECORD +0 -100
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster_gcluster.py
CHANGED
|
@@ -16,6 +16,12 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import os
|
|
18
18
|
|
|
19
|
+
from ..utils.execution_context import is_dry_run
|
|
20
|
+
from ..core.kueue_manager import KueueConfig, KueueManager
|
|
21
|
+
from ..core.nap import enable_autoprovisioning_on_cluster
|
|
22
|
+
from ..core.scheduling import get_total_chips_requested_from_args
|
|
23
|
+
from ..core.system_characteristics import get_system_characteristics
|
|
24
|
+
|
|
19
25
|
from ..core.blueprint.blueprint_generator import (
|
|
20
26
|
BlueprintGenerator,
|
|
21
27
|
BlueprintGeneratorOutput,
|
|
@@ -75,22 +81,29 @@ def cluster_create(args) -> None:
|
|
|
75
81
|
bp = generate_blueprint(blueprint_name=unique_name, args=args, prefix=prefix)
|
|
76
82
|
|
|
77
83
|
# staging: sending the blueprint file(s) to gcluster's working directory
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
84
|
+
if is_dry_run():
|
|
85
|
+
xpk_print(f'Blueprint file: {bp.blueprint_file}')
|
|
86
|
+
else:
|
|
87
|
+
bp_staged_path = gcm.stage_files(
|
|
88
|
+
blueprint_file=bp.blueprint_file,
|
|
89
|
+
blueprint_dependencies=bp.blueprint_dependencies,
|
|
90
|
+
prefix=prefix,
|
|
91
|
+
)
|
|
92
|
+
gcm.deploy(
|
|
93
|
+
blueprint_path=bp_staged_path,
|
|
94
|
+
deployment_name=unique_name,
|
|
95
|
+
prefix=prefix,
|
|
96
|
+
)
|
|
97
|
+
if args.cluster_state_gcs_bucket is not None:
|
|
98
|
+
gcm.upload_state()
|
|
90
99
|
|
|
91
100
|
get_cluster_credentials(args)
|
|
92
101
|
|
|
93
|
-
err_code =
|
|
102
|
+
err_code = __install_kueue(args)
|
|
103
|
+
if err_code > 0:
|
|
104
|
+
xpk_exit(err_code)
|
|
105
|
+
|
|
106
|
+
err_code = apply_kjob_crds()
|
|
94
107
|
if err_code > 0:
|
|
95
108
|
xpk_exit(err_code)
|
|
96
109
|
|
|
@@ -101,6 +114,57 @@ def cluster_create(args) -> None:
|
|
|
101
114
|
xpk_exit(0)
|
|
102
115
|
|
|
103
116
|
|
|
117
|
+
def __install_kueue(args) -> int:
  """Install or upgrade Kueue for the cluster described by args.

  Fetches the system characteristics for args, optionally enables
  autoprovisioning, derives the total chip count and passes a KueueConfig
  plus a toleration keyed on 'components.gke.io/gke-managed-components' to
  KueueManager.install_or_upgrade.

  Args:
    args: user provided arguments for running the command. This function
      reads enable_autoprovisioning, num_slices, memory_limit, cpu_limit,
      enable_pathways and flex from it.

  Returns:
    0 on success, otherwise a non-zero error code.
  """
  system, return_code = get_system_characteristics(args)

  if return_code > 0 or system is None:
    xpk_print('Fetching system characteristics failed!')
    # A missing system with a zero return code must still be reported as a
    # failure: the caller only aborts when the returned code is > 0.
    return max(return_code, 1)

  # Provision node pools dynamically based on incoming workloads:
  # Currently autoprovisioning is not supported with Pathways.
  autoprovisioning_config = None
  if args.enable_autoprovisioning:
    xpk_print('Enabling Autoprovisioning')
    autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
        args, system
    )
    if return_code != 0:
      return return_code

  autoprovisioning_enabled = False
  if autoprovisioning_config:
    # Determine total resources available based on autoprovisioning max chips.
    autoprovisioning_enabled = True
    total_chips = autoprovisioning_config.maximum_chips
  else:
    # Determine total chips based on user specified topology.
    total_chips = get_total_chips_requested_from_args(args, system)
  kueue_manager = KueueManager()

  tolerations = [{
      'key': 'components.gke.io/gke-managed-components',
      'operator': 'Equal',
      'value': 'true',
      'effect': 'NoSchedule',
  }]

  # NOTE(review): the result of install_or_upgrade is discarded here, so any
  # failure it reports through a return value is not propagated - confirm
  # against KueueManager whether it should be returned instead of 0.
  kueue_manager.install_or_upgrade(
      KueueConfig(
          system,
          total_chips=total_chips,
          autoprovisioning_enabled=autoprovisioning_enabled,
          num_slices=args.num_slices,
          memory_limit=args.memory_limit,
          cpu_limit=args.cpu_limit,
          is_pathways_cluster=args.enable_pathways,
          flex=args.flex,
      ),
      tolerations=tolerations,
  )
  return 0
|
|
166
|
+
|
|
167
|
+
|
|
104
168
|
def cluster_delete(args) -> None:
|
|
105
169
|
"""Function around cluster delete for the clusters created by Cluster toolkit.
|
|
106
170
|
|
|
@@ -213,7 +277,6 @@ def validate_state_gcs_bucket(args):
|
|
|
213
277
|
err_code, _ = run_command_for_value(
|
|
214
278
|
bucket_validate_cmd,
|
|
215
279
|
'Validate remote state bucket existence.',
|
|
216
|
-
global_args=args,
|
|
217
280
|
)
|
|
218
281
|
if err_code != 0:
|
|
219
282
|
xpk_exit(err_code)
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from unittest.mock import MagicMock, patch
|
|
18
|
+
|
|
19
|
+
import pytest
|
|
20
|
+
|
|
21
|
+
from xpk.commands.cluster_gcluster import cluster_create
|
|
22
|
+
from xpk.core.kueue_manager import KueueConfig
|
|
23
|
+
from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.fixture
def mock_args():
  """Returns a MagicMock standing in for the parsed command-line args.

  The attributes set here are fixed to concrete values; any other attribute
  access falls back to MagicMock's auto-created child mocks.
  """
  return MagicMock(
      enable_autoprovisioning=False,
      num_slices=1,
      memory_limit="200G",
      cpu_limit="50",
      enable_pathways=False,
      flex=False,
      project="test-project",
      cluster="test-cluster",
      zone="us-central1-c",
      cluster_state_gcs_bucket=None,
  )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@pytest.fixture
def mock_cluster_create_deps(request):
  """Patches the collaborators of cluster_create and yields their mocks.

  Every name listed below is replaced inside xpk.commands.cluster_gcluster
  for the duration of the test. The yielded dict maps each patched name to
  the mock installed in its place.
  """
  module_path = "xpk.commands.cluster_gcluster"
  patched_names = (
      "xpk_exit",
      "prepare_kjob",
      "apply_kjob_crds",
      "get_cluster_credentials",
      "generate_blueprint",
      "prepare_gcluster_manager",
      "prepare_directories",
      "check_gcloud_authenticated",
      "get_system_characteristics",
      "KueueManager",
  )

  active_patchers = []
  installed_mocks = {}
  try:
    for attribute in patched_names:
      patcher = patch(f"{module_path}.{attribute}")
      installed_mocks[attribute] = patcher.start()
      active_patchers.append(patcher)
    yield installed_mocks
  finally:
    # Undo the patches in reverse order, mirroring a nested `with` statement.
    for patcher in reversed(active_patchers):
      patcher.stop()
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@patch("xpk.commands.cluster_gcluster.get_total_chips_requested_from_args")
def test_install_kueue_standard(
    mock_get_total_chips, mock_args, mock_cluster_create_deps
):
  """cluster_create installs Kueue using the chip count derived from args."""
  deps = mock_cluster_create_deps
  for dep_name in ("prepare_kjob", "apply_kjob_crds"):
    deps[dep_name].return_value = 0

  expected_system = SystemCharacteristics(
      topology="N/A",
      vms_per_slice=1,
      gke_accelerator="nvidia-h100-mega-80gb",
      gce_machine_type="a3-megagpu-8g",
      chips_per_vm=8,
      accelerator_type=AcceleratorType["GPU"],
      device_type="h100-mega-80gb-8",
      supports_sub_slicing=False,
  )
  deps["get_system_characteristics"].return_value = (expected_system, 0)
  mock_get_total_chips.return_value = 16

  cluster_create(mock_args)

  deps["xpk_exit"].assert_called_with(0)
  install = deps["KueueManager"].return_value.install_or_upgrade
  install.assert_called_once()

  # The config is passed positionally, the tolerations by keyword.
  recorded_call = install.call_args
  kueue_config: KueueConfig = recorded_call.args[0]
  passed_kwargs = recorded_call.kwargs

  assert kueue_config.system == expected_system
  assert kueue_config.total_chips == 16
  assert not kueue_config.autoprovisioning_enabled
  assert "tolerations" in passed_kwargs

  matching_tolerations = [
      entry
      for entry in passed_kwargs["tolerations"]
      if entry.get("key") == "components.gke.io/gke-managed-components"
      and entry.get("effect") == "NoSchedule"
  ]
  assert matching_tolerations
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@patch("xpk.commands.cluster_gcluster.enable_autoprovisioning_on_cluster")
def test_install_kueue_with_autoprovisioning(
    mock_enable_autoprovisioning, mock_args, mock_cluster_create_deps
):
  """cluster_create sizes Kueue from the autoprovisioning maximum chips."""
  deps = mock_cluster_create_deps
  for dep_name in ("prepare_kjob", "apply_kjob_crds"):
    deps[dep_name].return_value = 0

  mock_args.enable_autoprovisioning = True
  expected_system = SystemCharacteristics(
      topology="N/A",
      vms_per_slice=1,
      gke_accelerator="nvidia-h100-mega-80gb",
      gce_machine_type="a3-megagpu-8g",
      chips_per_vm=8,
      accelerator_type=AcceleratorType["GPU"],
      device_type="h100-mega-80gb-8",
      supports_sub_slicing=False,
  )
  deps["get_system_characteristics"].return_value = (expected_system, 0)

  # Autoprovisioning succeeds and reports a 128 chip ceiling.
  autoprovisioning_config = MagicMock()
  autoprovisioning_config.maximum_chips = 128
  mock_enable_autoprovisioning.return_value = (autoprovisioning_config, 0)

  cluster_create(mock_args)

  deps["xpk_exit"].assert_called_with(0)
  mock_enable_autoprovisioning.assert_called_once_with(
      mock_args, expected_system
  )
  install = deps["KueueManager"].return_value.install_or_upgrade
  install.assert_called_once()

  # The config is passed positionally, the tolerations by keyword.
  recorded_call = install.call_args
  kueue_config: KueueConfig = recorded_call.args[0]
  passed_kwargs = recorded_call.kwargs

  assert kueue_config.system == expected_system
  assert kueue_config.total_chips == 128
  assert kueue_config.autoprovisioning_enabled
  assert "tolerations" in passed_kwargs

  matching_tolerations = [
      entry
      for entry in passed_kwargs["tolerations"]
      if entry.get("key") == "components.gke.io/gke-managed-components"
      and entry.get("effect") == "NoSchedule"
  ]
  assert matching_tolerations
|
xpk/commands/common.py
CHANGED
|
@@ -16,8 +16,9 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from ..core.commands import run_command_with_updates_retry
|
|
18
18
|
from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
|
|
19
|
-
from ..core.gcloud_context import
|
|
19
|
+
from ..core.gcloud_context import get_cluster_location
|
|
20
20
|
from ..utils.console import xpk_print, xpk_exit
|
|
21
|
+
from ..utils.execution_context import is_dry_run
|
|
21
22
|
from ..core.system_characteristics import (
|
|
22
23
|
SystemCharacteristics,
|
|
23
24
|
)
|
|
@@ -34,16 +35,12 @@ def set_cluster_command(args) -> int:
|
|
|
34
35
|
"""
|
|
35
36
|
command = (
|
|
36
37
|
'gcloud container clusters get-credentials'
|
|
37
|
-
f' {args.cluster} --
|
|
38
|
-
' --
|
|
39
|
-
|
|
40
|
-
' kubectl config view && kubectl config set-context --current'
|
|
41
|
-
' --namespace=default'
|
|
38
|
+
f' {args.cluster} --location={get_cluster_location(args.project, args.cluster, args.zone)} --dns-endpoint'
|
|
39
|
+
f' --project={args.project} && kubectl config view && kubectl config'
|
|
40
|
+
' set-context --current --namespace=default'
|
|
42
41
|
)
|
|
43
42
|
task = f'get-credentials to cluster {args.cluster}'
|
|
44
|
-
return_code = run_command_with_updates_retry(
|
|
45
|
-
command, task, args, verbose=False
|
|
46
|
-
)
|
|
43
|
+
return_code = run_command_with_updates_retry(command, task, verbose=False)
|
|
47
44
|
if return_code != 0:
|
|
48
45
|
xpk_print(f'{task} returned ERROR {return_code}')
|
|
49
46
|
return return_code
|
|
@@ -52,16 +49,11 @@ def set_cluster_command(args) -> int:
|
|
|
52
49
|
def is_TAS_possible(
|
|
53
50
|
system_characteristics: SystemCharacteristics | None,
|
|
54
51
|
capacity_type: CapacityType | None,
|
|
55
|
-
flex: bool,
|
|
56
52
|
) -> bool:
|
|
57
|
-
"""Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
|
|
53
|
+
"""Check cluster's machine_type and capacity type to determine if Kueue TAS is possible"""
|
|
58
54
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
Returns:
|
|
63
|
-
True if possible and False otherwise.
|
|
64
|
-
"""
|
|
55
|
+
if is_dry_run():
|
|
56
|
+
return True
|
|
65
57
|
|
|
66
58
|
if system_characteristics is None:
|
|
67
59
|
xpk_print('system_characteristics data was not found in configmaps.')
|
|
@@ -71,13 +63,7 @@ def is_TAS_possible(
|
|
|
71
63
|
xpk_print('capacity_type data was not found in configmaps.')
|
|
72
64
|
xpk_exit(1)
|
|
73
65
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
|
|
79
|
-
and capacity_type != CapacityType.RESERVATION
|
|
80
|
-
):
|
|
81
|
-
return False
|
|
82
|
-
|
|
83
|
-
return True
|
|
66
|
+
return (
|
|
67
|
+
system_characteristics.device_type != H100_MEGA_DEVICE_TYPE
|
|
68
|
+
or capacity_type == CapacityType.RESERVATION
|
|
69
|
+
)
|
xpk/commands/info.py
CHANGED
|
@@ -22,8 +22,8 @@ from tabulate import tabulate
|
|
|
22
22
|
from ..core.commands import run_command_for_value
|
|
23
23
|
from ..core.cluster import get_cluster_credentials
|
|
24
24
|
from ..core.gcloud_context import add_zone_and_project
|
|
25
|
-
from ..core.kueue import verify_kueuectl
|
|
26
25
|
from ..utils.console import xpk_exit, xpk_print
|
|
26
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
27
27
|
|
|
28
28
|
table_fmt = 'plain'
|
|
29
29
|
|
|
@@ -36,10 +36,15 @@ def info(args: Namespace) -> None:
|
|
|
36
36
|
Returns:
|
|
37
37
|
None
|
|
38
38
|
"""
|
|
39
|
+
if should_validate_dependencies(args):
|
|
40
|
+
validate_dependencies_list([
|
|
41
|
+
SystemDependency.KUBECTL,
|
|
42
|
+
SystemDependency.GCLOUD,
|
|
43
|
+
SystemDependency.KUEUECTL,
|
|
44
|
+
])
|
|
39
45
|
add_zone_and_project(args)
|
|
40
46
|
get_cluster_credentials(args)
|
|
41
47
|
|
|
42
|
-
verify_kueuectl(args)
|
|
43
48
|
lq, cq = bool(args.localqueue), bool(args.clusterqueue)
|
|
44
49
|
if not lq and not cq:
|
|
45
50
|
lq, cq = True, True
|
|
@@ -48,7 +53,7 @@ def info(args: Namespace) -> None:
|
|
|
48
53
|
if lq:
|
|
49
54
|
lqs = run_kueuectl_list_localqueue(args)
|
|
50
55
|
|
|
51
|
-
cqs = run_kueuectl_list_clusterqueue(
|
|
56
|
+
cqs = run_kueuectl_list_clusterqueue()
|
|
52
57
|
quotas = get_nominal_quotas(cqs)
|
|
53
58
|
|
|
54
59
|
if lq and lqs is not None:
|
|
@@ -214,7 +219,7 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
|
|
|
214
219
|
command = 'kubectl kueue list localqueue -o json'
|
|
215
220
|
if args.namespace != '':
|
|
216
221
|
command += f' --namespace {args.namespace}'
|
|
217
|
-
return_code, val = run_command_for_value(command, 'list localqueue'
|
|
222
|
+
return_code, val = run_command_for_value(command, 'list localqueue')
|
|
218
223
|
|
|
219
224
|
if return_code != 0:
|
|
220
225
|
xpk_print(f'Cluster info request returned ERROR {return_code}')
|
|
@@ -222,18 +227,15 @@ def run_kueuectl_list_localqueue(args: Namespace) -> str:
|
|
|
222
227
|
return val
|
|
223
228
|
|
|
224
229
|
|
|
225
|
-
def run_kueuectl_list_clusterqueue(
|
|
230
|
+
def run_kueuectl_list_clusterqueue() -> str:
|
|
226
231
|
"""Run the kueuectl list clusterqueue command.
|
|
227
232
|
|
|
228
|
-
Args:
|
|
229
|
-
args: user provided arguments for running the command.
|
|
230
|
-
|
|
231
233
|
Returns:
|
|
232
234
|
kueuectl list clusterqueue formatted as json string
|
|
233
235
|
"""
|
|
234
236
|
command = 'kubectl kueue list clusterqueue -o json'
|
|
235
237
|
|
|
236
|
-
return_code, val = run_command_for_value(command, 'list clusterqueue'
|
|
238
|
+
return_code, val = run_command_for_value(command, 'list clusterqueue')
|
|
237
239
|
|
|
238
240
|
if return_code != 0:
|
|
239
241
|
xpk_print(f'Cluster info request returned ERROR {return_code}')
|
xpk/commands/inspector.py
CHANGED
|
@@ -16,11 +16,12 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from ..core.cluster import get_cluster_credentials
|
|
18
18
|
from ..core.commands import run_command_for_value
|
|
19
|
-
from ..core.gcloud_context import add_zone_and_project,
|
|
20
|
-
from ..core.
|
|
19
|
+
from ..core.gcloud_context import add_zone_and_project, get_cluster_location
|
|
20
|
+
from ..core.kueue_manager import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
|
|
21
21
|
from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP
|
|
22
22
|
from ..utils.console import xpk_exit, xpk_print
|
|
23
23
|
from ..utils.file import append_tmp_file, write_tmp_file
|
|
24
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
24
25
|
from .workload import get_workload_list
|
|
25
26
|
|
|
26
27
|
|
|
@@ -41,7 +42,7 @@ def inspector_run_command_helper(
|
|
|
41
42
|
prefix = f'Command: {command}\nCommand Description: {command_description}\n'
|
|
42
43
|
postfix = '========================================================'
|
|
43
44
|
return_code, command_output = run_command_for_value(
|
|
44
|
-
command, f'{command_description}'
|
|
45
|
+
command, f'{command_description}'
|
|
45
46
|
)
|
|
46
47
|
|
|
47
48
|
if return_code != 0:
|
|
@@ -116,7 +117,10 @@ def inspector(args) -> None:
|
|
|
116
117
|
# Future Improvements for inspector:
|
|
117
118
|
# 2. List what is next in Queue.
|
|
118
119
|
# 3. Split inspector into different subcommands to parse info easier.
|
|
119
|
-
|
|
120
|
+
if should_validate_dependencies(args):
|
|
121
|
+
validate_dependencies_list(
|
|
122
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
123
|
+
)
|
|
120
124
|
final_return_code = 0
|
|
121
125
|
xpk_print(args)
|
|
122
126
|
|
|
@@ -138,8 +142,9 @@ def inspector(args) -> None:
|
|
|
138
142
|
(
|
|
139
143
|
(
|
|
140
144
|
'gcloud beta container clusters list --project'
|
|
141
|
-
f' {args.project} --
|
|
142
|
-
f'
|
|
145
|
+
f' {args.project} --location'
|
|
146
|
+
f' {get_cluster_location(args.project, args.cluster, args.zone)} |'
|
|
147
|
+
f' grep -e NAME -e {args.cluster}'
|
|
143
148
|
),
|
|
144
149
|
'GKE: Cluster Details',
|
|
145
150
|
),
|
|
@@ -160,7 +165,7 @@ def inspector(args) -> None:
|
|
|
160
165
|
(
|
|
161
166
|
(
|
|
162
167
|
f'gcloud beta container node-pools list --cluster {args.cluster} '
|
|
163
|
-
f' --project={args.project} --
|
|
168
|
+
f' --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
164
169
|
),
|
|
165
170
|
'GKE: Node pool Details',
|
|
166
171
|
),
|
|
@@ -309,19 +314,25 @@ def inspector(args) -> None:
|
|
|
309
314
|
workload_links = [(
|
|
310
315
|
f'Cloud Console for the workload {args.workload}',
|
|
311
316
|
# pylint: disable=line-too-long
|
|
312
|
-
|
|
317
|
+
(
|
|
318
|
+
f'https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
|
|
319
|
+
),
|
|
313
320
|
)]
|
|
314
321
|
|
|
315
322
|
links = [
|
|
316
323
|
(
|
|
317
324
|
'Cloud Console for the GKE Cluster',
|
|
318
325
|
# pylint: disable=line-too-long
|
|
319
|
-
|
|
326
|
+
(
|
|
327
|
+
f'https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
|
|
328
|
+
),
|
|
320
329
|
),
|
|
321
330
|
(
|
|
322
331
|
'Cloud Console for all workloads in GKE Cluster',
|
|
323
332
|
# pylint: disable=line-too-long
|
|
324
|
-
|
|
333
|
+
(
|
|
334
|
+
f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{get_cluster_location(args.project, args.cluster, args.zone)}%2F{args.cluster}))'
|
|
335
|
+
),
|
|
325
336
|
),
|
|
326
337
|
(
|
|
327
338
|
'Cloud Console for IAM Permissions',
|
|
@@ -346,7 +357,7 @@ def inspector(args) -> None:
|
|
|
346
357
|
)
|
|
347
358
|
|
|
348
359
|
# Summarize inspector:
|
|
349
|
-
xpk_print(f'Find xpk inspector output file: {inspector_file
|
|
360
|
+
xpk_print(f'Find xpk inspector output file: {inspector_file}')
|
|
350
361
|
|
|
351
362
|
if final_return_code != 0:
|
|
352
363
|
xpk_print(
|
xpk/commands/job.py
CHANGED
|
@@ -25,9 +25,32 @@ from ..core.cluster import get_cluster_credentials
|
|
|
25
25
|
from ..core.gcloud_context import add_zone_and_project
|
|
26
26
|
from ..core.kjob import AppProfileDefaults
|
|
27
27
|
from ..utils.console import xpk_exit, xpk_print
|
|
28
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
28
29
|
from .kind import set_local_cluster_command
|
|
29
30
|
|
|
30
31
|
|
|
32
|
+
JOBS_DRY_RUN_YAML = """
|
|
33
|
+
items:
|
|
34
|
+
- apiVersion: slurm.k8s.io/v1alpha1
|
|
35
|
+
kind: SlurmJob
|
|
36
|
+
metadata:
|
|
37
|
+
annotations:
|
|
38
|
+
kjobctl.x-k8s.io/script: echo hello
|
|
39
|
+
creationTimestamp: '2024-04-29T12:00:00Z'
|
|
40
|
+
labels:
|
|
41
|
+
kjobctl.x-k8s.io/app-profile: default
|
|
42
|
+
name: golden-job
|
|
43
|
+
namespace: default
|
|
44
|
+
spec:
|
|
45
|
+
script: echo hello
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
PODS_DRY_RUN_RESULT = """
|
|
49
|
+
foo-pod 2/2 Running 0 2d
|
|
50
|
+
bar-pod 1/1 Evicted 0 1d
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
|
|
31
54
|
def job_info(args):
|
|
32
55
|
"""Run commands obtaining information about a job given by name.
|
|
33
56
|
|
|
@@ -37,12 +60,16 @@ def job_info(args):
|
|
|
37
60
|
Returns:
|
|
38
61
|
None
|
|
39
62
|
"""
|
|
63
|
+
if should_validate_dependencies(args):
|
|
64
|
+
validate_dependencies_list([
|
|
65
|
+
SystemDependency.KUBECTL,
|
|
66
|
+
SystemDependency.KJOB,
|
|
67
|
+
SystemDependency.GCLOUD,
|
|
68
|
+
])
|
|
40
69
|
job_name = args.name
|
|
41
70
|
|
|
42
71
|
desc_command = f'kubectl-kjob describe slurm {job_name}'
|
|
43
|
-
desc_code, desc_text = run_command_for_value(
|
|
44
|
-
desc_command, 'Getting job data', args
|
|
45
|
-
)
|
|
72
|
+
desc_code, desc_text = run_command_for_value(desc_command, 'Getting job data')
|
|
46
73
|
if desc_code != 0:
|
|
47
74
|
xpk_print(f'Data info request returned ERROR {desc_code}')
|
|
48
75
|
xpk_exit(desc_code)
|
|
@@ -52,7 +79,9 @@ def job_info(args):
|
|
|
52
79
|
f' metadata.name=={job_name}'
|
|
53
80
|
)
|
|
54
81
|
job_code, job_text = run_command_for_value(
|
|
55
|
-
job_command,
|
|
82
|
+
job_command,
|
|
83
|
+
'Getting job info',
|
|
84
|
+
dry_run_return_val=JOBS_DRY_RUN_YAML,
|
|
56
85
|
)
|
|
57
86
|
if job_code != 0:
|
|
58
87
|
xpk_print(f'Job info request returned ERROR {job_code}')
|
|
@@ -60,7 +89,9 @@ def job_info(args):
|
|
|
60
89
|
|
|
61
90
|
pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers'
|
|
62
91
|
pods_code, pods_text = run_command_for_value(
|
|
63
|
-
pods_command,
|
|
92
|
+
pods_command,
|
|
93
|
+
'Getting pods list',
|
|
94
|
+
dry_run_return_val=PODS_DRY_RUN_RESULT,
|
|
64
95
|
)
|
|
65
96
|
if pods_code != 0:
|
|
66
97
|
xpk_print(f'Pods list request returned ERROR {pods_code}')
|
|
@@ -143,6 +174,12 @@ def job_list(args) -> None:
|
|
|
143
174
|
Returns:
|
|
144
175
|
None
|
|
145
176
|
"""
|
|
177
|
+
if should_validate_dependencies(args):
|
|
178
|
+
validate_dependencies_list([
|
|
179
|
+
SystemDependency.KUBECTL,
|
|
180
|
+
SystemDependency.KJOB,
|
|
181
|
+
SystemDependency.GCLOUD,
|
|
182
|
+
])
|
|
146
183
|
if not args.kind_cluster:
|
|
147
184
|
add_zone_and_project(args)
|
|
148
185
|
get_cluster_credentials(args)
|
|
@@ -155,14 +192,14 @@ def job_list(args) -> None:
|
|
|
155
192
|
|
|
156
193
|
xpk_print(msg, flush=True)
|
|
157
194
|
|
|
158
|
-
return_code = run_slurm_job_list_command(
|
|
195
|
+
return_code = run_slurm_job_list_command()
|
|
159
196
|
xpk_exit(return_code)
|
|
160
197
|
|
|
161
198
|
|
|
162
|
-
def run_slurm_job_list_command(
|
|
199
|
+
def run_slurm_job_list_command() -> int:
|
|
163
200
|
cmd = f'kubectl-kjob list slurm --profile {AppProfileDefaults.NAME.value}'
|
|
164
201
|
|
|
165
|
-
return_code = run_command_with_updates(cmd, 'list jobs'
|
|
202
|
+
return_code = run_command_with_updates(cmd, 'list jobs')
|
|
166
203
|
if return_code != 0:
|
|
167
204
|
xpk_print(f'Listing jobs returned ERROR {return_code}')
|
|
168
205
|
return return_code
|
|
@@ -177,6 +214,13 @@ def job_cancel(args) -> None:
|
|
|
177
214
|
Returns:
|
|
178
215
|
None
|
|
179
216
|
"""
|
|
217
|
+
if should_validate_dependencies(args):
|
|
218
|
+
validate_dependencies_list([
|
|
219
|
+
SystemDependency.KUBECTL,
|
|
220
|
+
SystemDependency.KJOB,
|
|
221
|
+
SystemDependency.GCLOUD,
|
|
222
|
+
])
|
|
223
|
+
|
|
180
224
|
xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
|
|
181
225
|
if not args.kind_cluster:
|
|
182
226
|
add_zone_and_project(args)
|
|
@@ -194,7 +238,7 @@ def run_slurm_job_delete_command(args) -> int:
|
|
|
194
238
|
list_of_jobs = ' '.join(args.name)
|
|
195
239
|
cmd = f'kubectl-kjob delete slurm {list_of_jobs}'
|
|
196
240
|
|
|
197
|
-
return_code = run_command_with_updates(cmd, 'delete job'
|
|
241
|
+
return_code = run_command_with_updates(cmd, 'delete job')
|
|
198
242
|
if return_code != 0:
|
|
199
243
|
xpk_print(f'Delete job request returned ERROR {return_code}')
|
|
200
244
|
return return_code
|