xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +17 -10
- xpk/commands/cluster.py +137 -123
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +13 -27
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +22 -11
- xpk/commands/job.py +53 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +26 -2
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +58 -30
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +96 -195
- xpk/core/cluster_private.py +9 -12
- xpk/core/commands.py +21 -25
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +17 -9
- xpk/core/docker_resources.py +9 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +5 -8
- xpk/core/kjob.py +19 -29
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +11 -16
- xpk/core/network.py +18 -19
- xpk/core/nodepool.py +65 -71
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +9 -5
- xpk/core/ray.py +11 -15
- xpk/core/resources.py +15 -10
- xpk/core/scheduling.py +23 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +3 -2
- xpk/parser/cluster.py +50 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/execution_context.py +28 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/kueue.py +20 -0
- xpk/utils/network.py +4 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -545
- xpk-0.12.0.dist-info/RECORD +0 -100
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/commands/kind.py
CHANGED
|
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from ..core.kueue_manager import (KueueConfig, KueueManager)
|
|
17
18
|
from ..core.commands import (
|
|
18
19
|
run_command_for_value,
|
|
19
20
|
run_command_with_updates,
|
|
@@ -24,17 +25,14 @@ from ..core.kjob import (
|
|
|
24
25
|
prepare_kjob,
|
|
25
26
|
apply_kjob_crds,
|
|
26
27
|
)
|
|
27
|
-
from ..core.kueue import (
|
|
28
|
-
install_kueue_on_cluster,
|
|
29
|
-
install_kueue_crs,
|
|
30
|
-
wait_for_kueue_available,
|
|
31
|
-
)
|
|
28
|
+
from ..core.scheduling import get_total_chips_requested_from_args
|
|
32
29
|
from ..core.storage import install_storage_crd
|
|
33
30
|
from ..core.system_characteristics import (
|
|
34
31
|
SystemCharacteristics,
|
|
35
32
|
AcceleratorType,
|
|
36
33
|
)
|
|
37
34
|
from ..utils.console import (xpk_exit, xpk_print)
|
|
35
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
38
36
|
|
|
39
37
|
|
|
40
38
|
def cluster_create(args) -> None:
|
|
@@ -46,6 +44,12 @@ def cluster_create(args) -> None:
|
|
|
46
44
|
Returns:
|
|
47
45
|
0 if successful and 1 otherwise.
|
|
48
46
|
"""
|
|
47
|
+
if should_validate_dependencies(args):
|
|
48
|
+
validate_dependencies_list([
|
|
49
|
+
SystemDependency.KUBECTL,
|
|
50
|
+
SystemDependency.KJOB,
|
|
51
|
+
SystemDependency.GCLOUD,
|
|
52
|
+
])
|
|
49
53
|
xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
|
|
50
54
|
|
|
51
55
|
create_cluster_command_code = create_cluster_if_necessary(args)
|
|
@@ -64,18 +68,13 @@ def cluster_create(args) -> None:
|
|
|
64
68
|
if set_jobset_on_cluster_code != 0:
|
|
65
69
|
xpk_exit(set_jobset_on_cluster_code)
|
|
66
70
|
|
|
67
|
-
xpk_print('Enabling Kueue on the cluster')
|
|
68
|
-
install_kueue_on_cluster_code = install_kueue_on_cluster(args)
|
|
69
|
-
if install_kueue_on_cluster_code != 0:
|
|
70
|
-
xpk_exit(install_kueue_on_cluster_code)
|
|
71
|
-
|
|
72
71
|
xpk_print('Verifying kjob installation')
|
|
73
|
-
err_code = verify_kjob_installed(args)
|
|
72
|
+
err_code = verify_kjob_installed()
|
|
74
73
|
if err_code > 0:
|
|
75
74
|
xpk_exit(err_code)
|
|
76
75
|
|
|
77
76
|
xpk_print('Applying kjob CDRs')
|
|
78
|
-
err_code = apply_kjob_crds(args)
|
|
77
|
+
err_code = apply_kjob_crds()
|
|
79
78
|
if err_code > 0:
|
|
80
79
|
xpk_exit(err_code)
|
|
81
80
|
|
|
@@ -87,11 +86,6 @@ def cluster_create(args) -> None:
|
|
|
87
86
|
k8s_client = setup_k8s_env(args)
|
|
88
87
|
install_storage_crd(k8s_client)
|
|
89
88
|
|
|
90
|
-
xpk_print('Wait for Kueue to be fully available')
|
|
91
|
-
wait_for_kueue_available_code = wait_for_kueue_available(args)
|
|
92
|
-
if wait_for_kueue_available_code != 0:
|
|
93
|
-
xpk_exit(wait_for_kueue_available_code)
|
|
94
|
-
|
|
95
89
|
args.num_slices = 1
|
|
96
90
|
args.enable_pathways = False
|
|
97
91
|
system = SystemCharacteristics(
|
|
@@ -102,12 +96,22 @@ def cluster_create(args) -> None:
|
|
|
102
96
|
1,
|
|
103
97
|
AcceleratorType['CPU'],
|
|
104
98
|
'kind',
|
|
99
|
+
supports_sub_slicing=False,
|
|
105
100
|
)
|
|
106
101
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
102
|
+
kueue_manager = KueueManager()
|
|
103
|
+
kueue_manager.install_or_upgrade(
|
|
104
|
+
KueueConfig(
|
|
105
|
+
system,
|
|
106
|
+
total_chips=get_total_chips_requested_from_args(args, system),
|
|
107
|
+
autoprovisioning_enabled=False,
|
|
108
|
+
num_slices=args.num_slices,
|
|
109
|
+
memory_limit='',
|
|
110
|
+
cpu_limit=0,
|
|
111
|
+
is_pathways_cluster=False,
|
|
112
|
+
flex=False,
|
|
113
|
+
),
|
|
114
|
+
)
|
|
111
115
|
|
|
112
116
|
xpk_print('Kind commands done! Resources are created.')
|
|
113
117
|
xpk_exit(0)
|
|
@@ -122,6 +126,8 @@ def cluster_delete(args) -> None:
|
|
|
122
126
|
Returns:
|
|
123
127
|
0 if successful and 1 otherwise.
|
|
124
128
|
"""
|
|
129
|
+
if should_validate_dependencies(args):
|
|
130
|
+
validate_dependencies_list([SystemDependency.GCLOUD])
|
|
125
131
|
xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
|
|
126
132
|
|
|
127
133
|
run_kind_cluster_delete_command_code = run_kind_cluster_delete_command(args)
|
|
@@ -134,13 +140,12 @@ def cluster_delete(args) -> None:
|
|
|
134
140
|
def cluster_list(args) -> None:
|
|
135
141
|
"""Function around cluster list.
|
|
136
142
|
|
|
137
|
-
Args:
|
|
138
|
-
args: user provided arguments for running the command.
|
|
139
|
-
|
|
140
143
|
Returns:
|
|
141
144
|
0 if successful and 1 otherwise.
|
|
142
145
|
"""
|
|
143
|
-
if run_kind_clusters_list_command(args):
|
|
146
|
+
if should_validate_dependencies(args):
|
|
147
|
+
validate_dependencies_list([SystemDependency.GCLOUD])
|
|
148
|
+
if run_kind_clusters_list_command():
|
|
144
149
|
xpk_exit(1)
|
|
145
150
|
xpk_exit(0)
|
|
146
151
|
|
|
@@ -154,7 +159,7 @@ def create_cluster_if_necessary(args) -> int:
|
|
|
154
159
|
Returns:
|
|
155
160
|
0 if successful and 1 otherwise.
|
|
156
161
|
"""
|
|
157
|
-
all_clusters, return_code = get_all_local_clusters_programmatic(args)
|
|
162
|
+
all_clusters, return_code = get_all_local_clusters_programmatic()
|
|
158
163
|
if return_code > 0:
|
|
159
164
|
xpk_print('Listing all clusters failed!')
|
|
160
165
|
return 1
|
|
@@ -179,7 +184,7 @@ def run_kind_cluster_delete_command(args) -> int:
|
|
|
179
184
|
if args.cluster:
|
|
180
185
|
command += f' --name={args.cluster}'
|
|
181
186
|
|
|
182
|
-
return_code = run_command_with_updates(command, 'Cluster Delete', args)
|
|
187
|
+
return_code = run_command_with_updates(command, 'Cluster Delete')
|
|
183
188
|
if return_code != 0:
|
|
184
189
|
xpk_print(f'Cluster delete request returned ERROR {return_code}')
|
|
185
190
|
return 1
|
|
@@ -187,17 +192,14 @@ def run_kind_cluster_delete_command(args) -> int:
|
|
|
187
192
|
return 0
|
|
188
193
|
|
|
189
194
|
|
|
190
|
-
def run_kind_clusters_list_command(args) -> int:
|
|
195
|
+
def run_kind_clusters_list_command() -> int:
|
|
191
196
|
"""List Kind Clusters within the project and location.
|
|
192
197
|
|
|
193
|
-
Args:
|
|
194
|
-
args: user provided arguments for running the command.
|
|
195
|
-
|
|
196
198
|
Returns:
|
|
197
199
|
0 if successful and 1 otherwise.
|
|
198
200
|
"""
|
|
199
201
|
command = 'kind get clusters'
|
|
200
|
-
return_code = run_command_with_updates(command, 'Cluster List', args)
|
|
202
|
+
return_code = run_command_with_updates(command, 'Cluster List')
|
|
201
203
|
if return_code != 0:
|
|
202
204
|
xpk_print(f'Cluster list request returned ERROR {return_code}')
|
|
203
205
|
return 1
|
|
@@ -222,25 +224,22 @@ def run_kind_cluster_create_command(args) -> int:
|
|
|
222
224
|
if args.k8s_version:
|
|
223
225
|
command += f' --image=kindest/node:v{args.k8s_version}'
|
|
224
226
|
|
|
225
|
-
return_code = run_command_with_updates(command, 'Kind Cluster Create', args)
|
|
227
|
+
return_code = run_command_with_updates(command, 'Kind Cluster Create')
|
|
226
228
|
if return_code != 0:
|
|
227
229
|
xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
|
|
228
230
|
return 1
|
|
229
231
|
return 0
|
|
230
232
|
|
|
231
233
|
|
|
232
|
-
def get_all_local_clusters_programmatic(args) -> tuple[list[str], int]:
|
|
234
|
+
def get_all_local_clusters_programmatic() -> tuple[list[str], int]:
|
|
233
235
|
"""Gets all the local clusters.
|
|
234
236
|
|
|
235
|
-
Args:
|
|
236
|
-
args: user provided arguments for running the command.
|
|
237
|
-
|
|
238
237
|
Returns:
|
|
239
238
|
List of cluster names and 0 if successful and 1 otherwise.
|
|
240
239
|
"""
|
|
241
240
|
command = 'kind get clusters'
|
|
242
241
|
return_code, raw_cluster_output = run_command_for_value(
|
|
243
|
-
command, 'Find if Cluster Exists', args
|
|
242
|
+
command, 'Find if Cluster Exists'
|
|
244
243
|
)
|
|
245
244
|
if return_code != 0:
|
|
246
245
|
xpk_print(f'Find if Cluster Exists returned ERROR {return_code}')
|
|
@@ -261,7 +260,7 @@ def set_local_cluster_command(args) -> int:
|
|
|
261
260
|
if not args.cluster:
|
|
262
261
|
command = 'kubectl config current-context'
|
|
263
262
|
return_code, current_context = run_command_for_value(
|
|
264
|
-
command, 'get current-context', args
|
|
263
|
+
command, 'get current-context'
|
|
265
264
|
)
|
|
266
265
|
xpk_print(
|
|
267
266
|
'No local cluster name specified. Using current-context'
|
|
@@ -276,7 +275,6 @@ def set_local_cluster_command(args) -> int:
|
|
|
276
275
|
return_code = run_command_with_updates(
|
|
277
276
|
command,
|
|
278
277
|
task,
|
|
279
|
-
args,
|
|
280
278
|
)
|
|
281
279
|
if return_code != 0:
|
|
282
280
|
xpk_print(f'{task} returned ERROR {return_code}')
|
xpk/commands/kjob_common.py
CHANGED
|
@@ -35,11 +35,11 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
|
35
35
|
|
|
36
36
|
annotations: tuple
|
|
37
37
|
if gpu_type == H100_MEGA_DEVICE_TYPE:
|
|
38
|
-
annotations = get_a3mega_pod_template_annotations(args)
|
|
38
|
+
annotations = get_a3mega_pod_template_annotations()
|
|
39
39
|
elif gpu_type == H200_DEVICE_TYPE:
|
|
40
|
-
annotations = get_a3ultra_pod_template_annotations(args)
|
|
40
|
+
annotations = get_a3ultra_pod_template_annotations()
|
|
41
41
|
elif gpu_type == B200_DEVICE_TYPE:
|
|
42
|
-
annotations = get_a4_pod_template_annotations(args)
|
|
42
|
+
annotations = get_a4_pod_template_annotations()
|
|
43
43
|
else:
|
|
44
44
|
annotations = tuple()
|
|
45
45
|
|
|
@@ -54,7 +54,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
|
54
54
|
def add_TAS_annotations_to_command(args, cmd: str) -> str:
|
|
55
55
|
system_characteristics = get_cluster_system_characteristics(args)
|
|
56
56
|
capacity_type = get_cluster_capacity_type(args)
|
|
57
|
-
if is_TAS_possible(system_characteristics, capacity_type
|
|
57
|
+
if is_TAS_possible(system_characteristics, capacity_type):
|
|
58
58
|
cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
|
|
59
59
|
|
|
60
60
|
return cmd
|
xpk/commands/run.py
CHANGED
|
@@ -28,8 +28,9 @@ from ..core.kjob import (
|
|
|
28
28
|
get_storage_annotations,
|
|
29
29
|
prepare_kjob,
|
|
30
30
|
)
|
|
31
|
-
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
31
|
+
from ..core.kueue_manager import LOCAL_QUEUE_NAME
|
|
32
32
|
from ..utils.console import xpk_exit, xpk_print
|
|
33
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
33
34
|
from .kind import set_local_cluster_command
|
|
34
35
|
from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
|
|
35
36
|
|
|
@@ -42,6 +43,12 @@ def run(args: Namespace) -> None:
|
|
|
42
43
|
Returns:
|
|
43
44
|
None
|
|
44
45
|
"""
|
|
46
|
+
if should_validate_dependencies(args):
|
|
47
|
+
validate_dependencies_list([
|
|
48
|
+
SystemDependency.KUBECTL,
|
|
49
|
+
SystemDependency.KJOB,
|
|
50
|
+
SystemDependency.GCLOUD,
|
|
51
|
+
])
|
|
45
52
|
if not args.kind_cluster:
|
|
46
53
|
add_zone_and_project(args)
|
|
47
54
|
get_cluster_credentials(args)
|
|
@@ -126,7 +133,7 @@ def submit_job(args: Namespace) -> None:
|
|
|
126
133
|
if args.time is not None:
|
|
127
134
|
cmd += f' --time {args.time}'
|
|
128
135
|
|
|
129
|
-
return_code = run_command_with_full_controls(cmd, 'run task', args)
|
|
136
|
+
return_code = run_command_with_full_controls(cmd, 'run task')
|
|
130
137
|
|
|
131
138
|
if return_code != 0:
|
|
132
139
|
xpk_print(f'Running task returned ERROR {return_code}')
|
xpk/commands/shell.py
CHANGED
|
@@ -14,6 +14,7 @@ limitations under the License.
|
|
|
14
14
|
from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
|
|
15
15
|
from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
|
|
16
16
|
from ..utils.console import xpk_exit, xpk_print
|
|
17
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
17
18
|
from argparse import Namespace
|
|
18
19
|
|
|
19
20
|
from ..core.kjob import (
|
|
@@ -33,14 +34,18 @@ def shell(args: Namespace):
|
|
|
33
34
|
Returns:
|
|
34
35
|
0 if successful and 1 otherwise.
|
|
35
36
|
"""
|
|
37
|
+
if should_validate_dependencies(args):
|
|
38
|
+
validate_dependencies_list([
|
|
39
|
+
SystemDependency.KUBECTL,
|
|
40
|
+
SystemDependency.KJOB,
|
|
41
|
+
SystemDependency.GCLOUD,
|
|
42
|
+
])
|
|
36
43
|
exisitng_shell_pod_name = get_existing_shell_pod_name(args)
|
|
37
44
|
|
|
38
45
|
if exisitng_shell_pod_name is None:
|
|
39
46
|
return_code = connect_to_new_interactive_shell(args)
|
|
40
47
|
else:
|
|
41
|
-
return_code = connect_to_existing_interactive_shell(
|
|
42
|
-
exisitng_shell_pod_name, args
|
|
43
|
-
)
|
|
48
|
+
return_code = connect_to_existing_interactive_shell(exisitng_shell_pod_name)
|
|
44
49
|
|
|
45
50
|
if return_code != 0:
|
|
46
51
|
xpk_print(f'The command failed with code {return_code}.')
|
|
@@ -60,7 +65,6 @@ def get_existing_shell_pod_name(args: Namespace) -> str | None:
|
|
|
60
65
|
' -o custom-columns=":metadata.name"'
|
|
61
66
|
),
|
|
62
67
|
task='Get existing interactive shell pod name.',
|
|
63
|
-
global_args=args,
|
|
64
68
|
)
|
|
65
69
|
if return_code != 0:
|
|
66
70
|
xpk_print(
|
|
@@ -95,21 +99,17 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
|
|
|
95
99
|
return run_command_with_full_controls(
|
|
96
100
|
command=cmd,
|
|
97
101
|
task='Creating new interactive shell and entering it',
|
|
98
|
-
global_args=args,
|
|
99
102
|
instructions=exit_instructions,
|
|
100
103
|
)
|
|
101
104
|
|
|
102
105
|
|
|
103
|
-
def connect_to_existing_interactive_shell(
|
|
104
|
-
pod_name: str, args: Namespace
|
|
105
|
-
) -> int:
|
|
106
|
+
def connect_to_existing_interactive_shell(pod_name: str) -> int:
|
|
106
107
|
return run_command_with_full_controls(
|
|
107
108
|
command=(
|
|
108
109
|
f'kubectl exec --stdin --tty {pod_name} --'
|
|
109
110
|
f' {get_pod_template_interactive_command()}'
|
|
110
111
|
),
|
|
111
112
|
task='Entering existing interactive shell',
|
|
112
|
-
global_args=args,
|
|
113
113
|
instructions=exit_instructions,
|
|
114
114
|
)
|
|
115
115
|
|
|
@@ -121,6 +121,10 @@ def shell_stop(args: Namespace):
|
|
|
121
121
|
Returns:
|
|
122
122
|
0 if successful and 1 otherwise.
|
|
123
123
|
"""
|
|
124
|
+
if should_validate_dependencies(args):
|
|
125
|
+
validate_dependencies_list(
|
|
126
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
127
|
+
)
|
|
124
128
|
exisitng_shell_pod_name = get_existing_shell_pod_name(args)
|
|
125
129
|
|
|
126
130
|
if exisitng_shell_pod_name is None:
|
|
@@ -130,7 +134,6 @@ def shell_stop(args: Namespace):
|
|
|
130
134
|
return_code = run_command_with_updates(
|
|
131
135
|
command=f'kubectl delete pod {exisitng_shell_pod_name}',
|
|
132
136
|
task='Deleting the existing shell.',
|
|
133
|
-
global_args=args,
|
|
134
137
|
)
|
|
135
138
|
if return_code != 0:
|
|
136
139
|
xpk_exit(return_code)
|
xpk/commands/storage.py
CHANGED
|
@@ -58,9 +58,15 @@ from ..core.storage import (
|
|
|
58
58
|
)
|
|
59
59
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
60
60
|
from ..utils.kubectl import apply_kubectl_manifest
|
|
61
|
+
from ..utils.execution_context import is_dry_run
|
|
62
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
61
63
|
|
|
62
64
|
|
|
63
65
|
def storage_create(args: Namespace) -> None:
|
|
66
|
+
if should_validate_dependencies(args):
|
|
67
|
+
validate_dependencies_list(
|
|
68
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
69
|
+
)
|
|
64
70
|
add_zone_and_project(args)
|
|
65
71
|
if args.type == GCP_FILESTORE_TYPE:
|
|
66
72
|
if args.instance is None:
|
|
@@ -106,6 +112,10 @@ def storage_create(args: Namespace) -> None:
|
|
|
106
112
|
|
|
107
113
|
|
|
108
114
|
def storage_delete(args: Namespace) -> None:
|
|
115
|
+
if should_validate_dependencies(args):
|
|
116
|
+
validate_dependencies_list(
|
|
117
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
118
|
+
)
|
|
109
119
|
add_zone_and_project(args)
|
|
110
120
|
k8s_api_client = setup_k8s_env(args)
|
|
111
121
|
storages = list_storages(k8s_api_client)
|
|
@@ -140,6 +150,10 @@ def storage_delete(args: Namespace) -> None:
|
|
|
140
150
|
|
|
141
151
|
|
|
142
152
|
def storage_attach(args: Namespace) -> None:
|
|
153
|
+
if should_validate_dependencies(args):
|
|
154
|
+
validate_dependencies_list(
|
|
155
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
156
|
+
)
|
|
143
157
|
add_zone_and_project(args)
|
|
144
158
|
manifest: list[dict] = [{}]
|
|
145
159
|
if args.type == GCP_FILESTORE_TYPE:
|
|
@@ -243,12 +257,22 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
|
|
|
243
257
|
|
|
244
258
|
|
|
245
259
|
def storage_list(args: Namespace) -> None:
|
|
246
|
-
|
|
247
|
-
|
|
260
|
+
if should_validate_dependencies(args):
|
|
261
|
+
validate_dependencies_list(
|
|
262
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
263
|
+
)
|
|
264
|
+
storages = []
|
|
265
|
+
if not is_dry_run():
|
|
266
|
+
k8s_api_client = setup_k8s_env(args)
|
|
267
|
+
storages = list_storages(k8s_api_client)
|
|
248
268
|
print_storages_for_cluster(storages)
|
|
249
269
|
|
|
250
270
|
|
|
251
271
|
def storage_detach(args: Namespace) -> None:
|
|
272
|
+
if should_validate_dependencies(args):
|
|
273
|
+
validate_dependencies_list(
|
|
274
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
275
|
+
)
|
|
252
276
|
k8s_api_client = setup_k8s_env(args)
|
|
253
277
|
storage = get_storage(k8s_api_client, args.name)
|
|
254
278
|
delete_storage_resources(k8s_api_client, storage)
|
xpk/commands/version.py
CHANGED
|
@@ -18,10 +18,6 @@ from ..core.config import __version__
|
|
|
18
18
|
from ..utils.console import xpk_print
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
def get_xpk_version() -> str:
|
|
22
|
-
return __version__
|
|
23
|
-
|
|
24
|
-
|
|
25
21
|
def version(args) -> None: # pylint: disable=unused-argument
|
|
26
22
|
"""Get version of xpk."""
|
|
27
23
|
xpk_print('xpk_version:', __version__)
|
xpk/commands/workload.py
CHANGED
|
@@ -34,7 +34,7 @@ from ..core.docker_container import (
|
|
|
34
34
|
)
|
|
35
35
|
from ..core.docker_resources import get_volumes, parse_env_config
|
|
36
36
|
from ..core.gcloud_context import add_zone_and_project
|
|
37
|
-
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
37
|
+
from ..core.kueue_manager import LOCAL_QUEUE_NAME
|
|
38
38
|
from ..core.monitoring import get_gke_outlier_dashboard
|
|
39
39
|
from ..core.nap import (
|
|
40
40
|
get_autoprovisioning_node_selector_args,
|
|
@@ -53,9 +53,6 @@ from ..core.pathways import (
|
|
|
53
53
|
try_to_delete_pathwaysjob_first,
|
|
54
54
|
)
|
|
55
55
|
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
|
|
56
|
-
from ..core.capacity import (
|
|
57
|
-
CapacityType,
|
|
58
|
-
)
|
|
59
56
|
from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
|
|
60
57
|
from ..core.scheduling import (
|
|
61
58
|
check_if_workload_can_schedule,
|
|
@@ -65,6 +62,7 @@ from ..core.scheduling import (
|
|
|
65
62
|
create_tpu_topology,
|
|
66
63
|
get_cpu_affinity,
|
|
67
64
|
get_gpu_scheduler,
|
|
65
|
+
create_sub_slicing_annotations,
|
|
68
66
|
)
|
|
69
67
|
from ..core.storage import (
|
|
70
68
|
GCE_PD_TYPE,
|
|
@@ -87,7 +85,7 @@ from ..core.workload import (
|
|
|
87
85
|
get_jobsets_list_gcp_link,
|
|
88
86
|
get_workload_list,
|
|
89
87
|
wait_for_job_completion,
|
|
90
|
-
|
|
88
|
+
get_cluster_location,
|
|
91
89
|
)
|
|
92
90
|
from ..core.workload_decorators import (
|
|
93
91
|
rdma_decorator,
|
|
@@ -97,8 +95,11 @@ from ..core.workload_decorators import (
|
|
|
97
95
|
)
|
|
98
96
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
99
97
|
from ..utils.file import write_tmp_file
|
|
98
|
+
from ..utils.execution_context import is_dry_run
|
|
99
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
100
100
|
from . import cluster_gcluster
|
|
101
101
|
from .common import is_TAS_possible
|
|
102
|
+
from ..utils.feature_flags import FeatureFlags
|
|
102
103
|
|
|
103
104
|
WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
|
|
104
105
|
kind: JobSet
|
|
@@ -129,6 +130,7 @@ spec:
|
|
|
129
130
|
xpk.google.com/workload: {args.workload}
|
|
130
131
|
annotations:
|
|
131
132
|
{storage_annotations}
|
|
133
|
+
{sub_slicing_annotations}
|
|
132
134
|
spec:
|
|
133
135
|
schedulerName: {args.scheduler}
|
|
134
136
|
imagePullSecrets:
|
|
@@ -266,6 +268,8 @@ PW_WORKLOAD_CREATE_YAML = """
|
|
|
266
268
|
maxSliceRestarts: {args.max_slice_restarts}
|
|
267
269
|
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
|
|
268
270
|
priorityClassName: {args.priority}
|
|
271
|
+
nodeSelector:
|
|
272
|
+
{autoprovisioning_args}
|
|
269
273
|
pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
|
|
270
274
|
controller:
|
|
271
275
|
# #Pod template for training, default mode.
|
|
@@ -306,8 +310,16 @@ def workload_create(args) -> None:
|
|
|
306
310
|
Returns:
|
|
307
311
|
0 if successful and 1 otherwise.
|
|
308
312
|
"""
|
|
309
|
-
|
|
310
|
-
|
|
313
|
+
if should_validate_dependencies(args):
|
|
314
|
+
validate_dependencies_list([
|
|
315
|
+
SystemDependency.KUBECTL,
|
|
316
|
+
SystemDependency.GCLOUD,
|
|
317
|
+
SystemDependency.DOCKER,
|
|
318
|
+
])
|
|
319
|
+
k8s_api_client = None
|
|
320
|
+
if not is_dry_run():
|
|
321
|
+
k8s_api_client = setup_k8s_env(args)
|
|
322
|
+
setup_k8s_service_accounts()
|
|
311
323
|
|
|
312
324
|
workload_exists = check_if_workload_exists(args)
|
|
313
325
|
|
|
@@ -331,7 +343,7 @@ def workload_create(args) -> None:
|
|
|
331
343
|
xpk_print('Starting workload create', flush=True)
|
|
332
344
|
|
|
333
345
|
metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
|
|
334
|
-
cluster_config_map = get_cluster_configmap(
|
|
346
|
+
cluster_config_map = get_cluster_configmap(metadata_configmap_name)
|
|
335
347
|
cluster_xpk_version = None
|
|
336
348
|
if cluster_config_map is None:
|
|
337
349
|
xpk_print(
|
|
@@ -383,8 +395,10 @@ def workload_create(args) -> None:
|
|
|
383
395
|
all_storages = []
|
|
384
396
|
# Currently storage customization is not supported for Pathways workloads. b/408468941
|
|
385
397
|
if not args.use_pathways:
|
|
386
|
-
storages: list[Storage] =
|
|
387
|
-
|
|
398
|
+
storages: list[Storage] = (
|
|
399
|
+
[]
|
|
400
|
+
if k8s_api_client is None
|
|
401
|
+
else get_storages_to_mount(k8s_api_client, args.storage)
|
|
388
402
|
)
|
|
389
403
|
gcs_fuse_storages = list(
|
|
390
404
|
filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
|
|
@@ -477,16 +491,12 @@ def workload_create(args) -> None:
|
|
|
477
491
|
capacity_type = get_cluster_capacity_type(args)
|
|
478
492
|
|
|
479
493
|
annotations = (
|
|
480
|
-
|
|
481
|
-
if not is_TAS_possible(
|
|
482
|
-
system_characteristics,
|
|
483
|
-
capacity_type,
|
|
484
|
-
flex=True if capacity_type == CapacityType.FLEX_START else False,
|
|
485
|
-
)
|
|
486
|
-
else (
|
|
494
|
+
(
|
|
487
495
|
'kueue.x-k8s.io/podset-preferred-topology:'
|
|
488
496
|
' "cloud.google.com/gce-topology-host"'
|
|
489
497
|
)
|
|
498
|
+
if is_TAS_possible(system_characteristics, capacity_type)
|
|
499
|
+
else ''
|
|
490
500
|
)
|
|
491
501
|
|
|
492
502
|
if (
|
|
@@ -502,7 +512,7 @@ def workload_create(args) -> None:
|
|
|
502
512
|
annotations=annotations,
|
|
503
513
|
)
|
|
504
514
|
|
|
505
|
-
sub_networks = get_cluster_subnetworks(args)
|
|
515
|
+
sub_networks = get_cluster_subnetworks()
|
|
506
516
|
if args.device_type == a3high_device_type:
|
|
507
517
|
yml_string = tcpx_decorator.decorate_jobset(yml_string)
|
|
508
518
|
elif args.device_type == a3mega_device_type:
|
|
@@ -540,6 +550,7 @@ def workload_create(args) -> None:
|
|
|
540
550
|
colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
|
|
541
551
|
user_workload=get_user_workload_for_pathways(args, system),
|
|
542
552
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
553
|
+
autoprovisioning_args=autoprovisioning_args,
|
|
543
554
|
)
|
|
544
555
|
else:
|
|
545
556
|
container, debugging_dashboard_id = get_user_workload_container(
|
|
@@ -553,6 +564,14 @@ def workload_create(args) -> None:
|
|
|
553
564
|
accelerator_label=create_accelerator_label(
|
|
554
565
|
system.accelerator_type, system
|
|
555
566
|
),
|
|
567
|
+
sub_slicing_annotations=(
|
|
568
|
+
''
|
|
569
|
+
if not FeatureFlags.SUB_SLICING_ENABLED
|
|
570
|
+
or args.sub_slicing_topology is None
|
|
571
|
+
else ('\n' + (' ' * 16)).join(
|
|
572
|
+
create_sub_slicing_annotations(args.sub_slicing_topology)
|
|
573
|
+
)
|
|
574
|
+
),
|
|
556
575
|
machine_label=create_machine_label(system.accelerator_type, system),
|
|
557
576
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
558
577
|
autoprovisioning_args=autoprovisioning_args,
|
|
@@ -569,14 +588,14 @@ def workload_create(args) -> None:
|
|
|
569
588
|
pod_failure_policy=pod_failure_policy,
|
|
570
589
|
)
|
|
571
590
|
tmp = write_tmp_file(yml_string)
|
|
572
|
-
command = f'kubectl apply -f {str(tmp
|
|
573
|
-
return_code = run_command_with_updates(command, 'Creating Workload', args)
|
|
591
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
592
|
+
return_code = run_command_with_updates(command, 'Creating Workload')
|
|
574
593
|
|
|
575
594
|
if return_code != 0:
|
|
576
595
|
xpk_print(f'Create Workload request returned ERROR {return_code}')
|
|
577
596
|
xpk_exit(return_code)
|
|
578
597
|
|
|
579
|
-
if not args.use_pathways:
|
|
598
|
+
if not args.use_pathways and not is_dry_run():
|
|
580
599
|
add_bucket_iam_members(args, storages)
|
|
581
600
|
|
|
582
601
|
# Get GKE outlier dashboard for TPU
|
|
@@ -617,7 +636,9 @@ def workload_create(args) -> None:
|
|
|
617
636
|
' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'
|
|
618
637
|
" python -c 'import pathwaysutils; import jax; print(jax.devices())'"
|
|
619
638
|
)
|
|
620
|
-
pathways_proxy_link =
|
|
639
|
+
pathways_proxy_link = (
|
|
640
|
+
f'https://console.cloud.google.com/kubernetes/job/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
|
|
641
|
+
)
|
|
621
642
|
xpk_print(
|
|
622
643
|
'Follow the proxy here:'
|
|
623
644
|
# pylint: disable=line-too-long)
|
|
@@ -631,7 +652,7 @@ def workload_create(args) -> None:
|
|
|
631
652
|
xpk_print(
|
|
632
653
|
'Follow your workload here:'
|
|
633
654
|
# pylint: disable=line-too-long
|
|
634
|
-
f' https://console.cloud.google.com/kubernetes/service/{
|
|
655
|
+
f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
|
|
635
656
|
)
|
|
636
657
|
duration_of_logs = 'P1D' # Past 1 Day
|
|
637
658
|
xpk_print(
|
|
@@ -640,7 +661,7 @@ def workload_create(args) -> None:
|
|
|
640
661
|
' ([prefix]-slice-job-[slice_number]-[worker_number])'
|
|
641
662
|
' after clicking the url if you want other worker logs.'
|
|
642
663
|
# pylint: disable=line-too-long
|
|
643
|
-
f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{
|
|
664
|
+
f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{get_cluster_location(args.project, args.cluster, args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
|
|
644
665
|
)
|
|
645
666
|
|
|
646
667
|
xpk_exit(0)
|
|
@@ -673,6 +694,10 @@ def workload_delete(args) -> None:
|
|
|
673
694
|
Returns:
|
|
674
695
|
0 if successful and 1 otherwise.
|
|
675
696
|
"""
|
|
697
|
+
if should_validate_dependencies(args):
|
|
698
|
+
validate_dependencies_list(
|
|
699
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
700
|
+
)
|
|
676
701
|
xpk_print('Starting Workload delete', flush=True)
|
|
677
702
|
add_zone_and_project(args)
|
|
678
703
|
get_cluster_credentials(args)
|
|
@@ -720,12 +745,13 @@ def workload_delete(args) -> None:
|
|
|
720
745
|
|
|
721
746
|
# Not batching deletion for single workload
|
|
722
747
|
if len(workloads) == 1:
|
|
723
|
-
return_code = run_command_with_updates(
|
|
724
|
-
commands[0], 'Delete Workload', args
|
|
725
|
-
)
|
|
748
|
+
return_code = run_command_with_updates(commands[0], 'Delete Workload')
|
|
726
749
|
else:
|
|
727
750
|
return_code = run_commands(
|
|
728
|
-
commands,
|
|
751
|
+
commands,
|
|
752
|
+
'Delete Workload',
|
|
753
|
+
task_names,
|
|
754
|
+
batch=100,
|
|
729
755
|
)
|
|
730
756
|
|
|
731
757
|
if return_code != 0:
|
|
@@ -743,8 +769,10 @@ def workload_list(args) -> None:
|
|
|
743
769
|
Returns:
|
|
744
770
|
0 if successful and 1 otherwise.
|
|
745
771
|
"""
|
|
746
|
-
|
|
747
|
-
|
|
772
|
+
if should_validate_dependencies(args):
|
|
773
|
+
validate_dependencies_list(
|
|
774
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
775
|
+
)
|
|
748
776
|
xpk_print('Starting workload list', flush=True)
|
|
749
777
|
add_zone_and_project(args)
|
|
750
778
|
get_cluster_credentials(args)
|