xpk 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +109 -0
- xpk/commands/cluster.py +784 -0
- xpk/commands/cluster_gcluster.py +185 -0
- xpk/commands/info.py +245 -0
- xpk/commands/inspector.py +363 -0
- xpk/commands/job.py +197 -0
- xpk/commands/kind.py +253 -0
- xpk/commands/shell.py +120 -0
- xpk/commands/version.py +39 -0
- xpk/commands/workload.py +692 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +61 -0
- xpk/core/blueprint/blueprint_generator.py +652 -0
- xpk/core/cluster_private.py +197 -0
- xpk/core/commands.py +352 -0
- xpk/core/core.py +2824 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/gcluster_manager.py +158 -0
- xpk/core/kjob.py +205 -0
- xpk/core/kueue.py +352 -0
- xpk/core/nap.py +349 -0
- xpk/core/pathways.py +298 -0
- xpk/core/ray.py +222 -0
- xpk/core/system_characteristics.py +1395 -0
- xpk/core/workload.py +133 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +109 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
- xpk/main.py +73 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +184 -0
- xpk/parser/cluster.py +621 -0
- xpk/parser/common.py +71 -0
- xpk/parser/core.py +109 -0
- xpk/parser/info.py +63 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +126 -0
- xpk/parser/kind.py +94 -0
- xpk/parser/shell.py +50 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +684 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +85 -0
- xpk/utils/yaml.py +30 -0
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/METADATA +301 -28
- xpk-0.6.0.dist-info/RECORD +57 -0
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
- xpk-0.6.0.dist-info/entry_points.txt +2 -0
- xpk-0.5.0.dist-info/RECORD +0 -7
- xpk-0.5.0.dist-info/entry_points.txt +0 -2
- xpk.py +0 -7282
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..core.commands import run_command_for_value
|
|
18
|
+
from ..core.core import (
|
|
19
|
+
CLUSTER_METADATA_CONFIGMAP,
|
|
20
|
+
CLUSTER_RESOURCES_CONFIGMAP,
|
|
21
|
+
add_zone_and_project,
|
|
22
|
+
zone_to_region,
|
|
23
|
+
)
|
|
24
|
+
from ..core.kueue import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
|
|
25
|
+
from ..utils.file import append_tmp_file, write_tmp_file
|
|
26
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
27
|
+
from .cluster import set_cluster_command
|
|
28
|
+
from .workload import get_workload_list
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def inspector_run_command_helper(
    args, command, command_description, file
) -> int:
  """Executes one diagnostic command and records its output for inspector.

  The command is run through run_command_for_value. On success a section
  made of the command, its description, the captured output and a separator
  line is appended to `file` (and echoed when args.print_to_terminal is set).

  Args:
    args: user provided arguments for running the command.
    command: the cli command to run.
    command_description: a brief description of the command run.
    file: file to add command output to.

  Returns:
    0 if the command returned 0, 1 otherwise.
  """
  exit_status, stdout_text = run_command_for_value(
      command, f'{command_description}', args
  )
  if exit_status != 0:
    # Failed commands are reported on the console only; nothing is written
    # to the inspector file for them.
    xpk_print(
        f'{command} returned ERROR {exit_status} with output:'
        f' {stdout_text}'
    )
    return 1

  header = f'Command: {command}\nCommand Description: {command_description}\n'
  separator = '========================================================'
  report_section = f'{header} \n{stdout_text} \n{separator} \n'

  append_tmp_file(report_section, file)
  if args.print_to_terminal:
    xpk_print(report_section)
  return 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def inspector_run_workload_list_helper(args, command_description, file) -> int:
  """Runs a workload list command for xpk inspector, and build the output file.

  The listing is obtained from get_workload_list(args). On success a section
  made of the description, the listing and a separator line is appended to
  `file` (and echoed when args.print_to_terminal is set).

  Args:
    args: user provided arguments for running the command.
    command_description: a brief description of the command run.
    file: file to add command output to.

  Returns:
    0 if successful and 1 otherwise.
  """
  return_code, command_output = get_workload_list(args)
  if return_code != 0:
    # Report the failure and return 1, mirroring
    # inspector_run_command_helper, instead of terminating the process: the
    # caller records the failure and keeps collecting the remaining
    # diagnostics.
    xpk_print(
        f'{command_description} returned ERROR {return_code} with output:'
        f' {command_output}'
    )
    return 1

  prefix = f'Command Description: {command_description}\n'
  postfix = '========================================================'
  inspector_command_output = f'{prefix} \n{command_output} \n{postfix} \n'
  append_tmp_file(inspector_command_output, file)
  if args.print_to_terminal:
    xpk_print(inspector_command_output)
  return 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def inspector_output_link_helper(args, link, link_description, file) -> int:
  """Records a described link in the xpk inspector output file.

  Args:
    args: user provided arguments; only print_to_terminal is read here.
    link: link to output.
    link_description: describes what the link is for.
    file: file to add the link section to.

  Returns:
    Always 0.
  """
  link_section = '\n'.join([
      f'Link Description: {link_description}',
      f'Link: {link}',
      '========================================================',
  ])
  append_tmp_file(link_section, file)
  if args.print_to_terminal:
    xpk_print(link_section)
  return 0
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def inspector(args) -> None:
  """Collects cluster, Kueue and workload diagnostics into one report file.

  Runs a fixed series of gcloud / kubectl commands and workload list views,
  appends each result (plus Cloud Console links) to a temporary file through
  the inspector_* helpers, and finally hands the last non-zero return code
  (or 0 when every step succeeded) to xpk_exit.

  Args:
    args: user provided arguments for running the command. The filter_by_job
      and filter_by_status attributes are overwritten while the workload
      list views are generated.

  Returns:
    None. The outcome is reported through xpk_exit.
  """
  # Future Improvements for inspector:
  # - List what is next in Queue.
  # - Split inspector into different subcommands to parse info easier.

  # Non-zero return codes in the order they occurred; the last one becomes
  # the final return code.
  failed_codes = []
  xpk_print(args)

  add_zone_and_project(args)
  cluster_setup_code = set_cluster_command(args)
  if cluster_setup_code != 0:
    xpk_exit(cluster_setup_code)

  inspector_file = write_tmp_file(
      '==================\nXPK inspector OUTPUT:\n==================\n'
  )

  def run_command_step(shell_command, summary):
    """Runs one shell command step and records a failure, if any."""
    status = inspector_run_command_helper(
        args, shell_command, summary, inspector_file
    )
    if status != 0:
      failed_codes.append(status)
      xpk_print(
          f'inspector failed in command: {shell_command} description:'
          f' {summary} return code: {status}'
      )

  def run_workload_list_step(job_filter, status_filter):
    """Runs one workload list view and records a failure, if any."""
    args.filter_by_job = job_filter
    args.filter_by_status = status_filter
    summary = (
        f'xpk workload list --filter-by-status={args.filter_by_status}'
        f' --filter-by-job={args.filter_by_job}'
        f' --project={args.project} --zone={args.zone}'
        f' --cluster={args.cluster}'
    )
    status = inspector_run_workload_list_helper(args, summary, inspector_file)
    if status != 0:
      failed_codes.append(status)
      xpk_print(
          f'inspector failed in description: {summary} return code:'
          f' {status}'
      )

  region = zone_to_region(args.zone)

  # Shared by the "all nodes" and "healthy node count" steps.
  node_columns_command = (
      "kubectl get node -o custom-columns='NODE_NAME:metadata.name,"
      ' READY_STATUS:.status.conditions[?(@.type=="Ready")].status,'
      " NODEPOOL:metadata.labels.cloud\\.google\\.com/gke-nodepool'"
  )

  diagnostic_steps = [
      ('gcloud version', 'Local Setup: gcloud version'),
      (
          'gcloud config get project; gcloud config get compute/zone;'
          ' gcloud config get compute/region',
          'Local Setup: Project / Zone / Region',
      ),
      (
          'gcloud beta container clusters list --project'
          f' {args.project} --region {region} | grep -e'
          f' NAME -e {args.cluster}',
          'GKE: Cluster Details',
      ),
      (
          'kubectl get configmap'
          f' {args.cluster}-{CLUSTER_METADATA_CONFIGMAP} -o yaml',
          'GKE: Cluster Metadata ConfigMap Details',
      ),
      (
          'kubectl get configmap'
          f' {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} -o yaml',
          'GKE: Cluster Resources ConfigMap Details',
      ),
      (
          f'gcloud beta container node-pools list --cluster {args.cluster} '
          f' --project={args.project} --region={region}',
          'GKE: Node pool Details',
      ),
      (node_columns_command, 'Kubectl: All Nodes'),
      (
          'kubectl get node -o'
          " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
          ' | sort | uniq -c',
          'Kubectl: Number of Nodes per Node Pool',
      ),
      (
          node_columns_command
          + " | grep -w True | awk {'print $3'} | sort | uniq -c",
          'Kubectl: Healthy Node Count Per Node Pool',
      ),
      (
          f'kubectl describe ClusterQueue {CLUSTER_QUEUE_NAME}',
          'Kueue: ClusterQueue Details',
      ),
      (
          f'kubectl describe LocalQueue {LOCAL_QUEUE_NAME}',
          'Kueue: LocalQueue Details',
      ),
      ('kubectl describe ResourceFlavor', 'Kueue: ResourceFlavor Details'),
      (
          'kubectl describe Deployment kueue-controller-manager -n'
          ' kueue-system',
          'Kueue: Kueue Deployment Details',
      ),
      (
          'kubectl describe Deployment jobset-controller-manager -n'
          ' jobset-system',
          'Jobset: Deployment Details',
      ),
      (
          'kubectl logs deployment/kueue-controller-manager -n kueue-system'
          ' --tail=100 --prefix=True',
          'Kueue Manager Logs',
      ),
      (
          'kubectl logs deployment/jobset-controller-manager -n'
          ' jobset-system --tail=100 --prefix=True',
          'Jobset Manager Logs',
      ),
  ]
  for shell_command, summary in diagnostic_steps:
    run_command_step(shell_command, summary)

  # Workload list views:
  for status_filter in ('EVERYTHING', 'QUEUED', 'RUNNING'):
    run_workload_list_step(None, status_filter)

  # If a workload argument is provided, list out workload specific details.
  if args.workload:
    xpk_print(args.workload)
    run_workload_list_step(args.workload, 'EVERYTHING')
    run_command_step(
        f'kubectl describe jobsets {args.workload}',
        f'Jobset config for {args.workload}',
    )
    run_command_step(
        f'kubectl describe workloads jobset-{args.workload}',
        f'Workload config for {args.workload}',
    )

  # Cloud Console Links:
  console = 'https://console.cloud.google.com'
  console_links = [
      (
          'Cloud Console for the GKE Cluster',
          f'{console}/kubernetes/clusters/details/{region}/{args.cluster}'
          f'/details?project={args.project}',
      ),
      (
          'Cloud Console for all workloads in GKE Cluster',
          f'{console}/kubernetes/workload/overview?project={args.project}'
          f'&pageState=((gke%2F{region}%2F{args.cluster}))',
      ),
      (
          'Cloud Console for IAM Permissions',
          f'{console}/iam-admin/iam?project={args.project}',
      ),
      (
          'Cloud Console for Quotas',
          f'{console}/iam-admin/quotas?project={args.project}',
      ),
  ]
  if args.workload:
    # The workload specific link goes last.
    console_links.append((
        f'Cloud Console for the workload {args.workload}',
        f'{console}/kubernetes/service/{region}/{args.cluster}/default'
        f'/{args.workload}/details?project={args.project}',
    ))

  for summary, url in console_links:
    status = inspector_output_link_helper(args, url, summary, inspector_file)
    if status != 0:
      failed_codes.append(status)
      xpk_print(
          f'inspector failed in link: {url} description:'
          f' {summary} return code: {status}'
      )

  # Summarize inspector:
  xpk_print(f'Find xpk inspector output file: {inspector_file.name}')

  final_return_code = failed_codes[-1] if failed_codes else 0
  if final_return_code != 0:
    xpk_print(
        'Something was unable to run in xpk inspector, please look through the'
        ' output as it may clue to the failure reason. Return Code:'
        f' {final_return_code}'
    )
  xpk_exit(final_return_code)
|
xpk/commands/job.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .cluster import set_cluster_command
|
|
18
|
+
from .kind import set_local_cluster_command
|
|
19
|
+
from ..core.commands import run_command_for_value, run_command_with_updates
|
|
20
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
21
|
+
from ..core.kjob import AppProfileDefaults
|
|
22
|
+
from ..core.core import add_zone_and_project
|
|
23
|
+
from ruamel.yaml import YAML
|
|
24
|
+
import re
|
|
25
|
+
import sys
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def job_info(args):
  """Run commands obtaining information about a job given by name.

  Three commands are issued through run_command_for_value: a kubectl-kjob
  describe, a kubectl-kjob list filtered by name (YAML output) and a kubectl
  pod listing selected by the job-name label. Their results are combined
  into one mapping that is dumped as YAML to stdout. A non-zero return code
  from any command is printed and passed to xpk_exit.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None
  """
  job_name = args.name

  desc_command = f'kubectl-kjob describe slurm {job_name}'
  desc_code, desc_text = run_command_for_value(
      desc_command, 'Getting job data', args
  )
  if desc_code != 0:
    xpk_print(f'Data info request returned ERROR {desc_code}')
    xpk_exit(desc_code)

  job_command = (
      'kubectl-kjob list slurm -o yaml --field-selector'
      f' metadata.name=={job_name}'
  )
  job_code, job_text = run_command_for_value(
      job_command, 'Getting job info', args
  )
  if job_code != 0:
    xpk_print(f'Job info request returned ERROR {job_code}')
    xpk_exit(job_code)

  pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers'
  pods_code, pods_text = run_command_for_value(
      pods_command, 'Getting pods list', args
  )
  if pods_code != 0:
    xpk_print(f'Pods list request returned ERROR {pods_code}')
    xpk_exit(pods_code)

  yaml = YAML(typ='safe')
  # The listing may be empty (or the output blank) even though the command
  # succeeded; fail with a message instead of an IndexError / TypeError.
  job_listing = yaml.load(job_text) or {}
  job_items = job_listing.get('items') or []
  if not job_items:
    xpk_print(f'No job named {job_name} was found in the job list output')
    xpk_exit(1)
  job_yaml = job_items[0]

  output = {
      'Job name': job_name,
      'Script name': get_script_name(job_yaml),
      'Profile': get_profile(job_yaml),
      # Default 'metadata' to {} (as get_script_name does) so a missing key
      # cannot raise AttributeError.
      'Labels': job_yaml.get('metadata', {}).get('labels', []),
      'Mounts': get_mounts(job_yaml),
      'Pods': get_pods(pods_text),
      'Entrypoint environment variables template': get_kjob_env_vars(desc_text),
  }

  yaml.default_flow_style = False
  yaml.sort_base_mapping_type_on_output = False
  yaml.dump(output, sys.stdout)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_profile(job_yaml: dict) -> str:
  """Returns the value of the PROFILE env variable of the first container.

  Args:
    job_yaml: job definition as a nested mapping; containers are read from
      spec.template.spec.containers.

  Returns:
    The PROFILE value, or '' when there is no container, no PROFILE entry,
    or the entry carries no literal 'value' (e.g. it uses 'valueFrom').
  """
  containers = (
      job_yaml.get('spec', {})
      .get('template', {})
      .get('spec', {})
      .get('containers', [])
  )
  env_vars = next(iter(containers), {}).get('env', [])
  for env_var in env_vars:
    # Use .get so entries lacking 'name' or 'value' cannot raise KeyError.
    if env_var.get('name') == 'PROFILE':
      return env_var.get('value', '')
  return ''
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def get_mounts(job_yaml: dict) -> list[dict]:
  """Returns the volumeMounts of the first container of a job.

  Args:
    job_yaml: job definition as a nested mapping; containers are read from
      spec.template.spec.containers.

  Returns:
    The 'volumeMounts' list of the first container, or [] when there is no
    container or it declares no mounts.
  """
  # Walk down to the pod spec, treating every missing level as empty.
  node = job_yaml
  for key in ('spec', 'template', 'spec'):
    node = node.get(key, {})
  first_container = next(iter(node.get('containers', [])), {})
  return first_container.get('volumeMounts', [])
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_kjob_env_vars(job_desc_text: str) -> list[str]:
  """Extracts the SLURM_* variable assignments from a job description.

  Args:
    job_desc_text: text to search.

  Returns:
    Every 'SLURM_<NAME>=<rest of line>' match as a single string, in order
    of appearance. The pattern has one capture group, so re.findall yields
    plain strings rather than tuples.
  """
  regex = r'(SLURM_[A-Z_]*=.*)'
  return re.findall(regex, job_desc_text)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def get_pods(pods_text: str) -> list[dict[str, str]]:
  """Parses a header-less, whitespace separated pod listing.

  Args:
    pods_text: pod listing text, one pod per line, with the pod name in the
      first column and its status in the third.

  Returns:
    One {'Name': ..., 'Status': ...} mapping per pod line. Blank lines and
    lines with fewer than three columns are skipped, so empty output (no
    pods) yields [] instead of raising IndexError.
  """
  pods = []
  for line in pods_text.splitlines():
    columns = line.split()
    if len(columns) < 3:
      continue
    pods.append({'Name': columns[0], 'Status': columns[2]})
  return pods
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def get_script_name(job_yaml: dict) -> str | None:
|
|
126
|
+
return (
|
|
127
|
+
job_yaml.get('metadata', {})
|
|
128
|
+
.get('annotations', {})
|
|
129
|
+
.get('kjobctl.x-k8s.io/script', '')
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def job_list(args) -> None:
  """Lists jobs on the selected cluster and exits with the result code.

  For a kind cluster (args.kind_cluster) the local cluster is selected;
  otherwise zone and project are filled in and the cluster is selected with
  set_cluster_command. A non-zero code from either is passed to xpk_exit.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None
  """
  if args.kind_cluster:
    cluster_setup_code = set_local_cluster_command(args)
    banner = 'Listing jobs:'
  else:
    add_zone_and_project(args)
    cluster_setup_code = set_cluster_command(args)
    banner = f'Listing jobs for project {args.project} and zone {args.zone}:'

  if cluster_setup_code != 0:
    xpk_exit(cluster_setup_code)
  xpk_print(banner, flush=True)

  xpk_exit(run_slurm_job_list_command(args))
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def run_slurm_job_list_command(args) -> int:
  """Runs the kubectl-kjob slurm listing for the default app profile.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The return code of the listing command; a non-zero code is also printed.
  """
  profile_name = AppProfileDefaults.NAME.value
  list_command = f'kubectl-kjob list slurm --profile {profile_name}'

  status = run_command_with_updates(list_command, 'list jobs', args)
  if status != 0:
    xpk_print(f'Listing jobs returned ERROR {status}')
  return status
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def job_cancel(args) -> None:
  """Cancels the named job(s) on the selected cluster and exits.

  For a kind cluster (args.kind_cluster) the local cluster is selected;
  otherwise zone and project are filled in and the cluster is selected with
  set_cluster_command. A non-zero code from either is passed to xpk_exit,
  as is the result of the delete command.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None
  """
  xpk_print(f'Starting job cancel for job: {args.name}', flush=True)

  if args.kind_cluster:
    cluster_setup_code = set_local_cluster_command(args)
  else:
    add_zone_and_project(args)
    cluster_setup_code = set_cluster_command(args)

  if cluster_setup_code != 0:
    xpk_exit(cluster_setup_code)

  xpk_exit(run_slurm_job_delete_command(args))
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def run_slurm_job_delete_command(args) -> int:
  """Runs the kubectl-kjob slurm delete command for the given job names.

  Args:
    args: user provided arguments for running the command; args.name is
      joined with spaces to form the list of jobs to delete.

  Returns:
    The return code of the delete command; a non-zero code is also printed.
  """
  job_names = ' '.join(args.name)
  delete_command = f'kubectl-kjob delete slurm {job_names}'

  status = run_command_with_updates(delete_command, 'delete job', args)
  if status != 0:
    xpk_print(f'Delete job request returned ERROR {status}')
  return status
|