xpk-0.4.0-py3-none-any.whl → xpk-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +109 -0
- xpk/commands/cluster.py +784 -0
- xpk/commands/cluster_gcluster.py +185 -0
- xpk/commands/info.py +245 -0
- xpk/commands/inspector.py +363 -0
- xpk/commands/job.py +197 -0
- xpk/commands/kind.py +253 -0
- xpk/commands/shell.py +120 -0
- xpk/commands/version.py +39 -0
- xpk/commands/workload.py +692 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +61 -0
- xpk/core/blueprint/blueprint_generator.py +652 -0
- xpk/core/cluster_private.py +197 -0
- xpk/core/commands.py +352 -0
- xpk/core/core.py +2824 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/gcluster_manager.py +158 -0
- xpk/core/kjob.py +205 -0
- xpk/core/kueue.py +352 -0
- xpk/core/nap.py +349 -0
- xpk/core/pathways.py +298 -0
- xpk/core/ray.py +222 -0
- xpk/core/system_characteristics.py +1395 -0
- xpk/core/workload.py +133 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +109 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
- xpk/main.py +73 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +184 -0
- xpk/parser/cluster.py +621 -0
- xpk/parser/common.py +71 -0
- xpk/parser/core.py +109 -0
- xpk/parser/info.py +63 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +126 -0
- xpk/parser/kind.py +94 -0
- xpk/parser/shell.py +50 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +684 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +85 -0
- xpk/utils/yaml.py +30 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/METADATA +307 -38
- xpk-0.6.0.dist-info/RECORD +57 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
- xpk-0.6.0.dist-info/entry_points.txt +2 -0
- xpk-0.4.0.dist-info/RECORD +0 -7
- xpk-0.4.0.dist-info/entry_points.txt +0 -2
- xpk.py +0 -7218
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py
ADDED
@@ -0,0 +1,692 @@
+"""
+Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from ..core.commands import (
+    run_command_with_updates,
+    run_commands,
+)
+from ..core.core import (
+    CLUSTER_METADATA_CONFIGMAP,
+    VERTEX_TENSORBOARD_FEATURE_FLAG,
+    AcceleratorTypeToAcceleratorCharacteristics,
+    add_zone_and_project,
+    check_if_workload_can_schedule,
+    check_if_workload_exists,
+    create_accelerator_label,
+    create_machine_label,
+    create_vertex_experiment,
+    get_cluster_configmap,
+    get_cpu_affinity,
+    get_gke_outlier_dashboard,
+    get_gpu_rxdm_cmd,
+    get_gpu_rxdm_image,
+    get_gpu_scheduler,
+    get_gpu_tcp_volume,
+    get_gpu_volume,
+    get_user_workload_container,
+    get_volumes,
+    parse_env_config,
+    wait_for_job_completion,
+    xpk_current_version,
+    zone_to_region,
+)
+from ..core.kueue import LOCAL_QUEUE_NAME
+from ..core.nap import (
+    get_autoprovisioning_node_selector_args,
+    is_autoprovisioning_enabled,
+)
+from ..core.pathways import (
+    ensure_pathways_workload_prerequisites,
+    get_pathways_proxy_args,
+    get_pathways_rm_args,
+    get_pathways_unified_query_link,
+    get_pathways_worker_args,
+    get_user_workload_for_pathways,
+)
+from ..core.system_characteristics import (
+    AcceleratorType,
+    get_system_characteristics,
+)
+from ..core.workload import get_workload_list
+from ..utils.console import get_user_input, xpk_exit, xpk_print
+from ..utils.file import write_tmp_file
+from .cluster import set_cluster_command
+from ..core.workload_decorators import tcpxo_decorator, rdma_decorator
+from . import cluster_gcluster
+
+workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: {args.workload}
+  labels:
+    kueue.x-k8s.io/queue-name: {local_queue_name}  # Name of the LocalQueue
+    xpk.google.com/workload: {args.workload}
+  annotations:
+    alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool  # 1:1 job replica to node pool assignment
+spec:
+  ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
+  failurePolicy:
+    maxRestarts: {args.max_restarts}
+  replicatedJobs:
+    - name: slice-job
+      replicas: {args.num_slices}
+      template:
+        spec:
+          parallelism: {system.vms_per_slice}  # Equal to the number of VMs per slice
+          completions: {system.vms_per_slice}  # Same as the above.
+          backoffLimit: 0  # When any pod fails, the job is failed
+          template:
+            metadata:
+              labels:
+                xpk.google.com/workload: {args.workload}
+            spec:
+              schedulerName: {args.scheduler}
+              restartPolicy: Never
+              {affinity}
+              nodeSelector:
+                {accelerator_label}
+                {machine_label}
+                {autoprovisioning_args}
+              priorityClassName: {args.priority}
+              hostNetwork: true
+              dnsPolicy: ClusterFirstWithHostNet
+              terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+              containers:
+              {container}
+              volumes:
+              {volumes}
+"""
+
+
+gpu_workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: {args.workload}
+  labels:
+    kueue.x-k8s.io/queue-name: multislice-queue  # Name of the LocalQueue
+    xpk.google.com/workload: {args.workload}
+spec:
+  ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
+  failurePolicy:
+    maxRestarts: {args.max_restarts}
+  replicatedJobs:
+    - name: slice-job
+      replicas: 1
+      template:
+        spec:
+          parallelism: {args.num_nodes}
+          completions: {args.num_nodes}
+          backoffLimit: 0  # When any pod fails, the job is failed
+          template:
+            metadata:
+              labels:
+                xpk.google.com/workload: {args.workload}
+            spec:
+              {gpu_scheduler}
+              priorityClassName: {args.priority}
+              restartPolicy: Never
+              hostNetwork: true
+              dnsPolicy: ClusterFirstWithHostNet
+              terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+              tolerations:
+              - operator: "Exists"
+                key: nvidia.com/gpu
+              volumes:
+              {gpu_volume}
+              containers:
+              {gpu_rxdm_image}
+                imagePullPolicy: Always
+                command:
+                - "bash"
+                - "-c"
+                - |
+                  {gpu_rxdm_cmd} &
+                  while [ ! -e "/usr/share/workload/workload_terminated" ]; do sleep 10; echo "sleeping"; done
+                securityContext:
+                  privileged: true
+                volumeMounts:
+                {gpu_tcp_volume}
+                - name: nvidia-install-dir-host
+                  mountPath: /usr/local/nvidia/lib64
+                - name: workload-terminated-volume
+                  mountPath: /usr/share/workload
+                env:
+                - name: LD_LIBRARY_PATH
+                  value: /usr/local/nvidia/lib64
+              {container}
+"""
+
+a3_gpu_workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: {args.workload}
+  labels:
+    kueue.x-k8s.io/queue-name: multislice-queue  # Name of the LocalQueue
+    xpk.google.com/workload: {args.workload}
+spec:
+  ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
+  failurePolicy:
+    maxRestarts: {args.max_restarts}
+  replicatedJobs:
+    - name: slice-job
+      replicas: 1
+      template:
+        spec:
+          parallelism: {args.num_nodes}
+          completions: {args.num_nodes}
+          backoffLimit: 0  # When any pod fails, the job is failed
+          template:
+            metadata:
+              labels:
+                xpk.google.com/workload: {args.workload}
+              annotations:
+                kueue.x-k8s.io/podset-preferred-topology: "cloud.google.com/gce-topology-host"
+            spec:
+              priorityClassName: {args.priority}
+              restartPolicy: Never
+              dnsPolicy: ClusterFirstWithHostNet
+              terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+              tolerations:
+              - operator: "Exists"
+                key: nvidia.com/gpu
+              containers:
+              {container}
+"""
+
+pw_workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: {args.workload}
+  labels:
+    kueue.x-k8s.io/queue-name: {local_queue_name}  # Name of the LocalQueue
+    xpk.google.com/workload: {args.workload}
+spec:
+  ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
+  failurePolicy:
+    maxRestarts: {args.max_restarts}
+  successPolicy:
+    operator: "All"
+    targetReplicatedJobs:
+      - {args.targetReplicatedJob}
+  replicatedJobs:
+    - name: worker
+      replicas: {args.num_slices}
+      template:
+        metadata:
+          annotations:
+            alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
+          labels:
+            xpk.google.com/workload: {args.workload}
+        spec:
+          backoffLimit: {backoff_limit}
+          completions: {system.vms_per_slice}
+          parallelism: {system.vms_per_slice}
+          template:
+            spec:
+              terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+              containers:
+              - args:
+                {pathways_worker_args}
+                image: {args.server_image}
+                imagePullPolicy: Always
+                name: pathways-worker
+                ports:
+                - containerPort: 29001
+                - containerPort: 8471
+                - containerPort: 8080
+                resources:
+                  limits:
+                    {resource_type}: {system.chips_per_vm}
+                securityContext:
+                  privileged: true
+                volumeMounts:
+                - mountPath: /tmp
+                  name: shared-tmp
+              nodeSelector:
+                {accelerator_label}
+                {machine_label}
+                {autoprovisioning_args}
+              priorityClassName: {args.priority}
+              hostNetwork: true
+              dnsPolicy: ClusterFirstWithHostNet
+              volumes:
+              - hostPath:
+                  path: /tmp
+                  type: DirectoryOrCreate
+                name: shared-tmp
+    - name: rm
+      replicas: 1
+      template:
+        metadata:
+          labels:
+            xpk.google.com/workload: {args.workload}
+        spec:
+          backoffLimit: 0
+          completions: 1
+          parallelism: 1
+          template:
+            spec:
+              containers:
+              - args:
+                {pathways_rm_args}
+                env:
+                - name: REPLICATED_JOB_NAME
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
+                - name: JOBSET_NAME
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
+                - name: HOST_ADDRESS
+                  value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
+                - name: TPU_SKIP_MDS_QUERY
+                  value: "true"
+                image: {args.server_image}
+                imagePullPolicy: Always
+                name: pathways-rm
+                ports:
+                - containerPort: 29001
+                securityContext:
+                  privileged: true
+                volumeMounts:
+                - mountPath: /tmp
+                  name: shared-tmp
+              nodeSelector:
+                cloud.google.com/gke-nodepool: cpu-rm-np
+              hostNetwork: true
+              dnsPolicy: ClusterFirstWithHostNet
+              volumes:
+              - hostPath:
+                  path: /tmp
+                  type: DirectoryOrCreate
+                name: shared-tmp
+    - name: proxy
+      replicas: 1
+      template:
+        metadata:
+          labels:
+            xpk.google.com/workload: {args.workload}
+        spec:
+          backoffLimit: 0
+          completions: 1
+          parallelism: 1
+          template:
+            spec:
+              containers:
+              - args:
+                {pathways_proxy_args}
+                image: {args.proxy_server_image}
+                imagePullPolicy: Always
+                name: pathways-proxy
+                ports:
+                - containerPort: 29000
+              hostNetwork: true
+              dnsPolicy: ClusterFirstWithHostNet
+              nodeSelector:
+                cloud.google.com/gke-nodepool: cpu-proxy-np
+    {user_workload}
+"""
+
+
+def workload_create_pathways(args) -> None:
+  """Run jobset apply command for a file, specifically for Pathways.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  args.use_pathways = True
+  workload_create(args)
+
+
+def workload_create(args) -> None:
+  """Run jobset apply command for a file.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  add_zone_and_project(args)
+
+  if args.headless:
+    xpk_print(
+        'Please use kubectl port forwarding to connect to the Pathways proxy.'
+        ' kubectl get pods kubectl port-forward <proxy-pod-name> 29000:29000'
+        ' JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 python'
+        " -c 'import pathwaysutils; import jax; print(jax.devices())'"
+    )
+
+  set_cluster_command_code = set_cluster_command(args)
+  if set_cluster_command_code != 0:
+    xpk_exit(set_cluster_command_code)
+
+  workload_exists = check_if_workload_exists(args)
+
+  if workload_exists:
+    xpk_print(
+        f'{args.workload} already exists, XPK will not create this workload.'
+        ' Please pick a new workload name'
+    )
+    xpk_exit(1)
+
+  xpk_print('Starting workload create', flush=True)
+  system, return_code = get_system_characteristics(args)
+
+  if return_code > 0:
+    xpk_print('Fetching system characteristics failed!')
+    xpk_exit(return_code)
+
+  if not check_if_workload_can_schedule(args, system):
+    xpk_exit(1)
+
+  xpk_print('Starting workload create', flush=True)
+
+  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
+  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
+  cluster_xpk_version = None
+  if cluster_config_map is None:
+    xpk_print(
+        f'Warning: Unable to find ConfigMap: {metadata_configmap_name} for the'
+        ' cluster. We recommend to upgrade your cluster by running `xpk'
+        ' cluster create`.'
+    )
+  else:
+    cluster_xpk_version = cluster_config_map.get('xpk_version')
+  if (
+      cluster_xpk_version is not None
+      and cluster_xpk_version != xpk_current_version
+  ):
+    xpk_print(
+        'Warning: Cluster has been created using XPK version:'
+        f' {cluster_config_map["xpk_version"]} but the XPK version you are'
+        f' using to schedule workload is: {xpk_current_version}. Some features'
+        ' might not be available for this cluster. We recommend to'
+        ' upgrade/downgrade your XPK version or cluster by running `xpk'
+        ' cluster create`.'
+    )
+
+  debugging_dashboard_id = None
+
+  tensorboard_config = {}
+  if VERTEX_TENSORBOARD_FEATURE_FLAG and args.use_vertex_tensorboard:
+    tensorboard_config = create_vertex_experiment(args)
+    # exit if failed to create Experiment in Vertex AI
+    if not tensorboard_config:
+      xpk_exit(1)
+
+  parse_env_config(args, tensorboard_config, system)
+
+  # Currently autoprovisioning is not enabled for Pathways workloads.
+  autoprovisioning_args = ''
+  autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
+      args, system
+  )
+  if return_code != 0:
+    xpk_exit(return_code)
+  if autoprovisioning_enabled:
+    # Determine NAP capacity type
+    autoprovisioning_args, return_code = (
+        get_autoprovisioning_node_selector_args(args)
+    )
+    if return_code != 0:
+      xpk_exit(return_code)
+
+  # Create the workload file based on accelerator type or workload type.
+  if system.accelerator_type == AcceleratorType['GPU']:
+    container, debugging_dashboard_id = get_user_workload_container(
+        args, system
+    )
+    gpu_scheduler, return_code = get_gpu_scheduler(
+        args, system, autoprovisioning_args
+    )
+    if return_code != 0:
+      xpk_exit(return_code)
+
+    if system.device_type in cluster_gcluster.supported_device_types:
+      yml_string = a3_gpu_workload_create_yaml.format(
+          args=args, container=container
+      )
+
+      if args.device_type == cluster_gcluster.a3mega_device_type:
+        sub_networks = [f'{args.cluster}-gpunet-{i}-subnet' for i in range(8)]
+        yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
+
+      if args.device_type == cluster_gcluster.a3ultra_device_type:
+        sub_networks = [f'{args.cluster}-sub-1'] + [
+            f'{args.cluster}-rdma-sub-{i}' for i in range(8)
+        ]
+        yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
+    else:
+      yml_string = gpu_workload_create_yaml.format(
+          args=args,
+          container=container,
+          command=args.command,
+          chips_per_vm=system.chips_per_vm,
+          gpu_scheduler=gpu_scheduler,
+          gpu_volume=get_gpu_volume(system),
+          gpu_rxdm_image=get_gpu_rxdm_image(system),
+          gpu_rxdm_cmd=get_gpu_rxdm_cmd(system),
+          gpu_tcp_volume=get_gpu_tcp_volume(system),
+      )
+  elif args.use_pathways and ensure_pathways_workload_prerequisites(
+      args, system
+  ):
+    yml_string = pw_workload_create_yaml.format(
+        args=args,
+        system=system,
+        accelerator_label=create_accelerator_label(
+            system.accelerator_type, system
+        ),
+        machine_label=create_machine_label(system.accelerator_type, system),
+        pathways_rm_args=get_pathways_rm_args(args, system),
+        pathways_worker_args=get_pathways_worker_args(args),
+        pathways_proxy_args=get_pathways_proxy_args(args),
+        user_workload=get_user_workload_for_pathways(args, system),
+        resource_type=AcceleratorTypeToAcceleratorCharacteristics[
+            system.accelerator_type
+        ].resource_type,
+        local_queue_name=LOCAL_QUEUE_NAME,
+        autoprovisioning_args=autoprovisioning_args,
+        backoff_limit=system.vms_per_slice * 4,
+    )
+  else:
+    container, debugging_dashboard_id = get_user_workload_container(
+        args, system
+    )
+    yml_string = workload_create_yaml.format(
+        args=args,
+        system=system,
+        container=container,
+        affinity=get_cpu_affinity(system.accelerator_type),
+        accelerator_label=create_accelerator_label(
+            system.accelerator_type, system
+        ),
+        machine_label=create_machine_label(system.accelerator_type, system),
+        local_queue_name=LOCAL_QUEUE_NAME,
+        autoprovisioning_args=autoprovisioning_args,
+        volumes=get_volumes(args, system),
+    )
+  tmp = write_tmp_file(yml_string)
+  command = f'kubectl apply -f {str(tmp.file.name)}'
+  return_code = run_command_with_updates(command, 'Creating Workload', args)
+
+  if return_code != 0:
+    xpk_print(f'Create Workload request returned ERROR {return_code}')
+    xpk_exit(return_code)
+
+  # Get GKE outlier dashboard for TPU
+  outlier_dashboard_id = None
+  if system.accelerator_type == AcceleratorType['TPU']:
+    outlier_dashboard_id = get_gke_outlier_dashboard(args)
+
+  # Outlier and debugging dashboards
+  if outlier_dashboard_id is not None:
+    xpk_print(
+        'Check statistics and outlier mode of GKE metrics here:'
+        # pylint: disable=line-too-long
+        f' https://console.cloud.google.com/monitoring/dashboards/builder/{outlier_dashboard_id}?project={args.project}&f.rlabel.cluster_name.ClusterName={args.cluster}.'
+        ' To view the metric data for your workload, select'
+        f' {args.workload} from the JobName filter on the dashboard.'
+    )
+
+  if debugging_dashboard_id is not None:
+    xpk_print(
+        'Check stack traces collected in Cloud Logging here:'
+        # pylint: disable=line-too-long
+        f' https://console.cloud.google.com/monitoring/dashboards/builder/{debugging_dashboard_id}?project={args.project}&f.rlabel.cluster_name.ClusterName={args.cluster}.'
+        ' To view the stack traces for your workload, select'
+        f' {args.workload} from the JobName filter on the dashboard.'
+    )
+
+  if args.use_pathways:
+    if args.headless:
+      xpk_print(
+          ' \n ******* Please connect to your Pathways proxy at'
+          f' {args.pathways_proxy_address}, once you see "IFRT proxy server'
+          ' started with status OK" on the proxy link below.'
+          ' Remember to delete the workload once done! ****** \n'
+      )
+    pathways_proxy_link = f'https://console.cloud.google.com/kubernetes/job/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
+    xpk_print(
+        'Follow the proxy here:'
+        # pylint: disable=line-too-long
+        f' {pathways_proxy_link} '
+    )
+    xpk_print(
+        'Follow your Pathways workload and other resources here : '
+        f'{get_pathways_unified_query_link(args)}'
+    )
+  else:
+    xpk_print(
+        'Follow your workload here:'
+        # pylint: disable=line-too-long
+        f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
+    )
+    duration_of_logs = 'P1D'  # Past 1 Day
+    xpk_print(
+        'Follow your worker 0, slice 0 logs here:'
+        ' Adjust the pod name'
+        ' ([prefix]-slice-job-[slice_number]-[worker_number])'
+        ' after clicking the url if you want other worker logs.'
+        # pylint: disable=line-too-long
+        f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{zone_to_region(args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
+    )
+
+  xpk_exit(0)
+
+
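
workload_create ends by writing the rendered JobSet to a temporary file and shelling out to `kubectl apply` through run_command_with_updates. A self-contained sketch of that final step, without xpk's helpers (which add streamed progress output):

import subprocess
import tempfile

def apply_jobset(yml_string: str) -> int:
  # Write the rendered manifest to disk, then apply it; returns the
  # kubectl exit code, which is what the create path checks.
  with tempfile.NamedTemporaryFile('w', suffix='.yaml', delete=False) as tmp:
    tmp.write(yml_string)
    path = tmp.name
  return subprocess.run(['kubectl', 'apply', '-f', path], check=False).returncode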
+def workload_delete(args) -> None:
+  """Function around workload delete.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  xpk_print('Starting Workload delete', flush=True)
+  add_zone_and_project(args)
+  set_cluster_command_code = set_cluster_command(args)
+  if set_cluster_command_code != 0:
+    xpk_exit(set_cluster_command_code)
+
+  will_delete = True
+  if not args.workload:
+    xpk_print('Get the name of the workloads in the cluster.')
+    return_code, return_value = get_workload_list(args)
+
+    if return_code != 0:
+      xpk_print(f'List Job request returned ERROR {return_code}')
+      xpk_exit(return_code)
+    # Skip the header
+    workloads = [x.split(' ')[0] for x in return_value.splitlines()][1:]
+    if workloads and not args.force:
+      will_delete = get_user_input(
+          f'Planning to delete {len(workloads)} workloads in the cluster'
+          f' {args.cluster} including {workloads}. \nDo you wish to delete: y'
+          ' (yes) / n (no):\n'
+      )
+  else:
+    workloads = [args.workload]
+
+  if not workloads:
+    xpk_print(
+        'There are no workloads to delete matching the filter in the cluster.'
+    )
+  elif not will_delete:
+    xpk_print('Skipping delete command.')
+  else:
+    commands = []
+    task_names = []
+    for workload in workloads:
+      args.workload = workload
+      command = f'kubectl delete jobset {workload} -n default'
+      task_name = f'WorkloadDelete-{workload}'
+      commands.append(command)
+      task_names.append(task_name)
+
+    # Not batching deletion for single workload
+    if len(workloads) == 1:
+      return_code = run_command_with_updates(
+          commands[0], 'Delete Workload', args
+      )
+    else:
+      return_code = run_commands(
+          commands, 'Delete Workload', task_names, batch=100
+      )
+
+    if return_code != 0:
+      xpk_print(f'Delete Workload request returned ERROR {return_code}')
+      xpk_exit(return_code)
+  xpk_exit(0)
+
+
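
The bulk-delete path above recovers workload names from get_workload_list output by taking the first space-delimited token of each line and dropping the header row. On kubectl-style tabular output the comprehension behaves like this (sample output invented for illustration):

return_value = (
    'NAME                STATUS\n'
    'xpk-test-workload   Finished\n'
    'xpk-other-workload  Running\n'
)
workloads = [x.split(' ')[0] for x in return_value.splitlines()][1:]
print(workloads)  # ['xpk-test-workload', 'xpk-other-workload']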
+def workload_list(args) -> None:
+  """Function around workload list.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  xpk_print(args)
+
+  xpk_print('Starting workload list', flush=True)
+  add_zone_and_project(args)
+  set_cluster_command_code = set_cluster_command(args)
+  if set_cluster_command_code != 0:
+    xpk_exit(set_cluster_command_code)
+
+  if args.wait_for_job_completion:
+    return_code = wait_for_job_completion(args)
+    if return_code != 0:
+      xpk_print(f'Wait for job completion returned ERROR {return_code}')
+      xpk_exit(return_code)
+    args.filter_by_job = args.wait_for_job_completion
+
+  return_code, return_value = get_workload_list(args)
+
+  if return_code != 0:
+    xpk_print(f'List Job request returned ERROR {return_code}')
+    xpk_exit(return_code)
+  xpk_print(f'Workload List Output:\n{return_value}')
+  xpk_exit(0)
xpk/core/__init__.py
ADDED
@@ -0,0 +1,15 @@
+"""
+Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""