xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -13
- xpk/commands/cluster.py +240 -71
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/common.py +33 -1
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +30 -18
- xpk/commands/run.py +17 -12
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +75 -19
- xpk/commands/workload.py +161 -324
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +335 -45
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +193 -12
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +5 -1
- xpk/core/gcsfuse.py +27 -6
- xpk/core/kjob.py +66 -20
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/nap.py +4 -0
- xpk/core/network.py +34 -22
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/resources.py +21 -0
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +28 -83
- xpk/core/workload_decorators/rdma_decorator.py +11 -15
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
- xpk/parser/cluster.py +574 -381
- xpk/parser/storage.py +25 -5
- xpk/parser/workload.py +59 -31
- xpk/utils/kubectl.py +4 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py
CHANGED
@@ -14,20 +14,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-from ..core.blueprint.blueprint_generator import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
 from ..core.cluster import (
+    XPK_SA,
     create_xpk_k8s_service_account,
     get_cluster_credentials,
     setup_k8s_env,
-    XPK_SA,
 )
 from ..core.commands import run_command_with_updates, run_commands
-from ..core.config import
+from ..core.config import (
+    VERTEX_TENSORBOARD_FEATURE_FLAG,
+    XPK_CURRENT_VERSION,
+    parse_env_config,
+)
 from ..core.docker_container import (
     get_main_container_docker_image,
     get_user_workload_container,
 )
-
 from ..core.docker_resources import get_volumes
 from ..core.gcloud_context import add_zone_and_project
 from ..core.kueue import LOCAL_QUEUE_NAME
@@ -36,54 +38,58 @@ from ..core.nap import (
     get_autoprovisioning_node_selector_args,
     is_autoprovisioning_enabled,
 )
+from ..core.network import get_cluster_subnetworks
 from ..core.pathways import (
+    append_custom_colocated_python_sidecar,
+    append_custom_pathways_proxy_server,
+    append_custom_pathways_server,
+    append_custom_pathways_worker,
+    check_if_pathways_job_is_installed,
     ensure_pathways_workload_prerequisites,
-    get_pathways_proxy_args,
-    get_pathways_rm_args,
-    get_pathways_sidecar_container,
     get_pathways_unified_query_link,
-    get_pathways_worker_args,
     get_user_workload_for_pathways,
+    try_to_delete_pathwaysjob_first,
 )
 from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
 from ..core.scheduling import (
     check_if_workload_can_schedule,
     create_accelerator_label,
     create_machine_label,
+    create_tpu_machine_type,
+    create_tpu_topology,
     get_cpu_affinity,
     get_gpu_scheduler,
 )
 from ..core.storage import (
-
+    GCE_PD_TYPE,
     GCP_FILESTORE_TYPE,
+    GCS_FUSE_TYPE,
+    PARALLELSTORE_TYPE,
     Storage,
     add_bucket_iam_members,
-
-    get_storage_volumes_yaml,
+    get_storage_annotations,
     get_storages_to_mount,
-    get_storage_volume_mounts_yaml_for_gpu,
-    get_storage_volumes_yaml_for_gpu,
-    GCS_FUSE_ANNOTATION,
 )
 from ..core.system_characteristics import (
     AcceleratorType,
-    AcceleratorTypeToAcceleratorCharacteristics,
     get_system_characteristics,
 )
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
+    add_gpu_rxdm_container,
     check_if_workload_exists,
-    get_gpu_rxdm_cmd,
-    get_gpu_rxdm_image,
-    get_gpu_tcp_volume,
-    get_gpu_volume,
     get_workload_list,
     wait_for_job_completion,
     zone_to_region,
 )
-from ..core.workload_decorators import
+from ..core.workload_decorators import (
+    rdma_decorator,
+    storage_decorator,
+    tcpxo_decorator,
+)
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from .common import is_TAS_possible
 from . import cluster_gcluster
 
 WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
@@ -139,7 +145,8 @@ GPU_WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
 metadata:
   name: {args.workload}
-  annotations:
+  annotations:
+    {storage_annotations}
   labels:
     kueue.x-k8s.io/queue-name: multislice-queue  # Name of the LocalQueue
     xpk.google.com/workload: {args.workload}
@@ -176,29 +183,8 @@ spec:
             - operator: "Exists"
               key: nvidia.com/gpu
           volumes:
-          {
-          {storage_volumes}
+          {volumes}
           containers:
-          {gpu_rxdm_image}
-            imagePullPolicy: Always
-            command:
-            - "bash"
-            - "-c"
-            - |
-              {gpu_rxdm_cmd} &
-              while [ ! -e "/usr/share/workload/workload_terminated" ]; do sleep 10; echo "sleeping"; done
-            securityContext:
-              privileged: true
-            volumeMounts:
-            {gpu_tcp_volume}
-            {storage_volume_mounts}
-            - name: nvidia-install-dir-host
-              mountPath: /usr/local/nvidia/lib64
-            - name: workload-terminated-volume
-              mountPath: /usr/share/workload
-            env:
-            - name: LD_LIBRARY_PATH
-              value: /usr/local/nvidia/lib64
           {container}
 """
 
@@ -228,7 +214,7 @@ spec:
           labels:
             xpk.google.com/workload: {args.workload}
           annotations:
-
+            {kueue_TAS_annotation}
         spec:
           priorityClassName: {args.priority}
           restartPolicy: Never
@@ -241,219 +227,37 @@ spec:
           containers:
           {container}
 """
-
-PW_WORKLOAD_CREATE_YAML = """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          metadata:
-            annotations:
-              {storage_annotations}
-          spec:
-            terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
-            serviceAccountName: {service_account}
-            containers:
-            - args:
-              {pathways_worker_args}
-              image: {args.server_image}
-              imagePullPolicy: Always
-              name: pathways-worker
-              ports:
-              - containerPort: 29001
-              - containerPort: 8471
-              - containerPort: 8080
-              resources:
-                limits:
-                  {resource_type}: {system.chips_per_vm}
-              securityContext:
-                privileged: true
-              volumeMounts:
-              - mountPath: /tmp
-                name: shared-tmp
-              {storage_volume_mounts}
-              env:
-              - name: PROJECT_ID
-                value: {args.project}
-              - name: LOCATION
-                value: {args.zone}
-              - name: CLUSTER_NAME
-                value: {args.cluster}
-              - name: POD_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.name
-              - name: CONTAINER_NAME
-                value: "pathways-worker"
-              - name: NAMESPACE
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.namespace
-              # Workaround for v6e
-              - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER
-                value: "false"
-              - name: MEGASCALE_NUM_SLICES
-                valueFrom:
-                  fieldRef:
-                    fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']"
-              - name: JOBSET_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
-              - name: REPLICATED_JOB_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
-              - name: MEGASCALE_SLICE_ID
-                valueFrom:
-                  fieldRef:
-                    fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']"
-              - name: MEGASCALE_COORDINATOR_ADDRESS
-                value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)"
-            {pathways_sidecar_container}
-            nodeSelector:
-              {accelerator_label}
-              {machine_label}
-              {autoprovisioning_args}
-            priorityClassName: {args.priority}
-            hostNetwork: true
-            dnsPolicy: ClusterFirstWithHostNet
-            volumes:
-            - hostPath:
-                path: /tmp
-                type: DirectoryOrCreate
-              name: shared-tmp
-            {storage_volumes}
-  - name: rm
-    replicas: 1
-    template:
-      metadata:
-        labels:
-          xpk.google.com/workload: {args.workload}
-      spec:
-        backoffLimit: 0
-        completions: 1
-        parallelism: 1
-        template:
-          spec:
-            containers:
-            - args:
-              {pathways_rm_args}
-              env:
-              - name: PROJECT_ID
-                value: {args.project}
-              - name: LOCATION
-                value: {args.zone}
-              - name: CLUSTER_NAME
-                value: {args.cluster}
-              - name: POD_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.name
-              - name: CONTAINER_NAME
-                value: "pathways-rm"
-              - name: NAMESPACE
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.namespace
-              - name: REPLICATED_JOB_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
-              - name: JOBSET_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
-              - name: HOST_ADDRESS
-                value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
-              - name: TPU_SKIP_MDS_QUERY
-                value: "true"
-              image: {args.server_image}
-              imagePullPolicy: Always
-              name: pathways-rm
-              ports:
-              - containerPort: 29001
-              securityContext:
-                privileged: true
-              volumeMounts:
-              - mountPath: /tmp
-                name: shared-tmp
-            nodeSelector:
-              cloud.google.com/gke-nodepool: cpu-rm-np
-            hostNetwork: true
-            dnsPolicy: ClusterFirstWithHostNet
-            volumes:
-            - hostPath:
-                path: /tmp
-                type: DirectoryOrCreate
-              name: shared-tmp
-  - name: proxy
-    replicas: 1
-    template:
-      metadata:
-        labels:
-          xpk.google.com/workload: {args.workload}
-      spec:
-        backoffLimit: 0
-        completions: 1
-        parallelism: 1
-        template:
-          spec:
-            containers:
-            - args:
-              {pathways_proxy_args}
-              env:
-              - name: PROJECT_ID
-                value: {args.project}
-              - name: LOCATION
-                value: {args.zone}
-              - name: CLUSTER_NAME
-                value: {args.cluster}
-              - name: POD_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.name
-              - name: CONTAINER_NAME
-                value: "pathways-proxy"
-              - name: NAMESPACE
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.namespace
-              image: {args.proxy_server_image}
-              imagePullPolicy: Always
-              name: pathways-proxy
-              ports:
-              - containerPort: 29000
-            hostNetwork: true
-            dnsPolicy: ClusterFirstWithHostNet
-            nodeSelector:
-              cloud.google.com/gke-nodepool: cpu-proxy-np
-{user_workload}
+# The indentation of PW_WORKLOAD_CREATE_YAML is intentional to allow reusing the user workload container YAML.
+PW_WORKLOAD_CREATE_YAML = """
+apiVersion: pathways-job.pathways.domain/v1
+kind: PathwaysJob
+metadata:
+  name: {args.workload}
+  labels:
+    kueue.x-k8s.io/queue-name: {local_queue_name}  # Name of the LocalQueue
+    xpk.google.com/workload: {args.workload}
+spec:
+  maxRestarts: {args.max_restarts}
+  customComponents:
+  {custom_pathways_proxy_server}
+  {custom_pathways_server}
+  {custom_pathways_worker}
+  {colocated_python_sidecar}
+  workers:
+  - type: {machine_type}
+    topology: {topology}
+    numSlices: {args.num_slices}
+    maxSliceRestarts: {args.max_slice_restarts}
+    terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+  priorityClassName: {args.priority}
+  pathwaysDir: {args.pathways_gcs_location}  # This bucket needs to be created in advance.
+  controller:
+    # Pod template for training, default mode.
+    deploymentMode: default
+    mainContainerName: {args.docker_name}
+    elasticSlices: {args.elastic_slices}
+    template:
+{user_workload}
 """
 
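The replacement template above is rendered with plain str.format, so each placeholder has to sit at the exact column its surrounding YAML expects; {user_workload} stays at column zero because the caller passes in a fragment that is already indented to land under controller.template. A minimal sketch of that mechanism, using simplified stand-in values rather than xpk's real arguments:

```python
# Simplified stand-in for PW_WORKLOAD_CREATE_YAML; the real template also
# receives args, system characteristics, and generated component fragments.
TEMPLATE = """apiVersion: pathways-job.pathways.domain/v1
kind: PathwaysJob
metadata:
  name: {workload}
spec:
  controller:
    deploymentMode: default
    template:
{user_workload}
"""

# The user workload fragment arrives pre-indented, which is why the
# {user_workload} placeholder sits at column zero inside the template.
user_workload = "\n".join(
    "      " + line for line in ("spec:", "  containers:", "  - name: main")
)

print(TEMPLATE.format(workload="demo-workload", user_workload=user_workload))
```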
@@ -545,7 +349,6 @@ def workload_create(args) -> None:
 
   parse_env_config(args, tensorboard_config, system)
 
-  # Currently autoprovisioning is not enabled for Pathways workloads.
   autoprovisioning_args = ''
   autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
       args, system
@@ -560,28 +363,72 @@
   if return_code != 0:
     xpk_exit(return_code)
 
-  storages: list[Storage] = get_storages_to_mount(k8s_api_client, args.storage)
-  gcs_fuse_storages = list(
-      filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
-  )
-  gcpfilestore_storages: list[Storage] = list(
-      filter(lambda storage: storage.type == GCP_FILESTORE_TYPE, storages)
-  )
-  storage_annotations = ''
   service_account = ''
-
-
-
-
-
-
-
+  all_storages = []
+  # Currently storage customization is not supported for Pathways workloads. b/408468941
+  if not args.use_pathways:
+    storages: list[Storage] = get_storages_to_mount(
+        k8s_api_client, args.storage
+    )
+    gcs_fuse_storages = list(
+        filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
+    )
+    gcpfilestore_storages: list[Storage] = list(
+        filter(lambda storage: storage.type == GCP_FILESTORE_TYPE, storages)
+    )
+    parallelstore_storages: list[Storage] = list(
+        filter(lambda storage: storage.type == PARALLELSTORE_TYPE, storages)
+    )
+    pd_storages: list[Storage] = list(
+        filter(lambda storage: storage.type == GCE_PD_TYPE, storages)
+    )
+    if len(gcs_fuse_storages) > 0:
+      service_account = XPK_SA
+      xpk_print(f'Detected gcsfuse Storages to add: {gcs_fuse_storages}')
+    else:
+      xpk_print('No gcsfuse Storages to add detected')
+
+    if len(gcpfilestore_storages) > 0:
+      service_account = XPK_SA
+      xpk_print(
+          f'Detected gcp filestores instances to add: {gcpfilestore_storages}'
+      )
+    else:
+      xpk_print('No gcp filestore instances to add detected.')
+
+    if len(parallelstore_storages) > 0:
+      service_account = XPK_SA
+      xpk_print(
+          'Detected gcp parallelstore instances to add:'
+          f' {parallelstore_storages}'
+      )
+    else:
+      xpk_print('No gcp parallelstore instances to add detected.')
+
+    if len(pd_storages) > 0:
+      service_account = XPK_SA
+      xpk_print(f'Detected gce persistent disk instances to add: {pd_storages}')
+    else:
+      xpk_print('No gce persistent disk instances to add detected.')
+
+    all_storages = (
+        gcs_fuse_storages
+        + gcpfilestore_storages
+        + parallelstore_storages
+        + pd_storages
+    )
+
+  # Currently failure policy rules are supported for Pathways workloads. b/408465881
+  failure_policy_rules = ''
+  pod_failure_policy = ''
+  if not args.use_pathways:
+    failure_policy_rules = """rules:
   - action: FailJobSet
-    onJobFailureReasons:
+      onJobFailureReasons:
     - PodFailurePolicy"""
-
-
-
+    restart_on_exit_codes = get_restart_exit_codes(args)
+    restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes))
+    pod_failure_policy = f"""
   podFailurePolicy:
     rules:
     - action: FailJob
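The pod failure policy assembled here restarts a workload only for the exit codes the user listed: because the rule uses operator NotIn over {restart_on_exit_codes}, any exit code outside the list fails the job outright, while listed codes fall through to the restart machinery. A standalone sketch of the string construction (the exit codes and container name are hypothetical; xpk derives the codes from get_restart_exit_codes(args)):

```python
# Hypothetical user-supplied codes; xpk derives these from
# get_restart_exit_codes(args).
restart_on_exit_codes = [42, 127]
codes = ','.join(map(str, restart_on_exit_codes))

# Codes in the list do not match NotIn, so those failures are retried;
# any other exit code matches the rule and fails the job immediately.
pod_failure_policy = f"""
  podFailurePolicy:
    rules:
    - action: FailJob
      onExitCodes:
        containerName: main  # hypothetical; not shown in this hunk
        operator: NotIn
        values: [{codes}]"""

print(pod_failure_policy)
```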
@@ -590,14 +437,6 @@ def workload_create(args) -> None:
         operator: NotIn
         values: [{restart_on_exit_codes}]"""
 
-  if len(gcpfilestore_storages) > 0:
-    xpk_print(
-        f'Detected gcp filestores instances to add: {gcpfilestore_storages}'
-    )
-    service_account = XPK_SA
-  else:
-    xpk_print('No gcp filestore instances to add detected.')
-  all_storages = gcs_fuse_storages + gcpfilestore_storages
   # Create the workload file based on accelerator type or workload type.
   if system.accelerator_type == AcceleratorType['GPU']:
     container, debugging_dashboard_id = get_user_workload_container(
@@ -609,6 +448,13 @@
     if return_code != 0:
       xpk_exit(return_code)
 
+    kueue_TAS_annotation = (
+        'kueue.x-k8s.io/podset-preferred-topology:'
+        ' "cloud.google.com/gce-topology-host"'
+    )
+    if not is_TAS_possible(args):
+      kueue_TAS_annotation = ''
+
     if system.device_type in cluster_gcluster.supported_device_types:
       yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
           args=args,
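kueue_TAS_annotation is built from two adjacent string literals, which Python concatenates into a single annotation line; the annotation is then blanked whenever topology-aware scheduling is not available. A self-contained sketch (the is_TAS_possible predicate below is only a stand-in for the real check imported from .common):

```python
from types import SimpleNamespace

def is_TAS_possible(args) -> bool:
  # Stand-in predicate; xpk's real is_TAS_possible in commands/common.py
  # inspects the cluster and capacity configuration.
  return getattr(args, 'reservation', None) is not None

args = SimpleNamespace(reservation='my-reservation')  # hypothetical args

# Adjacent literals concatenate into one line:
#   kueue.x-k8s.io/podset-preferred-topology: "cloud.google.com/gce-topology-host"
kueue_TAS_annotation = (
    'kueue.x-k8s.io/podset-preferred-topology:'
    ' "cloud.google.com/gce-topology-host"'
)
if not is_TAS_possible(args):
  kueue_TAS_annotation = ''

print(kueue_TAS_annotation)
```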
@@ -616,38 +462,34 @@
           service_account=XPK_SA,
           failure_policy_rules=failure_policy_rules,
           pod_failure_policy=pod_failure_policy,
+          kueue_TAS_annotation=kueue_TAS_annotation,
       )
 
+      sub_networks = get_cluster_subnetworks(args)
       if args.device_type == cluster_gcluster.a3mega_device_type:
-        sub_networks = get_subnetworks_for_a3mega(args.cluster)
         yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
-
-
-
+      elif args.device_type in [
+          cluster_gcluster.a3ultra_device_type,
+          cluster_gcluster.a4_device_type,
+      ]:
         yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
 
-      if
+      if all_storages:
         yml_string = storage_decorator.decorate_jobset(yml_string, all_storages)
     else:
       yml_string = GPU_WORKLOAD_CREATE_YAML.format(
           args=args,
           container=container,
-          command=args.command,
-          chips_per_vm=system.chips_per_vm,
          gpu_scheduler=gpu_scheduler,
-
-
-
-          gpu_tcp_volume=get_gpu_tcp_volume(system),
-          storage_volumes=get_storage_volumes_yaml_for_gpu(all_storages),
-          storage_volume_mounts=get_storage_volume_mounts_yaml_for_gpu(
-              all_storages
+          volumes=get_volumes(args, system),
+          storage_annotations=('\n' + (' ' * 12)).join(
+              get_storage_annotations(all_storages)
          ),
-          storage_annotations=storage_annotations,
          service_account=service_account,
          failure_policy_rules=failure_policy_rules,
          pod_failure_policy=pod_failure_policy,
      )
+      yml_string = add_gpu_rxdm_container(yml_string, system, all_storages)
 
   elif args.use_pathways and ensure_pathways_workload_prerequisites(
       args, system
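The decorators applied above all share one shape: each takes the JobSet manifest as a YAML string plus some extra data and returns a modified string, so tcpxo_decorator, rdma_decorator, and storage_decorator compose as a simple pipeline over yml_string. An illustrative decorator in that style (requires PyYAML; the annotation key is made up, and the real decorators inject NICs, subnetworks, and volumes rather than an annotation):

```python
import yaml  # PyYAML

def decorate_jobset(yml_string: str, sub_networks: list[str]) -> str:
  # Parse the manifest, inject data, and serialize it back to a string,
  # mirroring the string-in/string-out contract of xpk's decorators.
  manifest = yaml.safe_load(yml_string)
  annotations = manifest.setdefault('metadata', {}).setdefault('annotations', {})
  annotations['example.com/subnetworks'] = ','.join(sub_networks)  # hypothetical key
  return yaml.dump(manifest, sort_keys=False)

jobset = 'apiVersion: jobset.x-k8s.io/v1alpha2\nkind: JobSet\nmetadata:\n  name: demo\n'
print(decorate_jobset(jobset, ['net-0', 'net-1']))
```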
@@ -655,29 +497,14 @@
     yml_string = PW_WORKLOAD_CREATE_YAML.format(
         args=args,
         system=system,
-
-
-        ),
-
-
-
-
-        user_workload=get_user_workload_for_pathways(
-            args, system, pod_failure_policy, storages
-        ),
-        resource_type=AcceleratorTypeToAcceleratorCharacteristics[
-            system.accelerator_type
-        ].resource_type,
+        topology=create_tpu_topology(system.accelerator_type, system),
+        machine_type=create_tpu_machine_type(system.accelerator_type, system),
+        custom_pathways_proxy_server=append_custom_pathways_proxy_server(args),
+        custom_pathways_server=append_custom_pathways_server(args),
+        custom_pathways_worker=append_custom_pathways_worker(args),
+        colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
+        user_workload=get_user_workload_for_pathways(args, system),
         local_queue_name=LOCAL_QUEUE_NAME,
-        autoprovisioning_args=autoprovisioning_args,
-        backoff_limit=system.vms_per_slice * 4,
-        storage_annotations=storage_annotations,
-        storage_volumes=get_storage_volumes_yaml(all_storages),
-        storage_volume_mounts=get_storage_volume_mounts_yaml(all_storages),
-        pathways_rm_args=get_pathways_rm_args(args, system),
-        service_account=service_account,
-        failure_policy_rules=failure_policy_rules,
-        pod_failure_policy=pod_failure_policy,
     )
   else:
     container, debugging_dashboard_id = get_user_workload_container(
@@ -695,7 +522,9 @@
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
         volumes=get_volumes(args, system),
-        storage_annotations=
+        storage_annotations=('\n' + (' ' * 16)).join(
+            get_storage_annotations(all_storages)
+        ),
         service_account=service_account,
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
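Both call sites format the same annotation list; only the join indentation differs (12 spaces in the GPU template above, 16 here) because the {storage_annotations} placeholder sits at a different column in each YAML template. In isolation:

```python
# Hypothetical output of get_storage_annotations(all_storages); the real
# annotations depend on which storage types are attached.
annotations = [
    'gke-gcsfuse/volumes: "true"',
    'gke-gcsfuse/cpu-limit: "0"',
]

# The first line inherits the placeholder's own indentation in the template;
# each following line is re-indented to the same column (16 spaces here).
storage_annotations = ('\n' + (' ' * 16)).join(annotations)
print(storage_annotations)
```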
@@ -708,7 +537,9 @@
     xpk_print(f'Create Workload request returned ERROR {return_code}')
     xpk_exit(return_code)
 
-
+  if not args.use_pathways:
+    add_bucket_iam_members(args, storages)
+
   # Get GKE outlier dashboard for TPU
   outlier_dashboard_id = None
   if system.accelerator_type == AcceleratorType['TPU']:
@@ -833,6 +664,12 @@ def workload_delete(args) -> None:
   elif not will_delete:
     xpk_print('Skipping delete command.')
   else:
+    # If PathwaysJob exists, delete it.
+    if check_if_pathways_job_is_installed(
+        args
+    ) and try_to_delete_pathwaysjob_first(args, workloads):
+      xpk_exit(0)
+    # PathwaysJob workload does not exist, delete JobSet
     commands = []
     task_names = []
     for workload in workloads: