xpk 0.7.1-py3-none-any.whl → 0.8.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -12
- xpk/commands/cluster.py +33 -16
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +23 -20
- xpk/commands/run.py +17 -11
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +64 -19
- xpk/commands/workload.py +154 -319
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +322 -32
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +75 -5
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +11 -3
- xpk/core/gcsfuse.py +8 -5
- xpk/core/kjob.py +57 -18
- xpk/core/nap.py +4 -0
- xpk/core/network.py +11 -21
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +27 -82
- xpk/core/workload_decorators/rdma_decorator.py +3 -3
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -2
- xpk/parser/cluster.py +15 -6
- xpk/parser/storage.py +14 -3
- xpk/parser/workload.py +59 -31
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/METADATA +60 -4
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/RECORD +40 -40
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py
CHANGED
@@ -14,20 +14,27 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

-from ..core.blueprint.blueprint_generator import
+from ..core.blueprint.blueprint_generator import (
+    get_subnetworks_for_a3mega,
+    get_subnetworks_for_a3ultra,
+    get_subnetworks_for_a4,
+)
 from ..core.cluster import (
+    XPK_SA,
     create_xpk_k8s_service_account,
     get_cluster_credentials,
     setup_k8s_env,
-    XPK_SA,
 )
 from ..core.commands import run_command_with_updates, run_commands
-from ..core.config import
+from ..core.config import (
+    VERTEX_TENSORBOARD_FEATURE_FLAG,
+    XPK_CURRENT_VERSION,
+    parse_env_config,
+)
 from ..core.docker_container import (
     get_main_container_docker_image,
     get_user_workload_container,
 )
-
 from ..core.docker_resources import get_volumes
 from ..core.gcloud_context import add_zone_and_project
 from ..core.kueue import LOCAL_QUEUE_NAME
@@ -37,51 +44,53 @@ from ..core.nap import (
     is_autoprovisioning_enabled,
 )
 from ..core.pathways import (
+    append_custom_colocated_python_sidecar,
+    append_custom_pathways_proxy_server,
+    append_custom_pathways_server,
+    append_custom_pathways_worker,
+    check_if_pathways_job_is_installed,
     ensure_pathways_workload_prerequisites,
-    get_pathways_proxy_args,
-    get_pathways_rm_args,
-    get_pathways_sidecar_container,
     get_pathways_unified_query_link,
-    get_pathways_worker_args,
     get_user_workload_for_pathways,
+    try_to_delete_pathwaysjob_first,
 )
 from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
 from ..core.scheduling import (
     check_if_workload_can_schedule,
     create_accelerator_label,
     create_machine_label,
+    create_tpu_machine_type,
+    create_tpu_topology,
     get_cpu_affinity,
     get_gpu_scheduler,
 )
 from ..core.storage import (
-
+    GCE_PD_TYPE,
     GCP_FILESTORE_TYPE,
+    GCS_FUSE_TYPE,
+    PARALLELSTORE_TYPE,
     Storage,
     add_bucket_iam_members,
-
-    get_storage_volumes_yaml,
+    get_storage_annotations,
     get_storages_to_mount,
-    get_storage_volume_mounts_yaml_for_gpu,
-    get_storage_volumes_yaml_for_gpu,
-    GCS_FUSE_ANNOTATION,
 )
 from ..core.system_characteristics import (
     AcceleratorType,
-    AcceleratorTypeToAcceleratorCharacteristics,
     get_system_characteristics,
 )
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
+    add_gpu_rxdm_container,
     check_if_workload_exists,
-    get_gpu_rxdm_cmd,
-    get_gpu_rxdm_image,
-    get_gpu_tcp_volume,
-    get_gpu_volume,
     get_workload_list,
     wait_for_job_completion,
     zone_to_region,
 )
-from ..core.workload_decorators import
+from ..core.workload_decorators import (
+    rdma_decorator,
+    storage_decorator,
+    tcpxo_decorator,
+)
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from . import cluster_gcluster
@@ -139,7 +148,8 @@ GPU_WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
 metadata:
   name: {args.workload}
-  annotations:
+  annotations:
+    {storage_annotations}
   labels:
     kueue.x-k8s.io/queue-name: multislice-queue # Name of the LocalQueue
     xpk.google.com/workload: {args.workload}
@@ -176,29 +186,8 @@ spec:
             - operator: "Exists"
               key: nvidia.com/gpu
             volumes:
-            {
-            {storage_volumes}
+            {volumes}
             containers:
-            {gpu_rxdm_image}
-              imagePullPolicy: Always
-              command:
-              - "bash"
-              - "-c"
-              - |
-                {gpu_rxdm_cmd} &
-                while [ ! -e "/usr/share/workload/workload_terminated" ]; do sleep 10; echo "sleeping"; done
-              securityContext:
-                privileged: true
-              volumeMounts:
-              {gpu_tcp_volume}
-              {storage_volume_mounts}
-              - name: nvidia-install-dir-host
-                mountPath: /usr/local/nvidia/lib64
-              - name: workload-terminated-volume
-                mountPath: /usr/share/workload
-              env:
-              - name: LD_LIBRARY_PATH
-                value: /usr/local/nvidia/lib64
             {container}
 """

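The hunk above drops the hard-coded RxDM sidecar from GPU_WORKLOAD_CREATE_YAML; a later hunk shows the replacement call, `yml_string = add_gpu_rxdm_container(yml_string, system, all_storages)`. The real `add_gpu_rxdm_container` lives in xpk/core/workload.py and its body is not part of this diff, so the following is only a minimal sketch of how a sidecar can be appended to a rendered JobSet manifest; the helper name, manifest fragment, and sidecar fields are invented for illustration.

```python
# Illustrative sketch only: not xpk's actual implementation.
import yaml


def append_sidecar_to_jobset(yml_string: str, sidecar: dict) -> str:
  """Parses a JobSet manifest and appends a sidecar to its pod template."""
  manifest = yaml.safe_load(yml_string)
  pod_spec = manifest['spec']['replicatedJobs'][0]['template']['spec'][
      'template'
  ]['spec']
  pod_spec.setdefault('containers', []).append(sidecar)
  return yaml.dump(manifest, sort_keys=False)


jobset_yaml = """
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
spec:
  replicatedJobs:
  - name: slice-job
    template:
      spec:
        template:
          spec:
            containers:
            - name: main
              image: example-image
"""
rxdm_sidecar = {'name': 'tcpd-daemon', 'image': 'example-rxdm-image'}
print(append_sidecar_to_jobset(jobset_yaml, rxdm_sidecar))
```

Injecting the sidecar after formatting keeps the YAML template free of GPU-networking details, which matches the decorator-style post-processing (rdma_decorator, storage_decorator, tcpxo_decorator) used elsewhere in this diff.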
@@ -241,219 +230,37 @@ spec:
           containers:
           {container}
 """
-
-PW_WORKLOAD_CREATE_YAML = """
-… (old lines 246-274 not rendered)
-        metadata:
-          annotations:
-            {storage_annotations}
-        spec:
-          terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
-          serviceAccountName: {service_account}
-          containers:
-          - args:
-            {pathways_worker_args}
-            image: {args.server_image}
-            imagePullPolicy: Always
-            name: pathways-worker
-            ports:
-            - containerPort: 29001
-            - containerPort: 8471
-            - containerPort: 8080
-            resources:
-              limits:
-                {resource_type}: {system.chips_per_vm}
-            securityContext:
-              privileged: true
-            volumeMounts:
-            - mountPath: /tmp
-              name: shared-tmp
-            {storage_volume_mounts}
-            env:
-            - name: PROJECT_ID
-              value: {args.project}
-            - name: LOCATION
-              value: {args.zone}
-            - name: CLUSTER_NAME
-              value: {args.cluster}
-            - name: POD_NAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: CONTAINER_NAME
-              value: "pathways-worker"
-            - name: NAMESPACE
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.namespace
-            # Workaround for v6e
-            - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER
-              value: "false"
-            - name: MEGASCALE_NUM_SLICES
-              valueFrom:
-                fieldRef:
-                  fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']"
-            - name: JOBSET_NAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
-            - name: REPLICATED_JOB_NAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
-            - name: MEGASCALE_SLICE_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']"
-            - name: MEGASCALE_COORDINATOR_ADDRESS
-              value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)"
-          {pathways_sidecar_container}
-          nodeSelector:
-            {accelerator_label}
-            {machine_label}
-            {autoprovisioning_args}
-          priorityClassName: {args.priority}
-          hostNetwork: true
-          dnsPolicy: ClusterFirstWithHostNet
-          volumes:
-          - hostPath:
-              path: /tmp
-              type: DirectoryOrCreate
-            name: shared-tmp
-          {storage_volumes}
-  - name: rm
-    replicas: 1
-    template:
-      metadata:
-        labels:
-          xpk.google.com/workload: {args.workload}
-      spec:
-        backoffLimit: 0
-        completions: 1
-        parallelism: 1
-        template:
-          spec:
-            containers:
-            - args:
-              {pathways_rm_args}
-              env:
-              - name: PROJECT_ID
-                value: {args.project}
-              - name: LOCATION
-                value: {args.zone}
-              - name: CLUSTER_NAME
-                value: {args.cluster}
-              - name: POD_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.name
-              - name: CONTAINER_NAME
-                value: "pathways-rm"
-              - name: NAMESPACE
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.namespace
-              - name: REPLICATED_JOB_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
-              - name: JOBSET_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
-              - name: HOST_ADDRESS
-                value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
-              - name: TPU_SKIP_MDS_QUERY
-                value: "true"
-              image: {args.server_image}
-              imagePullPolicy: Always
-              name: pathways-rm
-              ports:
-              - containerPort: 29001
-              securityContext:
-                privileged: true
-              volumeMounts:
-              - mountPath: /tmp
-                name: shared-tmp
-            nodeSelector:
-              cloud.google.com/gke-nodepool: cpu-rm-np
-            hostNetwork: true
-            dnsPolicy: ClusterFirstWithHostNet
-            volumes:
-            - hostPath:
-                path: /tmp
-                type: DirectoryOrCreate
-              name: shared-tmp
-  - name: proxy
-    replicas: 1
-    template:
-      metadata:
-        labels:
-          xpk.google.com/workload: {args.workload}
-      spec:
-        backoffLimit: 0
-        completions: 1
-        parallelism: 1
-        template:
-          spec:
-            containers:
-            - args:
-              {pathways_proxy_args}
-              env:
-              - name: PROJECT_ID
-                value: {args.project}
-              - name: LOCATION
-                value: {args.zone}
-              - name: CLUSTER_NAME
-                value: {args.cluster}
-              - name: POD_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.name
-              - name: CONTAINER_NAME
-                value: "pathways-proxy"
-              - name: NAMESPACE
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.namespace
-              image: {args.proxy_server_image}
-              imagePullPolicy: Always
-              name: pathways-proxy
-              ports:
-              - containerPort: 29000
-            hostNetwork: true
-            dnsPolicy: ClusterFirstWithHostNet
-            nodeSelector:
-              cloud.google.com/gke-nodepool: cpu-proxy-np
-  {user_workload}
+# The indentation of PW_WORKLOAD_CREATE_YAML is intentional to allow reusing the user workload container YAML.
+PW_WORKLOAD_CREATE_YAML = """
+apiVersion: pathways-job.pathways.domain/v1
+kind: PathwaysJob
+metadata:
+  name: {args.workload}
+  labels:
+    kueue.x-k8s.io/queue-name: {local_queue_name} # Name of the LocalQueue
+    xpk.google.com/workload: {args.workload}
+spec:
+  maxRestarts: {args.max_restarts}
+  customComponents:
+  {custom_pathways_proxy_server}
+  {custom_pathways_server}
+  {custom_pathways_worker}
+  {colocated_python_sidecar}
+  workers:
+  - type: {machine_type}
+    topology: {topology}
+    numSlices: {args.num_slices}
+    maxSliceRestarts: {args.max_slice_restarts}
+  terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+  priorityClassName: {args.priority}
+  pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
+  controller:
+    # #Pod template for training, default mode.
+    deploymentMode: default
+    mainContainerName: {args.docker_name}
+    elasticSlices: {args.elastic_slices}
+    template:
+{user_workload}
 """

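The new PW_WORKLOAD_CREATE_YAML is rendered with `str.format`, which resolves attribute lookups such as `{args.workload}` when an object is passed as the `args` keyword. A minimal rendering sketch follows; the template fragment and all values are invented for illustration, while the real manifest is built from the full template above using the CLI arguments.

```python
# Minimal sketch of how placeholders in the PathwaysJob template are filled.
from types import SimpleNamespace

TEMPLATE = """apiVersion: pathways-job.pathways.domain/v1
kind: PathwaysJob
metadata:
  name: {args.workload}
spec:
  maxRestarts: {args.max_restarts}
  workers:
  - type: {machine_type}
    topology: {topology}
    numSlices: {args.num_slices}
"""

# Invented example values; xpk derives machine_type and topology from
# create_tpu_machine_type and create_tpu_topology (see a later hunk).
args = SimpleNamespace(workload='demo-workload', max_restarts=0, num_slices=2)
print(TEMPLATE.format(args=args, machine_type='tpu-v5p', topology='2x2x2'))
```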
@@ -545,7 +352,6 @@ def workload_create(args) -> None:

   parse_env_config(args, tensorboard_config, system)

-  # Currently autoprovisioning is not enabled for Pathways workloads.
   autoprovisioning_args = ''
   autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
       args, system
@@ -560,28 +366,72 @@ def workload_create(args) -> None:
   if return_code != 0:
     xpk_exit(return_code)

-  storages: list[Storage] = get_storages_to_mount(k8s_api_client, args.storage)
-  gcs_fuse_storages = list(
-      filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
-  )
-  gcpfilestore_storages: list[Storage] = list(
-      filter(lambda storage: storage.type == GCP_FILESTORE_TYPE, storages)
-  )
-  storage_annotations = ''
   service_account = ''
-… (old lines 572-578 not rendered)
+  all_storages = []
+  # Currently storage customization is not supported for Pathways workloads. b/408468941
+  if not args.use_pathways:
+    storages: list[Storage] = get_storages_to_mount(
+        k8s_api_client, args.storage
+    )
+    gcs_fuse_storages = list(
+        filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
+    )
+    gcpfilestore_storages: list[Storage] = list(
+        filter(lambda storage: storage.type == GCP_FILESTORE_TYPE, storages)
+    )
+    parallelstore_storages: list[Storage] = list(
+        filter(lambda storage: storage.type == PARALLELSTORE_TYPE, storages)
+    )
+    pd_storages: list[Storage] = list(
+        filter(lambda storage: storage.type == GCE_PD_TYPE, storages)
+    )
+    if len(gcs_fuse_storages) > 0:
+      service_account = XPK_SA
+      xpk_print(f'Detected gcsfuse Storages to add: {gcs_fuse_storages}')
+    else:
+      xpk_print('No gcsfuse Storages to add detected')
+
+    if len(gcpfilestore_storages) > 0:
+      service_account = XPK_SA
+      xpk_print(
+          f'Detected gcp filestores instances to add: {gcpfilestore_storages}'
+      )
+    else:
+      xpk_print('No gcp filestore instances to add detected.')
+
+    if len(parallelstore_storages) > 0:
+      service_account = XPK_SA
+      xpk_print(
+          'Detected gcp parallelstore instances to add:'
+          f' {parallelstore_storages}'
+      )
+    else:
+      xpk_print('No gcp filestore instances to add detected.')
+
+    if len(pd_storages) > 0:
+      service_account = XPK_SA
+      xpk_print(f'Detected gce persistent disk instances to add: {pd_storages}')
+    else:
+      xpk_print('No gce persistent disk instances to add detected.')
+
+    all_storages = (
+        gcs_fuse_storages
+        + gcpfilestore_storages
+        + parallelstore_storages
+        + pd_storages
+    )
+
+  # Currently failure policy rules are supported for Pathways workloads. b/408465881
+  failure_policy_rules = ''
+  pod_failure_policy = ''
+  if not args.use_pathways:
+    failure_policy_rules = """rules:
                 - action: FailJobSet
-                  onJobFailureReasons:
+                  onJobFailureReasons:
                   - PodFailurePolicy"""
-
-
-
+    restart_on_exit_codes = get_restart_exit_codes(args)
+    restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes))
+    pod_failure_policy = f"""
  podFailurePolicy:
    rules:
    - action: FailJob
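The hunk above filters the mounted storages into four groups (GCS FUSE, Filestore, Parallelstore, persistent disk) and switches the pod to the XPK service account whenever any group is non-empty. The sketch below restates that pattern with a stand-in Storage type; the real Storage class and type constants come from xpk/core/storage.py, and the string values used here are placeholders.

```python
# Stand-in sketch of the storage grouping logic, not xpk's actual code.
from dataclasses import dataclass

GCS_FUSE_TYPE = 'gcsfuse'
GCP_FILESTORE_TYPE = 'gcpfilestore'
PARALLELSTORE_TYPE = 'parallelstore'
GCE_PD_TYPE = 'gcepd'
STORAGE_TYPES = (GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE)


@dataclass
class Storage:
  name: str
  type: str


def collect_storages(storages: list[Storage]) -> tuple[list[Storage], bool]:
  """Returns the storages to mount and whether the XPK service account is
  needed (any supported storage type requires it)."""
  selected = [s for s in storages if s.type in STORAGE_TYPES]
  return selected, bool(selected)


mounted, needs_xpk_sa = collect_storages(
    [Storage('training-data', GCS_FUSE_TYPE), Storage('ckpts', GCP_FILESTORE_TYPE)]
)
print([s.name for s in mounted], needs_xpk_sa)
```

Note that the whole block is skipped for Pathways workloads, so `all_storages` stays empty and the default service account is kept in that case.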
@@ -590,14 +440,6 @@ def workload_create(args) -> None:
         operator: NotIn
         values: [{restart_on_exit_codes}]"""

-  if len(gcpfilestore_storages) > 0:
-    xpk_print(
-        f'Detected gcp filestores instances to add: {gcpfilestore_storages}'
-    )
-    service_account = XPK_SA
-  else:
-    xpk_print('No gcp filestore instances to add detected.')
-  all_storages = gcs_fuse_storages + gcpfilestore_storages
   # Create the workload file based on accelerator type or workload type.
   if system.accelerator_type == AcceleratorType['GPU']:
     container, debugging_dashboard_id = get_user_workload_container(
@@ -626,28 +468,26 @@ def workload_create(args) -> None:
|
|
|
626
468
|
sub_networks = get_subnetworks_for_a3ultra(args.cluster)
|
|
627
469
|
yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
|
|
628
470
|
|
|
629
|
-
if
|
|
471
|
+
if args.device_type == cluster_gcluster.a4_device_type:
|
|
472
|
+
sub_networks = get_subnetworks_for_a4()
|
|
473
|
+
yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
|
|
474
|
+
|
|
475
|
+
if all_storages:
|
|
630
476
|
yml_string = storage_decorator.decorate_jobset(yml_string, all_storages)
|
|
631
477
|
else:
|
|
632
478
|
yml_string = GPU_WORKLOAD_CREATE_YAML.format(
|
|
633
479
|
args=args,
|
|
634
480
|
container=container,
|
|
635
|
-
command=args.command,
|
|
636
|
-
chips_per_vm=system.chips_per_vm,
|
|
637
481
|
gpu_scheduler=gpu_scheduler,
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
gpu_tcp_volume=get_gpu_tcp_volume(system),
|
|
642
|
-
storage_volumes=get_storage_volumes_yaml_for_gpu(all_storages),
|
|
643
|
-
storage_volume_mounts=get_storage_volume_mounts_yaml_for_gpu(
|
|
644
|
-
all_storages
|
|
482
|
+
volumes=get_volumes(args, system),
|
|
483
|
+
storage_annotations=('\n' + (' ' * 12)).join(
|
|
484
|
+
get_storage_annotations(all_storages)
|
|
645
485
|
),
|
|
646
|
-
storage_annotations=storage_annotations,
|
|
647
486
|
service_account=service_account,
|
|
648
487
|
failure_policy_rules=failure_policy_rules,
|
|
649
488
|
pod_failure_policy=pod_failure_policy,
|
|
650
489
|
)
|
|
490
|
+
yml_string = add_gpu_rxdm_container(yml_string, system, all_storages)
|
|
651
491
|
|
|
652
492
|
elif args.use_pathways and ensure_pathways_workload_prerequisites(
|
|
653
493
|
args, system
|
|
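The `('\n' + (' ' * 12)).join(...)` expression above re-indents the storage annotations: every annotation after the first is prefixed with 12 spaces so the joined block stays aligned with the `{storage_annotations}` placeholder in the GPU template (the TPU template hunk further down uses the same trick with 16 spaces). A short sketch, with placeholder annotation lines rather than the real output of `get_storage_annotations`:

```python
# Demonstrates the newline-plus-indent join used for storage_annotations.
annotations = [
    'example.storage/annotation-one: "true"',
    'example.storage/annotation-two: "0"',
]
storage_annotations = ('\n' + (' ' * 12)).join(annotations)
print(storage_annotations)
# example.storage/annotation-one: "true"
#             example.storage/annotation-two: "0"
```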
@@ -655,29 +495,14 @@ def workload_create(args) -> None:
     yml_string = PW_WORKLOAD_CREATE_YAML.format(
         args=args,
         system=system,
-… (old lines 658-659 not rendered)
-        ),
-… (old lines 661-664 not rendered)
-        user_workload=get_user_workload_for_pathways(
-            args, system, pod_failure_policy, storages
-        ),
-        resource_type=AcceleratorTypeToAcceleratorCharacteristics[
-            system.accelerator_type
-        ].resource_type,
+        topology=create_tpu_topology(system.accelerator_type, system),
+        machine_type=create_tpu_machine_type(system.accelerator_type, system),
+        custom_pathways_proxy_server=append_custom_pathways_proxy_server(args),
+        custom_pathways_server=append_custom_pathways_server(args),
+        custom_pathways_worker=append_custom_pathways_worker(args),
+        colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
+        user_workload=get_user_workload_for_pathways(args, system),
         local_queue_name=LOCAL_QUEUE_NAME,
-        autoprovisioning_args=autoprovisioning_args,
-        backoff_limit=system.vms_per_slice * 4,
-        storage_annotations=storage_annotations,
-        storage_volumes=get_storage_volumes_yaml(all_storages),
-        storage_volume_mounts=get_storage_volume_mounts_yaml(all_storages),
-        pathways_rm_args=get_pathways_rm_args(args, system),
-        service_account=service_account,
-        failure_policy_rules=failure_policy_rules,
-        pod_failure_policy=pod_failure_policy,
     )
   else:
     container, debugging_dashboard_id = get_user_workload_container(
@@ -695,7 +520,9 @@ def workload_create(args) -> None:
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
         volumes=get_volumes(args, system),
-        storage_annotations=
+        storage_annotations=('\n' + (' ' * 16)).join(
+            get_storage_annotations(all_storages)
+        ),
         service_account=service_account,
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
@@ -708,7 +535,9 @@ def workload_create(args) -> None:
     xpk_print(f'Create Workload request returned ERROR {return_code}')
     xpk_exit(return_code)

-… (old line 711 not rendered)
+  if not args.use_pathways:
+    add_bucket_iam_members(args, storages)
+
   # Get GKE outlier dashboard for TPU
   outlier_dashboard_id = None
   if system.accelerator_type == AcceleratorType['TPU']:
@@ -833,6 +662,12 @@ def workload_delete(args) -> None:
   elif not will_delete:
     xpk_print('Skipping delete command.')
   else:
+    # If PathwaysJob exists, delete it.
+    if check_if_pathways_job_is_installed(
+        args
+    ) and try_to_delete_pathwaysjob_first(args, workloads):
+      xpk_exit(0)
+    # PathwaysJob workload does not exist, delete JobSet
     commands = []
     task_names = []
     for workload in workloads: