xpk 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. xpk/commands/batch.py +19 -12
  2. xpk/commands/cluster.py +33 -16
  3. xpk/commands/cluster_gcluster.py +22 -5
  4. xpk/commands/info.py +2 -4
  5. xpk/commands/job.py +7 -8
  6. xpk/commands/kjob_common.py +23 -20
  7. xpk/commands/run.py +17 -11
  8. xpk/commands/shell.py +3 -4
  9. xpk/commands/storage.py +64 -19
  10. xpk/commands/workload.py +154 -319
  11. xpk/core/blueprint/blueprint_definitions.py +2 -0
  12. xpk/core/blueprint/blueprint_generator.py +322 -32
  13. xpk/core/capacity.py +1 -0
  14. xpk/core/cluster.py +75 -5
  15. xpk/core/config.py +3 -1
  16. xpk/core/docker_manager.py +1 -1
  17. xpk/core/docker_resources.py +9 -21
  18. xpk/core/filestore.py +11 -3
  19. xpk/core/gcsfuse.py +8 -5
  20. xpk/core/kjob.py +57 -18
  21. xpk/core/nap.py +4 -0
  22. xpk/core/network.py +11 -21
  23. xpk/core/nodepool.py +28 -26
  24. xpk/core/pathways.py +165 -210
  25. xpk/core/scheduling.py +36 -0
  26. xpk/core/storage.py +66 -12
  27. xpk/core/system_characteristics.py +9 -0
  28. xpk/core/workload.py +27 -82
  29. xpk/core/workload_decorators/rdma_decorator.py +3 -3
  30. xpk/core/workload_decorators/storage_decorator.py +8 -3
  31. xpk/core/workload_decorators/tcpxo_decorator.py +2 -2
  32. xpk/parser/cluster.py +15 -6
  33. xpk/parser/storage.py +14 -3
  34. xpk/parser/workload.py +59 -31
  35. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/METADATA +60 -4
  36. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/RECORD +40 -40
  37. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/WHEEL +1 -1
  38. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/entry_points.txt +0 -0
  39. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/licenses/LICENSE +0 -0
  40. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py CHANGED
@@ -14,20 +14,27 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

- from ..core.blueprint.blueprint_generator import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
+ from ..core.blueprint.blueprint_generator import (
+ get_subnetworks_for_a3mega,
+ get_subnetworks_for_a3ultra,
+ get_subnetworks_for_a4,
+ )
  from ..core.cluster import (
+ XPK_SA,
  create_xpk_k8s_service_account,
  get_cluster_credentials,
  setup_k8s_env,
- XPK_SA,
  )
  from ..core.commands import run_command_with_updates, run_commands
- from ..core.config import VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION, parse_env_config
+ from ..core.config import (
+ VERTEX_TENSORBOARD_FEATURE_FLAG,
+ XPK_CURRENT_VERSION,
+ parse_env_config,
+ )
  from ..core.docker_container import (
  get_main_container_docker_image,
  get_user_workload_container,
  )
-
  from ..core.docker_resources import get_volumes
  from ..core.gcloud_context import add_zone_and_project
  from ..core.kueue import LOCAL_QUEUE_NAME
@@ -37,51 +44,53 @@ from ..core.nap import (
  is_autoprovisioning_enabled,
  )
  from ..core.pathways import (
+ append_custom_colocated_python_sidecar,
+ append_custom_pathways_proxy_server,
+ append_custom_pathways_server,
+ append_custom_pathways_worker,
+ check_if_pathways_job_is_installed,
  ensure_pathways_workload_prerequisites,
- get_pathways_proxy_args,
- get_pathways_rm_args,
- get_pathways_sidecar_container,
  get_pathways_unified_query_link,
- get_pathways_worker_args,
  get_user_workload_for_pathways,
+ try_to_delete_pathwaysjob_first,
  )
  from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
  from ..core.scheduling import (
  check_if_workload_can_schedule,
  create_accelerator_label,
  create_machine_label,
+ create_tpu_machine_type,
+ create_tpu_topology,
  get_cpu_affinity,
  get_gpu_scheduler,
  )
  from ..core.storage import (
- GCS_FUSE_TYPE,
+ GCE_PD_TYPE,
  GCP_FILESTORE_TYPE,
+ GCS_FUSE_TYPE,
+ PARALLELSTORE_TYPE,
  Storage,
  add_bucket_iam_members,
- get_storage_volume_mounts_yaml,
- get_storage_volumes_yaml,
+ get_storage_annotations,
  get_storages_to_mount,
- get_storage_volume_mounts_yaml_for_gpu,
- get_storage_volumes_yaml_for_gpu,
- GCS_FUSE_ANNOTATION,
  )
  from ..core.system_characteristics import (
  AcceleratorType,
- AcceleratorTypeToAcceleratorCharacteristics,
  get_system_characteristics,
  )
  from ..core.vertex import create_vertex_experiment
  from ..core.workload import (
+ add_gpu_rxdm_container,
  check_if_workload_exists,
- get_gpu_rxdm_cmd,
- get_gpu_rxdm_image,
- get_gpu_tcp_volume,
- get_gpu_volume,
  get_workload_list,
  wait_for_job_completion,
  zone_to_region,
  )
- from ..core.workload_decorators import rdma_decorator, tcpxo_decorator, storage_decorator
+ from ..core.workload_decorators import (
+ rdma_decorator,
+ storage_decorator,
+ tcpxo_decorator,
+ )
  from ..utils.console import get_user_input, xpk_exit, xpk_print
  from ..utils.file import write_tmp_file
  from . import cluster_gcluster
@@ -139,7 +148,8 @@ GPU_WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
  kind: JobSet
  metadata:
  name: {args.workload}
- annotations: {storage_annotations}
+ annotations:
+ {storage_annotations}
  labels:
  kueue.x-k8s.io/queue-name: multislice-queue # Name of the LocalQueue
  xpk.google.com/workload: {args.workload}
@@ -176,29 +186,8 @@ spec:
  - operator: "Exists"
  key: nvidia.com/gpu
  volumes:
- {gpu_volume}
- {storage_volumes}
+ {volumes}
  containers:
- {gpu_rxdm_image}
- imagePullPolicy: Always
- command:
- - "bash"
- - "-c"
- - |
- {gpu_rxdm_cmd} &
- while [ ! -e "/usr/share/workload/workload_terminated" ]; do sleep 10; echo "sleeping"; done
- securityContext:
- privileged: true
- volumeMounts:
- {gpu_tcp_volume}
- {storage_volume_mounts}
- - name: nvidia-install-dir-host
- mountPath: /usr/local/nvidia/lib64
- - name: workload-terminated-volume
- mountPath: /usr/share/workload
- env:
- - name: LD_LIBRARY_PATH
- value: /usr/local/nvidia/lib64
  {container}
  """

@@ -241,219 +230,37 @@ spec:
  containers:
  {container}
  """
-
- PW_WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
- kind: JobSet
- metadata:
- name: {args.workload}
- labels:
- kueue.x-k8s.io/queue-name: {local_queue_name} # Name of the LocalQueue
- xpk.google.com/workload: {args.workload}
- spec:
- ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
- failurePolicy:
- {failure_policy_rules}
- maxRestarts: {args.max_restarts}
- successPolicy:
- operator: "All"
- targetReplicatedJobs:
- - {args.targetReplicatedJob}
- replicatedJobs:
- - name: worker
- replicas: {args.num_slices}
- template:
- metadata:
- annotations:
- alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
- labels:
- xpk.google.com/workload: {args.workload}
- spec:
- backoffLimit: {backoff_limit}
- completions: {system.vms_per_slice}
- parallelism: {system.vms_per_slice}
- template:
- metadata:
- annotations:
- {storage_annotations}
- spec:
- terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
- serviceAccountName: {service_account}
- containers:
- - args:
- {pathways_worker_args}
- image: {args.server_image}
- imagePullPolicy: Always
- name: pathways-worker
- ports:
- - containerPort: 29001
- - containerPort: 8471
- - containerPort: 8080
- resources:
- limits:
- {resource_type}: {system.chips_per_vm}
- securityContext:
- privileged: true
- volumeMounts:
- - mountPath: /tmp
- name: shared-tmp
- {storage_volume_mounts}
- env:
- - name: PROJECT_ID
- value: {args.project}
- - name: LOCATION
- value: {args.zone}
- - name: CLUSTER_NAME
- value: {args.cluster}
- - name: POD_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.name
- - name: CONTAINER_NAME
- value: "pathways-worker"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: metadata.namespace
- # Workaround for v6e
- - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER
- value: "false"
- - name: MEGASCALE_NUM_SLICES
- valueFrom:
- fieldRef:
- fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']"
- - name: JOBSET_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- - name: REPLICATED_JOB_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- - name: MEGASCALE_SLICE_ID
- valueFrom:
- fieldRef:
- fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']"
- - name: MEGASCALE_COORDINATOR_ADDRESS
- value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)"
- {pathways_sidecar_container}
- nodeSelector:
- {accelerator_label}
- {machine_label}
- {autoprovisioning_args}
- priorityClassName: {args.priority}
- hostNetwork: true
- dnsPolicy: ClusterFirstWithHostNet
- volumes:
- - hostPath:
- path: /tmp
- type: DirectoryOrCreate
- name: shared-tmp
- {storage_volumes}
- - name: rm
- replicas: 1
- template:
- metadata:
- labels:
- xpk.google.com/workload: {args.workload}
- spec:
- backoffLimit: 0
- completions: 1
- parallelism: 1
- template:
- spec:
- containers:
- - args:
- {pathways_rm_args}
- env:
- - name: PROJECT_ID
- value: {args.project}
- - name: LOCATION
- value: {args.zone}
- - name: CLUSTER_NAME
- value: {args.cluster}
- - name: POD_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.name
- - name: CONTAINER_NAME
- value: "pathways-rm"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: metadata.namespace
- - name: REPLICATED_JOB_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- - name: JOBSET_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- - name: HOST_ADDRESS
- value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
- - name: TPU_SKIP_MDS_QUERY
- value: "true"
- image: {args.server_image}
- imagePullPolicy: Always
- name: pathways-rm
- ports:
- - containerPort: 29001
- securityContext:
- privileged: true
- volumeMounts:
- - mountPath: /tmp
- name: shared-tmp
- nodeSelector:
- cloud.google.com/gke-nodepool: cpu-rm-np
- hostNetwork: true
- dnsPolicy: ClusterFirstWithHostNet
- volumes:
- - hostPath:
- path: /tmp
- type: DirectoryOrCreate
- name: shared-tmp
- - name: proxy
- replicas: 1
- template:
- metadata:
- labels:
- xpk.google.com/workload: {args.workload}
- spec:
- backoffLimit: 0
- completions: 1
- parallelism: 1
- template:
- spec:
- containers:
- - args:
- {pathways_proxy_args}
- env:
- - name: PROJECT_ID
- value: {args.project}
- - name: LOCATION
- value: {args.zone}
- - name: CLUSTER_NAME
- value: {args.cluster}
- - name: POD_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.name
- - name: CONTAINER_NAME
- value: "pathways-proxy"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: metadata.namespace
- image: {args.proxy_server_image}
- imagePullPolicy: Always
- name: pathways-proxy
- ports:
- - containerPort: 29000
- hostNetwork: true
- dnsPolicy: ClusterFirstWithHostNet
- nodeSelector:
- cloud.google.com/gke-nodepool: cpu-proxy-np
- {user_workload}
+ # The indentation of PW_WORKLOAD_CREATE_YAML is intentional to allow reusing the user workload container YAML.
+ PW_WORKLOAD_CREATE_YAML = """
+ apiVersion: pathways-job.pathways.domain/v1
+ kind: PathwaysJob
+ metadata:
+ name: {args.workload}
+ labels:
+ kueue.x-k8s.io/queue-name: {local_queue_name} # Name of the LocalQueue
+ xpk.google.com/workload: {args.workload}
+ spec:
+ maxRestarts: {args.max_restarts}
+ customComponents:
+ {custom_pathways_proxy_server}
+ {custom_pathways_server}
+ {custom_pathways_worker}
+ {colocated_python_sidecar}
+ workers:
+ - type: {machine_type}
+ topology: {topology}
+ numSlices: {args.num_slices}
+ maxSliceRestarts: {args.max_slice_restarts}
+ terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+ priorityClassName: {args.priority}
+ pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
+ controller:
+ # #Pod template for training, default mode.
+ deploymentMode: default
+ mainContainerName: {args.docker_name}
+ elasticSlices: {args.elastic_slices}
+ template:
+ {user_workload}
  """

@@ -545,7 +352,6 @@ def workload_create(args) -> None:

  parse_env_config(args, tensorboard_config, system)

- # Currently autoprovisioning is not enabled for Pathways workloads.
  autoprovisioning_args = ''
  autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
  args, system
@@ -560,28 +366,72 @@ def workload_create(args) -> None:
  if return_code != 0:
  xpk_exit(return_code)

- storages: list[Storage] = get_storages_to_mount(k8s_api_client, args.storage)
- gcs_fuse_storages = list(
- filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
- )
- gcpfilestore_storages: list[Storage] = list(
- filter(lambda storage: storage.type == GCP_FILESTORE_TYPE, storages)
- )
- storage_annotations = ''
  service_account = ''
- if len(gcs_fuse_storages) > 0:
- storage_annotations = GCS_FUSE_ANNOTATION
- service_account = XPK_SA
- xpk_print(f'Detected gcsfuse Storages to add: {gcs_fuse_storages}')
- else:
- xpk_print('No gcsfuse Storages to add detected')
- failure_policy_rules = """rules:
+ all_storages = []
+ # Currently storage customization is not supported for Pathways workloads. b/408468941
+ if not args.use_pathways:
+ storages: list[Storage] = get_storages_to_mount(
+ k8s_api_client, args.storage
+ )
+ gcs_fuse_storages = list(
+ filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
+ )
+ gcpfilestore_storages: list[Storage] = list(
+ filter(lambda storage: storage.type == GCP_FILESTORE_TYPE, storages)
+ )
+ parallelstore_storages: list[Storage] = list(
+ filter(lambda storage: storage.type == PARALLELSTORE_TYPE, storages)
+ )
+ pd_storages: list[Storage] = list(
+ filter(lambda storage: storage.type == GCE_PD_TYPE, storages)
+ )
+ if len(gcs_fuse_storages) > 0:
+ service_account = XPK_SA
+ xpk_print(f'Detected gcsfuse Storages to add: {gcs_fuse_storages}')
+ else:
+ xpk_print('No gcsfuse Storages to add detected')
+
+ if len(gcpfilestore_storages) > 0:
+ service_account = XPK_SA
+ xpk_print(
+ f'Detected gcp filestores instances to add: {gcpfilestore_storages}'
+ )
+ else:
+ xpk_print('No gcp filestore instances to add detected.')
+
+ if len(parallelstore_storages) > 0:
+ service_account = XPK_SA
+ xpk_print(
+ 'Detected gcp parallelstore instances to add:'
+ f' {parallelstore_storages}'
+ )
+ else:
+ xpk_print('No gcp filestore instances to add detected.')
+
+ if len(pd_storages) > 0:
+ service_account = XPK_SA
+ xpk_print(f'Detected gce persistent disk instances to add: {pd_storages}')
+ else:
+ xpk_print('No gce persistent disk instances to add detected.')
+
+ all_storages = (
+ gcs_fuse_storages
+ + gcpfilestore_storages
+ + parallelstore_storages
+ + pd_storages
+ )
+
+ # Currently failure policy rules are supported for Pathways workloads. b/408465881
+ failure_policy_rules = ''
+ pod_failure_policy = ''
+ if not args.use_pathways:
+ failure_policy_rules = """rules:
  - action: FailJobSet
- onJobFailureReasons:
+ onJobFailureReasons:
  - PodFailurePolicy"""
- restart_on_exit_codes = get_restart_exit_codes(args)
- restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes))
- pod_failure_policy = f"""
+ restart_on_exit_codes = get_restart_exit_codes(args)
+ restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes))
+ pod_failure_policy = f"""
  podFailurePolicy:
  rules:
  - action: FailJob
@@ -590,14 +440,6 @@ def workload_create(args) -> None:
  operator: NotIn
  values: [{restart_on_exit_codes}]"""

- if len(gcpfilestore_storages) > 0:
- xpk_print(
- f'Detected gcp filestores instances to add: {gcpfilestore_storages}'
- )
- service_account = XPK_SA
- else:
- xpk_print('No gcp filestore instances to add detected.')
- all_storages = gcs_fuse_storages + gcpfilestore_storages
  # Create the workload file based on accelerator type or workload type.
  if system.accelerator_type == AcceleratorType['GPU']:
  container, debugging_dashboard_id = get_user_workload_container(
@@ -626,28 +468,26 @@ def workload_create(args) -> None:
  sub_networks = get_subnetworks_for_a3ultra(args.cluster)
  yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)

- if len(gcs_fuse_storages) + len(gcpfilestore_storages) > 0:
+ if args.device_type == cluster_gcluster.a4_device_type:
+ sub_networks = get_subnetworks_for_a4()
+ yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
+
+ if all_storages:
  yml_string = storage_decorator.decorate_jobset(yml_string, all_storages)
  else:
  yml_string = GPU_WORKLOAD_CREATE_YAML.format(
  args=args,
  container=container,
- command=args.command,
- chips_per_vm=system.chips_per_vm,
  gpu_scheduler=gpu_scheduler,
- gpu_volume=get_gpu_volume(system),
- gpu_rxdm_image=get_gpu_rxdm_image(system),
- gpu_rxdm_cmd=get_gpu_rxdm_cmd(system),
- gpu_tcp_volume=get_gpu_tcp_volume(system),
- storage_volumes=get_storage_volumes_yaml_for_gpu(all_storages),
- storage_volume_mounts=get_storage_volume_mounts_yaml_for_gpu(
- all_storages
+ volumes=get_volumes(args, system),
+ storage_annotations=('\n' + (' ' * 12)).join(
+ get_storage_annotations(all_storages)
  ),
- storage_annotations=storage_annotations,
  service_account=service_account,
  failure_policy_rules=failure_policy_rules,
  pod_failure_policy=pod_failure_policy,
  )
+ yml_string = add_gpu_rxdm_container(yml_string, system, all_storages)

  elif args.use_pathways and ensure_pathways_workload_prerequisites(
  args, system
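
Note on the storage_annotations argument in the hunk above: get_storage_annotations returns one annotation line per mounted storage, and joining with a newline plus twelve spaces re-indents every line after the first so the whole block lands under the template's annotations: key (the TPU hunk further down uses sixteen spaces because its placeholder sits deeper). A minimal runnable sketch of the same splice follows; the template shape and annotation strings are illustrative placeholders, not values taken from xpk:

# Sketch of the newline-plus-indent join used for storage_annotations above.
# The annotation values are hypothetical; in xpk they come from
# get_storage_annotations(all_storages) in xpk/core/storage.py.
TEMPLATE = """\
metadata:
  name: {name}
  annotations:
    {storage_annotations}
"""

annotations = [
    'example.com/storage-a: "true"',  # placeholder annotation
    'example.com/storage-b: "true"',  # placeholder annotation
]

# The placeholder sits four spaces deep in this sketch, so the join uses four
# spaces; the real GPU and TPU templates use 12 and 16 for the same reason.
rendered = TEMPLATE.format(
    name='demo-workload',
    storage_annotations=('\n' + ' ' * 4).join(annotations),
)
print(rendered)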
@@ -655,29 +495,14 @@ def workload_create(args) -> None:
  yml_string = PW_WORKLOAD_CREATE_YAML.format(
  args=args,
  system=system,
- accelerator_label=create_accelerator_label(
- system.accelerator_type, system
- ),
- machine_label=create_machine_label(system.accelerator_type, system),
- pathways_worker_args=get_pathways_worker_args(args),
- pathways_proxy_args=get_pathways_proxy_args(args),
- pathways_sidecar_container=get_pathways_sidecar_container(args),
- user_workload=get_user_workload_for_pathways(
- args, system, pod_failure_policy, storages
- ),
- resource_type=AcceleratorTypeToAcceleratorCharacteristics[
- system.accelerator_type
- ].resource_type,
+ topology=create_tpu_topology(system.accelerator_type, system),
+ machine_type=create_tpu_machine_type(system.accelerator_type, system),
+ custom_pathways_proxy_server=append_custom_pathways_proxy_server(args),
+ custom_pathways_server=append_custom_pathways_server(args),
+ custom_pathways_worker=append_custom_pathways_worker(args),
+ colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
+ user_workload=get_user_workload_for_pathways(args, system),
  local_queue_name=LOCAL_QUEUE_NAME,
- autoprovisioning_args=autoprovisioning_args,
- backoff_limit=system.vms_per_slice * 4,
- storage_annotations=storage_annotations,
- storage_volumes=get_storage_volumes_yaml(all_storages),
- storage_volume_mounts=get_storage_volume_mounts_yaml(all_storages),
- pathways_rm_args=get_pathways_rm_args(args, system),
- service_account=service_account,
- failure_policy_rules=failure_policy_rules,
- pod_failure_policy=pod_failure_policy,
  )
  else:
  container, debugging_dashboard_id = get_user_workload_container(
@@ -695,7 +520,9 @@ def workload_create(args) -> None:
  local_queue_name=LOCAL_QUEUE_NAME,
  autoprovisioning_args=autoprovisioning_args,
  volumes=get_volumes(args, system),
- storage_annotations=storage_annotations,
+ storage_annotations=('\n' + (' ' * 16)).join(
+ get_storage_annotations(all_storages)
+ ),
  service_account=service_account,
  failure_policy_rules=failure_policy_rules,
  pod_failure_policy=pod_failure_policy,
@@ -708,7 +535,9 @@ def workload_create(args) -> None:
  xpk_print(f'Create Workload request returned ERROR {return_code}')
  xpk_exit(return_code)

- add_bucket_iam_members(args, storages)
+ if not args.use_pathways:
+ add_bucket_iam_members(args, storages)
+
  # Get GKE outlier dashboard for TPU
  outlier_dashboard_id = None
  if system.accelerator_type == AcceleratorType['TPU']:
@@ -833,6 +662,12 @@ def workload_delete(args) -> None:
  elif not will_delete:
  xpk_print('Skipping delete command.')
  else:
+ # If PathwaysJob exists, delete it.
+ if check_if_pathways_job_is_installed(
+ args
+ ) and try_to_delete_pathwaysjob_first(args, workloads):
+ xpk_exit(0)
+ # PathwaysJob workload does not exist, delete JobSet
  commands = []
  task_names = []
  for workload in workloads:
xpk/core/blueprint/blueprint_definitions.py CHANGED
@@ -60,3 +60,5 @@ class Blueprint:
  toolkit_modules_url: str
  toolkit_modules_version: str
  vars: dict[str, str | list[str]] | None
+ terraform_providers: Optional[dict[str, Any]] = None
+ validators: Optional[list[Any]] = None
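
The two fields added to Blueprint default to None, so call sites written before 0.8.0 that never pass terraform_providers or validators keep constructing the dataclass unchanged. A minimal sketch of that pattern; the field names and types come from the hunk above, while the surrounding dataclass shape and the example values are assumptions:

from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class Blueprint:
    # Existing fields elided; see the hunk above for the real definition.
    toolkit_modules_url: str
    toolkit_modules_version: str
    vars: dict[str, str | list[str]] | None
    # New in 0.8.0: both default to None, so older call sites remain valid.
    terraform_providers: Optional[dict[str, Any]] = None
    validators: Optional[list[Any]] = None

# A pre-0.8.0 style call site still works without the new arguments.
bp = Blueprint(
    toolkit_modules_url='https://example.com/toolkit-modules',  # placeholder
    toolkit_modules_version='v0.0.0',  # placeholder
    vars={'project_id': 'example-project'},  # placeholder
)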