xpk-0.7.2-py3-none-any.whl → xpk-0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. xpk/commands/batch.py +19 -13
  2. xpk/commands/cluster.py +240 -71
  3. xpk/commands/cluster_gcluster.py +22 -5
  4. xpk/commands/common.py +33 -1
  5. xpk/commands/info.py +2 -4
  6. xpk/commands/job.py +7 -8
  7. xpk/commands/kjob_common.py +30 -18
  8. xpk/commands/run.py +17 -12
  9. xpk/commands/shell.py +3 -4
  10. xpk/commands/storage.py +75 -19
  11. xpk/commands/workload.py +161 -324
  12. xpk/core/blueprint/blueprint_definitions.py +2 -0
  13. xpk/core/blueprint/blueprint_generator.py +335 -45
  14. xpk/core/capacity.py +1 -0
  15. xpk/core/cluster.py +193 -12
  16. xpk/core/config.py +3 -1
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +9 -21
  19. xpk/core/filestore.py +5 -1
  20. xpk/core/gcsfuse.py +27 -6
  21. xpk/core/kjob.py +66 -20
  22. xpk/core/kueue.py +30 -0
  23. xpk/core/mtc.py +195 -0
  24. xpk/core/nap.py +4 -0
  25. xpk/core/network.py +34 -22
  26. xpk/core/nodepool.py +28 -26
  27. xpk/core/pathways.py +165 -210
  28. xpk/core/resources.py +21 -0
  29. xpk/core/scheduling.py +36 -0
  30. xpk/core/storage.py +66 -12
  31. xpk/core/system_characteristics.py +9 -0
  32. xpk/core/workload.py +28 -83
  33. xpk/core/workload_decorators/rdma_decorator.py +11 -15
  34. xpk/core/workload_decorators/storage_decorator.py +8 -3
  35. xpk/core/workload_decorators/tcpx_decorator.py +179 -0
  36. xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
  37. xpk/parser/cluster.py +574 -381
  38. xpk/parser/storage.py +25 -5
  39. xpk/parser/workload.py +59 -31
  40. xpk/utils/kubectl.py +4 -1
  41. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
  42. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
  43. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
  44. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
  45. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
  46. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py CHANGED
@@ -14,20 +14,22 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

- from ..core.blueprint.blueprint_generator import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
  from ..core.cluster import (
+ XPK_SA,
  create_xpk_k8s_service_account,
  get_cluster_credentials,
  setup_k8s_env,
- XPK_SA,
  )
  from ..core.commands import run_command_with_updates, run_commands
- from ..core.config import VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION, parse_env_config
+ from ..core.config import (
+ VERTEX_TENSORBOARD_FEATURE_FLAG,
+ XPK_CURRENT_VERSION,
+ parse_env_config,
+ )
  from ..core.docker_container import (
  get_main_container_docker_image,
  get_user_workload_container,
  )
-
  from ..core.docker_resources import get_volumes
  from ..core.gcloud_context import add_zone_and_project
  from ..core.kueue import LOCAL_QUEUE_NAME
@@ -36,54 +38,58 @@ from ..core.nap import (
  get_autoprovisioning_node_selector_args,
  is_autoprovisioning_enabled,
  )
+ from ..core.network import get_cluster_subnetworks
  from ..core.pathways import (
+ append_custom_colocated_python_sidecar,
+ append_custom_pathways_proxy_server,
+ append_custom_pathways_server,
+ append_custom_pathways_worker,
+ check_if_pathways_job_is_installed,
  ensure_pathways_workload_prerequisites,
- get_pathways_proxy_args,
- get_pathways_rm_args,
- get_pathways_sidecar_container,
  get_pathways_unified_query_link,
- get_pathways_worker_args,
  get_user_workload_for_pathways,
+ try_to_delete_pathwaysjob_first,
  )
  from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
  from ..core.scheduling import (
  check_if_workload_can_schedule,
  create_accelerator_label,
  create_machine_label,
+ create_tpu_machine_type,
+ create_tpu_topology,
  get_cpu_affinity,
  get_gpu_scheduler,
  )
  from ..core.storage import (
- GCS_FUSE_TYPE,
+ GCE_PD_TYPE,
  GCP_FILESTORE_TYPE,
+ GCS_FUSE_TYPE,
+ PARALLELSTORE_TYPE,
  Storage,
  add_bucket_iam_members,
- get_storage_volume_mounts_yaml,
- get_storage_volumes_yaml,
+ get_storage_annotations,
  get_storages_to_mount,
- get_storage_volume_mounts_yaml_for_gpu,
- get_storage_volumes_yaml_for_gpu,
- GCS_FUSE_ANNOTATION,
  )
  from ..core.system_characteristics import (
  AcceleratorType,
- AcceleratorTypeToAcceleratorCharacteristics,
  get_system_characteristics,
  )
  from ..core.vertex import create_vertex_experiment
  from ..core.workload import (
+ add_gpu_rxdm_container,
  check_if_workload_exists,
- get_gpu_rxdm_cmd,
- get_gpu_rxdm_image,
- get_gpu_tcp_volume,
- get_gpu_volume,
  get_workload_list,
  wait_for_job_completion,
  zone_to_region,
  )
- from ..core.workload_decorators import rdma_decorator, tcpxo_decorator, storage_decorator
+ from ..core.workload_decorators import (
+ rdma_decorator,
+ storage_decorator,
+ tcpxo_decorator,
+ )
  from ..utils.console import get_user_input, xpk_exit, xpk_print
  from ..utils.file import write_tmp_file
+ from .common import is_TAS_possible
  from . import cluster_gcluster

  WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
@@ -139,7 +145,8 @@ GPU_WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
  kind: JobSet
  metadata:
  name: {args.workload}
- annotations: {storage_annotations}
+ annotations:
+ {storage_annotations}
  labels:
  kueue.x-k8s.io/queue-name: multislice-queue # Name of the LocalQueue
  xpk.google.com/workload: {args.workload}
@@ -176,29 +183,8 @@ spec:
  - operator: "Exists"
  key: nvidia.com/gpu
  volumes:
- {gpu_volume}
- {storage_volumes}
+ {volumes}
  containers:
- {gpu_rxdm_image}
- imagePullPolicy: Always
- command:
- - "bash"
- - "-c"
- - |
- {gpu_rxdm_cmd} &
- while [ ! -e "/usr/share/workload/workload_terminated" ]; do sleep 10; echo "sleeping"; done
- securityContext:
- privileged: true
- volumeMounts:
- {gpu_tcp_volume}
- {storage_volume_mounts}
- - name: nvidia-install-dir-host
- mountPath: /usr/local/nvidia/lib64
- - name: workload-terminated-volume
- mountPath: /usr/share/workload
- env:
- - name: LD_LIBRARY_PATH
- value: /usr/local/nvidia/lib64
  {container}
  """

@@ -228,7 +214,7 @@ spec:
  labels:
  xpk.google.com/workload: {args.workload}
  annotations:
- kueue.x-k8s.io/podset-preferred-topology: "cloud.google.com/gce-topology-host"
+ {kueue_TAS_annotation}
  spec:
  priorityClassName: {args.priority}
  restartPolicy: Never
@@ -241,219 +227,37 @@ spec:
  containers:
  {container}
  """
-
- PW_WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
- kind: JobSet
- metadata:
- name: {args.workload}
- labels:
- kueue.x-k8s.io/queue-name: {local_queue_name} # Name of the LocalQueue
- xpk.google.com/workload: {args.workload}
- spec:
- ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
- failurePolicy:
- {failure_policy_rules}
- maxRestarts: {args.max_restarts}
- successPolicy:
- operator: "All"
- targetReplicatedJobs:
- - {args.targetReplicatedJob}
- replicatedJobs:
- - name: worker
- replicas: {args.num_slices}
- template:
- metadata:
- annotations:
- alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
- labels:
- xpk.google.com/workload: {args.workload}
- spec:
- backoffLimit: {backoff_limit}
- completions: {system.vms_per_slice}
- parallelism: {system.vms_per_slice}
- template:
- metadata:
- annotations:
- {storage_annotations}
- spec:
- terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
- serviceAccountName: {service_account}
- containers:
- - args:
- {pathways_worker_args}
- image: {args.server_image}
- imagePullPolicy: Always
- name: pathways-worker
- ports:
- - containerPort: 29001
- - containerPort: 8471
- - containerPort: 8080
- resources:
- limits:
- {resource_type}: {system.chips_per_vm}
- securityContext:
- privileged: true
- volumeMounts:
- - mountPath: /tmp
- name: shared-tmp
- {storage_volume_mounts}
- env:
- - name: PROJECT_ID
- value: {args.project}
- - name: LOCATION
- value: {args.zone}
- - name: CLUSTER_NAME
- value: {args.cluster}
- - name: POD_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.name
- - name: CONTAINER_NAME
- value: "pathways-worker"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: metadata.namespace
- # Workaround for v6e
- - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER
- value: "false"
- - name: MEGASCALE_NUM_SLICES
- valueFrom:
- fieldRef:
- fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']"
- - name: JOBSET_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- - name: REPLICATED_JOB_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- - name: MEGASCALE_SLICE_ID
- valueFrom:
- fieldRef:
- fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']"
- - name: MEGASCALE_COORDINATOR_ADDRESS
- value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)"
- {pathways_sidecar_container}
- nodeSelector:
- {accelerator_label}
- {machine_label}
- {autoprovisioning_args}
- priorityClassName: {args.priority}
- hostNetwork: true
- dnsPolicy: ClusterFirstWithHostNet
- volumes:
- - hostPath:
- path: /tmp
- type: DirectoryOrCreate
- name: shared-tmp
- {storage_volumes}
- - name: rm
- replicas: 1
- template:
- metadata:
- labels:
- xpk.google.com/workload: {args.workload}
- spec:
- backoffLimit: 0
- completions: 1
- parallelism: 1
- template:
- spec:
- containers:
- - args:
- {pathways_rm_args}
- env:
- - name: PROJECT_ID
- value: {args.project}
- - name: LOCATION
- value: {args.zone}
- - name: CLUSTER_NAME
- value: {args.cluster}
- - name: POD_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.name
- - name: CONTAINER_NAME
- value: "pathways-rm"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: metadata.namespace
- - name: REPLICATED_JOB_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- - name: JOBSET_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- - name: HOST_ADDRESS
- value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
- - name: TPU_SKIP_MDS_QUERY
- value: "true"
- image: {args.server_image}
- imagePullPolicy: Always
- name: pathways-rm
- ports:
- - containerPort: 29001
- securityContext:
- privileged: true
- volumeMounts:
- - mountPath: /tmp
- name: shared-tmp
- nodeSelector:
- cloud.google.com/gke-nodepool: cpu-rm-np
- hostNetwork: true
- dnsPolicy: ClusterFirstWithHostNet
- volumes:
- - hostPath:
- path: /tmp
- type: DirectoryOrCreate
- name: shared-tmp
- - name: proxy
- replicas: 1
- template:
- metadata:
- labels:
- xpk.google.com/workload: {args.workload}
- spec:
- backoffLimit: 0
- completions: 1
- parallelism: 1
- template:
- spec:
- containers:
- - args:
- {pathways_proxy_args}
- env:
- - name: PROJECT_ID
- value: {args.project}
- - name: LOCATION
- value: {args.zone}
- - name: CLUSTER_NAME
- value: {args.cluster}
- - name: POD_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.name
- - name: CONTAINER_NAME
- value: "pathways-proxy"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: metadata.namespace
- image: {args.proxy_server_image}
- imagePullPolicy: Always
- name: pathways-proxy
- ports:
- - containerPort: 29000
- hostNetwork: true
- dnsPolicy: ClusterFirstWithHostNet
- nodeSelector:
- cloud.google.com/gke-nodepool: cpu-proxy-np
- {user_workload}
+ # The indentation of PW_WORKLOAD_CREATE_YAML is intentional to allow reusing the user workload container YAML.
+ PW_WORKLOAD_CREATE_YAML = """
+ apiVersion: pathways-job.pathways.domain/v1
+ kind: PathwaysJob
+ metadata:
+ name: {args.workload}
+ labels:
+ kueue.x-k8s.io/queue-name: {local_queue_name} # Name of the LocalQueue
+ xpk.google.com/workload: {args.workload}
+ spec:
+ maxRestarts: {args.max_restarts}
+ customComponents:
+ {custom_pathways_proxy_server}
+ {custom_pathways_server}
+ {custom_pathways_worker}
+ {colocated_python_sidecar}
+ workers:
+ - type: {machine_type}
+ topology: {topology}
+ numSlices: {args.num_slices}
+ maxSliceRestarts: {args.max_slice_restarts}
+ terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+ priorityClassName: {args.priority}
+ pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
+ controller:
+ # #Pod template for training, default mode.
+ deploymentMode: default
+ mainContainerName: {args.docker_name}
+ elasticSlices: {args.elastic_slices}
+ template:
+ {user_workload}
  """

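Note: the {args.workload}-style placeholders in these templates are resolved by Python's str.format, which supports attribute lookup on keyword arguments. A minimal sketch of how the new PathwaysJob template gets filled in, using a SimpleNamespace in place of xpk's parsed CLI args and hypothetical values (not taken from this diff):

# Minimal, self-contained sketch; not the real xpk args object or real defaults.
from types import SimpleNamespace

TEMPLATE = """apiVersion: pathways-job.pathways.domain/v1
kind: PathwaysJob
metadata:
  name: {args.workload}
spec:
  maxRestarts: {args.max_restarts}
  workers:
  - type: {machine_type}
    topology: {topology}
    numSlices: {args.num_slices}
"""

# Hypothetical values for illustration only.
args = SimpleNamespace(workload='demo', max_restarts=0, num_slices=2)
print(TEMPLATE.format(args=args, machine_type='ct5p-hightpu-4t', topology='2x2x1'))
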
@@ -545,7 +349,6 @@ def workload_create(args) -> None:

  parse_env_config(args, tensorboard_config, system)

- # Currently autoprovisioning is not enabled for Pathways workloads.
  autoprovisioning_args = ''
  autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
  args, system
@@ -560,28 +363,72 @@ def workload_create(args) -> None:
  if return_code != 0:
  xpk_exit(return_code)

- storages: list[Storage] = get_storages_to_mount(k8s_api_client, args.storage)
- gcs_fuse_storages = list(
- filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
- )
- gcpfilestore_storages: list[Storage] = list(
- filter(lambda storage: storage.type == GCP_FILESTORE_TYPE, storages)
- )
- storage_annotations = ''
  service_account = ''
- if len(gcs_fuse_storages) > 0:
- storage_annotations = GCS_FUSE_ANNOTATION
- service_account = XPK_SA
- xpk_print(f'Detected gcsfuse Storages to add: {gcs_fuse_storages}')
- else:
- xpk_print('No gcsfuse Storages to add detected')
- failure_policy_rules = """rules:
+ all_storages = []
+ # Currently storage customization is not supported for Pathways workloads. b/408468941
+ if not args.use_pathways:
+ storages: list[Storage] = get_storages_to_mount(
+ k8s_api_client, args.storage
+ )
+ gcs_fuse_storages = list(
+ filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
+ )
+ gcpfilestore_storages: list[Storage] = list(
+ filter(lambda storage: storage.type == GCP_FILESTORE_TYPE, storages)
+ )
+ parallelstore_storages: list[Storage] = list(
+ filter(lambda storage: storage.type == PARALLELSTORE_TYPE, storages)
+ )
+ pd_storages: list[Storage] = list(
+ filter(lambda storage: storage.type == GCE_PD_TYPE, storages)
+ )
+ if len(gcs_fuse_storages) > 0:
+ service_account = XPK_SA
+ xpk_print(f'Detected gcsfuse Storages to add: {gcs_fuse_storages}')
+ else:
+ xpk_print('No gcsfuse Storages to add detected')
+
+ if len(gcpfilestore_storages) > 0:
+ service_account = XPK_SA
+ xpk_print(
+ f'Detected gcp filestores instances to add: {gcpfilestore_storages}'
+ )
+ else:
+ xpk_print('No gcp filestore instances to add detected.')
+
+ if len(parallelstore_storages) > 0:
+ service_account = XPK_SA
+ xpk_print(
+ 'Detected gcp parallelstore instances to add:'
+ f' {parallelstore_storages}'
+ )
+ else:
+ xpk_print('No gcp parallelstore instances to add detected.')
+
+ if len(pd_storages) > 0:
+ service_account = XPK_SA
+ xpk_print(f'Detected gce persistent disk instances to add: {pd_storages}')
+ else:
+ xpk_print('No gce persistent disk instances to add detected.')
+
+ all_storages = (
+ gcs_fuse_storages
+ + gcpfilestore_storages
+ + parallelstore_storages
+ + pd_storages
+ )
+
+ # Currently failure policy rules are supported for Pathways workloads. b/408465881
+ failure_policy_rules = ''
+ pod_failure_policy = ''
+ if not args.use_pathways:
+ failure_policy_rules = """rules:
  - action: FailJobSet
- onJobFailureReasons:
+ onJobFailureReasons:
  - PodFailurePolicy"""
- restart_on_exit_codes = get_restart_exit_codes(args)
- restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes))
- pod_failure_policy = f"""
+ restart_on_exit_codes = get_restart_exit_codes(args)
+ restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes))
+ pod_failure_policy = f"""
  podFailurePolicy:
  rules:
  - action: FailJob
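Note: the storage block above groups mounts by type before concatenating them into all_storages. A minimal sketch of that filter-by-type pattern, using a stand-in Storage dataclass and stand-in type constants rather than the real ones from xpk.core.storage:

from dataclasses import dataclass

# Stand-ins for illustration only; the real constants and Storage class live in xpk.core.storage.
GCS_FUSE_TYPE = 'gcsfuse'
GCP_FILESTORE_TYPE = 'gcpfilestore'

@dataclass
class Storage:
    name: str
    type: str

storages = [Storage('bucket-a', GCS_FUSE_TYPE), Storage('share-b', GCP_FILESTORE_TYPE)]
gcs_fuse_storages = list(filter(lambda s: s.type == GCS_FUSE_TYPE, storages))
gcpfilestore_storages = list(filter(lambda s: s.type == GCP_FILESTORE_TYPE, storages))
all_storages = gcs_fuse_storages + gcpfilestore_storages
print([s.name for s in all_storages])  # ['bucket-a', 'share-b']
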
@@ -590,14 +437,6 @@ def workload_create(args) -> None:
  operator: NotIn
  values: [{restart_on_exit_codes}]"""

- if len(gcpfilestore_storages) > 0:
- xpk_print(
- f'Detected gcp filestores instances to add: {gcpfilestore_storages}'
- )
- service_account = XPK_SA
- else:
- xpk_print('No gcp filestore instances to add detected.')
- all_storages = gcs_fuse_storages + gcpfilestore_storages
  # Create the workload file based on accelerator type or workload type.
  if system.accelerator_type == AcceleratorType['GPU']:
  container, debugging_dashboard_id = get_user_workload_container(
@@ -609,6 +448,13 @@ def workload_create(args) -> None:
  if return_code != 0:
  xpk_exit(return_code)

+ kueue_TAS_annotation = (
+ 'kueue.x-k8s.io/podset-preferred-topology:'
+ ' "cloud.google.com/gce-topology-host"'
+ )
+ if not is_TAS_possible(args):
+ kueue_TAS_annotation = ''
+
  if system.device_type in cluster_gcluster.supported_device_types:
  yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
  args=args,
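Note: kueue_TAS_annotation above is built from two adjacent string literals, which Python concatenates into a single string; the leading space in the second literal separates the key from the quoted value. A tiny sketch:

# Adjacent string literals inside parentheses are joined into one string at compile time.
kueue_TAS_annotation = (
    'kueue.x-k8s.io/podset-preferred-topology:'
    ' "cloud.google.com/gce-topology-host"'
)
print(kueue_TAS_annotation)
# kueue.x-k8s.io/podset-preferred-topology: "cloud.google.com/gce-topology-host"
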
@@ -616,38 +462,34 @@ def workload_create(args) -> None:
  service_account=XPK_SA,
  failure_policy_rules=failure_policy_rules,
  pod_failure_policy=pod_failure_policy,
+ kueue_TAS_annotation=kueue_TAS_annotation,
  )

+ sub_networks = get_cluster_subnetworks(args)
  if args.device_type == cluster_gcluster.a3mega_device_type:
- sub_networks = get_subnetworks_for_a3mega(args.cluster)
  yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
-
- if args.device_type == cluster_gcluster.a3ultra_device_type:
- sub_networks = get_subnetworks_for_a3ultra(args.cluster)
+ elif args.device_type in [
+ cluster_gcluster.a3ultra_device_type,
+ cluster_gcluster.a4_device_type,
+ ]:
  yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)

- if len(gcs_fuse_storages) + len(gcpfilestore_storages) > 0:
+ if all_storages:
  yml_string = storage_decorator.decorate_jobset(yml_string, all_storages)
  else:
  yml_string = GPU_WORKLOAD_CREATE_YAML.format(
  args=args,
  container=container,
- command=args.command,
- chips_per_vm=system.chips_per_vm,
  gpu_scheduler=gpu_scheduler,
- gpu_volume=get_gpu_volume(system),
- gpu_rxdm_image=get_gpu_rxdm_image(system),
- gpu_rxdm_cmd=get_gpu_rxdm_cmd(system),
- gpu_tcp_volume=get_gpu_tcp_volume(system),
- storage_volumes=get_storage_volumes_yaml_for_gpu(all_storages),
- storage_volume_mounts=get_storage_volume_mounts_yaml_for_gpu(
- all_storages
+ volumes=get_volumes(args, system),
+ storage_annotations=('\n' + (' ' * 12)).join(
+ get_storage_annotations(all_storages)
  ),
- storage_annotations=storage_annotations,
  service_account=service_account,
  failure_policy_rules=failure_policy_rules,
  pod_failure_policy=pod_failure_policy,
  )
+ yml_string = add_gpu_rxdm_container(yml_string, system, all_storages)

  elif args.use_pathways and ensure_pathways_workload_prerequisites(
  args, system
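Note: the ('\n' + (' ' * 12)).join(...) call above indents every annotation after the first so the list lines up under the annotations: key of GPU_WORKLOAD_CREATE_YAML; the first entry needs no indent because it sits where the placeholder already is. A short sketch with illustrative annotation strings (get_storage_annotations itself is not part of this diff):

# Illustrative annotation lines; the real ones come from get_storage_annotations().
annotations = [
    'gke-gcsfuse/volumes: "true"',
    'gke-gcsfuse/memory-limit: "0"',
]
storage_annotations = ('\n' + (' ' * 12)).join(annotations)
print(storage_annotations)
# gke-gcsfuse/volumes: "true"
#             gke-gcsfuse/memory-limit: "0"
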
@@ -655,29 +497,14 @@ def workload_create(args) -> None:
  yml_string = PW_WORKLOAD_CREATE_YAML.format(
  args=args,
  system=system,
- accelerator_label=create_accelerator_label(
- system.accelerator_type, system
- ),
- machine_label=create_machine_label(system.accelerator_type, system),
- pathways_worker_args=get_pathways_worker_args(args),
- pathways_proxy_args=get_pathways_proxy_args(args),
- pathways_sidecar_container=get_pathways_sidecar_container(args),
- user_workload=get_user_workload_for_pathways(
- args, system, pod_failure_policy, storages
- ),
- resource_type=AcceleratorTypeToAcceleratorCharacteristics[
- system.accelerator_type
- ].resource_type,
+ topology=create_tpu_topology(system.accelerator_type, system),
+ machine_type=create_tpu_machine_type(system.accelerator_type, system),
+ custom_pathways_proxy_server=append_custom_pathways_proxy_server(args),
+ custom_pathways_server=append_custom_pathways_server(args),
+ custom_pathways_worker=append_custom_pathways_worker(args),
+ colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
+ user_workload=get_user_workload_for_pathways(args, system),
  local_queue_name=LOCAL_QUEUE_NAME,
- autoprovisioning_args=autoprovisioning_args,
- backoff_limit=system.vms_per_slice * 4,
- storage_annotations=storage_annotations,
- storage_volumes=get_storage_volumes_yaml(all_storages),
- storage_volume_mounts=get_storage_volume_mounts_yaml(all_storages),
- pathways_rm_args=get_pathways_rm_args(args, system),
- service_account=service_account,
- failure_policy_rules=failure_policy_rules,
- pod_failure_policy=pod_failure_policy,
  )
  else:
  container, debugging_dashboard_id = get_user_workload_container(
@@ -695,7 +522,9 @@ def workload_create(args) -> None:
  local_queue_name=LOCAL_QUEUE_NAME,
  autoprovisioning_args=autoprovisioning_args,
  volumes=get_volumes(args, system),
- storage_annotations=storage_annotations,
+ storage_annotations=('\n' + (' ' * 16)).join(
+ get_storage_annotations(all_storages)
+ ),
  service_account=service_account,
  failure_policy_rules=failure_policy_rules,
  pod_failure_policy=pod_failure_policy,
@@ -708,7 +537,9 @@ def workload_create(args) -> None:
  xpk_print(f'Create Workload request returned ERROR {return_code}')
  xpk_exit(return_code)

- add_bucket_iam_members(args, storages)
+ if not args.use_pathways:
+ add_bucket_iam_members(args, storages)
+
  # Get GKE outlier dashboard for TPU
  outlier_dashboard_id = None
  if system.accelerator_type == AcceleratorType['TPU']:
@@ -833,6 +664,12 @@ def workload_delete(args) -> None:
  elif not will_delete:
  xpk_print('Skipping delete command.')
  else:
+ # If PathwaysJob exists, delete it.
+ if check_if_pathways_job_is_installed(
+ args
+ ) and try_to_delete_pathwaysjob_first(args, workloads):
+ xpk_exit(0)
+ # PathwaysJob workload does not exist, delete JobSet
  commands = []
  task_names = []
  for workload in workloads:
xpk/core/blueprint/blueprint_definitions.py CHANGED
@@ -60,3 +60,5 @@ class Blueprint:
  toolkit_modules_url: str
  toolkit_modules_version: str
  vars: dict[str, str | list[str]] | None
+ terraform_providers: Optional[dict[str, Any]] = None
+ validators: Optional[list[Any]] = None
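Note: the two new Blueprint fields default to None and rely on Optional and Any from typing. A trimmed sketch, assuming Blueprint is a dataclass (only the fields visible in this hunk are shown, with hypothetical values at the call site):

from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class Blueprint:
    # Trimmed stand-in for xpk.core.blueprint.blueprint_definitions.Blueprint.
    toolkit_modules_url: str
    toolkit_modules_version: str
    vars: dict[str, str | list[str]] | None
    terraform_providers: Optional[dict[str, Any]] = None
    validators: Optional[list[Any]] = None

# Hypothetical values; existing call sites can keep omitting the new fields.
bp = Blueprint(
    toolkit_modules_url='github.com/GoogleCloudPlatform/cluster-toolkit//modules',
    toolkit_modules_version='v1.38.0',
    vars=None,
)
print(bp.terraform_providers, bp.validators)  # None None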