xpk 0.7.2.tar.gz → 0.8.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. {xpk-0.7.2/src/xpk.egg-info → xpk-0.8.0}/PKG-INFO +60 -4
  2. {xpk-0.7.2 → xpk-0.8.0}/README.md +59 -3
  3. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/batch.py +19 -12
  4. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/cluster.py +33 -16
  5. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/cluster_gcluster.py +22 -5
  6. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/info.py +2 -4
  7. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/job.py +7 -8
  8. xpk-0.8.0/src/xpk/commands/kjob_common.py +47 -0
  9. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/run.py +17 -11
  10. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/shell.py +3 -4
  11. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/storage.py +64 -19
  12. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/workload.py +154 -319
  13. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/blueprint/blueprint_definitions.py +2 -0
  14. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/blueprint/blueprint_generator.py +322 -32
  15. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/capacity.py +1 -0
  16. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/cluster.py +75 -5
  17. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/config.py +3 -1
  18. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/docker_manager.py +1 -1
  19. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/docker_resources.py +9 -21
  20. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/filestore.py +11 -3
  21. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/gcsfuse.py +8 -5
  22. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/kjob.py +57 -18
  23. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/nap.py +4 -0
  24. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/network.py +11 -21
  25. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/nodepool.py +28 -26
  26. xpk-0.8.0/src/xpk/core/pathways.py +332 -0
  27. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/scheduling.py +36 -0
  28. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/storage.py +66 -12
  29. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/system_characteristics.py +9 -0
  30. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/workload.py +27 -82
  31. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/workload_decorators/rdma_decorator.py +3 -3
  32. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/workload_decorators/storage_decorator.py +8 -3
  33. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +2 -2
  34. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/cluster.py +15 -6
  35. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/storage.py +14 -3
  36. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/workload.py +59 -31
  37. {xpk-0.7.2 → xpk-0.8.0/src/xpk.egg-info}/PKG-INFO +60 -4
  38. xpk-0.7.2/src/xpk/commands/kjob_common.py +0 -44
  39. xpk-0.7.2/src/xpk/core/pathways.py +0 -377
  40. {xpk-0.7.2 → xpk-0.8.0}/LICENSE +0 -0
  41. {xpk-0.7.2 → xpk-0.8.0}/pyproject.toml +0 -0
  42. {xpk-0.7.2 → xpk-0.8.0}/setup.cfg +0 -0
  43. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/__init__.py +0 -0
  44. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/api/__init__.py +0 -0
  45. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/api/storage_crd.yaml +0 -0
  46. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/__init__.py +0 -0
  47. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/common.py +0 -0
  48. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/config.py +0 -0
  49. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/inspector.py +0 -0
  50. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/kind.py +0 -0
  51. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/version.py +0 -0
  52. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/__init__.py +0 -0
  53. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/blueprint/__init__.py +0 -0
  54. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/cluster_private.py +0 -0
  55. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/commands.py +0 -0
  56. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/docker_container.py +0 -0
  57. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/docker_image.py +0 -0
  58. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/gcloud_context.py +0 -0
  59. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/gcluster_manager.py +0 -0
  60. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/kueue.py +0 -0
  61. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/monitoring.py +0 -0
  62. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/ray.py +0 -0
  63. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/remote_state/__init__.py +0 -0
  64. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  65. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  66. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/resources.py +0 -0
  67. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/vertex.py +0 -0
  68. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
  69. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/main.py +0 -0
  70. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/__init__.py +0 -0
  71. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/batch.py +0 -0
  72. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/common.py +0 -0
  73. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/config.py +0 -0
  74. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/core.py +0 -0
  75. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/info.py +0 -0
  76. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/inspector.py +0 -0
  77. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/job.py +0 -0
  78. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/kind.py +0 -0
  79. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/run.py +0 -0
  80. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/shell.py +0 -0
  81. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/validators.py +0 -0
  82. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/version.py +0 -0
  83. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/templates/__init__.py +0 -0
  84. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/templates/storage.yaml +0 -0
  85. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/__init__.py +0 -0
  86. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/console.py +0 -0
  87. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/file.py +0 -0
  88. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/gcs_utils.py +0 -0
  89. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/kubectl.py +0 -0
  90. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/network.py +0 -0
  91. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/objects.py +0 -0
  92. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/templates.py +0 -0
  93. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/validation.py +0 -0
  94. {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/yaml.py +0 -0
  95. {xpk-0.7.2 → xpk-0.8.0}/src/xpk.egg-info/SOURCES.txt +0 -0
  96. {xpk-0.7.2 → xpk-0.8.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  97. {xpk-0.7.2 → xpk-0.8.0}/src/xpk.egg-info/entry_points.txt +0 -0
  98. {xpk-0.7.2 → xpk-0.8.0}/src/xpk.egg-info/requires.txt +0 -0
  99. {xpk-0.7.2 → xpk-0.8.0}/src/xpk.egg-info/top_level.txt +0 -0

--- xpk-0.7.2/src/xpk.egg-info/PKG-INFO
+++ xpk-0.8.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 0.7.2
+Version: 0.8.0
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0
@@ -88,9 +88,11 @@ and the following GPU types:
 and the following CPU types:
 * n2-standard-32

-xpk also supports Google Cloud Storage solutions:
+xpk also supports [Google Cloud Storage solutions](#storage):
 * [Cloud Storage FUSE](#fuse)
 * [Filestore](#filestore)
+* [Parallelstore](#parallelstore)
+* [Block storage (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)

 # Permissions needed on Cloud Console:

@@ -253,6 +255,7 @@ all zones.
   --num-slices=4 --on-demand \
   --tpu-type=v5litepod-16
 ```
+Note that Pathways clusters need a CPU nodepool of n2-standard-64 or higher.

 * Cluster Create for Ray:
 A cluster with KubeRay enabled and a RayCluster can be created using `cluster create-ray`.
@@ -475,7 +478,11 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach


 ## Storage
-Currently XPK supports two types of storages: Cloud Storage FUSE and Google Cloud Filestore.
+Currently XPK supports the below types of storages:
+- [Cloud Storage FUSE](#fuse)
+- [Google Cloud Filestore](#filestore)
+- [Google Cloud Parallelstore](#parallelstore)
+- [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)

 ### FUSE
 A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
@@ -499,11 +506,12 @@ Parameters:
 - `--readonly` - if set to true, workload can only read from storage.
 - `--size` - size of the storage in Gb.
 - `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
+- `--mount-options` - comma-separated list of additional mount options for PersistentVolume ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#mount-options)).
 - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions. If set, then values from manifest override the following parameters: `--size` and `--bucket`.

 ### Filestore

-A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write objects in your volumes using standard file system semantics.
+A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.

 To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:

@@ -537,6 +545,54 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
 - `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
 - `--manifest` - path to the manifest file containing PersistentVolume, PresistentVolumeClaim and StorageClass definitions. If set, then values from manifest override the following parameters: `--access-mode`, `--size` and `--volume`.

+### Parallelstore
+
+A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+
+To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
+
+Once it's ready you can use `xpk storage attach` with `--type=parallelstore` command to attach a Parallelstore instance to your cluster. Currently, attaching a Parallelstore is supported only by providing a manifest file.
+
+```shell
+python3 xpk.py storage attach test-parallelstore-storage --type=parallelstore \
+  --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
+  --mount-point='/test-mount-point' --readonly=false \
+  --auto-mount=true \
+  --manifest='./examples/storage/parallelstore-manifest-attach.yaml'
+```
+
+Parameters:
+
+- `--type` - type of the storage `parallelstore`
+- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
+- `--mount-point` - the path on which this storage should be mounted for a workload.
+- `--readonly` - if set to true, workload can only read from storage.
+- `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
+
+### Block storage (Persistent Disk, Hyperdisk)
+
+A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+
+To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please consider that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
+
+Once it's ready you can use `xpk storage attach` with `--type=pd` command to attach a PersistentDisk instance to your cluster. Currently, attaching a PersistentDisk is supported only by providing a manifest file.
+
+```shell
+python3 xpk.py storage attach test-pd-storage --type=pd \
+  --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
+  --mount-point='/test-mount-point' --readonly=false \
+  --auto-mount=true \
+  --manifest='./examples/storage/pd-manifest-attach.yaml'
+```
+
+Parameters:
+
+- `--type` - type of the storage `pd`
+- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
+- `--mount-point` - the path on which this storage should be mounted for a workload.
+- `--readonly` - if set to true, workload can only read from storage.
+- `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
+
 ### List attached storages

 ```shell

--- xpk-0.7.2/README.md
+++ xpk-0.8.0/README.md
@@ -56,9 +56,11 @@ and the following GPU types:
 and the following CPU types:
 * n2-standard-32

-xpk also supports Google Cloud Storage solutions:
+xpk also supports [Google Cloud Storage solutions](#storage):
 * [Cloud Storage FUSE](#fuse)
 * [Filestore](#filestore)
+* [Parallelstore](#parallelstore)
+* [Block storage (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)

 # Permissions needed on Cloud Console:

@@ -221,6 +223,7 @@ all zones.
   --num-slices=4 --on-demand \
   --tpu-type=v5litepod-16
 ```
+Note that Pathways clusters need a CPU nodepool of n2-standard-64 or higher.

 * Cluster Create for Ray:
 A cluster with KubeRay enabled and a RayCluster can be created using `cluster create-ray`.
@@ -443,7 +446,11 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach


 ## Storage
-Currently XPK supports two types of storages: Cloud Storage FUSE and Google Cloud Filestore.
+Currently XPK supports the below types of storages:
+- [Cloud Storage FUSE](#fuse)
+- [Google Cloud Filestore](#filestore)
+- [Google Cloud Parallelstore](#parallelstore)
+- [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)

 ### FUSE
 A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
@@ -467,11 +474,12 @@ Parameters:
 - `--readonly` - if set to true, workload can only read from storage.
 - `--size` - size of the storage in Gb.
 - `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
+- `--mount-options` - comma-separated list of additional mount options for PersistentVolume ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#mount-options)).
 - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions. If set, then values from manifest override the following parameters: `--size` and `--bucket`.

 ### Filestore

-A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write objects in your volumes using standard file system semantics.
+A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.

 To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:

@@ -505,6 +513,54 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
 - `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
 - `--manifest` - path to the manifest file containing PersistentVolume, PresistentVolumeClaim and StorageClass definitions. If set, then values from manifest override the following parameters: `--access-mode`, `--size` and `--volume`.

+### Parallelstore
+
+A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+
+To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
+
+Once it's ready you can use `xpk storage attach` with `--type=parallelstore` command to attach a Parallelstore instance to your cluster. Currently, attaching a Parallelstore is supported only by providing a manifest file.
+
+```shell
+python3 xpk.py storage attach test-parallelstore-storage --type=parallelstore \
+  --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
+  --mount-point='/test-mount-point' --readonly=false \
+  --auto-mount=true \
+  --manifest='./examples/storage/parallelstore-manifest-attach.yaml'
+```
+
+Parameters:
+
+- `--type` - type of the storage `parallelstore`
+- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
+- `--mount-point` - the path on which this storage should be mounted for a workload.
+- `--readonly` - if set to true, workload can only read from storage.
+- `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
+
+### Block storage (Persistent Disk, Hyperdisk)
+
+A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+
+To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please consider that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
+
+Once it's ready you can use `xpk storage attach` with `--type=pd` command to attach a PersistentDisk instance to your cluster. Currently, attaching a PersistentDisk is supported only by providing a manifest file.
+
+```shell
+python3 xpk.py storage attach test-pd-storage --type=pd \
+  --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
+  --mount-point='/test-mount-point' --readonly=false \
+  --auto-mount=true \
+  --manifest='./examples/storage/pd-manifest-attach.yaml'
+```
+
+Parameters:
+
+- `--type` - type of the storage `pd`
+- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
+- `--mount-point` - the path on which this storage should be mounted for a workload.
+- `--readonly` - if set to true, workload can only read from storage.
+- `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
+
 ### List attached storages

 ```shell

--- xpk-0.7.2/src/xpk/commands/batch.py
+++ xpk-0.8.0/src/xpk/commands/batch.py
@@ -14,18 +14,26 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import re
 from argparse import Namespace

-from ..core.cluster import create_xpk_k8s_service_account
+from ..core.cluster import (
+    create_xpk_k8s_service_account,
+    get_cluster_credentials,
+)
 from ..core.commands import run_command_for_value
 from ..core.gcloud_context import add_zone_and_project
+from ..core.kjob import (
+    AppProfileDefaults,
+    JobTemplateDefaults,
+    Kueue_TAS_annotation,
+    get_storage_annotations,
+    prepare_kjob,
+)
 from ..core.kueue import LOCAL_QUEUE_NAME
 from ..utils.console import xpk_exit, xpk_print
-from .common import set_cluster_command
-from ..core.kjob import AppProfileDefaults, JobTemplateDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
-from .kjob_common import add_gpu_networking_annotations_to_command
 from .kind import set_local_cluster_command
-import re
+from .kjob_common import add_gpu_networking_annotations_to_command


 def batch(args: Namespace) -> None:
@@ -38,12 +46,11 @@ def batch(args: Namespace) -> None:
   """
   if not args.kind_cluster:
     add_zone_and_project(args)
-    set_cluster_command_code = set_cluster_command(args)
+    get_cluster_credentials(args)
   else:
     set_cluster_command_code = set_local_cluster_command(args)
-
-  if set_cluster_command_code != 0:
-    xpk_exit(set_cluster_command_code)
+    if set_cluster_command_code != 0:
+      xpk_exit(set_cluster_command_code)

   err_code = prepare_kjob(args)
   if err_code > 0:
@@ -66,9 +73,9 @@ def submit_job(args: Namespace) -> None:
       ' --first-node-ip'
   )
   cmd = add_gpu_networking_annotations_to_command(args, cmd)
-  gcsfuse_annotation = get_gcsfuse_annotation(args)
-  if gcsfuse_annotation is not None:
-    cmd += f' --pod-template-annotation {gcsfuse_annotation}'
+
+  for annotation in get_storage_annotations(args):
+    cmd += f' --pod-template-annotation {annotation}'

   if args.ignore_unknown_flags:
     cmd += ' --ignore-unknown-flags'
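
The batch.py change above generalizes storage handling: 0.7.2 attached at most one gcsfuse annotation, while 0.8.0 emits one `--pod-template-annotation` flag per attached storage. A minimal sketch of the new flag building, assuming `get_storage_annotations` yields `key=value` strings (the helper name and profile name are illustrative; `gke-gcsfuse/volumes=true` is the GKE Cloud Storage FUSE CSI pod annotation):

```python
# Sketch only: mirrors the loop in submit_job above, not xpk's public API.
def with_storage_annotations(cmd: str, annotations: list[str]) -> str:
  # One flag per attached storage (gcsfuse, filestore, parallelstore, pd).
  for annotation in annotations:
    cmd += f' --pod-template-annotation {annotation}'
  return cmd


print(with_storage_annotations(
    'kjobctl create slurm --profile my-app-profile',
    ['gke-gcsfuse/volumes=true'],
))
# -> kjobctl create slurm --profile my-app-profile --pod-template-annotation gke-gcsfuse/volumes=true
```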

--- xpk-0.7.2/src/xpk/commands/cluster.py
+++ xpk-0.8.0/src/xpk/commands/cluster.py
@@ -22,9 +22,13 @@ from ..core.cluster import (
     get_cluster_credentials,
     install_nccl_on_cluster,
     set_jobset_on_cluster,
+    set_pathways_job_on_cluster,
     setup_k8s_env,
     update_cluster_with_gcsfuse_driver_if_necessary,
     update_cluster_with_workload_identity_if_necessary,
+    update_cluster_with_gcpfilestore_driver_if_necessary,
+    update_cluster_with_parallelstore_driver_if_necessary,
+    update_cluster_with_pd_driver_if_necessary,
 )
 from ..core.cluster_private import authorize_private_cluster_access_if_necessary
 from ..core.commands import run_command_for_value, run_command_with_updates
@@ -46,7 +50,7 @@ from ..core.nap import enable_autoprovisioning_on_cluster
 from ..core.network import (
     create_cluster_network_config,
     delete_cluster_subnets,
-    set_up_cluster_network_for_gpu,
+    set_up_cluster_network_for_a3,
 )
 from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command
 from ..core.ray import install_ray_cluster
@@ -64,7 +68,6 @@ from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from . import cluster_gcluster
 from .common import set_cluster_command
-from ..core.cluster import update_cluster_with_gcpfilestore_driver_if_necessary


 def cluster_create(args) -> None:
@@ -117,11 +120,7 @@ def cluster_create(args) -> None:

   # ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
   # Enable WorkloadIdentity if not enabled already.
-  if (
-      args.enable_workload_identity
-      or args.enable_gcsfuse_csi_driver
-      or args.enable_gcpfilestore_csi_driver
-  ):
+  if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
     update_cluster_command_code = (
         update_cluster_with_workload_identity_if_necessary(args)
     )
@@ -143,6 +142,20 @@ def cluster_create(args) -> None:
     if update_cluster_command_code != 0:
       xpk_exit(update_cluster_command_code)

+  if args.enable_parallelstore_csi_driver:
+    update_cluster_command_code = (
+        update_cluster_with_parallelstore_driver_if_necessary(args)
+    )
+    if update_cluster_command_code != 0:
+      xpk_exit(update_cluster_command_code)
+
+  if args.enable_pd_csi_driver:
+    update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
+        args
+    )
+    if update_cluster_command_code != 0:
+      xpk_exit(update_cluster_command_code)
+
   # Update Pathways clusters with CloudDNS if not enabled already.

   get_cluster_credentials(args)
@@ -155,13 +168,12 @@ def cluster_create(args) -> None:
   if not tensorboard_config:
     xpk_exit(1)

-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.device_type == H100_DEVICE_TYPE:
     xpk_print('Setting up Network for cluster')
-    set_up_cluster_network_code = set_up_cluster_network_for_gpu(args, system)
+    set_up_cluster_network_code = set_up_cluster_network_for_a3(args)
     if set_up_cluster_network_code != 0:
       xpk_exit(set_up_cluster_network_code)

-  if system.device_type == H100_DEVICE_TYPE:
     xpk_print('Creating Network Config for cluster')
     create_cluster_network_config_code = create_cluster_network_config(args)
     if create_cluster_network_config_code != 0:
@@ -207,6 +219,10 @@ def cluster_create(args) -> None:
   if set_jobset_on_cluster_code != 0:
     xpk_exit(set_jobset_on_cluster_code)

+  set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
+  if set_pathways_job_on_cluster_code != 0:
+    xpk_exit(set_pathways_job_on_cluster_code)
+
   xpk_print('Enabling Kueue on the cluster')
   install_kueue_on_cluster_code = install_kueue_on_cluster(args)
   if install_kueue_on_cluster_code != 0:
@@ -783,20 +799,21 @@ def run_gke_cluster_create_command(
   if args.enable_ray_cluster:
     command += ' --addons RayOperator'

-  if (
-      args.enable_workload_identity
-      or args.enable_gcsfuse_csi_driver
-      or args.enable_gcpfilestore_csi_driver
-  ):
+  if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
     command += f' --workload-pool={args.project}.svc.id.goog'

   addons = []
   if args.enable_gcsfuse_csi_driver:
     addons.append('GcsFuseCsiDriver')
-
   if args.enable_gcpfilestore_csi_driver:
     addons.append('GcpFilestoreCsiDriver')

+  if args.enable_parallelstore_csi_driver:
+    addons.append('ParallelstoreCsiDriver')
+
+  if args.enable_pd_csi_driver:
+    addons.append('GcePersistentDiskCsiDriver')
+
   if len(addons) > 0:
     addons_str = ','.join(addons)
     command += f' --addons={addons_str}'
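
Taken together, the addon handling in this hunk reduces to the consolidated sketch below. The addon identifiers are exactly those in the diff; the standalone helper is hypothetical (in the source this logic sits inline in run_gke_cluster_create_command):

```python
# Consolidated sketch of the CSI-driver addon flags xpk 0.8.0 can pass to
# `gcloud container clusters create`.
def build_addons_flag(
    gcsfuse: bool, filestore: bool, parallelstore: bool, pd: bool
) -> str:
  addons = []
  if gcsfuse:
    addons.append('GcsFuseCsiDriver')
  if filestore:
    addons.append('GcpFilestoreCsiDriver')
  if parallelstore:
    addons.append('ParallelstoreCsiDriver')
  if pd:
    addons.append('GcePersistentDiskCsiDriver')
  return f' --addons={",".join(addons)}' if addons else ''


assert build_addons_flag(True, False, True, False) == (
    ' --addons=GcsFuseCsiDriver,ParallelstoreCsiDriver'
)
```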

--- xpk-0.7.2/src/xpk/commands/cluster_gcluster.py
+++ xpk-0.8.0/src/xpk/commands/cluster_gcluster.py
@@ -16,26 +16,27 @@ limitations under the License.

 import os

-from ..core.remote_state.remote_state_client import RemoteStateClient
-from ..core.remote_state.fuse_remote_state import FuseStateClient
 from ..core.blueprint.blueprint_generator import (
     BlueprintGenerator,
     BlueprintGeneratorOutput,
     a3mega_device_type,
     a3ultra_device_type,
+    a4_device_type,
     supported_device_types,
 )
-from ..core.commands import run_command_for_value
 from ..core.capacity import get_capacity_type
+from ..core.cluster import get_cluster_credentials
+from ..core.commands import run_command_for_value
 from ..core.docker_manager import DockerManager
 from ..core.gcloud_context import zone_to_region
 from ..core.gcluster_manager import GclusterManager
+from ..core.kjob import apply_kjob_crds, prepare_kjob
+from ..core.remote_state.fuse_remote_state import FuseStateClient
+from ..core.remote_state.remote_state_client import RemoteStateClient
 from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import ensure_directory_exists
 from ..utils.network import all_IPs_cidr
 from ..utils.objects import hash_string
-from ..core.cluster import get_cluster_credentials
-from ..core.kjob import apply_kjob_crds, prepare_kjob

 blueprints_path = os.path.abspath('xpkclusters/blueprints')
 gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
@@ -266,4 +267,20 @@ def generate_blueprint(
         system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
         gcs_bucket=args.cluster_state_gcs_bucket,
     )
+  if args.device_type == a4_device_type:
+    num_nodes = args.num_nodes if not args.num_nodes is None else 2
+    return bpg.generate_a4_blueprint(
+        blueprint_name=blueprint_name,
+        prefix=prefix,
+        cluster_name=args.cluster,
+        region=zone_to_region(args.zone),
+        project_id=args.project,
+        zone=args.zone,
+        auth_cidr=all_IPs_cidr,
+        num_nodes=num_nodes,
+        reservation=args.reservation if args.reservation else None,
+        capacity_type=capacity_type,
+        system_node_pool_machine_type=args.default_pool_cpu_machine_type,
+        system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
+    )
   return None

--- xpk-0.7.2/src/xpk/commands/info.py
+++ xpk-0.8.0/src/xpk/commands/info.py
@@ -20,10 +20,10 @@ from argparse import Namespace
 from tabulate import tabulate

 from ..core.commands import run_command_for_value
+from ..core.cluster import get_cluster_credentials
 from ..core.gcloud_context import add_zone_and_project
 from ..core.kueue import verify_kueuectl
 from ..utils.console import xpk_exit, xpk_print
-from .common import set_cluster_command

 table_fmt = 'plain'

@@ -37,9 +37,7 @@ def info(args: Namespace) -> None:
     None
   """
   add_zone_and_project(args)
-  set_cluster_command_code = set_cluster_command(args)
-  if set_cluster_command_code != 0:
-    xpk_exit(set_cluster_command_code)
+  get_cluster_credentials(args)

   verify_kueuectl(args)
   lq, cq = bool(args.localqueue), bool(args.clusterqueue)

--- xpk-0.7.2/src/xpk/commands/job.py
+++ xpk-0.8.0/src/xpk/commands/job.py
@@ -20,10 +20,10 @@ import sys
 from ruamel.yaml import YAML

 from ..core.commands import run_command_for_value, run_command_with_updates
+from ..core.cluster import get_cluster_credentials
 from ..core.gcloud_context import add_zone_and_project
 from ..core.kjob import AppProfileDefaults
 from ..utils.console import xpk_exit, xpk_print
-from .common import set_cluster_command
 from .kind import set_local_cluster_command


@@ -143,14 +143,14 @@ def job_list(args) -> None:
   """
   if not args.kind_cluster:
     add_zone_and_project(args)
-    set_cluster_command_code = set_cluster_command(args)
+    get_cluster_credentials(args)
     msg = f'Listing jobs for project {args.project} and zone {args.zone}:'
   else:
     set_cluster_command_code = set_local_cluster_command(args)
     msg = 'Listing jobs:'
+    if set_cluster_command_code != 0:
+      xpk_exit(set_cluster_command_code)

-  if set_cluster_command_code != 0:
-    xpk_exit(set_cluster_command_code)
   xpk_print(msg, flush=True)

   return_code = run_slurm_job_list_command(args)
@@ -178,12 +178,11 @@ def job_cancel(args) -> None:
   xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
   if not args.kind_cluster:
     add_zone_and_project(args)
-    set_cluster_command_code = set_cluster_command(args)
+    get_cluster_credentials(args)
   else:
     set_cluster_command_code = set_local_cluster_command(args)
-
-  if set_cluster_command_code != 0:
-    xpk_exit(set_cluster_command_code)
+    if set_cluster_command_code != 0:
+      xpk_exit(set_cluster_command_code)

   return_code = run_slurm_job_delete_command(args)
   xpk_exit(return_code)

--- /dev/null
+++ xpk-0.8.0/src/xpk/commands/kjob_common.py
@@ -0,0 +1,47 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from ..core.capacity import (
+    B200_DEVICE_TYPE,
+    H100_MEGA_DEVICE_TYPE,
+    H200_DEVICE_TYPE,
+)
+from ..core.cluster import get_gpu_type_from_cluster
+from ..core.kjob import (
+    get_a3mega_pod_template_annotations,
+    get_a3ultra_pod_template_annotations,
+    get_a4_pod_template_annotations,
+)
+
+
+def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
+  gpu_type = get_gpu_type_from_cluster(args)
+
+  if gpu_type == H100_MEGA_DEVICE_TYPE:
+    annotations = get_a3mega_pod_template_annotations(args)
+  elif gpu_type == H200_DEVICE_TYPE:
+    annotations = get_a3ultra_pod_template_annotations(args)
+  elif gpu_type == B200_DEVICE_TYPE:
+    annotations = get_a4_pod_template_annotations()
+  else:
+    annotations = []
+
+  flags = [
+      f" --pod-template-annotation {annotation} " for annotation in annotations
+  ]
+  cmd += "\\\n".join(flags)
+
+  return cmd
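
A hypothetical call site for the new helper (the Namespace fields and command string are illustrative, and the call needs cluster credentials in place, since get_gpu_type_from_cluster inspects the cluster's node pools):

```python
# Illustrative usage of the new module; not taken from the xpk sources.
from argparse import Namespace

from xpk.commands.kjob_common import add_gpu_networking_annotations_to_command

args = Namespace(project='my-project', zone='us-central1-a', cluster='my-cluster')
cmd = 'kjobctl create slurm --profile my-app-profile'
# On an A3-Mega (H100_MEGA_DEVICE_TYPE) cluster this appends the A3-Mega
# pod-template annotations; if no GPU type matches, the flags list is empty
# and cmd is returned unchanged.
cmd = add_gpu_networking_annotations_to_command(args, cmd)
```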

--- xpk-0.7.2/src/xpk/commands/run.py
+++ xpk-0.8.0/src/xpk/commands/run.py
@@ -16,15 +16,23 @@ limitations under the License.

 from argparse import Namespace

-from ..core.cluster import create_xpk_k8s_service_account
+from ..core.cluster import (
+    create_xpk_k8s_service_account,
+    get_cluster_credentials,
+)
 from ..core.commands import run_command_with_full_controls
 from ..core.gcloud_context import add_zone_and_project
+from ..core.kjob import (
+    AppProfileDefaults,
+    JobTemplateDefaults,
+    Kueue_TAS_annotation,
+    get_storage_annotations,
+    prepare_kjob,
+)
 from ..core.kueue import LOCAL_QUEUE_NAME
 from ..utils.console import xpk_exit, xpk_print
-from .common import set_cluster_command
-from ..core.kjob import JobTemplateDefaults, AppProfileDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
-from .kjob_common import add_gpu_networking_annotations_to_command
 from .kind import set_local_cluster_command
+from .kjob_common import add_gpu_networking_annotations_to_command


 def run(args: Namespace) -> None:
@@ -37,12 +45,11 @@ def run(args: Namespace) -> None:
   """
   if not args.kind_cluster:
     add_zone_and_project(args)
-    set_cluster_command_code = set_cluster_command(args)
+    get_cluster_credentials(args)
   else:
     set_cluster_command_code = set_local_cluster_command(args)
-
-  if set_cluster_command_code != 0:
-    xpk_exit(set_cluster_command_code)
+    if set_cluster_command_code != 0:
+      xpk_exit(set_cluster_command_code)

   err_code = prepare_kjob(args)
   if err_code > 0:
@@ -64,9 +71,8 @@ def submit_job(args: Namespace) -> None:
   )
   cmd = add_gpu_networking_annotations_to_command(args, cmd)

-  gcsfuse_annotation = get_gcsfuse_annotation(args)
-  if gcsfuse_annotation is not None:
-    cmd += f' --pod-template-annotation {gcsfuse_annotation}'
+  for annotation in get_storage_annotations(args):
+    cmd += f' --pod-template-annotation {annotation}'

   if args.timeout:
     cmd += f' --wait-timeout {args.timeout}s'

--- xpk-0.7.2/src/xpk/commands/shell.py
+++ xpk-0.8.0/src/xpk/commands/shell.py
@@ -20,7 +20,7 @@ from ..core.kjob import (
     AppProfileDefaults,
     prepare_kjob,
     get_pod_template_interactive_command,
-    get_gcsfuse_annotation,
+    get_storage_annotations,
 )

 exit_instructions = 'To exit the shell input "exit".'
@@ -89,9 +89,8 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
       f' {AppProfileDefaults.NAME.value} --pod-running-timeout 180s'
   )

-  gcsfuse_annotation = get_gcsfuse_annotation(args)
-  if gcsfuse_annotation is not None:
-    cmd += f' --pod-template-annotation {gcsfuse_annotation}'
+  for annotation in get_storage_annotations(args):
+    cmd += f' --pod-template-annotation {annotation}'
   return run_command_with_full_controls(
       command=cmd,