xpk 0.7.2__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.7.2/src/xpk.egg-info → xpk-0.8.0}/PKG-INFO +60 -4
- {xpk-0.7.2 → xpk-0.8.0}/README.md +59 -3
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/batch.py +19 -12
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/cluster.py +33 -16
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/cluster_gcluster.py +22 -5
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/info.py +2 -4
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/job.py +7 -8
- xpk-0.8.0/src/xpk/commands/kjob_common.py +47 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/run.py +17 -11
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/shell.py +3 -4
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/storage.py +64 -19
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/workload.py +154 -319
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/blueprint/blueprint_definitions.py +2 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/blueprint/blueprint_generator.py +322 -32
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/capacity.py +1 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/cluster.py +75 -5
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/config.py +3 -1
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/docker_manager.py +1 -1
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/docker_resources.py +9 -21
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/filestore.py +11 -3
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/gcsfuse.py +8 -5
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/kjob.py +57 -18
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/nap.py +4 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/network.py +11 -21
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/nodepool.py +28 -26
- xpk-0.8.0/src/xpk/core/pathways.py +332 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/scheduling.py +36 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/storage.py +66 -12
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/system_characteristics.py +9 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/workload.py +27 -82
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/workload_decorators/rdma_decorator.py +3 -3
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/workload_decorators/storage_decorator.py +8 -3
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +2 -2
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/cluster.py +15 -6
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/storage.py +14 -3
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/workload.py +59 -31
- {xpk-0.7.2 → xpk-0.8.0/src/xpk.egg-info}/PKG-INFO +60 -4
- xpk-0.7.2/src/xpk/commands/kjob_common.py +0 -44
- xpk-0.7.2/src/xpk/core/pathways.py +0 -377
- {xpk-0.7.2 → xpk-0.8.0}/LICENSE +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/pyproject.toml +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/setup.cfg +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/api/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/common.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/config.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/inspector.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/kind.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/commands/version.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/cluster_private.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/commands.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/docker_image.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/kueue.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/monitoring.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/ray.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/resources.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/vertex.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/main.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/batch.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/common.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/config.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/core.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/info.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/job.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/kind.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/run.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/shell.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/validators.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/parser/version.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/console.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/file.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/kubectl.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/network.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/objects.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/templates.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/validation.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk.egg-info/SOURCES.txt +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk.egg-info/requires.txt +0 -0
- {xpk-0.7.2 → xpk-0.8.0}/src/xpk.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 0.7.2
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
5
|
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -88,9 +88,11 @@ and the following GPU types:
|
|
|
88
88
|
and the following CPU types:
|
|
89
89
|
* n2-standard-32
|
|
90
90
|
|
|
91
|
-
xpk also supports Google Cloud Storage solutions:
|
|
91
|
+
xpk also supports [Google Cloud Storage solutions](#storage):
|
|
92
92
|
* [Cloud Storage FUSE](#fuse)
|
|
93
93
|
* [Filestore](#filestore)
|
|
94
|
+
* [Parallelstore](#parallelstore)
|
|
95
|
+
* [Block storage (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
|
|
94
96
|
|
|
95
97
|
# Permissions needed on Cloud Console:
|
|
96
98
|
|
|
@@ -253,6 +255,7 @@ all zones.
|
|
|
253
255
|
--num-slices=4 --on-demand \
|
|
254
256
|
--tpu-type=v5litepod-16
|
|
255
257
|
```
|
|
258
|
+
Note that Pathways clusters need a CPU nodepool of n2-standard-64 or higher.
|
|
256
259
|
|
|
257
260
|
* Cluster Create for Ray:
|
|
258
261
|
A cluster with KubeRay enabled and a RayCluster can be created using `cluster create-ray`.
|
|
@@ -475,7 +478,11 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
|
|
|
475
478
|
|
|
476
479
|
|
|
477
480
|
## Storage
|
|
478
|
-
Currently XPK supports
|
|
481
|
+
Currently XPK supports the below types of storages:
|
|
482
|
+
- [Cloud Storage FUSE](#fuse)
|
|
483
|
+
- [Google Cloud Filestore](#filestore)
|
|
484
|
+
- [Google Cloud Parallelstore](#parallelstore)
|
|
485
|
+
- [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
|
|
479
486
|
|
|
480
487
|
### FUSE
|
|
481
488
|
A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
|
|
@@ -499,11 +506,12 @@ Parameters:
|
|
|
499
506
|
- `--readonly` - if set to true, workload can only read from storage.
|
|
500
507
|
- `--size` - size of the storage in Gb.
|
|
501
508
|
- `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
|
|
509
|
+
- `--mount-options` - comma-separated list of additional mount options for PersistentVolume ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#mount-options)).
|
|
502
510
|
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions. If set, then values from manifest override the following parameters: `--size` and `--bucket`.
|
|
503
511
|
|
|
504
512
|
### Filestore
|
|
505
513
|
|
|
506
|
-
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write
|
|
514
|
+
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
|
|
507
515
|
|
|
508
516
|
To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
|
|
509
517
|
|
|
@@ -537,6 +545,54 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
|
|
|
537
545
|
- `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
|
|
538
546
|
- `--manifest` - path to the manifest file containing PersistentVolume, PersistentVolumeClaim and StorageClass definitions. If set, then values from manifest override the following parameters: `--access-mode`, `--size` and `--volume`.
|
|
539
547
|
|
|
548
|
+
### Parallelstore
|
|
549
|
+
|
|
550
|
+
A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
|
|
551
|
+
|
|
552
|
+
To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
|
|
553
|
+
|
|
554
|
+
Once it's ready you can use the `xpk storage attach` command with `--type=parallelstore` to attach a Parallelstore instance to your cluster. Currently, attaching a Parallelstore instance is supported only by providing a manifest file.
|
|
555
|
+
|
|
556
|
+
```shell
|
|
557
|
+
python3 xpk.py storage attach test-parallelstore-storage --type=parallelstore \
|
|
558
|
+
--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
|
|
559
|
+
--mount-point='/test-mount-point' --readonly=false \
|
|
560
|
+
--auto-mount=true \
|
|
561
|
+
--manifest='./examples/storage/parallelstore-manifest-attach.yaml'
|
|
562
|
+
```
|
|
563
|
+
|
|
564
|
+
Parameters:
|
|
565
|
+
|
|
566
|
+
- `--type` - type of the storage `parallelstore`
|
|
567
|
+
- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
|
|
568
|
+
- `--mount-point` - the path on which this storage should be mounted for a workload.
|
|
569
|
+
- `--readonly` - if set to true, workload can only read from storage.
|
|
570
|
+
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
|
|
571
|
+
|
|
572
|
+
### Block storage (Persistent Disk, Hyperdisk)
|
|
573
|
+
|
|
574
|
+
A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
|
|
575
|
+
|
|
576
|
+
To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please make sure that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
|
|
577
|
+
|
|
578
|
+
Once it's ready you can use the `xpk storage attach` command with `--type=pd` to attach a PersistentDisk instance to your cluster. Currently, attaching a PersistentDisk instance is supported only by providing a manifest file.
|
|
579
|
+
|
|
580
|
+
```shell
|
|
581
|
+
python3 xpk.py storage attach test-pd-storage --type=pd \
|
|
582
|
+
--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
|
|
583
|
+
--mount-point='/test-mount-point' --readonly=false \
|
|
584
|
+
--auto-mount=true \
|
|
585
|
+
--manifest='./examples/storage/pd-manifest-attach.yaml'
|
|
586
|
+
```
|
|
587
|
+
|
|
588
|
+
Parameters:
|
|
589
|
+
|
|
590
|
+
- `--type` - type of the storage `pd`
|
|
591
|
+
- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
|
|
592
|
+
- `--mount-point` - the path on which this storage should be mounted for a workload.
|
|
593
|
+
- `--readonly` - if set to true, workload can only read from storage.
|
|
594
|
+
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
|
|
595
|
+
|
|
540
596
|
### List attached storages
|
|
541
597
|
|
|
542
598
|
```shell
|
|
@@ -56,9 +56,11 @@ and the following GPU types:
|
|
|
56
56
|
and the following CPU types:
|
|
57
57
|
* n2-standard-32
|
|
58
58
|
|
|
59
|
-
xpk also supports Google Cloud Storage solutions:
|
|
59
|
+
xpk also supports [Google Cloud Storage solutions](#storage):
|
|
60
60
|
* [Cloud Storage FUSE](#fuse)
|
|
61
61
|
* [Filestore](#filestore)
|
|
62
|
+
* [Parallelstore](#parallelstore)
|
|
63
|
+
* [Block storage (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
|
|
62
64
|
|
|
63
65
|
# Permissions needed on Cloud Console:
|
|
64
66
|
|
|
@@ -221,6 +223,7 @@ all zones.
|
|
|
221
223
|
--num-slices=4 --on-demand \
|
|
222
224
|
--tpu-type=v5litepod-16
|
|
223
225
|
```
|
|
226
|
+
Note that Pathways clusters need a CPU nodepool of n2-standard-64 or higher.
|
|
224
227
|
|
|
225
228
|
* Cluster Create for Ray:
|
|
226
229
|
A cluster with KubeRay enabled and a RayCluster can be created using `cluster create-ray`.
|
|
@@ -443,7 +446,11 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
|
|
|
443
446
|
|
|
444
447
|
|
|
445
448
|
## Storage
|
|
446
|
-
Currently XPK supports
|
|
449
|
+
Currently XPK supports the below types of storages:
|
|
450
|
+
- [Cloud Storage FUSE](#fuse)
|
|
451
|
+
- [Google Cloud Filestore](#filestore)
|
|
452
|
+
- [Google Cloud Parallelstore](#parallelstore)
|
|
453
|
+
- [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
|
|
447
454
|
|
|
448
455
|
### FUSE
|
|
449
456
|
A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
|
|
@@ -467,11 +474,12 @@ Parameters:
|
|
|
467
474
|
- `--readonly` - if set to true, workload can only read from storage.
|
|
468
475
|
- `--size` - size of the storage in Gb.
|
|
469
476
|
- `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
|
|
477
|
+
- `--mount-options` - comma-separated list of additional mount options for PersistentVolume ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#mount-options)).
|
|
470
478
|
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions. If set, then values from manifest override the following parameters: `--size` and `--bucket`.
|
|
471
479
|
|
|
472
480
|
### Filestore
|
|
473
481
|
|
|
474
|
-
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write
|
|
482
|
+
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
|
|
475
483
|
|
|
476
484
|
To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
|
|
477
485
|
|
|
@@ -505,6 +513,54 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
|
|
|
505
513
|
- `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
|
|
506
514
|
- `--manifest` - path to the manifest file containing PersistentVolume, PersistentVolumeClaim and StorageClass definitions. If set, then values from manifest override the following parameters: `--access-mode`, `--size` and `--volume`.
|
|
507
515
|
|
|
516
|
+
### Parallelstore
|
|
517
|
+
|
|
518
|
+
A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
|
|
519
|
+
|
|
520
|
+
To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
|
|
521
|
+
|
|
522
|
+
Once it's ready you can use the `xpk storage attach` command with `--type=parallelstore` to attach a Parallelstore instance to your cluster. Currently, attaching a Parallelstore instance is supported only by providing a manifest file.
|
|
523
|
+
|
|
524
|
+
```shell
|
|
525
|
+
python3 xpk.py storage attach test-parallelstore-storage --type=parallelstore \
|
|
526
|
+
--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
|
|
527
|
+
--mount-point='/test-mount-point' --readonly=false \
|
|
528
|
+
--auto-mount=true \
|
|
529
|
+
--manifest='./examples/storage/parallelstore-manifest-attach.yaml'
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
Parameters:
|
|
533
|
+
|
|
534
|
+
- `--type` - type of the storage `parallelstore`
|
|
535
|
+
- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
|
|
536
|
+
- `--mount-point` - the path on which this storage should be mounted for a workload.
|
|
537
|
+
- `--readonly` - if set to true, workload can only read from storage.
|
|
538
|
+
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
|
|
539
|
+
|
|
540
|
+
### Block storage (Persistent Disk, Hyperdisk)
|
|
541
|
+
|
|
542
|
+
A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
|
|
543
|
+
|
|
544
|
+
To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please make sure that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
|
|
545
|
+
|
|
546
|
+
Once it's ready you can use the `xpk storage attach` command with `--type=pd` to attach a PersistentDisk instance to your cluster. Currently, attaching a PersistentDisk instance is supported only by providing a manifest file.
|
|
547
|
+
|
|
548
|
+
```shell
|
|
549
|
+
python3 xpk.py storage attach test-pd-storage --type=pd \
|
|
550
|
+
--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
|
|
551
|
+
--mount-point='/test-mount-point' --readonly=false \
|
|
552
|
+
--auto-mount=true \
|
|
553
|
+
--manifest='./examples/storage/pd-manifest-attach.yaml'
|
|
554
|
+
```
|
|
555
|
+
|
|
556
|
+
Parameters:
|
|
557
|
+
|
|
558
|
+
- `--type` - type of the storage `pd`
|
|
559
|
+
- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
|
|
560
|
+
- `--mount-point` - the path on which this storage should be mounted for a workload.
|
|
561
|
+
- `--readonly` - if set to true, workload can only read from storage.
|
|
562
|
+
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
|
|
563
|
+
|
|
508
564
|
### List attached storages
|
|
509
565
|
|
|
510
566
|
```shell
|
|
@@ -14,18 +14,26 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import re
|
|
17
18
|
from argparse import Namespace
|
|
18
19
|
|
|
19
|
-
from ..core.cluster import
|
|
20
|
+
from ..core.cluster import (
|
|
21
|
+
create_xpk_k8s_service_account,
|
|
22
|
+
get_cluster_credentials,
|
|
23
|
+
)
|
|
20
24
|
from ..core.commands import run_command_for_value
|
|
21
25
|
from ..core.gcloud_context import add_zone_and_project
|
|
26
|
+
from ..core.kjob import (
|
|
27
|
+
AppProfileDefaults,
|
|
28
|
+
JobTemplateDefaults,
|
|
29
|
+
Kueue_TAS_annotation,
|
|
30
|
+
get_storage_annotations,
|
|
31
|
+
prepare_kjob,
|
|
32
|
+
)
|
|
22
33
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
23
34
|
from ..utils.console import xpk_exit, xpk_print
|
|
24
|
-
from .common import set_cluster_command
|
|
25
|
-
from ..core.kjob import AppProfileDefaults, JobTemplateDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
|
|
26
|
-
from .kjob_common import add_gpu_networking_annotations_to_command
|
|
27
35
|
from .kind import set_local_cluster_command
|
|
28
|
-
import re
|
|
36
|
+
from .kjob_common import add_gpu_networking_annotations_to_command
|
|
29
37
|
|
|
30
38
|
|
|
31
39
|
def batch(args: Namespace) -> None:
|
|
@@ -38,12 +46,11 @@ def batch(args: Namespace) -> None:
|
|
|
38
46
|
"""
|
|
39
47
|
if not args.kind_cluster:
|
|
40
48
|
add_zone_and_project(args)
|
|
41
|
-
|
|
49
|
+
get_cluster_credentials(args)
|
|
42
50
|
else:
|
|
43
51
|
set_cluster_command_code = set_local_cluster_command(args)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
xpk_exit(set_cluster_command_code)
|
|
52
|
+
if set_cluster_command_code != 0:
|
|
53
|
+
xpk_exit(set_cluster_command_code)
|
|
47
54
|
|
|
48
55
|
err_code = prepare_kjob(args)
|
|
49
56
|
if err_code > 0:
|
|
@@ -66,9 +73,9 @@ def submit_job(args: Namespace) -> None:
|
|
|
66
73
|
' --first-node-ip'
|
|
67
74
|
)
|
|
68
75
|
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
cmd += f' --pod-template-annotation {
|
|
76
|
+
|
|
77
|
+
for annotation in get_storage_annotations(args):
|
|
78
|
+
cmd += f' --pod-template-annotation {annotation}'
|
|
72
79
|
|
|
73
80
|
if args.ignore_unknown_flags:
|
|
74
81
|
cmd += ' --ignore-unknown-flags'
|
|
@@ -22,9 +22,13 @@ from ..core.cluster import (
|
|
|
22
22
|
get_cluster_credentials,
|
|
23
23
|
install_nccl_on_cluster,
|
|
24
24
|
set_jobset_on_cluster,
|
|
25
|
+
set_pathways_job_on_cluster,
|
|
25
26
|
setup_k8s_env,
|
|
26
27
|
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
27
28
|
update_cluster_with_workload_identity_if_necessary,
|
|
29
|
+
update_cluster_with_gcpfilestore_driver_if_necessary,
|
|
30
|
+
update_cluster_with_parallelstore_driver_if_necessary,
|
|
31
|
+
update_cluster_with_pd_driver_if_necessary,
|
|
28
32
|
)
|
|
29
33
|
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
|
|
30
34
|
from ..core.commands import run_command_for_value, run_command_with_updates
|
|
@@ -46,7 +50,7 @@ from ..core.nap import enable_autoprovisioning_on_cluster
|
|
|
46
50
|
from ..core.network import (
|
|
47
51
|
create_cluster_network_config,
|
|
48
52
|
delete_cluster_subnets,
|
|
49
|
-
|
|
53
|
+
set_up_cluster_network_for_a3,
|
|
50
54
|
)
|
|
51
55
|
from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command
|
|
52
56
|
from ..core.ray import install_ray_cluster
|
|
@@ -64,7 +68,6 @@ from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
|
64
68
|
from ..utils.file import write_tmp_file
|
|
65
69
|
from . import cluster_gcluster
|
|
66
70
|
from .common import set_cluster_command
|
|
67
|
-
from ..core.cluster import update_cluster_with_gcpfilestore_driver_if_necessary
|
|
68
71
|
|
|
69
72
|
|
|
70
73
|
def cluster_create(args) -> None:
|
|
@@ -117,11 +120,7 @@ def cluster_create(args) -> None:
|
|
|
117
120
|
|
|
118
121
|
# ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
|
|
119
122
|
# Enable WorkloadIdentity if not enabled already.
|
|
120
|
-
if (
|
|
121
|
-
args.enable_workload_identity
|
|
122
|
-
or args.enable_gcsfuse_csi_driver
|
|
123
|
-
or args.enable_gcpfilestore_csi_driver
|
|
124
|
-
):
|
|
123
|
+
if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
|
|
125
124
|
update_cluster_command_code = (
|
|
126
125
|
update_cluster_with_workload_identity_if_necessary(args)
|
|
127
126
|
)
|
|
@@ -143,6 +142,20 @@ def cluster_create(args) -> None:
|
|
|
143
142
|
if update_cluster_command_code != 0:
|
|
144
143
|
xpk_exit(update_cluster_command_code)
|
|
145
144
|
|
|
145
|
+
if args.enable_parallelstore_csi_driver:
|
|
146
|
+
update_cluster_command_code = (
|
|
147
|
+
update_cluster_with_parallelstore_driver_if_necessary(args)
|
|
148
|
+
)
|
|
149
|
+
if update_cluster_command_code != 0:
|
|
150
|
+
xpk_exit(update_cluster_command_code)
|
|
151
|
+
|
|
152
|
+
if args.enable_pd_csi_driver:
|
|
153
|
+
update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
|
|
154
|
+
args
|
|
155
|
+
)
|
|
156
|
+
if update_cluster_command_code != 0:
|
|
157
|
+
xpk_exit(update_cluster_command_code)
|
|
158
|
+
|
|
146
159
|
# Update Pathways clusters with CloudDNS if not enabled already.
|
|
147
160
|
|
|
148
161
|
get_cluster_credentials(args)
|
|
@@ -155,13 +168,12 @@ def cluster_create(args) -> None:
|
|
|
155
168
|
if not tensorboard_config:
|
|
156
169
|
xpk_exit(1)
|
|
157
170
|
|
|
158
|
-
if system.
|
|
171
|
+
if system.device_type == H100_DEVICE_TYPE:
|
|
159
172
|
xpk_print('Setting up Network for cluster')
|
|
160
|
-
set_up_cluster_network_code =
|
|
173
|
+
set_up_cluster_network_code = set_up_cluster_network_for_a3(args)
|
|
161
174
|
if set_up_cluster_network_code != 0:
|
|
162
175
|
xpk_exit(set_up_cluster_network_code)
|
|
163
176
|
|
|
164
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
165
177
|
xpk_print('Creating Network Config for cluster')
|
|
166
178
|
create_cluster_network_config_code = create_cluster_network_config(args)
|
|
167
179
|
if create_cluster_network_config_code != 0:
|
|
@@ -207,6 +219,10 @@ def cluster_create(args) -> None:
|
|
|
207
219
|
if set_jobset_on_cluster_code != 0:
|
|
208
220
|
xpk_exit(set_jobset_on_cluster_code)
|
|
209
221
|
|
|
222
|
+
set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
|
|
223
|
+
if set_pathways_job_on_cluster_code != 0:
|
|
224
|
+
xpk_exit(set_pathways_job_on_cluster_code)
|
|
225
|
+
|
|
210
226
|
xpk_print('Enabling Kueue on the cluster')
|
|
211
227
|
install_kueue_on_cluster_code = install_kueue_on_cluster(args)
|
|
212
228
|
if install_kueue_on_cluster_code != 0:
|
|
@@ -783,20 +799,21 @@ def run_gke_cluster_create_command(
|
|
|
783
799
|
if args.enable_ray_cluster:
|
|
784
800
|
command += ' --addons RayOperator'
|
|
785
801
|
|
|
786
|
-
if (
|
|
787
|
-
args.enable_workload_identity
|
|
788
|
-
or args.enable_gcsfuse_csi_driver
|
|
789
|
-
or args.enable_gcpfilestore_csi_driver
|
|
790
|
-
):
|
|
802
|
+
if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
|
|
791
803
|
command += f' --workload-pool={args.project}.svc.id.goog'
|
|
792
804
|
|
|
793
805
|
addons = []
|
|
794
806
|
if args.enable_gcsfuse_csi_driver:
|
|
795
807
|
addons.append('GcsFuseCsiDriver')
|
|
796
|
-
|
|
797
808
|
if args.enable_gcpfilestore_csi_driver:
|
|
798
809
|
addons.append('GcpFilestoreCsiDriver')
|
|
799
810
|
|
|
811
|
+
if args.enable_parallelstore_csi_driver:
|
|
812
|
+
addons.append('ParallelstoreCsiDriver')
|
|
813
|
+
|
|
814
|
+
if args.enable_pd_csi_driver:
|
|
815
|
+
addons.append('GcePersistentDiskCsiDriver')
|
|
816
|
+
|
|
800
817
|
if len(addons) > 0:
|
|
801
818
|
addons_str = ','.join(addons)
|
|
802
819
|
command += f' --addons={addons_str}'
|
|
@@ -16,26 +16,27 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import os
|
|
18
18
|
|
|
19
|
-
from ..core.remote_state.remote_state_client import RemoteStateClient
|
|
20
|
-
from ..core.remote_state.fuse_remote_state import FuseStateClient
|
|
21
19
|
from ..core.blueprint.blueprint_generator import (
|
|
22
20
|
BlueprintGenerator,
|
|
23
21
|
BlueprintGeneratorOutput,
|
|
24
22
|
a3mega_device_type,
|
|
25
23
|
a3ultra_device_type,
|
|
24
|
+
a4_device_type,
|
|
26
25
|
supported_device_types,
|
|
27
26
|
)
|
|
28
|
-
from ..core.commands import run_command_for_value
|
|
29
27
|
from ..core.capacity import get_capacity_type
|
|
28
|
+
from ..core.cluster import get_cluster_credentials
|
|
29
|
+
from ..core.commands import run_command_for_value
|
|
30
30
|
from ..core.docker_manager import DockerManager
|
|
31
31
|
from ..core.gcloud_context import zone_to_region
|
|
32
32
|
from ..core.gcluster_manager import GclusterManager
|
|
33
|
+
from ..core.kjob import apply_kjob_crds, prepare_kjob
|
|
34
|
+
from ..core.remote_state.fuse_remote_state import FuseStateClient
|
|
35
|
+
from ..core.remote_state.remote_state_client import RemoteStateClient
|
|
33
36
|
from ..utils.console import xpk_exit, xpk_print
|
|
34
37
|
from ..utils.file import ensure_directory_exists
|
|
35
38
|
from ..utils.network import all_IPs_cidr
|
|
36
39
|
from ..utils.objects import hash_string
|
|
37
|
-
from ..core.cluster import get_cluster_credentials
|
|
38
|
-
from ..core.kjob import apply_kjob_crds, prepare_kjob
|
|
39
40
|
|
|
40
41
|
blueprints_path = os.path.abspath('xpkclusters/blueprints')
|
|
41
42
|
gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
|
|
@@ -266,4 +267,20 @@ def generate_blueprint(
|
|
|
266
267
|
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
|
|
267
268
|
gcs_bucket=args.cluster_state_gcs_bucket,
|
|
268
269
|
)
|
|
270
|
+
if args.device_type == a4_device_type:
|
|
271
|
+
num_nodes = args.num_nodes if not args.num_nodes is None else 2
|
|
272
|
+
return bpg.generate_a4_blueprint(
|
|
273
|
+
blueprint_name=blueprint_name,
|
|
274
|
+
prefix=prefix,
|
|
275
|
+
cluster_name=args.cluster,
|
|
276
|
+
region=zone_to_region(args.zone),
|
|
277
|
+
project_id=args.project,
|
|
278
|
+
zone=args.zone,
|
|
279
|
+
auth_cidr=all_IPs_cidr,
|
|
280
|
+
num_nodes=num_nodes,
|
|
281
|
+
reservation=args.reservation if args.reservation else None,
|
|
282
|
+
capacity_type=capacity_type,
|
|
283
|
+
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
|
|
284
|
+
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
|
|
285
|
+
)
|
|
269
286
|
return None
|
|
@@ -20,10 +20,10 @@ from argparse import Namespace
|
|
|
20
20
|
from tabulate import tabulate
|
|
21
21
|
|
|
22
22
|
from ..core.commands import run_command_for_value
|
|
23
|
+
from ..core.cluster import get_cluster_credentials
|
|
23
24
|
from ..core.gcloud_context import add_zone_and_project
|
|
24
25
|
from ..core.kueue import verify_kueuectl
|
|
25
26
|
from ..utils.console import xpk_exit, xpk_print
|
|
26
|
-
from .common import set_cluster_command
|
|
27
27
|
|
|
28
28
|
table_fmt = 'plain'
|
|
29
29
|
|
|
@@ -37,9 +37,7 @@ def info(args: Namespace) -> None:
|
|
|
37
37
|
None
|
|
38
38
|
"""
|
|
39
39
|
add_zone_and_project(args)
|
|
40
|
-
|
|
41
|
-
if set_cluster_command_code != 0:
|
|
42
|
-
xpk_exit(set_cluster_command_code)
|
|
40
|
+
get_cluster_credentials(args)
|
|
43
41
|
|
|
44
42
|
verify_kueuectl(args)
|
|
45
43
|
lq, cq = bool(args.localqueue), bool(args.clusterqueue)
|
|
@@ -20,10 +20,10 @@ import sys
|
|
|
20
20
|
from ruamel.yaml import YAML
|
|
21
21
|
|
|
22
22
|
from ..core.commands import run_command_for_value, run_command_with_updates
|
|
23
|
+
from ..core.cluster import get_cluster_credentials
|
|
23
24
|
from ..core.gcloud_context import add_zone_and_project
|
|
24
25
|
from ..core.kjob import AppProfileDefaults
|
|
25
26
|
from ..utils.console import xpk_exit, xpk_print
|
|
26
|
-
from .common import set_cluster_command
|
|
27
27
|
from .kind import set_local_cluster_command
|
|
28
28
|
|
|
29
29
|
|
|
@@ -143,14 +143,14 @@ def job_list(args) -> None:
|
|
|
143
143
|
"""
|
|
144
144
|
if not args.kind_cluster:
|
|
145
145
|
add_zone_and_project(args)
|
|
146
|
-
|
|
146
|
+
get_cluster_credentials(args)
|
|
147
147
|
msg = f'Listing jobs for project {args.project} and zone {args.zone}:'
|
|
148
148
|
else:
|
|
149
149
|
set_cluster_command_code = set_local_cluster_command(args)
|
|
150
150
|
msg = 'Listing jobs:'
|
|
151
|
+
if set_cluster_command_code != 0:
|
|
152
|
+
xpk_exit(set_cluster_command_code)
|
|
151
153
|
|
|
152
|
-
if set_cluster_command_code != 0:
|
|
153
|
-
xpk_exit(set_cluster_command_code)
|
|
154
154
|
xpk_print(msg, flush=True)
|
|
155
155
|
|
|
156
156
|
return_code = run_slurm_job_list_command(args)
|
|
@@ -178,12 +178,11 @@ def job_cancel(args) -> None:
|
|
|
178
178
|
xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
|
|
179
179
|
if not args.kind_cluster:
|
|
180
180
|
add_zone_and_project(args)
|
|
181
|
-
|
|
181
|
+
get_cluster_credentials(args)
|
|
182
182
|
else:
|
|
183
183
|
set_cluster_command_code = set_local_cluster_command(args)
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
xpk_exit(set_cluster_command_code)
|
|
184
|
+
if set_cluster_command_code != 0:
|
|
185
|
+
xpk_exit(set_cluster_command_code)
|
|
187
186
|
|
|
188
187
|
return_code = run_slurm_job_delete_command(args)
|
|
189
188
|
xpk_exit(return_code)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..core.capacity import (
|
|
18
|
+
B200_DEVICE_TYPE,
|
|
19
|
+
H100_MEGA_DEVICE_TYPE,
|
|
20
|
+
H200_DEVICE_TYPE,
|
|
21
|
+
)
|
|
22
|
+
from ..core.cluster import get_gpu_type_from_cluster
|
|
23
|
+
from ..core.kjob import (
|
|
24
|
+
get_a3mega_pod_template_annotations,
|
|
25
|
+
get_a3ultra_pod_template_annotations,
|
|
26
|
+
get_a4_pod_template_annotations,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
  """Appends GPU-specific pod template annotation flags to a command.

  The GPU type returned by get_gpu_type_from_cluster(args) selects which
  annotation helper is called. Every annotation it yields becomes one
  `--pod-template-annotation` flag; the flags are joined with a backslash
  followed by a newline and appended to cmd.

  Args:
    args: user provided arguments, forwarded to the cluster lookup and to the
      a3mega / a3ultra annotation helpers.
    cmd: command string to extend.

  Returns:
    cmd with the annotation flags appended. cmd is returned as-is when the GPU
    type matches none of the handled device types.
  """
  device = get_gpu_type_from_cluster(args)

  # Default for GPU types that none of the branches below recognise.
  pod_annotations = []
  if device == H100_MEGA_DEVICE_TYPE:
    pod_annotations = get_a3mega_pod_template_annotations(args)
  elif device == H200_DEVICE_TYPE:
    pod_annotations = get_a3ultra_pod_template_annotations(args)
  elif device == B200_DEVICE_TYPE:
    # The a4 helper takes no arguments.
    pod_annotations = get_a4_pod_template_annotations()

  # Backslash + newline between consecutive flags.
  separator = "\\\n"
  joined_flags = separator.join(
      f" --pod-template-annotation {item} " for item in pod_annotations
  )
  return cmd + joined_flags
|
|
@@ -16,15 +16,23 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from argparse import Namespace
|
|
18
18
|
|
|
19
|
-
from ..core.cluster import
|
|
19
|
+
from ..core.cluster import (
|
|
20
|
+
create_xpk_k8s_service_account,
|
|
21
|
+
get_cluster_credentials,
|
|
22
|
+
)
|
|
20
23
|
from ..core.commands import run_command_with_full_controls
|
|
21
24
|
from ..core.gcloud_context import add_zone_and_project
|
|
25
|
+
from ..core.kjob import (
|
|
26
|
+
AppProfileDefaults,
|
|
27
|
+
JobTemplateDefaults,
|
|
28
|
+
Kueue_TAS_annotation,
|
|
29
|
+
get_storage_annotations,
|
|
30
|
+
prepare_kjob,
|
|
31
|
+
)
|
|
22
32
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
23
33
|
from ..utils.console import xpk_exit, xpk_print
|
|
24
|
-
from .common import set_cluster_command
|
|
25
|
-
from ..core.kjob import JobTemplateDefaults, AppProfileDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
|
|
26
|
-
from .kjob_common import add_gpu_networking_annotations_to_command
|
|
27
34
|
from .kind import set_local_cluster_command
|
|
35
|
+
from .kjob_common import add_gpu_networking_annotations_to_command
|
|
28
36
|
|
|
29
37
|
|
|
30
38
|
def run(args: Namespace) -> None:
|
|
@@ -37,12 +45,11 @@ def run(args: Namespace) -> None:
|
|
|
37
45
|
"""
|
|
38
46
|
if not args.kind_cluster:
|
|
39
47
|
add_zone_and_project(args)
|
|
40
|
-
|
|
48
|
+
get_cluster_credentials(args)
|
|
41
49
|
else:
|
|
42
50
|
set_cluster_command_code = set_local_cluster_command(args)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
xpk_exit(set_cluster_command_code)
|
|
51
|
+
if set_cluster_command_code != 0:
|
|
52
|
+
xpk_exit(set_cluster_command_code)
|
|
46
53
|
|
|
47
54
|
err_code = prepare_kjob(args)
|
|
48
55
|
if err_code > 0:
|
|
@@ -64,9 +71,8 @@ def submit_job(args: Namespace) -> None:
|
|
|
64
71
|
)
|
|
65
72
|
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
66
73
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
cmd += f' --pod-template-annotation {gcsfuse_annotation}'
|
|
74
|
+
for annotation in get_storage_annotations(args):
|
|
75
|
+
cmd += f' --pod-template-annotation {annotation}'
|
|
70
76
|
|
|
71
77
|
if args.timeout:
|
|
72
78
|
cmd += f' --wait-timeout {args.timeout}s'
|
|
@@ -20,7 +20,7 @@ from ..core.kjob import (
|
|
|
20
20
|
AppProfileDefaults,
|
|
21
21
|
prepare_kjob,
|
|
22
22
|
get_pod_template_interactive_command,
|
|
23
|
-
|
|
23
|
+
get_storage_annotations,
|
|
24
24
|
)
|
|
25
25
|
|
|
26
26
|
exit_instructions = 'To exit the shell input "exit".'
|
|
@@ -89,9 +89,8 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
|
|
|
89
89
|
f' {AppProfileDefaults.NAME.value} --pod-running-timeout 180s'
|
|
90
90
|
)
|
|
91
91
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
cmd += f' --pod-template-annotation {gcsfuse_annotation}'
|
|
92
|
+
for annotation in get_storage_annotations(args):
|
|
93
|
+
cmd += f' --pod-template-annotation {annotation}'
|
|
95
94
|
|
|
96
95
|
return run_command_with_full_controls(
|
|
97
96
|
command=cmd,
|