xpk 0.9.0__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.9.0/src/xpk.egg-info → xpk-0.10.0}/PKG-INFO +45 -6
- {xpk-0.9.0 → xpk-0.10.0}/README.md +44 -5
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/batch.py +3 -3
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/cluster.py +22 -1
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/cluster_gcluster.py +27 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/common.py +12 -5
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/kjob_common.py +4 -1
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/run.py +2 -2
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/shell.py +2 -2
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/storage.py +10 -3
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/workload.py +64 -27
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/blueprint/blueprint_generator.py +108 -40
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/capacity.py +66 -6
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/cluster.py +165 -7
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/config.py +1 -65
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/docker_manager.py +1 -1
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/docker_resources.py +145 -72
- xpk-0.10.0/src/xpk/core/jobset.py +143 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/kjob.py +2 -6
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/kueue.py +165 -5
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/nodepool.py +17 -4
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/pathways.py +1 -2
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/storage.py +1 -95
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/system_characteristics.py +1 -1
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/workload.py +0 -44
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/rdma_decorator.py +2 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +10 -4
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +7 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/cluster.py +23 -7
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/storage.py +2 -2
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/workload.py +21 -3
- {xpk-0.9.0 → xpk-0.10.0/src/xpk.egg-info}/PKG-INFO +45 -6
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk.egg-info/SOURCES.txt +1 -0
- {xpk-0.9.0 → xpk-0.10.0}/LICENSE +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/pyproject.toml +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/setup.cfg +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/__init__.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/api/__init__.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/config.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/info.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/inspector.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/job.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/kind.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/commands/version.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/__init__.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/cluster_private.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/commands.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/docker_image.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/filestore.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/gcsfuse.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/monitoring.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/mtc.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/nap.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/network.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/ray.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/resources.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/scheduling.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/vertex.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/main.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/batch.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/common.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/config.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/core.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/info.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/job.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/kind.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/run.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/shell.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/validators.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/parser/version.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/utils/console.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/utils/file.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/utils/kubectl.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/utils/network.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/utils/objects.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/utils/templates.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/utils/validation.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk.egg-info/requires.txt +0 -0
- {xpk-0.9.0 → xpk-0.10.0}/src/xpk.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
5
|
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -259,6 +259,13 @@ all zones.
|
|
|
259
259
|
--num-slices=4 --spot
|
|
260
260
|
```
|
|
261
261
|
|
|
262
|
+
* Cluster Create (DWS flex queued capacity):
|
|
263
|
+
```shell
|
|
264
|
+
python3 xpk.py cluster create \
|
|
265
|
+
--cluster xpk-test --tpu-type=v5litepod-16 \
|
|
266
|
+
--num-slices=4 --flex
|
|
267
|
+
```
|
|
268
|
+
|
|
262
269
|
* Cluster Create for Pathways:
|
|
263
270
|
Pathways compatible cluster can be created using `cluster create-pathways`.
|
|
264
271
|
```shell
|
|
@@ -495,6 +502,7 @@ Currently, the below flags/arguments are supported for A3 Mega, A3 Ultra and A4
|
|
|
495
502
|
* `--reservation`
|
|
496
503
|
* `--spot`
|
|
497
504
|
* `--on-demand` (A3 Mega only)
|
|
505
|
+
* `--flex`
|
|
498
506
|
|
|
499
507
|
## Running XPK on existing clusters
|
|
500
508
|
|
|
@@ -518,9 +526,10 @@ Currently XPK supports the below types of storages:
|
|
|
518
526
|
- [Google Cloud Filestore](#filestore)
|
|
519
527
|
- [Google Cloud Parallelstore](#parallelstore)
|
|
520
528
|
- [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
|
|
529
|
+
- [Google Cloud Managed Lustre](#managed-lustre)
|
|
521
530
|
|
|
522
531
|
### FUSE
|
|
523
|
-
A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so
|
|
532
|
+
A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so workloads can read and write objects in your bucket using standard file system semantics.
|
|
524
533
|
|
|
525
534
|
To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).
|
|
526
535
|
|
|
@@ -547,7 +556,7 @@ Parameters:
|
|
|
547
556
|
|
|
548
557
|
### Filestore
|
|
549
558
|
|
|
550
|
-
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so
|
|
559
|
+
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
|
|
551
560
|
|
|
552
561
|
To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
|
|
553
562
|
|
|
@@ -583,7 +592,7 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
|
|
|
583
592
|
|
|
584
593
|
### Parallelstore
|
|
585
594
|
|
|
586
|
-
A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so
|
|
595
|
+
A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
|
|
587
596
|
|
|
588
597
|
To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
|
|
589
598
|
|
|
@@ -607,7 +616,7 @@ Parameters:
|
|
|
607
616
|
|
|
608
617
|
### Block storage (Persistent Disk, Hyperdisk)
|
|
609
618
|
|
|
610
|
-
A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so
|
|
619
|
+
A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
|
|
611
620
|
|
|
612
621
|
To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please consider that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
|
|
613
622
|
|
|
@@ -629,6 +638,30 @@ Parameters:
|
|
|
629
638
|
- `--readonly` - if set to true, workload can only read from storage.
|
|
630
639
|
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
|
|
631
640
|
|
|
641
|
+
### Managed Lustre
|
|
642
|
+
|
|
643
|
+
A Managed Lustre adapter lets you mount and access [Google Cloud Managed Lustre instances](https://cloud.google.com/kubernetes-engine/docs/concepts/managed-lustre) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
|
|
644
|
+
|
|
645
|
+
To use GCP Managed Lustre with XPK, you need to create [an instance](https://cloud.google.com/managed-lustre/docs/create-instance). Please make sure you enable GKE support when creating the instance (for example, with the gcloud flag `--gke-support-enabled`).
|
|
646
|
+
|
|
647
|
+
Once it's ready, you can use the `xpk storage attach` command with `--type=lustre` to attach a Managed Lustre instance to your cluster. Currently, attaching a Managed Lustre instance is supported only by providing a manifest file.
|
|
648
|
+
|
|
649
|
+
```shell
|
|
650
|
+
python3 xpk.py storage attach test-lustre-storage --type=lustre \
|
|
651
|
+
--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
|
|
652
|
+
--mount-point='/test-mount-point' --readonly=false \
|
|
653
|
+
--auto-mount=true \
|
|
654
|
+
--manifest='./examples/storage/lustre-manifest-attach.yaml'
|
|
655
|
+
```
|
|
656
|
+
|
|
657
|
+
Parameters:
|
|
658
|
+
|
|
659
|
+
- `--type` - type of the storage `lustre`
|
|
660
|
+
- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
|
|
661
|
+
- `--mount-point` - the path on which this storage should be mounted for a workload.
|
|
662
|
+
- `--readonly` - if set to true, workload can only read from storage.
|
|
663
|
+
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
|
|
664
|
+
|
|
632
665
|
### List attached storages
|
|
633
666
|
|
|
634
667
|
```shell
|
|
@@ -670,8 +703,14 @@ python3 xpk.py storage delete test-fs-instance \
|
|
|
670
703
|
python3 xpk.py workload create \
|
|
671
704
|
--workload xpk-test-workload --command "echo goodbye" \
|
|
672
705
|
--cluster xpk-test \
|
|
673
|
-
--tpu-type=v5litepod-16 --
|
|
706
|
+
--tpu-type=v5litepod-16 --project=$PROJECT
|
|
674
707
|
```
|
|
708
|
+
* Workload Create (DWS flex with queued provisioning):
|
|
709
|
+
```shell
|
|
710
|
+
python3 xpk.py workload create \
|
|
711
|
+
--workload xpk-test-workload --command "echo goodbye" \
|
|
712
|
+
--cluster xpk-test --flex \
|
|
713
|
+
--tpu-type=v5litepod-16 --project=$PROJECT
|
|
675
714
|
|
|
676
715
|
* Workload Create for Pathways:
|
|
677
716
|
Pathways workload can be submitted using `workload create-pathways` on a Pathways enabled cluster (created with `cluster create-pathways`)
|
|
@@ -227,6 +227,13 @@ all zones.
|
|
|
227
227
|
--num-slices=4 --spot
|
|
228
228
|
```
|
|
229
229
|
|
|
230
|
+
* Cluster Create (DWS flex queued capacity):
|
|
231
|
+
```shell
|
|
232
|
+
python3 xpk.py cluster create \
|
|
233
|
+
--cluster xpk-test --tpu-type=v5litepod-16 \
|
|
234
|
+
--num-slices=4 --flex
|
|
235
|
+
```
|
|
236
|
+
|
|
230
237
|
* Cluster Create for Pathways:
|
|
231
238
|
Pathways compatible cluster can be created using `cluster create-pathways`.
|
|
232
239
|
```shell
|
|
@@ -463,6 +470,7 @@ Currently, the below flags/arguments are supported for A3 Mega, A3 Ultra and A4
|
|
|
463
470
|
* `--reservation`
|
|
464
471
|
* `--spot`
|
|
465
472
|
* `--on-demand` (A3 Mega only)
|
|
473
|
+
* `--flex`
|
|
466
474
|
|
|
467
475
|
## Running XPK on existing clusters
|
|
468
476
|
|
|
@@ -486,9 +494,10 @@ Currently XPK supports the below types of storages:
|
|
|
486
494
|
- [Google Cloud Filestore](#filestore)
|
|
487
495
|
- [Google Cloud Parallelstore](#parallelstore)
|
|
488
496
|
- [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
|
|
497
|
+
- [Google Cloud Managed Lustre](#managed-lustre)
|
|
489
498
|
|
|
490
499
|
### FUSE
|
|
491
|
-
A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so
|
|
500
|
+
A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so workloads can read and write objects in your bucket using standard file system semantics.
|
|
492
501
|
|
|
493
502
|
To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).
|
|
494
503
|
|
|
@@ -515,7 +524,7 @@ Parameters:
|
|
|
515
524
|
|
|
516
525
|
### Filestore
|
|
517
526
|
|
|
518
|
-
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so
|
|
527
|
+
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
|
|
519
528
|
|
|
520
529
|
To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
|
|
521
530
|
|
|
@@ -551,7 +560,7 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
|
|
|
551
560
|
|
|
552
561
|
### Parallelstore
|
|
553
562
|
|
|
554
|
-
A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so
|
|
563
|
+
A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
|
|
555
564
|
|
|
556
565
|
To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
|
|
557
566
|
|
|
@@ -575,7 +584,7 @@ Parameters:
|
|
|
575
584
|
|
|
576
585
|
### Block storage (Persistent Disk, Hyperdisk)
|
|
577
586
|
|
|
578
|
-
A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so
|
|
587
|
+
A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
|
|
579
588
|
|
|
580
589
|
To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please consider that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
|
|
581
590
|
|
|
@@ -597,6 +606,30 @@ Parameters:
|
|
|
597
606
|
- `--readonly` - if set to true, workload can only read from storage.
|
|
598
607
|
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
|
|
599
608
|
|
|
609
|
+
### Managed Lustre
|
|
610
|
+
|
|
611
|
+
A Managed Lustre adapter lets you mount and access [Google Cloud Managed Lustre instances](https://cloud.google.com/kubernetes-engine/docs/concepts/managed-lustre) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
|
|
612
|
+
|
|
613
|
+
To use GCP Managed Lustre with XPK, you need to create [an instance](https://cloud.google.com/managed-lustre/docs/create-instance). Please make sure you enable GKE support when creating the instance (for example, with the gcloud flag `--gke-support-enabled`).
|
|
614
|
+
|
|
615
|
+
Once it's ready, you can use the `xpk storage attach` command with `--type=lustre` to attach a Managed Lustre instance to your cluster. Currently, attaching a Managed Lustre instance is supported only by providing a manifest file.
|
|
616
|
+
|
|
617
|
+
```shell
|
|
618
|
+
python3 xpk.py storage attach test-lustre-storage --type=lustre \
|
|
619
|
+
--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
|
|
620
|
+
--mount-point='/test-mount-point' --readonly=false \
|
|
621
|
+
--auto-mount=true \
|
|
622
|
+
--manifest='./examples/storage/lustre-manifest-attach.yaml'
|
|
623
|
+
```
|
|
624
|
+
|
|
625
|
+
Parameters:
|
|
626
|
+
|
|
627
|
+
- `--type` - type of the storage `lustre`
|
|
628
|
+
- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
|
|
629
|
+
- `--mount-point` - the path on which this storage should be mounted for a workload.
|
|
630
|
+
- `--readonly` - if set to true, workload can only read from storage.
|
|
631
|
+
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
|
|
632
|
+
|
|
600
633
|
### List attached storages
|
|
601
634
|
|
|
602
635
|
```shell
|
|
@@ -638,8 +671,14 @@ python3 xpk.py storage delete test-fs-instance \
|
|
|
638
671
|
python3 xpk.py workload create \
|
|
639
672
|
--workload xpk-test-workload --command "echo goodbye" \
|
|
640
673
|
--cluster xpk-test \
|
|
641
|
-
--tpu-type=v5litepod-16 --
|
|
674
|
+
--tpu-type=v5litepod-16 --project=$PROJECT
|
|
642
675
|
```
|
|
676
|
+
* Workload Create (DWS flex with queued provisioning):
|
|
677
|
+
```shell
|
|
678
|
+
python3 xpk.py workload create \
|
|
679
|
+
--workload xpk-test-workload --command "echo goodbye" \
|
|
680
|
+
--cluster xpk-test --flex \
|
|
681
|
+
--tpu-type=v5litepod-16 --project=$PROJECT
|
|
643
682
|
|
|
644
683
|
* Workload Create for Pathways:
|
|
645
684
|
Pathways workload can be submitted using `workload create-pathways` on a Pathways enabled cluster (created with `cluster create-pathways`)
|
|
@@ -18,7 +18,7 @@ import re
|
|
|
18
18
|
from argparse import Namespace
|
|
19
19
|
|
|
20
20
|
from ..core.cluster import (
|
|
21
|
-
|
|
21
|
+
setup_k8s_service_accounts,
|
|
22
22
|
get_cluster_credentials,
|
|
23
23
|
)
|
|
24
24
|
from ..core.commands import run_command_for_value
|
|
@@ -54,14 +54,14 @@ def batch(args: Namespace) -> None:
|
|
|
54
54
|
err_code = prepare_kjob(args)
|
|
55
55
|
if err_code > 0:
|
|
56
56
|
xpk_exit(err_code)
|
|
57
|
-
|
|
57
|
+
setup_k8s_service_accounts()
|
|
58
58
|
|
|
59
59
|
submit_job(args)
|
|
60
60
|
|
|
61
61
|
|
|
62
62
|
def submit_job(args: Namespace) -> None:
|
|
63
63
|
|
|
64
|
-
|
|
64
|
+
setup_k8s_service_accounts()
|
|
65
65
|
|
|
66
66
|
cmd = (
|
|
67
67
|
'kubectl kjob create slurm'
|
|
@@ -31,6 +31,7 @@ from ..core.cluster import (
|
|
|
31
31
|
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
32
32
|
update_cluster_with_parallelstore_driver_if_necessary,
|
|
33
33
|
update_cluster_with_pd_driver_if_necessary,
|
|
34
|
+
update_cluster_with_lustre_driver_if_necessary,
|
|
34
35
|
update_cluster_with_workload_identity_if_necessary,
|
|
35
36
|
)
|
|
36
37
|
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
|
|
@@ -42,12 +43,14 @@ from ..core.gcloud_context import (
|
|
|
42
43
|
get_gke_server_config,
|
|
43
44
|
zone_to_region,
|
|
44
45
|
)
|
|
46
|
+
from ..core.jobset import update_jobset_resources_if_necessary
|
|
45
47
|
from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
|
|
46
48
|
from ..core.kueue import (
|
|
47
49
|
cluster_preheat_yml,
|
|
48
50
|
install_kueue_crs,
|
|
49
51
|
install_kueue_on_cluster,
|
|
50
52
|
wait_for_kueue_available,
|
|
53
|
+
update_kueue_resources_if_necessary,
|
|
51
54
|
)
|
|
52
55
|
from ..core.nap import enable_autoprovisioning_on_cluster
|
|
53
56
|
from ..core.network import (
|
|
@@ -170,7 +173,6 @@ def cluster_adapt(args) -> None:
|
|
|
170
173
|
install_kueue(args, system, autoprovisioning_config)
|
|
171
174
|
|
|
172
175
|
install_kjob(args)
|
|
173
|
-
|
|
174
176
|
if system.accelerator_type == AcceleratorType['GPU']:
|
|
175
177
|
prepare_gpus(args, system)
|
|
176
178
|
|
|
@@ -308,6 +310,9 @@ def cluster_create(args) -> None:
|
|
|
308
310
|
set_jobset_on_cluster_code = set_jobset_on_cluster(args)
|
|
309
311
|
if set_jobset_on_cluster_code != 0:
|
|
310
312
|
xpk_exit(set_jobset_on_cluster_code)
|
|
313
|
+
update_jobset_resources_code = update_jobset_resources_if_necessary(args)
|
|
314
|
+
if update_jobset_resources_code != 0:
|
|
315
|
+
xpk_exit(update_jobset_resources_code)
|
|
311
316
|
|
|
312
317
|
set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
|
|
313
318
|
if set_pathways_job_on_cluster_code != 0:
|
|
@@ -879,6 +884,10 @@ def run_gke_cluster_create_command(
|
|
|
879
884
|
if args.enable_pd_csi_driver:
|
|
880
885
|
addons.append('GcePersistentDiskCsiDriver')
|
|
881
886
|
|
|
887
|
+
if args.enable_lustre_csi_driver:
|
|
888
|
+
addons.append('LustreCsiDriver')
|
|
889
|
+
command += ' --enable-legacy-lustre-port'
|
|
890
|
+
|
|
882
891
|
if hasattr(args, 'enable_mtc') and args.enable_mtc:
|
|
883
892
|
addons.append('HighScaleCheckpointing')
|
|
884
893
|
|
|
@@ -922,6 +931,13 @@ def install_storage_csis(args):
|
|
|
922
931
|
if update_cluster_command_code != 0:
|
|
923
932
|
xpk_exit(update_cluster_command_code)
|
|
924
933
|
|
|
934
|
+
if args.enable_lustre_csi_driver:
|
|
935
|
+
update_cluster_command_code = (
|
|
936
|
+
update_cluster_with_lustre_driver_if_necessary(args)
|
|
937
|
+
)
|
|
938
|
+
if update_cluster_command_code != 0:
|
|
939
|
+
xpk_exit(update_cluster_command_code)
|
|
940
|
+
|
|
925
941
|
|
|
926
942
|
def install_kjob(args):
|
|
927
943
|
xpk_print('Verifying kjob installation')
|
|
@@ -957,6 +973,11 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
|
|
|
957
973
|
if enable_kueue_credentials_code != 0:
|
|
958
974
|
xpk_exit(enable_kueue_credentials_code)
|
|
959
975
|
|
|
976
|
+
xpk_print('Update Kueue Controller Manager resources')
|
|
977
|
+
update_kueue_resources_code = update_kueue_resources_if_necessary(args)
|
|
978
|
+
if update_kueue_resources_code != 0:
|
|
979
|
+
xpk_exit(update_kueue_resources_code)
|
|
980
|
+
|
|
960
981
|
|
|
961
982
|
def prepare_gpus(args, system: SystemCharacteristics):
|
|
962
983
|
xpk_print('Installing NCCL Plugin for cluster')
|
|
@@ -37,6 +37,7 @@ from ..utils.console import xpk_exit, xpk_print
|
|
|
37
37
|
from ..utils.file import ensure_directory_exists
|
|
38
38
|
from ..utils.network import all_IPs_cidr
|
|
39
39
|
from ..utils.objects import hash_string
|
|
40
|
+
from ..core.capacity import get_reservation_maintenance_interval, get_reservation_placement_policy
|
|
40
41
|
|
|
41
42
|
blueprints_path = os.path.abspath('xpkclusters/blueprints')
|
|
42
43
|
gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
|
|
@@ -234,6 +235,30 @@ def generate_blueprint(
|
|
|
234
235
|
if args.device_type in supported_device_types:
|
|
235
236
|
if args.device_type == a3mega_device_type:
|
|
236
237
|
num_nodes = args.num_nodes if not args.num_nodes is None else 2
|
|
238
|
+
|
|
239
|
+
maintenance_interval = (
|
|
240
|
+
get_reservation_maintenance_interval(
|
|
241
|
+
args.reservation, args.zone, args.project
|
|
242
|
+
)
|
|
243
|
+
if args.reservation is not None
|
|
244
|
+
else 'PERIODIC'
|
|
245
|
+
)
|
|
246
|
+
placement_policy_name = (
|
|
247
|
+
get_reservation_placement_policy(
|
|
248
|
+
args.reservation, args.zone, args.project
|
|
249
|
+
)
|
|
250
|
+
if args.reservation is not None
|
|
251
|
+
else None
|
|
252
|
+
)
|
|
253
|
+
placement_policy = (
|
|
254
|
+
{
|
|
255
|
+
'type': 'COMPACT',
|
|
256
|
+
'name': placement_policy_name.split('/')[-1],
|
|
257
|
+
}
|
|
258
|
+
if placement_policy_name is not None
|
|
259
|
+
and len(placement_policy_name) > 0
|
|
260
|
+
else None
|
|
261
|
+
)
|
|
237
262
|
return bpg.generate_a3_mega_blueprint(
|
|
238
263
|
blueprint_name=blueprint_name,
|
|
239
264
|
prefix=prefix,
|
|
@@ -243,6 +268,8 @@ def generate_blueprint(
|
|
|
243
268
|
zone=args.zone,
|
|
244
269
|
auth_cidr=all_IPs_cidr,
|
|
245
270
|
num_nodes=num_nodes,
|
|
271
|
+
reservation_maintenance_interval=maintenance_interval,
|
|
272
|
+
reservation_placement_policy=placement_policy,
|
|
246
273
|
reservation=args.reservation if args.reservation else None,
|
|
247
274
|
capacity_type=capacity_type,
|
|
248
275
|
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
|
|
@@ -15,10 +15,12 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from ..core.commands import run_command_with_updates_retry
|
|
18
|
-
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
|
|
19
18
|
from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
|
|
20
19
|
from ..core.gcloud_context import zone_to_region
|
|
21
20
|
from ..utils.console import xpk_print, xpk_exit
|
|
21
|
+
from ..core.system_characteristics import (
|
|
22
|
+
SystemCharacteristics,
|
|
23
|
+
)
|
|
22
24
|
|
|
23
25
|
|
|
24
26
|
def set_cluster_command(args) -> int:
|
|
@@ -47,7 +49,11 @@ def set_cluster_command(args) -> int:
|
|
|
47
49
|
return return_code
|
|
48
50
|
|
|
49
51
|
|
|
50
|
-
def is_TAS_possible(
|
|
52
|
+
def is_TAS_possible(
|
|
53
|
+
system_characteristics: SystemCharacteristics,
|
|
54
|
+
capacity_type: CapacityType,
|
|
55
|
+
flex: bool,
|
|
56
|
+
) -> bool:
|
|
51
57
|
"""Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
|
|
52
58
|
|
|
53
59
|
Args:
|
|
@@ -56,8 +62,6 @@ def is_TAS_possible(args) -> bool:
|
|
|
56
62
|
Returns:
|
|
57
63
|
True if possible and False otherwise.
|
|
58
64
|
"""
|
|
59
|
-
system_characteristics = get_cluster_system_characteristics(args)
|
|
60
|
-
capacity_type = get_cluster_capacity_type(args)
|
|
61
65
|
|
|
62
66
|
if system_characteristics is None:
|
|
63
67
|
xpk_print('system_characteristics data was not found in configmaps.')
|
|
@@ -67,9 +71,12 @@ def is_TAS_possible(args) -> bool:
|
|
|
67
71
|
xpk_print('capacity_type data was not found in configmaps.')
|
|
68
72
|
xpk_exit(1)
|
|
69
73
|
|
|
74
|
+
if flex:
|
|
75
|
+
return False
|
|
76
|
+
|
|
70
77
|
if (
|
|
71
78
|
system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
|
|
72
|
-
and capacity_type
|
|
79
|
+
and capacity_type != CapacityType.RESERVATION
|
|
73
80
|
):
|
|
74
81
|
return False
|
|
75
82
|
|
|
@@ -27,6 +27,7 @@ from ..core.kjob import (
|
|
|
27
27
|
Kueue_TAS_annotation,
|
|
28
28
|
)
|
|
29
29
|
from .common import is_TAS_possible
|
|
30
|
+
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
|
|
30
31
|
|
|
31
32
|
|
|
32
33
|
def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
@@ -50,7 +51,9 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
|
50
51
|
|
|
51
52
|
|
|
52
53
|
def add_TAS_annotations_to_command(args, cmd: str) -> str:
|
|
53
|
-
|
|
54
|
+
system_characteristics = get_cluster_system_characteristics(args)
|
|
55
|
+
capacity_type = get_cluster_capacity_type(args)
|
|
56
|
+
if is_TAS_possible(system_characteristics, capacity_type, flex=False):
|
|
54
57
|
cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
|
|
55
58
|
|
|
56
59
|
return cmd
|
|
@@ -17,7 +17,7 @@ limitations under the License.
|
|
|
17
17
|
from argparse import Namespace
|
|
18
18
|
|
|
19
19
|
from ..core.cluster import (
|
|
20
|
-
|
|
20
|
+
setup_k8s_service_accounts,
|
|
21
21
|
get_cluster_credentials,
|
|
22
22
|
)
|
|
23
23
|
from ..core.commands import run_command_with_full_controls
|
|
@@ -53,7 +53,7 @@ def run(args: Namespace) -> None:
|
|
|
53
53
|
err_code = prepare_kjob(args)
|
|
54
54
|
if err_code > 0:
|
|
55
55
|
xpk_exit(err_code)
|
|
56
|
-
|
|
56
|
+
setup_k8s_service_accounts()
|
|
57
57
|
|
|
58
58
|
submit_job(args)
|
|
59
59
|
|
|
@@ -12,7 +12,7 @@ limitations under the License.
|
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
|
|
15
|
-
from ..core.cluster import get_cluster_credentials, add_zone_and_project,
|
|
15
|
+
from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
|
|
16
16
|
from ..utils.console import xpk_exit, xpk_print
|
|
17
17
|
from argparse import Namespace
|
|
18
18
|
|
|
@@ -82,7 +82,7 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
|
|
|
82
82
|
err_code = prepare_kjob(args)
|
|
83
83
|
if err_code > 0:
|
|
84
84
|
xpk_exit(err_code)
|
|
85
|
-
|
|
85
|
+
setup_k8s_service_accounts()
|
|
86
86
|
|
|
87
87
|
cmd = (
|
|
88
88
|
'kubectl-kjob create interactive --profile'
|
|
@@ -29,6 +29,7 @@ from ..core.cluster import (
|
|
|
29
29
|
setup_k8s_env,
|
|
30
30
|
update_cluster_with_parallelstore_driver_if_necessary,
|
|
31
31
|
update_cluster_with_pd_driver_if_necessary,
|
|
32
|
+
update_cluster_with_lustre_driver_if_necessary,
|
|
32
33
|
update_cluster_with_gcpfilestore_driver_if_necessary,
|
|
33
34
|
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
34
35
|
update_cluster_with_workload_identity_if_necessary,
|
|
@@ -45,6 +46,7 @@ from ..core.storage import (
|
|
|
45
46
|
GCS_FUSE_TYPE,
|
|
46
47
|
GCE_PD_TYPE,
|
|
47
48
|
PARALLELSTORE_TYPE,
|
|
49
|
+
LUSTRE_TYPE,
|
|
48
50
|
STORAGE_CRD_PLURAL,
|
|
49
51
|
XPK_API_GROUP_NAME,
|
|
50
52
|
XPK_API_GROUP_VERSION,
|
|
@@ -183,11 +185,11 @@ def storage_attach(args: Namespace) -> None:
|
|
|
183
185
|
args.prefetch_metadata,
|
|
184
186
|
)
|
|
185
187
|
|
|
186
|
-
elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE]:
|
|
188
|
+
elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE]:
|
|
187
189
|
if args.manifest is None:
|
|
188
190
|
xpk_print(
|
|
189
|
-
"Parallelstore and
|
|
190
|
-
" --manifest"
|
|
191
|
+
"Parallelstore, PersistentDisk, and Lustre are currently supported"
|
|
192
|
+
" only with --manifest"
|
|
191
193
|
)
|
|
192
194
|
xpk_exit(1)
|
|
193
195
|
|
|
@@ -234,6 +236,11 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
|
|
|
234
236
|
if return_code > 0:
|
|
235
237
|
xpk_exit(return_code)
|
|
236
238
|
|
|
239
|
+
if args.type == LUSTRE_TYPE:
|
|
240
|
+
return_code = update_cluster_with_lustre_driver_if_necessary(args)
|
|
241
|
+
if return_code > 0:
|
|
242
|
+
xpk_exit(return_code)
|
|
243
|
+
|
|
237
244
|
|
|
238
245
|
def storage_list(args: Namespace) -> None:
|
|
239
246
|
k8s_api_client = setup_k8s_env(args)
|