xpk 0.7.2__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.7.2/src/xpk.egg-info → xpk-0.9.0}/PKG-INFO +192 -93
- {xpk-0.7.2 → xpk-0.9.0}/README.md +191 -92
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/batch.py +19 -13
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/cluster.py +240 -71
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/cluster_gcluster.py +22 -5
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/common.py +33 -1
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/info.py +2 -4
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/job.py +7 -8
- xpk-0.9.0/src/xpk/commands/kjob_common.py +56 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/run.py +17 -12
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/shell.py +3 -4
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/storage.py +75 -19
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/workload.py +161 -324
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/blueprint/blueprint_definitions.py +2 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/blueprint/blueprint_generator.py +335 -45
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/capacity.py +1 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/cluster.py +193 -12
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/config.py +3 -1
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/docker_manager.py +1 -1
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/docker_resources.py +9 -21
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/filestore.py +5 -1
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/gcsfuse.py +27 -6
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/kjob.py +66 -20
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/kueue.py +30 -0
- xpk-0.9.0/src/xpk/core/mtc.py +195 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/nap.py +4 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/network.py +34 -22
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/nodepool.py +28 -26
- xpk-0.9.0/src/xpk/core/pathways.py +332 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/resources.py +21 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/scheduling.py +36 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/storage.py +66 -12
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/system_characteristics.py +9 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/workload.py +28 -83
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/workload_decorators/rdma_decorator.py +11 -15
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk-0.9.0/src/xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
- xpk-0.9.0/src/xpk/parser/cluster.py +855 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/storage.py +25 -5
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/workload.py +59 -31
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/utils/kubectl.py +4 -1
- {xpk-0.7.2 → xpk-0.9.0/src/xpk.egg-info}/PKG-INFO +192 -93
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk.egg-info/SOURCES.txt +2 -0
- xpk-0.7.2/src/xpk/commands/kjob_common.py +0 -44
- xpk-0.7.2/src/xpk/core/pathways.py +0 -377
- xpk-0.7.2/src/xpk/parser/cluster.py +0 -662
- {xpk-0.7.2 → xpk-0.9.0}/LICENSE +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/pyproject.toml +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/setup.cfg +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/api/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/config.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/inspector.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/kind.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/commands/version.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/cluster_private.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/commands.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/docker_image.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/monitoring.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/ray.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/vertex.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/main.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/batch.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/common.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/config.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/core.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/info.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/job.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/kind.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/run.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/shell.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/validators.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/parser/version.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/utils/console.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/utils/file.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/utils/network.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/utils/objects.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/utils/templates.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/utils/validation.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk.egg-info/requires.txt +0 -0
- {xpk-0.7.2 → xpk-0.9.0}/src/xpk.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 0.7.2
+Version: 0.9.0
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0
@@ -46,23 +46,18 @@ Dynamic: license-file
 limitations under the License.
 -->

-[](https://github.com/google/xpk/actions/workflows/build_tests.yaml)
-[](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml)
-[](https://github.com/google/xpk/actions/workflows/build_tests.yaml?query=branch%3Amain)
+[](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml?query=branch%3Amain)
+[](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml?query=branch%3Adevelop)
+[](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml?query=branch%3Adevelop)

 # Overview

-
-Cloud developers to orchestrate training jobs on accelerators such as TPUs and
-GPUs on GKE. xpk handles the "multihost pods" of TPUs, GPUs (HGX H100) and CPUs
-(n2-standard-32) as first class citizens.
+XPK (Accelerated Processing Kit, pronounced x-p-k) is a command line interface that simplifies cluster creation and workload execution on Google Kubernetes Engine (GKE). XPK generates preconfigured, training-optimized clusters and allows easy workload scheduling without any Kubernetes expertise.

-
-
-physical resources you have available. Workloads represent training jobs -- at
-any time some of these will be completed, others will be running and some will
-be queued, waiting for cluster resources to become available.
+XPK is recommended for quick creation of GKE clusters for proofs of concepts and testing.
+
+XPK decouples provisioning capacity from running jobs. There are two structures: clusters (provisioned VMs) and workloads (training jobs). Clusters represent the physical resources you have available. Workloads represent training jobs -- at any time some of these will be completed, others will be running and some will be queued, waiting for cluster resources to become available.

 The ideal workflow starts by provisioning the clusters for all of the ML
 hardware you have reserved. Then, without re-provisioning, submit jobs as
@@ -73,7 +68,7 @@ return the hardware back to the shared pool when they complete, developers can
 achieve better use of finite hardware resources. And automated tests can run
 overnight while resources tend to be underutilized.

-xpk supports the following TPU types:
+XPK supports the following TPU types:
 * v4
 * v5e
 * v5p
@@ -82,15 +77,18 @@ xpk supports the following TPU types:
 and the following GPU types:
 * A100
 * A3-Highgpu (h100)
-* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-
-* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-
+* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
+* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
+* A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)

 and the following CPU types:
 * n2-standard-32

-xpk also supports Google Cloud Storage solutions:
+XPK also supports [Google Cloud Storage solutions](#storage):
 * [Cloud Storage FUSE](#fuse)
 * [Filestore](#filestore)
+* [Parallelstore](#parallelstore)
+* [Block storage (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)

 # Permissions needed on Cloud Console:

@@ -104,77 +102,93 @@ xpk also supports Google Cloud Storage solutions:
 * Vertex AI Administrator
 * Filestore Editor (This role is neccessary if you want to run `storage create` command with `--type=gcpfilestore`)

-#
+# Installation
+
+There are 2 ways to install XPK:
+
+- via Python package installer (`pip`),
+- clone from git and build from source.
+
+## Prerequisites

-
+The following tools must be installed:

-- python >= 3.10
-- pip
-- python venv
+- python >= 3.10: download from [here](https://www.python.org/downloads/)
+- pip: [installation instructions](https://pip.pypa.io/en/stable/installation/)
+- python venv: [installation instructions](https://virtualenv.pypa.io/en/latest/installation.html)
 (all three of above can be installed at once from [here](https://packaging.python.org/en/latest/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers))
-- gcloud
+- gcloud: install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the) and then:
   - Run `gcloud init`
   - [Authenticate](https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login) to Google Cloud
-- kubectl
+- kubectl: install from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) and then:
   - Install `gke-gcloud-auth-plugin` from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_plugin)
-- docker
+- docker: [installation instructions](https://docs.docker.com/engine/install/) and then:
+  - Configure sudoless docker: [guide](https://docs.docker.com/engine/install/linux-postinstall/)
   - Run `gcloud auth configure-docker` to ensure images can be uploaded to registry
-- make - please run below command.
-```shell
-# sudo may be required
-apt-get -y install make
-```
-In addition, below dependencies can be installed either using provided links or using `make install` command, if xpk is downloaded via `git clone` command:
-- kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
-- kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))

-
-
+### Additional prerequisites when installing from pip
+
+- kueuectl: install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/)
+- kjob: installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md)
+
+### Additional prerequisites when installing from source
+
+- git: [installation instructions](https://git-scm.com/downloads/linux)
+- make: install by running `apt-get -y install make` (`sudo` might be required)
+
+## Installation via pip
+
+To install XPK using pip, first install required tools mentioned in [prerequisites](#prerequisites) and [additional prerequisites](#additional-prerequisites-when-installing-from-pip). Then you can install XPK simply by running:

 ```shell
 pip install xpk
 ```

-If you see an error saying: `This environment is externally managed`, please use a virtual environment.
+If you see an error saying: `This environment is externally managed`, please use a virtual environment. For example:

 ```shell
-
-
-
-
-
-
-
+# One time step of creating the virtual environment
+VENV_DIR=~/venvp3
+python3 -m venv $VENV_DIR
+
+# Activate your virtual environment
+source $VENV_DIR/bin/activate
+
+# Install XPK in virtual environment using pip
+pip install xpk
 ```

-
-
+## Installation from source
+
+To install XPK from source, first install required tools mentioned in [prerequisites](#prerequisites) and [additional prerequisites](#additional-prerequisites-when-installing-from-source). Afterwards you can install XPK from source using `make`

 ```shell
+# Clone the XPK repository
 git clone https://github.com/google/xpk.git
 cd xpk
-
+
+# Install required dependencies and build XPK with make
 make install && export PATH=$PATH:$PWD/bin
 ```

-If you want
-`echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc
+If you want the dependecies to be available in your PATH please run: `echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc file.

-If you see an error saying: `This environment is externally managed`, please use a virtual environment.
-
-Example:
+If you see an error saying: `This environment is externally managed`, please use a virtual environment. For example:

 ```shell
-
-
-
-
-
-
-
-
-
-
+# One time step of creating the virtual environment
+VENV_DIR=~/venvp3
+python3 -m venv $VENV_DIR
+
+# Activate your virtual environment
+source $VENV_DIR/bin/activate
+
+# Clone the XPK repository
+git clone https://github.com/google/xpk.git
+cd xpk
+
+# Install required dependencies and build XPK with make
+make install && export PATH=$PATH:$PWD/bin
 ```

 # XPK for Large Scale (>1k VMs)
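The reworked installation section above ends with `make install && export PATH=$PATH:$PWD/bin`. A minimal smoke test after either install path is sketched below; it assumes the `xpk` entry point is on `PATH` and leans on the `version` subcommand that ships in this release (`src/xpk/commands/version.py` in the file list above) — the exact output wording is not specified by this diff.

```shell
# Minimal post-install check (illustrative; assumes `xpk` is on PATH).
xpk version   # should report the installed release, 0.9.0 after this upgrade
xpk --help    # lists the available subcommands (cluster, workload, storage, batch, ...)
```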
@@ -253,6 +267,7 @@ all zones.
 --num-slices=4 --on-demand \
 --tpu-type=v5litepod-16
 ```
+Note that Pathways clusters need a CPU nodepool of n2-standard-64 or higher.

 * Cluster Create for Ray:
 A cluster with KubeRay enabled and a RayCluster can be created using `cluster create-ray`.
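The note added above requires Pathways clusters to have a CPU nodepool of n2-standard-64 or larger. A hedged sketch of requesting that at creation time follows; it assumes `cluster create-pathways` (referenced elsewhere in this README) accepts the `--default-pool-cpu-machine-type` flag listed in the A3/A4 section further down, which this hunk itself does not confirm.

```shell
# Illustrative only: Pathways-enabled cluster with an n2-standard-64 default CPU pool.
# Whether create-pathways accepts --default-pool-cpu-machine-type should be verified
# with `python3 xpk.py cluster create-pathways --help`.
python3 xpk.py cluster create-pathways \
  --cluster=$CLUSTER_NAME \
  --num-slices=4 --tpu-type=v5litepod-16 \
  --default-pool-cpu-machine-type=n2-standard-64 \
  --zone=$COMPUTE_ZONE --project=$PROJECT_ID
```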
@@ -454,28 +469,55 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
 --tpu-type=v5litepod-16
 ```

-## Provisioning A3
-To create a cluster with A3 machines, run the below
-* For A3-Ultra: --device-type=h200-141gb-8
-* For A3-Mega: --device-type=h100-mega-80gb-8
+## Provisioning A3 Ultra, A3 Mega and A4 clusters (GPU machines)
+To create a cluster with A3 or A4 machines, run the command below with selected device type. To create workloads on these clusters see [here](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines).

-
-
-
+**Note:** Creating A3 Ultra, A3 Mega and A4 clusters is currently supported **only** on linux/amd64 architecture.
+
+Machine | Device type
+:- | :-
+A3 Mega | `h100-mega-80gb-8`
+A3 Ultra | `h200-141gb-8`
+A4 | `b200-8`
+
+
+```shell
+python3 xpk.py cluster create \
+--cluster CLUSTER_NAME --device-type DEVICE_TYPE \
 --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
---num-nodes
-
-
-
-*
-*
-*
-*
-*
+--num-nodes=$NUM_NODES --reservation=$RESERVATION_ID
+```
+
+Currently, the below flags/arguments are supported for A3 Mega, A3 Ultra and A4 machines:
+* `--num-nodes`
+* `--default-pool-cpu-machine-type`
+* `--default-pool-cpu-num-nodes`
+* `--reservation`
+* `--spot`
+* `--on-demand` (A3 Mega only)

+## Running XPK on existing clusters
+
+In order to run XPK commands on a cluster it needs to be set up correctly. This is done automatically when creating a cluster using `xpk cluster create`. For clusters created differently (e.g.: with 'gcloud' or a Cluster Toolkit blueprint) there is a dedicated command: `xpk cluster adapt`. This command installs required config maps, kueue, jobset, CSI drivers etc.
+
+Currently `xpk cluster adapt` supports only the following device types:
+
+- `h200-141gb-8` (A3 Ultra)
+
+Example usage:
+```shell
+python3 xpk.py cluster adapt \
+--cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
+--zone=$COMPUTE_ZONE --project=$PROJECT_ID \
+--num-nodes=$NUM_NODES --reservation=$RESERVATION_ID
+```

 ## Storage
-Currently XPK supports
+Currently XPK supports the below types of storages:
+- [Cloud Storage FUSE](#fuse)
+- [Google Cloud Filestore](#filestore)
+- [Google Cloud Parallelstore](#parallelstore)
+- [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)

 ### FUSE
 A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
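To make the placeholder-heavy `cluster create` template above concrete, here is one possible invocation for a small A3 Ultra cluster; the project, zone, reservation, cluster name and node count are invented example values, while the flags are exactly those the new README text lists as supported.

```shell
# Example values only -- substitute your own project, zone, reservation and sizing.
PROJECT_ID=my-gcp-project
COMPUTE_ZONE=europe-west1-b
RESERVATION_ID=my-a3-ultra-reservation

python3 xpk.py cluster create \
  --cluster xpk-a3u-demo --device-type h200-141gb-8 \
  --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
  --num-nodes=2 --reservation=$RESERVATION_ID
```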
@@ -499,11 +541,13 @@ Parameters:
 - `--readonly` - if set to true, workload can only read from storage.
 - `--size` - size of the storage in Gb.
 - `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
+- `--mount-options` - comma-separated list of additional mount options for PersistentVolume ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#mount-options)).
+- `--prefetch-metadata` - enables metadata pre-population when mounting the volume by setting parameter `gcsfuseMetadataPrefetchOnMount` to `true` ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#metadata-prefetch)).
 - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions. If set, then values from manifest override the following parameters: `--size` and `--bucket`.

 ### Filestore

-A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write
+A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.

 To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:

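The two FUSE flags introduced above (`--mount-options`, `--prefetch-metadata`) slot into the existing `xpk storage` commands. A hedged sketch of passing them is shown below; the `--type=gcsfuse` value, the storage name and the bucket name are assumptions not visible in this hunk, and `--prefetch-metadata` is treated here as a boolean switch.

```shell
# Sketch only: the storage/bucket names and --type=gcsfuse are assumed values;
# --mount-options and --prefetch-metadata are the flags added in this release.
python3 xpk.py storage attach test-fuse-storage --type=gcsfuse \
  --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
  --mount-point='/test-mount-point' --readonly=false --auto-mount=true \
  --bucket=my-training-data-bucket \
  --mount-options='implicit-dirs' \
  --prefetch-metadata
```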
@@ -537,6 +581,54 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
 - `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
 - `--manifest` - path to the manifest file containing PersistentVolume, PresistentVolumeClaim and StorageClass definitions. If set, then values from manifest override the following parameters: `--access-mode`, `--size` and `--volume`.

+### Parallelstore
+
+A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+
+To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
+
+Once it's ready you can use `xpk storage attach` with `--type=parallelstore` command to attach a Parallelstore instance to your cluster. Currently, attaching a Parallelstore is supported only by providing a manifest file.
+
+```shell
+python3 xpk.py storage attach test-parallelstore-storage --type=parallelstore \
+--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
+--mount-point='/test-mount-point' --readonly=false \
+--auto-mount=true \
+--manifest='./examples/storage/parallelstore-manifest-attach.yaml'
+```
+
+Parameters:
+
+- `--type` - type of the storage `parallelstore`
+- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
+- `--mount-point` - the path on which this storage should be mounted for a workload.
+- `--readonly` - if set to true, workload can only read from storage.
+- `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
+
+### Block storage (Persistent Disk, Hyperdisk)
+
+A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+
+To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please consider that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
+
+Once it's ready you can use `xpk storage attach` with `--type=pd` command to attach a PersistentDisk instance to your cluster. Currently, attaching a PersistentDisk is supported only by providing a manifest file.
+
+```shell
+python3 xpk.py storage attach test-pd-storage --type=pd \
+--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
+--mount-point='/test-mount-point' --readonly=false \
+--auto-mount=true \
+--manifest='./examples/storage/pd-manifest-attach.yaml'
+```
+
+Parameters:
+
+- `--type` - type of the storage `pd`
+- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
+- `--mount-point` - the path on which this storage should be mounted for a workload.
+- `--readonly` - if set to true, workload can only read from storage.
+- `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
+
 ### List attached storages

 ```shell
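The new Block storage section above assumes the disk already exists. One possible end-to-end flow is sketched below; the disk name, size and type are invented example values, and the `gcloud compute disks create` step is plain gcloud rather than part of xpk.

```shell
# 1) Create a disk (example name/size/type) compatible with the cluster's machine types.
gcloud compute disks create xpk-demo-disk \
  --project=$PROJECT --zone=$ZONE \
  --size=500GB --type=pd-balanced

# 2) Attach it through a manifest, as described above (the manifest must reference the disk).
python3 xpk.py storage attach test-pd-storage --type=pd \
  --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
  --mount-point='/test-mount-point' --readonly=false --auto-mount=true \
  --manifest='./examples/storage/pd-manifest-attach.yaml'
```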
@@ -593,7 +685,7 @@ python3 xpk.py storage delete test-fs-instance \
 --cluster xpk-pw-test \
 --docker-name='user-workload' \
 --docker-image=<maxtext docker image> \
---command='python3 MaxText
+--command='python3 -m MaxText.train MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1 enable_single_controller=True'
 ```

 Regular workload can also be submitted on a Pathways enabled cluster (created with `cluster create-pathways`)
@@ -607,7 +699,7 @@ python3 xpk.py storage delete test-fs-instance \
 --cluster xpk-pw-test \
 --docker-name='user-workload' \
 --docker-image=<maxtext docker image> \
---command='python3 MaxText
+--command='python3 -m MaxText.train MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
 ```

 Pathways in headless mode - Pathways now offers the capability to run JAX workloads in Vertex AI notebooks or in GCE VMs!
@@ -637,21 +729,27 @@ increase this to a large number, say 50. Real jobs can be interrupted due to
 hardware failures and software updates. We assume your job has implemented
 checkpointing so the job restarts near where it was interrupted.

-### Workloads for A3
-To submit jobs on a cluster with A3 machines, run the
-* For A3-Ultra: --device-type=h200-141gb-8
-* For A3-Mega: --device-type=h100-mega-80gb-8
+### Workloads for A3 Ultra, A3 Mega and A4 clusters (GPU machines)
+To submit jobs on a cluster with A3 or A4 machines, run the command with selected device type. To create a cluster with A3 or A4 machines see [here](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines).

-
-
+
+Machine | Device type
+:- | :-
+A3 Mega | `h100-mega-80gb-8`
+A3 Ultra | `h200-141gb-8`
+A4 | `b200-8`
+
+```shell
+python3 xpk.py workload create \
 --workload=$WORKLOAD_NAME --command="echo goodbye" \
---cluster=$CLUSTER_NAME --device-type
+--cluster=$CLUSTER_NAME --device-type DEVICE_TYPE \
 --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
 --num-nodes=$WOKRKLOAD_NUM_NODES
-
-
+```
+
+> The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 or A4 machines as well.

-In order to run NCCL test on A3
+In order to run NCCL test on A3 machines check out [this guide](/examples/nccl/nccl.md).

 ### Workload Priority and Preemption
 * Set the priority level of your workload with `--priority=LEVEL`
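Mirroring the cluster example earlier, a concrete instantiation of the workload template above for an A3 Ultra cluster could look like the sketch below; the workload name, node count and image path are invented, and `--docker-image` is the same flag the Pathways examples in this README use.

```shell
# Example values only (workload name, node count, image path are made up).
python3 xpk.py workload create \
  --workload nccl-smoke-test --command "nvidia-smi" \
  --cluster xpk-a3u-demo --device-type h200-141gb-8 \
  --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
  --num-nodes=2 \
  --docker-image=us-docker.pkg.dev/$PROJECT_ID/demo-repo/training-image:latest
```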
@@ -1498,4 +1596,5 @@ python xpk.py batch [other-options] --kind-cluster script
 Please note that all other xpk subcommands are intended for use with cloud systems on Google Cloud Engine (GCE) and don't support local testing. This includes commands like cluster, info, inspector, etc.

 # Other advanced usage
-[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)
+[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md) \
+[Use Slurm like commands in XPK to execute workloads on top of GKE](xpk-slurm-commands.md)