xpk 0.8.0__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.8.0/src/xpk.egg-info → xpk-0.9.0}/PKG-INFO +134 -91
- {xpk-0.8.0 → xpk-0.9.0}/README.md +133 -90
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/batch.py +2 -3
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/cluster.py +225 -73
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/common.py +33 -1
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/kjob_common.py +10 -1
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/run.py +2 -3
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/storage.py +14 -3
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/workload.py +17 -15
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/blueprint/blueprint_generator.py +18 -18
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/cluster.py +119 -8
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/config.py +1 -1
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/filestore.py +2 -6
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/gcsfuse.py +22 -4
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/kjob.py +20 -13
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/kueue.py +30 -0
- xpk-0.9.0/src/xpk/core/mtc.py +195 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/network.py +23 -1
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/pathways.py +1 -1
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/resources.py +21 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/workload.py +1 -1
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/workload_decorators/rdma_decorator.py +6 -10
- xpk-0.9.0/src/xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +15 -14
- xpk-0.9.0/src/xpk/parser/cluster.py +855 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/storage.py +11 -2
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/kubectl.py +4 -1
- {xpk-0.8.0 → xpk-0.9.0/src/xpk.egg-info}/PKG-INFO +134 -91
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk.egg-info/SOURCES.txt +2 -0
- xpk-0.8.0/src/xpk/parser/cluster.py +0 -671
- {xpk-0.8.0 → xpk-0.9.0}/LICENSE +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/pyproject.toml +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/setup.cfg +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/api/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/cluster_gcluster.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/config.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/info.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/inspector.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/job.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/kind.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/shell.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/version.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/capacity.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/cluster_private.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/commands.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/docker_image.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/docker_manager.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/docker_resources.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/monitoring.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/nap.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/nodepool.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/ray.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/scheduling.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/storage.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/system_characteristics.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/vertex.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/main.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/batch.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/common.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/config.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/core.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/info.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/job.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/kind.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/run.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/shell.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/validators.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/version.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/workload.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/console.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/file.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/network.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/objects.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/templates.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/validation.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk.egg-info/requires.txt +0 -0
- {xpk-0.8.0 → xpk-0.9.0}/src/xpk.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 0.8.0
|
|
3
|
+
Version: 0.9.0
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
5
|
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -46,23 +46,18 @@ Dynamic: license-file
|
|
|
46
46
|
limitations under the License.
|
|
47
47
|
-->
|
|
48
48
|
|
|
49
|
-
[](https://github.com/google/xpk/actions/workflows/build_tests.yaml)
|
|
50
|
-
[](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml)
|
|
51
|
-
[](https://github.com/google/xpk/actions/workflows/build_tests.yaml?query=branch%3Amain)
|
|
50
|
+
[](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml?query=branch%3Amain)
|
|
51
|
+
[](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml?query=branch%3Adevelop)
|
|
52
|
+
[](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml?query=branch%3Adevelop)
|
|
53
53
|
|
|
54
54
|
# Overview
|
|
55
55
|
|
|
56
|
-
|
|
57
|
-
Cloud developers to orchestrate training jobs on accelerators such as TPUs and
|
|
58
|
-
GPUs on GKE. xpk handles the "multihost pods" of TPUs, GPUs (HGX H100) and CPUs
|
|
59
|
-
(n2-standard-32) as first class citizens.
|
|
56
|
+
XPK (Accelerated Processing Kit, pronounced x-p-k) is a command line interface that simplifies cluster creation and workload execution on Google Kubernetes Engine (GKE). XPK generates preconfigured, training-optimized clusters and allows easy workload scheduling without any Kubernetes expertise.
|
|
60
57
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
physical resources you have available. Workloads represent training jobs -- at
|
|
64
|
-
any time some of these will be completed, others will be running and some will
|
|
65
|
-
be queued, waiting for cluster resources to become available.
|
|
58
|
+
XPK is recommended for quick creation of GKE clusters for proofs of concepts and testing.
|
|
59
|
+
|
|
60
|
+
XPK decouples provisioning capacity from running jobs. There are two structures: clusters (provisioned VMs) and workloads (training jobs). Clusters represent the physical resources you have available. Workloads represent training jobs -- at any time some of these will be completed, others will be running and some will be queued, waiting for cluster resources to become available.
|
|
66
61
|
|
|
67
62
|
The ideal workflow starts by provisioning the clusters for all of the ML
|
|
68
63
|
hardware you have reserved. Then, without re-provisioning, submit jobs as
|
|
@@ -73,7 +68,7 @@ return the hardware back to the shared pool when they complete, developers can
|
|
|
73
68
|
achieve better use of finite hardware resources. And automated tests can run
|
|
74
69
|
overnight while resources tend to be underutilized.
|
|
75
70
|
|
|
76
|
-
|
|
71
|
+
XPK supports the following TPU types:
|
|
77
72
|
* v4
|
|
78
73
|
* v5e
|
|
79
74
|
* v5p
|
|
@@ -82,13 +77,14 @@ xpk supports the following TPU types:
|
|
|
82
77
|
and the following GPU types:
|
|
83
78
|
* A100
|
|
84
79
|
* A3-Highgpu (h100)
|
|
85
|
-
* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-
|
|
86
|
-
* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-
|
|
80
|
+
* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
81
|
+
* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
82
|
+
* A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
87
83
|
|
|
88
84
|
and the following CPU types:
|
|
89
85
|
* n2-standard-32
|
|
90
86
|
|
|
91
|
-
|
|
87
|
+
XPK also supports [Google Cloud Storage solutions](#storage):
|
|
92
88
|
* [Cloud Storage FUSE](#fuse)
|
|
93
89
|
* [Filestore](#filestore)
|
|
94
90
|
* [Parallelstore](#parallelstore)
|
|
@@ -106,77 +102,93 @@ xpk also supports [Google Cloud Storage solutions](#storage):
|
|
|
106
102
|
* Vertex AI Administrator
|
|
107
103
|
* Filestore Editor (This role is necessary if you want to run `storage create` command with `--type=gcpfilestore`)
|
|
108
104
|
|
|
109
|
-
#
|
|
105
|
+
# Installation
|
|
106
|
+
|
|
107
|
+
There are 2 ways to install XPK:
|
|
110
108
|
|
|
111
|
-
|
|
109
|
+
- via Python package installer (`pip`),
|
|
110
|
+
- clone from git and build from source.
|
|
112
111
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
112
|
+
## Prerequisites
|
|
113
|
+
|
|
114
|
+
The following tools must be installed:
|
|
115
|
+
|
|
116
|
+
- python >= 3.10: download from [here](https://www.python.org/downloads/)
|
|
117
|
+
- pip: [installation instructions](https://pip.pypa.io/en/stable/installation/)
|
|
118
|
+
- python venv: [installation instructions](https://virtualenv.pypa.io/en/latest/installation.html)
|
|
116
119
|
(all three of the above can be installed at once from [here](https://packaging.python.org/en/latest/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers))
|
|
117
|
-
- gcloud
|
|
120
|
+
- gcloud: install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the) and then:
|
|
118
121
|
- Run `gcloud init`
|
|
119
122
|
- [Authenticate](https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login) to Google Cloud
|
|
120
|
-
- kubectl
|
|
123
|
+
- kubectl: install from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) and then:
|
|
121
124
|
- Install `gke-gcloud-auth-plugin` from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_plugin)
|
|
122
|
-
- docker
|
|
125
|
+
- docker: [installation instructions](https://docs.docker.com/engine/install/) and then:
|
|
126
|
+
- Configure sudoless docker: [guide](https://docs.docker.com/engine/install/linux-postinstall/)
|
|
123
127
|
- Run `gcloud auth configure-docker` to ensure images can be uploaded to registry
|
|
124
|
-
- make - please run below command.
|
|
125
|
-
```shell
|
|
126
|
-
# sudo may be required
|
|
127
|
-
apt-get -y install make
|
|
128
|
-
```
|
|
129
|
-
In addition, below dependencies can be installed either using provided links or using `make install` command, if xpk is downloaded via `git clone` command:
|
|
130
|
-
- kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
|
|
131
|
-
- kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
|
|
132
128
|
|
|
133
|
-
|
|
134
|
-
|
|
129
|
+
### Additional prerequisites when installing from pip
|
|
130
|
+
|
|
131
|
+
- kueuectl: install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/)
|
|
132
|
+
- kjob: installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md)
|
|
133
|
+
|
|
134
|
+
### Additional prerequisites when installing from source
|
|
135
|
+
|
|
136
|
+
- git: [installation instructions](https://git-scm.com/downloads/linux)
|
|
137
|
+
- make: install by running `apt-get -y install make` (`sudo` might be required)
|
|
138
|
+
|
|
139
|
+
## Installation via pip
|
|
140
|
+
|
|
141
|
+
To install XPK using pip, first install required tools mentioned in [prerequisites](#prerequisites) and [additional prerequisites](#additional-prerequisites-when-installing-from-pip). Then you can install XPK simply by running:
|
|
135
142
|
|
|
136
143
|
```shell
|
|
137
144
|
pip install xpk
|
|
138
145
|
```
|
|
139
146
|
|
|
140
|
-
If you see an error saying: `This environment is externally managed`, please use a virtual environment.
|
|
147
|
+
If you see an error saying: `This environment is externally managed`, please use a virtual environment. For example:
|
|
141
148
|
|
|
142
149
|
```shell
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
+
# One time step of creating the virtual environment
|
|
151
|
+
VENV_DIR=~/venvp3
|
|
152
|
+
python3 -m venv $VENV_DIR
|
|
153
|
+
|
|
154
|
+
# Activate your virtual environment
|
|
155
|
+
source $VENV_DIR/bin/activate
|
|
156
|
+
|
|
157
|
+
# Install XPK in virtual environment using pip
|
|
158
|
+
pip install xpk
|
|
150
159
|
```
|
|
151
160
|
|
|
152
|
-
|
|
153
|
-
|
|
161
|
+
## Installation from source
|
|
162
|
+
|
|
163
|
+
To install XPK from source, first install required tools mentioned in [prerequisites](#prerequisites) and [additional prerequisites](#additional-prerequisites-when-installing-from-source). Afterwards you can install XPK from source using `make`
|
|
154
164
|
|
|
155
165
|
```shell
|
|
166
|
+
# Clone the XPK repository
|
|
156
167
|
git clone https://github.com/google/xpk.git
|
|
157
168
|
cd xpk
|
|
158
|
-
|
|
169
|
+
|
|
170
|
+
# Install required dependencies and build XPK with make
|
|
159
171
|
make install && export PATH=$PATH:$PWD/bin
|
|
160
172
|
```
|
|
161
173
|
|
|
162
|
-
If you want
|
|
163
|
-
`echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc
|
|
164
|
-
|
|
165
|
-
If you see an error saying: `This environment is externally managed`, please use a virtual environment.
|
|
174
|
+
If you want the dependencies to be available in your PATH, please run: `echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc file.
|
|
166
175
|
|
|
167
|
-
|
|
176
|
+
If you see an error saying: `This environment is externally managed`, please use a virtual environment. For example:
|
|
168
177
|
|
|
169
178
|
```shell
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
179
|
+
# One time step of creating the virtual environment
|
|
180
|
+
VENV_DIR=~/venvp3
|
|
181
|
+
python3 -m venv $VENV_DIR
|
|
182
|
+
|
|
183
|
+
# Activate your virtual environment
|
|
184
|
+
source $VENV_DIR/bin/activate
|
|
185
|
+
|
|
186
|
+
# Clone the XPK repository
|
|
187
|
+
git clone https://github.com/google/xpk.git
|
|
188
|
+
cd xpk
|
|
189
|
+
|
|
190
|
+
# Install required dependencies and build XPK with make
|
|
191
|
+
make install && export PATH=$PATH:$PWD/bin
|
|
180
192
|
```
|
|
181
193
|
|
|
182
194
|
# XPK for Large Scale (>1k VMs)
|
|
@@ -457,25 +469,48 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
|
|
|
457
469
|
--tpu-type=v5litepod-16
|
|
458
470
|
```
|
|
459
471
|
|
|
460
|
-
## Provisioning A3
|
|
461
|
-
To create a cluster with A3 machines, run the below
|
|
462
|
-
* For A3-Ultra: --device-type=h200-141gb-8
|
|
463
|
-
* For A3-Mega: --device-type=h100-mega-80gb-8
|
|
472
|
+
## Provisioning A3 Ultra, A3 Mega and A4 clusters (GPU machines)
|
|
473
|
+
To create a cluster with A3 or A4 machines, run the command below with selected device type. To create workloads on these clusters see [here](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines).
|
|
464
474
|
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
475
|
+
**Note:** Creating A3 Ultra, A3 Mega and A4 clusters is currently supported **only** on linux/amd64 architecture.
|
|
476
|
+
|
|
477
|
+
Machine | Device type
|
|
478
|
+
:- | :-
|
|
479
|
+
A3 Mega | `h100-mega-80gb-8`
|
|
480
|
+
A3 Ultra | `h200-141gb-8`
|
|
481
|
+
A4 | `b200-8`
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
```shell
|
|
485
|
+
python3 xpk.py cluster create \
|
|
486
|
+
--cluster CLUSTER_NAME --device-type DEVICE_TYPE \
|
|
468
487
|
--zone=$COMPUTE_ZONE --project=$PROJECT_ID \
|
|
469
|
-
--num-nodes
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
*
|
|
474
|
-
*
|
|
475
|
-
*
|
|
476
|
-
*
|
|
477
|
-
*
|
|
488
|
+
--num-nodes=$NUM_NODES --reservation=$RESERVATION_ID
|
|
489
|
+
```
|
|
490
|
+
|
|
491
|
+
Currently, the below flags/arguments are supported for A3 Mega, A3 Ultra and A4 machines:
|
|
492
|
+
* `--num-nodes`
|
|
493
|
+
* `--default-pool-cpu-machine-type`
|
|
494
|
+
* `--default-pool-cpu-num-nodes`
|
|
495
|
+
* `--reservation`
|
|
496
|
+
* `--spot`
|
|
497
|
+
* `--on-demand` (A3 Mega only)
|
|
498
|
+
|
|
499
|
+
## Running XPK on existing clusters
|
|
500
|
+
|
|
501
|
+
In order to run XPK commands on a cluster it needs to be set up correctly. This is done automatically when creating a cluster using `xpk cluster create`. For clusters created differently (e.g.: with 'gcloud' or a Cluster Toolkit blueprint) there is a dedicated command: `xpk cluster adapt`. This command installs required config maps, kueue, jobset, CSI drivers etc.
|
|
502
|
+
|
|
503
|
+
Currently `xpk cluster adapt` supports only the following device types:
|
|
478
504
|
|
|
505
|
+
- `h200-141gb-8` (A3 Ultra)
|
|
506
|
+
|
|
507
|
+
Example usage:
|
|
508
|
+
```shell
|
|
509
|
+
python3 xpk.py cluster adapt \
|
|
510
|
+
--cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
|
|
511
|
+
--zone=$COMPUTE_ZONE --project=$PROJECT_ID \
|
|
512
|
+
--num-nodes=$NUM_NODES --reservation=$RESERVATION_ID
|
|
513
|
+
```
|
|
479
514
|
|
|
480
515
|
## Storage
|
|
481
516
|
Currently XPK supports the below types of storages:
|
|
@@ -507,6 +542,7 @@ Parameters:
|
|
|
507
542
|
- `--size` - size of the storage in Gb.
|
|
508
543
|
- `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
|
|
509
544
|
- `--mount-options` - comma-separated list of additional mount options for PersistentVolume ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#mount-options)).
|
|
545
|
+
- `--prefetch-metadata` - enables metadata pre-population when mounting the volume by setting parameter `gcsfuseMetadataPrefetchOnMount` to `true` ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#metadata-prefetch)).
|
|
510
546
|
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions. If set, then values from manifest override the following parameters: `--size` and `--bucket`.
|
|
511
547
|
|
|
512
548
|
### Filestore
|
|
@@ -649,7 +685,7 @@ python3 xpk.py storage delete test-fs-instance \
|
|
|
649
685
|
--cluster xpk-pw-test \
|
|
650
686
|
--docker-name='user-workload' \
|
|
651
687
|
--docker-image=<maxtext docker image> \
|
|
652
|
-
--command='python3 MaxText
|
|
688
|
+
--command='python3 -m MaxText.train MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1 enable_single_controller=True'
|
|
653
689
|
```
|
|
654
690
|
|
|
655
691
|
Regular workload can also be submitted on a Pathways enabled cluster (created with `cluster create-pathways`)
|
|
@@ -663,7 +699,7 @@ python3 xpk.py storage delete test-fs-instance \
|
|
|
663
699
|
--cluster xpk-pw-test \
|
|
664
700
|
--docker-name='user-workload' \
|
|
665
701
|
--docker-image=<maxtext docker image> \
|
|
666
|
-
--command='python3 MaxText
|
|
702
|
+
--command='python3 -m MaxText.train MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
|
|
667
703
|
```
|
|
668
704
|
|
|
669
705
|
Pathways in headless mode - Pathways now offers the capability to run JAX workloads in Vertex AI notebooks or in GCE VMs!
|
|
@@ -693,21 +729,27 @@ increase this to a large number, say 50. Real jobs can be interrupted due to
|
|
|
693
729
|
hardware failures and software updates. We assume your job has implemented
|
|
694
730
|
checkpointing so the job restarts near where it was interrupted.
|
|
695
731
|
|
|
696
|
-
### Workloads for A3
|
|
697
|
-
To submit jobs on a cluster with A3 machines, run the
|
|
698
|
-
* For A3-Ultra: --device-type=h200-141gb-8
|
|
699
|
-
* For A3-Mega: --device-type=h100-mega-80gb-8
|
|
732
|
+
### Workloads for A3 Ultra, A3 Mega and A4 clusters (GPU machines)
|
|
733
|
+
To submit jobs on a cluster with A3 or A4 machines, run the command with selected device type. To create a cluster with A3 or A4 machines see [here](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines).
|
|
700
734
|
|
|
701
|
-
|
|
702
|
-
|
|
735
|
+
|
|
736
|
+
Machine | Device type
|
|
737
|
+
:- | :-
|
|
738
|
+
A3 Mega | `h100-mega-80gb-8`
|
|
739
|
+
A3 Ultra | `h200-141gb-8`
|
|
740
|
+
A4 | `b200-8`
|
|
741
|
+
|
|
742
|
+
```shell
|
|
743
|
+
python3 xpk.py workload create \
|
|
703
744
|
--workload=$WORKLOAD_NAME --command="echo goodbye" \
|
|
704
|
-
--cluster=$CLUSTER_NAME --device-type
|
|
745
|
+
--cluster=$CLUSTER_NAME --device-type DEVICE_TYPE \
|
|
705
746
|
--zone=$COMPUTE_ZONE --project=$PROJECT_ID \
|
|
706
747
|
--num-nodes=$WORKLOAD_NUM_NODES
|
|
707
|
-
|
|
708
|
-
|
|
748
|
+
```
|
|
749
|
+
|
|
750
|
+
> The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 or A4 machines as well.
|
|
709
751
|
|
|
710
|
-
In order to run NCCL test on A3
|
|
752
|
+
To run the NCCL test on A3 machines, check out [this guide](/examples/nccl/nccl.md).
|
|
711
753
|
|
|
712
754
|
### Workload Priority and Preemption
|
|
713
755
|
* Set the priority level of your workload with `--priority=LEVEL`
|
|
@@ -1554,4 +1596,5 @@ python xpk.py batch [other-options] --kind-cluster script
|
|
|
1554
1596
|
Please note that all other xpk subcommands are intended for use with cloud systems on Google Cloud Engine (GCE) and don't support local testing. This includes commands like cluster, info, inspector, etc.
|
|
1555
1597
|
|
|
1556
1598
|
# Other advanced usage
|
|
1557
|
-
[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)
|
|
1599
|
+
[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md) \
|
|
1600
|
+
[Use Slurm like commands in XPK to execute workloads on top of GKE](xpk-slurm-commands.md)
|