xpk 0.8.0__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.8.0/src/xpk.egg-info → xpk-0.10.0}/PKG-INFO +178 -96
- {xpk-0.8.0 → xpk-0.10.0}/README.md +177 -95
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/batch.py +5 -6
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/cluster.py +246 -73
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/cluster_gcluster.py +27 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/common.py +40 -1
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/kjob_common.py +13 -1
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/run.py +4 -5
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/shell.py +2 -2
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/storage.py +24 -6
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/workload.py +66 -27
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/blueprint/blueprint_generator.py +115 -47
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/capacity.py +66 -6
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/cluster.py +282 -13
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/config.py +1 -65
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/docker_manager.py +1 -1
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/docker_resources.py +145 -72
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/filestore.py +2 -6
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/gcsfuse.py +22 -4
- xpk-0.10.0/src/xpk/core/jobset.py +143 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/kjob.py +21 -18
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/kueue.py +194 -4
- xpk-0.10.0/src/xpk/core/mtc.py +195 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/network.py +23 -1
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/nodepool.py +17 -4
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/pathways.py +2 -3
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/resources.py +21 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/storage.py +1 -95
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/system_characteristics.py +1 -1
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/workload.py +1 -45
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/rdma_decorator.py +8 -10
- xpk-0.10.0/src/xpk/core/workload_decorators/tcpx_decorator.py +185 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +22 -14
- xpk-0.10.0/src/xpk/parser/cluster.py +871 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/storage.py +12 -3
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/workload.py +21 -3
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/kubectl.py +4 -1
- {xpk-0.8.0 → xpk-0.10.0/src/xpk.egg-info}/PKG-INFO +178 -96
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk.egg-info/SOURCES.txt +3 -0
- xpk-0.8.0/src/xpk/parser/cluster.py +0 -671
- {xpk-0.8.0 → xpk-0.10.0}/LICENSE +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/pyproject.toml +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/setup.cfg +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/api/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/config.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/info.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/inspector.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/job.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/kind.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/version.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/cluster_private.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/commands.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/docker_image.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/monitoring.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/nap.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/ray.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/scheduling.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/vertex.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/main.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/batch.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/common.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/config.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/core.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/info.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/job.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/kind.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/run.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/shell.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/validators.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/version.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/console.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/file.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/network.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/objects.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/templates.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/validation.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk.egg-info/requires.txt +0 -0
- {xpk-0.8.0 → xpk-0.10.0}/src/xpk.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 0.8.0
+Version: 0.10.0
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0
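As a quick cross-check of the version bump above, standard pip tooling can be used locally; nothing here is specific to xpk beyond the package name:

```shell
# Show the locally installed xpk version and metadata (mirrors the PKG-INFO fields above)
pip show xpk

# List the versions published to PyPI (pip >= 21.2; pip still labels this subcommand experimental)
pip index versions xpk
```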
@@ -46,23 +46,18 @@ Dynamic: license-file
 limitations under the License.
 -->
 
-[](https://github.com/google/xpk/actions/workflows/build_tests.yaml)
-[](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml)
-[](https://github.com/google/xpk/actions/workflows/build_tests.yaml?query=branch%3Amain)
+[](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml?query=branch%3Amain)
+[](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml?query=branch%3Adevelop)
+[](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml?query=branch%3Adevelop)
 
 # Overview
 
-xpk [Accelerated Processing Kit, pronounced x-p-k] is a software tool to help
-Cloud developers to orchestrate training jobs on accelerators such as TPUs and
-GPUs on GKE. xpk handles the "multihost pods" of TPUs, GPUs (HGX H100) and CPUs
-(n2-standard-32) as first class citizens.
+XPK (Accelerated Processing Kit, pronounced x-p-k) is a command line interface that simplifies cluster creation and workload execution on Google Kubernetes Engine (GKE). XPK generates preconfigured, training-optimized clusters and allows easy workload scheduling without any Kubernetes expertise.
 
-xpk decouples provisioning capacity from running jobs. There are two structures:
-clusters (provisioned VMs) and workloads (training jobs). Clusters represent the
-physical resources you have available. Workloads represent training jobs -- at
-any time some of these will be completed, others will be running and some will
-be queued, waiting for cluster resources to become available.
+XPK is recommended for quick creation of GKE clusters for proofs of concepts and testing.
+
+XPK decouples provisioning capacity from running jobs. There are two structures: clusters (provisioned VMs) and workloads (training jobs). Clusters represent the physical resources you have available. Workloads represent training jobs -- at any time some of these will be completed, others will be running and some will be queued, waiting for cluster resources to become available.
 
 The ideal workflow starts by provisioning the clusters for all of the ML
 hardware you have reserved. Then, without re-provisioning, submit jobs as
@@ -73,7 +68,7 @@ return the hardware back to the shared pool when they complete, developers can
 achieve better use of finite hardware resources. And automated tests can run
 overnight while resources tend to be underutilized.
 
-xpk supports the following TPU types:
+XPK supports the following TPU types:
 * v4
 * v5e
 * v5p
@@ -82,13 +77,14 @@ xpk supports the following TPU types:
 and the following GPU types:
 * A100
 * A3-Highgpu (h100)
-* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-
-* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-
+* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
+* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
+* A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
 
 and the following CPU types:
 * n2-standard-32
 
-xpk also supports [Google Cloud Storage solutions](#storage):
+XPK also supports [Google Cloud Storage solutions](#storage):
 * [Cloud Storage FUSE](#fuse)
 * [Filestore](#filestore)
 * [Parallelstore](#parallelstore)
@@ -106,77 +102,93 @@ xpk also supports [Google Cloud Storage solutions](#storage):
 * Vertex AI Administrator
 * Filestore Editor (This role is neccessary if you want to run `storage create` command with `--type=gcpfilestore`)
 
-#
+# Installation
+
+There are 2 ways to install XPK:
+
+- via Python package installer (`pip`),
+- clone from git and build from source.
+
+## Prerequisites
 
-
+The following tools must be installed:
 
-- python >= 3.10
-- pip
-- python venv
+- python >= 3.10: download from [here](https://www.python.org/downloads/)
+- pip: [installation instructions](https://pip.pypa.io/en/stable/installation/)
+- python venv: [installation instructions](https://virtualenv.pypa.io/en/latest/installation.html)
   (all three of above can be installed at once from [here](https://packaging.python.org/en/latest/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers))
-- gcloud
+- gcloud: install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the) and then:
   - Run `gcloud init`
   - [Authenticate](https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login) to Google Cloud
-- kubectl
+- kubectl: install from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) and then:
   - Install `gke-gcloud-auth-plugin` from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_plugin)
-- docker
+- docker: [installation instructions](https://docs.docker.com/engine/install/) and then:
+  - Configure sudoless docker: [guide](https://docs.docker.com/engine/install/linux-postinstall/)
   - Run `gcloud auth configure-docker` to ensure images can be uploaded to registry
-- make - please run below command.
-```shell
-# sudo may be required
-apt-get -y install make
-```
-In addition, below dependencies can be installed either using provided links or using `make install` command, if xpk is downloaded via `git clone` command:
-- kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
-- kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
 
-
-
+### Additional prerequisites when installing from pip
+
+- kueuectl: install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/)
+- kjob: installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md)
+
+### Additional prerequisites when installing from source
+
+- git: [installation instructions](https://git-scm.com/downloads/linux)
+- make: install by running `apt-get -y install make` (`sudo` might be required)
+
+## Installation via pip
+
+To install XPK using pip, first install required tools mentioned in [prerequisites](#prerequisites) and [additional prerequisites](#additional-prerequisites-when-installing-from-pip). Then you can install XPK simply by running:
 
 ```shell
 pip install xpk
 ```
 
-If you see an error saying: `This environment is externally managed`, please use a virtual environment.
+If you see an error saying: `This environment is externally managed`, please use a virtual environment. For example:
 
 ```shell
-
-
-
-
-
-
-
+# One time step of creating the virtual environment
+VENV_DIR=~/venvp3
+python3 -m venv $VENV_DIR
+
+# Activate your virtual environment
+source $VENV_DIR/bin/activate
+
+# Install XPK in virtual environment using pip
+pip install xpk
 ```
 
-
-
+## Installation from source
+
+To install XPK from source, first install required tools mentioned in [prerequisites](#prerequisites) and [additional prerequisites](#additional-prerequisites-when-installing-from-source). Afterwards you can install XPK from source using `make`
 
 ```shell
+# Clone the XPK repository
 git clone https://github.com/google/xpk.git
 cd xpk
-
+
+# Install required dependencies and build XPK with make
 make install && export PATH=$PATH:$PWD/bin
 ```
 
-If you want
-`echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc
+If you want the dependecies to be available in your PATH please run: `echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc file.
 
-If you see an error saying: `This environment is externally managed`, please use a virtual environment.
-
-Example:
+If you see an error saying: `This environment is externally managed`, please use a virtual environment. For example:
 
 ```shell
-
-
-
-
-
-
-
-
-
-
+# One time step of creating the virtual environment
+VENV_DIR=~/venvp3
+python3 -m venv $VENV_DIR
+
+# Activate your virtual environment
+source $VENV_DIR/bin/activate
+
+# Clone the XPK repository
+git clone https://github.com/google/xpk.git
+cd xpk
+
+# Install required dependencies and build XPK with make
+make install && export PATH=$PATH:$PWD/bin
 ```
 
 # XPK for Large Scale (>1k VMs)
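After either installation path described in this hunk, a short sanity check along these lines can confirm the CLI and the tools it shells out to are reachable (this assumes the `xpk` entry point and the binaries from `make install` are on `PATH`; `xpk version` is inferred from the `version.py` command module in the file list above):

```shell
# Confirm the CLI resolves and report its version
which xpk
xpk version

# Confirm the external tools xpk relies on are installed
gcloud version
kubectl version --client
docker --version
```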
@@ -247,6 +259,13 @@ all zones.
 --num-slices=4 --spot
 ```
 
+* Cluster Create (DWS flex queued capacity):
+```shell
+python3 xpk.py cluster create \
+--cluster xpk-test --tpu-type=v5litepod-16 \
+--num-slices=4 --flex
+```
+
 * Cluster Create for Pathways:
 Pathways compatible cluster can be created using `cluster create-pathways`.
 ```shell
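The `--flex` example above only submits the creation request; as an illustrative follow-up (not taken from the README), the resulting cluster and the Kueue objects xpk installs can be checked with standard tooling once creation finishes:

```shell
# Confirm the GKE cluster exists (project flag as used in the surrounding examples)
gcloud container clusters list --project=$PROJECT_ID --filter="name=xpk-test"

# Kueue is installed by xpk; its queues become visible once the cluster is ready
kubectl get clusterqueues
kubectl get localqueues --all-namespaces
```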
@@ -457,25 +476,49 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
 --tpu-type=v5litepod-16
 ```
 
-## Provisioning A3
-To create a cluster with A3 machines, run the below
-* For A3-Ultra: --device-type=h200-141gb-8
-* For A3-Mega: --device-type=h100-mega-80gb-8
+## Provisioning A3 Ultra, A3 Mega and A4 clusters (GPU machines)
+To create a cluster with A3 or A4 machines, run the command below with selected device type. To create workloads on these clusters see [here](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines).
 
-
-
-
+**Note:** Creating A3 Ultra, A3 Mega and A4 clusters is currently supported **only** on linux/amd64 architecture.
+
+Machine | Device type
+:- | :-
+A3 Mega | `h100-mega-80gb-8`
+A3 Ultra | `h200-141gb-8`
+A4 | `b200-8`
+
+
+```shell
+python3 xpk.py cluster create \
+--cluster CLUSTER_NAME --device-type DEVICE_TYPE \
 --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
---num-nodes
-
-
-
-*
-*
-*
-*
-*
+--num-nodes=$NUM_NODES --reservation=$RESERVATION_ID
+```
+
+Currently, the below flags/arguments are supported for A3 Mega, A3 Ultra and A4 machines:
+* `--num-nodes`
+* `--default-pool-cpu-machine-type`
+* `--default-pool-cpu-num-nodes`
+* `--reservation`
+* `--spot`
+* `--on-demand` (A3 Mega only)
+* `--flex`
+
+## Running XPK on existing clusters
+
+In order to run XPK commands on a cluster it needs to be set up correctly. This is done automatically when creating a cluster using `xpk cluster create`. For clusters created differently (e.g.: with 'gcloud' or a Cluster Toolkit blueprint) there is a dedicated command: `xpk cluster adapt`. This command installs required config maps, kueue, jobset, CSI drivers etc.
+
+Currently `xpk cluster adapt` supports only the following device types:
 
+- `h200-141gb-8` (A3 Ultra)
+
+Example usage:
+```shell
+python3 xpk.py cluster adapt \
+--cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
+--zone=$COMPUTE_ZONE --project=$PROJECT_ID \
+--num-nodes=$NUM_NODES --reservation=$RESERVATION_ID
+```
 
 ## Storage
 Currently XPK supports the below types of storages:
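The `$CLUSTER_NAME`, `$DEVICE_TYPE`, `$NUM_NODES`, and `$RESERVATION_ID` placeholders used by the `cluster create` and `cluster adapt` examples above are not defined in this excerpt. Purely as an illustration (all values hypothetical), they could be exported like this beforehand:

```shell
# Hypothetical values for the variables referenced by the cluster create / adapt examples
export PROJECT_ID=my-gcp-project
export COMPUTE_ZONE=us-central1-a
export CLUSTER_NAME=xpk-a3ultra-test
export DEVICE_TYPE=h200-141gb-8   # A3 Ultra, per the device-type table above
export NUM_NODES=2
export RESERVATION_ID=my-a3-reservation
```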
@@ -483,9 +526,10 @@ Currently XPK supports the below types of storages:
 - [Google Cloud Filestore](#filestore)
 - [Google Cloud Parallelstore](#parallelstore)
 - [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
+- [Google Cloud Managed Lustre](#managed-lustre)
 
 ### FUSE
-A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so
+A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so workloads can read and write objects in your bucket using standard file system semantics.
 
 To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).
 
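The FUSE section assumes the Cloud Storage bucket already exists. A minimal sketch of creating one with gcloud (bucket name and location are placeholders) before attaching it to the cluster:

```shell
# Create a bucket to mount via GCS FUSE; name and location are illustrative
gcloud storage buckets create gs://my-xpk-training-data \
  --project=$PROJECT_ID --location=us-central1
```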
@@ -507,11 +551,12 @@ Parameters:
 - `--size` - size of the storage in Gb.
 - `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
 - `--mount-options` - comma-separated list of additional mount options for PersistentVolume ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#mount-options)).
+- `--prefetch-metadata` - enables metadata pre-population when mounting the volume by setting parameter `gcsfuseMetadataPrefetchOnMount` to `true` ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#metadata-prefetch)).
 - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions. If set, then values from manifest override the following parameters: `--size` and `--bucket`.
 
 ### Filestore
 
-A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so
+A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
 
 To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
 
@@ -547,7 +592,7 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
 
 ### Parallelstore
 
-A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so
+A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
 
 To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
 
@@ -571,7 +616,7 @@ Parameters:
 
 ### Block storage (Persistent Disk, Hyperdisk)
 
-A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so
+A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
 
 To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please consider that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
 
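The block-storage path above expects the disk to be created outside of xpk. A minimal, illustrative gcloud call (disk name, type, and size are placeholders; keep the compatibility note above in mind):

```shell
# Create a Persistent Disk in the same zone as the cluster's nodepools
gcloud compute disks create my-xpk-disk \
  --project=$PROJECT_ID --zone=$COMPUTE_ZONE \
  --type=pd-balanced --size=500GB
```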
@@ -593,6 +638,30 @@ Parameters:
 - `--readonly` - if set to true, workload can only read from storage.
 - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
 
+### Managed Lustre
+
+A Managed Lustre adaptor lets you mount and access [Google Cloud Managed Lustre instances](https://cloud.google.com/kubernetes-engine/docs/concepts/managed-lustre) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
+
+To use the GCP Managed Lustre with XPK you need to create [an instance](https://cloud.google.com/managed-lustre/docs/create-instance). Please make sure you enable GKE support when creating the instance (gcloud ex. `--gke-support-enabled`).
+
+Once it's ready you can use `xpk storage attach` with `--type=lustre` command to attach a Managed Lustre instance to your cluster. Currently, attaching a Managed Lustre instance is supported only by providing a manifest file.
+
+```shell
+python3 xpk.py storage attach test-lustre-storage --type=lustre \
+--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
+--mount-point='/test-mount-point' --readonly=false \
+--auto-mount=true \
+--manifest='./examples/storage/lustre-manifest-attach.yaml'
+```
+
+Parameters:
+
+- `--type` - type of the storage `lustre`
+- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
+- `--mount-point` - the path on which this storage should be mounted for a workload.
+- `--readonly` - if set to true, workload can only read from storage.
+- `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
+
 ### List attached storages
 
 ```shell
@@ -634,8 +703,14 @@ python3 xpk.py storage delete test-fs-instance \
 python3 xpk.py workload create \
 --workload xpk-test-workload --command "echo goodbye" \
 --cluster xpk-test \
---tpu-type=v5litepod-16 --
+--tpu-type=v5litepod-16 --project=$PROJECT
 ```
+* Workload create(DWS flex with queued provisioning):
+```shell
+python3 xpk.py workload create \
+--workload xpk-test-workload --command "echo goodbye" \
+--cluster xpk-test --flex \
+--tpu-type=v5litepod-16 --project=$PROJECT
 
 * Workload Create for Pathways:
 Pathways workload can be submitted using `workload create-pathways` on a Pathways enabled cluster (created with `cluster create-pathways`)
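For completeness (not shown in this hunk), workloads submitted with `workload create` can be inspected and removed with xpk's companion subcommands; the calls below are a sketch reusing the same cluster name as the examples above:

```shell
# List workloads on the cluster
python3 xpk.py workload list --cluster xpk-test --project=$PROJECT --zone=$ZONE

# Delete a finished or stuck workload by name
python3 xpk.py workload delete --workload xpk-test-workload --cluster xpk-test --project=$PROJECT --zone=$ZONE
```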
@@ -649,7 +724,7 @@ python3 xpk.py storage delete test-fs-instance \
 --cluster xpk-pw-test \
 --docker-name='user-workload' \
 --docker-image=<maxtext docker image> \
---command='python3 MaxText
+--command='python3 -m MaxText.train MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1 enable_single_controller=True'
 ```
 
 Regular workload can also be submitted on a Pathways enabled cluster (created with `cluster create-pathways`)
@@ -663,7 +738,7 @@ python3 xpk.py storage delete test-fs-instance \
 --cluster xpk-pw-test \
 --docker-name='user-workload' \
 --docker-image=<maxtext docker image> \
---command='python3 MaxText
+--command='python3 -m MaxText.train MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
 ```
 
 Pathways in headless mode - Pathways now offers the capability to run JAX workloads in Vertex AI notebooks or in GCE VMs!
@@ -693,21 +768,27 @@ increase this to a large number, say 50. Real jobs can be interrupted due to
 hardware failures and software updates. We assume your job has implemented
 checkpointing so the job restarts near where it was interrupted.
 
-### Workloads for A3
-To submit jobs on a cluster with A3 machines, run the
-* For A3-Ultra: --device-type=h200-141gb-8
-* For A3-Mega: --device-type=h100-mega-80gb-8
+### Workloads for A3 Ultra, A3 Mega and A4 clusters (GPU machines)
+To submit jobs on a cluster with A3 or A4 machines, run the command with selected device type. To create a cluster with A3 or A4 machines see [here](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines).
 
-
-
+
+Machine | Device type
+:- | :-
+A3 Mega | `h100-mega-80gb-8`
+A3 Ultra | `h200-141gb-8`
+A4 | `b200-8`
+
+```shell
+python3 xpk.py workload create \
 --workload=$WORKLOAD_NAME --command="echo goodbye" \
---cluster=$CLUSTER_NAME --device-type
+--cluster=$CLUSTER_NAME --device-type DEVICE_TYPE \
 --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
 --num-nodes=$WOKRKLOAD_NUM_NODES
-
-
+```
+
+> The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 or A4 machines as well.
 
-In order to run NCCL test on A3
+In order to run NCCL test on A3 machines check out [this guide](/examples/nccl/nccl.md).
 
 ### Workload Priority and Preemption
 * Set the priority level of your workload with `--priority=LEVEL`
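As the note in this hunk says, the docker image flags from the workload section also apply to the GPU machines. A hedged example combining them (workload name and image path are hypothetical; `--docker-image` is the flag the README refers to):

```shell
python3 xpk.py workload create \
  --workload=my-gpu-smoke-test --command="nvidia-smi" \
  --cluster=$CLUSTER_NAME --device-type=h200-141gb-8 \
  --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
  --num-nodes=2 \
  --docker-image=us-docker.pkg.dev/my-project/my-repo/my-training-image:latest
```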
@@ -1554,4 +1635,5 @@ python xpk.py batch [other-options] --kind-cluster script
 Please note that all other xpk subcommands are intended for use with cloud systems on Google Cloud Engine (GCE) and don't support local testing. This includes commands like cluster, info, inspector, etc.
 
 # Other advanced usage
-[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)
+[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md) \
+[Use Slurm like commands in XPK to execute workloads on top of GKE](xpk-slurm-commands.md)