xpk 0.8.0.tar.gz → 0.9.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. {xpk-0.8.0/src/xpk.egg-info → xpk-0.9.0}/PKG-INFO +134 -91
  2. {xpk-0.8.0 → xpk-0.9.0}/README.md +133 -90
  3. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/batch.py +2 -3
  4. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/cluster.py +225 -73
  5. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/common.py +33 -1
  6. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/kjob_common.py +10 -1
  7. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/run.py +2 -3
  8. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/storage.py +14 -3
  9. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/workload.py +17 -15
  10. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/blueprint/blueprint_generator.py +18 -18
  11. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/cluster.py +119 -8
  12. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/config.py +1 -1
  13. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/filestore.py +2 -6
  14. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/gcsfuse.py +22 -4
  15. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/kjob.py +20 -13
  16. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/kueue.py +30 -0
  17. xpk-0.9.0/src/xpk/core/mtc.py +195 -0
  18. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/network.py +23 -1
  19. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/pathways.py +1 -1
  20. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/resources.py +21 -0
  21. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/workload.py +1 -1
  22. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/workload_decorators/rdma_decorator.py +6 -10
  23. xpk-0.9.0/src/xpk/core/workload_decorators/tcpx_decorator.py +179 -0
  24. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +15 -14
  25. xpk-0.9.0/src/xpk/parser/cluster.py +855 -0
  26. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/storage.py +11 -2
  27. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/kubectl.py +4 -1
  28. {xpk-0.8.0 → xpk-0.9.0/src/xpk.egg-info}/PKG-INFO +134 -91
  29. {xpk-0.8.0 → xpk-0.9.0}/src/xpk.egg-info/SOURCES.txt +2 -0
  30. xpk-0.8.0/src/xpk/parser/cluster.py +0 -671
  31. {xpk-0.8.0 → xpk-0.9.0}/LICENSE +0 -0
  32. {xpk-0.8.0 → xpk-0.9.0}/pyproject.toml +0 -0
  33. {xpk-0.8.0 → xpk-0.9.0}/setup.cfg +0 -0
  34. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/__init__.py +0 -0
  35. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/api/__init__.py +0 -0
  36. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/api/storage_crd.yaml +0 -0
  37. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/__init__.py +0 -0
  38. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/cluster_gcluster.py +0 -0
  39. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/config.py +0 -0
  40. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/info.py +0 -0
  41. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/inspector.py +0 -0
  42. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/job.py +0 -0
  43. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/kind.py +0 -0
  44. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/shell.py +0 -0
  45. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/commands/version.py +0 -0
  46. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/__init__.py +0 -0
  47. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/blueprint/__init__.py +0 -0
  48. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  49. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/capacity.py +0 -0
  50. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/cluster_private.py +0 -0
  51. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/commands.py +0 -0
  52. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/docker_container.py +0 -0
  53. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/docker_image.py +0 -0
  54. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/docker_manager.py +0 -0
  55. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/docker_resources.py +0 -0
  56. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/gcloud_context.py +0 -0
  57. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/gcluster_manager.py +0 -0
  58. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/monitoring.py +0 -0
  59. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/nap.py +0 -0
  60. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/nodepool.py +0 -0
  61. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/ray.py +0 -0
  62. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/remote_state/__init__.py +0 -0
  63. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  64. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  65. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/scheduling.py +0 -0
  66. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/storage.py +0 -0
  67. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/system_characteristics.py +0 -0
  68. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/vertex.py +0 -0
  69. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
  70. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  71. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/main.py +0 -0
  72. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/__init__.py +0 -0
  73. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/batch.py +0 -0
  74. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/common.py +0 -0
  75. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/config.py +0 -0
  76. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/core.py +0 -0
  77. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/info.py +0 -0
  78. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/inspector.py +0 -0
  79. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/job.py +0 -0
  80. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/kind.py +0 -0
  81. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/run.py +0 -0
  82. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/shell.py +0 -0
  83. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/validators.py +0 -0
  84. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/version.py +0 -0
  85. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/parser/workload.py +0 -0
  86. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/templates/__init__.py +0 -0
  87. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/templates/storage.yaml +0 -0
  88. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/__init__.py +0 -0
  89. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/console.py +0 -0
  90. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/file.py +0 -0
  91. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/gcs_utils.py +0 -0
  92. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/network.py +0 -0
  93. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/objects.py +0 -0
  94. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/templates.py +0 -0
  95. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/validation.py +0 -0
  96. {xpk-0.8.0 → xpk-0.9.0}/src/xpk/utils/yaml.py +0 -0
  97. {xpk-0.8.0 → xpk-0.9.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  98. {xpk-0.8.0 → xpk-0.9.0}/src/xpk.egg-info/entry_points.txt +0 -0
  99. {xpk-0.8.0 → xpk-0.9.0}/src/xpk.egg-info/requires.txt +0 -0
  100. {xpk-0.8.0 → xpk-0.9.0}/src/xpk.egg-info/top_level.txt +0 -0
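
To reproduce this diff locally instead of relying on the registry view, one minimal sketch (assuming `pip download` can reach PyPI; the `/tmp/xpk-diff` path is arbitrary):

```shell
# Fetch both source distributions without dependencies
pip download xpk==0.8.0 --no-deps --no-binary :all: -d /tmp/xpk-diff
pip download xpk==0.9.0 --no-deps --no-binary :all: -d /tmp/xpk-diff

# Unpack and compare; this yields the same file-level changes listed above
cd /tmp/xpk-diff
tar -xzf xpk-0.8.0.tar.gz
tar -xzf xpk-0.9.0.tar.gz
diff -ruN xpk-0.8.0 xpk-0.9.0 | less
```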
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: xpk
- Version: 0.8.0
+ Version: 0.9.0
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
  Author-email: XPK team <xpk-code-reviewers@google.com>
  License: Apache-2.0
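
The metadata hunk above is just the version bump. If 0.8.0 is already installed from PyPI, picking up this release is a single pip step (the pinned version is shown for clarity):

```shell
# Upgrade to the release covered by this diff
pip install --upgrade xpk==0.9.0

# Verify the installed version
pip show xpk | grep -i '^version'
```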
@@ -46,23 +46,18 @@ Dynamic: license-file
  limitations under the License.
  -->

- [![Build Tests](https://github.com/google/xpk/actions/workflows/build_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/build_tests.yaml)
- [![Nightly Tests](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml)
- [![Develop Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml)
- [![Develop Nightly Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml)
+ [![Build Tests](https://github.com/google/xpk/actions/workflows/build_tests.yaml/badge.svg?query=branch%3Amain)](https://github.com/google/xpk/actions/workflows/build_tests.yaml?query=branch%3Amain)
+ [![Nightly Tests](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml/badge.svg?query=branch%3Amain)](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml?query=branch%3Amain)
+ [![Develop Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml/badge.svg?query=branch%3Adevelop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml?query=branch%3Adevelop)
+ [![Develop Nightly Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml/badge.svg?query=branch%3Adevelop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml?query=branch%3Adevelop)

  # Overview

- xpk (Accelerated Processing Kit, pronounced x-p-k,) is a software tool to help
- Cloud developers to orchestrate training jobs on accelerators such as TPUs and
- GPUs on GKE. xpk handles the "multihost pods" of TPUs, GPUs (HGX H100) and CPUs
- (n2-standard-32) as first class citizens.
+ XPK (Accelerated Processing Kit, pronounced x-p-k) is a command line interface that simplifies cluster creation and workload execution on Google Kubernetes Engine (GKE). XPK generates preconfigured, training-optimized clusters and allows easy workload scheduling without any Kubernetes expertise.

- xpk decouples provisioning capacity from running jobs. There are two structures:
- clusters (provisioned VMs) and workloads (training jobs). Clusters represent the
- physical resources you have available. Workloads represent training jobs -- at
- any time some of these will be completed, others will be running and some will
- be queued, waiting for cluster resources to become available.
+ XPK is recommended for quick creation of GKE clusters for proofs of concept and testing.
+
+ XPK decouples provisioning capacity from running jobs. There are two structures: clusters (provisioned VMs) and workloads (training jobs). Clusters represent the physical resources you have available. Workloads represent training jobs -- at any time some of these will be completed, others will be running and some will be queued, waiting for cluster resources to become available.

  The ideal workflow starts by provisioning the clusters for all of the ML
  hardware you have reserved. Then, without re-provisioning, submit jobs as
@@ -73,7 +68,7 @@ return the hardware back to the shared pool when they complete, developers can
  achieve better use of finite hardware resources. And automated tests can run
  overnight while resources tend to be underutilized.

- xpk supports the following TPU types:
+ XPK supports the following TPU types:
  * v4
  * v5e
  * v5p
@@ -82,13 +77,14 @@ xpk supports the following TPU types:
  and the following GPU types:
  * A100
  * A3-Highgpu (h100)
- * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines)
- * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines)
+ * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
+ * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
+ * A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)

  and the following CPU types:
  * n2-standard-32

- xpk also supports [Google Cloud Storage solutions](#storage):
+ XPK also supports [Google Cloud Storage solutions](#storage):
  * [Cloud Storage FUSE](#fuse)
  * [Filestore](#filestore)
  * [Parallelstore](#parallelstore)
@@ -106,77 +102,93 @@ xpk also supports [Google Cloud Storage solutions](#storage):
  * Vertex AI Administrator
  * Filestore Editor (This role is necessary if you want to run the `storage create` command with `--type=gcpfilestore`)

- # Prerequisites
+ # Installation
+
+ There are 2 ways to install XPK:

- Following tools must be installed:
+ - via the Python package installer (`pip`),
+ - by cloning from git and building from source.

- - python >= 3.10 (download from [here](https://www.python.org/downloads/))
- - pip ([installation instruction](https://pip.pypa.io/en/stable/installation/))
- - python venv ([installation instruction](https://virtualenv.pypa.io/en/latest/installation.html))
+ ## Prerequisites
+
+ The following tools must be installed:
+
+ - python >= 3.10: download from [here](https://www.python.org/downloads/)
+ - pip: [installation instructions](https://pip.pypa.io/en/stable/installation/)
+ - python venv: [installation instructions](https://virtualenv.pypa.io/en/latest/installation.html)
  (all three of the above can be installed at once from [here](https://packaging.python.org/en/latest/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers))
- - gcloud (install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the))
+ - gcloud: install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the) and then:
  - Run `gcloud init`
  - [Authenticate](https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login) to Google Cloud
- - kubectl (install from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl))
+ - kubectl: install from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) and then:
  - Install `gke-gcloud-auth-plugin` from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_plugin)
- - docker ([installation instruction](https://docs.docker.com/engine/install/))
+ - docker: [installation instructions](https://docs.docker.com/engine/install/) and then:
+ - Configure sudoless docker: [guide](https://docs.docker.com/engine/install/linux-postinstall/)
  - Run `gcloud auth configure-docker` to ensure images can be uploaded to the registry
- - make - please run below command.
- ```shell
- # sudo may be required
- apt-get -y install make
- ```
- In addition, below dependencies can be installed either using provided links or using `make install` command, if xpk is downloaded via `git clone` command:
- - kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
- - kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))

- # Installation
- To install xpk, install required tools mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all neccessary tools. XPK can be installed via pip:
+ ### Additional prerequisites when installing from pip
+
+ - kueuectl: install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/)
+ - kjob: installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md)
+
+ ### Additional prerequisites when installing from source
+
+ - git: [installation instructions](https://git-scm.com/downloads/linux)
+ - make: install by running `apt-get -y install make` (`sudo` might be required)
+
+ ## Installation via pip
+
+ To install XPK using pip, first install the required tools mentioned in [prerequisites](#prerequisites) and [additional prerequisites](#additional-prerequisites-when-installing-from-pip). Then you can install XPK simply by running:

  ```shell
  pip install xpk
  ```

- If you see an error saying: `This environment is externally managed`, please use a virtual environment.
+ If you see an error saying: `This environment is externally managed`, please use a virtual environment. For example:

  ```shell
- ## One time step of creating the venv
- VENV_DIR=~/venvp3
- python3 -m venv $VENV_DIR
- ## Enter your venv.
- source $VENV_DIR/bin/activate
- ## Clone the repository and installing dependencies.
- pip install xpk
+ # One time step of creating the virtual environment
+ VENV_DIR=~/venvp3
+ python3 -m venv $VENV_DIR
+
+ # Activate your virtual environment
+ source $VENV_DIR/bin/activate
+
+ # Install XPK in the virtual environment using pip
+ pip install xpk
  ```

- If you are running XPK by cloning GitHub repository, first run the
- following commands to begin using XPK commands:
+ ## Installation from source
+
+ To install XPK from source, first install the required tools mentioned in [prerequisites](#prerequisites) and [additional prerequisites](#additional-prerequisites-when-installing-from-source). Afterwards you can install XPK from source using `make`:

  ```shell
+ # Clone the XPK repository
  git clone https://github.com/google/xpk.git
  cd xpk
- # Install required dependencies with make
+
+ # Install required dependencies and build XPK with make
  make install && export PATH=$PATH:$PWD/bin
  ```

- If you want to have installed dependecies persist in your PATH please run:
- `echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc
-
- If you see an error saying: `This environment is externally managed`, please use a virtual environment.
+ If you want the dependencies to be available in your PATH, please run `echo $PWD/bin` and add its value to `PATH` in your .bashrc or .zshrc file.

- Example:
+ If you see an error saying: `This environment is externally managed`, please use a virtual environment. For example:

  ```shell
- ## One time step of creating the venv
- VENV_DIR=~/venvp3
- python3 -m venv $VENV_DIR
- ## Enter your venv.
- source $VENV_DIR/bin/activate
- ## Clone the repository and installing dependencies.
- git clone https://github.com/google/xpk.git
- cd xpk
- # Install required dependencies with make
- make install && export PATH=$PATH:$PWD/bin
+ # One time step of creating the virtual environment
+ VENV_DIR=~/venvp3
+ python3 -m venv $VENV_DIR
+
+ # Activate your virtual environment
+ source $VENV_DIR/bin/activate
+
+ # Clone the XPK repository
+ git clone https://github.com/google/xpk.git
+ cd xpk
+
+ # Install required dependencies and build XPK with make
+ make install && export PATH=$PATH:$PWD/bin
  ```

  # XPK for Large Scale (>1k VMs)
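
Whichever installation path the rewritten README describes, a quick smoke test is to invoke the CLI itself; `xpk version` is assumed to be available here because `src/xpk/commands/version.py` ships unchanged in the file list above:

```shell
# Sanity check after `pip install xpk` or `make install && export PATH=$PATH:$PWD/bin`
xpk version
```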
@@ -457,25 +469,48 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
  --tpu-type=v5litepod-16
  ```

- ## Provisioning A3-Ultra and A3-Mega clusters (GPU machines)
- To create a cluster with A3 machines, run the below command. To create workloads on these clusters see [here](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines).
- * For A3-Ultra: --device-type=h200-141gb-8
- * For A3-Mega: --device-type=h100-mega-80gb-8
+ ## Provisioning A3 Ultra, A3 Mega and A4 clusters (GPU machines)
+ To create a cluster with A3 or A4 machines, run the command below with the selected device type. To create workloads on these clusters see [here](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines).

- ```shell
- python3 xpk.py cluster create \
- --cluster CLUSTER_NAME --device-type=h200-141gb-8 \
+ **Note:** Creating A3 Ultra, A3 Mega and A4 clusters is currently supported **only** on the linux/amd64 architecture.
+
+ Machine | Device type
+ :- | :-
+ A3 Mega | `h100-mega-80gb-8`
+ A3 Ultra | `h200-141gb-8`
+ A4 | `b200-8`
+
+ ```shell
+ python3 xpk.py cluster create \
+ --cluster CLUSTER_NAME --device-type DEVICE_TYPE \
  --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
- --num-nodes=4 --reservation=$RESERVATION_ID
- ```
- Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra machines:
- * --num-nodes
- * --default-pool-cpu-machine-type
- * --default-pool-cpu-num-nodes
- * --reservation
- * --spot
- * --on-demand (only A3-Mega)
+ --num-nodes=$NUM_NODES --reservation=$RESERVATION_ID
+ ```
+
+ Currently, the below flags/arguments are supported for A3 Mega, A3 Ultra and A4 machines:
+ * `--num-nodes`
+ * `--default-pool-cpu-machine-type`
+ * `--default-pool-cpu-num-nodes`
+ * `--reservation`
+ * `--spot`
+ * `--on-demand` (A3 Mega only)
+
+ ## Running XPK on existing clusters
+
+ In order to run XPK commands on a cluster, it needs to be set up correctly. This is done automatically when creating a cluster using `xpk cluster create`. For clusters created differently (e.g. with `gcloud` or a Cluster Toolkit blueprint) there is a dedicated command: `xpk cluster adapt`. This command installs the required config maps, Kueue, JobSet, CSI drivers, etc.
+
+ Currently `xpk cluster adapt` supports only the following device types:

+ - `h200-141gb-8` (A3 Ultra)
+
+ Example usage:
+ ```shell
+ python3 xpk.py cluster adapt \
+ --cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
+ --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
+ --num-nodes=$NUM_NODES --reservation=$RESERVATION_ID
+ ```

  ## Storage
  Currently XPK supports the below types of storages:
@@ -507,6 +542,7 @@ Parameters:
  - `--size` - size of the storage in GB.
  - `--bucket` - name of the storage bucket. If not set, then the name of the storage is used as the bucket name.
  - `--mount-options` - comma-separated list of additional mount options for the PersistentVolume ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#mount-options)).
+ - `--prefetch-metadata` - enables metadata pre-population when mounting the volume by setting the parameter `gcsfuseMetadataPrefetchOnMount` to `true` ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#metadata-prefetch)).
  - `--manifest` - path to the manifest file containing the PersistentVolume and PersistentVolumeClaim definitions. If set, then values from the manifest override the following parameters: `--size` and `--bucket`.

  ### Filestore
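
The new `--prefetch-metadata` flag lands without a usage example in this hunk. A hypothetical `storage create` call combining the documented FUSE parameters could look like the sketch below; the positional storage name and `--type=gcsfuse` are assumptions modeled on the `--type=gcpfilestore` example mentioned earlier in the README, not taken from this diff:

```shell
# Hypothetical sketch: create a GCS FUSE storage with metadata prefetch enabled
python3 xpk.py storage create my-fuse-storage \
  --cluster=$CLUSTER_NAME --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
  --type=gcsfuse \
  --size=1024 --bucket=$BUCKET_NAME \
  --mount-options=implicit-dirs \
  --prefetch-metadata
```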
@@ -649,7 +685,7 @@ python3 xpk.py storage delete test-fs-instance \
  --cluster xpk-pw-test \
  --docker-name='user-workload' \
  --docker-image=<maxtext docker image> \
- --command='python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
+ --command='python3 -m MaxText.train MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1 enable_single_controller=True'
  ```

  A regular workload can also be submitted on a Pathways-enabled cluster (created with `cluster create-pathways`)
@@ -663,7 +699,7 @@ python3 xpk.py storage delete test-fs-instance \
  --cluster xpk-pw-test \
  --docker-name='user-workload' \
  --docker-image=<maxtext docker image> \
- --command='python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
+ --command='python3 -m MaxText.train MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
  ```

  Pathways in headless mode - Pathways now offers the capability to run JAX workloads in Vertex AI notebooks or in GCE VMs!
@@ -693,21 +729,27 @@ increase this to a large number, say 50. Real jobs can be interrupted due to
  hardware failures and software updates. We assume your job has implemented
  checkpointing so the job restarts near where it was interrupted.

- ### Workloads for A3-Ultra and A3-Mega clusters (GPU machines)
- To submit jobs on a cluster with A3 machines, run the below command. To create a cluster with A3 machines see [here](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines).
- * For A3-Ultra: --device-type=h200-141gb-8
- * For A3-Mega: --device-type=h100-mega-80gb-8
+ ### Workloads for A3 Ultra, A3 Mega and A4 clusters (GPU machines)
+ To submit jobs on a cluster with A3 or A4 machines, run the command below with the selected device type. To create a cluster with A3 or A4 machines see [here](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines).

- ```shell
- python3 xpk.py workload create \
+
+ Machine | Device type
+ :- | :-
+ A3 Mega | `h100-mega-80gb-8`
+ A3 Ultra | `h200-141gb-8`
+ A4 | `b200-8`
+
+ ```shell
+ python3 xpk.py workload create \
  --workload=$WORKLOAD_NAME --command="echo goodbye" \
- --cluster=$CLUSTER_NAME --device-type=h200-141gb-8 \
+ --cluster=$CLUSTER_NAME --device-type DEVICE_TYPE \
  --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
  --num-nodes=$WORKLOAD_NUM_NODES
- ```
- > The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 machines as well.
+ ```
+
+ > The docker image flags/arguments introduced in the [workloads section](#workload-create) can be used with A3 or A4 machines as well.

- In order to run NCCL test on A3 Ultra machines check out [this guide](/examples/nccl/nccl.md).
+ In order to run an NCCL test on A3 machines, check out [this guide](/examples/nccl/nccl.md).

  ### Workload Priority and Preemption
  * Set the priority level of your workload with `--priority=LEVEL`
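
Tying the two context lines above together, here is a sketch of the A3/A4 workload command with an explicit priority; `medium` is an assumed LEVEL value, since the valid levels are documented outside this hunk:

```shell
# Sketch: the workload command from this hunk plus an assumed --priority value
python3 xpk.py workload create \
  --workload=$WORKLOAD_NAME --command="echo goodbye" \
  --cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
  --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
  --num-nodes=$WORKLOAD_NUM_NODES \
  --priority=medium
```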
@@ -1554,4 +1596,5 @@ python xpk.py batch [other-options] --kind-cluster script
  Please note that all other xpk subcommands are intended for use with cloud systems on Google Compute Engine (GCE) and don't support local testing. This includes commands like cluster, info, inspector, etc.

  # Other advanced usage
- [Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)
+ [Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md) \
+ [Use Slurm-like commands in XPK to execute workloads on top of GKE](xpk-slurm-commands.md)
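
The hunk header above quotes the README's `xpk batch` usage for local kind clusters; a self-contained sketch of that flow, assuming a kind cluster is already running and `script` is any local shell script:

```shell
# Write a trivial batch script and submit it to the local kind cluster
cat > script <<'EOF'
#!/bin/bash
echo "hello from xpk batch"
EOF

python xpk.py batch --kind-cluster script
```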