xpk 0.8.0__tar.gz → 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. {xpk-0.8.0/src/xpk.egg-info → xpk-0.10.0}/PKG-INFO +178 -96
  2. {xpk-0.8.0 → xpk-0.10.0}/README.md +177 -95
  3. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/batch.py +5 -6
  4. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/cluster.py +246 -73
  5. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/cluster_gcluster.py +27 -0
  6. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/common.py +40 -1
  7. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/kjob_common.py +13 -1
  8. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/run.py +4 -5
  9. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/shell.py +2 -2
  10. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/storage.py +24 -6
  11. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/workload.py +66 -27
  12. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/blueprint/blueprint_generator.py +115 -47
  13. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/capacity.py +66 -6
  14. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/cluster.py +282 -13
  15. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/config.py +1 -65
  16. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/docker_manager.py +1 -1
  17. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/docker_resources.py +145 -72
  18. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/filestore.py +2 -6
  19. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/gcsfuse.py +22 -4
  20. xpk-0.10.0/src/xpk/core/jobset.py +143 -0
  21. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/kjob.py +21 -18
  22. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/kueue.py +194 -4
  23. xpk-0.10.0/src/xpk/core/mtc.py +195 -0
  24. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/network.py +23 -1
  25. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/nodepool.py +17 -4
  26. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/pathways.py +2 -3
  27. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/resources.py +21 -0
  28. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/storage.py +1 -95
  29. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/system_characteristics.py +1 -1
  30. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/workload.py +1 -45
  31. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/rdma_decorator.py +8 -10
  32. xpk-0.10.0/src/xpk/core/workload_decorators/tcpx_decorator.py +185 -0
  33. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +22 -14
  34. xpk-0.10.0/src/xpk/parser/cluster.py +871 -0
  35. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/storage.py +12 -3
  36. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/workload.py +21 -3
  37. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/kubectl.py +4 -1
  38. {xpk-0.8.0 → xpk-0.10.0/src/xpk.egg-info}/PKG-INFO +178 -96
  39. {xpk-0.8.0 → xpk-0.10.0}/src/xpk.egg-info/SOURCES.txt +3 -0
  40. xpk-0.8.0/src/xpk/parser/cluster.py +0 -671
  41. {xpk-0.8.0 → xpk-0.10.0}/LICENSE +0 -0
  42. {xpk-0.8.0 → xpk-0.10.0}/pyproject.toml +0 -0
  43. {xpk-0.8.0 → xpk-0.10.0}/setup.cfg +0 -0
  44. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/__init__.py +0 -0
  45. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/api/__init__.py +0 -0
  46. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/api/storage_crd.yaml +0 -0
  47. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/__init__.py +0 -0
  48. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/config.py +0 -0
  49. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/info.py +0 -0
  50. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/inspector.py +0 -0
  51. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/job.py +0 -0
  52. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/kind.py +0 -0
  53. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/commands/version.py +0 -0
  54. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/__init__.py +0 -0
  55. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/blueprint/__init__.py +0 -0
  56. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  57. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/cluster_private.py +0 -0
  58. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/commands.py +0 -0
  59. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/docker_container.py +0 -0
  60. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/docker_image.py +0 -0
  61. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/gcloud_context.py +0 -0
  62. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/gcluster_manager.py +0 -0
  63. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/monitoring.py +0 -0
  64. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/nap.py +0 -0
  65. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/ray.py +0 -0
  66. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/remote_state/__init__.py +0 -0
  67. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  68. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  69. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/scheduling.py +0 -0
  70. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/vertex.py +0 -0
  71. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
  72. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  73. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/main.py +0 -0
  74. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/__init__.py +0 -0
  75. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/batch.py +0 -0
  76. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/common.py +0 -0
  77. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/config.py +0 -0
  78. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/core.py +0 -0
  79. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/info.py +0 -0
  80. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/inspector.py +0 -0
  81. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/job.py +0 -0
  82. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/kind.py +0 -0
  83. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/run.py +0 -0
  84. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/shell.py +0 -0
  85. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/validators.py +0 -0
  86. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/parser/version.py +0 -0
  87. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/templates/__init__.py +0 -0
  88. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/templates/storage.yaml +0 -0
  89. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/__init__.py +0 -0
  90. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/console.py +0 -0
  91. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/file.py +0 -0
  92. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/gcs_utils.py +0 -0
  93. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/network.py +0 -0
  94. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/objects.py +0 -0
  95. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/templates.py +0 -0
  96. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/validation.py +0 -0
  97. {xpk-0.8.0 → xpk-0.10.0}/src/xpk/utils/yaml.py +0 -0
  98. {xpk-0.8.0 → xpk-0.10.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  99. {xpk-0.8.0 → xpk-0.10.0}/src/xpk.egg-info/entry_points.txt +0 -0
  100. {xpk-0.8.0 → xpk-0.10.0}/src/xpk.egg-info/requires.txt +0 -0
  101. {xpk-0.8.0 → xpk-0.10.0}/src/xpk.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: xpk
- Version: 0.8.0
+ Version: 0.10.0
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
  Author-email: XPK team <xpk-code-reviewers@google.com>
  License: Apache-2.0
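
The metadata hunk above records the version bump from 0.8.0 to 0.10.0. A minimal sketch of pinning that released version when installing from PyPI (assuming the standard `pip` workflow described in the README excerpts below):

```shell
# Pin the exact release covered by this diff (illustrative; any pip-compatible environment works)
pip install xpk==0.10.0
```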
@@ -46,23 +46,18 @@ Dynamic: license-file
  limitations under the License.
  -->
 
- [![Build Tests](https://github.com/google/xpk/actions/workflows/build_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/build_tests.yaml)
- [![Nightly Tests](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml)
- [![Develop Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml)
- [![Develop Nightly Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml)
+ [![Build Tests](https://github.com/google/xpk/actions/workflows/build_tests.yaml/badge.svg?query=branch%3Amain)](https://github.com/google/xpk/actions/workflows/build_tests.yaml?query=branch%3Amain)
+ [![Nightly Tests](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml/badge.svg?query=branch%3Amain)](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml?query=branch%3Amain)
+ [![Develop Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml/badge.svg?query=branch%3Adevelop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml?query=branch%3Adevelop)
+ [![Develop Nightly Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml/badge.svg?query=branch%3Adevelop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml?query=branch%3Adevelop)
 
  # Overview
 
- xpk (Accelerated Processing Kit, pronounced x-p-k,) is a software tool to help
- Cloud developers to orchestrate training jobs on accelerators such as TPUs and
- GPUs on GKE. xpk handles the "multihost pods" of TPUs, GPUs (HGX H100) and CPUs
- (n2-standard-32) as first class citizens.
+ XPK (Accelerated Processing Kit, pronounced x-p-k) is a command line interface that simplifies cluster creation and workload execution on Google Kubernetes Engine (GKE). XPK generates preconfigured, training-optimized clusters and allows easy workload scheduling without any Kubernetes expertise.
 
- xpk decouples provisioning capacity from running jobs. There are two structures:
- clusters (provisioned VMs) and workloads (training jobs). Clusters represent the
- physical resources you have available. Workloads represent training jobs -- at
- any time some of these will be completed, others will be running and some will
- be queued, waiting for cluster resources to become available.
+ XPK is recommended for quick creation of GKE clusters for proofs of concepts and testing.
+
+ XPK decouples provisioning capacity from running jobs. There are two structures: clusters (provisioned VMs) and workloads (training jobs). Clusters represent the physical resources you have available. Workloads represent training jobs -- at any time some of these will be completed, others will be running and some will be queued, waiting for cluster resources to become available.
 
  The ideal workflow starts by provisioning the clusters for all of the ML
  hardware you have reserved. Then, without re-provisioning, submit jobs as
@@ -73,7 +68,7 @@ return the hardware back to the shared pool when they complete, developers can
  achieve better use of finite hardware resources. And automated tests can run
  overnight while resources tend to be underutilized.
 
- xpk supports the following TPU types:
+ XPK supports the following TPU types:
  * v4
  * v5e
  * v5p
@@ -82,13 +77,14 @@ xpk supports the following TPU types:
  and the following GPU types:
  * A100
  * A3-Highgpu (h100)
- * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines)
- * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines)
+ * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
+ * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
+ * A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
 
  and the following CPU types:
  * n2-standard-32
 
- xpk also supports [Google Cloud Storage solutions](#storage):
+ XPK also supports [Google Cloud Storage solutions](#storage):
  * [Cloud Storage FUSE](#fuse)
  * [Filestore](#filestore)
  * [Parallelstore](#parallelstore)
@@ -106,77 +102,93 @@ xpk also supports [Google Cloud Storage solutions](#storage):
  * Vertex AI Administrator
  * Filestore Editor (This role is neccessary if you want to run `storage create` command with `--type=gcpfilestore`)
 
- # Prerequisites
+ # Installation
+
+ There are 2 ways to install XPK:
+
+ - via Python package installer (`pip`),
+ - clone from git and build from source.
+
+ ## Prerequisites
 
- Following tools must be installed:
+ The following tools must be installed:
 
- python >= 3.10 (download from [here](https://www.python.org/downloads/))
- pip ([installation instruction](https://pip.pypa.io/en/stable/installation/))
- python venv ([installation instruction](https://virtualenv.pypa.io/en/latest/installation.html))
+ - python >= 3.10: download from [here](https://www.python.org/downloads/)
+ - pip: [installation instructions](https://pip.pypa.io/en/stable/installation/)
+ - python venv: [installation instructions](https://virtualenv.pypa.io/en/latest/installation.html)
  (all three of above can be installed at once from [here](https://packaging.python.org/en/latest/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers))
- gcloud (install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the))
+ gcloud: install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the) and then:
  - Run `gcloud init`
  - [Authenticate](https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login) to Google Cloud
- kubectl (install from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl))
+ kubectl: install from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) and then:
  - Install `gke-gcloud-auth-plugin` from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_plugin)
- docker ([installation instruction](https://docs.docker.com/engine/install/))
+ docker: [installation instructions](https://docs.docker.com/engine/install/) and then:
+ - Configure sudoless docker: [guide](https://docs.docker.com/engine/install/linux-postinstall/)
  - Run `gcloud auth configure-docker` to ensure images can be uploaded to registry
- make - please run below command.
- ```shell
- # sudo may be required
- apt-get -y install make
- ```
- In addition, below dependencies can be installed either using provided links or using `make install` command, if xpk is downloaded via `git clone` command:
- kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
- kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
 
- # Installation
- To install xpk, install required tools mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all neccessary tools. XPK can be installed via pip:
+ ### Additional prerequisites when installing from pip
+
+ - kueuectl: install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/)
+ - kjob: installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md)
+
+ ### Additional prerequisites when installing from source
+
+ - git: [installation instructions](https://git-scm.com/downloads/linux)
+ - make: install by running `apt-get -y install make` (`sudo` might be required)
+
+ ## Installation via pip
+
+ To install XPK using pip, first install required tools mentioned in [prerequisites](#prerequisites) and [additional prerequisites](#additional-prerequisites-when-installing-from-pip). Then you can install XPK simply by running:
 
  ```shell
  pip install xpk
  ```
 
- If you see an error saying: `This environment is externally managed`, please use a virtual environment.
+ If you see an error saying: `This environment is externally managed`, please use a virtual environment. For example:
 
  ```shell
- ## One time step of creating the venv
- VENV_DIR=~/venvp3
- python3 -m venv $VENV_DIR
- ## Enter your venv.
- source $VENV_DIR/bin/activate
- ## Clone the repository and installing dependencies.
- pip install xpk
+ # One time step of creating the virtual environment
+ VENV_DIR=~/venvp3
+ python3 -m venv $VENV_DIR
+
+ # Activate your virtual environment
+ source $VENV_DIR/bin/activate
+
+ # Install XPK in virtual environment using pip
+ pip install xpk
  ```
 
- If you are running XPK by cloning GitHub repository, first run the
- following commands to begin using XPK commands:
+ ## Installation from source
+
+ To install XPK from source, first install required tools mentioned in [prerequisites](#prerequisites) and [additional prerequisites](#additional-prerequisites-when-installing-from-source). Afterwards you can install XPK from source using `make`
 
  ```shell
+ # Clone the XPK repository
  git clone https://github.com/google/xpk.git
  cd xpk
- # Install required dependencies with make
+
+ # Install required dependencies and build XPK with make
  make install && export PATH=$PATH:$PWD/bin
  ```
 
- If you want to have installed dependecies persist in your PATH please run:
- `echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc
+ If you want the dependecies to be available in your PATH please run: `echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc file.
 
- If you see an error saying: `This environment is externally managed`, please use a virtual environment.
-
- Example:
+ If you see an error saying: `This environment is externally managed`, please use a virtual environment. For example:
 
  ```shell
- ## One time step of creating the venv
- VENV_DIR=~/venvp3
- python3 -m venv $VENV_DIR
- ## Enter your venv.
- source $VENV_DIR/bin/activate
- ## Clone the repository and installing dependencies.
- git clone https://github.com/google/xpk.git
- cd xpk
- # Install required dependencies with make
- make install && export PATH=$PATH:$PWD/bin
+ # One time step of creating the virtual environment
+ VENV_DIR=~/venvp3
+ python3 -m venv $VENV_DIR
+
+ # Activate your virtual environment
+ source $VENV_DIR/bin/activate
+
+ # Clone the XPK repository
+ git clone https://github.com/google/xpk.git
+ cd xpk
+
+ # Install required dependencies and build XPK with make
+ make install && export PATH=$PATH:$PWD/bin
  ```
 
  # XPK for Large Scale (>1k VMs)
@@ -247,6 +259,13 @@ all zones.
  --num-slices=4 --spot
  ```
 
+ * Cluster Create (DWS flex queued capacity):
+ ```shell
+ python3 xpk.py cluster create \
+ --cluster xpk-test --tpu-type=v5litepod-16 \
+ --num-slices=4 --flex
+ ```
+
  * Cluster Create for Pathways:
  Pathways compatible cluster can be created using `cluster create-pathways`.
  ```shell
@@ -457,25 +476,49 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
  --tpu-type=v5litepod-16
  ```
 
- ## Provisioning A3-Ultra and A3-Mega clusters (GPU machines)
- To create a cluster with A3 machines, run the below command. To create workloads on these clusters see [here](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines).
- * For A3-Ultra: --device-type=h200-141gb-8
- * For A3-Mega: --device-type=h100-mega-80gb-8
+ ## Provisioning A3 Ultra, A3 Mega and A4 clusters (GPU machines)
+ To create a cluster with A3 or A4 machines, run the command below with selected device type. To create workloads on these clusters see [here](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines).
 
- ```shell
- python3 xpk.py cluster create \
- --cluster CLUSTER_NAME --device-type=h200-141gb-8 \
+ **Note:** Creating A3 Ultra, A3 Mega and A4 clusters is currently supported **only** on linux/amd64 architecture.
+
+ Machine | Device type
+ :- | :-
+ A3 Mega | `h100-mega-80gb-8`
+ A3 Ultra | `h200-141gb-8`
+ A4 | `b200-8`
+
+
+ ```shell
+ python3 xpk.py cluster create \
+ --cluster CLUSTER_NAME --device-type DEVICE_TYPE \
  --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
- --num-nodes=4 --reservation=$RESERVATION_ID
- ```
- Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra machines:
- * --num-nodes
- * --default-pool-cpu-machine-type
- * --default-pool-cpu-num-nodes
- * --reservation
- * --spot
- * --on-demand (only A3-Mega)
+ --num-nodes=$NUM_NODES --reservation=$RESERVATION_ID
+ ```
+
+ Currently, the below flags/arguments are supported for A3 Mega, A3 Ultra and A4 machines:
+ * `--num-nodes`
+ * `--default-pool-cpu-machine-type`
+ * `--default-pool-cpu-num-nodes`
+ * `--reservation`
+ * `--spot`
+ * `--on-demand` (A3 Mega only)
+ * `--flex`
+
+ ## Running XPK on existing clusters
+
+ In order to run XPK commands on a cluster it needs to be set up correctly. This is done automatically when creating a cluster using `xpk cluster create`. For clusters created differently (e.g.: with 'gcloud' or a Cluster Toolkit blueprint) there is a dedicated command: `xpk cluster adapt`. This command installs required config maps, kueue, jobset, CSI drivers etc.
+
+ Currently `xpk cluster adapt` supports only the following device types:
 
+ - `h200-141gb-8` (A3 Ultra)
+
+ Example usage:
+ ```shell
+ python3 xpk.py cluster adapt \
+ --cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
+ --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
+ --num-nodes=$NUM_NODES --reservation=$RESERVATION_ID
+ ```
 
  ## Storage
  Currently XPK supports the below types of storages:
@@ -483,9 +526,10 @@ Currently XPK supports the below types of storages:
  - [Google Cloud Filestore](#filestore)
  - [Google Cloud Parallelstore](#parallelstore)
  - [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
+ - [Google Cloud Managed Lustre](#managed-lustre)
 
  ### FUSE
- A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
+ A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so workloads can read and write objects in your bucket using standard file system semantics.
 
  To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).
 
@@ -507,11 +551,12 @@ Parameters:
  - `--size` - size of the storage in Gb.
  - `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
  - `--mount-options` - comma-separated list of additional mount options for PersistentVolume ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#mount-options)).
+ - `--prefetch-metadata` - enables metadata pre-population when mounting the volume by setting parameter `gcsfuseMetadataPrefetchOnMount` to `true` ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#metadata-prefetch)).
  - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions. If set, then values from manifest override the following parameters: `--size` and `--bucket`.
 
  ### Filestore
 
- A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+ A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
 
  To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
 
@@ -547,7 +592,7 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
 
  ### Parallelstore
 
- A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+ A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
 
  To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
 
@@ -571,7 +616,7 @@ Parameters:
 
  ### Block storage (Persistent Disk, Hyperdisk)
 
- A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+ A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
 
  To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please consider that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
 
@@ -593,6 +638,30 @@ Parameters:
  - `--readonly` - if set to true, workload can only read from storage.
  - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
 
+ ### Managed Lustre
+
+ A Managed Lustre adaptor lets you mount and access [Google Cloud Managed Lustre instances](https://cloud.google.com/kubernetes-engine/docs/concepts/managed-lustre) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
+
+ To use the GCP Managed Lustre with XPK you need to create [an instance](https://cloud.google.com/managed-lustre/docs/create-instance). Please make sure you enable GKE support when creating the instance (gcloud ex. `--gke-support-enabled`).
+
+ Once it's ready you can use `xpk storage attach` with `--type=lustre` command to attach a Managed Lustre instance to your cluster. Currently, attaching a Managed Lustre instance is supported only by providing a manifest file.
+
+ ```shell
+ python3 xpk.py storage attach test-lustre-storage --type=lustre \
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
+ --mount-point='/test-mount-point' --readonly=false \
+ --auto-mount=true \
+ --manifest='./examples/storage/lustre-manifest-attach.yaml'
+ ```
+
+ Parameters:
+
+ - `--type` - type of the storage `lustre`
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
+ - `--readonly` - if set to true, workload can only read from storage.
+ - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
+
  ### List attached storages
 
  ```shell
@@ -634,8 +703,14 @@ python3 xpk.py storage delete test-fs-instance \
  python3 xpk.py workload create \
  --workload xpk-test-workload --command "echo goodbye" \
  --cluster xpk-test \
- --tpu-type=v5litepod-16 --projet=$PROJECT
+ --tpu-type=v5litepod-16 --project=$PROJECT
  ```
+ * Workload create(DWS flex with queued provisioning):
+ ```shell
+ python3 xpk.py workload create \
+ --workload xpk-test-workload --command "echo goodbye" \
+ --cluster xpk-test --flex \
+ --tpu-type=v5litepod-16 --project=$PROJECT
 
  * Workload Create for Pathways:
  Pathways workload can be submitted using `workload create-pathways` on a Pathways enabled cluster (created with `cluster create-pathways`)
@@ -649,7 +724,7 @@ python3 xpk.py storage delete test-fs-instance \
  --cluster xpk-pw-test \
  --docker-name='user-workload' \
  --docker-image=<maxtext docker image> \
- --command='python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
+ --command='python3 -m MaxText.train MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1 enable_single_controller=True'
  ```
 
  Regular workload can also be submitted on a Pathways enabled cluster (created with `cluster create-pathways`)
@@ -663,7 +738,7 @@ python3 xpk.py storage delete test-fs-instance \
  --cluster xpk-pw-test \
  --docker-name='user-workload' \
  --docker-image=<maxtext docker image> \
- --command='python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
+ --command='python3 -m MaxText.train MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
  ```
 
  Pathways in headless mode - Pathways now offers the capability to run JAX workloads in Vertex AI notebooks or in GCE VMs!
@@ -693,21 +768,27 @@ increase this to a large number, say 50. Real jobs can be interrupted due to
  hardware failures and software updates. We assume your job has implemented
  checkpointing so the job restarts near where it was interrupted.
 
- ### Workloads for A3-Ultra and A3-Mega clusters (GPU machines)
- To submit jobs on a cluster with A3 machines, run the below command. To create a cluster with A3 machines see [here](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines).
- * For A3-Ultra: --device-type=h200-141gb-8
- * For A3-Mega: --device-type=h100-mega-80gb-8
+ ### Workloads for A3 Ultra, A3 Mega and A4 clusters (GPU machines)
+ To submit jobs on a cluster with A3 or A4 machines, run the command with selected device type. To create a cluster with A3 or A4 machines see [here](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines).
 
- ```shell
- python3 xpk.py workload create \
+
+ Machine | Device type
+ :- | :-
+ A3 Mega | `h100-mega-80gb-8`
+ A3 Ultra | `h200-141gb-8`
+ A4 | `b200-8`
+
+ ```shell
+ python3 xpk.py workload create \
  --workload=$WORKLOAD_NAME --command="echo goodbye" \
- --cluster=$CLUSTER_NAME --device-type=h200-141gb-8 \
+ --cluster=$CLUSTER_NAME --device-type DEVICE_TYPE \
  --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
  --num-nodes=$WOKRKLOAD_NUM_NODES
- ```
- > The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 machines as well.
+ ```
+
+ > The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 or A4 machines as well.
 
- In order to run NCCL test on A3 Ultra machines check out [this guide](/examples/nccl/nccl.md).
+ In order to run NCCL test on A3 machines check out [this guide](/examples/nccl/nccl.md).
 
  ### Workload Priority and Preemption
  * Set the priority level of your workload with `--priority=LEVEL`
@@ -1554,4 +1635,5 @@ python xpk.py batch [other-options] --kind-cluster script
  Please note that all other xpk subcommands are intended for use with cloud systems on Google Cloud Engine (GCE) and don't support local testing. This includes commands like cluster, info, inspector, etc.
 
  # Other advanced usage
- [Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)
+ [Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md) \
+ [Use Slurm like commands in XPK to execute workloads on top of GKE](xpk-slurm-commands.md)
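
The 0.10.0 README excerpts above add a `--flex` flag for DWS flex-queued capacity to both `cluster create` and `workload create`. A minimal end-to-end sketch that chains the two documented invocations (the cluster name `xpk-test`, the TPU type, and `$PROJECT` are illustrative placeholders taken from those examples):

```shell
# Create a cluster backed by DWS flex queued capacity (per the 0.10.0 README excerpt above)
python3 xpk.py cluster create \
  --cluster xpk-test --tpu-type=v5litepod-16 \
  --num-slices=4 --flex

# Submit a workload that uses DWS flex with queued provisioning on that cluster
python3 xpk.py workload create \
  --workload xpk-test-workload --command "echo goodbye" \
  --cluster xpk-test --flex \
  --tpu-type=v5litepod-16 --project=$PROJECT
```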