xpk 0.5.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. {xpk-0.5.0 → xpk-0.7.0}/PKG-INFO +453 -29
  2. xpk-0.5.0/xpk.egg-info/PKG-INFO → xpk-0.7.0/README.md +437 -44
  3. {xpk-0.5.0 → xpk-0.7.0}/pyproject.toml +22 -5
  4. xpk-0.7.0/src/xpk/__init__.py +15 -0
  5. xpk-0.7.0/src/xpk/api/__init__.py +15 -0
  6. xpk-0.7.0/src/xpk/api/storage_crd.yaml +52 -0
  7. xpk-0.7.0/src/xpk/commands/__init__.py +15 -0
  8. xpk-0.7.0/src/xpk/commands/batch.py +131 -0
  9. xpk-0.7.0/src/xpk/commands/cluster.py +808 -0
  10. xpk-0.7.0/src/xpk/commands/cluster_gcluster.py +269 -0
  11. xpk-0.7.0/src/xpk/commands/common.py +44 -0
  12. xpk-0.7.0/src/xpk/commands/config.py +29 -0
  13. xpk-0.7.0/src/xpk/commands/info.py +243 -0
  14. xpk-0.7.0/src/xpk/commands/inspector.py +357 -0
  15. xpk-0.7.0/src/xpk/commands/job.py +199 -0
  16. xpk-0.7.0/src/xpk/commands/kind.py +283 -0
  17. xpk-0.7.0/src/xpk/commands/kjob_common.py +44 -0
  18. xpk-0.7.0/src/xpk/commands/run.py +128 -0
  19. xpk-0.7.0/src/xpk/commands/shell.py +140 -0
  20. xpk-0.7.0/src/xpk/commands/storage.py +267 -0
  21. xpk-0.7.0/src/xpk/commands/version.py +27 -0
  22. xpk-0.7.0/src/xpk/commands/workload.py +889 -0
  23. xpk-0.7.0/src/xpk/core/__init__.py +15 -0
  24. xpk-0.7.0/src/xpk/core/blueprint/__init__.py +15 -0
  25. xpk-0.7.0/src/xpk/core/blueprint/blueprint_definitions.py +62 -0
  26. xpk-0.7.0/src/xpk/core/blueprint/blueprint_generator.py +708 -0
  27. xpk-0.7.0/src/xpk/core/capacity.py +185 -0
  28. xpk-0.7.0/src/xpk/core/cluster.py +564 -0
  29. xpk-0.7.0/src/xpk/core/cluster_private.py +200 -0
  30. xpk-0.7.0/src/xpk/core/commands.py +356 -0
  31. xpk-0.7.0/src/xpk/core/config.py +179 -0
  32. xpk-0.7.0/src/xpk/core/docker_container.py +225 -0
  33. xpk-0.7.0/src/xpk/core/docker_image.py +210 -0
  34. xpk-0.7.0/src/xpk/core/docker_manager.py +308 -0
  35. xpk-0.7.0/src/xpk/core/docker_resources.py +350 -0
  36. xpk-0.7.0/src/xpk/core/filestore.py +251 -0
  37. xpk-0.7.0/src/xpk/core/gcloud_context.py +196 -0
  38. xpk-0.7.0/src/xpk/core/gcluster_manager.py +176 -0
  39. xpk-0.7.0/src/xpk/core/gcsfuse.py +50 -0
  40. xpk-0.7.0/src/xpk/core/kjob.py +444 -0
  41. xpk-0.7.0/src/xpk/core/kueue.py +358 -0
  42. xpk-0.7.0/src/xpk/core/monitoring.py +134 -0
  43. xpk-0.7.0/src/xpk/core/nap.py +361 -0
  44. xpk-0.7.0/src/xpk/core/network.py +377 -0
  45. xpk-0.7.0/src/xpk/core/nodepool.py +581 -0
  46. xpk-0.7.0/src/xpk/core/pathways.py +377 -0
  47. xpk-0.7.0/src/xpk/core/ray.py +222 -0
  48. xpk-0.7.0/src/xpk/core/remote_state/__init__.py +15 -0
  49. xpk-0.7.0/src/xpk/core/remote_state/fuse_remote_state.py +99 -0
  50. xpk-0.7.0/src/xpk/core/remote_state/remote_state_client.py +38 -0
  51. xpk-0.7.0/src/xpk/core/resources.py +238 -0
  52. xpk-0.7.0/src/xpk/core/scheduling.py +253 -0
  53. xpk-0.7.0/src/xpk/core/storage.py +581 -0
  54. xpk-0.7.0/src/xpk/core/system_characteristics.py +1432 -0
  55. xpk-0.7.0/src/xpk/core/vertex.py +105 -0
  56. xpk-0.7.0/src/xpk/core/workload.py +341 -0
  57. xpk-0.7.0/src/xpk/core/workload_decorators/__init__.py +15 -0
  58. xpk-0.7.0/src/xpk/core/workload_decorators/rdma_decorator.py +129 -0
  59. xpk-0.7.0/src/xpk/core/workload_decorators/storage_decorator.py +52 -0
  60. xpk-0.7.0/src/xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  61. xpk-0.7.0/src/xpk/main.py +75 -0
  62. xpk-0.7.0/src/xpk/parser/__init__.py +15 -0
  63. xpk-0.7.0/src/xpk/parser/batch.py +43 -0
  64. xpk-0.7.0/src/xpk/parser/cluster.py +662 -0
  65. xpk-0.7.0/src/xpk/parser/common.py +259 -0
  66. xpk-0.7.0/src/xpk/parser/config.py +49 -0
  67. xpk-0.7.0/src/xpk/parser/core.py +135 -0
  68. xpk-0.7.0/src/xpk/parser/info.py +64 -0
  69. xpk-0.7.0/src/xpk/parser/inspector.py +65 -0
  70. xpk-0.7.0/src/xpk/parser/job.py +147 -0
  71. xpk-0.7.0/src/xpk/parser/kind.py +95 -0
  72. xpk-0.7.0/src/xpk/parser/run.py +47 -0
  73. xpk-0.7.0/src/xpk/parser/shell.py +59 -0
  74. xpk-0.7.0/src/xpk/parser/storage.py +316 -0
  75. xpk-0.7.0/src/xpk/parser/validators.py +39 -0
  76. xpk-0.7.0/src/xpk/parser/version.py +23 -0
  77. xpk-0.7.0/src/xpk/parser/workload.py +726 -0
  78. xpk-0.7.0/src/xpk/templates/__init__.py +15 -0
  79. xpk-0.7.0/src/xpk/templates/storage.yaml +13 -0
  80. xpk-0.7.0/src/xpk/utils/__init__.py +15 -0
  81. xpk-0.7.0/src/xpk/utils/console.py +55 -0
  82. xpk-0.7.0/src/xpk/utils/file.py +82 -0
  83. xpk-0.7.0/src/xpk/utils/gcs_utils.py +125 -0
  84. xpk-0.7.0/src/xpk/utils/kubectl.py +57 -0
  85. xpk-0.7.0/src/xpk/utils/network.py +168 -0
  86. xpk-0.7.0/src/xpk/utils/objects.py +88 -0
  87. xpk-0.7.0/src/xpk/utils/templates.py +28 -0
  88. xpk-0.7.0/src/xpk/utils/validation.py +80 -0
  89. xpk-0.7.0/src/xpk/utils/yaml.py +30 -0
  90. xpk-0.5.0/README.md → xpk-0.7.0/src/xpk.egg-info/PKG-INFO +468 -25
  91. xpk-0.7.0/src/xpk.egg-info/SOURCES.txt +95 -0
  92. xpk-0.7.0/src/xpk.egg-info/entry_points.txt +2 -0
  93. xpk-0.7.0/src/xpk.egg-info/requires.txt +18 -0
  94. xpk-0.5.0/xpk.egg-info/SOURCES.txt +0 -10
  95. xpk-0.5.0/xpk.egg-info/entry_points.txt +0 -2
  96. xpk-0.5.0/xpk.egg-info/requires.txt +0 -6
  97. xpk-0.5.0/xpk.py +0 -7282
  98. {xpk-0.5.0 → xpk-0.7.0}/LICENSE +0 -0
  99. {xpk-0.5.0 → xpk-0.7.0}/setup.cfg +0 -0
  100. {xpk-0.5.0 → xpk-0.7.0/src}/xpk.egg-info/dependency_links.txt +0 -0
  101. {xpk-0.5.0 → xpk-0.7.0/src}/xpk.egg-info/top_level.txt +0 -0
@@ -1,8 +1,8 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: xpk
3
- Version: 0.5.0
3
+ Version: 0.7.0
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
- Author-email: Cloud TPU Team <cloud-tpu-eng@google.com>
5
+ Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
7
7
  Project-URL: Homepage, https://github.com/google/xpk
8
8
  Project-URL: Bug Tracker, https://github.com/google/xpk/issues
@@ -11,11 +11,23 @@ Classifier: Programming Language :: Python :: 3.11
11
11
  Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
- Requires-Dist: cloud-accelerator-diagnostics
14
+ Requires-Dist: cloud-accelerator-diagnostics==0.1.1
15
+ Requires-Dist: tabulate==0.9.0
16
+ Requires-Dist: ruamel.yaml==0.18.10
17
+ Requires-Dist: pyyaml==6.0.2
18
+ Requires-Dist: docker==7.1.0
19
+ Requires-Dist: kubernetes==31.0.0
20
+ Requires-Dist: google-cloud==0.34.0
21
+ Requires-Dist: google-api-core==2.24.1
22
+ Requires-Dist: packaging==24.2
23
+ Requires-Dist: google-cloud-filestore==1.12.0
24
+ Requires-Dist: google-cloud-storage==2.19.0
15
25
  Provides-Extra: dev
16
26
  Requires-Dist: pyink==24.3.0; extra == "dev"
17
27
  Requires-Dist: pylint>=2.6.0; extra == "dev"
18
28
  Requires-Dist: pre-commit; extra == "dev"
29
+ Requires-Dist: pytest; extra == "dev"
30
+ Requires-Dist: docker==7.1.0; extra == "dev"
19
31
 
20
32
  <!--
21
33
  Copyright 2023 Google LLC
@@ -35,6 +47,8 @@ Requires-Dist: pre-commit; extra == "dev"
35
47
 
36
48
  [![Build Tests](https://github.com/google/xpk/actions/workflows/build_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/build_tests.yaml)
37
49
  [![Nightly Tests](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml)
50
+ [![Develop Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml)
51
+ [![Develop Nightly Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml)
38
52
 
39
53
  # Overview
40
54
 
@@ -62,31 +76,89 @@ xpk supports the following TPU types:
62
76
  * v4
63
77
  * v5e
64
78
  * v5p
79
+ * Trillium (v6e)
65
80
 
66
81
  and the following GPU types:
67
- * a100
68
- * h100
82
+ * A100
83
+ * A3-Highgpu (h100)
84
+ * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines)
85
+ * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines)
69
86
 
70
87
  and the following CPU types:
71
88
  * n2-standard-32
72
89
 
90
+ xpk also supports Google Cloud Storage solutions:
91
+ * [Cloud Storage FUSE](#fuse)
92
+ * [Filestore](#filestore)
93
+
94
+ # Permissions needed on Cloud Console:
95
+
96
+ * Artifact Registry Writer
97
+ * Compute Admin
98
+ * Kubernetes Engine Admin
99
+ * Logging Admin
100
+ * Monitoring Admin
101
+ * Service Account User
102
+ * Storage Admin
103
+ * Vertex AI Administrator
104
+ * Filestore Editor (This role is necessary if you want to run `storage create` command with `--type=gcpfilestore`)
105
+
106
+ # Prerequisites
107
+
108
+ Following tools must be installed:
109
+
110
+ - python >= 3.10 (download from [here](https://www.python.org/downloads/))
111
+ - pip ([installation instruction](https://pip.pypa.io/en/stable/installation/))
112
+ - python venv ([installation instruction](https://virtualenv.pypa.io/en/latest/installation.html))
113
+ (all three of above can be installed at once from [here](https://packaging.python.org/en/latest/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers))
114
+ - gcloud (install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the))
115
+ - Run `gcloud init`
116
+ - [Authenticate](https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login) to Google Cloud
117
+ - kubectl (install from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl))
118
+ - Install `gke-gcloud-auth-plugin` from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_plugin)
119
+ - docker ([installation instruction](https://docs.docker.com/engine/install/))
120
+ - Run `gcloud auth configure-docker` to ensure images can be uploaded to registry
121
+ - make - please run below command.
122
+ ```shell
123
+ # sudo may be required
124
+ apt-get -y install make
125
+ ```
126
+ In addition, below dependencies can be installed either using provided links or using `make install` command, if xpk is downloaded via `git clone` command:
127
+ - kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
128
+ - kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
129
+
73
130
  # Installation
74
- To install xpk, run the following command:
131
+ To install xpk, install required tools mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all necessary tools. XPK can be installed via pip:
75
132
 
76
133
  ```shell
77
134
  pip install xpk
78
135
  ```
79
136
 
137
+ If you see an error saying: `This environment is externally managed`, please use a virtual environment.
138
+
139
+ ```shell
140
+ ## One time step of creating the venv
141
+ VENV_DIR=~/venvp3
142
+ python3 -m venv $VENV_DIR
143
+ ## Enter your venv.
144
+ source $VENV_DIR/bin/activate
145
+ ## Clone the repository and installing dependencies.
146
+ pip install xpk
147
+ ```
148
+
80
149
  If you are running XPK by cloning GitHub repository, first run the
81
150
  following commands to begin using XPK commands:
82
151
 
83
152
  ```shell
84
153
  git clone https://github.com/google/xpk.git
85
154
  cd xpk
86
- # Install dependencies such as cloud-accelerator-diagnostics
87
- pip install .
155
+ # Install required dependencies with make
156
+ make install && export PATH=$PATH:$PWD/bin
88
157
  ```
89
158
 
159
+ If you want the installed dependencies to persist in your PATH, please run:
160
+ `echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc
161
+
90
162
  If you see an error saying: `This environment is externally managed`, please use a virtual environment.
91
163
 
92
164
  Example:
@@ -100,8 +172,8 @@ Example:
100
172
  ## Clone the repository and installing dependencies.
101
173
  git clone https://github.com/google/xpk.git
102
174
  cd xpk
103
- # Install dependencies such as cloud-accelerator-diagnostics
104
- pip install .
175
+ # Install required dependencies with make
176
+ make install && export PATH=$PATH:$PWD/bin
105
177
  ```
106
178
 
107
179
  # XPK for Large Scale (>1k VMs)
@@ -125,6 +197,8 @@ cleanup with a `Cluster Delete`.
125
197
  If you have failures with workloads not running, use `xpk inspector` to investigate
126
198
  more.
127
199
 
200
+ If you need your Workloads to have persistent storage, use `xpk storage` to find out more.
201
+
128
202
  ## Cluster Create
129
203
 
130
204
  First set the project and zone through gcloud config or xpk arguments.
@@ -171,13 +245,22 @@ all zones.
171
245
  ```
172
246
 
173
247
  * Cluster Create for Pathways:
174
- Pathways compatible cluster can be created using `--enable-pathways`
248
+ Pathways compatible cluster can be created using `cluster create-pathways`.
175
249
  ```shell
176
- python3 xpk.py cluster create \
250
+ python3 xpk.py cluster create-pathways \
177
251
  --cluster xpk-pw-test \
178
252
  --num-slices=4 --on-demand \
179
- --tpu-type=v5litepod-16 \
180
- --enable-pathways
253
+ --tpu-type=v5litepod-16
254
+ ```
255
+
256
+ * Cluster Create for Ray:
257
+ A cluster with KubeRay enabled and a RayCluster can be created using `cluster create-ray`.
258
+ ```shell
259
+ python3 xpk.py cluster create-ray \
260
+ --cluster xpk-rc-test \
261
+ --ray-version=2.39.0 \
262
+ --num-slices=4 --on-demand \
263
+ --tpu-type=v5litepod-8
181
264
  ```
182
265
 
183
266
  * Cluster Create can be called again with the same `--cluster name` to modify
@@ -214,9 +297,73 @@ all zones.
214
297
  python3 xpk.py cluster create --force \
215
298
  --cluster xpk-test --tpu-type=v5litepod-16 \
216
299
  --num-slices=6 --reservation=$RESERVATION_ID
300
+ ```
301
+
302
+ and recreates the cluster with 4 slices of v4-8. The command will rerun to delete
303
+ 6 slices of v5litepod-16 and create 4 slices of v4-8. The command will warn the
304
+ user when deleting slices. Use `--force` to skip prompts.
305
+
306
+ ```shell
307
+ python3 xpk.py cluster create \
308
+ --cluster xpk-test --tpu-type=v4-8 \
309
+ --num-slices=4 --reservation=$RESERVATION_ID
217
310
 
311
+ # Skip delete prompts using --force.
312
+
313
+ python3 xpk.py cluster create --force \
314
+ --cluster xpk-test --tpu-type=v4-8 \
315
+ --num-slices=4 --reservation=$RESERVATION_ID
218
316
  ```
219
317
 
318
+ ### Create Private Cluster
319
+
320
+ XPK allows you to create a private GKE cluster for enhanced security. In a private cluster, nodes and pods are isolated from the public internet, providing an additional layer of protection for your workloads.
321
+
322
+ To create a private cluster, use the following arguments:
323
+
324
+ **`--private`**
325
+
326
+ This flag enables the creation of a private GKE cluster. When this flag is set:
327
+
328
+ * Nodes and pods are isolated from the direct internet access.
329
+ * `master_authorized_networks` is automatically enabled.
330
+ * Access to the cluster's control plane is restricted to your current machine's IP address by default.
331
+
332
+ **`--authorized-networks`**
333
+
334
+ This argument allows you to specify additional IP ranges (in CIDR notation) that are authorized to access the private cluster's control plane and perform `kubectl` commands.
335
+
336
+ * Even if this argument is not set when you have `--private`, your current machine's IP address will always be given access to the control plane.
337
+ * If this argument is used with an existing private cluster, it will replace the existing authorized networks.
338
+
339
+ **Example Usage:**
340
+
341
+ * To create a private cluster and allow access to Control Plane only to your current machine:
342
+
343
+ ```shell
344
+ python3 xpk.py cluster create \
345
+ --cluster=xpk-private-cluster \
346
+ --tpu-type=v4-8 --num-slices=2 \
347
+ --private
348
+ ```
349
+
350
+ * To create a private cluster and allow access to Control Plane only to your current machine and the IP ranges `1.2.3.0/24` and `1.2.4.5/32`:
351
+
352
+ ```shell
353
+ python3 xpk.py cluster create \
354
+ --cluster=xpk-private-cluster \
355
+ --tpu-type=v4-8 --num-slices=2 \
356
+ --authorized-networks 1.2.3.0/24 1.2.4.5/32
357
+
358
+ # --private is optional when you set --authorized-networks
359
+ ```
360
+
361
+ > **Important Notes:**
362
+ > * The argument `--private` is only applicable when creating new clusters. You cannot convert an existing public cluster to a private cluster using these flags.
363
+ > * The argument `--authorized-networks` is applicable when creating new clusters or using an existing _*private*_ cluster. You cannot convert an existing public cluster to a private cluster using these flags.
364
+ > * You need to [set up a Cluster NAT for your VPC network](https://cloud.google.com/nat/docs/set-up-manage-network-address-translation#creating_nat) so that the Nodes and Pods have outbound access to the internet. This is required because XPK installs and configures components such as kueue that need access to external sources like `registry.k8s.io`.
365
+
366
+
220
367
  ### Create Vertex AI Tensorboard
221
368
  *Note: This feature is available in XPK >= 0.4.0. Enable [Vertex AI API](https://cloud.google.com/vertex-ai/docs/start/cloud-environment#enable_vertexai_apis) in your Google Cloud console to use this feature. Make sure you have
222
369
  [Vertex AI Administrator](https://cloud.google.com/vertex-ai/docs/general/access-control#aiplatform.admin) role
@@ -306,6 +453,121 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
306
453
  --tpu-type=v5litepod-16
307
454
  ```
308
455
 
456
+ ## Provisioning A3-Ultra and A3-Mega clusters (GPU machines)
457
+ To create a cluster with A3 machines, run the below command. To create workloads on these clusters see [here](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines).
458
+ * For A3-Ultra: --device-type=h200-141gb-8
459
+ * For A3-Mega: --device-type=h100-mega-80gb-8
460
+
461
+ ```shell
462
+ python3 xpk.py cluster create \
463
+ --cluster CLUSTER_NAME --device-type=h200-141gb-8 \
464
+ --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
465
+ --num-nodes=4 --reservation=$RESERVATION_ID
466
+ ```
467
+ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra machines:
468
+ * --num-nodes
469
+ * --default-pool-cpu-machine-type
470
+ * --default-pool-cpu-num-nodes
471
+ * --reservation
472
+ * --spot
473
+ * --on-demand (only A3-Mega)
474
+
475
+
476
+ ## Storage
477
+ Currently XPK supports two types of storages: Cloud Storage FUSE and Google Cloud Filestore.
478
+
479
+ ### FUSE
480
+ A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
481
+
482
+ To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).
483
+
484
+ Once it's ready you can use `xpk storage attach` with `--type=gcsfuse` command to attach a FUSE storage instance to your cluster:
485
+
486
+ ```shell
487
+ python3 xpk.py storage attach test-fuse-storage --type=gcsfuse \
488
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
489
+ --mount-point='/test-mount-point' --readonly=false \
490
+ --bucket=test-bucket --size=1 --auto-mount=false
491
+ ```
492
+
493
+ Parameters:
494
+
495
+ - `--type` - type of the storage, currently xpk supports `gcsfuse` and `gcpfilestore` only.
496
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
497
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
498
+ - `--readonly` - if set to true, workload can only read from storage.
499
+ - `--size` - size of the storage in Gb.
500
+ - `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
501
+
502
+ ### Filestore
503
+
504
+ A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write objects in your volumes using standard file system semantics.
505
+
506
+ To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
507
+
508
+ ```shell
509
+ python3 xpk.py storage create test-fs-storage --type=gcpfilestore \
510
+ --auto-mount=false --mount-point=/data-fs --readonly=false \
511
+ --size=1024 --tier=BASIC_HDD --access_mode=ReadWriteMany --vol=default \
512
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
513
+ ```
514
+
515
+ You can also attach an existing Filestore instance to your cluster using `xpk storage attach` command:
516
+
517
+ ```shell
518
+ python3 xpk.py storage attach test-fs-storage --type=gcpfilestore \
519
+ --auto-mount=false --mount-point=/data-fs --readonly=false \
520
+ --size=1024 --tier=BASIC_HDD --access_mode=ReadWriteMany --vol=default \
521
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
522
+ ```
523
+
524
+ The command above is also useful when attaching multiple volumes from the same Filestore instance.
525
+
526
+ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore` accept following arguments:
527
+ - `--type` - type of the storage.
528
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
529
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
530
+ - `--readonly` - if set to true, workload can only read from storage.
531
+ - `--size` - size of the Filestore instance that will be created in Gb.
532
+ - `--tier` - tier of the Filestore instance that will be created. Possible options are: `[BASIC_HDD, BASIC_SSD, ZONAL, REGIONAL, ENTERPRISE]`
533
+ - `--access-mode` - access mode of the Filestore instance that will be created. Possible values are: `[ReadWriteOnce, ReadOnlyMany, ReadWriteMany]`
534
+ - `--vol` - file share name of the Filestore instance that will be created.
535
+ - `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
536
+
537
+ ### List attached storages
538
+
539
+ ```shell
540
+ python3 xpk.py storage list \
541
+ --project=$PROJECT --cluster $CLUSTER --zone=$ZONE
542
+ ```
543
+
544
+ ### Running workloads with storage
545
+
546
+ If you specified `--auto-mount=true` when creating or attaching a storage, then all workloads deployed on the cluster will have the volume attached by default. Otherwise, in order to have the storage attached, you have to add `--storage` parameter to `workload create` command:
547
+
548
+ ```shell
549
+ python3 xpk.py workload create \
550
+ --workload xpk-test-workload --command "echo goodbye" \
551
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
552
+ --tpu-type=v5litepod-16 --storage=test-storage
553
+ ```
554
+
555
+ ### Detaching storage
556
+
557
+ ```shell
558
+ python3 xpk.py storage detach $STORAGE_NAME \
559
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
560
+ ```
561
+
562
+ ### Deleting storage
563
+
564
+ XPK allows you to remove Filestore instances easily with `xpk storage delete` command. **Warning:** this deletes all data contained in the Filestore!
565
+
566
+ ```shell
567
+ python3 xpk.py storage delete test-fs-instance \
568
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
569
+ ```
570
+
309
571
  ## Workload Create
310
572
  * Workload Create (submit training job):
311
573
 
@@ -313,30 +575,29 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
313
575
  python3 xpk.py workload create \
314
576
  --workload xpk-test-workload --command "echo goodbye" \
315
577
  --cluster xpk-test \
316
- --tpu-type=v5litepod-16
578
+ --tpu-type=v5litepod-16 --project=$PROJECT
317
579
  ```
318
580
 
319
581
  * Workload Create for Pathways:
320
- Pathways workload can be submitted using `--use-pathways` on a Pathways enabled cluster (created with `--enable-pathways`)
582
+ Pathways workload can be submitted using `workload create-pathways` on a Pathways enabled cluster (created with `cluster create-pathways`)
321
583
 
322
584
  Pathways workload example:
323
585
  ```shell
324
- python3 xpk.py workload create \
586
+ python3 xpk.py workload create-pathways \
325
587
  --workload xpk-pw-test \
326
588
  --num-slices=1 \
327
589
  --tpu-type=v5litepod-16 \
328
- --use-pathways \
329
590
  --cluster xpk-pw-test \
330
591
  --docker-name='user-workload' \
331
592
  --docker-image=<maxtext docker image> \
332
593
  --command='python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
333
594
  ```
334
595
 
335
- Regular workload can also be submitted on a Pathways enabled cluster (created with `--enable-pathways`)
596
+ Regular workload can also be submitted on a Pathways enabled cluster (created with `cluster create-pathways`)
336
597
 
337
598
  Pathways workload example:
338
599
  ```shell
339
- python3 xpk.py workload create \
600
+ python3 xpk.py workload create-pathways \
340
601
  --workload xpk-regular-test \
341
602
  --num-slices=1 \
342
603
  --tpu-type=v5litepod-16 \
@@ -346,6 +607,25 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
346
607
  --command='python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
347
608
  ```
348
609
 
610
+ Pathways in headless mode - Pathways now offers the capability to run JAX workloads in Vertex AI notebooks or in GCE VMs!
611
+ Specify `--headless` with `workload create-pathways` when the user workload is not provided in a docker container.
612
+ ```shell
613
+ python3 xpk.py workload create-pathways --headless \
614
+ --workload xpk-pw-headless \
615
+ --num-slices=1 \
616
+ --tpu-type=v5litepod-16 \
617
+ --cluster xpk-pw-test
618
+ ```
619
+ Executing the command above would provide the address of the proxy that the user job should connect to.
620
+ ```shell
621
+ kubectl get pods
622
+ kubectl port-forward pod/<proxy-pod-name> 29000:29000
623
+ ```
624
+ ```shell
625
+ JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 python -c 'import pathwaysutils; import jax; print(jax.devices())'
626
+ ```
627
+ Specify `JAX_PLATFORMS=proxy` and `JAX_BACKEND_TARGET=<proxy address from above>` and `import pathwaysutils` to establish this connection between the user's JAX code and the Pathways proxy. Execute Pathways workloads interactively on Vertex AI notebooks!
628
+
349
629
  ### Set `max-restarts` for production jobs
350
630
 
351
631
  * `--max-restarts <value>`: By default, this is 0. This will restart the job ""
@@ -354,6 +634,22 @@ increase this to a large number, say 50. Real jobs can be interrupted due to
354
634
  hardware failures and software updates. We assume your job has implemented
355
635
  checkpointing so the job restarts near where it was interrupted.
356
636
 
637
+ ### Workloads for A3-Ultra and A3-Mega clusters (GPU machines)
638
+ To submit jobs on a cluster with A3 machines, run the below command. To create a cluster with A3 machines see [here](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines).
639
+ * For A3-Ultra: --device-type=h200-141gb-8
640
+ * For A3-Mega: --device-type=h100-mega-80gb-8
641
+
642
+ ```shell
643
+ python3 xpk.py workload create \
644
+ --workload=$WORKLOAD_NAME --command="echo goodbye" \
645
+ --cluster=$CLUSTER_NAME --device-type=h200-141gb-8 \
646
+ --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
647
+ --num-nodes=$WORKLOAD_NUM_NODES
648
+ ```
649
+ > The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 machines as well.
650
+
651
+ In order to run NCCL test on A3 Ultra machines check out [this guide](/examples/nccl/nccl.md).
652
+
357
653
  ### Workload Priority and Preemption
358
654
  * Set the priority level of your workload with `--priority=LEVEL`
359
655
 
@@ -491,9 +787,7 @@ Check out [MaxText example](https://github.com/google/maxtext/pull/570) on how t
491
787
  --cluster xpk-test --filter-by-job=$USER
492
788
  ```
493
789
 
494
- * Workload List supports waiting for the completion of a specific job. XPK will follow an existing job until it has finished or the `timeout`, if provided, has been reached and then list the job. If no `timeout` is specified, the default value is set to the max value, 1 week. You may also set `timeout=0` to poll the job once.
495
- (Note: `restart-on-user-code-failure` must be set
496
- when creating the workload otherwise the workload will always finish with `Completed` status.)
790
+ * Workload List supports waiting for the completion of a specific job. XPK will follow an existing job until it has finished or the `timeout`, if provided, has been reached and then list the job. If no `timeout` is specified, the default value is set to the max value, 1 week. You may also set `timeout=0` to poll the job once.
497
791
 
498
792
  Wait for a job to complete.
499
793
 
@@ -510,11 +804,37 @@ when creating the workload otherwise the workload will always finish with `Compl
510
804
  --timeout=300
511
805
  ```
512
806
 
513
- Return codes
514
- `0`: Workload finished and completed successfully.
515
- `124`: Timeout was reached before workload finished.
516
- `125`: Workload finished but did not complete successfully.
517
- `1`: Other failure.
807
+ Return codes
808
+ `0`: Workload finished and completed successfully.
809
+ `124`: Timeout was reached before workload finished.
810
+ `125`: Workload finished but did not complete successfully.
811
+ `1`: Other failure.
812
+
813
+ ## Job List
814
+
815
+ * Job List (see jobs submitted via batch command):
816
+
817
+ ```shell
818
+ python3 xpk.py job ls --cluster xpk-test
819
+ ```
820
+
821
+ * Example Job List Output:
822
+
823
+ ```
824
+ NAME PROFILE LOCAL QUEUE COMPLETIONS DURATION AGE
825
+ xpk-def-app-profile-slurm-74kbv xpk-def-app-profile 1/1 15s 17h
826
+ xpk-def-app-profile-slurm-brcsg xpk-def-app-profile 1/1 9s 3h56m
827
+ xpk-def-app-profile-slurm-kw99l xpk-def-app-profile 1/1 5s 3h54m
828
+ xpk-def-app-profile-slurm-x99nx xpk-def-app-profile 3/3 29s 17h
829
+ ```
830
+
831
+ ## Job Cancel
832
+
833
+ * Job Cancel (delete job submitted via batch command):
834
+
835
+ ```shell
836
+ python3 xpk.py job cancel xpk-def-app-profile-slurm-74kbv --cluster xpk-test
837
+ ```
518
838
 
519
839
  ## Inspector
520
840
  * Inspector provides debug info to understand cluster health, and why workloads are not running.
@@ -559,6 +879,35 @@ Inspector output is saved to a file.
559
879
  [XPK] Exiting XPK cleanly
560
880
  ```
561
881
 
882
+ ## Run
883
+ * `xpk run` lets you execute scripts on a cluster with ease. It automates task execution, handles interruptions, and streams job output to your console.
884
+
885
+ ```shell
886
+ python xpk.py run --kind-cluster -n 2 -t 0-2 examples/job.sh
887
+ ```
888
+
889
+ * Example Output:
890
+
891
+ ```shell
892
+ [XPK] Starting xpk
893
+ [XPK] Task: `get current-context` is implemented by `kubectl config current-context`, hiding output unless there is an error.
894
+ [XPK] No local cluster name specified. Using current-context `kind-kind`
895
+ [XPK] Task: `run task` is implemented by `kubectl kjob create slurm --profile xpk-def-app-profile --localqueue multislice-queue --wait --rm -- examples/job.sh --partition multislice-queue --ntasks 2 --time 0-2`. Streaming output and input live.
896
+ job.batch/xpk-def-app-profile-slurm-g4vr6 created
897
+ configmap/xpk-def-app-profile-slurm-g4vr6 created
898
+ service/xpk-def-app-profile-slurm-g4vr6 created
899
+ Starting log streaming for pod xpk-def-app-profile-slurm-g4vr6-1-4rmgk...
900
+ Now processing task ID: 3
901
+ Starting log streaming for pod xpk-def-app-profile-slurm-g4vr6-0-bg6dm...
902
+ Now processing task ID: 1
903
+ exit
904
+ exit
905
+ Now processing task ID: 2
906
+ exit
907
+ Job logs streaming finished.[XPK] Task: `run task` terminated with code `0`
908
+ [XPK] XPK Done.
909
+ ```
910
+
562
911
  ## GPU usage
563
912
 
564
913
  In order to use XPK for GPU, you can do so by using `device-type` flag.
@@ -971,6 +1320,14 @@ gcloud compute machine-types list --zones=$ZONE_LIST
971
1320
  python3 xpk.py cluster create --default-pool-cpu-machine-type=CPU_TYPE ...
972
1321
  ```
973
1322
 
1323
+ ## Workload creation fails
1324
+
1325
+ If workload creation fails with the error below, some XPK cluster configuration might be missing.
1326
+
1327
+ `[XPK] b'error: the server doesn\'t have a resource type "workloads"\n'`
1328
+
1329
+ Mitigate this error by re-running your `xpk.py cluster create ...` command, to refresh the cluster configurations.
1330
+
974
1331
  ## Permission Issues: `requires one of ["permission_name"] permission(s)`.
975
1332
 
976
1333
  1) Determine the role needed based on the permission error:
@@ -1031,6 +1388,11 @@ gcloud beta compute reservations list --project=$PROJECT_ID
1031
1388
  gcloud beta compute reservations describe $RESERVATION --project=$PROJECT_ID --zone=$ZONE
1032
1389
  ```
1033
1390
 
1391
+ ## 403 error on workload create when using `--base-docker-image` flag
1392
+ You need authorization to push to the registry from your local machine. Try running `gcloud auth configure-docker`.
1393
+ ## `Kubernetes API exception` - 404 error
1394
+ If an error of this kind appears after updating the xpk version, you may need to rerun the `cluster create` command in order to update the resource definitions.
1395
+
1034
1396
  # TPU Workload Debugging
1035
1397
 
1036
1398
  ## Verbose Logging
@@ -1072,3 +1434,65 @@ To explore the stack traces collected in a temporary directory in Kubernetes Pod
1072
1434
  --workload xpk-test-workload --command "python3 main.py" --cluster \
1073
1435
  xpk-test --tpu-type=v5litepod-16 --deploy-stacktrace-sidecar
1074
1436
  ```
1437
+
1438
+ ### Get information about jobs, queues and resources.
1439
+
1440
+ To list available resources and queues, use the ```xpk info``` command. It allows you to see localqueues and clusterqueues and check for available resources.
1441
+
1442
+ To see queues with usage and workload info use:
1443
+ ```shell
1444
+ python3 xpk.py info --cluster my-cluster
1445
+ ```
1446
+
1447
+ You can specify what kind of resources(clusterqueue or localqueue) you want to see using flags --clusterqueue or --localqueue.
1448
+ ```shell
1449
+ python3 xpk.py info --cluster my-cluster --localqueue
1450
+ ```
1451
+
1452
+ # Local testing with Kind
1453
+
1454
+ To facilitate development and testing locally, we have integrated support for testing with `kind`. This enables you to simulate a Kubernetes environment on your local machine.
1455
+
1456
+ ## Prerequisites
1457
+
1458
+ - Install kind on your local machine. Follow the official documentation here: [Kind Installation Guide.](https://kind.sigs.k8s.io/docs/user/quick-start#installation)
1459
+
1460
+ ## Usage
1461
+
1462
+ xpk interfaces seamlessly with kind to manage Kubernetes clusters locally, facilitating the orchestration and management of workloads. Below are the commands for managing clusters:
1463
+
1464
+ ### Cluster Create
1465
+ * Cluster create:
1466
+
1467
+ ```shell
1468
+ python3 xpk.py kind create \
1469
+ --cluster xpk-test
1470
+ ```
1471
+
1472
+ ### Cluster Delete
1473
+ * Cluster Delete:
1474
+
1475
+ ```shell
1476
+ python3 xpk.py kind delete \
1477
+ --cluster xpk-test
1478
+ ```
1479
+
1480
+ ### Cluster List
1481
+ * Cluster List:
1482
+
1483
+ ```shell
1484
+ python3 xpk.py kind list
1485
+ ```
1486
+
1487
+ ## Local Testing Basics
1488
+
1489
+ Local testing is available exclusively through the `batch` and `job` commands of xpk with the `--kind-cluster` flag. This allows you to simulate training jobs locally:
1490
+
1491
+ ```shell
1492
+ python xpk.py batch [other-options] --kind-cluster script
1493
+ ```
1494
+
1495
+ Please note that all other xpk subcommands are intended for use with cloud systems on Google Compute Engine (GCE) and don't support local testing. This includes commands like cluster, info, inspector, etc.
1496
+
1497
+ # Other advanced usage
1498
+ [Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)