xpk 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +109 -0
- xpk/commands/cluster.py +784 -0
- xpk/commands/cluster_gcluster.py +185 -0
- xpk/commands/info.py +245 -0
- xpk/commands/inspector.py +363 -0
- xpk/commands/job.py +197 -0
- xpk/commands/kind.py +253 -0
- xpk/commands/shell.py +120 -0
- xpk/commands/version.py +39 -0
- xpk/commands/workload.py +692 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +61 -0
- xpk/core/blueprint/blueprint_generator.py +652 -0
- xpk/core/cluster_private.py +197 -0
- xpk/core/commands.py +352 -0
- xpk/core/core.py +2824 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/gcluster_manager.py +158 -0
- xpk/core/kjob.py +205 -0
- xpk/core/kueue.py +352 -0
- xpk/core/nap.py +349 -0
- xpk/core/pathways.py +298 -0
- xpk/core/ray.py +222 -0
- xpk/core/system_characteristics.py +1395 -0
- xpk/core/workload.py +133 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +109 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
- xpk/main.py +73 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +184 -0
- xpk/parser/cluster.py +621 -0
- xpk/parser/common.py +71 -0
- xpk/parser/core.py +109 -0
- xpk/parser/info.py +63 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +126 -0
- xpk/parser/kind.py +94 -0
- xpk/parser/shell.py +50 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +684 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +85 -0
- xpk/utils/yaml.py +30 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/METADATA +307 -38
- xpk-0.6.0.dist-info/RECORD +57 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
- xpk-0.6.0.dist-info/entry_points.txt +2 -0
- xpk-0.4.0.dist-info/RECORD +0 -7
- xpk-0.4.0.dist-info/entry_points.txt +0 -2
- xpk.py +0 -7218
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
|
-
Author-email:
|
|
5
|
+
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
7
7
|
Project-URL: Homepage, https://github.com/google/xpk
|
|
8
8
|
Project-URL: Bug Tracker, https://github.com/google/xpk/issues
|
|
@@ -12,10 +12,17 @@ Requires-Python: >=3.10
|
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
Requires-Dist: cloud-accelerator-diagnostics
|
|
15
|
+
Requires-Dist: tabulate
|
|
16
|
+
Requires-Dist: ruamel.yaml
|
|
17
|
+
Requires-Dist: pyyaml
|
|
18
|
+
Requires-Dist: docker
|
|
19
|
+
Requires-Dist: packaging
|
|
15
20
|
Provides-Extra: dev
|
|
16
|
-
Requires-Dist: pyink
|
|
17
|
-
Requires-Dist: pylint
|
|
18
|
-
Requires-Dist: pre-commit
|
|
21
|
+
Requires-Dist: pyink==24.3.0; extra == "dev"
|
|
22
|
+
Requires-Dist: pylint>=2.6.0; extra == "dev"
|
|
23
|
+
Requires-Dist: pre-commit; extra == "dev"
|
|
24
|
+
Requires-Dist: pytest; extra == "dev"
|
|
25
|
+
Requires-Dist: docker; extra == "dev"
|
|
19
26
|
|
|
20
27
|
<!--
|
|
21
28
|
Copyright 2023 Google LLC
|
|
@@ -62,31 +69,73 @@ xpk supports the following TPU types:
|
|
|
62
69
|
* v4
|
|
63
70
|
* v5e
|
|
64
71
|
* v5p
|
|
72
|
+
* Trillium (v6e)
|
|
65
73
|
|
|
66
74
|
and the following GPU types:
|
|
67
|
-
*
|
|
68
|
-
* h100
|
|
75
|
+
* A100
|
|
76
|
+
* A3-Highgpu (h100)
|
|
77
|
+
* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines)
|
|
78
|
+
* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines)
|
|
69
79
|
|
|
70
80
|
and the following CPU types:
|
|
71
81
|
* n2-standard-32
|
|
72
82
|
|
|
83
|
+
# Cloud Console Permissions on the user or service account needed to run XPK:
|
|
84
|
+
|
|
85
|
+
* Artifact Registry Writer
|
|
86
|
+
* Compute Admin
|
|
87
|
+
* Kubernetes Engine Admin
|
|
88
|
+
* Logging Admin
|
|
89
|
+
* Monitoring Admin
|
|
90
|
+
* Service Account User
|
|
91
|
+
* Storage Admin
|
|
92
|
+
* Vertex AI Administrator
|
|
93
|
+
|
|
94
|
+
# Prerequisites
|
|
95
|
+
|
|
96
|
+
Following tools must be installed:
|
|
97
|
+
|
|
98
|
+
- python >= 3.10 (download from [here](https://www.python.org/downloads/))
|
|
99
|
+
- pip ([installation instruction](https://pip.pypa.io/en/stable/installation/))
|
|
100
|
+
- python venv ([installation instruction](https://virtualenv.pypa.io/en/latest/installation.html))
|
|
101
|
+
(all three of the above can be installed at once from [here](https://packaging.python.org/en/latest/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers))
|
|
102
|
+
- gcloud (install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the))
|
|
103
|
+
- Run `gcloud init`
|
|
104
|
+
- [Authenticate](https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login) to Google Cloud
|
|
105
|
+
- kubectl (install from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl))
|
|
106
|
+
- Install `gke-gcloud-auth-plugin` from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_plugin)
|
|
107
|
+
- docker ([installation instruction](https://docs.docker.com/engine/install/))
|
|
108
|
+
- Run `gcloud auth configure-docker` to ensure images can be uploaded to registry
|
|
109
|
+
- make - please run below command.
|
|
110
|
+
```shell
|
|
111
|
+
# sudo may be required
|
|
112
|
+
apt-get -y install make
|
|
113
|
+
```
|
|
114
|
+
In addition, below dependencies will be installed with `make install` command:
|
|
115
|
+
- kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
|
|
116
|
+
- kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
|
|
117
|
+
|
|
73
118
|
# Installation
|
|
74
|
-
To install xpk, run the following command:
|
|
119
|
+
To install xpk, run the following command and install additional tools, mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all necessary tools:
|
|
75
120
|
|
|
76
121
|
```shell
|
|
77
122
|
pip install xpk
|
|
78
123
|
```
|
|
79
124
|
|
|
125
|
+
|
|
80
126
|
If you are running XPK by cloning GitHub repository, first run the
|
|
81
127
|
following commands to begin using XPK commands:
|
|
82
128
|
|
|
83
129
|
```shell
|
|
84
130
|
git clone https://github.com/google/xpk.git
|
|
85
131
|
cd xpk
|
|
86
|
-
# Install dependencies
|
|
87
|
-
|
|
132
|
+
# Install required dependencies with make
|
|
133
|
+
make install && export PATH=$PATH:$PWD/bin
|
|
88
134
|
```
|
|
89
135
|
|
|
136
|
+
If you want the installed dependencies to persist in your PATH, please run:
|
|
137
|
+
`echo $PWD/bin` and add its value to `PATH` in .bashrc or .zshrc
|
|
138
|
+
|
|
90
139
|
If you see an error saying: `This environment is externally managed`, please use a virtual environment.
|
|
91
140
|
|
|
92
141
|
Example:
|
|
@@ -100,8 +149,8 @@ Example:
|
|
|
100
149
|
## Clone the repository and installing dependencies.
|
|
101
150
|
git clone https://github.com/google/xpk.git
|
|
102
151
|
cd xpk
|
|
103
|
-
# Install dependencies
|
|
104
|
-
|
|
152
|
+
# Install required dependencies with make
|
|
153
|
+
make install && export PATH=$PATH:$PWD/bin
|
|
105
154
|
```
|
|
106
155
|
|
|
107
156
|
# XPK for Large Scale (>1k VMs)
|
|
@@ -139,14 +188,6 @@ gcloud config set compute/zone $ZONE
|
|
|
139
188
|
xpk .. --zone $ZONE --project $PROJECT_ID
|
|
140
189
|
```
|
|
141
190
|
|
|
142
|
-
`Cluster Create` command will create a project-specific Service Account. Note that only one service
|
|
143
|
-
account will be created per project. This service account will be attached to the node pools instead of default
|
|
144
|
-
[Compute Engine Service Account](https://cloud.google.com/compute/docs/access/service-accounts#default_service_account).
|
|
145
|
-
All the required permissions will be assigned to this service account by XPK. Make sure you have
|
|
146
|
-
[Service Account Admin](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountAdmin) and
|
|
147
|
-
[Project IAM Admin](https://cloud.google.com/iam/docs/understanding-roles#resourcemanager.projectIamAdmin)
|
|
148
|
-
roles assigned to your user account.
|
|
149
|
-
|
|
150
191
|
The cluster created is a regional cluster to enable the GKE control plane across
|
|
151
192
|
all zones.
|
|
152
193
|
|
|
@@ -179,13 +220,22 @@ all zones.
|
|
|
179
220
|
```
|
|
180
221
|
|
|
181
222
|
* Cluster Create for Pathways:
|
|
182
|
-
Pathways compatible cluster can be created using
|
|
223
|
+
Pathways compatible cluster can be created using `cluster create-pathways`.
|
|
183
224
|
```shell
|
|
184
|
-
python3 xpk.py cluster create \
|
|
225
|
+
python3 xpk.py cluster create-pathways \
|
|
185
226
|
--cluster xpk-pw-test \
|
|
186
227
|
--num-slices=4 --on-demand \
|
|
187
|
-
--tpu-type=v5litepod-16
|
|
188
|
-
|
|
228
|
+
--tpu-type=v5litepod-16
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
* Cluster Create for Ray:
|
|
232
|
+
A cluster with KubeRay enabled and a RayCluster can be created using `cluster create-ray`.
|
|
233
|
+
```shell
|
|
234
|
+
python3 xpk.py cluster create-ray \
|
|
235
|
+
--cluster xpk-rc-test \
|
|
236
|
+
--ray-version=2.39.0 \
|
|
237
|
+
--num-slices=4 --on-demand \
|
|
238
|
+
--tpu-type=v5litepod-8
|
|
189
239
|
```
|
|
190
240
|
|
|
191
241
|
* Cluster Create can be called again with the same `--cluster name` to modify
|
|
@@ -222,11 +272,77 @@ all zones.
|
|
|
222
272
|
python3 xpk.py cluster create --force \
|
|
223
273
|
--cluster xpk-test --tpu-type=v5litepod-16 \
|
|
224
274
|
--num-slices=6 --reservation=$RESERVATION_ID
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
and recreates the cluster with 4 slices of v4-8. The command will rerun to delete
|
|
278
|
+
6 slices of v5litepod-16 and create 4 slices of v4-8. The command will warn the
|
|
279
|
+
user when deleting slices. Use `--force` to skip prompts.
|
|
225
280
|
|
|
281
|
+
```shell
|
|
282
|
+
python3 xpk.py cluster create \
|
|
283
|
+
--cluster xpk-test --tpu-type=v4-8 \
|
|
284
|
+
--num-slices=4 --reservation=$RESERVATION_ID
|
|
285
|
+
|
|
286
|
+
# Skip delete prompts using --force.
|
|
287
|
+
|
|
288
|
+
python3 xpk.py cluster create --force \
|
|
289
|
+
--cluster xpk-test --tpu-type=v4-8 \
|
|
290
|
+
--num-slices=4 --reservation=$RESERVATION_ID
|
|
226
291
|
```
|
|
227
292
|
|
|
293
|
+
### Create Private Cluster
|
|
294
|
+
|
|
295
|
+
XPK allows you to create a private GKE cluster for enhanced security. In a private cluster, nodes and pods are isolated from the public internet, providing an additional layer of protection for your workloads.
|
|
296
|
+
|
|
297
|
+
To create a private cluster, use the following arguments:
|
|
298
|
+
|
|
299
|
+
**`--private`**
|
|
300
|
+
|
|
301
|
+
This flag enables the creation of a private GKE cluster. When this flag is set:
|
|
302
|
+
|
|
303
|
+
* Nodes and pods are isolated from the direct internet access.
|
|
304
|
+
* `master_authorized_networks` is automatically enabled.
|
|
305
|
+
* Access to the cluster's control plane is restricted to your current machine's IP address by default.
|
|
306
|
+
|
|
307
|
+
**`--authorized-networks`**
|
|
308
|
+
|
|
309
|
+
This argument allows you to specify additional IP ranges (in CIDR notation) that are authorized to access the private cluster's control plane and perform `kubectl` commands.
|
|
310
|
+
|
|
311
|
+
* Even if this argument is not set when you have `--private`, your current machine's IP address will always be given access to the control plane.
|
|
312
|
+
* If this argument is used with an existing private cluster, it will replace the existing authorized networks.
|
|
313
|
+
|
|
314
|
+
**Example Usage:**
|
|
315
|
+
|
|
316
|
+
* To create a private cluster and allow access to Control Plane only to your current machine:
|
|
317
|
+
|
|
318
|
+
```shell
|
|
319
|
+
python3 xpk.py cluster create \
|
|
320
|
+
--cluster=xpk-private-cluster \
|
|
321
|
+
--tpu-type=v4-8 --num-slices=2 \
|
|
322
|
+
--private
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
* To create a private cluster and allow access to Control Plane only to your current machine and the IP ranges `1.2.3.0/24` and `1.2.4.5/32`:
|
|
326
|
+
|
|
327
|
+
```shell
|
|
328
|
+
python3 xpk.py cluster create \
|
|
329
|
+
--cluster=xpk-private-cluster \
|
|
330
|
+
--tpu-type=v4-8 --num-slices=2 \
|
|
331
|
+
--authorized-networks 1.2.3.0/24 1.2.4.5/32
|
|
332
|
+
|
|
333
|
+
# --private is optional when you set --authorized-networks
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
> **Important Notes:**
|
|
337
|
+
> * The argument `--private` is only applicable when creating new clusters. You cannot convert an existing public cluster to a private cluster using these flags.
|
|
338
|
+
> * The argument `--authorized-networks` is applicable when creating new clusters or using an existing _*private*_ cluster. You cannot convert an existing public cluster to a private cluster using these flags.
|
|
339
|
+
> * You need to [set up a Cluster NAT for your VPC network](https://cloud.google.com/nat/docs/set-up-manage-network-address-translation#creating_nat) so that the Nodes and Pods have outbound access to the internet. This is required because XPK installs and configures components such as kueue that need access to external sources like `registry.k8s.io`.
|
|
340
|
+
|
|
341
|
+
|
|
228
342
|
### Create Vertex AI Tensorboard
|
|
229
|
-
*Note: This feature is available in XPK >= 0.4.0. Enable [Vertex AI API](https://cloud.google.com/vertex-ai/docs/start/cloud-environment#enable_vertexai_apis) in your Google Cloud console to use this feature
|
|
343
|
+
*Note: This feature is available in XPK >= 0.4.0. Enable [Vertex AI API](https://cloud.google.com/vertex-ai/docs/start/cloud-environment#enable_vertexai_apis) in your Google Cloud console to use this feature. Make sure you have
|
|
344
|
+
[Vertex AI Administrator](https://cloud.google.com/vertex-ai/docs/general/access-control#aiplatform.admin) role
|
|
345
|
+
assigned to your user account.*
|
|
230
346
|
|
|
231
347
|
Vertex AI Tensorboard is a fully managed version of open-source Tensorboard. To learn more about Vertex AI Tensorboard, visit [this](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-introduction). Note that Vertex AI Tensorboard is only available in [these](https://cloud.google.com/vertex-ai/docs/general/locations#available-regions) regions.
|
|
232
348
|
|
|
@@ -312,6 +428,26 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
|
|
|
312
428
|
--tpu-type=v5litepod-16
|
|
313
429
|
```
|
|
314
430
|
|
|
431
|
+
## Provisioning A3-Ultra and A3-Mega clusters (GPU machines)
|
|
432
|
+
To create a cluster with A3 machines, run the below command. To create workloads on these clusters see [here](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines).
|
|
433
|
+
* For A3-Ultra: --device-type=h200-141gb-8
|
|
434
|
+
* For A3-Mega: --device-type=h100-mega-80gb-8
|
|
435
|
+
|
|
436
|
+
```shell
|
|
437
|
+
python3 xpk.py cluster create \
|
|
438
|
+
--cluster CLUSTER_NAME --device-type=h200-141gb-8 \
|
|
439
|
+
--zone=$COMPUTE_ZONE --project=$PROJECT_ID \
|
|
440
|
+
--num-nodes=4 --reservation=$RESERVATION_ID
|
|
441
|
+
```
|
|
442
|
+
Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra machines:
|
|
443
|
+
* --num-nodes
|
|
444
|
+
* --default-pool-cpu-machine-type
|
|
445
|
+
* --default-pool-cpu-num-nodes
|
|
446
|
+
* --reservation
|
|
447
|
+
* --spot
|
|
448
|
+
* --on-demand (only A3-Mega)
|
|
449
|
+
|
|
450
|
+
|
|
315
451
|
## Workload Create
|
|
316
452
|
* Workload Create (submit training job):
|
|
317
453
|
|
|
@@ -323,26 +459,25 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
|
|
|
323
459
|
```
|
|
324
460
|
|
|
325
461
|
* Workload Create for Pathways:
|
|
326
|
-
Pathways workload can be submitted using
|
|
462
|
+
Pathways workload can be submitted using `workload create-pathways` on a Pathways enabled cluster (created with `cluster create-pathways`)
|
|
327
463
|
|
|
328
464
|
Pathways workload example:
|
|
329
465
|
```shell
|
|
330
|
-
python3 xpk.py workload create \
|
|
466
|
+
python3 xpk.py workload create-pathways \
|
|
331
467
|
--workload xpk-pw-test \
|
|
332
468
|
--num-slices=1 \
|
|
333
469
|
--tpu-type=v5litepod-16 \
|
|
334
|
-
--use-pathways \
|
|
335
470
|
--cluster xpk-pw-test \
|
|
336
471
|
--docker-name='user-workload' \
|
|
337
472
|
--docker-image=<maxtext docker image> \
|
|
338
473
|
--command='python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
|
|
339
474
|
```
|
|
340
475
|
|
|
341
|
-
Regular workload can also be submitted on a Pathways enabled cluster (created with
|
|
476
|
+
Regular workload can also be submitted on a Pathways enabled cluster (created with `cluster create-pathways`)
|
|
342
477
|
|
|
343
478
|
Pathways workload example:
|
|
344
479
|
```shell
|
|
345
|
-
python3 xpk.py workload create \
|
|
480
|
+
python3 xpk.py workload create-pathways \
|
|
346
481
|
--workload xpk-regular-test \
|
|
347
482
|
--num-slices=1 \
|
|
348
483
|
--tpu-type=v5litepod-16 \
|
|
@@ -352,6 +487,25 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
|
|
|
352
487
|
--command='python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=<output directory> dataset_path=<dataset path> per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1'
|
|
353
488
|
```
|
|
354
489
|
|
|
490
|
+
Pathways in headless mode - Pathways now offers the capability to run JAX workloads in Vertex AI notebooks or in GCE VMs!
|
|
491
|
+
Specify `--headless` with `workload create-pathways` when the user workload is not provided in a docker container.
|
|
492
|
+
```shell
|
|
493
|
+
python3 xpk.py workload create-pathways --headless \
|
|
494
|
+
--workload xpk-pw-headless \
|
|
495
|
+
--num-slices=1 \
|
|
496
|
+
--tpu-type=v5litepod-16 \
|
|
497
|
+
--cluster xpk-pw-test
|
|
498
|
+
```
|
|
499
|
+
Executing the command above would provide the address of the proxy that the user job should connect to.
|
|
500
|
+
```shell
|
|
501
|
+
kubectl get pods
|
|
502
|
+
kubectl port-forward pod/<proxy-pod-name> 29000:29000
|
|
503
|
+
```
|
|
504
|
+
```shell
|
|
505
|
+
JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 python -c 'import pathwaysutils; import jax; print(jax.devices())'
|
|
506
|
+
```
|
|
507
|
+
Specify `JAX_PLATFORMS=proxy` and `JAX_BACKEND_TARGET=<proxy address from above>` and `import pathwaysutils` to establish this connection between the user's JAX code and the Pathways proxy. Execute Pathways workloads interactively on Vertex AI notebooks!
|
|
508
|
+
|
|
355
509
|
### Set `max-restarts` for production jobs
|
|
356
510
|
|
|
357
511
|
* `--max-restarts <value>`: By default, this is 0. This will restart the job ""
|
|
@@ -360,6 +514,20 @@ increase this to a large number, say 50. Real jobs can be interrupted due to
|
|
|
360
514
|
hardware failures and software updates. We assume your job has implemented
|
|
361
515
|
checkpointing so the job restarts near where it was interrupted.
|
|
362
516
|
|
|
517
|
+
### Workloads for A3-Ultra and A3-Mega clusters (GPU machines)
|
|
518
|
+
To submit jobs on a cluster with A3 machines, run the below command. To create a cluster with A3 machines see [here](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines).
|
|
519
|
+
* For A3-Ultra: --device-type=h200-141gb-8
|
|
520
|
+
* For A3-Mega: --device-type=h100-mega-80gb-8
|
|
521
|
+
|
|
522
|
+
```shell
|
|
523
|
+
python3 xpk.py workload create \
|
|
524
|
+
--workload=$WORKLOAD_NAME --command="echo goodbye" \
|
|
525
|
+
--cluster=$CLUSTER_NAME --device-type=h200-141gb-8 \
|
|
526
|
+
--zone=$COMPUTE_ZONE --project=$PROJECT_ID \
|
|
527
|
+
--num-nodes=$WORKLOAD_NUM_NODES
|
|
528
|
+
```
|
|
529
|
+
> The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 machines as well.
|
|
530
|
+
|
|
363
531
|
### Workload Priority and Preemption
|
|
364
532
|
* Set the priority level of your workload with `--priority=LEVEL`
|
|
365
533
|
|
|
@@ -386,7 +554,9 @@ checkpointing so the job restarts near where it was interrupted.
|
|
|
386
554
|
```
|
|
387
555
|
|
|
388
556
|
### Create Vertex AI Experiment to upload data to Vertex AI Tensorboard
|
|
389
|
-
*Note: This feature is available in XPK >= 0.4.0. Enable [Vertex AI API](https://cloud.google.com/vertex-ai/docs/start/cloud-environment#enable_vertexai_apis) in your Google Cloud console to use this feature
|
|
557
|
+
*Note: This feature is available in XPK >= 0.4.0. Enable [Vertex AI API](https://cloud.google.com/vertex-ai/docs/start/cloud-environment#enable_vertexai_apis) in your Google Cloud console to use this feature. Make sure you have
|
|
558
|
+
[Vertex AI Administrator](https://cloud.google.com/vertex-ai/docs/general/access-control#aiplatform.admin) role
|
|
559
|
+
assigned to your user account and to the [Compute Engine Service account](https://cloud.google.com/compute/docs/access/service-accounts#default_service_account) attached to the node pools in the cluster.*
|
|
390
560
|
|
|
391
561
|
Vertex AI Experiment is a tool that helps to track and analyze an experiment run on Vertex AI Tensorboard. To learn more about Vertex AI Experiments, visit [this](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments).
|
|
392
562
|
|
|
@@ -495,7 +665,7 @@ Check out [MaxText example](https://github.com/google/maxtext/pull/570) on how t
|
|
|
495
665
|
--cluster xpk-test --filter-by-job=$USER
|
|
496
666
|
```
|
|
497
667
|
|
|
498
|
-
* Workload List supports waiting for the completion of a specific job. XPK will follow an existing job until it has finished or the `timeout`, if provided, has been reached and then list the job. If no `timeout` is specified, the default value is set to the max value, 1 week. You may also set `timeout=0` to poll the job once.
|
|
668
|
+
* Workload List supports waiting for the completion of a specific job. XPK will follow an existing job until it has finished or the `timeout`, if provided, has been reached and then list the job. If no `timeout` is specified, the default value is set to the max value, 1 week. You may also set `timeout=0` to poll the job once.
|
|
499
669
|
(Note: `restart-on-user-code-failure` must be set
|
|
500
670
|
when creating the workload otherwise the workload will always finish with `Completed` status.)
|
|
501
671
|
|
|
@@ -514,11 +684,37 @@ when creating the workload otherwise the workload will always finish with `Compl
|
|
|
514
684
|
--timeout=300
|
|
515
685
|
```
|
|
516
686
|
|
|
517
|
-
Return codes
|
|
518
|
-
`0`: Workload finished and completed successfully.
|
|
519
|
-
`124`: Timeout was reached before workload finished.
|
|
520
|
-
`125`: Workload finished but did not complete successfully.
|
|
521
|
-
`1`: Other failure.
|
|
687
|
+
Return codes
|
|
688
|
+
`0`: Workload finished and completed successfully.
|
|
689
|
+
`124`: Timeout was reached before workload finished.
|
|
690
|
+
`125`: Workload finished but did not complete successfully.
|
|
691
|
+
`1`: Other failure.
|
|
692
|
+
|
|
693
|
+
## Job List
|
|
694
|
+
|
|
695
|
+
* Job List (see jobs submitted via batch command):
|
|
696
|
+
|
|
697
|
+
```shell
|
|
698
|
+
python3 xpk.py job ls --cluster xpk-test
|
|
699
|
+
```
|
|
700
|
+
|
|
701
|
+
* Example Job List Output:
|
|
702
|
+
|
|
703
|
+
```
|
|
704
|
+
NAME PROFILE LOCAL QUEUE COMPLETIONS DURATION AGE
|
|
705
|
+
xpk-def-app-profile-slurm-74kbv xpk-def-app-profile 1/1 15s 17h
|
|
706
|
+
xpk-def-app-profile-slurm-brcsg xpk-def-app-profile 1/1 9s 3h56m
|
|
707
|
+
xpk-def-app-profile-slurm-kw99l xpk-def-app-profile 1/1 5s 3h54m
|
|
708
|
+
xpk-def-app-profile-slurm-x99nx xpk-def-app-profile 3/3 29s 17h
|
|
709
|
+
```
|
|
710
|
+
|
|
711
|
+
## Job Cancel
|
|
712
|
+
|
|
713
|
+
* Job Cancel (delete job submitted via batch command):
|
|
714
|
+
|
|
715
|
+
```shell
|
|
716
|
+
python3 xpk.py job cancel xpk-def-app-profile-slurm-74kbv --cluster xpk-test
|
|
717
|
+
```
|
|
522
718
|
|
|
523
719
|
## Inspector
|
|
524
720
|
* Inspector provides debug info to understand cluster health, and why workloads are not running.
|
|
@@ -975,6 +1171,14 @@ gcloud compute machine-types list --zones=$ZONE_LIST
|
|
|
975
1171
|
python3 xpk.py cluster create --default-pool-cpu-machine-type=CPU_TYPE ...
|
|
976
1172
|
```
|
|
977
1173
|
|
|
1174
|
+
## Workload creation fails
|
|
1175
|
+
|
|
1176
|
+
Some XPK cluster configuration might be missing, if workload creation fails with the below error.
|
|
1177
|
+
|
|
1178
|
+
`[XPK] b'error: the server doesn\'t have a resource type "workloads"\n'`
|
|
1179
|
+
|
|
1180
|
+
Mitigate this error by re-running your `xpk.py cluster create ...` command, to refresh the cluster configurations.
|
|
1181
|
+
|
|
978
1182
|
## Permission Issues: `requires one of ["permission_name"] permission(s)`.
|
|
979
1183
|
|
|
980
1184
|
1) Determine the role needed based on the permission error:
|
|
@@ -1035,6 +1239,9 @@ gcloud beta compute reservations list --project=$PROJECT_ID
|
|
|
1035
1239
|
gcloud beta compute reservations describe $RESERVATION --project=$PROJECT_ID --zone=$ZONE
|
|
1036
1240
|
```
|
|
1037
1241
|
|
|
1242
|
+
## 403 error on workload create when using `--base-docker-image` flag
|
|
1243
|
+
You need authority to push to the registry from your local machine. Try running `gcloud auth configure-docker`.
|
|
1244
|
+
|
|
1038
1245
|
# TPU Workload Debugging
|
|
1039
1246
|
|
|
1040
1247
|
## Verbose Logging
|
|
@@ -1076,3 +1283,65 @@ To explore the stack traces collected in a temporary directory in Kubernetes Pod
|
|
|
1076
1283
|
--workload xpk-test-workload --command "python3 main.py" --cluster \
|
|
1077
1284
|
xpk-test --tpu-type=v5litepod-16 --deploy-stacktrace-sidecar
|
|
1078
1285
|
```
|
|
1286
|
+
|
|
1287
|
+
### Get information about jobs, queues and resources.
|
|
1288
|
+
|
|
1289
|
+
To list available resources and queues use the ```xpk info``` command. It allows you to see localqueues and clusterqueues and check for available resources.
|
|
1290
|
+
|
|
1291
|
+
To see queues with usage and workload info use:
|
|
1292
|
+
```shell
|
|
1293
|
+
python3 xpk.py info --cluster my-cluster
|
|
1294
|
+
```
|
|
1295
|
+
|
|
1296
|
+
You can specify what kind of resources (clusterqueue or localqueue) you want to see using the flags --clusterqueue or --localqueue.
|
|
1297
|
+
```shell
|
|
1298
|
+
python3 xpk.py info --cluster my-cluster --localqueue
|
|
1299
|
+
```
|
|
1300
|
+
|
|
1301
|
+
# Local testing with Kind
|
|
1302
|
+
|
|
1303
|
+
To facilitate development and testing locally, we have integrated support for testing with `kind`. This enables you to simulate a Kubernetes environment on your local machine.
|
|
1304
|
+
|
|
1305
|
+
## Prerequisites
|
|
1306
|
+
|
|
1307
|
+
- Install kind on your local machine. Follow the official documentation here: [Kind Installation Guide.](https://kind.sigs.k8s.io/docs/user/quick-start#installation)
|
|
1308
|
+
|
|
1309
|
+
## Usage
|
|
1310
|
+
|
|
1311
|
+
xpk interfaces seamlessly with kind to manage Kubernetes clusters locally, facilitating the orchestration and management of workloads. Below are the commands for managing clusters:
|
|
1312
|
+
|
|
1313
|
+
### Cluster Create
|
|
1314
|
+
* Cluster create:
|
|
1315
|
+
|
|
1316
|
+
```shell
|
|
1317
|
+
python3 xpk.py kind create \
|
|
1318
|
+
--cluster xpk-test
|
|
1319
|
+
```
|
|
1320
|
+
|
|
1321
|
+
### Cluster Delete
|
|
1322
|
+
* Cluster Delete:
|
|
1323
|
+
|
|
1324
|
+
```shell
|
|
1325
|
+
python3 xpk.py kind delete \
|
|
1326
|
+
--cluster xpk-test
|
|
1327
|
+
```
|
|
1328
|
+
|
|
1329
|
+
### Cluster List
|
|
1330
|
+
* Cluster List:
|
|
1331
|
+
|
|
1332
|
+
```shell
|
|
1333
|
+
python3 xpk.py kind list
|
|
1334
|
+
```
|
|
1335
|
+
|
|
1336
|
+
## Local Testing Basics
|
|
1337
|
+
|
|
1338
|
+
Local testing is available exclusively through the `batch` and `job` commands of xpk with the `--kind-cluster` flag. This allows you to simulate training jobs locally:
|
|
1339
|
+
|
|
1340
|
+
```shell
|
|
1341
|
+
python xpk.py batch [other-options] --kind-cluster script
|
|
1342
|
+
```
|
|
1343
|
+
|
|
1344
|
+
Please note that all other xpk subcommands are intended for use with cloud systems on Google Compute Engine (GCE) and don't support local testing. This includes commands like cluster, info, inspector, etc.
|
|
1345
|
+
|
|
1346
|
+
# Other advanced usage
|
|
1347
|
+
[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
xpk/__init__.py,sha256=7mu-VQDQMyxM5To0KOhuYe4y2TYGsEkfV7hXZmUyih4,561
|
|
2
|
+
xpk/main.py,sha256=GicnuO9qhWiMZKUHZsKX1xB0XBrdC1c_PSwGYKJtUc8,2274
|
|
3
|
+
xpk/commands/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
4
|
+
xpk/commands/batch.py,sha256=4eiqNv35UqLoUnY-TyTVGGAAF0iShhwydHbHXeQdhfw,3004
|
|
5
|
+
xpk/commands/cluster.py,sha256=BlkoVf_2uFwgid4aa3qEukVx-OWT-ucgiiA9UyWNedA,24091
|
|
6
|
+
xpk/commands/cluster_gcluster.py,sha256=EspH5Z2qrqJAD-LL2JPjWaPiEk4o3cr123JfKfQGXU8,6427
|
|
7
|
+
xpk/commands/info.py,sha256=_HNA9lCV31qHz7mIxzYwhna1pVP7wfaweYccQz2qI2w,7345
|
|
8
|
+
xpk/commands/inspector.py,sha256=CODeHF2cMGU3T9ZWiBlxPHRehl2tsBaP5-enZzmpdI8,12168
|
|
9
|
+
xpk/commands/job.py,sha256=DZ79T9-8VE1U5jpOkzoQhsygPa54QsdwAl8SaPbvtGU,5496
|
|
10
|
+
xpk/commands/kind.py,sha256=yI3Szi7k1jkqTUAaarebFG43XI75Fj9yMvLJ-n6IzHI,6708
|
|
11
|
+
xpk/commands/shell.py,sha256=qabHDmec-iu1b2o5ju-ETIxR9eil_jyr_kNiIgTak2w,3704
|
|
12
|
+
xpk/commands/version.py,sha256=qAguh5pMWeLhArKhGT3MpvL6zSHOufjdiGR5Fazdv00,1146
|
|
13
|
+
xpk/commands/workload.py,sha256=aTmKdy8aX14mUmfZnqkukpTd5aN1M8SCJ50dmN5_ZMs,23661
|
|
14
|
+
xpk/core/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
15
|
+
xpk/core/cluster_private.py,sha256=aGNBpQ3oKixiKmppLkZmC8sADPUBZ00OxrzFLWY6GQ4,6634
|
|
16
|
+
xpk/core/commands.py,sha256=ZNpcSpNceRIO6d0rCiQjP-m15fkRRLutrjUdB-JRehc,10586
|
|
17
|
+
xpk/core/core.py,sha256=QUYP0dEdThjwaYjUOz2UIVWTo_6HRsUC_3hHBhIgS7g,91236
|
|
18
|
+
xpk/core/docker_manager.py,sha256=_fE27tDCJPd9dUfswYoQMzZRMAMfxq6SxdFdOT-gzIQ,10566
|
|
19
|
+
xpk/core/gcluster_manager.py,sha256=94NgZ-4qIiihwrgCoNVmaBoDNmImog8rMsnDA7IxIHM,5655
|
|
20
|
+
xpk/core/kjob.py,sha256=u3a5COm7cZDrhcYQtYlELscStBXq5bBp8o-9G3icXGk,5531
|
|
21
|
+
xpk/core/kueue.py,sha256=K319BwTOI4Ik_Eem4WUnd0B_TjfxygSY0wTOTlDNnuw,10054
|
|
22
|
+
xpk/core/nap.py,sha256=KL5w7iJE982EquR569qYNlfFzRnFVv4q8PNZuQKBiw4,11558
|
|
23
|
+
xpk/core/pathways.py,sha256=_TwyAUU6v789CAd5ZBwtBdwboVHUL13RFRWR7vGhZNI,8759
|
|
24
|
+
xpk/core/ray.py,sha256=UxOpIc2enHi1fQ4h3KO8FH8bIyEMtYzGtPoeqJKGG4o,6337
|
|
25
|
+
xpk/core/system_characteristics.py,sha256=4gYcqhmJmS6YvW3IMF5pkSt4IDf1Rzq6ixKJEVbZqTc,30943
|
|
26
|
+
xpk/core/workload.py,sha256=W9DwsvtV8aTPO44yuLWMKQ4HZaRjF-MMKwY_Cqzpxhc,4778
|
|
27
|
+
xpk/core/blueprint/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
28
|
+
xpk/core/blueprint/blueprint_definitions.py,sha256=jcP0ACV9yJ9PpW5tnBCimGtg9rcgFx7ZESnkrNW_MWE,1659
|
|
29
|
+
xpk/core/blueprint/blueprint_generator.py,sha256=UXkHJu3CUwzdBUl9B79qs2AlIDcC9ZKnYeFXqY0mZKA,23124
|
|
30
|
+
xpk/core/workload_decorators/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
31
|
+
xpk/core/workload_decorators/rdma_decorator.py,sha256=wKwypU97zbRPwTOV67xqTbYxqVKaapnBjjp1OmMFtzM,3426
|
|
32
|
+
xpk/core/workload_decorators/tcpxo_decorator.py,sha256=Oz5H8rE2X4sU2MmPYYSR5ETeoyJHS3Pdq105oTrjPUI,5213
|
|
33
|
+
xpk/parser/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
34
|
+
xpk/parser/batch.py,sha256=t-8WdZDy122eoBkxtt959nm7AiYbxOJztls4kcKMKF8,5474
|
|
35
|
+
xpk/parser/cluster.py,sha256=vGdIfTuRad3zp443M6VuyOE_zl-sjkbvz4Gqo5cllSI,20813
|
|
36
|
+
xpk/parser/common.py,sha256=nFv0kU9X_2JOwemfTp-ujzu1scjr2y4CEQnEbkwESLg,2114
|
|
37
|
+
xpk/parser/core.py,sha256=i4dgm1s9DJIgBJijKbfXU4r6EPLuFJ3udig9c8gcGBA,3888
|
|
38
|
+
xpk/parser/info.py,sha256=VnplIxnxS404EpSQ00J_q6xzHsbO8Z18gvwRXqRzwps,1820
|
|
39
|
+
xpk/parser/inspector.py,sha256=9uoiw9TZ_lsmPdwuFky9KsDqv1RwqH8ftEON_pL_doY,2009
|
|
40
|
+
xpk/parser/job.py,sha256=t_iqjg_INMhbRkT50jEGzo9Hm31705CXfw5XreeB2ig,3741
|
|
41
|
+
xpk/parser/kind.py,sha256=OWeP6DuHfChFzvrCxl9t7kGZezcXjbrRbiyAbTgAZK8,2636
|
|
42
|
+
xpk/parser/shell.py,sha256=iJzouiBws46njcbQv9cMtT3Uuksl6awsMTjhgmwLnyI,1693
|
|
43
|
+
xpk/parser/validators.py,sha256=rz-WzGKypqvv345ISuHcKbFNwcZPmaoY6taPnYilTm0,1219
|
|
44
|
+
xpk/parser/version.py,sha256=eJo4PAbbmRQZulgKBs_ytbVgV9zAaaXeNzMMxmgFMVY,769
|
|
45
|
+
xpk/parser/workload.py,sha256=gMtm1d-Bwq7g2lz8EuFn9LQIDgAnsM1YkML6XxVviqk,23232
|
|
46
|
+
xpk/utils/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
47
|
+
xpk/utils/console.py,sha256=bKibWIswcB1aWGZp0ZpL-NEhvTrxJMy7wWD4-3BVTKI,1479
|
|
48
|
+
xpk/utils/file.py,sha256=jlv2o4ah9UmWJ7NuOCnTwtMZFLerOATBIMQeQ03-kIw,2142
|
|
49
|
+
xpk/utils/network.py,sha256=AAm9qGGFAEfAh1FK39muBheXAo7tdBlxR0A8Tg0TyYQ,4205
|
|
50
|
+
xpk/utils/objects.py,sha256=BhjTI1gfvXPIWEvWtFu9nSgYiugUo0pdIzIJ78r1TuY,2446
|
|
51
|
+
xpk/utils/yaml.py,sha256=j8xuAJ9yAAwnQi6ozwZ-nMnDyDnc3xWkeBZMtSuP4RU,844
|
|
52
|
+
xpk-0.6.0.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
53
|
+
xpk-0.6.0.dist-info/METADATA,sha256=bL0HGHYJhj4Zo_WS-nU57JsYNq7rQT0YmQpcUfEFgL8,56084
|
|
54
|
+
xpk-0.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
55
|
+
xpk-0.6.0.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
|
|
56
|
+
xpk-0.6.0.dist-info/top_level.txt,sha256=aDe4N0jicmuWExx_6w0TxWQJaEuPSs9BnLU-3aF1GLo,4
|
|
57
|
+
xpk-0.6.0.dist-info/RECORD,,
|
xpk-0.4.0.dist-info/RECORD
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
xpk.py,sha256=247TxfryVBdgpc6qpwCXOmwXD78nLGi1iHTIfTfflGU,215487
|
|
2
|
-
xpk-0.4.0.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
3
|
-
xpk-0.4.0.dist-info/METADATA,sha256=qIptNh3dJ_OhYAkMYLz3Wltr0H_R2nwSDJQpphPZWiA,44333
|
|
4
|
-
xpk-0.4.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
5
|
-
xpk-0.4.0.dist-info/entry_points.txt,sha256=lhrMqkTA09DLePaqxSMyW2RCLUKs2X1c84baGhMev_k,33
|
|
6
|
-
xpk-0.4.0.dist-info/top_level.txt,sha256=aDe4N0jicmuWExx_6w0TxWQJaEuPSs9BnLU-3aF1GLo,4
|
|
7
|
-
xpk-0.4.0.dist-info/RECORD,,
|