xpk 0.6.0__tar.gz → 0.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. {xpk-0.6.0 → xpk-0.7.1}/PKG-INFO +169 -15
  2. xpk-0.6.0/src/xpk.egg-info/PKG-INFO → xpk-0.7.1/README.md +154 -32
  3. {xpk-0.6.0 → xpk-0.7.1}/pyproject.toml +15 -9
  4. xpk-0.7.1/src/xpk/api/storage_crd.yaml +52 -0
  5. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/commands/batch.py +27 -5
  6. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/commands/cluster.py +104 -80
  7. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/commands/cluster_gcluster.py +94 -10
  8. xpk-0.7.1/src/xpk/commands/common.py +44 -0
  9. xpk-0.7.1/src/xpk/commands/config.py +29 -0
  10. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/commands/info.py +8 -10
  11. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/commands/inspector.py +5 -11
  12. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/commands/job.py +9 -7
  13. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/commands/kind.py +34 -4
  14. xpk-0.7.1/src/xpk/commands/kjob_common.py +44 -0
  15. xpk-0.7.1/src/xpk/commands/run.py +128 -0
  16. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/commands/shell.py +27 -7
  17. xpk-0.7.1/src/xpk/commands/storage.py +280 -0
  18. xpk-0.7.1/src/xpk/commands/version.py +27 -0
  19. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/commands/workload.py +381 -184
  20. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/blueprint/blueprint_definitions.py +1 -0
  21. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/blueprint/blueprint_generator.py +132 -76
  22. xpk-0.7.1/src/xpk/core/capacity.py +185 -0
  23. xpk-0.7.1/src/xpk/core/cluster.py +564 -0
  24. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/cluster_private.py +6 -3
  25. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/commands.py +18 -14
  26. xpk-0.7.1/src/xpk/core/config.py +179 -0
  27. xpk-0.7.1/src/xpk/core/docker_container.py +225 -0
  28. xpk-0.7.1/src/xpk/core/docker_image.py +210 -0
  29. xpk-0.7.1/src/xpk/core/docker_resources.py +350 -0
  30. xpk-0.7.1/src/xpk/core/filestore.py +251 -0
  31. xpk-0.7.1/src/xpk/core/gcloud_context.py +196 -0
  32. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/gcluster_manager.py +20 -2
  33. xpk-0.7.1/src/xpk/core/gcsfuse.py +50 -0
  34. xpk-0.7.1/src/xpk/core/kjob.py +444 -0
  35. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/kueue.py +12 -6
  36. xpk-0.7.1/src/xpk/core/monitoring.py +134 -0
  37. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/nap.py +32 -20
  38. xpk-0.7.1/src/xpk/core/network.py +377 -0
  39. xpk-0.7.1/src/xpk/core/nodepool.py +581 -0
  40. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/pathways.py +124 -45
  41. xpk-0.7.1/src/xpk/core/remote_state/__init__.py +15 -0
  42. xpk-0.7.1/src/xpk/core/remote_state/fuse_remote_state.py +99 -0
  43. xpk-0.7.1/src/xpk/core/remote_state/remote_state_client.py +38 -0
  44. xpk-0.7.1/src/xpk/core/resources.py +238 -0
  45. xpk-0.7.1/src/xpk/core/scheduling.py +253 -0
  46. xpk-0.7.1/src/xpk/core/storage.py +581 -0
  47. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/system_characteristics.py +38 -1
  48. xpk-0.7.1/src/xpk/core/vertex.py +105 -0
  49. xpk-0.7.1/src/xpk/core/workload.py +341 -0
  50. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/workload_decorators/rdma_decorator.py +25 -5
  51. xpk-0.7.1/src/xpk/core/workload_decorators/storage_decorator.py +52 -0
  52. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
  53. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/main.py +3 -1
  54. xpk-0.7.1/src/xpk/parser/batch.py +43 -0
  55. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/parser/cluster.py +49 -8
  56. xpk-0.6.0/src/xpk/parser/batch.py → xpk-0.7.1/src/xpk/parser/common.py +107 -32
  57. xpk-0.7.1/src/xpk/parser/config.py +49 -0
  58. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/parser/core.py +27 -1
  59. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/parser/info.py +2 -1
  60. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/parser/inspector.py +3 -3
  61. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/parser/job.py +25 -4
  62. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/parser/kind.py +3 -2
  63. xpk-0.7.1/src/xpk/parser/run.py +47 -0
  64. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/parser/shell.py +10 -1
  65. xpk-0.7.1/src/xpk/parser/storage.py +326 -0
  66. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/parser/validators.py +3 -3
  67. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/parser/workload.py +118 -76
  68. xpk-0.7.1/src/xpk/templates/__init__.py +15 -0
  69. xpk-0.7.1/src/xpk/templates/storage.yaml +13 -0
  70. xpk-0.7.1/src/xpk/utils/__init__.py +15 -0
  71. xpk-0.7.1/src/xpk/utils/gcs_utils.py +125 -0
  72. xpk-0.7.1/src/xpk/utils/kubectl.py +57 -0
  73. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/utils/objects.py +8 -5
  74. xpk-0.7.1/src/xpk/utils/templates.py +28 -0
  75. xpk-0.7.1/src/xpk/utils/validation.py +80 -0
  76. xpk-0.6.0/README.md → xpk-0.7.1/src/xpk.egg-info/PKG-INFO +186 -6
  77. {xpk-0.6.0 → xpk-0.7.1}/src/xpk.egg-info/SOURCES.txt +36 -1
  78. xpk-0.7.1/src/xpk.egg-info/requires.txt +18 -0
  79. xpk-0.6.0/src/xpk/commands/version.py +0 -39
  80. xpk-0.6.0/src/xpk/core/core.py +0 -2824
  81. xpk-0.6.0/src/xpk/core/kjob.py +0 -205
  82. xpk-0.6.0/src/xpk/core/workload.py +0 -133
  83. xpk-0.6.0/src/xpk/parser/common.py +0 -71
  84. xpk-0.6.0/src/xpk.egg-info/requires.txt +0 -13
  85. {xpk-0.6.0 → xpk-0.7.1}/LICENSE +0 -0
  86. {xpk-0.6.0 → xpk-0.7.1}/setup.cfg +0 -0
  87. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/__init__.py +0 -0
  88. {xpk-0.6.0/src/xpk/commands → xpk-0.7.1/src/xpk/api}/__init__.py +0 -0
  89. {xpk-0.6.0/src/xpk/core → xpk-0.7.1/src/xpk/commands}/__init__.py +0 -0
  90. {xpk-0.6.0/src/xpk/core/blueprint → xpk-0.7.1/src/xpk/core}/__init__.py +0 -0
  91. {xpk-0.6.0/src/xpk/core/workload_decorators → xpk-0.7.1/src/xpk/core/blueprint}/__init__.py +0 -0
  92. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/docker_manager.py +0 -0
  93. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/core/ray.py +0 -0
  94. {xpk-0.6.0/src/xpk/parser → xpk-0.7.1/src/xpk/core/workload_decorators}/__init__.py +0 -0
  95. {xpk-0.6.0/src/xpk/utils → xpk-0.7.1/src/xpk/parser}/__init__.py +0 -0
  96. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/parser/version.py +0 -0
  97. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/utils/console.py +0 -0
  98. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/utils/file.py +0 -0
  99. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/utils/network.py +0 -0
  100. {xpk-0.6.0 → xpk-0.7.1}/src/xpk/utils/yaml.py +0 -0
  101. {xpk-0.6.0 → xpk-0.7.1}/src/xpk.egg-info/dependency_links.txt +0 -0
  102. {xpk-0.6.0 → xpk-0.7.1}/src/xpk.egg-info/entry_points.txt +0 -0
  103. {xpk-0.6.0 → xpk-0.7.1}/src/xpk.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 0.6.0
3
+ Version: 0.7.1
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -11,18 +11,24 @@ Classifier: Programming Language :: Python :: 3.11
11
11
  Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
- Requires-Dist: cloud-accelerator-diagnostics
15
- Requires-Dist: tabulate
16
- Requires-Dist: ruamel.yaml
17
- Requires-Dist: pyyaml
18
- Requires-Dist: docker
19
- Requires-Dist: packaging
14
+ Requires-Dist: cloud-accelerator-diagnostics==0.1.1
15
+ Requires-Dist: tabulate==0.9.0
16
+ Requires-Dist: ruamel.yaml==0.18.10
17
+ Requires-Dist: pyyaml==6.0.2
18
+ Requires-Dist: docker==7.1.0
19
+ Requires-Dist: kubernetes==31.0.0
20
+ Requires-Dist: google-cloud==0.34.0
21
+ Requires-Dist: google-api-core==2.24.1
22
+ Requires-Dist: packaging==24.2
23
+ Requires-Dist: google-cloud-filestore==1.12.0
24
+ Requires-Dist: google-cloud-storage==2.19.0
20
25
  Provides-Extra: dev
21
26
  Requires-Dist: pyink==24.3.0; extra == "dev"
22
27
  Requires-Dist: pylint>=2.6.0; extra == "dev"
23
28
  Requires-Dist: pre-commit; extra == "dev"
24
29
  Requires-Dist: pytest; extra == "dev"
25
- Requires-Dist: docker; extra == "dev"
30
+ Requires-Dist: docker==7.1.0; extra == "dev"
31
+ Dynamic: license-file
26
32
 
27
33
  <!--
28
34
  Copyright 2023 Google LLC
@@ -42,6 +48,8 @@ Requires-Dist: docker; extra == "dev"
42
48
 
43
49
  [![Build Tests](https://github.com/google/xpk/actions/workflows/build_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/build_tests.yaml)
44
50
  [![Nightly Tests](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml)
51
+ [![Develop Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml)
52
+ [![Develop Nightly Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml)
45
53
 
46
54
  # Overview
47
55
 
@@ -80,7 +88,11 @@ and the following GPU types:
80
88
  and the following CPU types:
81
89
  * n2-standard-32
82
90
 
83
- # Cloud Console Permissions on the user or service account needed to run XPK:
91
+ xpk also supports Google Cloud Storage solutions:
92
+ * [Cloud Storage FUSE](#fuse)
93
+ * [Filestore](#filestore)
94
+
95
+ # Permissions needed on Cloud Console:
84
96
 
85
97
  * Artifact Registry Writer
86
98
  * Compute Admin
@@ -90,6 +102,7 @@ and the following CPU types:
90
102
  * Service Account User
91
103
  * Storage Admin
92
104
  * Vertex AI Administrator
105
+ * Filestore Editor (This role is necessary if you want to run `storage create` command with `--type=gcpfilestore`)
93
106
 
94
107
  # Prerequisites
95
108
 
@@ -111,17 +124,28 @@ Following tools must be installed:
111
124
  # sudo may be required
112
125
  apt-get -y install make
113
126
  ```
114
- In addition, below dependencies will be installed with `make install` command:
127
+ In addition, below dependencies can be installed either using provided links or using `make install` command, if xpk is downloaded via `git clone` command:
115
128
  - kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
116
129
  - kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
117
130
 
118
131
  # Installation
119
- To install xpk, run the following command and install additional tools, mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all neccessary tools:
132
+ To install xpk, install required tools mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all necessary tools. XPK can be installed via pip:
120
133
 
121
134
  ```shell
122
135
  pip install xpk
123
136
  ```
124
137
 
138
+ If you see an error saying: `This environment is externally managed`, please use a virtual environment.
139
+
140
+ ```shell
141
+ ## One time step of creating the venv
142
+ VENV_DIR=~/venvp3
143
+ python3 -m venv $VENV_DIR
144
+ ## Enter your venv.
145
+ source $VENV_DIR/bin/activate
146
+ ## Install xpk and its dependencies.
147
+ pip install xpk
148
+ ```
125
149
 
126
150
  If you are running XPK by cloning GitHub repository, first run the
127
151
  following commands to begin using XPK commands:
@@ -174,6 +198,8 @@ cleanup with a `Cluster Delete`.
174
198
  If you have failures with workloads not running, use `xpk inspector` to investigate
175
199
  more.
176
200
 
201
+ If you need your Workloads to have persistent storage, use `xpk storage` to find out more.
202
+
177
203
  ## Cluster Create
178
204
 
179
205
  First set the project and zone through gcloud config or xpk arguments.
@@ -448,6 +474,103 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
448
474
  * --on-demand (only A3-Mega)
449
475
 
450
476
 
477
+ ## Storage
478
+ Currently XPK supports two types of storages: Cloud Storage FUSE and Google Cloud Filestore.
479
+
480
+ ### FUSE
481
+ A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
482
+
483
+ To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).
484
+
485
+ Once it's ready you can use `xpk storage attach` with `--type=gcsfuse` command to attach a FUSE storage instance to your cluster:
486
+
487
+ ```shell
488
+ python3 xpk.py storage attach test-fuse-storage --type=gcsfuse \
489
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
490
+ --mount-point='/test-mount-point' --readonly=false \
491
+ --bucket=test-bucket --size=1 --auto-mount=false
492
+ ```
493
+
494
+ Parameters:
495
+
496
+ - `--type` - type of the storage, currently xpk supports `gcsfuse` and `gcpfilestore` only.
497
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
498
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
499
+ - `--readonly` - if set to true, workload can only read from storage.
500
+ - `--size` - size of the storage in Gb.
501
+ - `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
502
+ - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions. If set, then values from manifest override the following parameters: `--size` and `--bucket`.
503
+
504
+ ### Filestore
505
+
506
+ A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write objects in your volumes using standard file system semantics.
507
+
508
+ To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
509
+
510
+ ```shell
511
+ python3 xpk.py storage create test-fs-storage --type=gcpfilestore \
512
+ --auto-mount=false --mount-point=/data-fs --readonly=false \
513
+ --size=1024 --tier=BASIC_HDD --access_mode=ReadWriteMany --vol=default \
514
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
515
+ ```
516
+
517
+ You can also attach an existing Filestore instance to your cluster using `xpk storage attach` command:
518
+
519
+ ```shell
520
+ python3 xpk.py storage attach test-fs-storage --type=gcpfilestore \
521
+ --auto-mount=false --mount-point=/data-fs --readonly=false \
522
+ --size=1024 --tier=BASIC_HDD --access_mode=ReadWriteMany --vol=default \
523
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
524
+ ```
525
+
526
+ The command above is also useful when attaching multiple volumes from the same Filestore instance.
527
+
528
+ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore` accept following arguments:
529
+ - `--type` - type of the storage.
530
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
531
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
532
+ - `--readonly` - if set to true, workload can only read from storage.
533
+ - `--size` - size of the Filestore instance that will be created in Gb.
534
+ - `--tier` - tier of the Filestore instance that will be created. Possible options are: `[BASIC_HDD, BASIC_SSD, ZONAL, REGIONAL, ENTERPRISE]`
535
+ - `--access-mode` - access mode of the Filestore instance that will be created. Possible values are: `[ReadWriteOnce, ReadOnlyMany, ReadWriteMany]`
536
+ - `--vol` - file share name of the Filestore instance that will be created.
537
+ - `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
538
+ - `--manifest` - path to the manifest file containing PersistentVolume, PersistentVolumeClaim and StorageClass definitions. If set, then values from manifest override the following parameters: `--access-mode`, `--size` and `--vol`.
539
+
540
+ ### List attached storages
541
+
542
+ ```shell
543
+ python3 xpk.py storage list \
544
+ --project=$PROJECT --cluster $CLUSTER --zone=$ZONE
545
+ ```
546
+
547
+ ### Running workloads with storage
548
+
549
+ If you specified `--auto-mount=true` when creating or attaching a storage, then all workloads deployed on the cluster will have the volume attached by default. Otherwise, in order to have the storage attached, you have to add `--storage` parameter to `workload create` command:
550
+
551
+ ```shell
552
+ python3 xpk.py workload create \
553
+ --workload xpk-test-workload --command "echo goodbye" \
554
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
555
+ --tpu-type=v5litepod-16 --storage=test-storage
556
+ ```
557
+
558
+ ### Detaching storage
559
+
560
+ ```shell
561
+ python3 xpk.py storage detach $STORAGE_NAME \
562
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
563
+ ```
564
+
565
+ ### Deleting storage
566
+
567
+ XPK allows you to remove Filestore instances easily with `xpk storage delete` command. **Warning:** this deletes all data contained in the Filestore!
568
+
569
+ ```shell
570
+ python3 xpk.py storage delete test-fs-instance \
571
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
572
+ ```
573
+
451
574
  ## Workload Create
452
575
  * Workload Create (submit training job):
453
576
 
@@ -455,7 +578,7 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
455
578
  python3 xpk.py workload create \
456
579
  --workload xpk-test-workload --command "echo goodbye" \
457
580
  --cluster xpk-test \
458
- --tpu-type=v5litepod-16
581
+ --tpu-type=v5litepod-16 --project=$PROJECT
459
582
  ```
460
583
 
461
584
  * Workload Create for Pathways:
@@ -528,6 +651,8 @@ To submit jobs on a cluster with A3 machines, run the below command. To create a
528
651
  ```
529
652
  > The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 machines as well.
530
653
 
654
+ In order to run NCCL test on A3 Ultra machines check out [this guide](/examples/nccl/nccl.md).
655
+
531
656
  ### Workload Priority and Preemption
532
657
  * Set the priority level of your workload with `--priority=LEVEL`
533
658
 
@@ -666,8 +791,6 @@ Check out [MaxText example](https://github.com/google/maxtext/pull/570) on how t
666
791
  ```
667
792
 
668
793
  * Workload List supports waiting for the completion of a specific job. XPK will follow an existing job until it has finished or the `timeout`, if provided, has been reached and then list the job. If no `timeout` is specified, the default value is set to the max value, 1 week. You may also set `timeout=0` to poll the job once.
669
- (Note: `restart-on-user-code-failure` must be set
670
- when creating the workload otherwise the workload will always finish with `Completed` status.)
671
794
 
672
795
  Wait for a job to complete.
673
796
 
@@ -759,6 +882,35 @@ Inspector output is saved to a file.
759
882
  [XPK] Exiting XPK cleanly
760
883
  ```
761
884
 
885
+ ## Run
886
+ * `xpk run` lets you execute scripts on a cluster with ease. It automates task execution, handles interruptions, and streams job output to your console.
887
+
888
+ ```shell
889
+ python xpk.py run --kind-cluster -n 2 -t 0-2 examples/job.sh
890
+ ```
891
+
892
+ * Example Output:
893
+
894
+ ```shell
895
+ [XPK] Starting xpk
896
+ [XPK] Task: `get current-context` is implemented by `kubectl config current-context`, hiding output unless there is an error.
897
+ [XPK] No local cluster name specified. Using current-context `kind-kind`
898
+ [XPK] Task: `run task` is implemented by `kubectl kjob create slurm --profile xpk-def-app-profile --localqueue multislice-queue --wait --rm -- examples/job.sh --partition multislice-queue --ntasks 2 --time 0-2`. Streaming output and input live.
899
+ job.batch/xpk-def-app-profile-slurm-g4vr6 created
900
+ configmap/xpk-def-app-profile-slurm-g4vr6 created
901
+ service/xpk-def-app-profile-slurm-g4vr6 created
902
+ Starting log streaming for pod xpk-def-app-profile-slurm-g4vr6-1-4rmgk...
903
+ Now processing task ID: 3
904
+ Starting log streaming for pod xpk-def-app-profile-slurm-g4vr6-0-bg6dm...
905
+ Now processing task ID: 1
906
+ exit
907
+ exit
908
+ Now processing task ID: 2
909
+ exit
910
+ Job logs streaming finished.[XPK] Task: `run task` terminated with code `0`
911
+ [XPK] XPK Done.
912
+ ```
913
+
762
914
  ## GPU usage
763
915
 
764
916
  In order to use XPK for GPU, you can do so by using `device-type` flag.
@@ -1241,6 +1393,8 @@ gcloud beta compute reservations describe $RESERVATION --project=$PROJECT_ID --z
1241
1393
 
1242
1394
  ## 403 error on workload create when using `--base-docker-image` flag
1243
1395
  You need authority to push to the registry from your local machine. Try running `gcloud auth configure-docker`.
1396
+ ## `Kubernetes API exception` - 404 error
1397
+ If an error of this kind appears after updating the xpk version, you may need to rerun the `cluster create` command in order to update resource definitions.
1244
1398
 
1245
1399
  # TPU Workload Debugging
1246
1400
 
@@ -1,29 +1,3 @@
1
- Metadata-Version: 2.2
2
- Name: xpk
3
- Version: 0.6.0
4
- Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
- Author-email: XPK team <xpk-code-reviewers@google.com>
6
- License: Apache-2.0
7
- Project-URL: Homepage, https://github.com/google/xpk
8
- Project-URL: Bug Tracker, https://github.com/google/xpk/issues
9
- Classifier: Programming Language :: Python :: 3.10
10
- Classifier: Programming Language :: Python :: 3.11
11
- Requires-Python: >=3.10
12
- Description-Content-Type: text/markdown
13
- License-File: LICENSE
14
- Requires-Dist: cloud-accelerator-diagnostics
15
- Requires-Dist: tabulate
16
- Requires-Dist: ruamel.yaml
17
- Requires-Dist: pyyaml
18
- Requires-Dist: docker
19
- Requires-Dist: packaging
20
- Provides-Extra: dev
21
- Requires-Dist: pyink==24.3.0; extra == "dev"
22
- Requires-Dist: pylint>=2.6.0; extra == "dev"
23
- Requires-Dist: pre-commit; extra == "dev"
24
- Requires-Dist: pytest; extra == "dev"
25
- Requires-Dist: docker; extra == "dev"
26
-
27
1
  <!--
28
2
  Copyright 2023 Google LLC
29
3
 
@@ -42,6 +16,8 @@ Requires-Dist: docker; extra == "dev"
42
16
 
43
17
  [![Build Tests](https://github.com/google/xpk/actions/workflows/build_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/build_tests.yaml)
44
18
  [![Nightly Tests](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml)
19
+ [![Develop Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml)
20
+ [![Develop Nightly Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml)
45
21
 
46
22
  # Overview
47
23
 
@@ -80,7 +56,11 @@ and the following GPU types:
80
56
  and the following CPU types:
81
57
  * n2-standard-32
82
58
 
83
- # Cloud Console Permissions on the user or service account needed to run XPK:
59
+ xpk also supports Google Cloud Storage solutions:
60
+ * [Cloud Storage FUSE](#fuse)
61
+ * [Filestore](#filestore)
62
+
63
+ # Permissions needed on Cloud Console:
84
64
 
85
65
  * Artifact Registry Writer
86
66
  * Compute Admin
@@ -90,6 +70,7 @@ and the following CPU types:
90
70
  * Service Account User
91
71
  * Storage Admin
92
72
  * Vertex AI Administrator
73
+ * Filestore Editor (This role is necessary if you want to run `storage create` command with `--type=gcpfilestore`)
93
74
 
94
75
  # Prerequisites
95
76
 
@@ -111,17 +92,28 @@ Following tools must be installed:
111
92
  # sudo may be required
112
93
  apt-get -y install make
113
94
  ```
114
- In addition, below dependencies will be installed with `make install` command:
95
+ In addition, below dependencies can be installed either using provided links or using `make install` command, if xpk is downloaded via `git clone` command:
115
96
  - kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
116
97
  - kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
117
98
 
118
99
  # Installation
119
- To install xpk, run the following command and install additional tools, mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all neccessary tools:
100
+ To install xpk, install required tools mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all necessary tools. XPK can be installed via pip:
120
101
 
121
102
  ```shell
122
103
  pip install xpk
123
104
  ```
124
105
 
106
+ If you see an error saying: `This environment is externally managed`, please use a virtual environment.
107
+
108
+ ```shell
109
+ ## One time step of creating the venv
110
+ VENV_DIR=~/venvp3
111
+ python3 -m venv $VENV_DIR
112
+ ## Enter your venv.
113
+ source $VENV_DIR/bin/activate
114
+ ## Install xpk and its dependencies.
115
+ pip install xpk
116
+ ```
125
117
 
126
118
  If you are running XPK by cloning GitHub repository, first run the
127
119
  following commands to begin using XPK commands:
@@ -174,6 +166,8 @@ cleanup with a `Cluster Delete`.
174
166
  If you have failures with workloads not running, use `xpk inspector` to investigate
175
167
  more.
176
168
 
169
+ If you need your Workloads to have persistent storage, use `xpk storage` to find out more.
170
+
177
171
  ## Cluster Create
178
172
 
179
173
  First set the project and zone through gcloud config or xpk arguments.
@@ -448,6 +442,103 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
448
442
  * --on-demand (only A3-Mega)
449
443
 
450
444
 
445
+ ## Storage
446
+ Currently XPK supports two types of storages: Cloud Storage FUSE and Google Cloud Filestore.
447
+
448
+ ### FUSE
449
+ A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
450
+
451
+ To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).
452
+
453
+ Once it's ready you can use `xpk storage attach` with `--type=gcsfuse` command to attach a FUSE storage instance to your cluster:
454
+
455
+ ```shell
456
+ python3 xpk.py storage attach test-fuse-storage --type=gcsfuse \
457
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
458
+ --mount-point='/test-mount-point' --readonly=false \
459
+ --bucket=test-bucket --size=1 --auto-mount=false
460
+ ```
461
+
462
+ Parameters:
463
+
464
+ - `--type` - type of the storage, currently xpk supports `gcsfuse` and `gcpfilestore` only.
465
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
466
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
467
+ - `--readonly` - if set to true, workload can only read from storage.
468
+ - `--size` - size of the storage in Gb.
469
+ - `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
470
+ - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions. If set, then values from manifest override the following parameters: `--size` and `--bucket`.
471
+
472
+ ### Filestore
473
+
474
+ A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write objects in your volumes using standard file system semantics.
475
+
476
+ To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
477
+
478
+ ```shell
479
+ python3 xpk.py storage create test-fs-storage --type=gcpfilestore \
480
+ --auto-mount=false --mount-point=/data-fs --readonly=false \
481
+ --size=1024 --tier=BASIC_HDD --access_mode=ReadWriteMany --vol=default \
482
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
483
+ ```
484
+
485
+ You can also attach an existing Filestore instance to your cluster using `xpk storage attach` command:
486
+
487
+ ```shell
488
+ python3 xpk.py storage attach test-fs-storage --type=gcpfilestore \
489
+ --auto-mount=false --mount-point=/data-fs --readonly=false \
490
+ --size=1024 --tier=BASIC_HDD --access_mode=ReadWriteMany --vol=default \
491
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
492
+ ```
493
+
494
+ The command above is also useful when attaching multiple volumes from the same Filestore instance.
495
+
496
+ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore` accept following arguments:
497
+ - `--type` - type of the storage.
498
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
499
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
500
+ - `--readonly` - if set to true, workload can only read from storage.
501
+ - `--size` - size of the Filestore instance that will be created in Gb.
502
+ - `--tier` - tier of the Filestore instance that will be created. Possible options are: `[BASIC_HDD, BASIC_SSD, ZONAL, REGIONAL, ENTERPRISE]`
503
+ - `--access-mode` - access mode of the Filestore instance that will be created. Possible values are: `[ReadWriteOnce, ReadOnlyMany, ReadWriteMany]`
504
+ - `--vol` - file share name of the Filestore instance that will be created.
505
+ - `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
506
+ - `--manifest` - path to the manifest file containing PersistentVolume, PresistentVolumeClaim and StorageClass definitions. If set, then values from manifest override the following parameters: `--access-mode`, `--size` and `--volume`.
507
+
508
+ ### List attached storages
509
+
510
+ ```shell
511
+ python3 xpk.py storage list \
512
+ --project=$PROJECT --cluster $CLUSTER --zone=$ZONE
513
+ ```
514
+
515
+ ### Running workloads with storage
516
+
517
+ If you specified `--auto-mount=true` when creating or attaching a storage, then all workloads deployed on the cluster will have the volume attached by default. Otherwise, in order to have the storage attached, you have to add `--storage` parameter to `workload create` command:
518
+
519
+ ```shell
520
+ python3 xpk.py workload create \
521
+ --workload xpk-test-workload --command "echo goodbye" \
522
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
523
+ --tpu-type=v5litepod-16 --storage=test-storage
524
+ ```
525
+
526
+ ### Detaching storage
527
+
528
+ ```shell
529
+ python3 xpk.py storage detach $STORAGE_NAME \
530
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
531
+ ```
532
+
533
+ ### Deleting storage
534
+
535
+ XPK allows you to remove Filestore instances easily with `xpk storage delete` command. **Warning:** this deletes all data contained in the Filestore!
536
+
537
+ ```shell
538
+ python3 xpk.py storage delete test-fs-instance \
539
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
540
+ ```
541
+
451
542
  ## Workload Create
452
543
  * Workload Create (submit training job):
453
544
 
@@ -455,7 +546,7 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
455
546
  python3 xpk.py workload create \
456
547
  --workload xpk-test-workload --command "echo goodbye" \
457
548
  --cluster xpk-test \
458
- --tpu-type=v5litepod-16
549
+ --tpu-type=v5litepod-16 --project=$PROJECT
459
550
  ```
460
551
 
461
552
  * Workload Create for Pathways:
@@ -528,6 +619,8 @@ To submit jobs on a cluster with A3 machines, run the below command. To create a
528
619
  ```
529
620
  > The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 machines as well.
530
621
 
622
+ In order to run an NCCL test on A3 Ultra machines, check out [this guide](/examples/nccl/nccl.md).
623
+
531
624
  ### Workload Priority and Preemption
532
625
  * Set the priority level of your workload with `--priority=LEVEL`
533
626
 
@@ -666,8 +759,6 @@ Check out [MaxText example](https://github.com/google/maxtext/pull/570) on how t
666
759
  ```
667
760
 
668
761
  * Workload List supports waiting for the completion of a specific job. XPK will follow an existing job until it has finished or the `timeout`, if provided, has been reached and then list the job. If no `timeout` is specified, the default value is set to the max value, 1 week. You may also set `timeout=0` to poll the job once.
669
- (Note: `restart-on-user-code-failure` must be set
670
- when creating the workload otherwise the workload will always finish with `Completed` status.)
671
762
 
672
763
  Wait for a job to complete.
673
764
 
@@ -759,6 +850,35 @@ Inspector output is saved to a file.
759
850
  [XPK] Exiting XPK cleanly
760
851
  ```
761
852
 
853
+ ## Run
854
+ * `xpk run` lets you execute scripts on a cluster with ease. It automates task execution, handles interruptions, and streams job output to your console.
855
+
856
+ ```shell
857
+ python xpk.py run --kind-cluster -n 2 -t 0-2 examples/job.sh
858
+ ```
859
+
860
+ * Example Output:
861
+
862
+ ```shell
863
+ [XPK] Starting xpk
864
+ [XPK] Task: `get current-context` is implemented by `kubectl config current-context`, hiding output unless there is an error.
865
+ [XPK] No local cluster name specified. Using current-context `kind-kind`
866
+ [XPK] Task: `run task` is implemented by `kubectl kjob create slurm --profile xpk-def-app-profile --localqueue multislice-queue --wait --rm -- examples/job.sh --partition multislice-queue --ntasks 2 --time 0-2`. Streaming output and input live.
867
+ job.batch/xpk-def-app-profile-slurm-g4vr6 created
868
+ configmap/xpk-def-app-profile-slurm-g4vr6 created
869
+ service/xpk-def-app-profile-slurm-g4vr6 created
870
+ Starting log streaming for pod xpk-def-app-profile-slurm-g4vr6-1-4rmgk...
871
+ Now processing task ID: 3
872
+ Starting log streaming for pod xpk-def-app-profile-slurm-g4vr6-0-bg6dm...
873
+ Now processing task ID: 1
874
+ exit
875
+ exit
876
+ Now processing task ID: 2
877
+ exit
878
+ Job logs streaming finished.[XPK] Task: `run task` terminated with code `0`
879
+ [XPK] XPK Done.
880
+ ```
881
+
762
882
  ## GPU usage
763
883
 
764
884
  In order to use XPK for GPU, you can do so by using `device-type` flag.
@@ -1241,6 +1361,8 @@ gcloud beta compute reservations describe $RESERVATION --project=$PROJECT_ID --z
1241
1361
 
1242
1362
  ## 403 error on workload create when using `--base-docker-image` flag
1243
1363
  You need authority to push to the registry from your local machine. Try running `gcloud auth configure-docker`.
1364
+ ## `Kubernetes API exception` - 404 error
1365
+ If an error of this kind appears after updating the xpk version, you may need to rerun the `cluster create` command in order to update the resource definitions.
1244
1366
 
1245
1367
  # TPU Workload Debugging
1246
1368
 
@@ -30,12 +30,17 @@ keywords = []
30
30
 
31
31
  # pip dependencies installed with `pip install -e .`
32
32
  dependencies = [
33
- "cloud-accelerator-diagnostics",
34
- "tabulate",
35
- "ruamel.yaml",
36
- "pyyaml",
37
- "docker",
38
- "packaging"
33
+ "cloud-accelerator-diagnostics==0.1.1",
34
+ "tabulate==0.9.0",
35
+ "ruamel.yaml==0.18.10",
36
+ "pyyaml==6.0.2",
37
+ "docker==7.1.0",
38
+ "kubernetes==31.0.0",
39
+ "google-cloud==0.34.0",
40
+ "google-api-core==2.24.1",
41
+ "packaging==24.2",
42
+ "google-cloud-filestore==1.12.0",
43
+ "google-cloud-storage==2.19.0"
39
44
  ]
40
45
 
41
46
  [project.urls]
@@ -57,15 +62,16 @@ dev = [
57
62
  "pylint>=2.6.0",
58
63
  "pre-commit",
59
64
  "pytest",
60
- "docker"
65
+ "docker==7.1.0"
61
66
  ]
62
67
 
63
68
  [tool.setuptools.dynamic]
64
- version = {attr = "xpk.core.core.__version__"}
69
+ version = {attr = "xpk.core.config.__version__"}
65
70
 
66
71
  [tool.setuptools]
67
- packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.utils", "xpk.core.blueprint", "xpk.core.workload_decorators"]
72
+ packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.api", "xpk.templates", "xpk.utils", "xpk.core.blueprint", "xpk.core.remote_state", "xpk.core.workload_decorators"]
68
73
  package-dir = {"" = "src"}
74
+ package-data = {"xpk.api" = ["storage_crd.yaml"], "xpk.templates" = ["storage.yaml"]}
69
75
 
70
76
  [tool.pyink]
71
77
  # Formatting configuration to follow Google style-guide.
@@ -0,0 +1,52 @@
1
+ apiVersion: apiextensions.k8s.io/v1
2
+ kind: CustomResourceDefinition
3
+ metadata:
4
+ name: storages.xpk.x-k8s.io
5
+ spec:
6
+ group: xpk.x-k8s.io
7
+ versions:
8
+ - name: v1
9
+ served: true
10
+ storage: true
11
+ schema:
12
+ openAPIV3Schema:
13
+ type: object
14
+ properties:
15
+ spec:
16
+ type: object
17
+ properties:
18
+ type:
19
+ type: string
20
+ cluster:
21
+ type: string
22
+ auto_mount:
23
+ type: boolean
24
+ mount_point:
25
+ type: string
26
+ readonly:
27
+ type: boolean
28
+ manifest:
29
+ type: string
30
+ pv:
31
+ type: string
32
+ pvc:
33
+ type: string
34
+ required:
35
+ - type
36
+ - cluster
37
+ - auto_mount
38
+ - mount_point
39
+ - readonly
40
+ - manifest
41
+ - pvc
42
+ - pv
43
+ x-kubernetes-validations:
44
+ - message: Value is immutable
45
+ rule: self == oldSelf
46
+ scope: Cluster
47
+ names:
48
+ plural: storages
49
+ singular: storage
50
+ kind: Storage
51
+ shortNames:
52
+ - stg