xpk 0.6.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. {xpk-0.6.0 → xpk-0.7.0}/PKG-INFO +165 -14
  2. xpk-0.6.0/src/xpk.egg-info/PKG-INFO → xpk-0.7.0/README.md +152 -32
  3. {xpk-0.6.0 → xpk-0.7.0}/pyproject.toml +16 -10
  4. xpk-0.7.0/src/xpk/api/storage_crd.yaml +52 -0
  5. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/commands/batch.py +27 -5
  6. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/commands/cluster.py +104 -80
  7. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/commands/cluster_gcluster.py +94 -10
  8. xpk-0.7.0/src/xpk/commands/common.py +44 -0
  9. xpk-0.7.0/src/xpk/commands/config.py +29 -0
  10. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/commands/info.py +8 -10
  11. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/commands/inspector.py +5 -11
  12. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/commands/job.py +9 -7
  13. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/commands/kind.py +34 -4
  14. xpk-0.7.0/src/xpk/commands/kjob_common.py +44 -0
  15. xpk-0.7.0/src/xpk/commands/run.py +128 -0
  16. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/commands/shell.py +27 -7
  17. xpk-0.7.0/src/xpk/commands/storage.py +267 -0
  18. xpk-0.7.0/src/xpk/commands/version.py +27 -0
  19. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/commands/workload.py +381 -184
  20. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/blueprint/blueprint_definitions.py +1 -0
  21. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/blueprint/blueprint_generator.py +132 -76
  22. xpk-0.7.0/src/xpk/core/capacity.py +185 -0
  23. xpk-0.7.0/src/xpk/core/cluster.py +564 -0
  24. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/cluster_private.py +6 -3
  25. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/commands.py +18 -14
  26. xpk-0.7.0/src/xpk/core/config.py +179 -0
  27. xpk-0.7.0/src/xpk/core/docker_container.py +225 -0
  28. xpk-0.7.0/src/xpk/core/docker_image.py +210 -0
  29. xpk-0.7.0/src/xpk/core/docker_resources.py +350 -0
  30. xpk-0.7.0/src/xpk/core/filestore.py +251 -0
  31. xpk-0.7.0/src/xpk/core/gcloud_context.py +196 -0
  32. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/gcluster_manager.py +20 -2
  33. xpk-0.7.0/src/xpk/core/gcsfuse.py +50 -0
  34. xpk-0.7.0/src/xpk/core/kjob.py +444 -0
  35. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/kueue.py +12 -6
  36. xpk-0.7.0/src/xpk/core/monitoring.py +134 -0
  37. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/nap.py +32 -20
  38. xpk-0.7.0/src/xpk/core/network.py +377 -0
  39. xpk-0.7.0/src/xpk/core/nodepool.py +581 -0
  40. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/pathways.py +124 -45
  41. xpk-0.7.0/src/xpk/core/remote_state/__init__.py +15 -0
  42. xpk-0.7.0/src/xpk/core/remote_state/fuse_remote_state.py +99 -0
  43. xpk-0.7.0/src/xpk/core/remote_state/remote_state_client.py +38 -0
  44. xpk-0.7.0/src/xpk/core/resources.py +238 -0
  45. xpk-0.7.0/src/xpk/core/scheduling.py +253 -0
  46. xpk-0.7.0/src/xpk/core/storage.py +581 -0
  47. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/system_characteristics.py +38 -1
  48. xpk-0.7.0/src/xpk/core/vertex.py +105 -0
  49. xpk-0.7.0/src/xpk/core/workload.py +341 -0
  50. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/workload_decorators/rdma_decorator.py +25 -5
  51. xpk-0.7.0/src/xpk/core/workload_decorators/storage_decorator.py +52 -0
  52. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
  53. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/main.py +3 -1
  54. xpk-0.7.0/src/xpk/parser/batch.py +43 -0
  55. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/parser/cluster.py +49 -8
  56. xpk-0.6.0/src/xpk/parser/batch.py → xpk-0.7.0/src/xpk/parser/common.py +107 -32
  57. xpk-0.7.0/src/xpk/parser/config.py +49 -0
  58. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/parser/core.py +27 -1
  59. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/parser/info.py +2 -1
  60. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/parser/inspector.py +3 -3
  61. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/parser/job.py +25 -4
  62. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/parser/kind.py +3 -2
  63. xpk-0.7.0/src/xpk/parser/run.py +47 -0
  64. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/parser/shell.py +10 -1
  65. xpk-0.7.0/src/xpk/parser/storage.py +316 -0
  66. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/parser/validators.py +3 -3
  67. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/parser/workload.py +118 -76
  68. xpk-0.7.0/src/xpk/templates/__init__.py +15 -0
  69. xpk-0.7.0/src/xpk/templates/storage.yaml +13 -0
  70. xpk-0.7.0/src/xpk/utils/__init__.py +15 -0
  71. xpk-0.7.0/src/xpk/utils/gcs_utils.py +125 -0
  72. xpk-0.7.0/src/xpk/utils/kubectl.py +57 -0
  73. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/utils/objects.py +8 -5
  74. xpk-0.7.0/src/xpk/utils/templates.py +28 -0
  75. xpk-0.7.0/src/xpk/utils/validation.py +80 -0
  76. xpk-0.6.0/README.md → xpk-0.7.0/src/xpk.egg-info/PKG-INFO +183 -6
  77. {xpk-0.6.0 → xpk-0.7.0}/src/xpk.egg-info/SOURCES.txt +36 -1
  78. xpk-0.7.0/src/xpk.egg-info/requires.txt +18 -0
  79. xpk-0.6.0/src/xpk/commands/version.py +0 -39
  80. xpk-0.6.0/src/xpk/core/core.py +0 -2824
  81. xpk-0.6.0/src/xpk/core/kjob.py +0 -205
  82. xpk-0.6.0/src/xpk/core/workload.py +0 -133
  83. xpk-0.6.0/src/xpk/parser/common.py +0 -71
  84. xpk-0.6.0/src/xpk.egg-info/requires.txt +0 -13
  85. {xpk-0.6.0 → xpk-0.7.0}/LICENSE +0 -0
  86. {xpk-0.6.0 → xpk-0.7.0}/setup.cfg +0 -0
  87. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/__init__.py +0 -0
  88. {xpk-0.6.0/src/xpk/commands → xpk-0.7.0/src/xpk/api}/__init__.py +0 -0
  89. {xpk-0.6.0/src/xpk/core → xpk-0.7.0/src/xpk/commands}/__init__.py +0 -0
  90. {xpk-0.6.0/src/xpk/core/blueprint → xpk-0.7.0/src/xpk/core}/__init__.py +0 -0
  91. {xpk-0.6.0/src/xpk/core/workload_decorators → xpk-0.7.0/src/xpk/core/blueprint}/__init__.py +0 -0
  92. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/docker_manager.py +0 -0
  93. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/core/ray.py +0 -0
  94. {xpk-0.6.0/src/xpk/parser → xpk-0.7.0/src/xpk/core/workload_decorators}/__init__.py +0 -0
  95. {xpk-0.6.0/src/xpk/utils → xpk-0.7.0/src/xpk/parser}/__init__.py +0 -0
  96. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/parser/version.py +0 -0
  97. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/utils/console.py +0 -0
  98. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/utils/file.py +0 -0
  99. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/utils/network.py +0 -0
  100. {xpk-0.6.0 → xpk-0.7.0}/src/xpk/utils/yaml.py +0 -0
  101. {xpk-0.6.0 → xpk-0.7.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  102. {xpk-0.6.0 → xpk-0.7.0}/src/xpk.egg-info/entry_points.txt +0 -0
  103. {xpk-0.6.0 → xpk-0.7.0}/src/xpk.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: xpk
3
- Version: 0.6.0
3
+ Version: 0.7.0
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -11,18 +11,23 @@ Classifier: Programming Language :: Python :: 3.11
11
11
  Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
- Requires-Dist: cloud-accelerator-diagnostics
15
- Requires-Dist: tabulate
16
- Requires-Dist: ruamel.yaml
17
- Requires-Dist: pyyaml
18
- Requires-Dist: docker
19
- Requires-Dist: packaging
14
+ Requires-Dist: cloud-accelerator-diagnostics==0.1.1
15
+ Requires-Dist: tabulate==0.9.0
16
+ Requires-Dist: ruamel.yaml==0.18.10
17
+ Requires-Dist: pyyaml==6.0.2
18
+ Requires-Dist: docker==7.1.0
19
+ Requires-Dist: kubernetes==31.0.0
20
+ Requires-Dist: google-cloud==0.34.0
21
+ Requires-Dist: google-api-core==2.24.1
22
+ Requires-Dist: packaging==24.2
23
+ Requires-Dist: google-cloud-filestore==1.12.0
24
+ Requires-Dist: google-cloud-storage==2.19.0
20
25
  Provides-Extra: dev
21
26
  Requires-Dist: pyink==24.3.0; extra == "dev"
22
27
  Requires-Dist: pylint>=2.6.0; extra == "dev"
23
28
  Requires-Dist: pre-commit; extra == "dev"
24
29
  Requires-Dist: pytest; extra == "dev"
25
- Requires-Dist: docker; extra == "dev"
30
+ Requires-Dist: docker==7.1.0; extra == "dev"
26
31
 
27
32
  <!--
28
33
  Copyright 2023 Google LLC
@@ -42,6 +47,8 @@ Requires-Dist: docker; extra == "dev"
42
47
 
43
48
  [![Build Tests](https://github.com/google/xpk/actions/workflows/build_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/build_tests.yaml)
44
49
  [![Nightly Tests](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml)
50
+ [![Develop Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml)
51
+ [![Develop Nightly Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml)
45
52
 
46
53
  # Overview
47
54
 
@@ -80,7 +87,11 @@ and the following GPU types:
80
87
  and the following CPU types:
81
88
  * n2-standard-32
82
89
 
83
- # Cloud Console Permissions on the user or service account needed to run XPK:
90
+ xpk also supports Google Cloud Storage solutions:
91
+ * [Cloud Storage FUSE](#fuse)
92
+ * [Filestore](#filestore)
93
+
94
+ # Permissions needed on Cloud Console:
84
95
 
85
96
  * Artifact Registry Writer
86
97
  * Compute Admin
@@ -90,6 +101,7 @@ and the following CPU types:
90
101
  * Service Account User
91
102
  * Storage Admin
92
103
  * Vertex AI Administrator
104
+ * Filestore Editor (This role is necessary if you want to run the `storage create` command with `--type=gcpfilestore`)
93
105
 
94
106
  # Prerequisites
95
107
 
@@ -111,17 +123,28 @@ Following tools must be installed:
111
123
  # sudo may be required
112
124
  apt-get -y install make
113
125
  ```
114
- In addition, below dependencies will be installed with `make install` command:
126
+ In addition, the dependencies below can be installed either from the provided links or, if xpk was downloaded via `git clone`, with the `make install` command:
115
127
  - kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
116
128
  - kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
117
129
 
118
130
  # Installation
119
- To install xpk, run the following command and install additional tools, mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all neccessary tools:
131
+ To install xpk, first install the required tools mentioned in [prerequisites](#prerequisites). The [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all necessary tools. XPK can then be installed via pip:
120
132
 
121
133
  ```shell
122
134
  pip install xpk
123
135
  ```
124
136
 
137
+ If you see an error saying: `This environment is externally managed`, please use a virtual environment.
138
+
139
+ ```shell
140
+ ## One time step of creating the venv
141
+ VENV_DIR=~/venvp3
142
+ python3 -m venv $VENV_DIR
143
+ ## Enter your venv.
144
+ source $VENV_DIR/bin/activate
145
+ ## Install xpk and its dependencies.
146
+ pip install xpk
147
+ ```
125
148
 
126
149
  If you are running XPK by cloning GitHub repository, first run the
127
150
  following commands to begin using XPK commands:
@@ -174,6 +197,8 @@ cleanup with a `Cluster Delete`.
174
197
  If you have failures with workloads not running, use `xpk inspector` to investigate
175
198
  more.
176
199
 
200
+ If you need your Workloads to have persistent storage, use `xpk storage` to find out more.
201
+
177
202
  ## Cluster Create
178
203
 
179
204
  First set the project and zone through gcloud config or xpk arguments.
@@ -448,6 +473,101 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
448
473
  * --on-demand (only A3-Mega)
449
474
 
450
475
 
476
+ ## Storage
477
+ Currently XPK supports two types of storage: Cloud Storage FUSE and Google Cloud Filestore.
478
+
479
+ ### FUSE
480
+ A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
481
+
482
+ To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).
483
+
484
+ Once it's ready you can use `xpk storage attach` with `--type=gcsfuse` command to attach a FUSE storage instance to your cluster:
485
+
486
+ ```shell
487
+ python3 xpk.py storage attach test-fuse-storage --type=gcsfuse \
488
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
489
+ --mount-point='/test-mount-point' --readonly=false \
490
+ --bucket=test-bucket --size=1 --auto-mount=false
491
+ ```
492
+
493
+ Parameters:
494
+
495
+ - `--type` - type of the storage, currently xpk supports `gcsfuse` and `gcpfilestore` only.
496
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
497
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
498
+ - `--readonly` - if set to true, workload can only read from storage.
499
+ - `--size` - size of the storage in Gb.
500
+ - `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
501
+
502
+ ### Filestore
503
+
504
+ A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write objects in your volumes using standard file system semantics.
505
+
506
+ To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
507
+
508
+ ```shell
509
+ python3 xpk.py storage create test-fs-storage --type=gcpfilestore \
510
+ --auto-mount=false --mount-point=/data-fs --readonly=false \
511
+ --size=1024 --tier=BASIC_HDD --access-mode=ReadWriteMany --vol=default \
512
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
513
+ ```
514
+
515
+ You can also attach an existing Filestore instance to your cluster using `xpk storage attach` command:
516
+
517
+ ```shell
518
+ python3 xpk.py storage attach test-fs-storage --type=gcpfilestore \
519
+ --auto-mount=false --mount-point=/data-fs --readonly=false \
520
+ --size=1024 --tier=BASIC_HDD --access-mode=ReadWriteMany --vol=default \
521
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
522
+ ```
523
+
524
+ The command above is also useful when attaching multiple volumes from the same Filestore instance.
525
+
526
+ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore` accept following arguments:
527
+ - `--type` - type of the storage.
528
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
529
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
530
+ - `--readonly` - if set to true, workload can only read from storage.
531
+ - `--size` - size of the Filestore instance that will be created in Gb.
532
+ - `--tier` - tier of the Filestore instance that will be created. Possible options are: `[BASIC_HDD, BASIC_SSD, ZONAL, REGIONAL, ENTERPRISE]`
533
+ - `--access-mode` - access mode of the Filestore instance that will be created. Possible values are: `[ReadWriteOnce, ReadOnlyMany, ReadWriteMany]`
534
+ - `--vol` - file share name of the Filestore instance that will be created.
535
+ - `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
536
+
537
+ ### List attached storages
538
+
539
+ ```shell
540
+ python3 xpk.py storage list \
541
+ --project=$PROJECT --cluster $CLUSTER --zone=$ZONE
542
+ ```
543
+
544
+ ### Running workloads with storage
545
+
546
+ If you specified `--auto-mount=true` when creating or attaching a storage, then all workloads deployed on the cluster will have the volume attached by default. Otherwise, in order to have the storage attached, you have to add `--storage` parameter to `workload create` command:
547
+
548
+ ```shell
549
+ python3 xpk.py workload create \
550
+ --workload xpk-test-workload --command "echo goodbye" \
551
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
552
+ --tpu-type=v5litepod-16 --storage=test-storage
553
+ ```
554
+
555
+ ### Detaching storage
556
+
557
+ ```shell
558
+ python3 xpk.py storage detach $STORAGE_NAME \
559
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
560
+ ```
561
+
562
+ ### Deleting storage
563
+
564
+ XPK allows you to remove Filestore instances easily with `xpk storage delete` command. **Warning:** this deletes all data contained in the Filestore!
565
+
566
+ ```shell
567
+ python3 xpk.py storage delete test-fs-instance \
568
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
569
+ ```
570
+
451
571
  ## Workload Create
452
572
  * Workload Create (submit training job):
453
573
 
@@ -455,7 +575,7 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
455
575
  python3 xpk.py workload create \
456
576
  --workload xpk-test-workload --command "echo goodbye" \
457
577
  --cluster xpk-test \
458
- --tpu-type=v5litepod-16
578
+ --tpu-type=v5litepod-16 --project=$PROJECT
459
579
  ```
460
580
 
461
581
  * Workload Create for Pathways:
@@ -528,6 +648,8 @@ To submit jobs on a cluster with A3 machines, run the below command. To create a
528
648
  ```
529
649
  > The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 machines as well.
530
650
 
651
+ In order to run NCCL test on A3 Ultra machines check out [this guide](/examples/nccl/nccl.md).
652
+
531
653
  ### Workload Priority and Preemption
532
654
  * Set the priority level of your workload with `--priority=LEVEL`
533
655
 
@@ -666,8 +788,6 @@ Check out [MaxText example](https://github.com/google/maxtext/pull/570) on how t
666
788
  ```
667
789
 
668
790
  * Workload List supports waiting for the completion of a specific job. XPK will follow an existing job until it has finished or the `timeout`, if provided, has been reached and then list the job. If no `timeout` is specified, the default value is set to the max value, 1 week. You may also set `timeout=0` to poll the job once.
669
- (Note: `restart-on-user-code-failure` must be set
670
- when creating the workload otherwise the workload will always finish with `Completed` status.)
671
791
 
672
792
  Wait for a job to complete.
673
793
 
@@ -759,6 +879,35 @@ Inspector output is saved to a file.
759
879
  [XPK] Exiting XPK cleanly
760
880
  ```
761
881
 
882
+ ## Run
883
+ * `xpk run` lets you execute scripts on a cluster with ease. It automates task execution, handles interruptions, and streams job output to your console.
884
+
885
+ ```shell
886
+ python xpk.py run --kind-cluster -n 2 -t 0-2 examples/job.sh
887
+ ```
888
+
889
+ * Example Output:
890
+
891
+ ```shell
892
+ [XPK] Starting xpk
893
+ [XPK] Task: `get current-context` is implemented by `kubectl config current-context`, hiding output unless there is an error.
894
+ [XPK] No local cluster name specified. Using current-context `kind-kind`
895
+ [XPK] Task: `run task` is implemented by `kubectl kjob create slurm --profile xpk-def-app-profile --localqueue multislice-queue --wait --rm -- examples/job.sh --partition multislice-queue --ntasks 2 --time 0-2`. Streaming output and input live.
896
+ job.batch/xpk-def-app-profile-slurm-g4vr6 created
897
+ configmap/xpk-def-app-profile-slurm-g4vr6 created
898
+ service/xpk-def-app-profile-slurm-g4vr6 created
899
+ Starting log streaming for pod xpk-def-app-profile-slurm-g4vr6-1-4rmgk...
900
+ Now processing task ID: 3
901
+ Starting log streaming for pod xpk-def-app-profile-slurm-g4vr6-0-bg6dm...
902
+ Now processing task ID: 1
903
+ exit
904
+ exit
905
+ Now processing task ID: 2
906
+ exit
907
+ Job logs streaming finished.[XPK] Task: `run task` terminated with code `0`
908
+ [XPK] XPK Done.
909
+ ```
910
+
762
911
  ## GPU usage
763
912
 
764
913
  In order to use XPK for GPU, you can do so by using `device-type` flag.
@@ -1241,6 +1390,8 @@ gcloud beta compute reservations describe $RESERVATION --project=$PROJECT_ID --z
1241
1390
 
1242
1391
  ## 403 error on workload create when using `--base-docker-image` flag
1243
1392
  You need authority to push to the registry from your local machine. Try running `gcloud auth configure-docker`.
1393
+ ## `Kubernetes API exception` - 404 error
1394
+ If an error of this kind appears after updating the xpk version, you may need to rerun the `cluster create` command in order to update the resource definitions.
1244
1395
 
1245
1396
  # TPU Workload Debugging
1246
1397
 
@@ -1,29 +1,3 @@
1
- Metadata-Version: 2.2
2
- Name: xpk
3
- Version: 0.6.0
4
- Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
- Author-email: XPK team <xpk-code-reviewers@google.com>
6
- License: Apache-2.0
7
- Project-URL: Homepage, https://github.com/google/xpk
8
- Project-URL: Bug Tracker, https://github.com/google/xpk/issues
9
- Classifier: Programming Language :: Python :: 3.10
10
- Classifier: Programming Language :: Python :: 3.11
11
- Requires-Python: >=3.10
12
- Description-Content-Type: text/markdown
13
- License-File: LICENSE
14
- Requires-Dist: cloud-accelerator-diagnostics
15
- Requires-Dist: tabulate
16
- Requires-Dist: ruamel.yaml
17
- Requires-Dist: pyyaml
18
- Requires-Dist: docker
19
- Requires-Dist: packaging
20
- Provides-Extra: dev
21
- Requires-Dist: pyink==24.3.0; extra == "dev"
22
- Requires-Dist: pylint>=2.6.0; extra == "dev"
23
- Requires-Dist: pre-commit; extra == "dev"
24
- Requires-Dist: pytest; extra == "dev"
25
- Requires-Dist: docker; extra == "dev"
26
-
27
1
  <!--
28
2
  Copyright 2023 Google LLC
29
3
 
@@ -42,6 +16,8 @@ Requires-Dist: docker; extra == "dev"
42
16
 
43
17
  [![Build Tests](https://github.com/google/xpk/actions/workflows/build_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/build_tests.yaml)
44
18
  [![Nightly Tests](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml/badge.svg)](https://github.com/google/xpk/actions/workflows/nightly_tests.yaml)
19
+ [![Develop Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/build_tests.yaml)
20
+ [![Develop Nightly Tests](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml/badge.svg?branch=develop)](https://github.com/AI-Hypercomputer/xpk/actions/workflows/nightly_tests.yaml)
45
21
 
46
22
  # Overview
47
23
 
@@ -80,7 +56,11 @@ and the following GPU types:
80
56
  and the following CPU types:
81
57
  * n2-standard-32
82
58
 
83
- # Cloud Console Permissions on the user or service account needed to run XPK:
59
+ xpk also supports Google Cloud Storage solutions:
60
+ * [Cloud Storage FUSE](#fuse)
61
+ * [Filestore](#filestore)
62
+
63
+ # Permissions needed on Cloud Console:
84
64
 
85
65
  * Artifact Registry Writer
86
66
  * Compute Admin
@@ -90,6 +70,7 @@ and the following CPU types:
90
70
  * Service Account User
91
71
  * Storage Admin
92
72
  * Vertex AI Administrator
73
+ * Filestore Editor (This role is necessary if you want to run the `storage create` command with `--type=gcpfilestore`)
93
74
 
94
75
  # Prerequisites
95
76
 
@@ -111,17 +92,28 @@ Following tools must be installed:
111
92
  # sudo may be required
112
93
  apt-get -y install make
113
94
  ```
114
- In addition, below dependencies will be installed with `make install` command:
95
+ In addition, the dependencies below can be installed either from the provided links or, if xpk was downloaded via `git clone`, with the `make install` command:
115
96
  - kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
116
97
  - kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
117
98
 
118
99
  # Installation
119
- To install xpk, run the following command and install additional tools, mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all neccessary tools:
100
+ To install xpk, first install the required tools mentioned in [prerequisites](#prerequisites). The [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all necessary tools. XPK can then be installed via pip:
120
101
 
121
102
  ```shell
122
103
  pip install xpk
123
104
  ```
124
105
 
106
+ If you see an error saying: `This environment is externally managed`, please use a virtual environment.
107
+
108
+ ```shell
109
+ ## One time step of creating the venv
110
+ VENV_DIR=~/venvp3
111
+ python3 -m venv $VENV_DIR
112
+ ## Enter your venv.
113
+ source $VENV_DIR/bin/activate
114
+ ## Install xpk and its dependencies.
115
+ pip install xpk
116
+ ```
125
117
 
126
118
  If you are running XPK by cloning GitHub repository, first run the
127
119
  following commands to begin using XPK commands:
@@ -174,6 +166,8 @@ cleanup with a `Cluster Delete`.
174
166
  If you have failures with workloads not running, use `xpk inspector` to investigate
175
167
  more.
176
168
 
169
+ If you need your Workloads to have persistent storage, use `xpk storage` to find out more.
170
+
177
171
  ## Cluster Create
178
172
 
179
173
  First set the project and zone through gcloud config or xpk arguments.
@@ -448,6 +442,101 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
448
442
  * --on-demand (only A3-Mega)
449
443
 
450
444
 
445
+ ## Storage
446
+ Currently XPK supports two types of storage: Cloud Storage FUSE and Google Cloud Filestore.
447
+
448
+ ### FUSE
449
+ A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
450
+
451
+ To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).
452
+
453
+ Once it's ready you can use `xpk storage attach` with `--type=gcsfuse` command to attach a FUSE storage instance to your cluster:
454
+
455
+ ```shell
456
+ python3 xpk.py storage attach test-fuse-storage --type=gcsfuse \
457
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
458
+ --mount-point='/test-mount-point' --readonly=false \
459
+ --bucket=test-bucket --size=1 --auto-mount=false
460
+ ```
461
+
462
+ Parameters:
463
+
464
+ - `--type` - type of the storage, currently xpk supports `gcsfuse` and `gcpfilestore` only.
465
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
466
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
467
+ - `--readonly` - if set to true, workload can only read from storage.
468
+ - `--size` - size of the storage in Gb.
469
+ - `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
470
+
471
+ ### Filestore
472
+
473
+ A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write objects in your volumes using standard file system semantics.
474
+
475
+ To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
476
+
477
+ ```shell
478
+ python3 xpk.py storage create test-fs-storage --type=gcpfilestore \
479
+ --auto-mount=false --mount-point=/data-fs --readonly=false \
480
+ --size=1024 --tier=BASIC_HDD --access-mode=ReadWriteMany --vol=default \
481
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
482
+ ```
483
+
484
+ You can also attach an existing Filestore instance to your cluster using `xpk storage attach` command:
485
+
486
+ ```shell
487
+ python3 xpk.py storage attach test-fs-storage --type=gcpfilestore \
488
+ --auto-mount=false --mount-point=/data-fs --readonly=false \
489
+ --size=1024 --tier=BASIC_HDD --access-mode=ReadWriteMany --vol=default \
490
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
491
+ ```
492
+
493
+ The command above is also useful when attaching multiple volumes from the same Filestore instance.
494
+
495
+ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore` accept following arguments:
496
+ - `--type` - type of the storage.
497
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
498
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
499
+ - `--readonly` - if set to true, workload can only read from storage.
500
+ - `--size` - size of the Filestore instance that will be created in Gb.
501
+ - `--tier` - tier of the Filestore instance that will be created. Possible options are: `[BASIC_HDD, BASIC_SSD, ZONAL, REGIONAL, ENTERPRISE]`
502
+ - `--access-mode` - access mode of the Filestore instance that will be created. Possible values are: `[ReadWriteOnce, ReadOnlyMany, ReadWriteMany]`
503
+ - `--vol` - file share name of the Filestore instance that will be created.
504
+ - `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
505
+
506
+ ### List attached storages
507
+
508
+ ```shell
509
+ python3 xpk.py storage list \
510
+ --project=$PROJECT --cluster $CLUSTER --zone=$ZONE
511
+ ```
512
+
513
+ ### Running workloads with storage
514
+
515
+ If you specified `--auto-mount=true` when creating or attaching a storage, then all workloads deployed on the cluster will have the volume attached by default. Otherwise, in order to have the storage attached, you have to add `--storage` parameter to `workload create` command:
516
+
517
+ ```shell
518
+ python3 xpk.py workload create \
519
+ --workload xpk-test-workload --command "echo goodbye" \
520
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
521
+ --tpu-type=v5litepod-16 --storage=test-storage
522
+ ```
523
+
524
+ ### Detaching storage
525
+
526
+ ```shell
527
+ python3 xpk.py storage detach $STORAGE_NAME \
528
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
529
+ ```
530
+
531
+ ### Deleting storage
532
+
533
+ XPK allows you to remove Filestore instances easily with `xpk storage delete` command. **Warning:** this deletes all data contained in the Filestore!
534
+
535
+ ```shell
536
+ python3 xpk.py storage delete test-fs-instance \
537
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE
538
+ ```
539
+
451
540
  ## Workload Create
452
541
  * Workload Create (submit training job):
453
542
 
@@ -455,7 +544,7 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
455
544
  python3 xpk.py workload create \
456
545
  --workload xpk-test-workload --command "echo goodbye" \
457
546
  --cluster xpk-test \
458
- --tpu-type=v5litepod-16
547
+ --tpu-type=v5litepod-16 --project=$PROJECT
459
548
  ```
460
549
 
461
550
  * Workload Create for Pathways:
@@ -528,6 +617,8 @@ To submit jobs on a cluster with A3 machines, run the below command. To create a
528
617
  ```
529
618
  > The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 machines as well.
530
619
 
620
+ In order to run NCCL test on A3 Ultra machines check out [this guide](/examples/nccl/nccl.md).
621
+
531
622
  ### Workload Priority and Preemption
532
623
  * Set the priority level of your workload with `--priority=LEVEL`
533
624
 
@@ -666,8 +757,6 @@ Check out [MaxText example](https://github.com/google/maxtext/pull/570) on how t
666
757
  ```
667
758
 
668
759
  * Workload List supports waiting for the completion of a specific job. XPK will follow an existing job until it has finished or the `timeout`, if provided, has been reached and then list the job. If no `timeout` is specified, the default value is set to the max value, 1 week. You may also set `timeout=0` to poll the job once.
669
- (Note: `restart-on-user-code-failure` must be set
670
- when creating the workload otherwise the workload will always finish with `Completed` status.)
671
760
 
672
761
  Wait for a job to complete.
673
762
 
@@ -759,6 +848,35 @@ Inspector output is saved to a file.
759
848
  [XPK] Exiting XPK cleanly
760
849
  ```
761
850
 
851
+ ## Run
852
+ * `xpk run` lets you execute scripts on a cluster with ease. It automates task execution, handles interruptions, and streams job output to your console.
853
+
854
+ ```shell
855
+ python xpk.py run --kind-cluster -n 2 -t 0-2 examples/job.sh
856
+ ```
857
+
858
+ * Example Output:
859
+
860
+ ```shell
861
+ [XPK] Starting xpk
862
+ [XPK] Task: `get current-context` is implemented by `kubectl config current-context`, hiding output unless there is an error.
863
+ [XPK] No local cluster name specified. Using current-context `kind-kind`
864
+ [XPK] Task: `run task` is implemented by `kubectl kjob create slurm --profile xpk-def-app-profile --localqueue multislice-queue --wait --rm -- examples/job.sh --partition multislice-queue --ntasks 2 --time 0-2`. Streaming output and input live.
865
+ job.batch/xpk-def-app-profile-slurm-g4vr6 created
866
+ configmap/xpk-def-app-profile-slurm-g4vr6 created
867
+ service/xpk-def-app-profile-slurm-g4vr6 created
868
+ Starting log streaming for pod xpk-def-app-profile-slurm-g4vr6-1-4rmgk...
869
+ Now processing task ID: 3
870
+ Starting log streaming for pod xpk-def-app-profile-slurm-g4vr6-0-bg6dm...
871
+ Now processing task ID: 1
872
+ exit
873
+ exit
874
+ Now processing task ID: 2
875
+ exit
876
+ Job logs streaming finished.[XPK] Task: `run task` terminated with code `0`
877
+ [XPK] XPK Done.
878
+ ```
879
+
762
880
  ## GPU usage
763
881
 
764
882
  In order to use XPK for GPU, you can do so by using `device-type` flag.
@@ -1241,6 +1359,8 @@ gcloud beta compute reservations describe $RESERVATION --project=$PROJECT_ID --z
1241
1359
 
1242
1360
  ## 403 error on workload create when using `--base-docker-image` flag
1243
1361
  You need authority to push to the registry from your local machine. Try running `gcloud auth configure-docker`.
1362
+ ## `Kubernetes API exception` - 404 error
1363
+ If an error of this kind appears after updating the xpk version, you may need to rerun the `cluster create` command to update the resource definitions.
1244
1364
 
1245
1365
  # TPU Workload Debugging
1246
1366
 
@@ -30,12 +30,17 @@ keywords = []
30
30
 
31
31
  # pip dependencies installed with `pip install -e .`
32
32
  dependencies = [
33
- "cloud-accelerator-diagnostics",
34
- "tabulate",
35
- "ruamel.yaml",
36
- "pyyaml",
37
- "docker",
38
- "packaging"
33
+ "cloud-accelerator-diagnostics==0.1.1",
34
+ "tabulate==0.9.0",
35
+ "ruamel.yaml==0.18.10",
36
+ "pyyaml==6.0.2",
37
+ "docker==7.1.0",
38
+ "kubernetes==31.0.0",
39
+ "google-cloud==0.34.0",
40
+ "google-api-core==2.24.1",
41
+ "packaging==24.2",
42
+ "google-cloud-filestore==1.12.0",
43
+ "google-cloud-storage==2.19.0"
39
44
  ]
40
45
 
41
46
  [project.urls]
@@ -43,7 +48,7 @@ dependencies = [
43
48
  "Bug Tracker" = "https://github.com/google/xpk/issues"
44
49
 
45
50
  [build-system]
46
- requires = ["setuptools>=61.0"]
51
+ requires = ["setuptools>=61.0", "setuptools-git-versioning"]
47
52
  build-backend = "setuptools.build_meta"
48
53
 
49
54
  [project.scripts]
@@ -57,15 +62,16 @@ dev = [
57
62
  "pylint>=2.6.0",
58
63
  "pre-commit",
59
64
  "pytest",
60
- "docker"
65
+ "docker==7.1.0"
61
66
  ]
62
67
 
63
68
  [tool.setuptools.dynamic]
64
- version = {attr = "xpk.core.core.__version__"}
69
+ version = {attr = "xpk.core.config.__version__"}
65
70
 
66
71
  [tool.setuptools]
67
- packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.utils", "xpk.core.blueprint", "xpk.core.workload_decorators"]
72
+ packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.api", "xpk.templates", "xpk.utils", "xpk.core.blueprint", "xpk.core.remote_state", "xpk.core.workload_decorators"]
68
73
  package-dir = {"" = "src"}
74
+ package-data = {"xpk.api" = ["storage_crd.yaml"], "xpk.templates" = ["storage.yaml"]}
69
75
 
70
76
  [tool.pyink]
71
77
  # Formatting configuration to follow Google style-guide.
@@ -0,0 +1,52 @@
1
+ apiVersion: apiextensions.k8s.io/v1
2
+ kind: CustomResourceDefinition
3
+ metadata:
4
+ name: storages.xpk.x-k8s.io
5
+ spec:
6
+ group: xpk.x-k8s.io
7
+ versions:
8
+ - name: v1
9
+ served: true
10
+ storage: true
11
+ schema:
12
+ openAPIV3Schema:
13
+ type: object
14
+ properties:
15
+ spec:
16
+ type: object
17
+ properties:
18
+ type:
19
+ type: string
20
+ cluster:
21
+ type: string
22
+ auto_mount:
23
+ type: boolean
24
+ mount_point:
25
+ type: string
26
+ readonly:
27
+ type: boolean
28
+ manifest:
29
+ type: string
30
+ pv:
31
+ type: string
32
+ pvc:
33
+ type: string
34
+ required:
35
+ - type
36
+ - cluster
37
+ - auto_mount
38
+ - mount_point
39
+ - readonly
40
+ - manifest
41
+ - pvc
42
+ - pv
43
+ x-kubernetes-validations:
44
+ - message: Value is immutable
45
+ rule: self == oldSelf
46
+ scope: Cluster
47
+ names:
48
+ plural: storages
49
+ singular: storage
50
+ kind: Storage
51
+ shortNames:
52
+ - stg