xpk 0.9.0__tar.gz → 0.10.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. {xpk-0.9.0 → xpk-0.10.1}/PKG-INFO +46 -7
  2. xpk-0.9.0/src/xpk.egg-info/PKG-INFO → xpk-0.10.1/README.md +44 -37
  3. {xpk-0.9.0 → xpk-0.10.1}/pyproject.toml +1 -1
  4. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/batch.py +3 -3
  5. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/cluster.py +22 -1
  6. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/cluster_gcluster.py +27 -0
  7. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/common.py +12 -5
  8. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/kjob_common.py +4 -1
  9. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/run.py +2 -2
  10. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/shell.py +2 -2
  11. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/storage.py +10 -3
  12. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/workload.py +64 -27
  13. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/blueprint/blueprint_generator.py +108 -40
  14. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/capacity.py +66 -6
  15. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/cluster.py +165 -7
  16. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/config.py +1 -65
  17. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/docker_manager.py +1 -1
  18. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/docker_resources.py +145 -72
  19. xpk-0.10.1/src/xpk/core/jobset.py +143 -0
  20. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/kjob.py +2 -6
  21. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/kueue.py +154 -5
  22. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/nodepool.py +17 -4
  23. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/pathways.py +1 -2
  24. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/storage.py +1 -95
  25. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/system_characteristics.py +1 -1
  26. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/workload.py +0 -44
  27. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/workload_decorators/rdma_decorator.py +2 -0
  28. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/workload_decorators/tcpx_decorator.py +10 -4
  29. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/workload_decorators/tcpxo_decorator.py +7 -0
  30. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/cluster.py +23 -7
  31. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/storage.py +2 -2
  32. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/workload.py +21 -3
  33. xpk-0.9.0/README.md → xpk-0.10.1/src/xpk.egg-info/PKG-INFO +76 -5
  34. {xpk-0.9.0 → xpk-0.10.1}/src/xpk.egg-info/SOURCES.txt +1 -0
  35. {xpk-0.9.0 → xpk-0.10.1}/src/xpk.egg-info/requires.txt +1 -1
  36. {xpk-0.9.0 → xpk-0.10.1}/LICENSE +0 -0
  37. {xpk-0.9.0 → xpk-0.10.1}/setup.cfg +0 -0
  38. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/__init__.py +0 -0
  39. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/api/__init__.py +0 -0
  40. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/api/storage_crd.yaml +0 -0
  41. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/__init__.py +0 -0
  42. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/config.py +0 -0
  43. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/info.py +0 -0
  44. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/inspector.py +0 -0
  45. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/job.py +0 -0
  46. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/kind.py +0 -0
  47. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/version.py +0 -0
  48. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/__init__.py +0 -0
  49. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/blueprint/__init__.py +0 -0
  50. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  51. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/cluster_private.py +0 -0
  52. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/commands.py +0 -0
  53. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/docker_container.py +0 -0
  54. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/docker_image.py +0 -0
  55. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/filestore.py +0 -0
  56. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/gcloud_context.py +0 -0
  57. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/gcluster_manager.py +0 -0
  58. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/gcsfuse.py +0 -0
  59. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/monitoring.py +0 -0
  60. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/mtc.py +0 -0
  61. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/nap.py +0 -0
  62. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/network.py +0 -0
  63. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/ray.py +0 -0
  64. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/remote_state/__init__.py +0 -0
  65. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  66. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  67. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/resources.py +0 -0
  68. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/scheduling.py +0 -0
  69. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/vertex.py +0 -0
  70. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/workload_decorators/__init__.py +0 -0
  71. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  72. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/main.py +0 -0
  73. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/__init__.py +0 -0
  74. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/batch.py +0 -0
  75. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/common.py +0 -0
  76. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/config.py +0 -0
  77. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/core.py +0 -0
  78. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/info.py +0 -0
  79. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/inspector.py +0 -0
  80. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/job.py +0 -0
  81. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/kind.py +0 -0
  82. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/run.py +0 -0
  83. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/shell.py +0 -0
  84. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/validators.py +0 -0
  85. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/parser/version.py +0 -0
  86. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/templates/__init__.py +0 -0
  87. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/templates/storage.yaml +0 -0
  88. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/utils/__init__.py +0 -0
  89. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/utils/console.py +0 -0
  90. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/utils/file.py +0 -0
  91. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/utils/gcs_utils.py +0 -0
  92. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/utils/kubectl.py +0 -0
  93. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/utils/network.py +0 -0
  94. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/utils/objects.py +0 -0
  95. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/utils/templates.py +0 -0
  96. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/utils/validation.py +0 -0
  97. {xpk-0.9.0 → xpk-0.10.1}/src/xpk/utils/yaml.py +0 -0
  98. {xpk-0.9.0 → xpk-0.10.1}/src/xpk.egg-info/dependency_links.txt +0 -0
  99. {xpk-0.9.0 → xpk-0.10.1}/src/xpk.egg-info/entry_points.txt +0 -0
  100. {xpk-0.9.0 → xpk-0.10.1}/src/xpk.egg-info/top_level.txt +0 -0
{xpk-0.9.0 → xpk-0.10.1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: xpk
- Version: 0.9.0
+ Version: 0.10.1
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
  Author-email: XPK team <xpk-code-reviewers@google.com>
  License: Apache-2.0
@@ -21,7 +21,7 @@ Requires-Dist: google-cloud==0.34.0
  Requires-Dist: google-api-core==2.24.1
  Requires-Dist: packaging==24.2
  Requires-Dist: google-cloud-filestore==1.12.0
- Requires-Dist: google-cloud-storage==2.19.0
+ Requires-Dist: google-cloud-storage
  Provides-Extra: dev
  Requires-Dist: pyink==24.3.0; extra == "dev"
  Requires-Dist: pylint>=2.6.0; extra == "dev"
@@ -259,6 +259,13 @@ all zones.
  --num-slices=4 --spot
  ```

+ * Cluster Create (DWS flex queued capacity):
+ ```shell
+ python3 xpk.py cluster create \
+ --cluster xpk-test --tpu-type=v5litepod-16 \
+ --num-slices=4 --flex
+ ```
+
  * Cluster Create for Pathways:
  Pathways compatible cluster can be created using `cluster create-pathways`.
  ```shell
@@ -495,6 +502,7 @@ Currently, the below flags/arguments are supported for A3 Mega, A3 Ultra and A4
  * `--reservation`
  * `--spot`
  * `--on-demand` (A3 Mega only)
+ * `--flex`

  ## Running XPK on existing clusters

@@ -518,9 +526,10 @@ Currently XPK supports the below types of storages:
  - [Google Cloud Filestore](#filestore)
  - [Google Cloud Parallelstore](#parallelstore)
  - [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
+ - [Google Cloud Managed Lustre](#managed-lustre)

  ### FUSE
- A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
+ A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so workloads can read and write objects in your bucket using standard file system semantics.

  To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).

@@ -547,7 +556,7 @@ Parameters:

  ### Filestore

- A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+ A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.

  To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:

@@ -583,7 +592,7 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore

  ### Parallelstore

- A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+ A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.

  To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).

@@ -607,7 +616,7 @@ Parameters:

  ### Block storage (Persistent Disk, Hyperdisk)

- A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+ A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.

  To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please consider that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.

@@ -629,6 +638,30 @@ Parameters:
  - `--readonly` - if set to true, workload can only read from storage.
  - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.

+ ### Managed Lustre
+
+ A Managed Lustre adaptor lets you mount and access [Google Cloud Managed Lustre instances](https://cloud.google.com/kubernetes-engine/docs/concepts/managed-lustre) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
+
+ To use the GCP Managed Lustre with XPK you need to create [an instance](https://cloud.google.com/managed-lustre/docs/create-instance). Please make sure you enable GKE support when creating the instance (gcloud ex. `--gke-support-enabled`).
+
+ Once it's ready you can use `xpk storage attach` with `--type=lustre` command to attach a Managed Lustre instance to your cluster. Currently, attaching a Managed Lustre instance is supported only by providing a manifest file.
+
+ ```shell
+ python3 xpk.py storage attach test-lustre-storage --type=lustre \
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
+ --mount-point='/test-mount-point' --readonly=false \
+ --auto-mount=true \
+ --manifest='./examples/storage/lustre-manifest-attach.yaml'
+ ```
+
+ Parameters:
+
+ - `--type` - type of the storage `lustre`
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
+ - `--readonly` - if set to true, workload can only read from storage.
+ - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
+
  ### List attached storages

  ```shell
@@ -670,8 +703,14 @@ python3 xpk.py storage delete test-fs-instance \
  python3 xpk.py workload create \
  --workload xpk-test-workload --command "echo goodbye" \
  --cluster xpk-test \
- --tpu-type=v5litepod-16 --projet=$PROJECT
+ --tpu-type=v5litepod-16 --project=$PROJECT
  ```
+ * Workload create(DWS flex with queued provisioning):
+ ```shell
+ python3 xpk.py workload create \
+ --workload xpk-test-workload --command "echo goodbye" \
+ --cluster xpk-test --flex \
+ --tpu-type=v5litepod-16 --project=$PROJECT

  * Workload Create for Pathways:
  Pathways workload can be submitted using `workload create-pathways` on a Pathways enabled cluster (created with `cluster create-pathways`)
xpk-0.9.0/src/xpk.egg-info/PKG-INFO → xpk-0.10.1/README.md

@@ -1,35 +1,3 @@
- Metadata-Version: 2.4
- Name: xpk
- Version: 0.9.0
- Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
- Author-email: XPK team <xpk-code-reviewers@google.com>
- License: Apache-2.0
- Project-URL: Homepage, https://github.com/google/xpk
- Project-URL: Bug Tracker, https://github.com/google/xpk/issues
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: cloud-accelerator-diagnostics==0.1.1
- Requires-Dist: tabulate==0.9.0
- Requires-Dist: ruamel.yaml==0.18.10
- Requires-Dist: pyyaml==6.0.2
- Requires-Dist: docker==7.1.0
- Requires-Dist: kubernetes==31.0.0
- Requires-Dist: google-cloud==0.34.0
- Requires-Dist: google-api-core==2.24.1
- Requires-Dist: packaging==24.2
- Requires-Dist: google-cloud-filestore==1.12.0
- Requires-Dist: google-cloud-storage==2.19.0
- Provides-Extra: dev
- Requires-Dist: pyink==24.3.0; extra == "dev"
- Requires-Dist: pylint>=2.6.0; extra == "dev"
- Requires-Dist: pre-commit; extra == "dev"
- Requires-Dist: pytest; extra == "dev"
- Requires-Dist: docker==7.1.0; extra == "dev"
- Dynamic: license-file
-
  <!--
  Copyright 2023 Google LLC

@@ -259,6 +227,13 @@ all zones.
  --num-slices=4 --spot
  ```

+ * Cluster Create (DWS flex queued capacity):
+ ```shell
+ python3 xpk.py cluster create \
+ --cluster xpk-test --tpu-type=v5litepod-16 \
+ --num-slices=4 --flex
+ ```
+
  * Cluster Create for Pathways:
  Pathways compatible cluster can be created using `cluster create-pathways`.
  ```shell
@@ -495,6 +470,7 @@ Currently, the below flags/arguments are supported for A3 Mega, A3 Ultra and A4
  * `--reservation`
  * `--spot`
  * `--on-demand` (A3 Mega only)
+ * `--flex`

  ## Running XPK on existing clusters

@@ -518,9 +494,10 @@ Currently XPK supports the below types of storages:
  - [Google Cloud Filestore](#filestore)
  - [Google Cloud Parallelstore](#parallelstore)
  - [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
+ - [Google Cloud Managed Lustre](#managed-lustre)

  ### FUSE
- A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
+ A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so workloads can read and write objects in your bucket using standard file system semantics.

  To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).

@@ -547,7 +524,7 @@ Parameters:

  ### Filestore

- A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+ A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.

  To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:

@@ -583,7 +560,7 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore

  ### Parallelstore

- A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+ A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.

  To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).

@@ -607,7 +584,7 @@ Parameters:

  ### Block storage (Persistent Disk, Hyperdisk)

- A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
+ A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.

  To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please consider that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.

@@ -629,6 +606,30 @@ Parameters:
  - `--readonly` - if set to true, workload can only read from storage.
  - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.

+ ### Managed Lustre
+
+ A Managed Lustre adaptor lets you mount and access [Google Cloud Managed Lustre instances](https://cloud.google.com/kubernetes-engine/docs/concepts/managed-lustre) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
+
+ To use the GCP Managed Lustre with XPK you need to create [an instance](https://cloud.google.com/managed-lustre/docs/create-instance). Please make sure you enable GKE support when creating the instance (gcloud ex. `--gke-support-enabled`).
+
+ Once it's ready you can use `xpk storage attach` with `--type=lustre` command to attach a Managed Lustre instance to your cluster. Currently, attaching a Managed Lustre instance is supported only by providing a manifest file.
+
+ ```shell
+ python3 xpk.py storage attach test-lustre-storage --type=lustre \
+ --project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
+ --mount-point='/test-mount-point' --readonly=false \
+ --auto-mount=true \
+ --manifest='./examples/storage/lustre-manifest-attach.yaml'
+ ```
+
+ Parameters:
+
+ - `--type` - type of the storage `lustre`
+ - `--auto-mount` - if set to true all workloads will have this storage mounted by default.
+ - `--mount-point` - the path on which this storage should be mounted for a workload.
+ - `--readonly` - if set to true, workload can only read from storage.
+ - `--manifest` - path to the manifest file containing PersistentVolume and PresistentVolumeClaim definitions.
+
  ### List attached storages

  ```shell
@@ -670,8 +671,14 @@ python3 xpk.py storage delete test-fs-instance \
  python3 xpk.py workload create \
  --workload xpk-test-workload --command "echo goodbye" \
  --cluster xpk-test \
- --tpu-type=v5litepod-16 --projet=$PROJECT
+ --tpu-type=v5litepod-16 --project=$PROJECT
  ```
+ * Workload create(DWS flex with queued provisioning):
+ ```shell
+ python3 xpk.py workload create \
+ --workload xpk-test-workload --command "echo goodbye" \
+ --cluster xpk-test --flex \
+ --tpu-type=v5litepod-16 --project=$PROJECT

  * Workload Create for Pathways:
  Pathways workload can be submitted using `workload create-pathways` on a Pathways enabled cluster (created with `cluster create-pathways`)
{xpk-0.9.0 → xpk-0.10.1}/pyproject.toml

@@ -40,7 +40,7 @@ dependencies = [
      "google-api-core==2.24.1",
      "packaging==24.2",
      "google-cloud-filestore==1.12.0",
-     "google-cloud-storage==2.19.0"
+     "google-cloud-storage"
  ]

  [project.urls]
{xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/batch.py

@@ -18,7 +18,7 @@ import re
  from argparse import Namespace

  from ..core.cluster import (
-     create_xpk_k8s_service_account,
+     setup_k8s_service_accounts,
      get_cluster_credentials,
  )
  from ..core.commands import run_command_for_value
@@ -54,14 +54,14 @@ def batch(args: Namespace) -> None:
    err_code = prepare_kjob(args)
    if err_code > 0:
      xpk_exit(err_code)
-   create_xpk_k8s_service_account()
+   setup_k8s_service_accounts()

    submit_job(args)


  def submit_job(args: Namespace) -> None:

-   create_xpk_k8s_service_account()
+   setup_k8s_service_accounts()

    cmd = (
        'kubectl kjob create slurm'
{xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/cluster.py

@@ -31,6 +31,7 @@ from ..core.cluster import (
      update_cluster_with_gcsfuse_driver_if_necessary,
      update_cluster_with_parallelstore_driver_if_necessary,
      update_cluster_with_pd_driver_if_necessary,
+     update_cluster_with_lustre_driver_if_necessary,
      update_cluster_with_workload_identity_if_necessary,
  )
  from ..core.cluster_private import authorize_private_cluster_access_if_necessary
@@ -42,12 +43,14 @@ from ..core.gcloud_context import (
      get_gke_server_config,
      zone_to_region,
  )
+ from ..core.jobset import update_jobset_resources_if_necessary
  from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
  from ..core.kueue import (
      cluster_preheat_yml,
      install_kueue_crs,
      install_kueue_on_cluster,
      wait_for_kueue_available,
+     update_kueue_resources_if_necessary,
  )
  from ..core.nap import enable_autoprovisioning_on_cluster
  from ..core.network import (
@@ -170,7 +173,6 @@ def cluster_adapt(args) -> None:
    install_kueue(args, system, autoprovisioning_config)

    install_kjob(args)
-
    if system.accelerator_type == AcceleratorType['GPU']:
      prepare_gpus(args, system)

@@ -308,6 +310,9 @@ def cluster_create(args) -> None:
    set_jobset_on_cluster_code = set_jobset_on_cluster(args)
    if set_jobset_on_cluster_code != 0:
      xpk_exit(set_jobset_on_cluster_code)
+   update_jobset_resources_code = update_jobset_resources_if_necessary(args)
+   if update_jobset_resources_code != 0:
+     xpk_exit(update_jobset_resources_code)

    set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
    if set_pathways_job_on_cluster_code != 0:
@@ -879,6 +884,10 @@ def run_gke_cluster_create_command(
    if args.enable_pd_csi_driver:
      addons.append('GcePersistentDiskCsiDriver')

+   if args.enable_lustre_csi_driver:
+     addons.append('LustreCsiDriver')
+     command += ' --enable-legacy-lustre-port'
+
    if hasattr(args, 'enable_mtc') and args.enable_mtc:
      addons.append('HighScaleCheckpointing')

@@ -922,6 +931,13 @@ def install_storage_csis(args):
      if update_cluster_command_code != 0:
        xpk_exit(update_cluster_command_code)

+   if args.enable_lustre_csi_driver:
+     update_cluster_command_code = (
+         update_cluster_with_lustre_driver_if_necessary(args)
+     )
+     if update_cluster_command_code != 0:
+       xpk_exit(update_cluster_command_code)
+

  def install_kjob(args):
    xpk_print('Verifying kjob installation')
@@ -957,6 +973,11 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
    if enable_kueue_credentials_code != 0:
      xpk_exit(enable_kueue_credentials_code)

+   xpk_print('Update Kueue Controller Manager resources')
+   update_kueue_resources_code = update_kueue_resources_if_necessary(args)
+   if update_kueue_resources_code != 0:
+     xpk_exit(update_kueue_resources_code)
+

  def prepare_gpus(args, system: SystemCharacteristics):
    xpk_print('Installing NCCL Plugin for cluster')
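Both new steps follow the integer-return convention used throughout `cluster.py`: a helper returns 0 on success and a nonzero code that the caller hands to `xpk_exit`. Below is a minimal sketch of that pattern as the `cluster_create` hunk wires it up; the helper body is a hypothetical stand-in, not the actual code from the new `src/xpk/core/jobset.py` (which this diff section does not show):

```python
# Sketch of the return-code convention the new JobSet/Kueue steps plug into.
# Only the calling pattern (0 == success, nonzero -> xpk_exit) comes from the
# hunks above; the helper body is illustrative.
import sys


def xpk_exit(code: int) -> None:
  """Simplified stand-in for xpk.utils.console.xpk_exit."""
  sys.exit(code)


def update_jobset_resources_if_necessary(args) -> int:
  """Hypothetical body: patch the JobSet controller's resources if needed."""
  try:
    # e.g. patch the jobset controller Deployment here
    return 0
  except RuntimeError:
    return 1


def cluster_create_fragment(args) -> None:
  # Mirrors the cluster_create hunk above: bail out with the helper's code.
  update_jobset_resources_code = update_jobset_resources_if_necessary(args)
  if update_jobset_resources_code != 0:
    xpk_exit(update_jobset_resources_code)
```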
{xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/cluster_gcluster.py

@@ -37,6 +37,7 @@ from ..utils.console import xpk_exit, xpk_print
  from ..utils.file import ensure_directory_exists
  from ..utils.network import all_IPs_cidr
  from ..utils.objects import hash_string
+ from ..core.capacity import get_reservation_maintenance_interval, get_reservation_placement_policy

  blueprints_path = os.path.abspath('xpkclusters/blueprints')
  gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
@@ -234,6 +235,30 @@ def generate_blueprint(
    if args.device_type in supported_device_types:
      if args.device_type == a3mega_device_type:
        num_nodes = args.num_nodes if not args.num_nodes is None else 2
+
+       maintenance_interval = (
+           get_reservation_maintenance_interval(
+               args.reservation, args.zone, args.project
+           )
+           if args.reservation is not None
+           else 'PERIODIC'
+       )
+       placement_policy_name = (
+           get_reservation_placement_policy(
+               args.reservation, args.zone, args.project
+           )
+           if args.reservation is not None
+           else None
+       )
+       placement_policy = (
+           {
+               'type': 'COMPACT',
+               'name': placement_policy_name.split('/')[-1],
+           }
+           if placement_policy_name is not None
+           and len(placement_policy_name) > 0
+           else None
+       )
        return bpg.generate_a3_mega_blueprint(
            blueprint_name=blueprint_name,
            prefix=prefix,
@@ -243,6 +268,8 @@
            zone=args.zone,
            auth_cidr=all_IPs_cidr,
            num_nodes=num_nodes,
+           reservation_maintenance_interval=maintenance_interval,
+           reservation_placement_policy=placement_policy,
            reservation=args.reservation if args.reservation else None,
            capacity_type=capacity_type,
            system_node_pool_machine_type=args.default_pool_cpu_machine_type,
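The `placement_policy` expression above keeps only the trailing segment of the policy's resource name. A tiny worked example of that `split('/')[-1]` step; the resource-link value here is illustrative, real values come from `get_reservation_placement_policy`:

```python
# Worked example of the name-shortening step in generate_blueprint above.
# The resource link is an assumed example value.
placement_policy_name = (
    'projects/my-project/regions/us-central1/resourcePolicies/my-compact-policy'
)

placement_policy = {
    'type': 'COMPACT',
    'name': placement_policy_name.split('/')[-1],  # -> 'my-compact-policy'
}

assert placement_policy['name'] == 'my-compact-policy'
```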
{xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/common.py

@@ -15,10 +15,12 @@ limitations under the License.
  """

  from ..core.commands import run_command_with_updates_retry
- from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
  from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
  from ..core.gcloud_context import zone_to_region
  from ..utils.console import xpk_print, xpk_exit
+ from ..core.system_characteristics import (
+     SystemCharacteristics,
+ )


  def set_cluster_command(args) -> int:
@@ -47,7 +49,11 @@ def set_cluster_command(args) -> int:
    return return_code


- def is_TAS_possible(args) -> bool:
+ def is_TAS_possible(
+     system_characteristics: SystemCharacteristics,
+     capacity_type: CapacityType,
+     flex: bool,
+ ) -> bool:
    """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible

    Args:
@@ -56,8 +62,6 @@ def is_TAS_possible(args) -> bool:
    Returns:
      True if possible and False otherwise.
    """
-   system_characteristics = get_cluster_system_characteristics(args)
-   capacity_type = get_cluster_capacity_type(args)

    if system_characteristics is None:
      xpk_print('system_characteristics data was not found in configmaps.')
@@ -67,9 +71,12 @@
    xpk_print('capacity_type data was not found in configmaps.')
    xpk_exit(1)

+   if flex:
+     return False
+
    if (
        system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
-       and capacity_type == CapacityType.SPOT
+       and capacity_type != CapacityType.RESERVATION
    ):
      return False

{xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/kjob_common.py

@@ -27,6 +27,7 @@ from ..core.kjob import (
      Kueue_TAS_annotation,
  )
  from .common import is_TAS_possible
+ from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics


  def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
@@ -50,7 +51,9 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:


  def add_TAS_annotations_to_command(args, cmd: str) -> str:
-   if is_TAS_possible(args):
+   system_characteristics = get_cluster_system_characteristics(args)
+   capacity_type = get_cluster_capacity_type(args)
+   if is_TAS_possible(system_characteristics, capacity_type, flex=False):
      cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"

    return cmd
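Taken together, the `common.py` and `kjob_common.py` hunks turn `is_TAS_possible` into a pure predicate: callers now fetch the configmap-backed inputs themselves, and flex capacity short-circuits the check. A condensed restatement of the new decision logic follows, with stand-in definitions for the enum members and device-type string (their real values live in `src/xpk/core/capacity.py`, outside this section):

```python
# Condensed restatement of the refactored TAS check. The branch logic mirrors
# the common.py hunks above; the constants below are assumed stand-ins.
from enum import Enum


class CapacityType(Enum):  # stand-in for xpk.core.capacity.CapacityType
  ON_DEMAND = 'on_demand'
  RESERVATION = 'reservation'
  SPOT = 'spot'


H100_MEGA_DEVICE_TYPE = 'h100-mega-80gb-8'  # assumed value


def is_tas_possible_sketch(
    device_type: str, capacity_type: CapacityType, flex: bool
) -> bool:
  if flex:
    return False  # DWS flex capacity never uses Kueue TAS
  if (
      device_type == H100_MEGA_DEVICE_TYPE
      and capacity_type != CapacityType.RESERVATION
  ):
    # 0.9.0 excluded only SPOT here; 0.10.1 excludes anything non-reserved.
    return False
  return True
```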
{xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/run.py

@@ -17,7 +17,7 @@ limitations under the License.
  from argparse import Namespace

  from ..core.cluster import (
-     create_xpk_k8s_service_account,
+     setup_k8s_service_accounts,
      get_cluster_credentials,
  )
  from ..core.commands import run_command_with_full_controls
@@ -53,7 +53,7 @@ def run(args: Namespace) -> None:
    err_code = prepare_kjob(args)
    if err_code > 0:
      xpk_exit(err_code)
-   create_xpk_k8s_service_account()
+   setup_k8s_service_accounts()

    submit_job(args)

{xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/shell.py

@@ -12,7 +12,7 @@ limitations under the License.
  """

  from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
- from ..core.cluster import get_cluster_credentials, add_zone_and_project, create_xpk_k8s_service_account
+ from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
  from ..utils.console import xpk_exit, xpk_print
  from argparse import Namespace

@@ -82,7 +82,7 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
    err_code = prepare_kjob(args)
    if err_code > 0:
      xpk_exit(err_code)
-   create_xpk_k8s_service_account()
+   setup_k8s_service_accounts()

    cmd = (
        'kubectl-kjob create interactive --profile'
{xpk-0.9.0 → xpk-0.10.1}/src/xpk/commands/storage.py

@@ -29,6 +29,7 @@ from ..core.cluster import (
      setup_k8s_env,
      update_cluster_with_parallelstore_driver_if_necessary,
      update_cluster_with_pd_driver_if_necessary,
+     update_cluster_with_lustre_driver_if_necessary,
      update_cluster_with_gcpfilestore_driver_if_necessary,
      update_cluster_with_gcsfuse_driver_if_necessary,
      update_cluster_with_workload_identity_if_necessary,
@@ -45,6 +46,7 @@ from ..core.storage import (
      GCS_FUSE_TYPE,
      GCE_PD_TYPE,
      PARALLELSTORE_TYPE,
+     LUSTRE_TYPE,
      STORAGE_CRD_PLURAL,
      XPK_API_GROUP_NAME,
      XPK_API_GROUP_VERSION,
@@ -183,11 +185,11 @@ def storage_attach(args: Namespace) -> None:
          args.prefetch_metadata,
      )

-   elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE]:
+   elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE]:
      if args.manifest is None:
        xpk_print(
-           "Parallelstore and PersistentDisk are currently supported only with"
-           " --manifest"
+           "Parallelstore, PersistentDisk, and Lustre are currently supported"
+           " only with --manifest"
        )
        xpk_exit(1)

@@ -234,6 +236,11 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
      if return_code > 0:
        xpk_exit(return_code)

+   if args.type == LUSTRE_TYPE:
+     return_code = update_cluster_with_lustre_driver_if_necessary(args)
+     if return_code > 0:
+       xpk_exit(return_code)
+

  def storage_list(args: Namespace) -> None:
    k8s_api_client = setup_k8s_env(args)