xpk 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +8 -8
- xpk/commands/cluster.py +19 -19
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +7 -3
- xpk/commands/info.py +12 -12
- xpk/commands/inspector.py +1 -1
- xpk/commands/job.py +42 -12
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +6 -3
- xpk/commands/workload.py +28 -15
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +3 -1
- xpk/core/cluster.py +14 -8
- xpk/core/cluster_private.py +8 -2
- xpk/core/commands.py +13 -10
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_image.py +14 -5
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +10 -5
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +7 -3
- xpk/core/kueue.py +28 -8
- xpk/core/nap.py +5 -5
- xpk/core/network.py +1 -1
- xpk/core/nodepool.py +8 -3
- xpk/core/nodepool_test.py +82 -0
- xpk/core/pathways.py +6 -2
- xpk/core/ray.py +1 -1
- xpk/core/resources.py +18 -14
- xpk/core/scheduling.py +4 -0
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +12 -10
- xpk/parser/cluster.py +110 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- xpk/utils/execution_context.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/network.py +4 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/METADATA +4 -1
- xpk-0.13.0.dist-info/RECORD +101 -0
- xpk-0.11.0.dist-info/RECORD +0 -95
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/WHEEL +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/top_level.txt +0 -0
xpk/parser/cluster.py
CHANGED
|
@@ -29,7 +29,7 @@ from ..commands.cluster import (
|
|
|
29
29
|
from ..commands.config import xpk_cfg
|
|
30
30
|
from ..core.config import CFG_BUCKET_KEY
|
|
31
31
|
from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
|
|
32
|
-
from .common import add_shared_arguments
|
|
32
|
+
from .common import add_shared_arguments, ParserOrArgumentGroup
|
|
33
33
|
from .validators import name_type
|
|
34
34
|
|
|
35
35
|
|
|
@@ -174,6 +174,13 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
174
174
|
'Arguments for configuring MTC in cluster create.',
|
|
175
175
|
)
|
|
176
176
|
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
177
|
+
|
|
178
|
+
cluster_create_resource_limits = cluster_create_parser.add_argument_group(
|
|
179
|
+
'Optional Resource Limits Arguments',
|
|
180
|
+
'Arguments for configuring resource limits in cluster create.',
|
|
181
|
+
)
|
|
182
|
+
add_resource_limits(cluster_create_resource_limits)
|
|
183
|
+
|
|
177
184
|
cluster_create_parser.set_defaults(func=cluster_create)
|
|
178
185
|
|
|
179
186
|
|
|
@@ -208,6 +215,14 @@ def set_cluster_create_pathways_parser(
|
|
|
208
215
|
cluster_create_pathways_optional_arguments
|
|
209
216
|
)
|
|
210
217
|
|
|
218
|
+
autoprovisioning_arguments = (
|
|
219
|
+
cluster_create_pathways_parser.add_argument_group(
|
|
220
|
+
'Autoprovisioning Arguments',
|
|
221
|
+
'Optional arguments for enabling autoprovisioning.',
|
|
222
|
+
)
|
|
223
|
+
)
|
|
224
|
+
add_autoprovisioning_arguments(autoprovisioning_arguments)
|
|
225
|
+
|
|
211
226
|
### Capacity arguments specific to "cluster create-pathways"
|
|
212
227
|
cluster_create_pathways_capacity_arguments = (
|
|
213
228
|
cluster_create_pathways_parser.add_argument_group(
|
|
@@ -237,6 +252,15 @@ def set_cluster_create_pathways_parser(
|
|
|
237
252
|
)
|
|
238
253
|
)
|
|
239
254
|
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
255
|
+
|
|
256
|
+
cluster_create_resource_limits = (
|
|
257
|
+
cluster_create_pathways_parser.add_argument_group(
|
|
258
|
+
'Optional Resource Limits Arguments',
|
|
259
|
+
'Arguments for configuring resource limits in cluster create.',
|
|
260
|
+
)
|
|
261
|
+
)
|
|
262
|
+
add_resource_limits(cluster_create_resource_limits)
|
|
263
|
+
|
|
240
264
|
cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways)
|
|
241
265
|
|
|
242
266
|
|
|
@@ -312,6 +336,13 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
|
|
|
312
336
|
'Arguments for configuring MTC in cluster create.',
|
|
313
337
|
)
|
|
314
338
|
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
339
|
+
|
|
340
|
+
cluster_create_resource_limits = cluster_create_ray_parser.add_argument_group(
|
|
341
|
+
'Optional Resource Limits Arguments',
|
|
342
|
+
'Arguments for configuring resource limits in cluster create.',
|
|
343
|
+
)
|
|
344
|
+
add_resource_limits(cluster_create_resource_limits)
|
|
345
|
+
|
|
315
346
|
cluster_create_ray_parser.set_defaults(func=cluster_create_ray_cluster)
|
|
316
347
|
|
|
317
348
|
|
|
@@ -529,15 +560,15 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
|
|
|
529
560
|
cluster_adapt_parser.set_defaults(func=cluster_adapt)
|
|
530
561
|
|
|
531
562
|
|
|
532
|
-
def add_autoprovisioning_arguments(
|
|
533
|
-
|
|
563
|
+
def add_autoprovisioning_arguments(parser_or_group: ParserOrArgumentGroup):
|
|
564
|
+
parser_or_group.add_argument(
|
|
534
565
|
'--enable-autoprovisioning',
|
|
535
566
|
action='store_true',
|
|
536
567
|
help=(
|
|
537
568
|
'Enable GKE features for autoprovisioning node pools in GKE clusters.'
|
|
538
569
|
),
|
|
539
570
|
)
|
|
540
|
-
|
|
571
|
+
parser_or_group.add_argument(
|
|
541
572
|
'--autoprovisioning-min-chips',
|
|
542
573
|
type=int,
|
|
543
574
|
help=(
|
|
@@ -546,7 +577,7 @@ def add_autoprovisioning_arguments(parser: ArgumentParser):
|
|
|
546
577
|
' resources in the cluster as the minimum, and maximum.'
|
|
547
578
|
),
|
|
548
579
|
)
|
|
549
|
-
|
|
580
|
+
parser_or_group.add_argument(
|
|
550
581
|
'--autoprovisioning-max-chips',
|
|
551
582
|
type=int,
|
|
552
583
|
help=(
|
|
@@ -557,13 +588,15 @@ def add_autoprovisioning_arguments(parser: ArgumentParser):
|
|
|
557
588
|
)
|
|
558
589
|
|
|
559
590
|
|
|
560
|
-
def add_shared_cluster_create_required_arguments(
|
|
591
|
+
def add_shared_cluster_create_required_arguments(
|
|
592
|
+
parser_or_group: ParserOrArgumentGroup,
|
|
593
|
+
):
|
|
561
594
|
"""Add shared required arguments in cluster create and Pathways cluster create.
|
|
562
595
|
|
|
563
596
|
Args:
|
|
564
|
-
|
|
597
|
+
parser_or_group: cluster create argument parser or argument group
|
|
565
598
|
"""
|
|
566
|
-
|
|
599
|
+
parser_or_group.add_argument(
|
|
567
600
|
'--cluster',
|
|
568
601
|
type=name_type,
|
|
569
602
|
default=None,
|
|
@@ -575,21 +608,23 @@ def add_shared_cluster_create_required_arguments(parser: ArgumentParser):
|
|
|
575
608
|
)
|
|
576
609
|
|
|
577
610
|
|
|
578
|
-
def add_shared_cluster_create_optional_arguments(
|
|
611
|
+
def add_shared_cluster_create_optional_arguments(
|
|
612
|
+
parser_or_group: ParserOrArgumentGroup,
|
|
613
|
+
):
|
|
579
614
|
"""Add shared optional arguments in cluster create and Pathways cluster create.
|
|
580
615
|
|
|
581
616
|
Args:
|
|
582
|
-
|
|
617
|
+
parser_or_group: cluster create argument parser or argument group
|
|
583
618
|
"""
|
|
584
|
-
add_shared_arguments(
|
|
585
|
-
|
|
619
|
+
add_shared_arguments(parser_or_group)
|
|
620
|
+
parser_or_group.add_argument(
|
|
586
621
|
'--host-maintenance-interval',
|
|
587
622
|
type=str,
|
|
588
623
|
choices=['AS_NEEDED', 'PERIODIC'],
|
|
589
624
|
default='AS_NEEDED',
|
|
590
625
|
help='The maintenance policy of the cluster and respective clusters.',
|
|
591
626
|
)
|
|
592
|
-
|
|
627
|
+
parser_or_group.add_argument(
|
|
593
628
|
'--gke-version',
|
|
594
629
|
type=str,
|
|
595
630
|
help=(
|
|
@@ -598,20 +633,20 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
|
598
633
|
' recommended version.'
|
|
599
634
|
),
|
|
600
635
|
)
|
|
601
|
-
|
|
636
|
+
parser_or_group.add_argument(
|
|
602
637
|
'--num-slices',
|
|
603
638
|
type=int,
|
|
604
639
|
default=1,
|
|
605
640
|
help='The number of slices to run the job on, defaults to 1.',
|
|
606
641
|
required=False,
|
|
607
642
|
)
|
|
608
|
-
|
|
643
|
+
parser_or_group.add_argument(
|
|
609
644
|
'--pathways-gce-machine-type',
|
|
610
645
|
type=str,
|
|
611
646
|
default='n2-standard-64',
|
|
612
647
|
help='The CPU type for Pathways CPU nodepools',
|
|
613
648
|
)
|
|
614
|
-
|
|
649
|
+
parser_or_group.add_argument(
|
|
615
650
|
'--default-pool-cpu-machine-type',
|
|
616
651
|
type=str,
|
|
617
652
|
default='e2-standard-16',
|
|
@@ -620,7 +655,7 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
|
620
655
|
' regional clusters, all zones must support the machine type.'
|
|
621
656
|
),
|
|
622
657
|
)
|
|
623
|
-
|
|
658
|
+
parser_or_group.add_argument(
|
|
624
659
|
'--cluster-cpu-machine-type',
|
|
625
660
|
type=str,
|
|
626
661
|
default='',
|
|
@@ -631,7 +666,7 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
|
631
666
|
' cpu nodepools using --device-type.'
|
|
632
667
|
),
|
|
633
668
|
)
|
|
634
|
-
|
|
669
|
+
parser_or_group.add_argument(
|
|
635
670
|
'--default-pool-cpu-num-nodes',
|
|
636
671
|
type=int,
|
|
637
672
|
default=6,
|
|
@@ -641,7 +676,7 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
|
641
676
|
' over time.'
|
|
642
677
|
),
|
|
643
678
|
)
|
|
644
|
-
|
|
679
|
+
parser_or_group.add_argument(
|
|
645
680
|
'--custom-cluster-arguments',
|
|
646
681
|
type=str,
|
|
647
682
|
default='',
|
|
@@ -652,7 +687,7 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
|
652
687
|
" --custom-cluster-arguments='--network=mtu9k --subnetwork=mtu9k'"
|
|
653
688
|
),
|
|
654
689
|
)
|
|
655
|
-
|
|
690
|
+
parser_or_group.add_argument(
|
|
656
691
|
'--custom-nodepool-arguments',
|
|
657
692
|
type=str,
|
|
658
693
|
default='',
|
|
@@ -663,7 +698,7 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
|
663
698
|
' --custom-nodepool-arguments="--disk-size=300"'
|
|
664
699
|
),
|
|
665
700
|
)
|
|
666
|
-
|
|
701
|
+
parser_or_group.add_argument(
|
|
667
702
|
'--force',
|
|
668
703
|
action='store_true',
|
|
669
704
|
help=(
|
|
@@ -671,7 +706,7 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
|
671
706
|
' additional approval.'
|
|
672
707
|
),
|
|
673
708
|
)
|
|
674
|
-
|
|
709
|
+
parser_or_group.add_argument(
|
|
675
710
|
'--custom-tpu-nodepool-arguments',
|
|
676
711
|
type=str,
|
|
677
712
|
default='',
|
|
@@ -682,7 +717,7 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
|
682
717
|
' --custom-tpu-nodepool-arguments="--enable-ip-alias"'
|
|
683
718
|
),
|
|
684
719
|
)
|
|
685
|
-
|
|
720
|
+
parser_or_group.add_argument(
|
|
686
721
|
'--private',
|
|
687
722
|
action='store_true',
|
|
688
723
|
help=(
|
|
@@ -695,7 +730,7 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
|
695
730
|
' clusters.'
|
|
696
731
|
),
|
|
697
732
|
)
|
|
698
|
-
|
|
733
|
+
parser_or_group.add_argument(
|
|
699
734
|
'--authorized-networks',
|
|
700
735
|
action='extend',
|
|
701
736
|
nargs='+',
|
|
@@ -710,16 +745,16 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
|
710
745
|
' Example usage: --authorized-networks 1.2.3.0/24 1.2.4.5/32'
|
|
711
746
|
),
|
|
712
747
|
)
|
|
713
|
-
|
|
748
|
+
parser_or_group.add_argument(
|
|
714
749
|
'--enable-workload-identity',
|
|
715
750
|
action='store_true',
|
|
716
751
|
help='Enable Workload Identity Federation on the cluster and node-pools.',
|
|
717
752
|
)
|
|
718
|
-
add_driver_arguments(
|
|
753
|
+
add_driver_arguments(parser_or_group)
|
|
719
754
|
|
|
720
755
|
|
|
721
|
-
def add_driver_arguments(
|
|
722
|
-
|
|
756
|
+
def add_driver_arguments(parser_or_group: ParserOrArgumentGroup):
|
|
757
|
+
parser_or_group.add_argument(
|
|
723
758
|
'--enable-gcsfuse-csi-driver',
|
|
724
759
|
action='store_true',
|
|
725
760
|
help=(
|
|
@@ -728,42 +763,44 @@ def add_driver_arguments(parser: ArgumentParser):
|
|
|
728
763
|
' Identity is enabled by default.'
|
|
729
764
|
),
|
|
730
765
|
)
|
|
731
|
-
|
|
766
|
+
parser_or_group.add_argument(
|
|
732
767
|
'--enable-gcpfilestore-csi-driver',
|
|
733
768
|
action='store_true',
|
|
734
769
|
help='Enable GCPFilestore driver on the cluster.',
|
|
735
770
|
)
|
|
736
|
-
|
|
771
|
+
parser_or_group.add_argument(
|
|
737
772
|
'--enable-parallelstore-csi-driver',
|
|
738
773
|
action='store_true',
|
|
739
774
|
help='Enable Parallelstore CSI driver on the cluster.',
|
|
740
775
|
)
|
|
741
|
-
|
|
776
|
+
parser_or_group.add_argument(
|
|
742
777
|
'--enable-pd-csi-driver',
|
|
743
778
|
action='store_true',
|
|
744
779
|
help='Enable PersistentDisk CSI driver on the cluster.',
|
|
745
780
|
)
|
|
746
|
-
|
|
781
|
+
parser_or_group.add_argument(
|
|
747
782
|
'--enable-lustre-csi-driver',
|
|
748
783
|
action='store_true',
|
|
749
784
|
help='Enable Lustre CSI driver on the cluster.',
|
|
750
785
|
)
|
|
751
786
|
|
|
752
787
|
|
|
753
|
-
def add_shared_cluster_create_tensorboard_arguments(
|
|
788
|
+
def add_shared_cluster_create_tensorboard_arguments(
|
|
789
|
+
parser_or_group: ParserOrArgumentGroup,
|
|
790
|
+
):
|
|
754
791
|
"""Add shared tensorboard arguments in cluster create and Pathways cluster create.
|
|
755
792
|
Note that this feature enables non-Pathways workloads to use tensorboard arguments
|
|
756
793
|
on a Pathways cluster.
|
|
757
794
|
|
|
758
795
|
Args:
|
|
759
|
-
|
|
796
|
+
parser_or_group: cluster create argument parser or argument group
|
|
760
797
|
"""
|
|
761
|
-
|
|
798
|
+
parser_or_group.add_argument(
|
|
762
799
|
'--create-vertex-tensorboard',
|
|
763
800
|
action='store_true',
|
|
764
801
|
help='Set this flag to create a Tensorboard instance in Vertex AI.',
|
|
765
802
|
)
|
|
766
|
-
|
|
803
|
+
parser_or_group.add_argument(
|
|
767
804
|
'--tensorboard-region',
|
|
768
805
|
type=str,
|
|
769
806
|
default='us-central1',
|
|
@@ -774,7 +811,7 @@ def add_shared_cluster_create_tensorboard_arguments(parser: ArgumentParser):
|
|
|
774
811
|
' instance will be created in us-central1.'
|
|
775
812
|
),
|
|
776
813
|
)
|
|
777
|
-
|
|
814
|
+
parser_or_group.add_argument(
|
|
778
815
|
'--tensorboard-name',
|
|
779
816
|
type=str,
|
|
780
817
|
required=False,
|
|
@@ -787,13 +824,15 @@ def add_shared_cluster_create_tensorboard_arguments(parser: ArgumentParser):
|
|
|
787
824
|
)
|
|
788
825
|
|
|
789
826
|
|
|
790
|
-
def add_shared_cluster_create_capacity_arguments(
|
|
827
|
+
def add_shared_cluster_create_capacity_arguments(
|
|
828
|
+
parser_or_group: ParserOrArgumentGroup,
|
|
829
|
+
):
|
|
791
830
|
"""Add shared capacity arguments in cluster create and Pathways cluster create.
|
|
792
831
|
|
|
793
832
|
Args:
|
|
794
|
-
|
|
833
|
+
parser_or_group: cluster create argument parser or argument group
|
|
795
834
|
"""
|
|
796
|
-
|
|
835
|
+
parser_or_group.add_argument(
|
|
797
836
|
'--on-demand',
|
|
798
837
|
action='store_true',
|
|
799
838
|
help=(
|
|
@@ -802,7 +841,7 @@ def add_shared_cluster_create_capacity_arguments(parser: ArgumentParser):
|
|
|
802
841
|
' types.'
|
|
803
842
|
),
|
|
804
843
|
)
|
|
805
|
-
|
|
844
|
+
parser_or_group.add_argument(
|
|
806
845
|
'--reservation',
|
|
807
846
|
type=str,
|
|
808
847
|
help=(
|
|
@@ -811,7 +850,7 @@ def add_shared_cluster_create_capacity_arguments(parser: ArgumentParser):
|
|
|
811
850
|
' `--flex` or `--on-demand` for other capacity types.'
|
|
812
851
|
),
|
|
813
852
|
)
|
|
814
|
-
|
|
853
|
+
parser_or_group.add_argument(
|
|
815
854
|
'--spot',
|
|
816
855
|
action='store_true',
|
|
817
856
|
help=(
|
|
@@ -820,7 +859,7 @@ def add_shared_cluster_create_capacity_arguments(parser: ArgumentParser):
|
|
|
820
859
|
' capacity types.'
|
|
821
860
|
),
|
|
822
861
|
)
|
|
823
|
-
|
|
862
|
+
parser_or_group.add_argument(
|
|
824
863
|
'--flex',
|
|
825
864
|
action='store_true',
|
|
826
865
|
help=(
|
|
@@ -831,18 +870,20 @@ def add_shared_cluster_create_capacity_arguments(parser: ArgumentParser):
|
|
|
831
870
|
)
|
|
832
871
|
|
|
833
872
|
|
|
834
|
-
def add_shared_cluster_create_mtc_arguments(
|
|
873
|
+
def add_shared_cluster_create_mtc_arguments(
|
|
874
|
+
parser_or_group: ParserOrArgumentGroup,
|
|
875
|
+
):
|
|
835
876
|
"""Add shared Multi-tier Checkpointing arguments in cluster create and Pathways cluster create.
|
|
836
877
|
|
|
837
878
|
Args:
|
|
838
|
-
List of cluster create MTC arguments parsers
|
|
879
|
+
List of cluster create MTC arguments parsers or group
|
|
839
880
|
"""
|
|
840
|
-
|
|
881
|
+
parser_or_group.add_argument(
|
|
841
882
|
'--enable-mtc',
|
|
842
883
|
action='store_true',
|
|
843
884
|
help='Enable MTC on the cluster.',
|
|
844
885
|
)
|
|
845
|
-
|
|
886
|
+
parser_or_group.add_argument(
|
|
846
887
|
'--mtc-ramdisk-size',
|
|
847
888
|
type=str,
|
|
848
889
|
default=None,
|
|
@@ -851,7 +892,7 @@ def add_shared_cluster_create_mtc_arguments(parser: ArgumentParser):
|
|
|
851
892
|
' used for multi-tier checkpointing. e.g. "64Mi" '
|
|
852
893
|
),
|
|
853
894
|
)
|
|
854
|
-
|
|
895
|
+
parser_or_group.add_argument(
|
|
855
896
|
'--mtc-gcs-bucket',
|
|
856
897
|
type=str,
|
|
857
898
|
default=None,
|
|
@@ -860,7 +901,7 @@ def add_shared_cluster_create_mtc_arguments(parser: ArgumentParser):
|
|
|
860
901
|
' multi-tier checkpointing.'
|
|
861
902
|
),
|
|
862
903
|
)
|
|
863
|
-
|
|
904
|
+
parser_or_group.add_argument(
|
|
864
905
|
'--mtc-toleration-key',
|
|
865
906
|
type=str,
|
|
866
907
|
default=None,
|
|
@@ -869,3 +910,23 @@ def add_shared_cluster_create_mtc_arguments(parser: ArgumentParser):
|
|
|
869
910
|
' checkpointing. By default, it is set to "google.com/tpu".'
|
|
870
911
|
),
|
|
871
912
|
)
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
def add_resource_limits(parser_or_group: ParserOrArgumentGroup):
|
|
916
|
+
"""Add resource limits arguments in cluster create.
|
|
917
|
+
|
|
918
|
+
Args:
|
|
919
|
+
List of cluster create resource limits arguments parsers or group
|
|
920
|
+
"""
|
|
921
|
+
parser_or_group.add_argument(
|
|
922
|
+
'--memory-limit',
|
|
923
|
+
type=str,
|
|
924
|
+
default=None,
|
|
925
|
+
help='The memory limit for the Kueue controller manager.',
|
|
926
|
+
)
|
|
927
|
+
parser_or_group.add_argument(
|
|
928
|
+
'--cpu-limit',
|
|
929
|
+
type=int,
|
|
930
|
+
default=None,
|
|
931
|
+
help='The CPU limit for the Kueue controller manager.',
|
|
932
|
+
)
|
xpk/parser/common.py
CHANGED
|
@@ -15,24 +15,31 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
import argparse
|
|
18
|
+
from typing import Protocol, Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ParserOrArgumentGroup(Protocol):
|
|
22
|
+
|
|
23
|
+
def add_argument(self, *args, **kwargs) -> Any:
|
|
24
|
+
...
|
|
18
25
|
|
|
19
26
|
|
|
20
27
|
def add_shared_arguments(
|
|
21
|
-
|
|
28
|
+
custom_parser_or_group: ParserOrArgumentGroup, required=False
|
|
22
29
|
) -> None:
|
|
23
|
-
"""Add shared arguments to the parser.
|
|
30
|
+
"""Add shared arguments to the parser or argument group.
|
|
24
31
|
|
|
25
32
|
Args:
|
|
26
|
-
|
|
33
|
+
custom_parser_or_group: parser or argument group to add shared arguments to.
|
|
27
34
|
"""
|
|
28
|
-
|
|
35
|
+
custom_parser_or_group.add_argument(
|
|
29
36
|
'--project',
|
|
30
37
|
type=str,
|
|
31
38
|
default=None,
|
|
32
39
|
help='GCE project name, defaults to "gcloud config project."',
|
|
33
40
|
required=required,
|
|
34
41
|
)
|
|
35
|
-
|
|
42
|
+
custom_parser_or_group.add_argument(
|
|
36
43
|
'--zone',
|
|
37
44
|
type=str,
|
|
38
45
|
default=None,
|
|
@@ -43,7 +50,7 @@ def add_shared_arguments(
|
|
|
43
50
|
),
|
|
44
51
|
required=required,
|
|
45
52
|
)
|
|
46
|
-
|
|
53
|
+
custom_parser_or_group.add_argument(
|
|
47
54
|
'--dry-run',
|
|
48
55
|
type=bool,
|
|
49
56
|
action=argparse.BooleanOptionalAction,
|
|
@@ -58,14 +65,14 @@ def add_shared_arguments(
|
|
|
58
65
|
|
|
59
66
|
|
|
60
67
|
def add_cluster_arguments(
|
|
61
|
-
|
|
68
|
+
custom_parser_or_group: ParserOrArgumentGroup, required=False
|
|
62
69
|
) -> None:
|
|
63
|
-
"""Add cluster argument to the parser.
|
|
70
|
+
"""Add cluster argument to the parser or argument group.
|
|
64
71
|
|
|
65
72
|
Args:
|
|
66
|
-
|
|
73
|
+
custom_parser_or_group: parser or argument group to add shared arguments to.
|
|
67
74
|
"""
|
|
68
|
-
|
|
75
|
+
custom_parser_or_group.add_argument(
|
|
69
76
|
'--cluster',
|
|
70
77
|
type=str,
|
|
71
78
|
default=None,
|
|
@@ -74,13 +81,15 @@ def add_cluster_arguments(
|
|
|
74
81
|
)
|
|
75
82
|
|
|
76
83
|
|
|
77
|
-
def add_kind_cluster_arguments(
|
|
78
|
-
|
|
84
|
+
def add_kind_cluster_arguments(
|
|
85
|
+
custom_parser_or_group: ParserOrArgumentGroup,
|
|
86
|
+
) -> None:
|
|
87
|
+
"""Add kind cluster arguments to the parser or argument group.
|
|
79
88
|
|
|
80
89
|
Args:
|
|
81
|
-
|
|
90
|
+
custom_parser_or_group: parser or argument group to add shared arguments to.
|
|
82
91
|
"""
|
|
83
|
-
|
|
92
|
+
custom_parser_or_group.add_argument(
|
|
84
93
|
'--kind-cluster',
|
|
85
94
|
type=bool,
|
|
86
95
|
action=argparse.BooleanOptionalAction,
|
|
@@ -89,13 +98,13 @@ def add_kind_cluster_arguments(custom_parser: argparse.ArgumentParser) -> None:
|
|
|
89
98
|
)
|
|
90
99
|
|
|
91
100
|
|
|
92
|
-
def add_global_arguments(
|
|
101
|
+
def add_global_arguments(custom_parser_or_group: ParserOrArgumentGroup):
|
|
93
102
|
"""Add global - no cloud dependent - arguments to the parser.
|
|
94
103
|
|
|
95
104
|
Args:
|
|
96
|
-
|
|
105
|
+
custom_parser_or_group: parser or argument group to add global arguments to.
|
|
97
106
|
"""
|
|
98
|
-
|
|
107
|
+
custom_parser_or_group.add_argument(
|
|
99
108
|
'--dry-run',
|
|
100
109
|
type=bool,
|
|
101
110
|
action=argparse.BooleanOptionalAction,
|
|
@@ -108,20 +117,20 @@ def add_global_arguments(custom_parser: argparse.ArgumentParser):
|
|
|
108
117
|
)
|
|
109
118
|
|
|
110
119
|
|
|
111
|
-
def add_slurm_arguments(
|
|
120
|
+
def add_slurm_arguments(custom_parser_or_group: ParserOrArgumentGroup):
|
|
112
121
|
"""Add Slurm job arguments to the parser.
|
|
113
122
|
|
|
114
123
|
Args:
|
|
115
|
-
|
|
124
|
+
custom_parser_or_group: parser or argument group to add global arguments to.
|
|
116
125
|
"""
|
|
117
|
-
|
|
126
|
+
custom_parser_or_group.add_argument(
|
|
118
127
|
'--ignore-unknown-flags',
|
|
119
128
|
type=bool,
|
|
120
129
|
action=argparse.BooleanOptionalAction,
|
|
121
130
|
default=False,
|
|
122
131
|
help='Ignore all the unsupported flags in the bash script.',
|
|
123
132
|
)
|
|
124
|
-
|
|
133
|
+
custom_parser_or_group.add_argument(
|
|
125
134
|
'-a',
|
|
126
135
|
'--array',
|
|
127
136
|
type=str,
|
|
@@ -137,32 +146,32 @@ def add_slurm_arguments(custom_parser: argparse.ArgumentParser):
|
|
|
137
146
|
' 0. The maximum index value is 2147483647.'
|
|
138
147
|
),
|
|
139
148
|
)
|
|
140
|
-
|
|
149
|
+
custom_parser_or_group.add_argument(
|
|
141
150
|
'-c',
|
|
142
151
|
'--cpus-per-task',
|
|
143
152
|
type=str,
|
|
144
153
|
default=None,
|
|
145
154
|
help='How much cpus a container inside a pod requires.',
|
|
146
155
|
)
|
|
147
|
-
|
|
156
|
+
custom_parser_or_group.add_argument(
|
|
148
157
|
'--gpus-per-task',
|
|
149
158
|
type=str,
|
|
150
159
|
default=None,
|
|
151
160
|
help='How much gpus a container inside a pod requires.',
|
|
152
161
|
)
|
|
153
|
-
|
|
162
|
+
custom_parser_or_group.add_argument(
|
|
154
163
|
'--mem',
|
|
155
164
|
type=str,
|
|
156
165
|
default=None,
|
|
157
166
|
help='How much memory a pod requires.',
|
|
158
167
|
)
|
|
159
|
-
|
|
168
|
+
custom_parser_or_group.add_argument(
|
|
160
169
|
'--mem-per-task',
|
|
161
170
|
type=str,
|
|
162
171
|
default=None,
|
|
163
172
|
help='How much memory a container requires.',
|
|
164
173
|
)
|
|
165
|
-
|
|
174
|
+
custom_parser_or_group.add_argument(
|
|
166
175
|
'--mem-per-cpu',
|
|
167
176
|
type=str,
|
|
168
177
|
default=None,
|
|
@@ -171,7 +180,7 @@ def add_slurm_arguments(custom_parser: argparse.ArgumentParser):
|
|
|
171
180
|
'of requested cpus per task by mem-per-cpu.'
|
|
172
181
|
),
|
|
173
182
|
)
|
|
174
|
-
|
|
183
|
+
custom_parser_or_group.add_argument(
|
|
175
184
|
'--mem-per-gpu',
|
|
176
185
|
type=str,
|
|
177
186
|
default=None,
|
|
@@ -180,21 +189,21 @@ def add_slurm_arguments(custom_parser: argparse.ArgumentParser):
|
|
|
180
189
|
'of requested gpus per task by mem-per-gpu.'
|
|
181
190
|
),
|
|
182
191
|
)
|
|
183
|
-
|
|
192
|
+
custom_parser_or_group.add_argument(
|
|
184
193
|
'-N',
|
|
185
194
|
'--nodes',
|
|
186
195
|
type=int,
|
|
187
196
|
default=None,
|
|
188
197
|
help='Number of pods to be used at a time.',
|
|
189
198
|
)
|
|
190
|
-
|
|
199
|
+
custom_parser_or_group.add_argument(
|
|
191
200
|
'-n',
|
|
192
201
|
'--ntasks',
|
|
193
202
|
type=int,
|
|
194
203
|
default=None,
|
|
195
204
|
help='Number of identical containers inside of a pod, usually 1.',
|
|
196
205
|
)
|
|
197
|
-
|
|
206
|
+
custom_parser_or_group.add_argument(
|
|
198
207
|
'-o',
|
|
199
208
|
'--output',
|
|
200
209
|
type=str,
|
|
@@ -204,7 +213,7 @@ def add_slurm_arguments(custom_parser: argparse.ArgumentParser):
|
|
|
204
213
|
' passed it proceeds to stdout, and is available via kubectl logs.'
|
|
205
214
|
),
|
|
206
215
|
)
|
|
207
|
-
|
|
216
|
+
custom_parser_or_group.add_argument(
|
|
208
217
|
'-e',
|
|
209
218
|
'--error',
|
|
210
219
|
type=str,
|
|
@@ -214,27 +223,27 @@ def add_slurm_arguments(custom_parser: argparse.ArgumentParser):
|
|
|
214
223
|
' proceeds to stdout, and is available via kubectl logs.'
|
|
215
224
|
),
|
|
216
225
|
)
|
|
217
|
-
|
|
226
|
+
custom_parser_or_group.add_argument(
|
|
218
227
|
'--input',
|
|
219
228
|
type=str,
|
|
220
229
|
default=None,
|
|
221
230
|
help='What to pipe into the script.',
|
|
222
231
|
)
|
|
223
|
-
|
|
232
|
+
custom_parser_or_group.add_argument(
|
|
224
233
|
'-J',
|
|
225
234
|
'--job-name',
|
|
226
235
|
type=str,
|
|
227
236
|
default=None,
|
|
228
237
|
help='What is the job name.',
|
|
229
238
|
)
|
|
230
|
-
|
|
239
|
+
custom_parser_or_group.add_argument(
|
|
231
240
|
'-D',
|
|
232
241
|
'--chdir',
|
|
233
242
|
type=str,
|
|
234
243
|
default=None,
|
|
235
244
|
help='Change directory before executing the script.',
|
|
236
245
|
)
|
|
237
|
-
|
|
246
|
+
custom_parser_or_group.add_argument(
|
|
238
247
|
'-t',
|
|
239
248
|
'--time',
|
|
240
249
|
type=str,
|
|
@@ -247,7 +256,7 @@ def add_slurm_arguments(custom_parser: argparse.ArgumentParser):
|
|
|
247
256
|
'and "days-hours:minutes:seconds".'
|
|
248
257
|
),
|
|
249
258
|
)
|
|
250
|
-
|
|
259
|
+
custom_parser_or_group.add_argument(
|
|
251
260
|
'--priority',
|
|
252
261
|
type=str,
|
|
253
262
|
default='medium',
|