xpk 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +270 -8
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +3 -3
- xpk/commands/info.py +12 -12
- xpk/commands/job.py +12 -10
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +1 -1
- xpk/commands/workload.py +12 -6
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +5 -3
- xpk/core/cluster.py +9 -7
- xpk/core/cluster_private.py +5 -1
- xpk/core/commands.py +3 -3
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +1 -1
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +2 -1
- xpk/core/kueue.py +12 -4
- xpk/core/nap.py +20 -6
- xpk/core/nodepool.py +52 -19
- xpk/core/nodepool_test.py +82 -0
- xpk/core/resources.py +1 -7
- xpk/core/scheduling.py +1 -1
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +267 -1081
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +9 -10
- xpk/parser/cluster.py +67 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/METADATA +4 -1
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/RECORD +49 -44
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/WHEEL +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py
CHANGED
@@ -78,6 +78,8 @@ from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from . import cluster_gcluster
 from .common import set_cluster_command
+import shutil
+import os
 
 
 def cluster_adapt(args) -> None:
@@ -90,7 +92,7 @@ def cluster_adapt(args) -> None:
 
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -139,8 +141,6 @@ def cluster_adapt(args) -> None:
   if not tensorboard_config:
     xpk_exit(1)
 
-  # Provision node pools dynamically based on incoming workloads:
-  # Currently autoprovisioning is not supported with Pathways.
   autoprovisioning_config = None
   if args.enable_autoprovisioning:
     xpk_print('Enabling Autoprovisioning')
@@ -199,7 +199,7 @@ def cluster_create(args) -> None:
   """
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -215,13 +215,13 @@ def cluster_create(args) -> None:
     xpk_exit(0)
 
   return_code, gke_server_config = get_gke_server_config(args)
-  if return_code != 0:
+  if return_code != 0 or gke_server_config is None:
     xpk_exit(return_code)
 
   return_code, gke_control_plane_version = get_gke_control_plane_version(
       args, gke_server_config
   )
-  if return_code != 0:
+  if return_code != 0 or gke_control_plane_version is None:
     xpk_exit(return_code)
 
   create_cluster_command_code = create_cluster_if_necessary(
@@ -247,6 +247,10 @@ def cluster_create(args) -> None:
 
   get_cluster_credentials(args)
 
+  update_coredns_command_code = update_coredns_if_necessary(args)
+  if update_coredns_command_code != 0:
+    xpk_exit(update_cluster_command_code)
+
   k8s_client = setup_k8s_env(args)
 
   install_storage_crd(k8s_client)
@@ -288,7 +292,7 @@ def cluster_create(args) -> None:
   # Provision node pools dynamically based on incoming workloads:
   # Currently autoprovisioning is not supported with Pathways.
   autoprovisioning_config = None
-  if
+  if args.enable_autoprovisioning:
     xpk_print('Enabling Autoprovisioning')
     autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
         args, system
@@ -392,7 +396,7 @@ def cluster_cacheimage(args) -> None:
   get_cluster_credentials(args)
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -702,6 +706,263 @@ def cluster_create_ray_cluster(args) -> None:
   cluster_create(args)
 
 
+def install_jq(args):
+  """Installs 'jq' utility."""
+  if shutil.which('jq'):
+    xpk_print("Task: 'Install jq' skipped, jq already installed.")
+    return
+  command_jq_install = 'sudo apt install jq -y'
+  xpk_print("Task: 'Install jq' in progress.")
+  return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+  if return_code != 0:
+    xpk_print(f'Install jq error {return_code}')
+    xpk_exit(return_code)
+
+
+def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+  """Clones the CoreDNS deployment repository if it doesn't exist."""
+  if os.path.exists(coredns_repo_full_path):
+    xpk_print(
+        f"Directory '{coredns_repo_full_path}' already exists, skip git clone."
+    )
+    return
+  command_git_clone = (
+      'git clone https://github.com/coredns/deployment.git'
+      f' {coredns_repo_full_path}'
+  )
+  xpk_print(
+      "Task: 'Clone deployment' in progress, Target"
+      f' directory:{coredns_repo_full_path}.'
+  )
+  return_code = run_command_with_updates(
+      command_git_clone, 'Clone deployment', args
+  )
+  if return_code != 0:
+    xpk_print(f'Clone deployment error {return_code}')
+    xpk_exit(return_code)
+
+
+def deploy_coredns_manifests(args, coredns_k8s_path: str):
+  """Deploys CoreDNS manifests to the cluster."""
+  if not os.path.isdir(coredns_k8s_path):
+    xpk_print(
+        f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist."
+        ' Has git clone been successful?'
+    )
+    xpk_exit(1)
+  original_cwd = os.getcwd()
+  try:
+    os.chdir(coredns_k8s_path)
+    xpk_print(f'Current working directory changed to: {os.getcwd()}')
+
+    command_deploy_coredns = './deploy.sh | kubectl apply -f -'
+    xpk_print(
+        f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
+    )
+    return_code = run_command_with_updates(
+        command_deploy_coredns, 'Deploy CoreDNS', args
+    )
+    if return_code != 0:
+      xpk_print(f'Deploy CoreDNS error {return_code}')
+
+  finally:
+    xpk_print(f'Restoring working directory to: {original_cwd}')
+    os.chdir(original_cwd)
+  if return_code != 0:
+    xpk_exit(return_code)
+
+
+def scale_down_deployment(
+    args, deployment_name: str, namespace: str = 'kube-system'
+):
+  """Scales down a specified Kubernetes deployment to 0 replicas."""
+  command = (
+      f'kubectl scale deployment {deployment_name} --replicas=0'
+      f' --namespace={namespace}'
+  )
+  xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
+  return_code = run_command_with_updates(
+      command, f'Scale down {deployment_name}', args
+  )
+  if return_code != 0:
+    xpk_print(f'Scale down {deployment_name} error {return_code}')
+    xpk_exit(return_code)
+  xpk_print(f'\n{deployment_name} has been scaled down.')
+
+
+def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+  """Scales up the CoreDNS deployment to a specified number of replicas."""
+  command_coredns_scale = (
+      f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
+  )
+  xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
+  return_code = run_command_with_updates(
+      command_coredns_scale, 'Scale CoreDNS', args
+  )
+  if return_code != 0:
+    xpk_print(f'Scale CoreDNS error {return_code}')
+    xpk_exit(return_code)
+
+
+def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+  """Check for the existence of a specific Deployment in a given namespace."""
+  # TODO: rewrite this to be more obvious, check if it is correct
+  command = (
+      f'kubectl get deployment {deployment_name} -n'
+      f' {namespace} --ignore-not-found'
+  )
+  result = run_command_with_updates(
+      command, 'Waiting for kubeDNS to be checked.', args
+  )
+  return result != 0
+
+
+def verify_coredns_readiness(
+    args, timeout: int = 240, namespace: str = 'kube-system'
+):
+  """Verifies CoreDNS readiness using kubectl wait commands."""
+  xpk_print('Now verifying CoreDNS readiness...')
+  kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+  if kube_dns_exists:
+    # Wait for kube-dns to be fully scaled down
+    command_kube_dns_wait_scaled_down = (
+        'kubectl wait deployment/kube-dns'
+        " --for=jsonpath='{.status.replicas}'=0"
+        f' --namespace={namespace} --timeout={timeout}s'
+    )
+    xpk_print('Verifying if kube-dns has scaled down...')
+    return_code_kube_dns = run_command_with_updates(
+        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+    )
+    if return_code_kube_dns != 0:
+      xpk_print('kube-dns did not scale down successfully within the timeout.')
+      xpk_exit(1)  # Exit if kube-dns cannot scale down
+    else:
+      xpk_print('kube-dns has successfully scaled down.')
+  else:
+    xpk_print('kube-dns deployment not found.')
+  # Wait for CoreDNS to be fully scaled up and available
+  command_coredns_wait_available = (
+      'kubectl wait deployment/coredns --for=condition=Available=true'
+      f' --namespace={namespace} --timeout={timeout}s'
+  )
+  xpk_print('Verifying if CoreDNS is available...')
+  return_code_coredns = run_command_with_updates(
+      command_coredns_wait_available, 'Wait for coredns available', args
+  )
+  if return_code_coredns != 0:
+    xpk_print(
+        'CoreDNS verification failed, it might not have fully started within'
+        ' the timeout.'
+    )
+    xpk_exit(1)  # Exit if coredns cannot become available
+
+  xpk_print('CoreDNS has successfully started and passed verification.')
+
+
+def cleanup_coredns_repo(coredns_repo_full_path: str):
+  """Deletes the cloned CoreDNS deployment directory."""
+  xpk_print(
+      "Task: 'Deleting CoreDNS deployment directory' in progress:"
+      f' {coredns_repo_full_path}'
+  )
+  try:
+    shutil.rmtree(coredns_repo_full_path)
+    xpk_print(f'Successfully deleted directory: {coredns_repo_full_path}')
+  except OSError as e:
+    xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
+
+
+def update_coredns(args) -> int:
+  """Updates and deploys CoreDNS within a cluster.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  coredns_repo_dir = os.path.expanduser('/tmp/')
+  coredns_repo_dir_name = 'deployment'
+  coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
+  coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
+  # 1. Install jq
+  install_jq(args)
+
+  # 2. Clone CoreDNS deployment repository
+  clone_coredns_deployment_repo(args, coredns_repo_full_path)
+
+  # 3. Deploy CoreDNS to the cluster
+  deploy_coredns_manifests(args, coredns_k8s_path)
+
+  # 4. Scale down kube-dns-autoscaler
+  scale_down_deployment(args, 'kube-dns-autoscaler')
+
+  # 5. Scale down kube-dns
+  scale_down_deployment(args, 'kube-dns')
+
+  # 6. Scale up coredns and verify readiness
+  scale_up_coredns(args, replicas=15)
+  verify_coredns_readiness(args, timeout=120)
+
+  xpk_print('The CoreDNS setup process has been completed.')
+
+  # 7. Cleanup
+  cleanup_coredns_repo(coredns_repo_full_path)
+
+  return 0
+
+
+def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+  """Checks if the CoreDNS deployment exists in the given namespace.
+
+  Args:
+    namespace: The Kubernetes namespace to check for the CoreDNS deployment.
+
+  Returns:
+    True if the 'coredns' deployment exists, False otherwise.
+  """
+  command = f'kubectl get deployment coredns -n {namespace}'
+  xpk_print(
+      "Task: 'Checking CoreDNS deployment existence' in progress for"
+      f' namespace: {namespace}'
+  )
+  return_code = run_command_with_updates(
+      command, f'Check CoreDNS deployment in {namespace}', args
+  )
+  if return_code == 0:
+    verify_coredns_readiness(args)
+    xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
+    return True
+  else:
+    xpk_print(
+        f"CoreDNS deployment 'coredns' NOT found in namespace '{namespace}' or"
+        ' an error occurred.'
+    )
+    return False
+
+
+def update_coredns_if_necessary(args) -> int:
+  """Updates and deploys CoreDNS within the cluster if it's not already present.
+
+  This function checks for the existence of the CoreDNS deployment.
+  If it's not found, it proceeds to deploy and configure CoreDNS.
+
+  Args:
+    args: User-provided arguments for running the command.
+
+  Returns:
+    0 if successful (CoreDNS was already present or successfully deployed),
+    and 1 otherwise.
+  """
+  if coredns_deployment_exists(args, namespace='kube-system'):
+    xpk_print('Skipping CoreDNS deployment since it already exists.')
+    return 0
+  else:
+    xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
+    return update_coredns(args)
+
+
 def create_cluster_if_necessary(
     args, gke_control_plane_version: str, system: SystemCharacteristics
 ) -> int:
@@ -842,6 +1103,7 @@ def run_gke_cluster_create_command(
       f' {args.custom_cluster_arguments}'
      f' {rapid_release_cmd}'
      ' --enable-dns-access'
+      ' --autoscaling-profile=optimize-utilization'
  )
 
  enable_ip_alias = False
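Two details in the new CoreDNS path are worth flagging. First, the guard added to cluster_create assigns update_coredns_command_code but passes update_cluster_command_code to xpk_exit, which reads like a typo carried into the release. Second, check_deployment_exists returns result != 0, i.e. True when the kubectl command fails, inverting the usual convention that exit code 0 means success; the in-code TODO flags exactly this. A minimal sketch of a more explicit existence check, assuming a run_command_for_value-style helper returning (exit_code, stdout) as imported elsewhere in xpk (an illustration, not the shipped code):

    def deployment_exists_sketch(args, deployment_name: str, namespace: str) -> bool:
      """Hypothetical, clearer variant of check_deployment_exists."""
      command = (
          f'kubectl get deployment {deployment_name}'
          f' -n {namespace} --ignore-not-found -o name'
      )
      code, stdout = run_command_for_value(command, 'Check deployment', args)
      # With --ignore-not-found, kubectl exits 0 whether or not the deployment
      # exists; existence is signalled by non-empty output such as
      # "deployment.apps/kube-dns".
      return code == 0 and stdout.strip() != ''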
xpk/commands/cluster_gcluster.py
CHANGED
xpk/commands/common.py
CHANGED
@@ -50,8 +50,8 @@ def set_cluster_command(args) -> int:
 
 
 def is_TAS_possible(
-    system_characteristics: SystemCharacteristics,
-    capacity_type: CapacityType,
+    system_characteristics: SystemCharacteristics | None,
+    capacity_type: CapacityType | None,
     flex: bool,
 ) -> bool:
   """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
@@ -71,7 +71,7 @@ def is_TAS_possible(
     xpk_print('capacity_type data was not found in configmaps.')
     xpk_exit(1)
 
-  if flex:
+  if not flex:
     return False
 
   if (
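Note the inverted guard: 0.10.1 returned False whenever flex capacity was in play, while 0.12.0 returns False for everything except flex capacity, so Kueue TAS is now only considered for flex. A minimal sketch of the corrected gating, with the remaining machine-type and capacity checks collapsed into a placeholder other_checks_pass:

    def is_tas_possible_sketch(flex: bool, other_checks_pass: bool) -> bool:
      # After the fix, non-flex capacity short-circuits to False and only
      # flex capacity falls through to the remaining checks.
      if not flex:
        return False
      return other_checks_pass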
xpk/commands/info.py
CHANGED
@@ -51,19 +51,19 @@ def info(args: Namespace) -> None:
   cqs = run_kueuectl_list_clusterqueue(args)
   quotas = get_nominal_quotas(cqs)
 
-  if lq:
+  if lq and lqs is not None:
     print_formatted_lqs(lqs, quotas)
 
   if cq:
     print_formatted_cqs(cqs, quotas)
 
 
-def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
+def get_nominal_quotas(cqs: str) -> dict[str, dict[str, str]]:
   """Get quotas from clusterqueues.
   This function retrieves how much of resource in each flavor is assigned to cluster queue.
   It parses flavors of passed cluster queues.
   Args:
-  - cqs - list of cluster queues.
+  - cqs - string containing a list of cluster queues in JSON format.
   Returns:
   - dictionary of cluster queues resources quotas in format:
   {cq_name:{"flavorName:resourceName":quota}}
@@ -75,7 +75,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
     xpk_print(cqs)
     xpk_exit(1)
 
-  quotas = {}
+  quotas: dict[str, dict] = {}
   for cq in cq_list:
     spec = cq['spec']
     cq_name = cq['metadata']['name']
@@ -89,7 +89,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
   return quotas
 
 
-def print_formatted_cqs(cqs: list[dict], nominalQuotas) -> None:
+def print_formatted_cqs(cqs: str, nominalQuotas) -> None:
   try:
     cq_list = json.loads(cqs)['items']
   except ValueError:
@@ -105,7 +105,7 @@ def print_formatted_cqs(cqs: list[dict], nominalQuotas) -> None:
   )
 
 
-def print_formatted_lqs(lqs:
+def print_formatted_lqs(lqs: str, nominalQuotas) -> None:
   try:
     lq_list = json.loads(lqs)['items']
   except ValueError:
@@ -143,18 +143,18 @@ def parse_queue_lists(
 
 
 def get_flavors_resources_reservations(
-    cq_name: str, flavors_res:
+    cq_name: str, flavors_res: dict
 ) -> dict[str, dict[str, str]]:
   """Get usage of flavors resources.
   This function parser flavorsReservation section of clusterQueue of LocalQueue.
   Args:
   - cq_name - name of ClusterQueue to which flavors belong.
-  - flavors_res -
+  - flavors_res - dict of reservations made by flavors
   Returns:
   Dict containing usage of each resource in flavor for each flavor in cluster or local queue.
   Dict format: {cq_name: {{flavor:resource}:reservation}}
   """
-  reservations = {}
+  reservations: dict[str, dict] = {}
   reservations[cq_name] = {}
   for flavor_name, flavor_resources_reservation_list in flavors_res.items():
     for resource in flavor_resources_reservation_list:
@@ -167,15 +167,15 @@ def get_flavors_resources_reservations(
 
 def get_flavors_usage(
     q_entry: dict, res_field: str, flavor_resource_quotas: dict
-) ->
+) -> dict[str, str]:
   """Parse q_entry to retrieve list of each resource usage in flavour.
   Args:
   q_entry - single entry into either LocalQueue or ClusterQueue structured as json
   flavor_resource_quotas - nominalQuota of flavors resource usage for each clusterqueue
   Returns:
-
+  Dict where for each (key, value):
   - key is flavorName:resourceName
-  -
+  - value is string formatted as 'flavorResourceReservation/flavorResourceQuota'
   """
   status = q_entry['status']
   flavors_res = status[res_field]
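The retyped signatures reflect that kueuectl hands these functions a JSON string rather than pre-parsed dicts. A small sketch of the shapes involved; the queue, flavor, and resource names are invented, and the payload follows Kueue's spec.resourceGroups layout:

    import json

    cqs = json.dumps({'items': [{
        'metadata': {'name': 'demo-queue'},
        'spec': {'resourceGroups': [{'flavors': [{
            'name': 'demo-flavor',
            'resources': [{'name': 'google.com/tpu', 'nominalQuota': '8'}],
        }]}]},
    }]})
    # Per the docstring, get_nominal_quotas(cqs) should yield:
    # {'demo-queue': {'demo-flavor:google.com/tpu': '8'}}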
xpk/commands/job.py
CHANGED
@@ -18,6 +18,7 @@ import re
 import sys
 
 from ruamel.yaml import YAML
+from typing import cast
 
 from ..core.commands import run_command_for_value, run_command_with_updates
 from ..core.cluster import get_cluster_credentials
@@ -84,7 +85,7 @@ def job_info(args):
 
 
 def get_profile(job_yaml: dict) -> str:
-  containers = (
+  containers: list[dict] = (
       job_yaml.get('spec', {})
       .get('template', {})
       .get('spec', {})
@@ -96,13 +97,13 @@ def get_profile(job_yaml: dict) -> str:
 
 
 def get_mounts(job_yaml: dict) -> list[dict]:
-  containers = (
+  containers: list[dict] = (
       job_yaml.get('spec', {})
       .get('template', {})
       .get('spec', {})
       .get('containers', [])
   )
-  mounts = next(iter(containers), {}).get('volumeMounts', [])
+  mounts: list[dict] = next(iter(containers), {}).get('volumeMounts', [])
   return mounts
 
 
@@ -112,23 +113,24 @@ def get_kjob_env_vars(job_desc_text: str) -> list[tuple[str, str]]:
   return search_res
 
 
-def get_pods(pods_text: str) -> list[str]:
+def get_pods(pods_text: str) -> list[dict[str, str]]:
   pods_lines = pods_text.strip().split('\n')
-
+  pods_lines_tokenized = [line.split() for line in pods_lines]
   return [
       {
-          'Name':
-          'Status':
+          'Name': tokens[0],
+          'Status': tokens[2],
       }
-      for
+      for tokens in pods_lines_tokenized
   ]
 
 
 def get_script_name(job_yaml: dict) -> str | None:
-  return (
+  return cast(
+      str | None,
       job_yaml.get('metadata', {})
       .get('annotations', {})
-      .get('kjobctl.x-k8s.io/script', '')
+      .get('kjobctl.x-k8s.io/script', ''),
   )
 
 
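For context, the rebuilt get_pods tokenizes each line of `kubectl get pods`-style output and keeps columns 0 (NAME) and 2 (STATUS). A quick illustration with invented pod names, assuming the caller strips the header row before passing the text:

    pods_text = (
        'myjob-0-abcde   1/1   Running     0   5m\n'
        'myjob-1-fghij   0/1   Completed   0   5m'
    )
    print(get_pods(pods_text))
    # [{'Name': 'myjob-0-abcde', 'Status': 'Running'},
    #  {'Name': 'myjob-1-fghij', 'Status': 'Completed'}]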
xpk/commands/kjob_common.py
CHANGED
@@ -33,6 +33,7 @@ from ..core.resources import get_cluster_capacity_type, get_cluster_system_chara
 def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
   gpu_type = get_gpu_type_from_cluster(args)
 
+  annotations: tuple
   if gpu_type == H100_MEGA_DEVICE_TYPE:
     annotations = get_a3mega_pod_template_annotations(args)
   elif gpu_type == H200_DEVICE_TYPE:
@@ -40,7 +41,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
   elif gpu_type == B200_DEVICE_TYPE:
     annotations = get_a4_pod_template_annotations(args)
   else:
-    annotations =
+    annotations = tuple()
 
   flags = [
       f" --pod-template-annotation {annotation} " for annotation in annotations
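The new annotations: tuple declaration plus the tuple() fallback keeps the variable bound on every branch; each entry is then rendered as its own --pod-template-annotation flag. A tiny illustration with placeholder annotation values:

    annotations = ('key1=value1', 'key2=value2')  # placeholders
    flags = [
        f" --pod-template-annotation {annotation} " for annotation in annotations
    ]
    print(''.join(flags))
    #  --pod-template-annotation key1=value1  --pod-template-annotation key2=value2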
xpk/commands/storage.py
CHANGED
@@ -141,7 +141,7 @@ def storage_delete(args: Namespace) -> None:
 
 def storage_attach(args: Namespace) -> None:
   add_zone_and_project(args)
-  manifest = [{}]
+  manifest: list[dict] = [{}]
   if args.type == GCP_FILESTORE_TYPE:
     if args.instance is None:
       args.instance = args.name
xpk/commands/workload.py
CHANGED
@@ -84,6 +84,7 @@ from ..core.system_characteristics import (
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
     check_if_workload_exists,
+    get_jobsets_list_gcp_link,
     get_workload_list,
     wait_for_job_completion,
     zone_to_region,
@@ -226,7 +227,8 @@ spec:
   metadata:
     labels:
       xpk.google.com/workload: {args.workload}
-    annotations:
+    annotations:
+      {annotations}
   spec:
     priorityClassName: {args.priority}
     restartPolicy: Never
@@ -319,7 +321,7 @@ def workload_create(args) -> None:
   xpk_print('Starting workload create', flush=True)
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -345,7 +347,7 @@ def workload_create(args) -> None:
   ):
     xpk_print(
         'Warning: Cluster has been created using XPK version:'
-        f' {
+        f' {cluster_xpk_version} but the XPK version you are'
         f' using to schedule workload is: {XPK_CURRENT_VERSION}. Some features'
         ' might not be available for this cluster. We recommend to'
         ' upgrade/downgrade your XPK version or cluster by running `xpk'
@@ -354,7 +356,7 @@ def workload_create(args) -> None:
 
   debugging_dashboard_id = None
 
-  tensorboard_config = {}
+  tensorboard_config: dict | None = {}
   if VERTEX_TENSORBOARD_FEATURE_FLAG and args.use_vertex_tensorboard:
     tensorboard_config = create_vertex_experiment(args)
     # exit if failed to create Experiment in Vertex AI
@@ -450,8 +452,8 @@ def workload_create(args) -> None:
       - action: FailJobSet
         onJobFailureReasons:
         - PodFailurePolicy"""
-
-  restart_on_exit_codes = ','.join(map(str,
+  restart_on_exit_codes_list = get_restart_exit_codes(args)
+  restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes_list))
   pod_failure_policy = f"""
     podFailurePolicy:
       rules:
@@ -760,4 +762,8 @@ def workload_list(args) -> None:
     xpk_print(f'List Job request returned ERROR {return_code}')
     xpk_exit(return_code)
   xpk_print(f'Workload List Output:\n{return_value}')
+
+  workload_list_gcp_link = get_jobsets_list_gcp_link(project=args.project)
+  xpk_print(f'See your workloads in Cloud Console: {workload_list_gcp_link}')
+
   xpk_exit(0)
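The podFailurePolicy fix completes a previously truncated line by splitting it in two steps: fetch the exit-code list, then comma-join it for the YAML template. With made-up exit codes:

    restart_on_exit_codes_list = [42, 127]  # stand-in for get_restart_exit_codes(args)
    restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes_list))
    print(restart_on_exit_codes)  # 42,127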
xpk/core/blueprint/blueprint_generator.py
CHANGED
@@ -34,7 +34,7 @@ from ..system_characteristics import get_system_characteristics_by_device_type
 from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
 from ..kueue import KUEUE_VERSION
 
-
+yaml_parser = yaml.YAML()
 
 a3high_device_type = H100_DEVICE_TYPE
 a3mega_device_type = H100_MEGA_DEVICE_TYPE
@@ -52,7 +52,7 @@ blueprint_dependencies_dir = {
 }
 
 cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
-cluster_toolkit_version = "v1.
+cluster_toolkit_version = "v1.62.2"
 
 
 class BlueprintGeneratorOutput:
@@ -1019,7 +1019,7 @@ class BlueprintGenerator:
   ) -> str:
     blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
     with open(blueprint_path, "w+", encoding="utf-8") as blueprint_file:
-
+      yaml_parser.dump(xpk_blueprint, blueprint_file)
     return blueprint_path
 
   def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
@@ -1033,7 +1033,7 @@ class BlueprintGenerator:
     ensure_directory_exists(storage_path_with_prefix)
     return storage_path_with_prefix
 
-  def blueprint_exists(self, blueprint_name, prefix: str = ""):
+  def blueprint_exists(self, blueprint_name, prefix: str = "") -> bool:
     blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
     return os.path.exists(blueprint_path)
 
@@ -1061,6 +1061,6 @@ class BlueprintGenerator:
 }
 
 
-
-
-
+yaml_parser.register_class(Blueprint)
+yaml_parser.register_class(DeploymentGroup)
+yaml_parser.register_class(DeploymentModule)
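The restored lines give the module a single shared ruamel.yaml parser and register the blueprint classes on it, so yaml_parser.dump serializes them with their YAML tags. A self-contained sketch of the same ruamel.yaml pattern, using a toy class in place of Blueprint:

    import sys
    from dataclasses import dataclass, field

    import ruamel.yaml as yaml


    @dataclass
    class ToyBlueprint:  # stand-in for xpk's Blueprint
      blueprint_name: str
      vars: dict = field(default_factory=dict)


    parser = yaml.YAML()
    parser.register_class(ToyBlueprint)  # emits/reads a !ToyBlueprint tag
    parser.dump(ToyBlueprint('demo', {'region': 'us-central1'}), sys.stdout)
    # !ToyBlueprint
    # blueprint_name: demo
    # vars:
    #   region: us-central1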