xpk 0.10.0__tar.gz → 0.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.10.0/src/xpk.egg-info → xpk-0.11.0}/PKG-INFO +2 -2
- {xpk-0.10.0 → xpk-0.11.0}/pyproject.toml +1 -1
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/cluster.py +263 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/capacity.py +2 -2
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/config.py +1 -1
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/jobset.py +1 -1
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/kueue.py +8 -15
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/nap.py +16 -2
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/nodepool.py +52 -19
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/scheduling.py +1 -1
- xpk-0.11.0/src/xpk/core/system_characteristics.py +627 -0
- {xpk-0.10.0 → xpk-0.11.0/src/xpk.egg-info}/PKG-INFO +2 -2
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk.egg-info/requires.txt +1 -1
- xpk-0.10.0/src/xpk/core/system_characteristics.py +0 -1441
- {xpk-0.10.0 → xpk-0.11.0}/LICENSE +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/README.md +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/setup.cfg +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/__init__.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/api/__init__.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/batch.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/cluster_gcluster.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/common.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/config.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/info.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/inspector.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/job.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/kind.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/kjob_common.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/run.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/shell.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/storage.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/version.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/commands/workload.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/__init__.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/cluster.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/cluster_private.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/commands.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/docker_image.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/docker_manager.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/docker_resources.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/filestore.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/gcsfuse.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/kjob.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/monitoring.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/mtc.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/network.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/pathways.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/ray.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/resources.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/storage.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/vertex.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/workload.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/main.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/batch.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/cluster.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/common.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/config.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/core.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/info.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/job.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/kind.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/run.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/shell.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/storage.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/validators.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/version.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/parser/workload.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/utils/console.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/utils/file.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/utils/kubectl.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/utils/network.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/utils/objects.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/utils/templates.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/utils/validation.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk.egg-info/SOURCES.txt +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.10.0 → xpk-0.11.0}/src/xpk.egg-info/top_level.txt +0 -0

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 0.10.0
+Version: 0.11.0
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0

@@ -21,7 +21,7 @@ Requires-Dist: google-cloud==0.34.0
 Requires-Dist: google-api-core==2.24.1
 Requires-Dist: packaging==24.2
 Requires-Dist: google-cloud-filestore==1.12.0
-Requires-Dist: google-cloud-storage
+Requires-Dist: google-cloud-storage
 Provides-Extra: dev
 Requires-Dist: pyink==24.3.0; extra == "dev"
 Requires-Dist: pylint>=2.6.0; extra == "dev"

@@ -78,6 +78,8 @@ from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from . import cluster_gcluster
 from .common import set_cluster_command
+import shutil
+import os
 
 
 def cluster_adapt(args) -> None:

@@ -247,6 +249,10 @@ def cluster_create(args) -> None:
 
   get_cluster_credentials(args)
 
+  update_coredns_command_code = update_coredns_if_necessary(args)
+  if update_coredns_command_code != 0:
+    xpk_exit(update_coredns_command_code)
+
   k8s_client = setup_k8s_env(args)
 
   install_storage_crd(k8s_client)

@@ -702,6 +708,262 @@ def cluster_create_ray_cluster(args) -> None:
   cluster_create(args)
 
 
+def install_jq(args):
+  """Installs 'jq' utility."""
+  if shutil.which('jq'):
+    xpk_print("Task: 'Install jq' skipped, jq already installed.")
+    return
+  command_jq_install = 'sudo apt install jq -y'
+  xpk_print("Task: 'Install jq' in progress.")
+  return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+  if return_code != 0:
+    xpk_print(f'Install jq error {return_code}')
+    xpk_exit(return_code)
+
+
+def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+  """Clones the CoreDNS deployment repository if it doesn't exist."""
+  if os.path.exists(coredns_repo_full_path):
+    xpk_print(
+        f"Directory '{coredns_repo_full_path}' already exists, skip git clone."
+    )
+    return
+  command_git_clone = (
+      'git clone https://github.com/coredns/deployment.git'
+      f' {coredns_repo_full_path}'
+  )
+  xpk_print(
+      "Task: 'Clone deployment' in progress, Target"
+      f' directory:{coredns_repo_full_path}.'
+  )
+  return_code = run_command_with_updates(
+      command_git_clone, 'Clone deployment', args
+  )
+  if return_code != 0:
+    xpk_print(f'Clone deployment error {return_code}')
+    xpk_exit(return_code)
+
+
+def deploy_coredns_manifests(args, coredns_k8s_path: str):
+  """Deploys CoreDNS manifests to the cluster."""
+  if not os.path.isdir(coredns_k8s_path):
+    xpk_print(
+        f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist."
+        ' Has git clone been successful?'
+    )
+    xpk_exit(1)
+  original_cwd = os.getcwd()
+  try:
+    os.chdir(coredns_k8s_path)
+    xpk_print(f'Current working directory changed to: {os.getcwd()}')
+
+    command_deploy_coredns = './deploy.sh | kubectl apply -f -'
+    xpk_print(
+        f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
+    )
+    return_code = run_command_with_updates(
+        command_deploy_coredns, 'Deploy CoreDNS', args
+    )
+    if return_code != 0:
+      xpk_print(f'Deploy CoreDNS error {return_code}')
+
+  finally:
+    xpk_print(f'Restoring working directory to: {original_cwd}')
+    os.chdir(original_cwd)
+  if return_code != 0:
+    xpk_exit(return_code)
+
+
+def scale_down_deployment(
+    args, deployment_name: str, namespace: str = 'kube-system'
+):
+  """Scales down a specified Kubernetes deployment to 0 replicas."""
+  command = (
+      f'kubectl scale deployment {deployment_name} --replicas=0'
+      f' --namespace={namespace}'
+  )
+  xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
+  return_code = run_command_with_updates(
+      command, f'Scale down {deployment_name}', args
+  )
+  if return_code != 0:
+    xpk_print(f'Scale down {deployment_name} error {return_code}')
+    xpk_exit(return_code)
+  xpk_print(f'\n{deployment_name} has been scaled down.')
+
+
+def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+  """Scales up the CoreDNS deployment to a specified number of replicas."""
+  command_coredns_scale = (
+      f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
+  )
+  xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
+  return_code = run_command_with_updates(
+      command_coredns_scale, 'Scale CoreDNS', args
+  )
+  if return_code != 0:
+    xpk_print(f'Scale CoreDNS error {return_code}')
+    xpk_exit(return_code)
+
+
+def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+  """Check for the existence of a specific Deployment in a given namespace."""
+  command = (
+      f'kubectl get deployment {deployment_name} -n'
+      f' {namespace} --ignore-not-found'
+  )
+  result = run_command_with_updates(
+      command, 'Waiting for kubeDNS to be checked.', args
+  )
+  return result
+
+
+def verify_coredns_readiness(
+    args, timeout: int = 120, namespace: str = 'kube-system'
+):
+  """Verifies CoreDNS readiness using kubectl wait commands."""
+  xpk_print('Now verifying CoreDNS readiness...')
+  kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+  if kube_dns_exists:
+    # Wait for kube-dns to be fully scaled down
+    command_kube_dns_wait_scaled_down = (
+        'kubectl wait deployment/kube-dns'
+        " --for=jsonpath='{.status.replicas}'=0"
+        f' --namespace={namespace} --timeout={timeout}s'
+    )
+    xpk_print('Verifying if kube-dns has scaled down...')
+    return_code_kube_dns = run_command_with_updates(
+        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+    )
+    if return_code_kube_dns != 0:
+      xpk_print('kube-dns did not scale down successfully within the timeout.')
+      xpk_exit(1)  # Exit if kube-dns cannot scale down
+    else:
+      xpk_print('kube-dns has successfully scaled down.')
+  else:
+    xpk_print('kube-dns deployment not found.')
+  # Wait for CoreDNS to be fully scaled up and available
+  command_coredns_wait_available = (
+      'kubectl wait deployment/coredns --for=condition=Available=true'
+      f' --namespace={namespace} --timeout={timeout}s'
+  )
+  xpk_print('Verifying if CoreDNS is available...')
+  return_code_coredns = run_command_with_updates(
+      command_coredns_wait_available, 'Wait for coredns available', args
+  )
+  if return_code_coredns != 0:
+    xpk_print(
+        'CoreDNS verification failed, it might not have fully started within'
+        ' the timeout.'
+    )
+    xpk_exit(1)  # Exit if coredns cannot become available
+
+  xpk_print('CoreDNS has successfully started and passed verification.')
+
+
+def cleanup_coredns_repo(coredns_repo_full_path: str):
+  """Deletes the cloned CoreDNS deployment directory."""
+  xpk_print(
+      "Task: 'Deleting CoreDNS deployment directory' in progress:"
+      f' {coredns_repo_full_path}'
+  )
+  try:
+    shutil.rmtree(coredns_repo_full_path)
+    xpk_print(f'Successfully deleted directory: {coredns_repo_full_path}')
+  except OSError as e:
+    xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
+
+
+def update_coredns(args):
+  """Updates and deploys CoreDNS within a cluster.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  coredns_repo_dir = os.path.expanduser('/tmp/')
+  coredns_repo_dir_name = 'deployment'
+  coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
+  coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
+  # 1. Install jq
+  install_jq(args)
+
+  # 2. Clone CoreDNS deployment repository
+  clone_coredns_deployment_repo(args, coredns_repo_full_path)
+
+  # 3. Deploy CoreDNS to the cluster
+  deploy_coredns_manifests(args, coredns_k8s_path)
+
+  # 4. Scale down kube-dns-autoscaler
+  scale_down_deployment(args, 'kube-dns-autoscaler')
+
+  # 5. Scale down kube-dns
+  scale_down_deployment(args, 'kube-dns')
+
+  # 6. Scale up coredns and verify readiness
+  scale_up_coredns(args, replicas=15)
+  verify_coredns_readiness(args, timeout=120)
+
+  xpk_print('The CoreDNS setup process has been completed.')
+
+  # 7. Cleanup
+  cleanup_coredns_repo(coredns_repo_full_path)
+
+  return 0
+
+
+def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+  """Checks if the CoreDNS deployment exists in the given namespace.
+
+  Args:
+    namespace: The Kubernetes namespace to check for the CoreDNS deployment.
+
+  Returns:
+    True if the 'coredns' deployment exists, False otherwise.
+  """
+  command = f'kubectl get deployment coredns -n {namespace}'
+  xpk_print(
+      "Task: 'Checking CoreDNS deployment existence' in progress for"
+      f' namespace: {namespace}'
+  )
+  return_code = run_command_with_updates(
+      command, f'Check CoreDNS deployment in {namespace}', args
+  )
+  if return_code == 0:
+    verify_coredns_readiness(args)
+    xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
+    return True
+  else:
+    xpk_print(
+        f"CoreDNS deployment 'coredns' NOT found in namespace '{namespace}' or"
+        ' an error occurred.'
+    )
+    return False
+
+
+def update_coredns_if_necessary(args) -> int:
+  """Updates and deploys CoreDNS within the cluster if it's not already present.
+
+  This function checks for the existence of the CoreDNS deployment.
+  If it's not found, it proceeds to deploy and configure CoreDNS.
+
+  Args:
+    args: User-provided arguments for running the command.
+
+  Returns:
+    0 if successful (CoreDNS was already present or successfully deployed),
+    and 1 otherwise.
+  """
+  if coredns_deployment_exists(args, namespace='kube-system'):
+    xpk_print('Skipping CoreDNS deployment since it already exists.')
+    return 0
+  else:
+    xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
+    return update_coredns(args)
+
+
 def create_cluster_if_necessary(
     args, gke_control_plane_version: str, system: SystemCharacteristics
 ) -> int:

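The helpers above drive the whole CoreDNS migration through `kubectl` subprocesses (`kubectl scale`, `kubectl wait`). As an aside, the same readiness condition can be checked in-process; the following is a minimal sketch using the official `kubernetes` Python client, not part of xpk, with the hypothetical function name `coredns_is_ready`:

```python
from kubernetes import client, config


def coredns_is_ready(namespace: str = 'kube-system') -> bool:
  """Reports whether the coredns Deployment has all replicas available."""
  config.load_kube_config()  # reads the kubeconfig written by get_cluster_credentials
  apps = client.AppsV1Api()
  try:
    deployment = apps.read_namespaced_deployment('coredns', namespace)
  except client.ApiException as e:
    if e.status == 404:
      return False  # deployment not created yet
    raise
  status = deployment.status
  return (
      status.available_replicas is not None
      and status.available_replicas == deployment.spec.replicas
  )
```

Shelling out to `kubectl wait`, as the release does, avoids a new dependency and matches what an operator would run by hand, at the cost of requiring `kubectl` on the PATH.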
@@ -842,6 +1104,7 @@ def run_gke_cluster_create_command(
       f' {args.custom_cluster_arguments}'
       f' {rapid_release_cmd}'
       ' --enable-dns-access'
+      ' --autoscaling-profile=optimize-utilization'
   )
 
   enable_ip_alias = False

@@ -232,9 +232,9 @@ def get_capacity_node_selectors_from_capacity_type(
     case CapacityType.ON_DEMAND.name:
       node_selector = ''
     case CapacityType.FLEX_START.name:
-      node_selector = 'cloud.google.com/gke-queued
+      node_selector = 'cloud.google.com/gke-queued: "true"'
     case CapacityType.SPOT.name:
-      node_selector = 'cloud.google.com/gke-spot
+      node_selector = 'cloud.google.com/gke-spot: "true"'
     case CapacityType.RESERVATION.name:
       node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
     case _:

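The fixed selectors are complete one-line YAML mappings, matching the `cloud.google.com/reservation-name: {args.reservation}` case in the same `match` block. A small sketch, assuming PyYAML is available, of what the new strings parse to:

```python
import yaml

# Each selector line is a one-entry YAML mapping: label key -> string value.
assert yaml.safe_load('cloud.google.com/gke-spot: "true"') == {
    'cloud.google.com/gke-spot': 'true'
}
assert yaml.safe_load('cloud.google.com/gke-queued: "true"') == {
    'cloud.google.com/gke-queued': 'true'
}
```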
@@ -22,7 +22,7 @@ from ..utils import file
 from ..utils.console import xpk_print
 
 # This is the version for XPK PyPI package
-__version__ = 'v0.10.0'
+__version__ = 'v0.11.0'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
 

@@ -220,7 +220,7 @@ spec:
         - --zap-log-level=2
         command:
         - /manager
-        image: registry.k8s.io/kueue/kueue:
+        image: registry.k8s.io/kueue/kueue:{KUEUE_VERSION}
         imagePullPolicy: Always
         livenessProbe:
           httpGet:

@@ -244,36 +244,29 @@ spec:
           periodSeconds: 10
         resources:
           limits:
-            cpu:
+            cpu: 1000m
             memory: {memory_limit_size}
           requests:
-            cpu:
+            cpu: 1000m
             memory: 512Mi
         securityContext:
           allowPrivilegeEscalation: false
         volumeMounts:
+        - mountPath: /visibility
+          name: visibility
         - mountPath: /tmp/k8s-webhook-server/serving-certs
           name: cert
           readOnly: true
         - mountPath: /controller_manager_config.yaml
           name: manager-config
           subPath: controller_manager_config.yaml
-      - args:
-        - --secure-listen-address=0.0.0.0:8443
-        - --upstream=http://127.0.0.1:8080/
-        - --logtostderr=true
-        - --v=10
-        image: registry.k8s.io/kubebuilder/kube-rbac-proxy:v0.16.0
-        name: kube-rbac-proxy
-        ports:
-        - containerPort: 8443
-          name: https
-          protocol: TCP
       securityContext:
         runAsNonRoot: true
       serviceAccountName: kueue-controller-manager
       terminationGracePeriodSeconds: 10
       volumes:
+      - name: visibility
+        emptyDir: {{}}
       - name: cert
         secret:
           defaultMode: 420

@@ -536,7 +529,7 @@ def update_kueue_resources_if_necessary(args):
       f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi'
   )
   yml_string = kueue_controller_manager_yml.format(
-      memory_limit_size=new_memory_limit,
+      memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION
   )
   tmp = write_tmp_file(yml_string)
   command = f'kubectl apply -f {str(tmp.file.name)}'

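This hunk threads `KUEUE_VERSION` through the same `str.format` call that already fills `memory_limit_size`, which is also why the manifest hunk above writes `emptyDir: {{}}`: `str.format` treats single braces as placeholders, so literal braces in the YAML template must be doubled. A minimal sketch of that mechanism, with made-up values:

```python
# Named fields are substituted; doubled braces survive as a literal `{}`.
template = """
image: registry.k8s.io/kueue/kueue:{KUEUE_VERSION}
resources:
  limits:
    cpu: 1000m
    memory: {memory_limit_size}
volumes:
- name: visibility
  emptyDir: {{}}
"""

rendered = template.format(KUEUE_VERSION='v0.0.0', memory_limit_size='4096Mi')
assert 'kueue:v0.0.0' in rendered
assert 'emptyDir: {}' in rendered  # escaped braces render as a literal pair
```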
@@ -42,6 +42,8 @@ AUTOPROVISIONING_CONFIG_FILE = """
 management:
   autoRepair: true
   autoUpgrade: true
+scopes:
+  - "https://www.googleapis.com/auth/devstorage.read_write"
 autoprovisioningLocations:
 {zones}
 {resource_limits}

@@ -106,6 +108,18 @@ def enable_autoprovisioning_on_cluster(
     xpk_print(f'{task} request returned ERROR {return_code}')
     return autoprovisioning_config, return_code
 
+  command = (
+      'gcloud container clusters update'
+      f' {args.cluster} --project={args.project}'
+      f' --region={zone_to_region(args.zone)}'
+      ' --autoscaling-profile=optimize-utilization'
+  )
+  task = 'Update cluster with autoscaling-profile'
+  return_code = run_command_with_updates(command, task, args)
+  if return_code != 0:
+    xpk_print(f'{task} request returned ERROR {return_code}')
+    return autoprovisioning_config, return_code
+
   # Update created accelerator node pools to support autoprovisioning.
   existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
   if return_code != 0:

@@ -171,11 +185,11 @@ def create_autoprovisioning_config(
   # is not controlled by NAP.
   cpu_limits = """
     minimum: 1
-    maximum:
+    maximum: 1000000
   """
   memory_limits = """
     minimum: 1
-    maximum:
+    maximum: 10000000
   """
 
   # By default, the maximum chips is set to be the current number of resources used

@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+from typing import List
 from ..utils.console import get_user_input, xpk_print
 from .capacity import (
     AUTOPROVISIONING_CONFIG_VALUE,

@@ -32,6 +33,8 @@ from .resources import (
     create_or_update_cluster_configmap,
 )
 from .system_characteristics import AcceleratorType
+from functools import reduce
+from operator import mul
 
 CLOUD_PLATFORM_AUTH_SCOPE_URL = (
     '"https://www.googleapis.com/auth/cloud-platform"'

@@ -88,20 +91,26 @@ def run_gke_node_pool_create_command(
     xpk_print('Parsing capacity arguments failed!')
     return return_code
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  desired_node_pool_count = (
+      1
+      if system.accelerator_type == AcceleratorType['GPU']
+      else args.num_slices
+  )
+  message = (
+      (
+          f'Creating 1 node pool with {args.num_nodes} nodes of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+      if system.accelerator_type == AcceleratorType['GPU']
+      else (
+          f'Creating {args.num_slices} node pool or pools of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+  )
+  xpk_print(message)
+  desired_node_pool_names = get_desired_node_pool_names(
+      existing_node_pool_names, args.cluster, desired_node_pool_count
+  )
 
   node_pools_to_remain = []
   delete_commands = []

@@ -275,20 +284,24 @@ def run_gke_node_pool_create_command(
         f' --host-maintenance-interval={args.host_maintenance_interval}'
         f' {capacity_args}'
         ' --enable-gvnic'
-        f' {args.custom_nodepool_arguments}'
     )
     if system.accelerator_type == AcceleratorType['TPU']:
       command += f' --node-version={gke_node_pool_version}'
+      topology_product = reduce(
+          mul, (int(x) for x in system.topology.split('x')), 1
+      )
       if capacity_type == CapacityType.FLEX_START:
         command += ' --num-nodes=0'
-
+      elif topology_product > 1:
         command += f' --num-nodes={system.vms_per_slice}'
-      command += ' --placement-type=COMPACT --max-pods-per-node 15'
       command += (
           f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
       )
-
-
+
+      if topology_product > 1:
+        command += ' --placement-type=COMPACT --max-pods-per-node 15'
+        command += f' --tpu-topology={system.topology}'
+      command += f' {args.custom_tpu_nodepool_arguments}'
     elif system.accelerator_type == AcceleratorType['GPU']:
       subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
       if capacity_type == CapacityType.FLEX_START:

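`topology_product` above multiplies the `x`-separated dimensions of a TPU topology string; only multi-host shapes (product greater than 1) receive `--num-nodes={system.vms_per_slice}`, compact placement, and `--tpu-topology`. A quick illustration of the arithmetic, mirroring the expression in the hunk:

```python
from functools import reduce
from operator import mul


def topology_product(topology: str) -> int:
  """Multiplies the 'x'-separated dimensions of a topology string."""
  return reduce(mul, (int(x) for x in topology.split('x')), 1)


assert topology_product('2x2x4') == 16  # multi-host: placement flags are added
assert topology_product('1x1') == 1  # single host: placement flags are skipped
```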
@@ -319,6 +332,8 @@ def run_gke_node_pool_create_command(
     if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
       command += ' --workload-metadata=GKE_METADATA'
 
+    command += f' {args.custom_nodepool_arguments}'
+
     task = f'NodepoolCreate-{node_pool_name}'
     create_commands.append(command)
     create_task_names.append(task)

@@ -594,3 +609,21 @@ def get_nodepool_workload_metadata_mode(
     return 1, None
 
   return 0, nodepool_WI_mode.strip()
+
+
+def get_desired_node_pool_names(
+    existing_node_pool_names: List[str],
+    cluster_name: str,
+    desired_node_pool_count: int,
+) -> List[str]:
+  cluster_node_pools = [
+      np
+      for np in existing_node_pool_names
+      if np.startswith(f'{cluster_name}-np-')
+  ]
+  result = set(cluster_node_pools[:desired_node_pool_count])
+  i = 0
+  while len(result) < desired_node_pool_count:
+    result.add(f'{cluster_name}-np-{i}')
+    i += 1
+  return list(result)

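`get_desired_node_pool_names` keeps up to `desired_node_pool_count` of the existing `<cluster>-np-` pools and pads the set with freshly numbered names; because the result passes through a `set`, the returned order is unspecified. A usage sketch with hypothetical pool names:

```python
existing = ['demo-np-0', 'demo-np-1', 'some-other-pool']

# Two pools already match the 'demo-np-' prefix; one new name is generated.
names = get_desired_node_pool_names(existing, 'demo', 3)
assert sorted(names) == ['demo-np-0', 'demo-np-1', 'demo-np-2']
```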
@@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
   missing_gke_accelerator_type = False
   if not cluster_config_map.get(system.gke_accelerator):
     xpk_print(
-        f'
+        f'GKE Accelerator Type Check: {args.workload} is requesting'
         f' {system.gke_accelerator} but cluster only contains'
         f' {cluster_config_map.keys()}. '
     )