skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250428__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +19 -2
- sky/backends/cloud_vm_ray_backend.py +33 -8
- sky/backends/local_docker_backend.py +1 -2
- sky/cli.py +1 -1
- sky/client/cli.py +1 -1
- sky/clouds/aws.py +12 -6
- sky/clouds/azure.py +3 -0
- sky/clouds/cloud.py +3 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +3 -0
- sky/clouds/fluidstack.py +3 -0
- sky/clouds/gcp.py +7 -0
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +38 -15
- sky/clouds/lambda_cloud.py +1 -0
- sky/clouds/nebius.py +2 -0
- sky/clouds/oci.py +6 -3
- sky/clouds/paperspace.py +2 -0
- sky/clouds/runpod.py +2 -0
- sky/clouds/scp.py +2 -0
- sky/clouds/vast.py +2 -0
- sky/clouds/vsphere.py +2 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/exceptions.py +6 -0
- sky/execution.py +19 -4
- sky/global_user_state.py +1 -0
- sky/provision/common.py +2 -5
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +276 -93
- sky/provision/kubernetes/network.py +1 -1
- sky/provision/kubernetes/utils.py +10 -0
- sky/provision/provisioner.py +6 -0
- sky/serve/replica_managers.py +51 -5
- sky/serve/serve_state.py +41 -0
- sky/serve/service.py +108 -63
- sky/server/requests/executor.py +4 -4
- sky/skylet/constants.py +7 -0
- sky/task.py +1 -1
- sky/templates/kubernetes-ray.yml.j2 +122 -2
- sky/utils/command_runner.py +17 -3
- sky/utils/command_runner.pyi +2 -0
- sky/utils/controller_utils.py +24 -0
- sky/utils/kubernetes/rsync_helper.sh +20 -4
- sky/utils/schemas.py +13 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/RECORD +59 -59
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/WHEEL +1 -1
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → 2f-jlOWR_G5mOwCF4RcZz}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → 2f-jlOWR_G5mOwCF4RcZz}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py
CHANGED
@@ -32,22 +32,53 @@ logger = sky_logging.init_logger(__name__)
  32   32 |  TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
  33   33 |  TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
  34   34 |  TAG_POD_INITIALIZED = 'skypilot-initialized'
       35 | +TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
       36 | +
       37 | +
       38 | +def ray_tag_filter(cluster_name: str) -> Dict[str, str]:
       39 | +    return {TAG_RAY_CLUSTER_NAME: cluster_name}
       40 | +
       41 | +
       42 | +def _is_head(pod) -> bool:
       43 | +    return pod.metadata.labels.get(constants.TAG_RAY_NODE_KIND) == 'head'
  35   44 |
  36   45 |
  37   46 |  def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]:
  38      | -
  39      | -
  40      | -
  41      | -
  42      | -
  43      | -    return
       47 | +    return next((pod_name for pod_name, pod in pods.items() if _is_head(pod)),
       48 | +                None)
       49 | +
       50 | +
       51 | +def _get_pvc_name(cluster_name: str, volume_name: str) -> str:
       52 | +    return f'{cluster_name}-{volume_name}'
       53 | +
  44   54 |
       55 | +def _get_deployment_name(cluster_name: str) -> str:
       56 | +    return f'{cluster_name}-deployment'
  45   57 |
  46      | -
  47      | -
       58 | +
       59 | +def _head_service_selector(cluster_name: str) -> Dict[str, str]:
  48   60 |      return {'component': f'{cluster_name}-head'}
  49   61 |
  50   62 |
       63 | +def is_high_availability_cluster_by_kubectl(
       64 | +        cluster_name: str,
       65 | +        context: Optional[str] = None,
       66 | +        namespace: Optional[str] = None) -> bool:
       67 | +    """Check if a cluster is a high availability controller by calling
       68 | +    `kubectl get deployment`.
       69 | +    """
       70 | +    try:
       71 | +        deployment_list = kubernetes.apps_api(
       72 | +            context).list_namespaced_deployment(
       73 | +                namespace,
       74 | +                label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
       75 | +    except kubernetes.api_exception():
       76 | +        return False
       77 | +    # It is a high availability cluster if there is at least one deployment
       78 | +    # matching the label selector.
       79 | +    return bool(deployment_list.items)
       80 | +
       81 | +
  51   82 |  def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str:
  52   83 |      # Returns a formatted string of resource requirements for a pod.
  53   84 |      resource_requirements = {}
@@ -384,13 +415,11 @@ def _run_function_with_retries(func: Callable,
 384  415 |                                 max_retries: int = _MAX_RETRIES,
 385  416 |                                 retry_delay: int = 5) -> Any:
 386  417 |      """Runs a function with retries on Kubernetes errors.
 387      | -
 388  418 |      Args:
 389  419 |          func: Function to retry
 390  420 |          operation_name: Name of the operation for logging
 391  421 |          max_retries: Maximum number of retry attempts
 392  422 |          retry_delay: Delay between retries in seconds
 393      | -
 394  423 |      Raises:
 395  424 |          The last exception encountered if all retries fail.
 396  425 |      """
@@ -409,30 +438,23 @@ def _run_function_with_retries(func: Callable,
 409  438 |  @timeline.event
 410  439 |  def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
 411  440 |      """Pre-initialization step for SkyPilot pods.
 412      | -
 413  441 |      This step is run in the pod right after it is created and before the
 414  442 |      SkyPilot runtime is setup.
 415      | -
 416  443 |      This step includes three key steps:
 417      | -
 418  444 |      1. Privilege check: Checks if the default user has sufficient privilege
 419  445 |         to set up the kubernetes instance pod.
 420  446 |      2. SSH setup: Sets up SSH for the pod instance.
 421  447 |      3. Environment variable setup to populate k8s env vars in the pod.
 422      | -
 423  448 |      Make sure commands used in these methods are generic and work
 424  449 |      on most base images. E.g., do not use Python, since that may not
 425  450 |      be installed by default.
 426      | -
 427  451 |      If you run any apt commands, be sure to check if the lock is available.
 428  452 |      It is possible the `apt update` run in the pod container args may still
 429  453 |      be running.
 430      | -
 431  454 |      Args:
 432  455 |          namespace (str): Kubernetes namespace.
 433  456 |          context (Optional[str]): Kubernetes context.
 434  457 |          new_nodes (List): List of new pod instances.
 435      | -
 436  458 |      Raises:
 437  459 |          config_lib.KubernetesError: If user privileges are insufficient or
 438  460 |              setup fails.
@@ -647,6 +669,56 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
 647  669 |          raise e
 648  670 |
 649  671 |
      672 | +def _create_persistent_volume_claim(namespace: str, context: Optional[str],
      673 | +                                    pvc_spec: Dict[str, Any]) -> None:
      674 | +    """Creates a persistent volume claim for SkyServe controller."""
      675 | +    try:
      676 | +        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
      677 | +            name=pvc_spec['metadata']['name'], namespace=namespace)
      678 | +        return
      679 | +    except kubernetes.api_exception() as e:
      680 | +        if e.status != 404:  # Not found
      681 | +            raise
      682 | +
      683 | +    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
      684 | +        namespace=namespace, body=pvc_spec)
      685 | +
      686 | +
      687 | +@timeline.event
      688 | +def _wait_for_deployment_pod(context,
      689 | +                             namespace,
      690 | +                             deployment,
      691 | +                             timeout=60) -> List:
      692 | +    label_selector = ','.join([
      693 | +        f'{key}={value}'
      694 | +        for key, value in deployment.spec.selector.match_labels.items()
      695 | +    ])
      696 | +    target_replicas = deployment.spec.replicas
      697 | +    deployment_name = deployment.metadata.name
      698 | +    start_time = time.time()
      699 | +    while time.time() - start_time < timeout:
      700 | +        # Refresh the deployment status
      701 | +        deployment = kubernetes.apps_api(
      702 | +            context).read_namespaced_deployment_status(deployment_name,
      703 | +                                                        namespace)
      704 | +        if (deployment.status and
      705 | +                deployment.status.ready_replicas is not None and
      706 | +                deployment.status.ready_replicas >= target_replicas):
      707 | +            pods = kubernetes.core_api(context).list_namespaced_pod(
      708 | +                namespace, label_selector=label_selector).items
      709 | +            return pods
      710 | +
      711 | +        ready_replicas = (deployment.status.ready_replicas
      712 | +                          if deployment.status is not None else 0)
      713 | +        logger.debug(f'Waiting for deployment {deployment_name!r} to be ready. '
      714 | +                     f'Ready replicas: {ready_replicas}/{target_replicas}')
      715 | +        time.sleep(2)
      716 | +
      717 | +    raise TimeoutError(
      718 | +        f'Timeout: Deployment {deployment_name!r} did not become '
      719 | +        'ready.')
      720 | +
      721 | +
 650  722 |  @timeline.event
 651  723 |  def _create_pods(region: str, cluster_name_on_cloud: str,
 652  724 |                   config: common.ProvisionConfig) -> common.ProvisionRecord:
@@ -655,9 +727,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
 655  727 |      namespace = kubernetes_utils.get_namespace_from_config(provider_config)
 656  728 |      context = kubernetes_utils.get_context_from_config(provider_config)
 657  729 |      pod_spec = copy.deepcopy(config.node_config)
 658      | -
 659      | -
 660      | -
      730 | +
      731 | +    to_create_deployment = 'deployment_spec' in pod_spec
      732 | +    if to_create_deployment:
      733 | +        deployment_spec = pod_spec.pop('deployment_spec')
      734 | +        pvc_spec = pod_spec.pop('pvc_spec')
      735 | +        assert len(pod_spec['spec']['containers']) == 1, (
      736 | +            'Only one container is supported for deployment')
      737 | +
      738 | +    tags = ray_tag_filter(cluster_name_on_cloud)
      739 | +
 661  740 |      pod_spec['metadata']['namespace'] = namespace
 662  741 |      if 'labels' in pod_spec['metadata']:
 663  742 |          pod_spec['metadata']['labels'].update(tags)
@@ -734,16 +813,15 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
 734  813 |      if nvidia_runtime_exists and needs_gpus:
 735  814 |          pod_spec['spec']['runtimeClassName'] = 'nvidia'
 736  815 |
 737      | -    created_pods = {}
 738  816 |      logger.debug(f'run_instances: calling create_namespaced_pod '
 739  817 |                   f'(count={to_start_count}).')
 740  818 |
 741      | -    def
      819 | +    def _create_resource_thread(i: int):
 742  820 |          pod_spec_copy = copy.deepcopy(pod_spec)
 743  821 |          if head_pod_name is None and i == 0:
 744  822 |              # First pod should be head if no head exists
 745  823 |              pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
 746      | -            head_selector =
      824 | +            head_selector = _head_service_selector(cluster_name_on_cloud)
 747  825 |              pod_spec_copy['metadata']['labels'].update(head_selector)
 748  826 |              pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
 749  827 |          else:
@@ -800,19 +878,62 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
 800  878 |                  tpu_toleration
 801  879 |              ]
 802  880 |
      881 | +        if to_create_deployment:
      882 | +            _create_persistent_volume_claim(namespace, context, pvc_spec)
      883 | +
      884 | +            # It's safe to directly modify the template spec in the deployment spec
      885 | +            # because controller pod is singleton, i in [0].
      886 | +            template_pod_spec = deployment_spec['spec']['template']
      887 | +            # Add the deployment name as a label to the pod spec
      888 | +            deployment_name = deployment_spec['metadata']['name']
      889 | +            pod_spec_copy['metadata']['labels'][
      890 | +                TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
      891 | +            template_pod_spec['metadata'] = pod_spec_copy['metadata']
      892 | +            template_pod_spec['spec'].update(pod_spec_copy['spec'])
      893 | +            try:
      894 | +                return kubernetes.apps_api(
      895 | +                    context).create_namespaced_deployment(
      896 | +                        namespace, deployment_spec)
      897 | +            except Exception as e:
      898 | +                print('Deployment failed', e)
      899 | +                raise e
      900 | +
 803  901 |          return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
 804  902 |                                                     context)
 805  903 |
      904 | +    if not to_start_count:
      905 | +        is_provisioned_cluster_ha = is_high_availability_cluster_by_kubectl(
      906 | +            cluster_name_on_cloud, context, namespace)
      907 | +        if is_provisioned_cluster_ha != to_create_deployment:
      908 | +            ha_str = lambda x: 'high availability' if x else 'non-high availability'
      909 | +
      910 | +            message = (
      911 | +                f'The cluster "{cluster_name_on_cloud}" is configured to be '
      912 | +                f'{ha_str(to_create_deployment)} but the cluster has already been '
      913 | +                f'provisioned as {ha_str(is_provisioned_cluster_ha)}. '
      914 | +                'If you want to make the provisioned cluster '
      915 | +                f'{ha_str(to_create_deployment)}, please first down the cluster '
      916 | +                'and then up the cluster again.')
      917 | +            raise exceptions.InconsistentHighAvailabilityError(message)
      918 | +
 806  919 |      # Create pods in parallel
 807      | -
 808      | -
 809      | -
      920 | +    created_resources = subprocess_utils.run_in_parallel(
      921 | +        _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
      922 | +
      923 | +    if to_create_deployment:
      924 | +        deployments = copy.deepcopy(created_resources)
      925 | +        pods = [
      926 | +            pod for deployment in deployments
      927 | +            for pod in _wait_for_deployment_pod(context, namespace, deployment)
      928 | +        ]
      929 | +    else:
      930 | +        # If not creating deployments, 'created_resources' already holds Pod objects
      931 | +        pods = created_resources
 810  932 |
 811      | -
      933 | +    created_pods = {}
 812  934 |      for pod in pods:
 813  935 |          created_pods[pod.metadata.name] = pod
 814      | -        if head_pod_name is None and pod
 815      | -                constants.TAG_RAY_NODE_KIND) == 'head':
      936 | +        if head_pod_name is None and _is_head(pod):
 816  937 |              head_pod_name = pod.metadata.name
 817  938 |
 818  939 |      networking_mode = network_utils.get_networking_mode(
@@ -879,57 +1000,75 @@ def stop_instances(
 879 1000 |      raise NotImplementedError()
 880 1001 |
 881 1002 |
 882      | -def
 883      | -
 884      | -    """
 885      | -    logger.debug('terminate_instances: calling delete_namespaced_pod')
     1003 | +def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
     1004 | +                                    resource_name: str) -> None:
     1005 | +    """Helper to delete Kubernetes resources with 404 handling and retries.
 886 1006 |
 887      | -
 888      | -
 889      | -
 890      | -
 891      | -
 892      | -
 893      | -
 894      | -
 895      | -
 896      | -
 897      | -
 898      | -
 899      | -
 900      | -
 901      | -
 902      | -
 903      | -
     1007 | +    Args:
     1008 | +        delete_func: Function to call to delete the resource
     1009 | +        resource_type: Type of resource being deleted (e.g. 'service'),
     1010 | +            used in logging
     1011 | +        resource_name: Name of the resource being deleted, used in logging
     1012 | +    """
     1013 | +    max_retries = 3
     1014 | +    retry_delay = 5  # seconds
     1015 | +
     1016 | +    for attempt in range(max_retries):
     1017 | +        try:
     1018 | +            delete_func()
     1019 | +            return
     1020 | +        except kubernetes.api_exception() as e:
     1021 | +            if e.status == 404:
     1022 | +                logger.warning(
     1023 | +                    f'terminate_instances: Tried to delete {resource_type} '
     1024 | +                    f'{resource_name}, but the {resource_type} was not '
     1025 | +                    'found (404).')
 904 1026 |                  return
 905      | -
 906      | -
 907      | -
 908      | -
 909      | -
 910      | -
 911      | -
 912      | -
 913      | -
 914      | -
 915      | -
 916      | -
 917      | -
 918      | -
 919      | -
     1027 | +            elif attempt < max_retries - 1:
     1028 | +                logger.warning(f'terminate_instances: Failed to delete '
     1029 | +                               f'{resource_type} {resource_name} (attempt '
     1030 | +                               f'{attempt + 1}/{max_retries}). Error: {e}. '
     1031 | +                               f'Retrying in {retry_delay} seconds...')
     1032 | +                time.sleep(retry_delay)
     1033 | +            else:
     1034 | +                raise
     1035 | +
     1036 | +
     1037 | +def _delete_services(name_prefix: str, namespace: str,
     1038 | +                     context: Optional[str]) -> None:
     1039 | +    """Delete services with the given name prefix.
     1040 | +
     1041 | +    Args:
     1042 | +        name_prefix: Prefix of the service names to delete
     1043 | +        namespace: Kubernetes namespace
     1044 | +        context: Kubernetes context
     1045 | +    """
     1046 | +    # TODO(andy): We should use tag for the service filter.
     1047 | +    for service_name in [name_prefix, f'{name_prefix}-ssh']:
     1048 | +        # Since we are not saving this lambda, it's a false positive.
     1049 | +        # TODO(andyl): Wait for
     1050 | +        # https://github.com/pylint-dev/pylint/issues/5263.
     1051 | +        # pylint: disable=cell-var-from-loop
     1052 | +        _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
     1053 | +            context).delete_namespaced_service(name=service_name,
     1054 | +                                               namespace=namespace,
     1055 | +                                               _request_timeout=config_lib.
     1056 | +                                               DELETION_TIMEOUT),
     1057 | +                                        resource_type='service',
     1058 | +                                        resource_name=service_name)
     1059 | +
     1060 | +
     1061 | +def _terminate_node(namespace: str,
     1062 | +                    context: Optional[str],
     1063 | +                    pod_name: str,
     1064 | +                    is_head: bool = False) -> None:
     1065 | +    """Terminate a pod and its associated services."""
     1066 | +    logger.debug('terminate_instances: calling delete_namespaced_pod')
 920 1067 |
 921 1068 |      if is_head:
 922 1069 |          # Delete services for the head pod
 923 1070 |          # services are specified in sky/templates/kubernetes-ray.yml.j2
 924      | -
 925      | -        _delete_k8s_resource_with_retry(
 926      | -            delete_func=lambda name=service_name: kubernetes.core_api(
 927      | -                context).delete_namespaced_service(
 928      | -                    name=name,
 929      | -                    namespace=namespace,
 930      | -                    _request_timeout=config_lib.DELETION_TIMEOUT),
 931      | -            resource_type='service',
 932      | -            resource_name=service_name)
     1071 | +        _delete_services(pod_name, namespace, context)
 933 1072 |
 934 1073 |      # Note - delete pod after all other resources are deleted.
 935 1074 |      # This is to ensure there are no leftover resources if this down is run
@@ -946,6 +1085,36 @@ def _terminate_node(namespace: str, context: Optional[str], pod_name: str,
 946 1085 |          resource_name=pod_name)
 947 1086 |
 948 1087 |
     1088 | +def _terminate_deployment(cluster_name: str, namespace: str,
     1089 | +                          context: Optional[str]) -> None:
     1090 | +    """Terminate a deployment."""
     1091 | +    # Delete services first
     1092 | +    _delete_services(f'{cluster_name}-head', namespace, context)
     1093 | +
     1094 | +    # Delete deployment
     1095 | +    deployment_name = _get_deployment_name(cluster_name)
     1096 | +    _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.apps_api(
     1097 | +        context).delete_namespaced_deployment(name=deployment_name,
     1098 | +                                              namespace=namespace,
     1099 | +                                              _request_timeout=config_lib.
     1100 | +                                              DELETION_TIMEOUT),
     1101 | +                                    resource_type='deployment',
     1102 | +                                    resource_name=deployment_name)
     1103 | +
     1104 | +    # Delete PVCs
     1105 | +    pvc_name = _get_pvc_name(
     1106 | +        cluster_name,
     1107 | +        kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
     1108 | +    # pylint: disable=cell-var-from-loop
     1109 | +    _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
     1110 | +        context).delete_namespaced_persistent_volume_claim(
     1111 | +            name=pvc_name,
     1112 | +            namespace=namespace,
     1113 | +            _request_timeout=config_lib.DELETION_TIMEOUT),
     1114 | +                                    resource_type='pvc',
     1115 | +                                    resource_name=pvc_name)
     1116 | +
     1117 | +
 949 1118 |  def terminate_instances(
 950 1119 |      cluster_name_on_cloud: str,
 951 1120 |      provider_config: Dict[str, Any],
@@ -954,10 +1123,9 @@ def terminate_instances(
 954 1123 |      """See sky/provision/__init__.py"""
 955 1124 |      namespace = kubernetes_utils.get_namespace_from_config(provider_config)
 956 1125 |      context = kubernetes_utils.get_context_from_config(provider_config)
 957      | -
 958      | -
 959      | -
 960      | -    pods = kubernetes_utils.filter_pods(namespace, context, tag_filters, None)
     1126 | +    pods = kubernetes_utils.filter_pods(namespace, context,
     1127 | +                                        ray_tag_filter(cluster_name_on_cloud),
     1128 | +                                        None)
 961 1129 |
 962 1130 |      # Clean up the SSH jump pod if in use
 963 1131 |      networking_mode = network_utils.get_networking_mode(
@@ -971,8 +1139,12 @@ def terminate_instances(
 971 1139 |          logger.warning('terminate_instances: Error occurred when analyzing '
 972 1140 |                         f'SSH Jump pod: {e}')
 973 1141 |
 974      | -
 975      | -
     1142 | +    if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
     1143 | +                                               namespace):
     1144 | +        # For high availability controllers, terminate the deployment
     1145 | +        logger.debug(f'Terminating deployment {cluster_name_on_cloud}')
     1146 | +        _terminate_deployment(cluster_name_on_cloud, namespace, context)
     1147 | +        return
 976 1148 |
 977 1149 |      def _terminate_pod_thread(pod_info):
 978 1150 |          pod_name, pod = pod_info
@@ -994,12 +1166,9 @@ def get_cluster_info(
 994 1166 |      assert provider_config is not None
 995 1167 |      namespace = kubernetes_utils.get_namespace_from_config(provider_config)
 996 1168 |      context = kubernetes_utils.get_context_from_config(provider_config)
 997      | -    tag_filters = {
 998      | -        TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
 999      | -    }
1000 1169 |
1001      | -    running_pods = kubernetes_utils.filter_pods(
1002      | -
     1170 | +    running_pods = kubernetes_utils.filter_pods(
     1171 | +        namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
1003 1172 |
1004 1173 |      pods: Dict[str, List[common.InstanceInfo]] = {}
1005 1174 |      head_pod_name = None
@@ -1029,7 +1198,7 @@ def get_cluster_info(
1029 1198 |                  tags=pod.metadata.labels,
1030 1199 |              )
1031 1200 |          ]
1032      | -        if pod
     1201 | +        if _is_head(pod):
1033 1202 |              head_pod_name = pod_name
1034 1203 |              head_spec = pod.spec
1035 1204 |              assert head_spec is not None, pod
@@ -1125,11 +1294,25 @@ def get_command_runners(
1125 1294 |          cluster_info.provider_config)
1126 1295 |      context = kubernetes_utils.get_context_from_config(
1127 1296 |          cluster_info.provider_config)
1128      | -
     1297 | +
     1298 | +    runners: List[command_runner.CommandRunner] = []
1129 1299 |      if cluster_info.head_instance_id is not None:
1130      | -
1131      | -
1132      | -
1133      | -
1134      | -
1135      | -
     1300 | +        pod_name = cluster_info.head_instance_id
     1301 | +
     1302 | +        # Try to get deployment name from label first
     1303 | +        head_instance_info = instances[pod_name][0]
     1304 | +        deployment = head_instance_info.tags.get(TAG_SKYPILOT_DEPLOYMENT_NAME)
     1305 | +
     1306 | +        node_list = [((namespace, context), pod_name)]
     1307 | +        head_runner = command_runner.KubernetesCommandRunner(
     1308 | +            node_list[0], deployment=deployment, **credentials)
     1309 | +        runners.append(head_runner)
     1310 | +
     1311 | +    node_list = [((namespace, context), pod_name)
     1312 | +                 for pod_name in instances.keys()
     1313 | +                 if pod_name != cluster_info.head_instance_id]
     1314 | +    runners.extend(
     1315 | +        command_runner.KubernetesCommandRunner.make_runner_list(
     1316 | +            node_list, **credentials))
     1317 | +
     1318 | +    return runners
sky/provision/kubernetes/network.py
CHANGED
@@ -78,7 +78,7 @@ def _open_ports_using_ingress(
  78   78 |          'https://github.com/kubernetes/ingress-nginx/blob/main/docs/deploy/index.md.'  # pylint: disable=line-too-long
  79   79 |      )
  80   80 |
  81      | -    # Prepare service names, ports,
       81 | +    # Prepare service names, ports, for template rendering
  82   82 |      service_details = [
  83   83 |          (f'{cluster_name_on_cloud}--skypilot-svc--{port}', port,
  84   84 |           _PATH_PREFIX.format(
sky/provision/kubernetes/utils.py
CHANGED
@@ -45,6 +45,16 @@ else:
  45   45 |      jinja2 = adaptors_common.LazyImport('jinja2')
  46   46 |      yaml = adaptors_common.LazyImport('yaml')
  47   47 |
       48 | +# Please be careful when changing this.
       49 | +# When mounting, Kubernetes changes the ownership of the parent directory
       50 | +# to root:root.
       51 | +# See https://stackoverflow.com/questions/50818029/mounted-folder-created-as-root-instead-of-current-user-in-docker/50820023#50820023. # pylint: disable=line-too-long
       52 | +HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
       53 | +# Path where the persistent volume for HA controller is mounted.
       54 | +# TODO(andy): Consider using dedicated path like `/var/skypilot`
       55 | +# and store all data that needs to be persisted in future.
       56 | +HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
       57 | +
  48   58 |  # TODO(romilb): Move constants to constants.py
  49   59 |  DEFAULT_NAMESPACE = 'default'
  50   60 |
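
The two constants added above name the persistent volume and its mount point for the HA controller deployment. A minimal sketch of the pod-spec fragment they map onto follows; the claim name is hypothetical, and only the volume name and mount path come from the diff:

    # Illustrative pod-spec fragment (Python dicts, as used for Kubernetes specs).
    volume = {
        'name': 'sky-data',  # HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME
        'persistentVolumeClaim': {'claimName': 'my-controller-sky-data'},  # hypothetical
    }
    volume_mount = {
        'name': 'sky-data',
        'mountPath': '/home/sky',  # HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH
    }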
sky/provision/provisioner.py
CHANGED
@@ -149,6 +149,12 @@ def bulk_provision(
 149  149 |          # Skip the teardown if the cloud config is expired and
 150  150 |          # the provisioner should failover to other clouds.
 151  151 |          raise
      152 | +    except exceptions.InconsistentHighAvailabilityError:
      153 | +        # Skip the teardown if the high availability property in the
      154 | +        # user config is inconsistent with the actual cluster.
      155 | +        # This error is a user error instead of a provisioning failure.
      156 | +        # And there is no possibility to fix it by teardown.
      157 | +        raise
 152  158 |      except Exception:  # pylint: disable=broad-except
 153  159 |          zone_str = 'all zones'
 154  160 |          if zones:
sky/serve/replica_managers.py
CHANGED
@@ -387,11 +387,12 @@ class ReplicaStatusProperty:
 387  387 |  class ReplicaInfo:
 388  388 |      """Replica info for each replica."""
 389  389 |
 390      | -    _VERSION =
      390 | +    _VERSION = 2
 391  391 |
 392  392 |      def __init__(self, replica_id: int, cluster_name: str, replica_port: str,
 393  393 |                   is_spot: bool, location: Optional[spot_placer.Location],
 394      | -                 version: int
      394 | +                 version: int, resources_override: Optional[Dict[str,
      395 | +                                                                  Any]]) -> None:
 395  396 |          self._version = self._VERSION
 396  397 |          self.replica_id: int = replica_id
 397  398 |          self.cluster_name: str = cluster_name
@@ -403,6 +404,7 @@ class ReplicaInfo:
 403  404 |          self.is_spot: bool = is_spot
 404  405 |          self.location: Optional[Dict[str, Optional[str]]] = (
 405  406 |              location.to_pickleable() if location is not None else None)
      407 | +        self.resources_override: Optional[Dict[str, Any]] = resources_override
 406  408 |
 407  409 |      def get_spot_location(self) -> Optional[spot_placer.Location]:
 408  410 |          return spot_placer.Location.from_pickleable(self.location)
@@ -569,6 +571,9 @@ class ReplicaInfo:
 569  571 |          if version < 1:
 570  572 |              self.location = None
 571  573 |
      574 | +        if version < 2:
      575 | +            self.resources_override = None
      576 | +
 572  577 |          self.__dict__.update(state)
 573  578 |
 574  579 |
@@ -650,6 +655,44 @@ class SkyPilotReplicaManager(ReplicaManager):
 650  655 |          threading.Thread(target=self._job_status_fetcher).start()
 651  656 |          threading.Thread(target=self._replica_prober).start()
 652  657 |
      658 | +        self._recover_replica_operations()
      659 | +
      660 | +    def _recover_replica_operations(self):
      661 | +        """Let's see are there something to do for ReplicaManager in a
      662 | +        recovery run"""
      663 | +        assert (not self._launch_process_pool and not self._down_process_pool
      664 | +               ), 'We should not have any running processes in a recovery run'
      665 | +
      666 | +        # There is a FIFO queue with capacity _MAX_NUM_LAUNCH for
      667 | +        # _launch_replica.
      668 | +        # We prioritize PROVISIONING replicas since they were previously
      669 | +        # launched but may have been interrupted and need to be restarted.
      670 | +        # This is why we process PENDING replicas only after PROVISIONING
      671 | +        # replicas.
      672 | +        to_up_replicas = serve_state.get_replicas_at_status(
      673 | +            self._service_name, serve_state.ReplicaStatus.PROVISIONING)
      674 | +        to_up_replicas.extend(
      675 | +            serve_state.get_replicas_at_status(
      676 | +                self._service_name, serve_state.ReplicaStatus.PENDING))
      677 | +
      678 | +        for replica_info in to_up_replicas:
      679 | +            # It should be robust enough for `execution.launch` to handle cases
      680 | +            # where the provisioning is partially done.
      681 | +            # So we mock the original request based on all call sites,
      682 | +            # including SkyServeController._run_autoscaler.
      683 | +            self._launch_replica(
      684 | +                replica_info.replica_id,
      685 | +                resources_override=replica_info.resources_override)
      686 | +
      687 | +        for replica_info in serve_state.get_replicas_at_status(
      688 | +                self._service_name, serve_state.ReplicaStatus.SHUTTING_DOWN):
      689 | +            self._terminate_replica(
      690 | +                replica_info.replica_id,
      691 | +                sync_down_logs=False,
      692 | +                replica_drain_delay_seconds=0,
      693 | +                purge=replica_info.status_property.purged,
      694 | +                is_scale_down=replica_info.status_property.is_scale_down)
      695 | +
 653  696 |      ################################
 654  697 |      # Replica management functions #
 655  698 |      ################################
@@ -705,7 +748,7 @@ class SkyPilotReplicaManager(ReplicaManager):
 705  748 |          replica_port = _get_resources_ports(self._task_yaml_path)
 706  749 |
 707  750 |          info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
 708      | -                           location, self.latest_version)
      751 | +                           location, self.latest_version, resources_override)
 709  752 |          serve_state.add_or_update_replica(self._service_name, replica_id, info)
 710  753 |          # Don't start right now; we will start it later in _refresh_process_pool
 711  754 |          # to avoid too many sky.launch running at the same time.
@@ -884,7 +927,9 @@ class SkyPilotReplicaManager(ReplicaManager):
 884  927 |          the fly. If any of them finished, it will update the status of the
 885  928 |          corresponding replica.
 886  929 |          """
 887      | -
      930 | +        # To avoid `dictionary changed size during iteration` error.
      931 | +        launch_process_pool_snapshot = list(self._launch_process_pool.items())
      932 | +        for replica_id, p in launch_process_pool_snapshot:
 888  933 |              if not p.is_alive():
 889  934 |                  info = serve_state.get_replica_info_from_id(
 890  935 |                      self._service_name, replica_id)
@@ -943,7 +988,8 @@ class SkyPilotReplicaManager(ReplicaManager):
 943  988 |                  self._terminate_replica(replica_id,
 944  989 |                                          sync_down_logs=True,
 945  990 |                                          replica_drain_delay_seconds=0)
 946      | -
      991 | +        down_process_pool_snapshot = list(self._down_process_pool.items())
      992 | +        for replica_id, p in down_process_pool_snapshot:
 947  993 |              if not p.is_alive():
 948  994 |                  logger.info(
 949  995 |                      f'Terminate process for replica {replica_id} finished.')
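
The last two hunks replace direct iteration over the process-pool dicts with iteration over a snapshot, since other threads may add or remove entries while the refresher runs. A standalone sketch of the same pattern, with illustrative names only:

    # Snapshot the items first so concurrent inserts/removals on the dict cannot
    # raise 'dictionary changed size during iteration'.
    process_pool = {}  # e.g. {replica_id: multiprocessing.Process}
    for replica_id, proc in list(process_pool.items()):
        if not proc.is_alive():
            process_pool.pop(replica_id, None)  # safe: the live dict is not being iterated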