skypilot-nightly 1.0.0.dev20250426__py3-none-any.whl → 1.0.0.dev20250428__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +19 -2
  3. sky/backends/cloud_vm_ray_backend.py +33 -8
  4. sky/backends/local_docker_backend.py +1 -2
  5. sky/cli.py +1 -1
  6. sky/client/cli.py +1 -1
  7. sky/clouds/aws.py +12 -6
  8. sky/clouds/azure.py +3 -0
  9. sky/clouds/cloud.py +3 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +3 -0
  12. sky/clouds/fluidstack.py +3 -0
  13. sky/clouds/gcp.py +7 -0
  14. sky/clouds/ibm.py +2 -0
  15. sky/clouds/kubernetes.py +38 -15
  16. sky/clouds/lambda_cloud.py +1 -0
  17. sky/clouds/nebius.py +2 -0
  18. sky/clouds/oci.py +6 -3
  19. sky/clouds/paperspace.py +2 -0
  20. sky/clouds/runpod.py +2 -0
  21. sky/clouds/scp.py +2 -0
  22. sky/clouds/vast.py +2 -0
  23. sky/clouds/vsphere.py +2 -0
  24. sky/dashboard/out/404.html +1 -1
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/index.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs.html +1 -1
  31. sky/exceptions.py +6 -0
  32. sky/execution.py +19 -4
  33. sky/global_user_state.py +1 -0
  34. sky/provision/common.py +2 -5
  35. sky/provision/instance_setup.py +1 -1
  36. sky/provision/kubernetes/instance.py +280 -94
  37. sky/provision/kubernetes/network.py +1 -1
  38. sky/provision/kubernetes/utils.py +10 -0
  39. sky/provision/provisioner.py +6 -0
  40. sky/serve/replica_managers.py +51 -5
  41. sky/serve/serve_state.py +41 -0
  42. sky/serve/service.py +108 -63
  43. sky/server/requests/executor.py +4 -4
  44. sky/skylet/constants.py +7 -0
  45. sky/task.py +1 -1
  46. sky/templates/kubernetes-ray.yml.j2 +122 -2
  47. sky/utils/command_runner.py +17 -3
  48. sky/utils/command_runner.pyi +2 -0
  49. sky/utils/controller_utils.py +24 -0
  50. sky/utils/kubernetes/rsync_helper.sh +20 -4
  51. sky/utils/schemas.py +13 -0
  52. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/METADATA +1 -1
  53. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/RECORD +59 -59
  54. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/WHEEL +1 -1
  55. /sky/dashboard/out/_next/static/{WO8lTFPfj-lO3_gDGEiN8 → 2f-jlOWR_G5mOwCF4RcZz}/_buildManifest.js +0 -0
  56. /sky/dashboard/out/_next/static/{WO8lTFPfj-lO3_gDGEiN8 → 2f-jlOWR_G5mOwCF4RcZz}/_ssgManifest.js +0 -0
  57. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/top_level.txt +0 -0
@@ -32,22 +32,53 @@ logger = sky_logging.init_logger(__name__)
32
32
  TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
33
33
  TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
34
34
  TAG_POD_INITIALIZED = 'skypilot-initialized'
35
+ TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
36
+
37
+
38
+ def ray_tag_filter(cluster_name: str) -> Dict[str, str]:
39
+ return {TAG_RAY_CLUSTER_NAME: cluster_name}
40
+
41
+
42
+ def _is_head(pod) -> bool:
43
+ return pod.metadata.labels.get(constants.TAG_RAY_NODE_KIND) == 'head'
35
44
 
36
45
 
37
46
  def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]:
38
- head_pod_name = None
39
- for pod_name, pod in pods.items():
40
- if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
41
- head_pod_name = pod_name
42
- break
43
- return head_pod_name
47
+ return next((pod_name for pod_name, pod in pods.items() if _is_head(pod)),
48
+ None)
49
+
50
+
51
+ def _get_pvc_name(cluster_name: str, volume_name: str) -> str:
52
+ return f'{cluster_name}-{volume_name}'
53
+
44
54
 
55
+ def _get_deployment_name(cluster_name: str) -> str:
56
+ return f'{cluster_name}-deployment'
45
57
 
46
- def head_service_selector(cluster_name: str) -> Dict[str, str]:
47
- """Selector for Operator-configured head service."""
58
+
59
+ def _head_service_selector(cluster_name: str) -> Dict[str, str]:
48
60
  return {'component': f'{cluster_name}-head'}
49
61
 
50
62
 
63
+ def is_high_availability_cluster_by_kubectl(
64
+ cluster_name: str,
65
+ context: Optional[str] = None,
66
+ namespace: Optional[str] = None) -> bool:
67
+ """Check if a cluster is a high availability controller by calling
68
+ `kubectl get deployment`.
69
+ """
70
+ try:
71
+ deployment_list = kubernetes.apps_api(
72
+ context).list_namespaced_deployment(
73
+ namespace,
74
+ label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
75
+ except kubernetes.api_exception():
76
+ return False
77
+ # It is a high availability cluster if there is at least one deployment
78
+ # matching the label selector.
79
+ return bool(deployment_list.items)
80
+
81
+
51
82
  def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str:
52
83
  # Returns a formatted string of resource requirements for a pod.
53
84
  resource_requirements = {}
@@ -384,13 +415,11 @@ def _run_function_with_retries(func: Callable,
384
415
  max_retries: int = _MAX_RETRIES,
385
416
  retry_delay: int = 5) -> Any:
386
417
  """Runs a function with retries on Kubernetes errors.
387
-
388
418
  Args:
389
419
  func: Function to retry
390
420
  operation_name: Name of the operation for logging
391
421
  max_retries: Maximum number of retry attempts
392
422
  retry_delay: Delay between retries in seconds
393
-
394
423
  Raises:
395
424
  The last exception encountered if all retries fail.
396
425
  """
@@ -409,30 +438,23 @@ def _run_function_with_retries(func: Callable,
409
438
  @timeline.event
410
439
  def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
411
440
  """Pre-initialization step for SkyPilot pods.
412
-
413
441
  This step is run in the pod right after it is created and before the
414
442
  SkyPilot runtime is setup.
415
-
416
443
  This step includes three key steps:
417
-
418
444
  1. Privilege check: Checks if the default user has sufficient privilege
419
445
  to set up the kubernetes instance pod.
420
446
  2. SSH setup: Sets up SSH for the pod instance.
421
447
  3. Environment variable setup to populate k8s env vars in the pod.
422
-
423
448
  Make sure commands used in these methods are generic and work
424
449
  on most base images. E.g., do not use Python, since that may not
425
450
  be installed by default.
426
-
427
451
  If you run any apt commands, be sure to check if the lock is available.
428
452
  It is possible the `apt update` run in the pod container args may still
429
453
  be running.
430
-
431
454
  Args:
432
455
  namespace (str): Kubernetes namespace.
433
456
  context (Optional[str]): Kubernetes context.
434
457
  new_nodes (List): List of new pod instances.
435
-
436
458
  Raises:
437
459
  config_lib.KubernetesError: If user privileges are insufficient or
438
460
  setup fails.
@@ -647,6 +669,56 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
647
669
  raise e
648
670
 
649
671
 
672
+ def _create_persistent_volume_claim(namespace: str, context: Optional[str],
673
+ pvc_spec: Dict[str, Any]) -> None:
674
+ """Creates a persistent volume claim for SkyServe controller."""
675
+ try:
676
+ kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
677
+ name=pvc_spec['metadata']['name'], namespace=namespace)
678
+ return
679
+ except kubernetes.api_exception() as e:
680
+ if e.status != 404: # Not found
681
+ raise
682
+
683
+ kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
684
+ namespace=namespace, body=pvc_spec)
685
+
686
+
687
+ @timeline.event
688
+ def _wait_for_deployment_pod(context,
689
+ namespace,
690
+ deployment,
691
+ timeout=60) -> List:
692
+ label_selector = ','.join([
693
+ f'{key}={value}'
694
+ for key, value in deployment.spec.selector.match_labels.items()
695
+ ])
696
+ target_replicas = deployment.spec.replicas
697
+ deployment_name = deployment.metadata.name
698
+ start_time = time.time()
699
+ while time.time() - start_time < timeout:
700
+ # Refresh the deployment status
701
+ deployment = kubernetes.apps_api(
702
+ context).read_namespaced_deployment_status(deployment_name,
703
+ namespace)
704
+ if (deployment.status and
705
+ deployment.status.ready_replicas is not None and
706
+ deployment.status.ready_replicas >= target_replicas):
707
+ pods = kubernetes.core_api(context).list_namespaced_pod(
708
+ namespace, label_selector=label_selector).items
709
+ return pods
710
+
711
+ ready_replicas = (deployment.status.ready_replicas
712
+ if deployment.status is not None else 0)
713
+ logger.debug(f'Waiting for deployment {deployment_name!r} to be ready. '
714
+ f'Ready replicas: {ready_replicas}/{target_replicas}')
715
+ time.sleep(2)
716
+
717
+ raise TimeoutError(
718
+ f'Timeout: Deployment {deployment_name!r} did not become '
719
+ 'ready.')
720
+
721
+
650
722
  @timeline.event
651
723
  def _create_pods(region: str, cluster_name_on_cloud: str,
652
724
  config: common.ProvisionConfig) -> common.ProvisionRecord:
@@ -655,9 +727,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
655
727
  namespace = kubernetes_utils.get_namespace_from_config(provider_config)
656
728
  context = kubernetes_utils.get_context_from_config(provider_config)
657
729
  pod_spec = copy.deepcopy(config.node_config)
658
- tags = {
659
- TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
660
- }
730
+
731
+ to_create_deployment = 'deployment_spec' in pod_spec
732
+ if to_create_deployment:
733
+ deployment_spec = pod_spec.pop('deployment_spec')
734
+ pvc_spec = pod_spec.pop('pvc_spec')
735
+ assert len(pod_spec['spec']['containers']) == 1, (
736
+ 'Only one container is supported for deployment')
737
+
738
+ tags = ray_tag_filter(cluster_name_on_cloud)
739
+
661
740
  pod_spec['metadata']['namespace'] = namespace
662
741
  if 'labels' in pod_spec['metadata']:
663
742
  pod_spec['metadata']['labels'].update(tags)
@@ -734,16 +813,15 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
734
813
  if nvidia_runtime_exists and needs_gpus:
735
814
  pod_spec['spec']['runtimeClassName'] = 'nvidia'
736
815
 
737
- created_pods = {}
738
816
  logger.debug(f'run_instances: calling create_namespaced_pod '
739
817
  f'(count={to_start_count}).')
740
818
 
741
- def _create_pod_thread(i: int):
819
+ def _create_resource_thread(i: int):
742
820
  pod_spec_copy = copy.deepcopy(pod_spec)
743
821
  if head_pod_name is None and i == 0:
744
822
  # First pod should be head if no head exists
745
823
  pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
746
- head_selector = head_service_selector(cluster_name_on_cloud)
824
+ head_selector = _head_service_selector(cluster_name_on_cloud)
747
825
  pod_spec_copy['metadata']['labels'].update(head_selector)
748
826
  pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
749
827
  else:
@@ -800,19 +878,62 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
800
878
  tpu_toleration
801
879
  ]
802
880
 
881
+ if to_create_deployment:
882
+ _create_persistent_volume_claim(namespace, context, pvc_spec)
883
+
884
+ # It's safe to directly modify the template spec in the deployment spec
885
+ # because the controller pod is a singleton, i.e. i is always 0.
886
+ template_pod_spec = deployment_spec['spec']['template']
887
+ # Add the deployment name as a label to the pod spec
888
+ deployment_name = deployment_spec['metadata']['name']
889
+ pod_spec_copy['metadata']['labels'][
890
+ TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
891
+ template_pod_spec['metadata'] = pod_spec_copy['metadata']
892
+ template_pod_spec['spec'].update(pod_spec_copy['spec'])
893
+ try:
894
+ return kubernetes.apps_api(
895
+ context).create_namespaced_deployment(
896
+ namespace, deployment_spec)
897
+ except Exception as e:
898
+ print('Deployment failed', e)
899
+ raise e
900
+
803
901
  return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
804
902
  context)
805
903
 
904
+ if not to_start_count:
905
+ is_provisioned_cluster_ha = is_high_availability_cluster_by_kubectl(
906
+ cluster_name_on_cloud, context, namespace)
907
+ if is_provisioned_cluster_ha != to_create_deployment:
908
+ ha_str = lambda x: 'high availability' if x else 'non-high availability'
909
+
910
+ message = (
911
+ f'The cluster "{cluster_name_on_cloud}" is configured to be '
912
+ f'{ha_str(to_create_deployment)} but the cluster has already been '
913
+ f'provisioned as {ha_str(is_provisioned_cluster_ha)}. '
914
+ 'If you want to make the provisioned cluster '
915
+ f'{ha_str(to_create_deployment)}, please first down the cluster '
916
+ 'and then up the cluster again.')
917
+ raise exceptions.InconsistentHighAvailabilityError(message)
918
+
806
919
  # Create pods in parallel
807
- pods = subprocess_utils.run_in_parallel(_create_pod_thread,
808
- list(range(to_start_count)),
809
- _NUM_THREADS)
920
+ created_resources = subprocess_utils.run_in_parallel(
921
+ _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
922
+
923
+ if to_create_deployment:
924
+ deployments = copy.deepcopy(created_resources)
925
+ pods = [
926
+ pod for deployment in deployments
927
+ for pod in _wait_for_deployment_pod(context, namespace, deployment)
928
+ ]
929
+ else:
930
+ # If not creating deployments, 'created_resources' already holds Pod objects
931
+ pods = created_resources
810
932
 
811
- # Process created pods
933
+ created_pods = {}
812
934
  for pod in pods:
813
935
  created_pods[pod.metadata.name] = pod
814
- if head_pod_name is None and pod.metadata.labels.get(
815
- constants.TAG_RAY_NODE_KIND) == 'head':
936
+ if head_pod_name is None and _is_head(pod):
816
937
  head_pod_name = pod.metadata.name
817
938
 
818
939
  networking_mode = network_utils.get_networking_mode(
@@ -879,70 +1000,121 @@ def stop_instances(
879
1000
  raise NotImplementedError()
880
1001
 
881
1002
 
882
- def _terminate_node(namespace: str, context: Optional[str], pod_name: str,
883
- is_head: bool) -> None:
884
- """Terminate a pod."""
885
- logger.debug('terminate_instances: calling delete_namespaced_pod')
1003
+ def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
1004
+ resource_name: str) -> None:
1005
+ """Helper to delete Kubernetes resources with 404 handling and retries.
886
1006
 
887
- def _delete_k8s_resource_with_retry(delete_func: Callable,
888
- resource_type: str,
889
- resource_name: str) -> None:
890
- """Helper to delete Kubernetes resources with 404 handling and retries.
891
-
892
- Args:
893
- delete_func: Function to call to delete the resource
894
- resource_type: Type of resource being deleted (e.g. 'service'),
895
- used in logging
896
- resource_name: Name of the resource being deleted, used in logging
897
- """
898
- max_retries = 3
899
- retry_delay = 5 # seconds
900
-
901
- for attempt in range(max_retries):
902
- try:
903
- delete_func()
1007
+ Args:
1008
+ delete_func: Function to call to delete the resource
1009
+ resource_type: Type of resource being deleted (e.g. 'service'),
1010
+ used in logging
1011
+ resource_name: Name of the resource being deleted, used in logging
1012
+ """
1013
+ max_retries = 3
1014
+ retry_delay = 5 # seconds
1015
+
1016
+ for attempt in range(max_retries):
1017
+ try:
1018
+ delete_func()
1019
+ return
1020
+ except kubernetes.api_exception() as e:
1021
+ if e.status == 404:
1022
+ logger.warning(
1023
+ f'terminate_instances: Tried to delete {resource_type} '
1024
+ f'{resource_name}, but the {resource_type} was not '
1025
+ 'found (404).')
904
1026
  return
905
- except kubernetes.api_exception() as e:
906
- if e.status == 404:
907
- logger.warning(
908
- f'terminate_instances: Tried to delete {resource_type} '
909
- f'{resource_name}, but the {resource_type} was not '
910
- 'found (404).')
911
- return
912
- elif attempt < max_retries - 1:
913
- logger.warning(f'terminate_instances: Failed to delete '
914
- f'{resource_type} {resource_name} (attempt '
915
- f'{attempt + 1}/{max_retries}). Error: {e}. '
916
- f'Retrying in {retry_delay} seconds...')
917
- time.sleep(retry_delay)
918
- else:
919
- raise
1027
+ elif attempt < max_retries - 1:
1028
+ logger.warning(f'terminate_instances: Failed to delete '
1029
+ f'{resource_type} {resource_name} (attempt '
1030
+ f'{attempt + 1}/{max_retries}). Error: {e}. '
1031
+ f'Retrying in {retry_delay} seconds...')
1032
+ time.sleep(retry_delay)
1033
+ else:
1034
+ raise
1035
+
1036
+
1037
+ def _delete_services(name_prefix: str, namespace: str,
1038
+ context: Optional[str]) -> None:
1039
+ """Delete services with the given name prefix.
1040
+
1041
+ Args:
1042
+ name_prefix: Prefix of the service names to delete
1043
+ namespace: Kubernetes namespace
1044
+ context: Kubernetes context
1045
+ """
1046
+ # TODO(andy): We should use tag for the service filter.
1047
+ for service_name in [name_prefix, f'{name_prefix}-ssh']:
1048
+ # Since we are not saving this lambda, it's a false positive.
1049
+ # TODO(andyl): Wait for
1050
+ # https://github.com/pylint-dev/pylint/issues/5263.
1051
+ # pylint: disable=cell-var-from-loop
1052
+ _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
1053
+ context).delete_namespaced_service(name=service_name,
1054
+ namespace=namespace,
1055
+ _request_timeout=config_lib.
1056
+ DELETION_TIMEOUT),
1057
+ resource_type='service',
1058
+ resource_name=service_name)
1059
+
1060
+
1061
+ def _terminate_node(namespace: str,
1062
+ context: Optional[str],
1063
+ pod_name: str,
1064
+ is_head: bool = False) -> None:
1065
+ """Terminate a pod and its associated services."""
1066
+ logger.debug('terminate_instances: calling delete_namespaced_pod')
920
1067
 
921
1068
  if is_head:
922
1069
  # Delete services for the head pod
923
1070
  # services are specified in sky/templates/kubernetes-ray.yml.j2
924
- for service_name in [pod_name, f'{pod_name}-ssh']:
925
- _delete_k8s_resource_with_retry(
926
- delete_func=lambda name=service_name: kubernetes.core_api(
927
- context).delete_namespaced_service(
928
- name=name,
929
- namespace=namespace,
930
- _request_timeout=config_lib.DELETION_TIMEOUT),
931
- resource_type='service',
932
- resource_name=service_name)
1071
+ _delete_services(pod_name, namespace, context)
933
1072
 
934
1073
  # Note - delete pod after all other resources are deleted.
935
1074
  # This is to ensure there are no leftover resources if this down is run
936
1075
  # from within the pod, e.g., for autodown.
1076
+ # Note - some misbehaving pods may not terminate gracefully if they have
1077
+ # open file descriptors. We force delete pods to avoid this.
937
1078
  _delete_k8s_resource_with_retry(
938
1079
  delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
939
1080
  name=pod_name,
940
1081
  namespace=namespace,
941
- _request_timeout=config_lib.DELETION_TIMEOUT),
1082
+ _request_timeout=config_lib.DELETION_TIMEOUT,
1083
+ grace_period_seconds=0),
942
1084
  resource_type='pod',
943
1085
  resource_name=pod_name)
944
1086
 
945
1087
 
1088
+ def _terminate_deployment(cluster_name: str, namespace: str,
1089
+ context: Optional[str]) -> None:
1090
+ """Terminate a deployment."""
1091
+ # Delete services first
1092
+ _delete_services(f'{cluster_name}-head', namespace, context)
1093
+
1094
+ # Delete deployment
1095
+ deployment_name = _get_deployment_name(cluster_name)
1096
+ _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.apps_api(
1097
+ context).delete_namespaced_deployment(name=deployment_name,
1098
+ namespace=namespace,
1099
+ _request_timeout=config_lib.
1100
+ DELETION_TIMEOUT),
1101
+ resource_type='deployment',
1102
+ resource_name=deployment_name)
1103
+
1104
+ # Delete PVCs
1105
+ pvc_name = _get_pvc_name(
1106
+ cluster_name,
1107
+ kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
1108
+ # pylint: disable=cell-var-from-loop
1109
+ _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
1110
+ context).delete_namespaced_persistent_volume_claim(
1111
+ name=pvc_name,
1112
+ namespace=namespace,
1113
+ _request_timeout=config_lib.DELETION_TIMEOUT),
1114
+ resource_type='pvc',
1115
+ resource_name=pvc_name)
1116
+
1117
+
946
1118
  def terminate_instances(
947
1119
  cluster_name_on_cloud: str,
948
1120
  provider_config: Dict[str, Any],
@@ -951,10 +1123,9 @@ def terminate_instances(
951
1123
  """See sky/provision/__init__.py"""
952
1124
  namespace = kubernetes_utils.get_namespace_from_config(provider_config)
953
1125
  context = kubernetes_utils.get_context_from_config(provider_config)
954
- tag_filters = {
955
- TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
956
- }
957
- pods = kubernetes_utils.filter_pods(namespace, context, tag_filters, None)
1126
+ pods = kubernetes_utils.filter_pods(namespace, context,
1127
+ ray_tag_filter(cluster_name_on_cloud),
1128
+ None)
958
1129
 
959
1130
  # Clean up the SSH jump pod if in use
960
1131
  networking_mode = network_utils.get_networking_mode(
@@ -968,8 +1139,12 @@ def terminate_instances(
968
1139
  logger.warning('terminate_instances: Error occurred when analyzing '
969
1140
  f'SSH Jump pod: {e}')
970
1141
 
971
- def _is_head(pod) -> bool:
972
- return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'
1142
+ if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
1143
+ namespace):
1144
+ # For high availability controllers, terminate the deployment
1145
+ logger.debug(f'Terminating deployment {cluster_name_on_cloud}')
1146
+ _terminate_deployment(cluster_name_on_cloud, namespace, context)
1147
+ return
973
1148
 
974
1149
  def _terminate_pod_thread(pod_info):
975
1150
  pod_name, pod = pod_info
@@ -991,12 +1166,9 @@ def get_cluster_info(
991
1166
  assert provider_config is not None
992
1167
  namespace = kubernetes_utils.get_namespace_from_config(provider_config)
993
1168
  context = kubernetes_utils.get_context_from_config(provider_config)
994
- tag_filters = {
995
- TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
996
- }
997
1169
 
998
- running_pods = kubernetes_utils.filter_pods(namespace, context, tag_filters,
999
- ['Running'])
1170
+ running_pods = kubernetes_utils.filter_pods(
1171
+ namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
1000
1172
 
1001
1173
  pods: Dict[str, List[common.InstanceInfo]] = {}
1002
1174
  head_pod_name = None
@@ -1026,7 +1198,7 @@ def get_cluster_info(
1026
1198
  tags=pod.metadata.labels,
1027
1199
  )
1028
1200
  ]
1029
- if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
1201
+ if _is_head(pod):
1030
1202
  head_pod_name = pod_name
1031
1203
  head_spec = pod.spec
1032
1204
  assert head_spec is not None, pod
@@ -1122,11 +1294,25 @@ def get_command_runners(
1122
1294
  cluster_info.provider_config)
1123
1295
  context = kubernetes_utils.get_context_from_config(
1124
1296
  cluster_info.provider_config)
1125
- node_list = []
1297
+
1298
+ runners: List[command_runner.CommandRunner] = []
1126
1299
  if cluster_info.head_instance_id is not None:
1127
- node_list = [((namespace, context), cluster_info.head_instance_id)]
1128
- node_list.extend(((namespace, context), pod_name)
1129
- for pod_name in instances.keys()
1130
- if pod_name != cluster_info.head_instance_id)
1131
- return command_runner.KubernetesCommandRunner.make_runner_list(
1132
- node_list=node_list, **credentials)
1300
+ pod_name = cluster_info.head_instance_id
1301
+
1302
+ # Try to get deployment name from label first
1303
+ head_instance_info = instances[pod_name][0]
1304
+ deployment = head_instance_info.tags.get(TAG_SKYPILOT_DEPLOYMENT_NAME)
1305
+
1306
+ node_list = [((namespace, context), pod_name)]
1307
+ head_runner = command_runner.KubernetesCommandRunner(
1308
+ node_list[0], deployment=deployment, **credentials)
1309
+ runners.append(head_runner)
1310
+
1311
+ node_list = [((namespace, context), pod_name)
1312
+ for pod_name in instances.keys()
1313
+ if pod_name != cluster_info.head_instance_id]
1314
+ runners.extend(
1315
+ command_runner.KubernetesCommandRunner.make_runner_list(
1316
+ node_list, **credentials))
1317
+
1318
+ return runners
@@ -78,7 +78,7 @@ def _open_ports_using_ingress(
78
78
  'https://github.com/kubernetes/ingress-nginx/blob/main/docs/deploy/index.md.' # pylint: disable=line-too-long
79
79
  )
80
80
 
81
- # Prepare service names, ports, for template rendering
81
+ # Prepare service names, ports, for template rendering
82
82
  service_details = [
83
83
  (f'{cluster_name_on_cloud}--skypilot-svc--{port}', port,
84
84
  _PATH_PREFIX.format(
@@ -45,6 +45,16 @@ else:
45
45
  jinja2 = adaptors_common.LazyImport('jinja2')
46
46
  yaml = adaptors_common.LazyImport('yaml')
47
47
 
48
+ # Please be careful when changing this.
49
+ # When mounting, Kubernetes changes the ownership of the parent directory
50
+ # to root:root.
51
+ # See https://stackoverflow.com/questions/50818029/mounted-folder-created-as-root-instead-of-current-user-in-docker/50820023#50820023. # pylint: disable=line-too-long
52
+ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
53
+ # Path where the persistent volume for HA controller is mounted.
54
+ # TODO(andy): Consider using dedicated path like `/var/skypilot`
55
+ # and store all data that needs to be persisted in future.
56
+ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
57
+
48
58
  # TODO(romilb): Move constants to constants.py
49
59
  DEFAULT_NAMESPACE = 'default'
50
60
 
@@ -149,6 +149,12 @@ def bulk_provision(
149
149
  # Skip the teardown if the cloud config is expired and
150
150
  # the provisioner should failover to other clouds.
151
151
  raise
152
+ except exceptions.InconsistentHighAvailabilityError:
153
+ # Skip the teardown if the high availability property in the
154
+ # user config is inconsistent with the actual cluster.
155
+ # This error is a user error instead of a provisioning failure.
156
+ # And there is no possibility to fix it by teardown.
157
+ raise
152
158
  except Exception: # pylint: disable=broad-except
153
159
  zone_str = 'all zones'
154
160
  if zones:
@@ -387,11 +387,12 @@ class ReplicaStatusProperty:
387
387
  class ReplicaInfo:
388
388
  """Replica info for each replica."""
389
389
 
390
- _VERSION = 1
390
+ _VERSION = 2
391
391
 
392
392
  def __init__(self, replica_id: int, cluster_name: str, replica_port: str,
393
393
  is_spot: bool, location: Optional[spot_placer.Location],
394
- version: int) -> None:
394
+ version: int, resources_override: Optional[Dict[str,
395
+ Any]]) -> None:
395
396
  self._version = self._VERSION
396
397
  self.replica_id: int = replica_id
397
398
  self.cluster_name: str = cluster_name
@@ -403,6 +404,7 @@ class ReplicaInfo:
403
404
  self.is_spot: bool = is_spot
404
405
  self.location: Optional[Dict[str, Optional[str]]] = (
405
406
  location.to_pickleable() if location is not None else None)
407
+ self.resources_override: Optional[Dict[str, Any]] = resources_override
406
408
 
407
409
  def get_spot_location(self) -> Optional[spot_placer.Location]:
408
410
  return spot_placer.Location.from_pickleable(self.location)
@@ -569,6 +571,9 @@ class ReplicaInfo:
569
571
  if version < 1:
570
572
  self.location = None
571
573
 
574
+ if version < 2:
575
+ self.resources_override = None
576
+
572
577
  self.__dict__.update(state)
573
578
 
574
579
 
@@ -650,6 +655,44 @@ class SkyPilotReplicaManager(ReplicaManager):
650
655
  threading.Thread(target=self._job_status_fetcher).start()
651
656
  threading.Thread(target=self._replica_prober).start()
652
657
 
658
+ self._recover_replica_operations()
659
+
660
+ def _recover_replica_operations(self):
661
+ """Check whether there is anything for ReplicaManager to do in a
662
+ recovery run"""
663
+ assert (not self._launch_process_pool and not self._down_process_pool
664
+ ), 'We should not have any running processes in a recovery run'
665
+
666
+ # There is a FIFO queue with capacity _MAX_NUM_LAUNCH for
667
+ # _launch_replica.
668
+ # We prioritize PROVISIONING replicas since they were previously
669
+ # launched but may have been interrupted and need to be restarted.
670
+ # This is why we process PENDING replicas only after PROVISIONING
671
+ # replicas.
672
+ to_up_replicas = serve_state.get_replicas_at_status(
673
+ self._service_name, serve_state.ReplicaStatus.PROVISIONING)
674
+ to_up_replicas.extend(
675
+ serve_state.get_replicas_at_status(
676
+ self._service_name, serve_state.ReplicaStatus.PENDING))
677
+
678
+ for replica_info in to_up_replicas:
679
+ # It should be robust enough for `execution.launch` to handle cases
680
+ # where the provisioning is partially done.
681
+ # So we mock the original request based on all call sites,
682
+ # including SkyServeController._run_autoscaler.
683
+ self._launch_replica(
684
+ replica_info.replica_id,
685
+ resources_override=replica_info.resources_override)
686
+
687
+ for replica_info in serve_state.get_replicas_at_status(
688
+ self._service_name, serve_state.ReplicaStatus.SHUTTING_DOWN):
689
+ self._terminate_replica(
690
+ replica_info.replica_id,
691
+ sync_down_logs=False,
692
+ replica_drain_delay_seconds=0,
693
+ purge=replica_info.status_property.purged,
694
+ is_scale_down=replica_info.status_property.is_scale_down)
695
+
653
696
  ################################
654
697
  # Replica management functions #
655
698
  ################################
@@ -705,7 +748,7 @@ class SkyPilotReplicaManager(ReplicaManager):
705
748
  replica_port = _get_resources_ports(self._task_yaml_path)
706
749
 
707
750
  info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
708
- location, self.latest_version)
751
+ location, self.latest_version, resources_override)
709
752
  serve_state.add_or_update_replica(self._service_name, replica_id, info)
710
753
  # Don't start right now; we will start it later in _refresh_process_pool
711
754
  # to avoid too many sky.launch running at the same time.
@@ -884,7 +927,9 @@ class SkyPilotReplicaManager(ReplicaManager):
884
927
  the fly. If any of them finished, it will update the status of the
885
928
  corresponding replica.
886
929
  """
887
- for replica_id, p in list(self._launch_process_pool.items()):
930
+ # To avoid `dictionary changed size during iteration` error.
931
+ launch_process_pool_snapshot = list(self._launch_process_pool.items())
932
+ for replica_id, p in launch_process_pool_snapshot:
888
933
  if not p.is_alive():
889
934
  info = serve_state.get_replica_info_from_id(
890
935
  self._service_name, replica_id)
@@ -943,7 +988,8 @@ class SkyPilotReplicaManager(ReplicaManager):
943
988
  self._terminate_replica(replica_id,
944
989
  sync_down_logs=True,
945
990
  replica_drain_delay_seconds=0)
946
- for replica_id, p in list(self._down_process_pool.items()):
991
+ down_process_pool_snapshot = list(self._down_process_pool.items())
992
+ for replica_id, p in down_process_pool_snapshot:
947
993
  if not p.is_alive():
948
994
  logger.info(
949
995
  f'Terminate process for replica {replica_id} finished.')