skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +28 -40
  3. sky/backends/backend_utils.py +19 -2
  4. sky/backends/cloud_vm_ray_backend.py +33 -8
  5. sky/backends/local_docker_backend.py +1 -2
  6. sky/cli.py +91 -38
  7. sky/client/cli.py +91 -38
  8. sky/client/sdk.py +3 -2
  9. sky/clouds/aws.py +12 -6
  10. sky/clouds/azure.py +3 -0
  11. sky/clouds/cloud.py +8 -2
  12. sky/clouds/cudo.py +2 -0
  13. sky/clouds/do.py +3 -0
  14. sky/clouds/fluidstack.py +3 -0
  15. sky/clouds/gcp.py +7 -0
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +42 -19
  18. sky/clouds/lambda_cloud.py +1 -0
  19. sky/clouds/nebius.py +18 -10
  20. sky/clouds/oci.py +6 -3
  21. sky/clouds/paperspace.py +2 -0
  22. sky/clouds/runpod.py +2 -0
  23. sky/clouds/scp.py +2 -0
  24. sky/clouds/service_catalog/constants.py +1 -1
  25. sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
  26. sky/clouds/vast.py +2 -0
  27. sky/clouds/vsphere.py +2 -0
  28. sky/core.py +58 -29
  29. sky/dashboard/out/404.html +1 -1
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/favicon.ico +0 -0
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/jobs/[job].html +1 -1
  36. sky/dashboard/out/jobs.html +1 -1
  37. sky/exceptions.py +6 -0
  38. sky/execution.py +19 -4
  39. sky/global_user_state.py +1 -0
  40. sky/optimizer.py +35 -11
  41. sky/provision/common.py +2 -5
  42. sky/provision/docker_utils.py +22 -16
  43. sky/provision/instance_setup.py +1 -1
  44. sky/provision/kubernetes/instance.py +276 -93
  45. sky/provision/kubernetes/network.py +1 -1
  46. sky/provision/kubernetes/utils.py +36 -24
  47. sky/provision/provisioner.py +6 -0
  48. sky/serve/replica_managers.py +51 -5
  49. sky/serve/serve_state.py +41 -0
  50. sky/serve/service.py +108 -63
  51. sky/server/common.py +6 -3
  52. sky/server/config.py +184 -0
  53. sky/server/requests/executor.py +17 -156
  54. sky/server/server.py +4 -4
  55. sky/setup_files/dependencies.py +0 -1
  56. sky/skylet/constants.py +7 -0
  57. sky/skypilot_config.py +27 -6
  58. sky/task.py +1 -1
  59. sky/templates/kubernetes-ray.yml.j2 +145 -15
  60. sky/templates/nebius-ray.yml.j2 +63 -0
  61. sky/utils/command_runner.py +17 -3
  62. sky/utils/command_runner.pyi +2 -0
  63. sky/utils/controller_utils.py +24 -0
  64. sky/utils/kubernetes/rsync_helper.sh +20 -4
  65. sky/utils/schemas.py +13 -0
  66. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
  67. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +73 -72
  68. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +1 -1
  69. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
  70. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
  71. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
sky/provision/docker_utils.py:

@@ -28,6 +28,9 @@ SETUP_ENV_VARS_CMD = (
 # the command.
 DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
                                 'the Docker daemon socket')
+
+DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
+
 _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
 
 
@@ -173,22 +176,25 @@ class DockerInitializer:
                 stream_logs=False,
                 separate_stderr=separate_stderr,
                 log_path=self.log_path)
-            if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr and
-                    wait_for_docker_daemon):
-                if time.time() - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
-                    if rc == 0:
-                        # Set returncode to 1 if failed to connect to docker
-                        # daemon after timeout.
-                        rc = 1
-                    break
-                # Close the cached connection to make the permission update of
-                # ssh user take effect, e.g. usermod -aG docker $USER, called
-                # by cloud-init of Azure.
-                self.runner.close_cached_connection()
-                logger.info('Failed to connect to docker daemon. It might be '
-                            'initializing, retrying in 5 seconds...')
-                time.sleep(5)
-                continue
+            if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
+                    DOCKER_SOCKET_NOT_READY_STR in stdout + stderr):
+                if wait_for_docker_daemon:
+                    if time.time(
+                    ) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
+                        if rc == 0:
+                            # Set returncode to 1 if failed to connect to docker
+                            # daemon after timeout.
+                            rc = 1
+                        break
+                    # Close the cached connection to make the permission update
+                    # of ssh user take effect, e.g. usermod -aG docker $USER,
+                    # called by cloud-init of Azure.
+                    self.runner.close_cached_connection()
+                    logger.info(
+                        'Failed to connect to docker daemon. It might be '
+                        'initializing, retrying in 5 seconds...')
+                    time.sleep(5)
+                    continue
             break
         subprocess_utils.handle_returncode(
             rc,
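
The change above broadens the retry condition: besides the permission-denied message, the socket-not-ready message ('Is the docker daemon running?') now also triggers a wait, and the timeout handling moves under a separate `wait_for_docker_daemon` guard. A minimal standalone sketch of the same poll-until-ready pattern (the names here are illustrative, not SkyPilot's):

```python
import subprocess
import time

# Illustrative constants mirroring the strings and timeout in the diff above.
NOT_READY_MARKERS = (
    'permission denied while trying to connect to the Docker daemon socket',
    'Is the docker daemon running?',
)
DAEMON_WAIT_TIMEOUT_SECONDS = 30

def wait_for_docker_daemon() -> None:
    """Poll `docker info` until the daemon answers or the timeout expires."""
    start = time.time()
    while True:
        proc = subprocess.run(['docker', 'info'],
                              capture_output=True, text=True, check=False)
        output = proc.stdout + proc.stderr
        if proc.returncode == 0:
            return  # Daemon is up.
        if not any(marker in output for marker in NOT_READY_MARKERS):
            raise RuntimeError(f'docker info failed: {output}')
        if time.time() - start > DAEMON_WAIT_TIMEOUT_SECONDS:
            raise TimeoutError('Docker daemon did not come up in time.')
        # The daemon may still be initializing (or group membership may not
        # have propagated yet); retry after a short delay.
        time.sleep(5)
```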
sky/provision/instance_setup.py:

@@ -501,7 +501,7 @@ def start_skylet_on_head_node(cluster_name: str,
 def _internal_file_mounts(file_mounts: Dict,
                           runner: command_runner.CommandRunner,
                           log_path: str) -> None:
-    if file_mounts is None or not file_mounts:
+    if not file_mounts:
         return
 
     for dst, src in file_mounts.items():
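
The simplification relies on Python truthiness: `None` and an empty dict are both falsy, so `not file_mounts` covers the two cases the old `is None or not` check spelled out.

```python
for file_mounts in (None, {}):
    assert not file_mounts  # Both the None and empty-dict cases are falsy.
```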
sky/provision/kubernetes/instance.py:

@@ -32,22 +32,53 @@ logger = sky_logging.init_logger(__name__)
 TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
 TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
 TAG_POD_INITIALIZED = 'skypilot-initialized'
+TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
+
+
+def ray_tag_filter(cluster_name: str) -> Dict[str, str]:
+    return {TAG_RAY_CLUSTER_NAME: cluster_name}
+
+
+def _is_head(pod) -> bool:
+    return pod.metadata.labels.get(constants.TAG_RAY_NODE_KIND) == 'head'
 
 
 def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]:
-    head_pod_name = None
-    for pod_name, pod in pods.items():
-        if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
-            head_pod_name = pod_name
-            break
-    return head_pod_name
+    return next((pod_name for pod_name, pod in pods.items() if _is_head(pod)),
+                None)
+
+
+def _get_pvc_name(cluster_name: str, volume_name: str) -> str:
+    return f'{cluster_name}-{volume_name}'
+
 
+def _get_deployment_name(cluster_name: str) -> str:
+    return f'{cluster_name}-deployment'
 
-def head_service_selector(cluster_name: str) -> Dict[str, str]:
-    """Selector for Operator-configured head service."""
+
+def _head_service_selector(cluster_name: str) -> Dict[str, str]:
     return {'component': f'{cluster_name}-head'}
 
 
+def is_high_availability_cluster_by_kubectl(
+        cluster_name: str,
+        context: Optional[str] = None,
+        namespace: Optional[str] = None) -> bool:
+    """Check if a cluster is a high availability controller by calling
+    `kubectl get deployment`.
+    """
+    try:
+        deployment_list = kubernetes.apps_api(
+            context).list_namespaced_deployment(
+                namespace,
+                label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
+    except kubernetes.api_exception():
+        return False
+    # It is a high availability cluster if there is at least one deployment
+    # matching the label selector.
+    return bool(deployment_list.items)
+
+
 def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str:
     # Returns a formatted string of resource requirements for a pod.
     resource_requirements = {}
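
`is_high_availability_cluster_by_kubectl` detects a high-availability controller by listing Deployments that carry the `skypilot-cluster-name` label (the name notwithstanding, it goes through SkyPilot's `kubernetes` adaptor rather than shelling out to kubectl). A hedged sketch of the same label-selector query written directly against the official `kubernetes` Python client:

```python
from typing import Optional

from kubernetes import client, config

def has_deployment_with_label(cluster_name: str,
                              namespace: str = 'default',
                              context: Optional[str] = None) -> bool:
    """True if any Deployment in the namespace carries the cluster label."""
    config.load_kube_config(context=context)
    apps = client.AppsV1Api()
    deployments = apps.list_namespaced_deployment(
        namespace,
        label_selector=f'skypilot-cluster-name={cluster_name}')
    return bool(deployments.items)
```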
@@ -384,13 +415,11 @@ def _run_function_with_retries(func: Callable,
                                max_retries: int = _MAX_RETRIES,
                                retry_delay: int = 5) -> Any:
     """Runs a function with retries on Kubernetes errors.
-
     Args:
         func: Function to retry
         operation_name: Name of the operation for logging
         max_retries: Maximum number of retry attempts
         retry_delay: Delay between retries in seconds
-
     Raises:
         The last exception encountered if all retries fail.
     """
@@ -409,30 +438,23 @@
 @timeline.event
 def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
     """Pre-initialization step for SkyPilot pods.
-
     This step is run in the pod right after it is created and before the
     SkyPilot runtime is setup.
-
     This step includes three key steps:
-
     1. Privilege check: Checks if the default user has sufficient privilege
        to set up the kubernetes instance pod.
     2. SSH setup: Sets up SSH for the pod instance.
     3. Environment variable setup to populate k8s env vars in the pod.
-
     Make sure commands used in these methods are generic and work
     on most base images. E.g., do not use Python, since that may not
     be installed by default.
-
     If you run any apt commands, be sure to check if the lock is available.
     It is possible the `apt update` run in the pod container args may still
     be running.
-
     Args:
         namespace (str): Kubernetes namespace.
         context (Optional[str]): Kubernetes context.
         new_nodes (List): List of new pod instances.
-
     Raises:
         config_lib.KubernetesError: If user privileges are insufficient or
             setup fails.
@@ -647,6 +669,56 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
         raise e
 
 
+def _create_persistent_volume_claim(namespace: str, context: Optional[str],
+                                    pvc_spec: Dict[str, Any]) -> None:
+    """Creates a persistent volume claim for SkyServe controller."""
+    try:
+        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
+            name=pvc_spec['metadata']['name'], namespace=namespace)
+        return
+    except kubernetes.api_exception() as e:
+        if e.status != 404:  # Not found
+            raise
+
+    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
+        namespace=namespace, body=pvc_spec)
+
+
+@timeline.event
+def _wait_for_deployment_pod(context,
+                             namespace,
+                             deployment,
+                             timeout=60) -> List:
+    label_selector = ','.join([
+        f'{key}={value}'
+        for key, value in deployment.spec.selector.match_labels.items()
+    ])
+    target_replicas = deployment.spec.replicas
+    deployment_name = deployment.metadata.name
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        # Refresh the deployment status
+        deployment = kubernetes.apps_api(
+            context).read_namespaced_deployment_status(deployment_name,
+                                                       namespace)
+        if (deployment.status and
+                deployment.status.ready_replicas is not None and
+                deployment.status.ready_replicas >= target_replicas):
+            pods = kubernetes.core_api(context).list_namespaced_pod(
+                namespace, label_selector=label_selector).items
+            return pods
+
+        ready_replicas = (deployment.status.ready_replicas
+                          if deployment.status is not None else 0)
+        logger.debug(f'Waiting for deployment {deployment_name!r} to be ready. '
+                     f'Ready replicas: {ready_replicas}/{target_replicas}')
+        time.sleep(2)
+
+    raise TimeoutError(
+        f'Timeout: Deployment {deployment_name!r} did not become '
+        'ready.')
+
+
 @timeline.event
 def _create_pods(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
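
`_wait_for_deployment_pod` polls the Deployment's status every 2 seconds until `ready_replicas` reaches `spec.replicas`, then resolves the backing pods through the selector's `matchLabels`. A standalone sketch of the same loop against the raw client (the helper itself goes through SkyPilot's `kubernetes` adaptor):

```python
import time
from typing import List

from kubernetes import client, config

def wait_for_ready_pods(namespace: str, deployment_name: str,
                        timeout: int = 60) -> List:
    """Poll a Deployment until it is ready, then return its pods."""
    config.load_kube_config()
    apps, core = client.AppsV1Api(), client.CoreV1Api()
    deployment = apps.read_namespaced_deployment(deployment_name, namespace)
    selector = ','.join(
        f'{k}={v}' for k, v in deployment.spec.selector.match_labels.items())
    target = deployment.spec.replicas
    start = time.time()
    while time.time() - start < timeout:
        status = apps.read_namespaced_deployment_status(
            deployment_name, namespace).status
        if status and (status.ready_replicas or 0) >= target:
            return core.list_namespaced_pod(
                namespace, label_selector=selector).items
        time.sleep(2)  # Same 2-second poll interval as the helper above.
    raise TimeoutError(f'Deployment {deployment_name!r} did not become ready.')
```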
@@ -655,9 +727,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
     pod_spec = copy.deepcopy(config.node_config)
-    tags = {
-        TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
-    }
+
+    to_create_deployment = 'deployment_spec' in pod_spec
+    if to_create_deployment:
+        deployment_spec = pod_spec.pop('deployment_spec')
+        pvc_spec = pod_spec.pop('pvc_spec')
+        assert len(pod_spec['spec']['containers']) == 1, (
+            'Only one container is supported for deployment')
+
+    tags = ray_tag_filter(cluster_name_on_cloud)
+
     pod_spec['metadata']['namespace'] = namespace
     if 'labels' in pod_spec['metadata']:
         pod_spec['metadata']['labels'].update(tags)
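
`_create_pods` now branches on whether the rendered node config carries a `deployment_spec`: if so, the pod is wrapped in a Deployment (plus a PVC for persistent state) instead of being created directly. The keys the code reads imply roughly the following shape; this is an illustrative outline only, since the real specs are rendered from sky/templates/kubernetes-ray.yml.j2:

```python
# Illustrative outline only; field contents are elided.
node_config = {
    'metadata': {},  # Pod metadata: name, labels, namespace.
    'spec': {},      # Pod spec; must contain exactly one container.
    'deployment_spec': {
        'metadata': {'name': 'my-cluster-deployment'},
        'spec': {
            'replicas': 1,  # The controller pod is a singleton.
            'selector': {'matchLabels': {}},
            'template': {},  # Filled in from the pod's metadata/spec.
        },
    },
    'pvc_spec': {
        'metadata': {'name': 'my-cluster-volume'},
        'spec': {},  # Storage class, access modes, requested size.
    },
}
```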
@@ -734,16 +813,15 @@
     if nvidia_runtime_exists and needs_gpus:
         pod_spec['spec']['runtimeClassName'] = 'nvidia'
 
-    created_pods = {}
     logger.debug(f'run_instances: calling create_namespaced_pod '
                  f'(count={to_start_count}).')
 
-    def _create_pod_thread(i: int):
+    def _create_resource_thread(i: int):
        pod_spec_copy = copy.deepcopy(pod_spec)
        if head_pod_name is None and i == 0:
            # First pod should be head if no head exists
            pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
-            head_selector = head_service_selector(cluster_name_on_cloud)
+            head_selector = _head_service_selector(cluster_name_on_cloud)
            pod_spec_copy['metadata']['labels'].update(head_selector)
            pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
        else:
@@ -800,19 +878,62 @@
                 tpu_toleration
             ]
 
+        if to_create_deployment:
+            _create_persistent_volume_claim(namespace, context, pvc_spec)
+
+            # It's safe to directly modify the template spec in the deployment spec
+            # because controller pod is singleton, i in [0].
+            template_pod_spec = deployment_spec['spec']['template']
+            # Add the deployment name as a label to the pod spec
+            deployment_name = deployment_spec['metadata']['name']
+            pod_spec_copy['metadata']['labels'][
+                TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
+            template_pod_spec['metadata'] = pod_spec_copy['metadata']
+            template_pod_spec['spec'].update(pod_spec_copy['spec'])
+            try:
+                return kubernetes.apps_api(
+                    context).create_namespaced_deployment(
+                        namespace, deployment_spec)
+            except Exception as e:
+                print('Deployment failed', e)
+                raise e
+
         return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
                                                    context)
 
+    if not to_start_count:
+        is_provisioned_cluster_ha = is_high_availability_cluster_by_kubectl(
+            cluster_name_on_cloud, context, namespace)
+        if is_provisioned_cluster_ha != to_create_deployment:
+            ha_str = lambda x: 'high availability' if x else 'non-high availability'
+
+            message = (
+                f'The cluster "{cluster_name_on_cloud}" is configured to be '
+                f'{ha_str(to_create_deployment)} but the cluster has already been '
+                f'provisioned as {ha_str(is_provisioned_cluster_ha)}. '
+                'If you want to make the provisioned cluster '
+                f'{ha_str(to_create_deployment)}, please first down the cluster '
+                'and then up the cluster again.')
+            raise exceptions.InconsistentHighAvailabilityError(message)
+
     # Create pods in parallel
-    pods = subprocess_utils.run_in_parallel(_create_pod_thread,
-                                            list(range(to_start_count)),
-                                            _NUM_THREADS)
+    created_resources = subprocess_utils.run_in_parallel(
+        _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
+
+    if to_create_deployment:
+        deployments = copy.deepcopy(created_resources)
+        pods = [
+            pod for deployment in deployments
+            for pod in _wait_for_deployment_pod(context, namespace, deployment)
+        ]
+    else:
+        # If not creating deployments, 'created_resources' already holds Pod objects
+        pods = created_resources
 
-    # Process created pods
+    created_pods = {}
     for pod in pods:
         created_pods[pod.metadata.name] = pod
-        if head_pod_name is None and pod.metadata.labels.get(
-                constants.TAG_RAY_NODE_KIND) == 'head':
+        if head_pod_name is None and _is_head(pod):
             head_pod_name = pod.metadata.name
 
     networking_mode = network_utils.get_networking_mode(
879
1000
  raise NotImplementedError()
880
1001
 
881
1002
 
882
- def _terminate_node(namespace: str, context: Optional[str], pod_name: str,
883
- is_head: bool) -> None:
884
- """Terminate a pod."""
885
- logger.debug('terminate_instances: calling delete_namespaced_pod')
1003
+ def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
1004
+ resource_name: str) -> None:
1005
+ """Helper to delete Kubernetes resources with 404 handling and retries.
886
1006
 
887
- def _delete_k8s_resource_with_retry(delete_func: Callable,
888
- resource_type: str,
889
- resource_name: str) -> None:
890
- """Helper to delete Kubernetes resources with 404 handling and retries.
891
-
892
- Args:
893
- delete_func: Function to call to delete the resource
894
- resource_type: Type of resource being deleted (e.g. 'service'),
895
- used in logging
896
- resource_name: Name of the resource being deleted, used in logging
897
- """
898
- max_retries = 3
899
- retry_delay = 5 # seconds
900
-
901
- for attempt in range(max_retries):
902
- try:
903
- delete_func()
1007
+ Args:
1008
+ delete_func: Function to call to delete the resource
1009
+ resource_type: Type of resource being deleted (e.g. 'service'),
1010
+ used in logging
1011
+ resource_name: Name of the resource being deleted, used in logging
1012
+ """
1013
+ max_retries = 3
1014
+ retry_delay = 5 # seconds
1015
+
1016
+ for attempt in range(max_retries):
1017
+ try:
1018
+ delete_func()
1019
+ return
1020
+ except kubernetes.api_exception() as e:
1021
+ if e.status == 404:
1022
+ logger.warning(
1023
+ f'terminate_instances: Tried to delete {resource_type} '
1024
+ f'{resource_name}, but the {resource_type} was not '
1025
+ 'found (404).')
904
1026
  return
905
- except kubernetes.api_exception() as e:
906
- if e.status == 404:
907
- logger.warning(
908
- f'terminate_instances: Tried to delete {resource_type} '
909
- f'{resource_name}, but the {resource_type} was not '
910
- 'found (404).')
911
- return
912
- elif attempt < max_retries - 1:
913
- logger.warning(f'terminate_instances: Failed to delete '
914
- f'{resource_type} {resource_name} (attempt '
915
- f'{attempt + 1}/{max_retries}). Error: {e}. '
916
- f'Retrying in {retry_delay} seconds...')
917
- time.sleep(retry_delay)
918
- else:
919
- raise
1027
+ elif attempt < max_retries - 1:
1028
+ logger.warning(f'terminate_instances: Failed to delete '
1029
+ f'{resource_type} {resource_name} (attempt '
1030
+ f'{attempt + 1}/{max_retries}). Error: {e}. '
1031
+ f'Retrying in {retry_delay} seconds...')
1032
+ time.sleep(retry_delay)
1033
+ else:
1034
+ raise
1035
+
1036
+
1037
+ def _delete_services(name_prefix: str, namespace: str,
1038
+ context: Optional[str]) -> None:
1039
+ """Delete services with the given name prefix.
1040
+
1041
+ Args:
1042
+ name_prefix: Prefix of the service names to delete
1043
+ namespace: Kubernetes namespace
1044
+ context: Kubernetes context
1045
+ """
1046
+ # TODO(andy): We should use tag for the service filter.
1047
+ for service_name in [name_prefix, f'{name_prefix}-ssh']:
1048
+ # Since we are not saving this lambda, it's a false positive.
1049
+ # TODO(andyl): Wait for
1050
+ # https://github.com/pylint-dev/pylint/issues/5263.
1051
+ # pylint: disable=cell-var-from-loop
1052
+ _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
1053
+ context).delete_namespaced_service(name=service_name,
1054
+ namespace=namespace,
1055
+ _request_timeout=config_lib.
1056
+ DELETION_TIMEOUT),
1057
+ resource_type='service',
1058
+ resource_name=service_name)
1059
+
1060
+
1061
+ def _terminate_node(namespace: str,
1062
+ context: Optional[str],
1063
+ pod_name: str,
1064
+ is_head: bool = False) -> None:
1065
+ """Terminate a pod and its associated services."""
1066
+ logger.debug('terminate_instances: calling delete_namespaced_pod')
920
1067
 
921
1068
  if is_head:
922
1069
  # Delete services for the head pod
923
1070
  # services are specified in sky/templates/kubernetes-ray.yml.j2
924
- for service_name in [pod_name, f'{pod_name}-ssh']:
925
- _delete_k8s_resource_with_retry(
926
- delete_func=lambda name=service_name: kubernetes.core_api(
927
- context).delete_namespaced_service(
928
- name=name,
929
- namespace=namespace,
930
- _request_timeout=config_lib.DELETION_TIMEOUT),
931
- resource_type='service',
932
- resource_name=service_name)
1071
+ _delete_services(pod_name, namespace, context)
933
1072
 
934
1073
  # Note - delete pod after all other resources are deleted.
935
1074
  # This is to ensure there are no leftover resources if this down is run
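
Hoisting `_delete_k8s_resource_with_retry` to module scope lets pod, service, deployment, and PVC teardown all share one 404-tolerant retry loop. A generic sketch of the pattern, including the default-argument trick that binds the loop variable into each lambda (the alternative to the `cell-var-from-loop` pylint disable seen above):

```python
import time
from typing import Callable

from kubernetes import client, config
from kubernetes.client.rest import ApiException

def delete_with_retry(delete_func: Callable[[], object],
                      max_retries: int = 3,
                      retry_delay: int = 5) -> None:
    """Treat 404 as success (already gone); retry other API errors."""
    for attempt in range(max_retries):
        try:
            delete_func()
            return
        except ApiException as e:
            if e.status == 404:
                return  # Deletion is effectively idempotent.
            if attempt == max_retries - 1:
                raise
            time.sleep(retry_delay)

config.load_kube_config()
core = client.CoreV1Api()
for name in ['my-cluster-head', 'my-cluster-head-ssh']:  # Placeholder names.
    # `name=name` gives each lambda its own binding of the loop variable.
    delete_with_retry(
        lambda name=name: core.delete_namespaced_service(name, 'default'))
```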
@@ -946,6 +1085,36 @@ def _terminate_node(namespace: str, context: Optional[str], pod_name: str,
         resource_name=pod_name)
 
 
+def _terminate_deployment(cluster_name: str, namespace: str,
+                          context: Optional[str]) -> None:
+    """Terminate a deployment."""
+    # Delete services first
+    _delete_services(f'{cluster_name}-head', namespace, context)
+
+    # Delete deployment
+    deployment_name = _get_deployment_name(cluster_name)
+    _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.apps_api(
+        context).delete_namespaced_deployment(name=deployment_name,
+                                              namespace=namespace,
+                                              _request_timeout=config_lib.
+                                              DELETION_TIMEOUT),
+                                    resource_type='deployment',
+                                    resource_name=deployment_name)
+
+    # Delete PVCs
+    pvc_name = _get_pvc_name(
+        cluster_name,
+        kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
+    # pylint: disable=cell-var-from-loop
+    _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
+        context).delete_namespaced_persistent_volume_claim(
+            name=pvc_name,
+            namespace=namespace,
+            _request_timeout=config_lib.DELETION_TIMEOUT),
+                                    resource_type='pvc',
+                                    resource_name=pvc_name)
+
+
 def terminate_instances(
     cluster_name_on_cloud: str,
     provider_config: Dict[str, Any],
@@ -954,10 +1123,9 @@
     """See sky/provision/__init__.py"""
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
-    tag_filters = {
-        TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
-    }
-    pods = kubernetes_utils.filter_pods(namespace, context, tag_filters, None)
+    pods = kubernetes_utils.filter_pods(namespace, context,
+                                        ray_tag_filter(cluster_name_on_cloud),
+                                        None)
 
     # Clean up the SSH jump pod if in use
     networking_mode = network_utils.get_networking_mode(
@@ -971,8 +1139,12 @@
     logger.warning('terminate_instances: Error occurred when analyzing '
                    f'SSH Jump pod: {e}')
 
-    def _is_head(pod) -> bool:
-        return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'
+    if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
+                                               namespace):
+        # For high availability controllers, terminate the deployment
+        logger.debug(f'Terminating deployment {cluster_name_on_cloud}')
+        _terminate_deployment(cluster_name_on_cloud, namespace, context)
+        return
 
     def _terminate_pod_thread(pod_info):
         pod_name, pod = pod_info
@@ -994,12 +1166,9 @@ def get_cluster_info(
     assert provider_config is not None
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
-    tag_filters = {
-        TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
-    }
 
-    running_pods = kubernetes_utils.filter_pods(namespace, context, tag_filters,
-                                                ['Running'])
+    running_pods = kubernetes_utils.filter_pods(
+        namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
 
     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None
@@ -1029,7 +1198,7 @@
                 tags=pod.metadata.labels,
             )
         ]
-        if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
+        if _is_head(pod):
             head_pod_name = pod_name
             head_spec = pod.spec
             assert head_spec is not None, pod
@@ -1125,11 +1294,25 @@ def get_command_runners(
         cluster_info.provider_config)
     context = kubernetes_utils.get_context_from_config(
         cluster_info.provider_config)
-    node_list = []
+
+    runners: List[command_runner.CommandRunner] = []
     if cluster_info.head_instance_id is not None:
-        node_list = [((namespace, context), cluster_info.head_instance_id)]
-    node_list.extend(((namespace, context), pod_name)
-                     for pod_name in instances.keys()
-                     if pod_name != cluster_info.head_instance_id)
-    return command_runner.KubernetesCommandRunner.make_runner_list(
-        node_list=node_list, **credentials)
+        pod_name = cluster_info.head_instance_id
+
+        # Try to get deployment name from label first
+        head_instance_info = instances[pod_name][0]
+        deployment = head_instance_info.tags.get(TAG_SKYPILOT_DEPLOYMENT_NAME)
+
+        node_list = [((namespace, context), pod_name)]
+        head_runner = command_runner.KubernetesCommandRunner(
+            node_list[0], deployment=deployment, **credentials)
+        runners.append(head_runner)
+
+    node_list = [((namespace, context), pod_name)
+                 for pod_name in instances.keys()
+                 if pod_name != cluster_info.head_instance_id]
+    runners.extend(
+        command_runner.KubernetesCommandRunner.make_runner_list(
+            node_list, **credentials))
+
+    return runners
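
`get_command_runners` now builds the head runner separately so it can pass the deployment name, read back from the `skypilot-deployment-name` label stamped on at creation, to `KubernetesCommandRunner`; worker runners are built as before. A sketch of the assembly pattern using a stub runner class (the real runner lives in sky/utils/command_runner.py):

```python
from typing import Dict, List, Optional, Tuple

class StubRunner:
    """Stand-in for KubernetesCommandRunner; only the fields used here."""

    def __init__(self, node: Tuple[str, str],
                 deployment: Optional[str] = None) -> None:
        self.node, self.deployment = node, deployment

def make_runners(head_id: Optional[str],
                 pod_labels: Dict[str, Dict[str, str]]) -> List[StubRunner]:
    runners: List[StubRunner] = []
    if head_id is not None:
        # Only the head runner carries the deployment name, so commands can
        # be routed via the Deployment for HA controllers.
        deployment = pod_labels[head_id].get('skypilot-deployment-name')
        runners.append(StubRunner(('default', head_id), deployment=deployment))
    runners.extend(StubRunner(('default', name))
                   for name in pod_labels if name != head_id)
    return runners

runners = make_runners(
    'c-head', {'c-head': {'skypilot-deployment-name': 'c-deployment'},
               'c-worker': {}})
assert runners[0].deployment == 'c-deployment'
```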
sky/provision/kubernetes/network.py (whitespace-only change):

@@ -78,7 +78,7 @@ def _open_ports_using_ingress(
         'https://github.com/kubernetes/ingress-nginx/blob/main/docs/deploy/index.md.'  # pylint: disable=line-too-long
     )
 
-    # Prepare service names, ports, for template rendering 
+    # Prepare service names, ports, for template rendering
     service_details = [
         (f'{cluster_name_on_cloud}--skypilot-svc--{port}', port,
          _PATH_PREFIX.format(