skypilot-nightly 1.0.0.dev20241120__py3-none-any.whl → 1.0.0.dev20241122__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +20 -15
  3. sky/backends/cloud_vm_ray_backend.py +21 -3
  4. sky/clouds/aws.py +1 -0
  5. sky/clouds/azure.py +1 -0
  6. sky/clouds/cloud.py +1 -0
  7. sky/clouds/cudo.py +1 -0
  8. sky/clouds/fluidstack.py +1 -0
  9. sky/clouds/gcp.py +1 -0
  10. sky/clouds/ibm.py +1 -0
  11. sky/clouds/kubernetes.py +45 -3
  12. sky/clouds/lambda_cloud.py +1 -0
  13. sky/clouds/oci.py +1 -0
  14. sky/clouds/paperspace.py +1 -0
  15. sky/clouds/runpod.py +1 -0
  16. sky/clouds/scp.py +1 -0
  17. sky/clouds/vsphere.py +1 -0
  18. sky/provision/instance_setup.py +80 -83
  19. sky/provision/kubernetes/instance.py +108 -76
  20. sky/provision/kubernetes/utils.py +2 -0
  21. sky/provision/oci/instance.py +4 -2
  22. sky/provision/provisioner.py +95 -19
  23. sky/resources.py +2 -1
  24. sky/skylet/constants.py +31 -21
  25. sky/templates/kubernetes-ray.yml.j2 +169 -39
  26. sky/utils/subprocess_utils.py +49 -4
  27. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/METADATA +65 -55
  28. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/RECORD +32 -32
  29. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/WHEEL +1 -1
  30. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/LICENSE +0 -0
  31. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/entry_points.txt +0 -0
  32. {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py CHANGED
@@ -20,12 +20,13 @@ from sky.utils import command_runner
  from sky.utils import common_utils
  from sky.utils import kubernetes_enums
  from sky.utils import subprocess_utils
+ from sky.utils import timeline
  from sky.utils import ux_utils
 
  POLL_INTERVAL = 2
  _TIMEOUT_FOR_POD_TERMINATION = 60  # 1 minutes
  _MAX_RETRIES = 3
- NUM_THREADS = subprocess_utils.get_parallel_threads() * 2
+ _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')
 
  logger = sky_logging.init_logger(__name__)
  TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
@@ -120,6 +121,9 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
      are recorded as events. This function retrieves those events and raises
      descriptive errors for better debugging and user feedback.
      """
+     timeout_err_msg = ('Timed out while waiting for nodes to start. '
+                        'Cluster may be out of resources or '
+                        'may be too slow to autoscale.')
      for new_node in new_nodes:
          pod = kubernetes.core_api(context).read_namespaced_pod(
              new_node.metadata.name, namespace)
@@ -148,9 +152,6 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
              if event.reason == 'FailedScheduling':
                  event_message = event.message
                  break
-         timeout_err_msg = ('Timed out while waiting for nodes to start. '
-                            'Cluster may be out of resources or '
-                            'may be too slow to autoscale.')
          if event_message is not None:
              if pod_status == 'Pending':
                  logger.info(event_message)
@@ -219,6 +220,7 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
              f'code {rc}: {command!r}\nOutput: {stdout}.')
 
 
+ @timeline.event
  def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
      """Wait for all pods to be scheduled.
 
@@ -229,6 +231,10 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
 
      If timeout is set to a negative value, this method will wait indefinitely.
      """
+     # Create a set of pod names we're waiting for
+     if not new_nodes:
+         return
+     expected_pod_names = {node.metadata.name for node in new_nodes}
      start_time = time.time()
 
      def _evaluate_timeout() -> bool:
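Several provisioning helpers in this release gain a `@timeline.event` decorator (first visible in the hunk above), which suggests the decorated calls are recorded as timed events for SkyPilot's provisioning timeline. The sketch below only illustrates that general decorator shape under that assumption; it is not the actual `sky/utils/timeline.py` implementation, and the printed output format is invented.

    # Illustrative sketch of an event-timing decorator; not the actual
    # sky.utils.timeline implementation.
    import functools
    import time
    from typing import Any, Callable


    def event(func: Callable) -> Callable:
        """Record the wall-clock duration of the wrapped call under its name."""

        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            start = time.time()
            try:
                return func(*args, **kwargs)
            finally:
                elapsed = time.time() - start
                # A real implementation would append to a structured trace file;
                # here we simply print the measurement.
                print(f'[timeline] {func.__module__}.{func.__qualname__}: '
                      f'{elapsed:.3f}s')

        return wrapper


    @event
    def _example_step() -> None:
        time.sleep(0.1)


    if __name__ == '__main__':
        _example_step()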
@@ -238,19 +244,34 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
          return time.time() - start_time < timeout
 
      while _evaluate_timeout():
-         all_pods_scheduled = True
-         for node in new_nodes:
-             # Iterate over each pod to check their status
-             pod = kubernetes.core_api(context).read_namespaced_pod(
-                 node.metadata.name, namespace)
-             if pod.status.phase == 'Pending':
+         # Get all pods in a single API call using the cluster name label
+         # which all pods in new_nodes should share
+         cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
+         pods = kubernetes.core_api(context).list_namespaced_pod(
+             namespace,
+             label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+
+         # Get the set of found pod names and check if we have all expected pods
+         found_pod_names = {pod.metadata.name for pod in pods}
+         missing_pods = expected_pod_names - found_pod_names
+         if missing_pods:
+             logger.info('Retrying waiting for pods: '
+                         f'Missing pods: {missing_pods}')
+             time.sleep(0.5)
+             continue
+
+         # Check if all pods are scheduled
+         all_scheduled = True
+         for pod in pods:
+             if (pod.metadata.name in expected_pod_names and
+                     pod.status.phase == 'Pending'):
                  # If container_statuses is None, then the pod hasn't
                  # been scheduled yet.
                  if pod.status.container_statuses is None:
-                     all_pods_scheduled = False
+                     all_scheduled = False
                      break
 
-         if all_pods_scheduled:
+         if all_scheduled:
              return
          time.sleep(1)
 
@@ -266,12 +287,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
                  f'Error: {common_utils.format_exception(e)}') from None
 
 
+ @timeline.event
  def _wait_for_pods_to_run(namespace, context, new_nodes):
      """Wait for pods and their containers to be ready.
 
      Pods may be pulling images or may be in the process of container
      creation.
      """
+     if not new_nodes:
+         return
+
+     # Create a set of pod names we're waiting for
+     expected_pod_names = {node.metadata.name for node in new_nodes}
 
      def _check_init_containers(pod):
          # Check if any of the init containers failed
@@ -299,12 +326,25 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
                      f'{pod.metadata.name}. Error details: {msg}.')
 
      while True:
-         all_pods_running = True
-         # Iterate over each pod to check their status
-         for node in new_nodes:
-             pod = kubernetes.core_api(context).read_namespaced_pod(
-                 node.metadata.name, namespace)
+         # Get all pods in a single API call
+         cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
+         all_pods = kubernetes.core_api(context).list_namespaced_pod(
+             namespace,
+             label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+
+         # Get the set of found pod names and check if we have all expected pods
+         found_pod_names = {pod.metadata.name for pod in all_pods}
+         missing_pods = expected_pod_names - found_pod_names
+         if missing_pods:
+             logger.info('Retrying running pods check: '
+                         f'Missing pods: {missing_pods}')
+             time.sleep(0.5)
+             continue
 
+         all_pods_running = True
+         for pod in all_pods:
+             if pod.metadata.name not in expected_pod_names:
+                 continue
              # Continue if pod and all the containers within the
              # pod are successfully created and running.
              if pod.status.phase == 'Running' and all(
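Both `_wait_for_pods_to_schedule` and `_wait_for_pods_to_run` now poll with a single `list_namespaced_pod` call filtered by the SkyPilot cluster-name label instead of one `read_namespaced_pod` call per pod, so each polling iteration costs one API request regardless of cluster size. Below is a minimal standalone sketch of that pattern with the official `kubernetes` Python client; the namespace, label key, and cluster name are placeholders, and a reachable cluster configured in `~/.kube/config` is assumed.

    # Sketch: poll pods by label selector instead of reading each pod by name.
    # Assumes the `kubernetes` package and a cluster reachable via kubeconfig;
    # the label key/values below are illustrative, not SkyPilot's real tags.
    from kubernetes import client, config

    NAMESPACE = 'default'           # placeholder namespace
    LABEL_KEY = 'skypilot-cluster'  # placeholder label key
    CLUSTER_NAME = 'my-cluster'     # placeholder cluster name


    def list_cluster_pods() -> dict:
        """Return {pod_name: phase} for one cluster's pods in a single API call."""
        config.load_kube_config()
        core_api = client.CoreV1Api()
        pods = core_api.list_namespaced_pod(
            NAMESPACE, label_selector=f'{LABEL_KEY}={CLUSTER_NAME}').items
        return {pod.metadata.name: pod.status.phase for pod in pods}


    if __name__ == '__main__':
        for name, phase in list_cluster_pods().items():
            print(name, phase)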
@@ -367,6 +407,7 @@ def _run_function_with_retries(func: Callable,
              raise
 
 
+ @timeline.event
  def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
      """Pre-initialization step for SkyPilot pods.
 
@@ -514,7 +555,7 @@ def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
          logger.info(f'{"-"*20}End: Pre-init in pod {pod_name!r} {"-"*20}')
 
      # Run pre_init in parallel across all new_nodes
-     subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, NUM_THREADS)
+     subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, _NUM_THREADS)
 
 
  def _label_pod(namespace: str, context: Optional[str], pod_name: str,
@@ -528,6 +569,7 @@ def _label_pod(namespace: str, context: Optional[str], pod_name: str,
          _request_timeout=kubernetes.API_TIMEOUT)
 
 
+ @timeline.event
  def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
                                          context: Optional[str]) -> Any:
      """Attempts to create a Kubernetes Pod and handle any errors.
@@ -606,6 +648,7 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
              raise e
 
 
+ @timeline.event
  def _create_pods(region: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
      """Create pods based on the config."""
@@ -627,7 +670,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
      terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
                                                      ['Terminating'])
      start_time = time.time()
-     while (len(terminating_pods) > 0 and
+     while (terminating_pods and
             time.time() - start_time < _TIMEOUT_FOR_POD_TERMINATION):
          logger.debug(f'run_instances: Found {len(terminating_pods)} '
                       'terminating pods. Waiting them to finish: '
@@ -636,7 +679,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
          terminating_pods = kubernetes_utils.filter_pods(namespace, context,
                                                          tags, ['Terminating'])
 
-     if len(terminating_pods) > 0:
+     if terminating_pods:
          # If there are still terminating pods, we force delete them.
          logger.debug(f'run_instances: Found {len(terminating_pods)} '
                       'terminating pods still in terminating state after '
@@ -695,24 +738,29 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
      created_pods = {}
      logger.debug(f'run_instances: calling create_namespaced_pod '
                   f'(count={to_start_count}).')
-     for _ in range(to_start_count):
-         if head_pod_name is None:
-             pod_spec['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
+
+     def _create_pod_thread(i: int):
+         pod_spec_copy = copy.deepcopy(pod_spec)
+         if head_pod_name is None and i == 0:
+             # First pod should be head if no head exists
+             pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
              head_selector = head_service_selector(cluster_name_on_cloud)
-             pod_spec['metadata']['labels'].update(head_selector)
-             pod_spec['metadata']['name'] = f'{cluster_name_on_cloud}-head'
+             pod_spec_copy['metadata']['labels'].update(head_selector)
+             pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
          else:
-             pod_spec['metadata']['labels'].update(constants.WORKER_NODE_TAGS)
-             pod_uuid = str(uuid.uuid4())[:4]
+             # Worker pods
+             pod_spec_copy['metadata']['labels'].update(
+                 constants.WORKER_NODE_TAGS)
+             pod_uuid = str(uuid.uuid4())[:6]
              pod_name = f'{cluster_name_on_cloud}-{pod_uuid}'
-             pod_spec['metadata']['name'] = f'{pod_name}-worker'
+             pod_spec_copy['metadata']['name'] = f'{pod_name}-worker'
              # For multi-node support, we put a soft-constraint to schedule
              # worker pods on different nodes than the head pod.
              # This is not set as a hard constraint because if different nodes
              # are not available, we still want to be able to schedule worker
              # pods on larger nodes which may be able to fit multiple SkyPilot
              # "nodes".
-             pod_spec['spec']['affinity'] = {
+             pod_spec_copy['spec']['affinity'] = {
                  'podAntiAffinity': {
                      # Set as a soft constraint
                      'preferredDuringSchedulingIgnoredDuringExecution': [{
@@ -747,17 +795,22 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                  'value': 'present',
                  'effect': 'NoSchedule'
              }
-             pod_spec['spec']['tolerations'] = [tpu_toleration]
+             pod_spec_copy['spec']['tolerations'] = [tpu_toleration]
 
-         pod = _create_namespaced_pod_with_retries(namespace, pod_spec, context)
+         return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
+                                                    context)
+
+     # Create pods in parallel
+     pods = subprocess_utils.run_in_parallel(_create_pod_thread,
+                                             range(to_start_count), _NUM_THREADS)
+
+     # Process created pods
+     for pod in pods:
          created_pods[pod.metadata.name] = pod
-         if head_pod_name is None:
+         if head_pod_name is None and pod.metadata.labels.get(
+                 constants.TAG_RAY_NODE_KIND) == 'head':
              head_pod_name = pod.metadata.name
 
-     wait_pods_dict = kubernetes_utils.filter_pods(namespace, context, tags,
-                                                   ['Pending'])
-     wait_pods = list(wait_pods_dict.values())
-
      networking_mode = network_utils.get_networking_mode(
          config.provider_config.get('networking_mode'))
      if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
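`_create_pods` now builds a deep copy of the pod spec for each pod and submits the creations through `subprocess_utils.run_in_parallel` with the cloud-specific `_NUM_THREADS` bound, rather than creating pods one at a time. The following is a rough standalone sketch of the same bounded fan-out pattern using `concurrent.futures`; `create_one`, `BASE_SPEC`, and the thread count are illustrative stand-ins, not SkyPilot APIs.

    # Sketch: fan out N independent creation calls across a bounded thread pool,
    # mirroring the parallel pod creation above. All names here are illustrative.
    import copy
    from concurrent.futures import ThreadPoolExecutor
    from typing import Any, Dict, List

    NUM_THREADS = 8  # assumed bound; the real value is derived per cloud

    BASE_SPEC: Dict[str, Any] = {'metadata': {'name': '', 'labels': {}}}


    def create_one(index: int) -> Dict[str, Any]:
        """Create one 'pod' from a deep copy of the shared spec (stub)."""
        spec = copy.deepcopy(BASE_SPEC)  # avoid mutating the shared template
        role = 'head' if index == 0 else 'worker'
        spec['metadata']['name'] = f'my-cluster-{role}-{index}'
        spec['metadata']['labels']['ray-node-type'] = role
        # A real implementation would call the Kubernetes API here.
        return spec


    def create_all(count: int) -> List[Dict[str, Any]]:
        with ThreadPoolExecutor(max_workers=NUM_THREADS) as pool:
            # map() preserves input order, so index 0 (the head) stays first.
            return list(pool.map(create_one, range(count)))


    if __name__ == '__main__':
        for pod in create_all(3):
            print(pod['metadata']['name'])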
@@ -766,52 +819,24 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
          ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
          jump_pod = kubernetes.core_api(context).read_namespaced_pod(
              ssh_jump_pod_name, namespace)
-         wait_pods.append(jump_pod)
+         pods.append(jump_pod)
      provision_timeout = provider_config['timeout']
 
      wait_str = ('indefinitely'
                  if provision_timeout < 0 else f'for {provision_timeout}s')
      logger.debug(f'run_instances: waiting {wait_str} for pods to schedule and '
-                  f'run: {list(wait_pods_dict.keys())}')
+                  f'run: {[pod.metadata.name for pod in pods]}')
 
      # Wait until the pods are scheduled and surface cause for error
      # if there is one
-     _wait_for_pods_to_schedule(namespace, context, wait_pods, provision_timeout)
+     _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
      # Wait until the pods and their containers are up and running, and
      # fail early if there is an error
      logger.debug(f'run_instances: waiting for pods to be running (pulling '
-                  f'images): {list(wait_pods_dict.keys())}')
-     _wait_for_pods_to_run(namespace, context, wait_pods)
+                  f'images): {[pod.metadata.name for pod in pods]}')
+     _wait_for_pods_to_run(namespace, context, pods)
      logger.debug(f'run_instances: all pods are scheduled and running: '
-                  f'{list(wait_pods_dict.keys())}')
-
-     running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
-                                                 ['Running'])
-     initialized_pods = kubernetes_utils.filter_pods(namespace, context, {
-         TAG_POD_INITIALIZED: 'true',
-         **tags
-     }, ['Running'])
-     uninitialized_pods = {
-         pod_name: pod
-         for pod_name, pod in running_pods.items()
-         if pod_name not in initialized_pods
-     }
-     if len(uninitialized_pods) > 0:
-         logger.debug(f'run_instances: Initializing {len(uninitialized_pods)} '
-                      f'pods: {list(uninitialized_pods.keys())}')
-         uninitialized_pods_list = list(uninitialized_pods.values())
-
-         # Run pre-init steps in the pod.
-         pre_init(namespace, context, uninitialized_pods_list)
-
-         for pod in uninitialized_pods.values():
-             _label_pod(namespace,
-                        context,
-                        pod.metadata.name,
-                        label={
-                            TAG_POD_INITIALIZED: 'true',
-                            **pod.metadata.labels
-                        })
+                  f'{[pod.metadata.name for pod in pods]}')
 
      assert head_pod_name is not None, 'head_instance_id should not be None'
      return common.ProvisionRecord(
@@ -854,11 +879,6 @@ def _terminate_node(namespace: str, context: Optional[str],
                      pod_name: str) -> None:
      """Terminate a pod."""
      logger.debug('terminate_instances: calling delete_namespaced_pod')
-     try:
-         kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context, pod_name)
-     except Exception as e:  # pylint: disable=broad-except
-         logger.warning('terminate_instances: Error occurred when analyzing '
-                        f'SSH Jump pod: {e}')
      try:
          kubernetes.core_api(context).delete_namespaced_service(
              pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
@@ -895,6 +915,18 @@ def terminate_instances(
      }
      pods = kubernetes_utils.filter_pods(namespace, context, tag_filters, None)
 
+     # Clean up the SSH jump pod if in use
+     networking_mode = network_utils.get_networking_mode(
+         provider_config.get('networking_mode'))
+     if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
+         pod_name = list(pods.keys())[0]
+         try:
+             kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
+                                                        pod_name)
+         except Exception as e:  # pylint: disable=broad-except
+             logger.warning('terminate_instances: Error occurred when analyzing '
+                            f'SSH Jump pod: {e}')
+
      def _is_head(pod) -> bool:
          return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'
 
@@ -907,7 +939,7 @@ def terminate_instances(
 
      # Run pod termination in parallel
      subprocess_utils.run_in_parallel(_terminate_pod_thread, pods.items(),
-                                      NUM_THREADS)
+                                      _NUM_THREADS)
 
 
  def get_cluster_info(
sky/provision/kubernetes/utils.py CHANGED
@@ -28,6 +28,7 @@ from sky.utils import common_utils
  from sky.utils import env_options
  from sky.utils import kubernetes_enums
  from sky.utils import schemas
+ from sky.utils import timeline
  from sky.utils import ux_utils
 
  if typing.TYPE_CHECKING:
@@ -2053,6 +2054,7 @@ def get_namespace_from_config(provider_config: Dict[str, Any]) -> str:
              get_kube_config_context_namespace(context))
 
 
+ @timeline.event
  def filter_pods(namespace: str,
                  context: Optional[str],
                  tag_filters: Dict[str, str],
sky/provision/oci/instance.py CHANGED
@@ -123,8 +123,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
      # Let's create additional new nodes (if neccessary)
      to_start_count = config.count - len(resume_instances)
      created_instances = []
+     node_config = config.node_config
      if to_start_count > 0:
-         node_config = config.node_config
          compartment = query_helper.find_compartment(region)
          vcn = query_helper.find_create_vcn_subnet(region)
 
@@ -242,10 +242,12 @@ def run_instances(region: str, cluster_name_on_cloud: str,
 
      assert head_instance_id is not None, head_instance_id
 
+     # Format: TenancyPrefix:AvailabilityDomain, e.g. bxtG:US-SANJOSE-1-AD-1
+     _, ad = str(node_config['AvailabilityDomain']).split(':', maxsplit=1)
      return common.ProvisionRecord(
          provider_name='oci',
          region=region,
-         zone=None,
+         zone=ad,
          cluster_name=cluster_name_on_cloud,
          head_instance_id=head_instance_id,
          created_instance_ids=[n['inst_id'] for n in created_instances],
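The OCI provisioner now reports the availability domain as the provision record's zone by splitting the configured `AvailabilityDomain` value on its first colon. A small worked example of that split, using the sample value from the comment in the hunk above:

    # Worked example of the availability-domain split shown above.
    availability_domain = 'bxtG:US-SANJOSE-1-AD-1'  # 'TenancyPrefix:AvailabilityDomain'
    _, ad = availability_domain.split(':', maxsplit=1)
    print(ad)  # -> 'US-SANJOSE-1-AD-1'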
sky/provision/provisioner.py CHANGED
@@ -29,6 +29,7 @@ from sky.utils import common_utils
  from sky.utils import resources_utils
  from sky.utils import rich_utils
  from sky.utils import subprocess_utils
+ from sky.utils import timeline
  from sky.utils import ux_utils
 
  # Do not use __name__ as we do not want to propagate logs to sky.provision,
@@ -343,6 +344,7 @@ def _wait_ssh_connection_indirect(ip: str,
      return True, ''
 
 
+ @timeline.event
  def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
                   ssh_credentials: Dict[str, str]):
      """Wait until SSH is ready.
@@ -432,11 +434,15 @@ def _post_provision_setup(
              ux_utils.spinner_message(
                  'Launching - Waiting for SSH access',
                  provision_logging.config.log_path)) as status:
-
-         logger.debug(
-             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
-         wait_for_ssh(cluster_info, ssh_credentials)
-         logger.debug(f'SSH Connection ready for {cluster_name!r}')
+         # If on Kubernetes, skip SSH check since the pods are guaranteed to be
+         # ready by the provisioner, and we use kubectl instead of SSH to run the
+         # commands and rsync on the pods. SSH will still be ready after a while
+         # for the users to SSH into the pod.
+         if cloud_name.lower() != 'kubernetes':
+             logger.debug(
+                 f'\nWaiting for SSH to be available for {cluster_name!r} ...')
+             wait_for_ssh(cluster_info, ssh_credentials)
+             logger.debug(f'SSH Connection ready for {cluster_name!r}')
          vm_str = 'Instance' if cloud_name.lower() != 'kubernetes' else 'Pod'
          plural = '' if len(cluster_info.instances) == 1 else 's'
          verb = 'is' if len(cluster_info.instances) == 1 else 'are'
@@ -496,31 +502,94 @@ def _post_provision_setup(
              **ssh_credentials)
          head_runner = runners[0]
 
-         status.update(
-             runtime_preparation_str.format(step=3, step_name='runtime'))
-         full_ray_setup = True
-         ray_port = constants.SKY_REMOTE_RAY_PORT
-         if not provision_record.is_instance_just_booted(
-                 head_instance.instance_id):
+         def is_ray_cluster_healthy(ray_status_output: str,
+                                    expected_num_nodes: int) -> bool:
+             """Parse the output of `ray status` to get #active nodes.
+
+             The output of `ray status` looks like:
+             Node status
+             ---------------------------------------------------------------
+             Active:
+              1 node_291a8b849439ad6186387c35dc76dc43f9058108f09e8b68108cf9ec
+              1 node_0945fbaaa7f0b15a19d2fd3dc48f3a1e2d7c97e4a50ca965f67acbfd
+             Pending:
+              (no pending nodes)
+             Recent failures:
+              (no failures)
+             """
+             start = ray_status_output.find('Active:')
+             end = ray_status_output.find('Pending:', start)
+             if start == -1 or end == -1:
+                 return False
+             num_active_nodes = 0
+             for line in ray_status_output[start:end].split('\n'):
+                 if line.strip() and not line.startswith('Active:'):
+                     num_active_nodes += 1
+             return num_active_nodes == expected_num_nodes
+
+         def check_ray_port_and_cluster_healthy() -> Tuple[int, bool, bool]:
+             head_ray_needs_restart = True
+             ray_cluster_healthy = False
+             ray_port = constants.SKY_REMOTE_RAY_PORT
+
              # Check if head node Ray is alive
              returncode, stdout, _ = head_runner.run(
                  instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
                  stream_logs=False,
                  require_outputs=True)
-             if returncode:
-                 logger.debug('Ray cluster on head is not up. Restarting...')
-             else:
-                 logger.debug('Ray cluster on head is up.')
+             if not returncode:
                  ray_port = common_utils.decode_payload(stdout)['ray_port']
-             full_ray_setup = bool(returncode)
+                 logger.debug(f'Ray cluster on head is up with port {ray_port}.')
+
+             head_ray_needs_restart = bool(returncode)
+             # This is a best effort check to see if the ray cluster has expected
+             # number of nodes connected.
+             ray_cluster_healthy = (not head_ray_needs_restart and
+                                    is_ray_cluster_healthy(
+                                        stdout, cluster_info.num_instances))
+             return ray_port, ray_cluster_healthy, head_ray_needs_restart
+
+         status.update(
+             runtime_preparation_str.format(step=3, step_name='runtime'))
+
+         ray_port = constants.SKY_REMOTE_RAY_PORT
+         head_ray_needs_restart = True
+         ray_cluster_healthy = False
+         if (not provision_record.is_instance_just_booted(
+                 head_instance.instance_id)):
+             # Check if head node Ray is alive
+             (ray_port, ray_cluster_healthy,
+              head_ray_needs_restart) = check_ray_port_and_cluster_healthy()
+         elif cloud_name.lower() == 'kubernetes':
+             timeout = 90  # 1.5-min maximum timeout
+             start = time.time()
+             while True:
+                 # Wait until Ray cluster is ready
+                 (ray_port, ray_cluster_healthy,
+                  head_ray_needs_restart) = check_ray_port_and_cluster_healthy()
+                 if ray_cluster_healthy:
+                     logger.debug('Ray cluster is ready. Skip head and worker '
+                                  'node ray cluster setup.')
+                     break
+                 if time.time() - start > timeout:
+                     # In most cases, the ray cluster will be ready after a few
+                     # seconds. Trigger ray start on head or worker nodes to be
+                     # safe, if the ray cluster is not ready after timeout.
+                     break
+                 logger.debug('Ray cluster is not ready yet, waiting for the '
+                              'async setup to complete...')
+                 time.sleep(1)
 
-         if full_ray_setup:
+         if head_ray_needs_restart:
              logger.debug('Starting Ray on the entire cluster.')
              instance_setup.start_ray_on_head_node(
                  cluster_name.name_on_cloud,
                  custom_resource=custom_resource,
                  cluster_info=cluster_info,
                  ssh_credentials=ssh_credentials)
+         else:
+             logger.debug('Ray cluster on head is ready. Skip starting ray '
+                          'cluster on head node.')
 
          # NOTE: We have to check all worker nodes to make sure they are all
          # healthy, otherwise we can only start Ray on newly started worker
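The new `is_ray_cluster_healthy` helper decides whether head and worker Ray setup can be skipped by counting the entries between `Active:` and `Pending:` in `ray status` output. Below is a standalone restatement of that counting logic, run against the sample output quoted in its docstring; it is for illustration only, not the function as it lives in the module.

    # Restatement of the active-node counting shown in the hunk above,
    # applied to the sample `ray status` output from its docstring.
    SAMPLE_RAY_STATUS = """Node status
    ---------------------------------------------------------------
    Active:
     1 node_291a8b849439ad6186387c35dc76dc43f9058108f09e8b68108cf9ec
     1 node_0945fbaaa7f0b15a19d2fd3dc48f3a1e2d7c97e4a50ca965f67acbfd
    Pending:
     (no pending nodes)
    Recent failures:
     (no failures)
    """


    def count_active_nodes(ray_status_output: str) -> int:
        start = ray_status_output.find('Active:')
        end = ray_status_output.find('Pending:', start)
        if start == -1 or end == -1:
            return 0
        return sum(1
                   for line in ray_status_output[start:end].split('\n')
                   if line.strip() and not line.startswith('Active:'))


    if __name__ == '__main__':
        active = count_active_nodes(SAMPLE_RAY_STATUS)
        print(active)       # -> 2
        print(active == 2)  # healthy for a 2-node cluster -> True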
@@ -531,10 +600,13 @@ def _post_provision_setup(
          #     if provision_record.is_instance_just_booted(inst.instance_id):
          #         worker_ips.append(inst.public_ip)
 
-         if cluster_info.num_instances > 1:
+         # We don't need to restart ray on worker nodes if the ray cluster is
+         # already healthy, i.e. the head node has expected number of nodes
+         # connected to the ray cluster.
+         if cluster_info.num_instances > 1 and not ray_cluster_healthy:
              instance_setup.start_ray_on_worker_nodes(
                  cluster_name.name_on_cloud,
-                 no_restart=not full_ray_setup,
+                 no_restart=not head_ray_needs_restart,
                  custom_resource=custom_resource,
                  # Pass the ray_port to worker nodes for backward compatibility
                  # as in some existing clusters the ray_port is not dumped with
@@ -543,6 +615,9 @@
                  ray_port=ray_port,
                  cluster_info=cluster_info,
                  ssh_credentials=ssh_credentials)
+         elif ray_cluster_healthy:
+             logger.debug('Ray cluster is ready. Skip starting ray cluster on '
+                          'worker nodes.')
 
          instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
                                                   cluster_info, ssh_credentials)
@@ -553,6 +628,7 @@ def _post_provision_setup(
      return cluster_info
 
 
+ @timeline.event
  def post_provision_runtime_setup(
          cloud_name: str, cluster_name: resources_utils.ClusterName,
          cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
sky/resources.py CHANGED
@@ -1041,6 +1041,7 @@ class Resources:
      def make_deploy_variables(self, cluster_name: resources_utils.ClusterName,
                                region: clouds.Region,
                                zones: Optional[List[clouds.Zone]],
+                               num_nodes: int,
                                dryrun: bool) -> Dict[str, Optional[str]]:
          """Converts planned sky.Resources to resource variables.
 
@@ -1062,7 +1063,7 @@ class Resources:
 
          # Cloud specific variables
          cloud_specific_variables = self.cloud.make_deploy_resources_variables(
-             self, cluster_name, region, zones, dryrun)
+             self, cluster_name, region, zones, num_nodes, dryrun)
 
          # Docker run options
          docker_run_options = skypilot_config.get_nested(
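Alongside this change, every cloud's `make_deploy_resources_variables` gains a node-count parameter (the one-line additions in the `sky/clouds/*.py` files listed above), and `Resources.make_deploy_variables` forwards `num_nodes` to it. As a hedged illustration of that general pattern only (the class and method bodies below are invented, not SkyPilot's), threading a new argument through an abstract interface and its implementations looks like this:

    # Illustrative sketch of threading a new `num_nodes` argument through an
    # abstract interface and its implementations; class names are invented.
    from abc import ABC, abstractmethod
    from typing import Dict, List, Optional


    class BaseCloud(ABC):

        @abstractmethod
        def make_deploy_resources_variables(
                self, region: str, zones: Optional[List[str]],
                num_nodes: int) -> Dict[str, Optional[str]]:
            """Return template variables for deployment; now node-count aware."""


    class ExampleCloud(BaseCloud):

        def make_deploy_resources_variables(
                self, region: str, zones: Optional[List[str]],
                num_nodes: int) -> Dict[str, Optional[str]]:
            # A provider could size shared resources (e.g. a placement group or
            # a headless service) based on how many nodes are being launched.
            return {'region': region, 'num_nodes': str(num_nodes)}


    if __name__ == '__main__':
        print(ExampleCloud().make_deploy_resources_variables('us-east-1', None, 3))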