skypilot-nightly 1.0.0.dev20241114__py3-none-any.whl → 1.0.0.dev20241116__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +142 -74
  3. sky/backends/cloud_vm_ray_backend.py +15 -11
  4. sky/cli.py +15 -4
  5. sky/clouds/aws.py +1 -0
  6. sky/clouds/oci.py +0 -2
  7. sky/clouds/service_catalog/aws_catalog.py +2 -0
  8. sky/clouds/utils/oci_utils.py +5 -0
  9. sky/execution.py +43 -22
  10. sky/global_user_state.py +36 -16
  11. sky/jobs/core.py +0 -1
  12. sky/jobs/utils.py +4 -3
  13. sky/provision/kubernetes/utils.py +2 -0
  14. sky/provision/oci/instance.py +12 -11
  15. sky/provision/oci/query_utils.py +212 -6
  16. sky/serve/core.py +1 -0
  17. sky/serve/serve_utils.py +35 -30
  18. sky/skylet/constants.py +1 -1
  19. sky/skylet/job_lib.py +249 -138
  20. sky/skylet/log_lib.py +1 -34
  21. sky/skylet/subprocess_daemon.py +33 -13
  22. sky/utils/controller_utils.py +10 -9
  23. sky/utils/schemas.py +1 -0
  24. sky/utils/subprocess_utils.py +50 -0
  25. sky/utils/timeline.py +2 -4
  26. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/METADATA +1 -1
  27. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/RECORD +31 -31
  28. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/LICENSE +0 -0
  29. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/WHEEL +0 -0
  30. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/entry_points.txt +0 -0
  31. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -60,7 +60,8 @@ def create_table(cursor, conn):
60
60
  owner TEXT DEFAULT null,
61
61
  cluster_hash TEXT DEFAULT null,
62
62
  storage_mounts_metadata BLOB DEFAULT null,
63
- cluster_ever_up INTEGER DEFAULT 0)""")
63
+ cluster_ever_up INTEGER DEFAULT 0,
64
+ status_updated_at INTEGER DEFAULT null)""")
64
65
 
65
66
  # Table for Cluster History
66
67
  # usage_intervals: List[Tuple[int, int]]
@@ -130,6 +131,10 @@ def create_table(cursor, conn):
130
131
  # clusters were never really UP, setting it to 1 means they won't be
131
132
  # auto-deleted during any failover.
132
133
  value_to_replace_existing_entries=1)
134
+
135
+ db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
136
+ 'INTEGER DEFAULT null')
137
+
133
138
  conn.commit()
134
139
 
135
140
 
@@ -159,6 +164,7 @@ def add_or_update_cluster(cluster_name: str,
159
164
  status = status_lib.ClusterStatus.INIT
160
165
  if ready:
161
166
  status = status_lib.ClusterStatus.UP
167
+ status_updated_at = int(time.time())
162
168
 
163
169
  # TODO (sumanth): Cluster history table will have multiple entries
164
170
  # when the cluster failover through multiple regions (one entry per region).
@@ -191,7 +197,7 @@ def add_or_update_cluster(cluster_name: str,
191
197
  # specified.
192
198
  '(name, launched_at, handle, last_use, status, '
193
199
  'autostop, to_down, metadata, owner, cluster_hash, '
194
- 'storage_mounts_metadata, cluster_ever_up) '
200
+ 'storage_mounts_metadata, cluster_ever_up, status_updated_at) '
195
201
  'VALUES ('
196
202
  # name
197
203
  '?, '
@@ -228,7 +234,9 @@ def add_or_update_cluster(cluster_name: str,
228
234
  'COALESCE('
229
235
  '(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null), '
230
236
  # cluster_ever_up
231
- '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?)'
237
+ '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?),'
238
+ # status_updated_at
239
+ '?'
232
240
  ')',
233
241
  (
234
242
  # name
@@ -260,6 +268,8 @@ def add_or_update_cluster(cluster_name: str,
260
268
  # cluster_ever_up
261
269
  cluster_name,
262
270
  int(ready),
271
+ # status_updated_at
272
+ status_updated_at,
263
273
  ))
264
274
 
265
275
  launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
@@ -330,11 +340,13 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
330
340
  # stopped VM, which leads to timeout.
331
341
  if hasattr(handle, 'stable_internal_external_ips'):
332
342
  handle.stable_internal_external_ips = None
343
+ current_time = int(time.time())
333
344
  _DB.cursor.execute(
334
- 'UPDATE clusters SET handle=(?), status=(?) '
335
- 'WHERE name=(?)', (
345
+ 'UPDATE clusters SET handle=(?), status=(?), '
346
+ 'status_updated_at=(?) WHERE name=(?)', (
336
347
  pickle.dumps(handle),
337
348
  status_lib.ClusterStatus.STOPPED.value,
349
+ current_time,
338
350
  cluster_name,
339
351
  ))
340
352
  _DB.conn.commit()
@@ -359,10 +371,10 @@ def get_glob_cluster_names(cluster_name: str) -> List[str]:
359
371
 
360
372
  def set_cluster_status(cluster_name: str,
361
373
  status: status_lib.ClusterStatus) -> None:
362
- _DB.cursor.execute('UPDATE clusters SET status=(?) WHERE name=(?)', (
363
- status.value,
364
- cluster_name,
365
- ))
374
+ current_time = int(time.time())
375
+ _DB.cursor.execute(
376
+ 'UPDATE clusters SET status=(?), status_updated_at=(?) WHERE name=(?)',
377
+ (status.value, current_time, cluster_name))
366
378
  count = _DB.cursor.rowcount
367
379
  _DB.conn.commit()
368
380
  assert count <= 1, count
@@ -570,15 +582,18 @@ def _load_storage_mounts_metadata(
570
582
 
571
583
  def get_cluster_from_name(
572
584
  cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
573
- rows = _DB.cursor.execute('SELECT * FROM clusters WHERE name=(?)',
574
- (cluster_name,)).fetchall()
585
+ rows = _DB.cursor.execute(
586
+ 'SELECT name, launched_at, handle, last_use, status, autostop, '
587
+ 'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
588
+ 'cluster_ever_up, status_updated_at FROM clusters WHERE name=(?)',
589
+ (cluster_name,)).fetchall()
575
590
  for row in rows:
576
591
  # Explicitly specify the number of fields to unpack, so that
577
592
  # we can add new fields to the database in the future without
578
593
  # breaking the previous code.
579
594
  (name, launched_at, handle, last_use, status, autostop, metadata,
580
- to_down, owner, cluster_hash, storage_mounts_metadata,
581
- cluster_ever_up) = row[:12]
595
+ to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
596
+ status_updated_at) = row[:13]
582
597
  # TODO: use namedtuple instead of dict
583
598
  record = {
584
599
  'name': name,
@@ -594,6 +609,7 @@ def get_cluster_from_name(
594
609
  'storage_mounts_metadata':
595
610
  _load_storage_mounts_metadata(storage_mounts_metadata),
596
611
  'cluster_ever_up': bool(cluster_ever_up),
612
+ 'status_updated_at': status_updated_at,
597
613
  }
598
614
  return record
599
615
  return None
@@ -601,12 +617,15 @@ def get_cluster_from_name(
601
617
 
602
618
  def get_clusters() -> List[Dict[str, Any]]:
603
619
  rows = _DB.cursor.execute(
604
- 'select * from clusters order by launched_at desc').fetchall()
620
+ 'select name, launched_at, handle, last_use, status, autostop, '
621
+ 'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
622
+ 'cluster_ever_up, status_updated_at from clusters '
623
+ 'order by launched_at desc').fetchall()
605
624
  records = []
606
625
  for row in rows:
607
626
  (name, launched_at, handle, last_use, status, autostop, metadata,
608
- to_down, owner, cluster_hash, storage_mounts_metadata,
609
- cluster_ever_up) = row[:12]
627
+ to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
628
+ status_updated_at) = row[:13]
610
629
  # TODO: use namedtuple instead of dict
611
630
  record = {
612
631
  'name': name,
@@ -622,6 +641,7 @@ def get_clusters() -> List[Dict[str, Any]]:
622
641
  'storage_mounts_metadata':
623
642
  _load_storage_mounts_metadata(storage_mounts_metadata),
624
643
  'cluster_ever_up': bool(cluster_ever_up),
644
+ 'status_updated_at': status_updated_at,
625
645
  }
626
646
 
627
647
  records.append(record)
sky/jobs/core.py CHANGED
@@ -133,7 +133,6 @@ def launch(
133
133
  controller_task.set_resources(controller_resources)
134
134
 
135
135
  controller_task.managed_job_dag = dag
136
- assert len(controller_task.resources) == 1, controller_task
137
136
 
138
137
  sky_logging.print(
139
138
  f'{colorama.Fore.YELLOW}'
sky/jobs/utils.py CHANGED
@@ -85,7 +85,8 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
85
85
  cluster_name: str) -> Optional['job_lib.JobStatus']:
86
86
  """Check the status of the job running on a managed job cluster.
87
87
 
88
- It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_SETUP or CANCELLED.
88
+ It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
89
+ FAILED_SETUP or CANCELLED.
89
90
  """
90
91
  handle = global_user_state.get_handle_from_cluster_name(cluster_name)
91
92
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
@@ -866,7 +867,7 @@ class ManagedJobCodeGen:
866
867
  code += inspect.getsource(stream_logs)
867
868
  code += textwrap.dedent(f"""\
868
869
 
869
- msg = stream_logs({job_id!r}, {job_name!r},
870
+ msg = stream_logs({job_id!r}, {job_name!r},
870
871
  follow={follow}, controller={controller})
871
872
  print(msg, flush=True)
872
873
  """)
@@ -883,7 +884,7 @@ class ManagedJobCodeGen:
883
884
  resources_str = backend_utils.get_task_resources_str(
884
885
  task, is_managed_job=True)
885
886
  code += textwrap.dedent(f"""\
886
- managed_job_state.set_pending({job_id}, {task_id},
887
+ managed_job_state.set_pending({job_id}, {task_id},
887
888
  {task.name!r}, {resources_str!r})
888
889
  """)
889
890
  return cls._build(code)
@@ -1693,6 +1693,8 @@ def merge_dicts(source: Dict[Any, Any], destination: Dict[Any, Any]):
1693
1693
  else:
1694
1694
  destination[key].extend(value)
1695
1695
  else:
1696
+ if destination is None:
1697
+ destination = {}
1696
1698
  destination[key] = value
1697
1699
 
1698
1700
 
@@ -2,6 +2,8 @@
2
2
 
3
3
  History:
4
4
  - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
5
+ - Hysun He (hysun.he@oracle.com) @ Nov.13, 2024: Implement open_ports
6
+ and cleanup_ports for supporting SkyServe.
5
7
  """
6
8
 
7
9
  import copy
@@ -292,11 +294,11 @@ def open_ports(
292
294
  provider_config: Optional[Dict[str, Any]] = None,
293
295
  ) -> None:
294
296
  """Open ports for inbound traffic."""
295
- # OCI ports in security groups are opened while creating the new
296
- # VCN (skypilot_vcn). If user configure to use existing VCN, it is
297
- # intended to let user to manage the ports instead of automatically
298
- # opening ports here.
299
- del cluster_name_on_cloud, ports, provider_config
297
+ assert provider_config is not None, cluster_name_on_cloud
298
+ region = provider_config['region']
299
+ query_helper.create_nsg_rules(region=region,
300
+ cluster_name=cluster_name_on_cloud,
301
+ ports=ports)
300
302
 
301
303
 
302
304
  @query_utils.debug_enabled(logger)
@@ -306,12 +308,11 @@ def cleanup_ports(
306
308
  provider_config: Optional[Dict[str, Any]] = None,
307
309
  ) -> None:
308
310
  """Delete any opened ports."""
309
- del cluster_name_on_cloud, ports, provider_config
310
- # OCI ports in security groups are opened while creating the new
311
- # VCN (skypilot_vcn). The VCN will only be created at the first
312
- # time when it is not existed. We'll not automatically delete the
313
- # VCN while teardown clusters. it is intended to let user to decide
314
- # to delete the VCN or not from OCI console, for example.
311
+ assert provider_config is not None, cluster_name_on_cloud
312
+ region = provider_config['region']
313
+ del ports
314
+ query_helper.remove_cluster_nsg(region=region,
315
+ cluster_name=cluster_name_on_cloud)
315
316
 
316
317
 
317
318
  @query_utils.debug_enabled(logger)
@@ -5,6 +5,8 @@ History:
5
5
  migrated from the old provisioning API.
6
6
  - Hysun He (hysun.he@oracle.com) @ Oct.18, 2024: Enhancement.
7
7
  find_compartment: allow search subtree when find a compartment.
8
+ - Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add methods to
9
+ Add/remove security rules: create_nsg_rules & remove_nsg
8
10
  """
9
11
  from datetime import datetime
10
12
  import functools
@@ -13,12 +15,15 @@ import re
13
15
  import time
14
16
  import traceback
15
17
  import typing
16
- from typing import Optional
18
+ from typing import List, Optional, Tuple
17
19
 
20
+ from sky import exceptions
18
21
  from sky import sky_logging
19
22
  from sky.adaptors import common as adaptors_common
20
23
  from sky.adaptors import oci as oci_adaptor
21
24
  from sky.clouds.utils import oci_utils
25
+ from sky.provision import constants
26
+ from sky.utils import resources_utils
22
27
 
23
28
  if typing.TYPE_CHECKING:
24
29
  import pandas as pd
@@ -81,19 +86,33 @@ class QueryHelper:
81
86
  return result_set
82
87
 
83
88
  @classmethod
89
+ @debug_enabled(logger)
84
90
  def terminate_instances_by_tags(cls, tag_filters, region) -> int:
85
91
  logger.debug(f'Terminate instance by tags: {tag_filters}')
92
+
93
+ cluster_name = tag_filters[constants.TAG_RAY_CLUSTER_NAME]
94
+ nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
95
+ cluster_name=cluster_name)
96
+ nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=False)
97
+
98
+ core_client = oci_adaptor.get_core_client(
99
+ region, oci_utils.oci_config.get_profile())
100
+
86
101
  insts = cls.query_instances_by_tags(tag_filters, region)
87
102
  fail_count = 0
88
103
  for inst in insts:
89
104
  inst_id = inst.identifier
90
- logger.debug(f'Got instance(to be terminated): {inst_id}')
105
+ logger.debug(f'Terminating instance {inst_id}')
91
106
 
92
107
  try:
93
- oci_adaptor.get_core_client(
94
- region,
95
- oci_utils.oci_config.get_profile()).terminate_instance(
96
- inst_id)
108
+ # Release the NSG reference so that the NSG can be
109
+ # deleted without waiting the instance being terminated.
110
+ if nsg_id is not None:
111
+ cls.detach_nsg(region, inst, nsg_id)
112
+
113
+ # Terminate the instance
114
+ core_client.terminate_instance(inst_id)
115
+
97
116
  except oci_adaptor.oci.exceptions.ServiceError as e:
98
117
  fail_count += 1
99
118
  logger.error(f'Terminate instance failed: {str(e)}\n: {inst}')
@@ -468,5 +487,192 @@ class QueryHelper:
468
487
  logger.error(
469
488
  f'Delete VCN {oci_utils.oci_config.VCN_NAME} Error: {str(e)}')
470
489
 
490
+ @classmethod
491
+ @debug_enabled(logger)
492
+ def find_nsg(cls, region: str, nsg_name: str,
493
+ create_if_not_exist: bool) -> Optional[str]:
494
+ net_client = oci_adaptor.get_net_client(
495
+ region, oci_utils.oci_config.get_profile())
496
+
497
+ compartment = cls.find_compartment(region)
498
+
499
+ list_vcns_resp = net_client.list_vcns(
500
+ compartment_id=compartment,
501
+ display_name=oci_utils.oci_config.VCN_NAME,
502
+ lifecycle_state='AVAILABLE',
503
+ )
504
+
505
+ if not list_vcns_resp:
506
+ raise exceptions.ResourcesUnavailableError(
507
+ 'The VCN is not available')
508
+
509
+ # Get the primary vnic.
510
+ assert len(list_vcns_resp.data) > 0
511
+ vcn = list_vcns_resp.data[0]
512
+
513
+ list_nsg_resp = net_client.list_network_security_groups(
514
+ compartment_id=compartment,
515
+ vcn_id=vcn.id,
516
+ limit=1,
517
+ display_name=nsg_name,
518
+ )
519
+
520
+ nsgs = list_nsg_resp.data
521
+ if nsgs:
522
+ assert len(nsgs) == 1
523
+ return nsgs[0].id
524
+ elif not create_if_not_exist:
525
+ return None
526
+
527
+ # Continue to create new NSG if not exists
528
+ create_nsg_resp = net_client.create_network_security_group(
529
+ create_network_security_group_details=oci_adaptor.oci.core.models.
530
+ CreateNetworkSecurityGroupDetails(
531
+ compartment_id=compartment,
532
+ vcn_id=vcn.id,
533
+ display_name=nsg_name,
534
+ ))
535
+ get_nsg_resp = net_client.get_network_security_group(
536
+ network_security_group_id=create_nsg_resp.data.id)
537
+ oci_adaptor.oci.wait_until(
538
+ net_client,
539
+ get_nsg_resp,
540
+ 'lifecycle_state',
541
+ 'AVAILABLE',
542
+ )
543
+
544
+ return get_nsg_resp.data.id
545
+
546
+ @classmethod
547
+ def get_range_min_max(cls, port_range: str) -> Tuple[int, int]:
548
+ range_list = port_range.split('-')
549
+ if len(range_list) == 1:
550
+ return (int(range_list[0]), int(range_list[0]))
551
+ from_port, to_port = range_list
552
+ return (int(from_port), int(to_port))
553
+
554
+ @classmethod
555
+ @debug_enabled(logger)
556
+ def create_nsg_rules(cls, region: str, cluster_name: str,
557
+ ports: List[str]) -> None:
558
+ """ Create per-cluster NSG with ingress rules """
559
+ if not ports:
560
+ return
561
+
562
+ net_client = oci_adaptor.get_net_client(
563
+ region, oci_utils.oci_config.get_profile())
564
+
565
+ nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
566
+ cluster_name=cluster_name)
567
+ nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=True)
568
+
569
+ filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name}
570
+ insts = query_helper.query_instances_by_tags(filters, region)
571
+ for inst in insts:
572
+ vnic = cls.get_instance_primary_vnic(
573
+ region=region,
574
+ inst_info={
575
+ 'inst_id': inst.identifier,
576
+ 'ad': inst.availability_domain,
577
+ 'compartment': inst.compartment_id,
578
+ })
579
+ nsg_ids = vnic.nsg_ids
580
+ if not nsg_ids:
581
+ net_client.update_vnic(
582
+ vnic_id=vnic.id,
583
+ update_vnic_details=oci_adaptor.oci.core.models.
584
+ UpdateVnicDetails(nsg_ids=[nsg_id],
585
+ skip_source_dest_check=False),
586
+ )
587
+
588
+ # pylint: disable=line-too-long
589
+ list_nsg_rules_resp = net_client.list_network_security_group_security_rules(
590
+ network_security_group_id=nsg_id,
591
+ direction='INGRESS',
592
+ sort_by='TIMECREATED',
593
+ sort_order='DESC',
594
+ )
595
+
596
+ ingress_rules: List = list_nsg_rules_resp.data
597
+ existing_port_ranges: List[str] = []
598
+ for r in ingress_rules:
599
+ if r.tcp_options:
600
+ options_range = r.tcp_options.destination_port_range
601
+ rule_port_range = f'{options_range.min}-{options_range.max}'
602
+ existing_port_ranges.append(rule_port_range)
603
+
604
+ new_ports = resources_utils.port_ranges_to_set(ports)
605
+ existing_ports = resources_utils.port_ranges_to_set(
606
+ existing_port_ranges)
607
+ if new_ports.issubset(existing_ports):
608
+ # ports are already contained in the existing rules; nothing to add.
609
+ return
610
+
611
+ # Determine the ports to be added, without overlapping.
612
+ ports_to_open = new_ports - existing_ports
613
+ port_ranges_to_open = resources_utils.port_set_to_ranges(ports_to_open)
614
+
615
+ new_rules = []
616
+ for port_range in port_ranges_to_open:
617
+ port_range_min, port_range_max = cls.get_range_min_max(port_range)
618
+ new_rules.append(
619
+ oci_adaptor.oci.core.models.AddSecurityRuleDetails(
620
+ direction='INGRESS',
621
+ protocol='6',
622
+ is_stateless=False,
623
+ source=oci_utils.oci_config.VCN_CIDR_INTERNET,
624
+ source_type='CIDR_BLOCK',
625
+ tcp_options=oci_adaptor.oci.core.models.TcpOptions(
626
+ destination_port_range=oci_adaptor.oci.core.models.
627
+ PortRange(min=port_range_min, max=port_range_max),),
628
+ description=oci_utils.oci_config.SERVICE_PORT_RULE_TAG,
629
+ ))
630
+
631
+ net_client.add_network_security_group_security_rules(
632
+ network_security_group_id=nsg_id,
633
+ add_network_security_group_security_rules_details=oci_adaptor.oci.
634
+ core.models.AddNetworkSecurityGroupSecurityRulesDetails(
635
+ security_rules=new_rules),
636
+ )
637
+
638
+ @classmethod
639
+ @debug_enabled(logger)
640
+ def detach_nsg(cls, region: str, inst, nsg_id: Optional[str]) -> None:
641
+ if nsg_id is None:
642
+ return
643
+
644
+ vnic = cls.get_instance_primary_vnic(
645
+ region=region,
646
+ inst_info={
647
+ 'inst_id': inst.identifier,
648
+ 'ad': inst.availability_domain,
649
+ 'compartment': inst.compartment_id,
650
+ })
651
+
652
+ # Detach the NSG before removing it.
653
+ oci_adaptor.get_net_client(region, oci_utils.oci_config.get_profile(
654
+ )).update_vnic(
655
+ vnic_id=vnic.id,
656
+ update_vnic_details=oci_adaptor.oci.core.models.UpdateVnicDetails(
657
+ nsg_ids=[], skip_source_dest_check=False),
658
+ )
659
+
660
+ @classmethod
661
+ @debug_enabled(logger)
662
+ def remove_cluster_nsg(cls, region: str, cluster_name: str) -> None:
663
+ """ Remove NSG of the cluster """
664
+ net_client = oci_adaptor.get_net_client(
665
+ region, oci_utils.oci_config.get_profile())
666
+
667
+ nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
668
+ cluster_name=cluster_name)
669
+ nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=False)
670
+ if nsg_id is None:
671
+ return
672
+
673
+ # Delete the NSG
674
+ net_client.delete_network_security_group(
675
+ network_security_group_id=nsg_id)
676
+
471
677
 
472
678
  query_helper = QueryHelper()
sky/serve/core.py CHANGED
@@ -701,6 +701,7 @@ def tail_logs(
701
701
  with ux_utils.print_exception_no_traceback():
702
702
  raise ValueError(f'`target` must be a string or '
703
703
  f'sky.serve.ServiceComponent, got {type(target)}.')
704
+
704
705
  if target == serve_utils.ServiceComponent.REPLICA:
705
706
  if replica_id is None:
706
707
  with ux_utils.print_exception_no_traceback():
sky/serve/serve_utils.py CHANGED
@@ -46,8 +46,14 @@ NUM_SERVICE_THRESHOLD = (_SYSTEM_MEMORY_GB //
46
46
  constants.CONTROLLER_MEMORY_USAGE_GB)
47
47
  _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
48
48
 
49
- _SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*'
50
- _SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*'
49
+ # NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
50
+ # and always appear after a space. Be careful when changing UX as this
51
+ # assumption is used to expand some log files while ignoring others.
52
+ _SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
53
+ _SKYPILOT_PROVISION_LOG_PATTERN = (
54
+ fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
55
+ _SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
56
+
51
57
  # TODO(tian): Find all existing replica id and print here.
52
58
  _FAILED_TO_FIND_REPLICA_MSG = (
53
59
  f'{colorama.Fore.RED}Failed to find replica '
@@ -591,7 +597,7 @@ def get_latest_version_with_min_replicas(
591
597
  return active_versions[-1] if active_versions else None
592
598
 
593
599
 
594
- def _follow_replica_logs(
600
+ def _follow_logs_with_provision_expanding(
595
601
  file: TextIO,
596
602
  cluster_name: str,
597
603
  *,
@@ -599,7 +605,7 @@ def _follow_replica_logs(
599
605
  stop_on_eof: bool = False,
600
606
  idle_timeout_seconds: Optional[int] = None,
601
607
  ) -> Iterator[str]:
602
- """Follows logs for a replica, handling nested log files.
608
+ """Follows logs and expands any provision.log references found.
603
609
 
604
610
  Args:
605
611
  file: Log file to read from.
@@ -610,7 +616,7 @@ def _follow_replica_logs(
610
616
  new content.
611
617
 
612
618
  Yields:
613
- Log lines from the main file and any nested log files.
619
+ Log lines, including expanded content from referenced provision logs.
614
620
  """
615
621
 
616
622
  def cluster_is_up() -> bool:
@@ -620,36 +626,35 @@ def _follow_replica_logs(
620
626
  return cluster_record['status'] == status_lib.ClusterStatus.UP
621
627
 
622
628
  def process_line(line: str) -> Iterator[str]:
623
- # Tailing detailed progress for user. All logs in skypilot is
624
- # of format `To view detailed progress: tail -n100 -f *.log`.
625
- # Check if the line is directing users to view logs
629
+ # The line might be directing users to view logs, like
630
+ # `✓ Cluster launched: new-http. View logs at: *.log`
631
+ # We should tail the detailed logs for user.
626
632
  provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
627
- other_log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
633
+ log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
628
634
 
629
635
  if provision_log_prompt is not None:
630
636
  nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
631
- with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
632
- # We still exit if more than 10 seconds without new content
633
- # to avoid any internal bug that causes the launch to fail
634
- # while cluster status remains INIT.
635
- # Originally, we output the next line first before printing
636
- # the launching logs. Since the next line is always
637
- # `Launching on <cloud> <region> (<zone>)`, we output it first
638
- # to indicate the process is starting.
639
- # TODO(andyl): After refactor #4323, the above logic is broken,
640
- # but coincidentally with the new UX 3.0, the `Cluster launched`
641
- # message is printed first, making the output appear correct.
642
- # Explaining this since it's technically a breaking change
643
- # for this refactor PR #4323. Will remove soon in a fix PR
644
- # for adapting the serve.follow_logs to the new UX.
645
- yield from _follow_replica_logs(f,
646
- cluster_name,
647
- should_stop=cluster_is_up,
648
- stop_on_eof=stop_on_eof,
649
- idle_timeout_seconds=10)
637
+
638
+ try:
639
+ with open(nested_log_path, 'r', newline='',
640
+ encoding='utf-8') as f:
641
+ # We still exit if more than 10 seconds without new content
642
+ # to avoid any internal bug that causes the launch to fail
643
+ # while cluster status remains INIT.
644
+ yield from log_utils.follow_logs(f,
645
+ should_stop=cluster_is_up,
646
+ stop_on_eof=stop_on_eof,
647
+ idle_timeout_seconds=10)
648
+ except FileNotFoundError:
649
+ yield line
650
+
651
+ yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
652
+ f'Try to expand log file {nested_log_path} but not '
653
+ f'found. Skipping...{colorama.Style.RESET_ALL}')
654
+ pass
650
655
  return
651
656
 
652
- if other_log_prompt is not None:
657
+ if log_prompt is not None:
653
658
  # Now we skip other logs (file sync logs) since we lack
654
659
  # utility to determine when these log files are finished
655
660
  # writing.
@@ -702,7 +707,7 @@ def stream_replica_logs(service_name: str, replica_id: int,
702
707
  replica_provisioned = (
703
708
  lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
704
709
  with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
705
- for line in _follow_replica_logs(
710
+ for line in _follow_logs_with_provision_expanding(
706
711
  f,
707
712
  replica_cluster_name,
708
713
  should_stop=replica_provisioned,
sky/skylet/constants.py CHANGED
@@ -75,7 +75,7 @@ TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
75
75
  # cluster yaml is updated.
76
76
  #
77
77
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
78
- SKYLET_VERSION = '8'
78
+ SKYLET_VERSION = '9'
79
79
  # The version of the lib files that skylet/jobs use. Whenever there is an API
80
80
  # change for the job_lib or log_lib, we need to bump this version, so that the
81
81
  # user can be notified to update their SkyPilot version on the remote cluster.