skypilot-nightly 1.0.0.dev20250116__py3-none-any.whl → 1.0.0.dev20250118__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
sky/jobs/utils.py CHANGED
@@ -13,23 +13,28 @@ import shlex
13
13
  import shutil
14
14
  import textwrap
15
15
  import time
16
+ import traceback
16
17
  import typing
17
18
  from typing import Any, Dict, List, Optional, Set, Tuple, Union
18
19
 
19
20
  import colorama
20
21
  import filelock
22
+ import psutil
21
23
  from typing_extensions import Literal
22
24
 
25
+ import sky
23
26
  from sky import backends
24
27
  from sky import exceptions
25
28
  from sky import global_user_state
26
29
  from sky import sky_logging
27
30
  from sky.backends import backend_utils
28
31
  from sky.jobs import constants as managed_job_constants
32
+ from sky.jobs import scheduler
29
33
  from sky.jobs import state as managed_job_state
30
34
  from sky.skylet import constants
31
35
  from sky.skylet import job_lib
32
36
  from sky.skylet import log_lib
37
+ from sky.usage import usage_lib
33
38
  from sky.utils import common_utils
34
39
  from sky.utils import log_utils
35
40
  from sky.utils import rich_utils
@@ -37,7 +42,6 @@ from sky.utils import subprocess_utils
37
42
  from sky.utils import ux_utils
38
43
 
39
44
  if typing.TYPE_CHECKING:
40
- import sky
41
45
  from sky import dag as dag_lib
42
46
 
43
47
  logger = sky_logging.init_logger(__name__)
@@ -69,8 +73,10 @@ _JOB_CANCELLED_MESSAGE = (
69
73
  # The maximum time to wait for the managed job status to transition to terminal
70
74
  # state, after the job finished. This is a safeguard to avoid the case where
71
75
  # the managed job status fails to be updated and keep the `sky jobs logs`
72
- # blocking for a long time.
73
- _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25
76
+ # blocking for a long time. This should be significantly longer than the
77
+ # JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
78
+ # update the state.
79
+ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
74
80
 
75
81
 
76
82
  class UserSignal(enum.Enum):
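
Note on the bump from 25 to 40 seconds: a timeout that guards a polled state transition has to comfortably exceed the polling interval, or the waiter can give up between two status checks. Below is a minimal sketch of that wait pattern, assuming a get_status callable that mirrors the managed job status API (illustrative only, not code from this package):

    import time

    def wait_for_terminal_status(get_status, timeout_seconds=40, poll_seconds=5):
        """Poll get_status() until it reports a terminal state or we time out."""
        deadline = time.time() + timeout_seconds
        while time.time() < deadline:
            status = get_status()
            if status is not None and status.is_terminal():
                return status
            time.sleep(poll_seconds)
        # Timed out: the caller decides whether to warn, retry, or give up.
        return None
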
@@ -81,6 +87,43 @@ class UserSignal(enum.Enum):
81
87
 
82
88
 
83
89
  # ====== internal functions ======
90
+ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
91
+ """Terminate the cluster."""
92
+ retry_cnt = 0
93
+ # In some cases, e.g. botocore.exceptions.NoCredentialsError due to AWS
94
+ # metadata service throttling, the failed sky.down attempt can take 10-11
95
+ # seconds. In this case, we need the backoff to significantly reduce the
96
+ # rate of requests - that is, significantly increase the time between
97
+ # requests. We set the initial backoff to 15 seconds, so that once it grows
98
+ # exponentially it will quickly dominate the 10-11 seconds that we already
99
+ # see between requests. We set the max backoff very high, since it's
100
+ # generally much more important to eventually succeed than to fail fast.
101
+ backoff = common_utils.Backoff(
102
+ initial_backoff=15,
103
+ # 1.6 ** 5 = 10.48576 < 20, so we won't hit this with default max_retry
104
+ max_backoff_factor=20)
105
+ while True:
106
+ try:
107
+ usage_lib.messages.usage.set_internal()
108
+ sky.down(cluster_name)
109
+ return
110
+ except exceptions.ClusterDoesNotExist:
111
+ # The cluster is already down.
112
+ logger.debug(f'The cluster {cluster_name} is already down.')
113
+ return
114
+ except Exception as e: # pylint: disable=broad-except
115
+ retry_cnt += 1
116
+ if retry_cnt >= max_retry:
117
+ raise RuntimeError(
118
+ f'Failed to terminate the cluster {cluster_name}.') from e
119
+ logger.error(
120
+ f'Failed to terminate the cluster {cluster_name}. Retrying. '
121
+ f'Details: {common_utils.format_exception(e)}')
122
+ with ux_utils.enable_traceback():
123
+ logger.error(f' Traceback: {traceback.format_exc()}')
124
+ time.sleep(backoff.current_backoff())
125
+
126
+
84
127
  def get_job_status(backend: 'backends.CloudVmRayBackend',
85
128
  cluster_name: str) -> Optional['job_lib.JobStatus']:
86
129
  """Check the status of the job running on a managed job cluster.
@@ -105,57 +148,145 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
105
148
  return status
106
149
 
107
150
 
151
+ def _controller_process_alive(pid: int, job_id: int) -> bool:
152
+ """Check if the controller process is alive."""
153
+ try:
154
+ process = psutil.Process(pid)
155
+ # The last two args of the command line should be --job-id <id>
156
+ job_args = process.cmdline()[-2:]
157
+ return process.is_running() and job_args == ['--job-id', str(job_id)]
158
+ except psutil.NoSuchProcess:
159
+ return False
160
+
161
+
108
162
  def update_managed_job_status(job_id: Optional[int] = None):
109
- """Update managed job status if the controller job failed abnormally.
163
+ """Update managed job status if the controller process failed abnormally.
164
+
165
+ Check the status of the controller process. If it is not running, it must
166
+ have exited abnormally, and we should set the job status to
167
+ FAILED_CONTROLLER. `end_at` will be set to the current timestamp for the job
168
+ when the above happens, which may be inaccurate depending on how often this
169
+ function is called.
110
170
 
111
- Check the status of the controller job. If it is not running, it must have
112
- exited abnormally, and we should set the job status to FAILED_CONTROLLER.
113
- `end_at` will be set to the current timestamp for the job when above
114
- happens, which could be not accurate based on the frequency this function
115
- is called.
171
+ Note: we expect that job_id, if provided, refers to a nonterminal job.
116
172
  """
173
+
117
174
  if job_id is None:
175
+ # Warning: it's totally possible for the managed job to transition to
176
+ # a terminal status during the course of this function. The set_failed()
177
+ # called below will not update the state for jobs that already have a
178
+ # terminal status, so it should be fine.
118
179
  job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None)
119
180
  else:
120
181
  job_ids = [job_id]
121
182
  for job_id_ in job_ids:
122
- controller_status = job_lib.get_status(job_id_)
123
- if controller_status is None or controller_status.is_terminal():
124
- logger.error(f'Controller for job {job_id_} has exited abnormally. '
125
- 'Setting the job status to FAILED_CONTROLLER.')
126
- tasks = managed_job_state.get_managed_jobs(job_id_)
127
- for task in tasks:
128
- task_name = task['job_name']
129
- # Tear down the abnormal cluster to avoid resource leakage.
130
- cluster_name = generate_managed_job_cluster_name(
131
- task_name, job_id_)
132
- handle = global_user_state.get_handle_from_cluster_name(
133
- cluster_name)
134
- if handle is not None:
135
- backend = backend_utils.get_backend_from_handle(handle)
136
- max_retry = 3
137
- for retry_cnt in range(max_retry):
138
- try:
139
- backend.teardown(handle, terminate=True)
140
- break
141
- except RuntimeError:
142
- logger.error('Failed to tear down the cluster '
143
- f'{cluster_name!r}. Retrying '
144
- f'[{retry_cnt}/{max_retry}].')
145
-
146
- # The controller job for this managed job is not running: it must
147
- # have exited abnormally, and we should set the job status to
148
- # FAILED_CONTROLLER.
149
- # The `set_failed` will only update the task's status if the
150
- # status is non-terminal.
151
- managed_job_state.set_failed(
152
- job_id_,
153
- task_id=None,
154
- failure_type=managed_job_state.ManagedJobStatus.
155
- FAILED_CONTROLLER,
156
- failure_reason=
157
- 'Controller process has exited abnormally. For more details,'
158
- f' run: sky jobs logs --controller {job_id_}')
183
+
184
+ failure_reason = None
185
+
186
+ tasks = managed_job_state.get_managed_jobs(job_id_)
187
+ schedule_state = tasks[0]['schedule_state']
188
+ if schedule_state is None:
189
+ # Backwards compatibility: this job was submitted when ray was still
190
+ # used for managing the parallelism of job controllers.
191
+ # TODO(cooperc): Remove before 0.11.0.
192
+ controller_status = job_lib.get_status(job_id_)
193
+ if controller_status is None or controller_status.is_terminal():
194
+ logger.error(f'Controller process for legacy job {job_id_} is '
195
+ 'in an unexpected state.')
196
+ failure_reason = 'Legacy job is in an unexpected state'
197
+
198
+ # Continue to mark the job as failed.
199
+ else:
200
+ # Still running.
201
+ continue
202
+ else:
203
+ pid = tasks[0]['controller_pid']
204
+ if pid is None:
205
+ if schedule_state in (
206
+ managed_job_state.ManagedJobScheduleState.INACTIVE,
207
+ managed_job_state.ManagedJobScheduleState.WAITING):
208
+ # Job has not been scheduled yet.
209
+ continue
210
+ elif (schedule_state ==
211
+ managed_job_state.ManagedJobScheduleState.LAUNCHING):
212
+ # This should only be the case for a very short period of
213
+ # time between marking the job as submitted and writing the
214
+ # launched controller process pid back to the database (see
215
+ # scheduler.maybe_schedule_next_jobs).
216
+ # TODO(cooperc): Find a way to detect if we get stuck in
217
+ # this state.
218
+ logger.info(f'Job {job_id_} is in LAUNCHING state, '
219
+ 'but controller process hasn\'t started yet.')
220
+ continue
221
+ # All other statuses are unexpected. Proceed to mark as failed.
222
+ logger.error(f'Expected to find a controller pid for state '
223
+ f'{schedule_state.value} but found none.')
224
+ failure_reason = ('No controller pid set for '
225
+ f'{schedule_state.value}')
226
+ else:
227
+ logger.debug(f'Checking controller pid {pid}')
228
+ if _controller_process_alive(pid, job_id_):
229
+ # The controller is still running.
230
+ continue
231
+ # Otherwise, proceed to mark the job as failed.
232
+ logger.error(f'Controller process for {job_id_} seems to be '
233
+ 'dead.')
234
+ failure_reason = 'Controller process is dead'
235
+
236
+ logger.error(f'Controller process for job {job_id_} has exited '
237
+ 'abnormally. Setting the job status to FAILED_CONTROLLER.')
238
+ for task in tasks:
239
+ task_name = task['job_name']
240
+ # Tear down the abnormal cluster to avoid resource leakage.
241
+ cluster_name = generate_managed_job_cluster_name(task_name, job_id_)
242
+ handle = global_user_state.get_handle_from_cluster_name(
243
+ cluster_name)
244
+ # If the cluster exists, terminate it.
245
+ if handle is not None:
246
+ terminate_cluster(cluster_name)
247
+
248
+ # The controller process for this managed job is not running: it must
249
+ # have exited abnormally, and we should set the job status to
250
+ # FAILED_CONTROLLER.
251
+ # The `set_failed` will only update the task's status if the
252
+ # status is non-terminal.
253
+ managed_job_state.set_failed(
254
+ job_id_,
255
+ task_id=None,
256
+ failure_type=managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
257
+ failure_reason=
258
+ f'Controller process has exited abnormally ({failure_reason}). For '
259
+ f'more details, run: sky jobs logs --controller {job_id_}')
260
+ scheduler.job_done(job_id_, idempotent=True)
261
+
262
+ # Some jobs may be in a terminal status, but are not yet DONE. For instance,
263
+ # they may be still cleaning up resources, etc. Such jobs won't be captured
264
+ # by the above check, which only looks at nonterminal jobs. So, check the
265
+ # controller liveness of all jobs that should have live controller
266
+ # processes.
267
+ for job_info in managed_job_state.get_schedule_live_jobs(job_id):
268
+ if not job_info['controller_pid']:
269
+ # Technically, a job with no controller process but in LAUNCHING
270
+ # schedule state can happen very briefly after the job is set to
271
+ # LAUNCHING but before the controller process is actually spawned.
272
+ # However, if we observe any state other than LAUNCHING, something
273
+ # is clearly wrong.
274
+ if (job_info['schedule_state'] !=
275
+ managed_job_state.ManagedJobScheduleState.LAUNCHING):
276
+ logger.error(
277
+ f'Missing controller PID for {job_info["job_id"]}. '
278
+ 'Setting to DONE.')
279
+ scheduler.job_done(job_info['job_id'])
280
+ else:
281
+ logger.info(f'LAUNCHING job {job_info["job_id"]} has no '
282
+ 'controller process yet. Skipping.')
283
+
284
+ elif not _controller_process_alive(job_info['controller_pid'],
285
+ job_info['job_id']):
286
+ logger.error(
287
+ f'Controller process for job {job_info["job_id"]} is not '
288
+ 'alive. Marking the job as DONE.')
289
+ scheduler.job_done(job_info['job_id'])
159
290
 
160
291
 
161
292
  def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
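
The liveness check above deliberately inspects the process command line rather than trusting the PID alone: PIDs are recycled by the OS, so a live process at the recorded PID may no longer be the job's controller. A standalone sketch of the guard, assuming (as the code above does) that the controller was launched with `--job-id <id>` as its final two arguments:

    import psutil

    def controller_process_alive(pid, job_id):
        """True only if pid is running AND still looks like this job's controller."""
        try:
            proc = psutil.Process(pid)
            # cmdline() returns the argv list; compare its tail to guard
            # against an unrelated process that has reused the PID.
            return proc.is_running() and proc.cmdline()[-2:] == ['--job-id', str(job_id)]
        except psutil.NoSuchProcess:
            return False
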
@@ -546,15 +677,75 @@ def stream_logs(job_id: Optional[int],
546
677
  'instead.')
547
678
  job_id = managed_job_ids.pop()
548
679
  assert job_id is not None, (job_id, job_name)
549
- # TODO: keep the following code sync with
550
- # job_lib.JobLibCodeGen.tail_logs, we do not directly call that function
551
- # as the following code need to be run in the current machine, instead
552
- # of running remotely.
553
- run_timestamp = job_lib.get_run_timestamp(job_id)
554
- if run_timestamp is None:
555
- return f'No managed job contrller log found with job_id {job_id}.'
556
- log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp)
557
- log_lib.tail_logs(job_id=job_id, log_dir=log_dir, follow=follow)
680
+
681
+ controller_log_path = os.path.join(
682
+ os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
683
+ f'{job_id}.log')
684
+ job_status = None
685
+
686
+ # Wait for the log file to be written
687
+ while not os.path.exists(controller_log_path):
688
+ if not follow:
689
+ # Assume that the log file hasn't been written yet. Since we
690
+ # aren't following, just return.
691
+ return ''
692
+
693
+ job_status = managed_job_state.get_status(job_id)
694
+ if job_status is None:
695
+ with ux_utils.print_exception_no_traceback():
696
+ raise ValueError(f'Job {job_id} not found.')
697
+ # We shouldn't count CANCELLING as terminal here, the controller is
698
+ # still cleaning up.
699
+ if (job_status.is_terminal() and job_status !=
700
+ managed_job_state.ManagedJobStatus.CANCELLING):
701
+ # Don't keep waiting. If the log file is not created by this
702
+ # point, it never will be. This job may have been submitted
703
+ # using an old version that did not create the log file, so this
704
+ # is not considered an exceptional case.
705
+ return ''
706
+
707
+ time.sleep(log_lib.SKY_LOG_WAITING_GAP_SECONDS)
708
+
709
+ # This code is based on log_lib.tail_logs. We can't use that code
710
+ # exactly because state works differently between managed jobs and
711
+ # normal jobs.
712
+ with open(controller_log_path, 'r', newline='', encoding='utf-8') as f:
713
+ # Note: we do not need to care about start_stream_at here, since
714
+ # that should be in the job log printed above.
715
+ for line in f:
716
+ print(line, end='')
717
+ # Flush.
718
+ print(end='', flush=True)
719
+
720
+ if follow:
721
+ while True:
722
+ # Print all new lines, if there are any.
723
+ line = f.readline()
724
+ while line is not None and line != '':
725
+ print(line, end='')
726
+ line = f.readline()
727
+
728
+ # Flush.
729
+ print(end='', flush=True)
730
+
731
+ # Check if the job is finished.
732
+ job_status = managed_job_state.get_status(job_id)
733
+ assert job_status is not None, (job_id, job_name)
734
+ if job_status.is_terminal():
735
+ break
736
+
737
+ time.sleep(log_lib.SKY_LOG_TAILING_GAP_SECONDS)
738
+
739
+ # Wait for final logs to be written.
740
+ time.sleep(1 + log_lib.SKY_LOG_TAILING_GAP_SECONDS)
741
+
742
+ # Print any remaining logs including incomplete line.
743
+ print(f.read(), end='', flush=True)
744
+
745
+ if follow:
746
+ return ux_utils.finishing_message(
747
+ f'Job finished (status: {job_status}).')
748
+
558
749
  return ''
559
750
 
560
751
  if job_id is None:
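
The streaming logic above is essentially `tail -f` in Python: drain what is already in the file, then poll readline() with a short sleep until the job reaches a terminal state, and finally read once more to catch lines written just before the status flipped. A framework-free sketch of that loop (is_finished stands in for the managed job status check):

    import time

    def follow_log(path, is_finished, poll_seconds=0.2):
        """Print a log file and keep tailing it until is_finished() is True."""
        with open(path, 'r', newline='', encoding='utf-8') as f:
            for line in f:                 # Drain everything written so far.
                print(line, end='')
            while not is_finished():
                line = f.readline()        # '' means no new data yet.
                if line:
                    print(line, end='')
                else:
                    time.sleep(poll_seconds)
            # Catch anything written between the last poll and the state change.
            print(f.read(), end='', flush=True)
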
@@ -590,6 +781,7 @@ def dump_managed_job_queue() -> str:
590
781
  job_duration = 0
591
782
  job['job_duration'] = job_duration
592
783
  job['status'] = job['status'].value
784
+ job['schedule_state'] = job['schedule_state'].value
593
785
 
594
786
  cluster_name = generate_managed_job_cluster_name(
595
787
  job['task_name'], job['job_id'])
@@ -691,11 +883,18 @@ def format_job_table(
691
883
  status_counts[managed_job_status.value] += 1
692
884
 
693
885
  columns = [
694
- 'ID', 'TASK', 'NAME', 'RESOURCES', 'SUBMITTED', 'TOT. DURATION',
695
- 'JOB DURATION', '#RECOVERIES', 'STATUS'
886
+ 'ID',
887
+ 'TASK',
888
+ 'NAME',
889
+ 'RESOURCES',
890
+ 'SUBMITTED',
891
+ 'TOT. DURATION',
892
+ 'JOB DURATION',
893
+ '#RECOVERIES',
894
+ 'STATUS',
696
895
  ]
697
896
  if show_all:
698
- columns += ['STARTED', 'CLUSTER', 'REGION', 'FAILURE']
897
+ columns += ['STARTED', 'CLUSTER', 'REGION', 'DESCRIPTION']
699
898
  if tasks_have_user:
700
899
  columns.insert(0, 'USER')
701
900
  job_table = log_utils.create_table(columns)
@@ -714,7 +913,25 @@ def format_job_table(
714
913
  # by the task_id.
715
914
  jobs[get_hash(task)].append(task)
716
915
 
916
+ def generate_description(failure_reason: Optional[str],
917
+ schedule_state: Optional[str]) -> str:
918
+ description = ''
919
+ if schedule_state is not None:
920
+ description += f'Scheduler: {schedule_state}'
921
+ if failure_reason is not None:
922
+ description += ', '
923
+ if failure_reason is not None:
924
+ description += f'Failure: {failure_reason}'
925
+
926
+ if description == '':
927
+ return '-'
928
+
929
+ return description
930
+
717
931
  for job_hash, job_tasks in jobs.items():
932
+ if show_all:
933
+ schedule_state = job_tasks[0]['schedule_state']
934
+
718
935
  if len(job_tasks) > 1:
719
936
  # Aggregate the tasks into a new row in the table.
720
937
  job_name = job_tasks[0]['job_name']
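
For reference, the new DESCRIPTION column combines the scheduler state with any failure reason. Illustrative inputs and outputs for the helper above (argument order is failure_reason, then schedule_state; the example values are made up):

    # generate_description(None, None)               -> '-'
    # generate_description(None, 'LAUNCHING')        -> 'Scheduler: LAUNCHING'
    # generate_description('Out of quota', None)     -> 'Failure: Out of quota'
    # generate_description('Out of quota', 'ALIVE')  -> 'Scheduler: ALIVE, Failure: Out of quota'
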
@@ -737,7 +954,6 @@ def format_job_table(
737
954
  end_at = None
738
955
  recovery_cnt += task['recovery_count']
739
956
 
740
- failure_reason = job_tasks[current_task_id]['failure_reason']
741
957
  job_duration = log_utils.readable_time_duration(0,
742
958
  job_duration,
743
959
  absolute=True)
@@ -763,11 +979,13 @@ def format_job_table(
763
979
  status_str,
764
980
  ]
765
981
  if show_all:
982
+ schedule_state = job_tasks[0]['schedule_state']
983
+ failure_reason = job_tasks[current_task_id]['failure_reason']
766
984
  job_values.extend([
767
985
  '-',
768
986
  '-',
769
987
  '-',
770
- failure_reason if failure_reason is not None else '-',
988
+ generate_description(failure_reason, schedule_state),
771
989
  ])
772
990
  if tasks_have_user:
773
991
  job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -795,13 +1013,17 @@ def format_job_table(
795
1013
  task['status'].colored_str(),
796
1014
  ]
797
1015
  if show_all:
1016
+ # schedule_state is only set at the job level, so if we have
1017
+ # more than one task, only display on the aggregated row.
1018
+ schedule_state = (task['schedule_state']
1019
+ if len(job_tasks) == 1 else None)
798
1020
  values.extend([
799
1021
  # STARTED
800
1022
  log_utils.readable_time_duration(task['start_at']),
801
1023
  task['cluster_resources'],
802
1024
  task['region'],
803
- task['failure_reason']
804
- if task['failure_reason'] is not None else '-',
1025
+ generate_description(task['failure_reason'],
1026
+ schedule_state),
805
1027
  ])
806
1028
  if tasks_have_user:
807
1029
  values.insert(0, task.get('user', '-'))
@@ -875,7 +1097,7 @@ class ManagedJobCodeGen:
875
1097
  return cls._build(code)
876
1098
 
877
1099
  @classmethod
878
- def get_all_job_ids_by_name(cls, job_name: str) -> str:
1100
+ def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
879
1101
  code = textwrap.dedent(f"""\
880
1102
  from sky.utils import common_utils
881
1103
  job_id = managed_job_state.get_all_job_ids_by_name({job_name!r})
@@ -896,6 +1118,7 @@ class ManagedJobCodeGen:
896
1118
  # should be removed in v0.8.0.
897
1119
  code = textwrap.dedent("""\
898
1120
  import os
1121
+ import time
899
1122
 
900
1123
  from sky.skylet import job_lib, log_lib
901
1124
  from sky.skylet import constants
@@ -920,7 +1143,7 @@ class ManagedJobCodeGen:
920
1143
  dag_name = managed_job_dag.name
921
1144
  # Add the managed job to queue table.
922
1145
  code = textwrap.dedent(f"""\
923
- managed_job_state.set_job_name({job_id}, {dag_name!r})
1146
+ managed_job_state.set_job_info({job_id}, {dag_name!r})
924
1147
  """)
925
1148
  for task_id, task in enumerate(managed_job_dag.tasks):
926
1149
  resources_str = backend_utils.get_task_resources_str(
@@ -976,7 +976,7 @@ def terminate_instances(
976
976
  _terminate_node(namespace, context, pod_name)
977
977
 
978
978
  # Run pod termination in parallel
979
- subprocess_utils.run_in_parallel(_terminate_pod_thread, pods.items(),
979
+ subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
980
980
  _NUM_THREADS)
981
981
 
982
982
 
sky/resources.py CHANGED
@@ -540,7 +540,7 @@ class Resources:
540
540
  if memory_gb <= 0:
541
541
  with ux_utils.print_exception_no_traceback():
542
542
  raise ValueError(
543
- f'The "cpus" field should be positive. Found: {memory!r}')
543
+ f'The "memory" field should be positive. Found: {memory!r}')
544
544
 
545
545
  def _set_accelerators(
546
546
  self,
sky/skylet/constants.py CHANGED
@@ -86,7 +86,7 @@ TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
86
86
  # cluster yaml is updated.
87
87
  #
88
88
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
89
- SKYLET_VERSION = '9'
89
+ SKYLET_VERSION = '10'
90
90
  # The version of the lib files that skylet/jobs use. Whenever there is an API
91
91
  # change for the job_lib or log_lib, we need to bump this version, so that the
92
92
  # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/events.py CHANGED
@@ -13,6 +13,8 @@ from sky import clouds
13
13
  from sky import sky_logging
14
14
  from sky.backends import cloud_vm_ray_backend
15
15
  from sky.clouds import cloud_registry
16
+ from sky.jobs import scheduler as managed_job_scheduler
17
+ from sky.jobs import state as managed_job_state
16
18
  from sky.jobs import utils as managed_job_utils
17
19
  from sky.serve import serve_utils
18
20
  from sky.skylet import autostop_lib
@@ -67,12 +69,13 @@ class JobSchedulerEvent(SkyletEvent):
67
69
  job_lib.scheduler.schedule_step(force_update_jobs=True)
68
70
 
69
71
 
70
- class ManagedJobUpdateEvent(SkyletEvent):
71
- """Skylet event for updating managed job status."""
72
+ class ManagedJobEvent(SkyletEvent):
73
+ """Skylet event for updating and scheduling managed jobs."""
72
74
  EVENT_INTERVAL_SECONDS = 300
73
75
 
74
76
  def _run(self):
75
77
  managed_job_utils.update_managed_job_status()
78
+ managed_job_scheduler.maybe_schedule_next_jobs()
76
79
 
77
80
 
78
81
  class ServiceUpdateEvent(SkyletEvent):
@@ -116,7 +119,8 @@ class AutostopEvent(SkyletEvent):
116
119
  logger.debug('autostop_config not set. Skipped.')
117
120
  return
118
121
 
119
- if job_lib.is_cluster_idle():
122
+ if (job_lib.is_cluster_idle() and
123
+ not managed_job_state.get_num_alive_jobs()):
120
124
  idle_minutes = (time.time() -
121
125
  autostop_lib.get_last_active_time()) // 60
122
126
  logger.debug(
sky/skylet/job_lib.py CHANGED
@@ -10,7 +10,6 @@ import pathlib
10
10
  import shlex
11
11
  import signal
12
12
  import sqlite3
13
- import subprocess
14
13
  import time
15
14
  from typing import Any, Dict, List, Optional, Sequence
16
15
 
@@ -23,6 +22,7 @@ from sky.skylet import constants
23
22
  from sky.utils import common_utils
24
23
  from sky.utils import db_utils
25
24
  from sky.utils import log_utils
25
+ from sky.utils import subprocess_utils
26
26
 
27
27
  logger = sky_logging.init_logger(__name__)
28
28
 
@@ -209,31 +209,7 @@ class JobScheduler:
209
209
  _CURSOR.execute((f'UPDATE pending_jobs SET submit={int(time.time())} '
210
210
  f'WHERE job_id={job_id!r}'))
211
211
  _CONN.commit()
212
- # Use nohup to ensure the job driver process is a separate process tree,
213
- # instead of being a child of the current process. This is important to
214
- # avoid a chain of driver processes (job driver can call schedule_step()
215
- # to submit new jobs, and the new job can also call schedule_step()
216
- # recursively).
217
- #
218
- # echo $! will output the PID of the last background process started
219
- # in the current shell, so we can retrieve it and record in the DB.
220
- #
221
- # TODO(zhwu): A more elegant solution is to use another daemon process
222
- # to be in charge of starting these driver processes, instead of
223
- # starting them in the current process.
224
- wrapped_cmd = (f'nohup bash -c {shlex.quote(run_cmd)} '
225
- '</dev/null >/dev/null 2>&1 & echo $!')
226
- proc = subprocess.run(wrapped_cmd,
227
- stdout=subprocess.PIPE,
228
- stderr=subprocess.PIPE,
229
- stdin=subprocess.DEVNULL,
230
- start_new_session=True,
231
- check=True,
232
- shell=True,
233
- text=True)
234
- # Get the PID of the detached process
235
- pid = int(proc.stdout.strip())
236
-
212
+ pid = subprocess_utils.launch_new_process_tree(run_cmd)
237
213
  # TODO(zhwu): Backward compatibility, remove this check after 0.10.0.
238
214
  # This is for the case where the job is submitted with SkyPilot older
239
215
  # than #4318, using ray job submit.
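
The block removed above is the classic detach-and-record-PID pattern that the new subprocess_utils.launch_new_process_tree call now encapsulates. For readers who want the mechanics, here is the same pattern as a standalone helper, reconstructed from the removed lines (the new helper's actual implementation is not shown in this diff and may differ):

    import shlex
    import subprocess

    def launch_detached(run_cmd):
        """Start run_cmd in its own process tree and return its PID."""
        # nohup + '&' detaches the child from this shell; 'echo $!' prints the
        # PID of the last background process so the caller can record it.
        wrapped_cmd = (f'nohup bash -c {shlex.quote(run_cmd)} '
                       '</dev/null >/dev/null 2>&1 & echo $!')
        proc = subprocess.run(wrapped_cmd,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              stdin=subprocess.DEVNULL,
                              start_new_session=True,
                              check=True,
                              shell=True,
                              text=True)
        return int(proc.stdout.strip())
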
sky/skylet/log_lib.py CHANGED
@@ -25,9 +25,9 @@ from sky.utils import log_utils
25
25
  from sky.utils import subprocess_utils
26
26
  from sky.utils import ux_utils
27
27
 
28
- _SKY_LOG_WAITING_GAP_SECONDS = 1
29
- _SKY_LOG_WAITING_MAX_RETRY = 5
30
- _SKY_LOG_TAILING_GAP_SECONDS = 0.2
28
+ SKY_LOG_WAITING_GAP_SECONDS = 1
29
+ SKY_LOG_WAITING_MAX_RETRY = 5
30
+ SKY_LOG_TAILING_GAP_SECONDS = 0.2
31
31
  # Peek the head of the lines to check if we need to start
32
32
  # streaming when tail > 0.
33
33
  PEEK_HEAD_LINES_FOR_START_STREAM = 20
@@ -336,7 +336,7 @@ def _follow_job_logs(file,
336
336
  ]:
337
337
  if wait_last_logs:
338
338
  # Wait all the logs are printed before exit.
339
- time.sleep(1 + _SKY_LOG_TAILING_GAP_SECONDS)
339
+ time.sleep(1 + SKY_LOG_TAILING_GAP_SECONDS)
340
340
  wait_last_logs = False
341
341
  continue
342
342
  status_str = status.value if status is not None else 'None'
@@ -345,7 +345,7 @@ def _follow_job_logs(file,
345
345
  f'Job finished (status: {status_str}).'))
346
346
  return
347
347
 
348
- time.sleep(_SKY_LOG_TAILING_GAP_SECONDS)
348
+ time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
349
349
  status = job_lib.get_status_no_lock(job_id)
350
350
 
351
351
 
@@ -426,15 +426,15 @@ def tail_logs(job_id: Optional[int],
426
426
  retry_cnt += 1
427
427
  if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
428
428
  break
429
- if retry_cnt >= _SKY_LOG_WAITING_MAX_RETRY:
429
+ if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
430
430
  print(
431
431
  f'{colorama.Fore.RED}ERROR: Logs for '
432
432
  f'{job_str} (status: {status.value}) does not exist '
433
433
  f'after retrying {retry_cnt} times.{colorama.Style.RESET_ALL}')
434
434
  return
435
- print(f'INFO: Waiting {_SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
435
+ print(f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
436
436
  'to be written...')
437
- time.sleep(_SKY_LOG_WAITING_GAP_SECONDS)
437
+ time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
438
438
  status = job_lib.update_job_status([job_id], silent=True)[0]
439
439
 
440
440
  start_stream_at = LOG_FILE_START_STREAMING_AT
sky/skylet/log_lib.pyi CHANGED
@@ -13,6 +13,9 @@ from sky.skylet import constants as constants
13
13
  from sky.skylet import job_lib as job_lib
14
14
  from sky.utils import log_utils as log_utils
15
15
 
16
+ SKY_LOG_WAITING_GAP_SECONDS: int = ...
17
+ SKY_LOG_WAITING_MAX_RETRY: int = ...
18
+ SKY_LOG_TAILING_GAP_SECONDS: float = ...
16
19
  LOG_FILE_START_STREAMING_AT: str = ...
17
20
 
18
21