skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -4
- sky/backends/backend_utils.py +35 -1
- sky/backends/cloud_vm_ray_backend.py +2 -2
- sky/client/sdk.py +20 -0
- sky/client/sdk_async.py +18 -16
- sky/clouds/aws.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-1d7e11230da3ca89.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +5 -1
- sky/execution.py +21 -14
- sky/jobs/constants.py +3 -0
- sky/jobs/controller.py +732 -310
- sky/jobs/recovery_strategy.py +251 -129
- sky/jobs/scheduler.py +247 -174
- sky/jobs/server/core.py +20 -4
- sky/jobs/server/utils.py +2 -2
- sky/jobs/state.py +702 -511
- sky/jobs/utils.py +94 -39
- sky/provision/aws/config.py +4 -1
- sky/provision/gcp/config.py +6 -1
- sky/provision/kubernetes/utils.py +17 -8
- sky/provision/provisioner.py +1 -0
- sky/serve/replica_managers.py +0 -7
- sky/serve/serve_utils.py +5 -0
- sky/serve/server/impl.py +1 -2
- sky/serve/service.py +0 -2
- sky/server/common.py +8 -3
- sky/server/config.py +43 -24
- sky/server/constants.py +1 -0
- sky/server/daemons.py +7 -11
- sky/server/requests/serializers/encoders.py +1 -1
- sky/server/server.py +8 -1
- sky/setup_files/dependencies.py +4 -2
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/constants.py +3 -1
- sky/skylet/events.py +2 -10
- sky/utils/command_runner.pyi +3 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/controller_utils.py +5 -0
- sky/utils/db/db_utils.py +31 -2
- sky/utils/rich_utils.py +3 -1
- sky/utils/subprocess_utils.py +9 -0
- sky/volumes/volume.py +2 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +39 -37
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +67 -67
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py
CHANGED
@@ -42,145 +42,213 @@ Nomenclature:
 """
 
 from argparse import ArgumentParser
+import asyncio
 import contextlib
 import os
+import pathlib
+import shutil
 import sys
-import
-from typing import
+import typing
+from typing import Set
+import uuid
 
 import filelock
 
-from sky import exceptions
 from sky import sky_logging
+from sky import skypilot_config
+from sky.adaptors import common as adaptors_common
+from sky.client import sdk
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
-from sky.
+from sky.jobs import utils as managed_job_utils
+from sky.server import config as server_config
 from sky.skylet import constants
 from sky.utils import common_utils
-from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
+if typing.TYPE_CHECKING:
+    import logging
+
+    import psutil
+else:
+    psutil = adaptors_common.LazyImport('psutil')
+
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-
+# Job controller lock. This is used to synchronize writing/reading the
+# controller pid file.
+JOB_CONTROLLER_PID_LOCK = os.path.expanduser(
+    '~/.sky/locks/job_controller_pid.lock')
 
+JOB_CONTROLLER_PID_PATH = os.path.expanduser('~/.sky/job_controller_pid')
+JOB_CONTROLLER_ENV_PATH = os.path.expanduser('~/.sky/job_controller_env')
 
- (16 removed lines not shown)
+# Based on testing, each worker takes around 200-300MB memory. Keeping it
+# higher to be safe.
+JOB_MEMORY_MB = 400
+# Number of ongoing launches allowed per worker. Can probably be
+# increased a bit to around 16 but keeping it lower just to be safe.
+LAUNCHES_PER_WORKER = 8
+# This can probably be increased to around 300-400 but keeping it lower
+# just to be safe.
+JOBS_PER_WORKER = 200
+
+# keep 1GB reserved after the controllers
+MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
+
+CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
+
+# Maximum values for above constants. There will start to be lagging issues
+# at these numbers already.
+# JOB_MEMORY_MB = 200
+# LAUNCHES_PER_WORKER = 16
+# JOBS_PER_WORKER = 400
+
+
+def get_number_of_controllers() -> int:
+    """Returns the number of controllers that should be running.
+
+    This is the number of controllers that should be running to maximize
+    resource utilization.
 
+    In consolidation mode, we use the existing API server so our resource
+    requirements are just for the job controllers. We try taking up as
+    much memory as possible left over from the API server.
+
+    In non-consolidation mode, we have to take into account the memory of the
+    API server workers. We limit to only 8 launches per worker, so our logic is
+    each controller will take CONTROLLER_MEMORY_MB + 8 * WORKER_MEMORY_MB. We
+    leave some leftover room for ssh codegen and ray status overhead.
+    """
+    consolidation_mode = skypilot_config.get_nested(
+        ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+
+    total_memory_mb = common_utils.get_mem_size_gb() * 1024
+    if consolidation_mode:
+        config = server_config.compute_server_config(deploy=True, quiet=True)
+
+        used = 0.0
+        used += MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB
+        used += (config.long_worker_config.garanteed_parallelism +
+                 config.long_worker_config.burstable_parallelism) * \
+                server_config.LONG_WORKER_MEM_GB * 1024
+        used += (config.short_worker_config.garanteed_parallelism +
+                 config.short_worker_config.burstable_parallelism) * \
+                server_config.SHORT_WORKER_MEM_GB * 1024
+
+        return max(1, int((total_memory_mb - used) // JOB_MEMORY_MB))
+    else:
+        return max(
+            1,
+            int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
+                ((LAUNCHES_PER_WORKER * server_config.LONG_WORKER_MEM_GB) * 1024
+                 + JOB_MEMORY_MB)))
+
+
+def start_controller() -> None:
+    """Start the job controller process.
+
+    This requires that the env file is already set up.
+    """
+    os.environ[constants.OVERRIDE_CONSOLIDATION_MODE] = 'true'
     logs_dir = os.path.expanduser(
         managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
     os.makedirs(logs_dir, exist_ok=True)
-    log_path = os.path.join(logs_dir, f'{
+    log_path = os.path.join(logs_dir, f'controller_{uuid.uuid4()}.log')
+
+    activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
+    run_controller_cmd = (f'{sys.executable} -u -m'
+                          'sky.jobs.controller')
+
+    run_cmd = (f'{activate_python_env_cmd}'
+               f'{run_controller_cmd}')
+
+    logger.info(f'Running controller with command: {run_cmd}')
 
     pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
- (40 removed lines not shown)
-    another jobs in pool 1, but the job in pool 2 will still be waiting. When
-    the `pool` argument is None, it schedules a job regardless of the pool.
+    with open(JOB_CONTROLLER_PID_PATH, 'a', encoding='utf-8') as f:
+        f.write(str(pid) + '\n')
+
+
+def get_alive_controllers() -> typing.Optional[int]:
+    if not os.path.exists(JOB_CONTROLLER_PID_PATH):
+        # if the file doesn't exist, it means the controller server is not
+        # running, so we return 0
+        return 0
+
+    try:
+        with open(JOB_CONTROLLER_PID_PATH, 'r', encoding='utf-8') as f:
+            pids = f.read().split('\n')[:-1]
+    except OSError:
+        # if the file is corrupted, or any issues with reading it, we just
+        # return None to be safe and not over start
+        return None
+
+    alive = 0
+    for pid in pids:
+        try:
+            # TODO(luca) there is a chance that the process that is alive is
+            # not the same controller process. a better solution is to also
+            # include a random UUID with each controller and store that in the
+            # db as well/in the command that spawns it.
+            if subprocess_utils.is_process_alive(int(pid.strip())):
+                alive += 1
+        except ValueError:
+            # if the pid is not an integer, let's assume it's alive to not
+            # over start new processes
+            alive += 1
+    return alive
+
+
+def maybe_start_controllers(from_scheduler: bool = False) -> None:
+    """Start the job controller process.
+
+    If the process is already running, it will not start a new one.
+    Will also add the job_id, dag_yaml_path, and env_file_path to the
+    controllers list of processes.
     """
     try:
- (42 removed lines not shown)
-        job_id = maybe_next_job['job_id']
-        dag_yaml_path = maybe_next_job['dag_yaml_path']
-        env_file_path = maybe_next_job['env_file_path']
-
-        _start_controller(job_id, dag_yaml_path, env_file_path,
-                          actual_pool)
-
+        with filelock.FileLock(JOB_CONTROLLER_PID_LOCK, blocking=False):
+            if from_scheduler and not managed_job_utils.is_consolidation_mode():
+                cur = pathlib.Path(CURRENT_HASH)
+                old = pathlib.Path(f'{CURRENT_HASH}.old')
+
+                if old.exists() and cur.exists():
+                    if (old.read_text(encoding='utf-8') !=
+                            cur.read_text(encoding='utf-8')):
+                        # TODO(luca): there is a 1/2^160 chance that there will
+                        # be a collision. using a geometric distribution and
+                        # assuming one update a day, we expect a bug slightly
+                        # before the heat death of the universe. should get
+                        # this fixed before then.
+                        try:
+                            # this will stop all the controllers and the api
+                            # server.
+                            sdk.api_stop()
+                            # All controllers should be dead. Remove the PIDs so
+                            # that update_managed_jobs_statuses won't think they
+                            # have failed.
+                            state.reset_jobs_for_recovery()
+                        except Exception as e:  # pylint: disable=broad-except
+                            logger.error(f'Failed to stop the api server: {e}')
+                            pass
+                        else:
+                            shutil.copyfile(cur, old)
+                if not old.exists():
+                    shutil.copyfile(cur, old)
+
+            alive = get_alive_controllers()
+            if alive is None:
+                return
+            wanted = get_number_of_controllers()
+            started = 0
+
+            while alive + started < wanted:
+                start_controller()
+                started += 1
+
+            if started > 0:
+                logger.info(f'Started {started} controllers')
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
         # should launch any pending jobs.
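
To sanity-check the sizing arithmetic in get_number_of_controllers() above, here is a minimal sketch of the non-consolidation branch. LONG_WORKER_MEM_GB lives in sky/server/config.py and is not shown in this diff, so the 0.5 GB value below is an assumption for illustration only, as is the 16 GB machine size.

# Back-of-the-envelope for the non-consolidation branch of
# get_number_of_controllers(). LONG_WORKER_MEM_GB is assumed (0.5 GB);
# the real constant is defined in sky/server/config.py, not in this diff.
JOB_MEMORY_MB = 400
LAUNCHES_PER_WORKER = 8
MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
LONG_WORKER_MEM_GB = 0.5  # assumption for illustration

total_memory_mb = 16 * 1024  # hypothetical 16 GB controller machine
per_controller_mb = (LAUNCHES_PER_WORKER * LONG_WORKER_MEM_GB * 1024 +
                     JOB_MEMORY_MB)  # 4096 + 400 = 4496
num_controllers = max(
    1,
    int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
        per_controller_mb))
print(num_controllers)  # (16384 - 2048) / 4496 -> 3

Each controller is budgeted for its own memory plus its LAUNCHES_PER_WORKER concurrent launches, which is why the divisor bundles both terms.
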
@@ -188,30 +256,46 @@ def maybe_schedule_next_jobs() -> None:
 
 
 def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
-               env_file_path: str, priority: int
+               env_file_path: str, priority: int) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
     PENDING. It will tell the scheduler to try and start the job controller, if
-    there are resources available.
-    should not be on the critical path for `sky jobs launch -d`.
+    there are resources available.
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
- (14 removed lines not shown)
+    controller_pid = state.get_job_controller_pid(job_id)
+    if controller_pid is not None:
+        # why? TODO(cooperc): figure out why this is needed, fix it, and remove
+        if managed_job_utils.controller_process_alive(controller_pid, job_id):
+            # This can happen when HA recovery runs for some reason but the job
+            # controller is still alive.
+            logger.warning(f'Job {job_id} is still alive, skipping submission')
+            maybe_start_controllers(from_scheduler=True)
+            return
+
+    state.scheduler_set_waiting(job_id, dag_yaml_path,
+                                original_user_yaml_path, env_file_path,
+                                common_utils.get_user_hash(), priority)
+    if state.get_ha_recovery_script(job_id) is None:
+        # the run command is just the command that called scheduler
+        run = (f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
+               f'--job-id {job_id} --env-file {env_file_path} '
+               f'--user-yaml-path {original_user_yaml_path} '
+               f'--priority {priority}')
+        state.set_ha_recovery_script(job_id, run)
+    maybe_start_controllers(from_scheduler=True)
+
+
+@contextlib.asynccontextmanager
+async def scheduled_launch(
+        job_id: int,
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+        job_logger: 'logging.Logger',
+):
     """Launch as part of an ongoing job.
 
     A newly started job will already be LAUNCHING, and this will immediately
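
Both get_alive_controllers() above and the controller_process_alive() guard in submit_job() ultimately reduce to "is this pid still a live process?", which is why the module now lazy-imports psutil. is_process_alive() is SkyPilot's own helper and its body is not part of this diff; the sketch below is an assumed approximation of such a check, not the shipped implementation.

import psutil

def is_process_alive_sketch(pid: int) -> bool:
    # Treat a pid as alive unless the process no longer exists or has
    # become a zombie. This approximates, but is not, SkyPilot's
    # subprocess_utils.is_process_alive().
    try:
        return psutil.Process(pid).status() != psutil.STATUS_ZOMBIE
    except psutil.NoSuchProcess:
        return False

As the TODO in get_alive_controllers() notes, pid reuse means a live pid is not necessarily the same controller process, hence the idea of tagging each controller with a UUID.
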
@@ -240,30 +324,34 @@ def scheduled_launch(job_id: int):
         yield
         return
 
-
-
-    if (state.get_job_schedule_state(job_id) !=
-            state.ManagedJobScheduleState.LAUNCHING):
-        # Since we aren't LAUNCHING, we need to wait to be scheduled.
-        _set_alive_waiting(job_id)
+    assert starting_lock == starting_signal._lock, (  # type: ignore #pylint: disable=protected-access
+        'starting_lock and starting_signal must use the same lock')
 
-
-
-
+    while True:
+        async with starting_lock:
+            starting_count = len(starting)
+            if starting_count < LAUNCHES_PER_WORKER:
+                break
+            job_logger.info('Too many jobs starting, waiting for a slot')
+            await starting_signal.wait()
+
+    job_logger.info(f'Starting job {job_id}')
+
+    async with starting_lock:
+        starting.add(job_id)
+
+    await state.scheduler_set_launching_async(job_id)
 
     try:
         yield
-    except
-
-        # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(controller_utils.get_resources_lock_path()):
-            state.scheduler_set_alive_backoff(job_id)
-        raise
+    except Exception as e:
+        raise e
     else:
-
-        state.scheduler_set_alive(job_id)
+        await state.scheduler_set_alive_async(job_id)
     finally:
-
+        async with starting_lock:
+            starting.remove(job_id)
+            starting_signal.notify()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
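
The rewritten scheduled_launch() caps concurrent launches per worker with an asyncio.Condition over a shared `starting` set rather than a plain semaphore. The self-contained sketch below reproduces that gating pattern with simplified names; unlike the real code, it uses the Condition's own lock instead of a separately passed starting_lock.

import asyncio

LAUNCHES_PER_WORKER = 8  # mirrors the constant introduced above

async def gated_launch(job_id: int, starting: set,
                       cond: asyncio.Condition) -> None:
    # Block until there is a free launch slot, then claim it.
    async with cond:
        while len(starting) >= LAUNCHES_PER_WORKER:
            await cond.wait()  # releases the lock while waiting
        starting.add(job_id)
    try:
        await asyncio.sleep(0.01)  # stand-in for the real launch work
    finally:
        # Release the slot and wake one waiter.
        async with cond:
            starting.remove(job_id)
            cond.notify()

async def main() -> None:
    starting: set = set()
    cond = asyncio.Condition()
    await asyncio.gather(*(gated_launch(i, starting, cond)
                           for i in range(20)))

asyncio.run(main())

A Condition fits better than a Semaphore here because the real code also inspects and mutates the shared `starting` set under the same lock.
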
@@ -274,38 +362,23 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
 
     The job could be in any terminal ManagedJobStatus. However, once DONE, it
     should never transition back to another state.
+
+    This is only called by utils.update_managed_jobs_statuses which is sync.
     """
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
 
-
-    state.scheduler_set_done(job_id, idempotent)
-    maybe_schedule_next_jobs()
-
+    state.scheduler_set_done(job_id, idempotent)
 
-def _set_alive_waiting(job_id: int) -> None:
-    """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(controller_utils.get_resources_lock_path()):
-        state.scheduler_set_alive_waiting(job_id)
-    maybe_schedule_next_jobs()
 
+async def job_done_async(job_id: int, idempotent: bool = False):
+    """Async version of job_done."""
+    if idempotent and (await state.get_job_schedule_state_async(job_id)
+                       == state.ManagedJobScheduleState.DONE):
+        return
 
-
-    # Check basic resource limits
-    # Pool jobs don't need to provision resources, so we skip the check.
-    if not ((controller_utils.can_provision() or pool is not None) and
-            controller_utils.can_start_new_process()):
-        return False
-
-    # Check if there are available workers in the pool
-    if pool is not None:
-        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
-            logger.debug(f'No READY workers available in pool {pool}')
-            return False
-
-    return True
+    await state.scheduler_set_done_async(job_id, idempotent)
 
 
 if __name__ == '__main__':
@@ -337,4 +410,4 @@ if __name__ == '__main__':
                         f' Default: {constants.DEFAULT_PRIORITY}.')
     args = parser.parse_args()
     submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
-               args.priority
+               args.priority)
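
The --job-id/--env-file/--user-yaml-path/--priority flags parsed in the __main__ block above are exactly what submit_job() writes into the HA recovery script, so a failed-over controller can replay the original submission. A sketch of that round trip, with hypothetical paths:

import sys

# Hypothetical values; in the real flow these come from `sky jobs launch`.
job_id, priority = 42, 500
dag_yaml_path = '/tmp/example-dag.yaml'
env_file_path = '/tmp/example.env'
original_user_yaml_path = '/tmp/example-user.yaml'

# Mirrors the `run` string stored via state.set_ha_recovery_script().
run = (f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
       f'--job-id {job_id} --env-file {env_file_path} '
       f'--user-yaml-path {original_user_yaml_path} '
       f'--priority {priority}')
print(run)  # replaying this command re-enters submit_job()
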
sky/jobs/server/core.py
CHANGED
@@ -1,9 +1,11 @@
 """SDK functions for managed jobs."""
+import ipaddress
 import os
 import pathlib
 import tempfile
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib import parse as urlparse
 import uuid
 
 import colorama
@@ -188,6 +190,7 @@ def launch(
 
     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
@@ -202,6 +205,21 @@ def launch(
     # pre-mount operations when submitting jobs.
     dag.pre_mount_volumes()
 
+    # If there is a local postgres db, when the api server tries launching on
+    # the remote jobs controller it will fail. therefore, we should remove this
+    # before sending the config to the jobs controller.
+    # TODO(luca) there are a lot of potential problems with postgres being sent
+    # to the jobs controller. for example if the postgres is whitelisted to
+    # only the API server, this will then break. the simple solution to that is
+    # telling the user to add the jobs controller to the postgres whitelist.
+    if not managed_job_utils.is_consolidation_mode():
+        db_path = mutated_user_config.get('db', None)
+        if db_path is not None:
+            parsed = urlparse.urlparse(db_path)
+            if ((parsed.hostname == 'localhost' or
+                 ipaddress.ip_address(parsed.hostname).is_loopback)):
+                mutated_user_config.pop('db', None)
+
     user_dag_str_user_specified = dag_utils.dump_chain_dag_to_yaml_str(
         dag, use_user_specified_yaml=True)
 
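
One sharp edge worth noting in the new check: ipaddress.ip_address() raises ValueError when its argument is not an IP literal, so a db URL with a DNS hostname (say, db.internal) or with no hostname at all would make this branch raise rather than fall through. A defensive sketch of the same loopback test, as an illustration rather than the shipped code:

import ipaddress
from urllib import parse as urlparse

def is_local_db(db_url: str) -> bool:
    # Sketch of a defensive variant of the check above: the shipped code
    # calls ipaddress.ip_address() directly, which raises ValueError for
    # non-IP hostnames such as 'db.internal'.
    hostname = urlparse.urlparse(db_url).hostname
    if hostname is None:
        return False
    if hostname == 'localhost':
        return True
    try:
        return ipaddress.ip_address(hostname).is_loopback
    except ValueError:  # not an IP literal, e.g. a DNS name
        return False

assert is_local_db('postgresql://user@127.0.0.1:5432/sky')
assert not is_local_db('postgresql://user@db.internal:5432/sky')
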
@@ -424,10 +442,8 @@ def launch(
     ]
     run_script = '\n'.join(env_cmds + [run_script])
     # Dump script for high availability recovery.
-
-
-        managed_job_state.set_ha_recovery_script(
-            consolidation_mode_job_id, run_script)
+    managed_job_state.set_ha_recovery_script(
+        consolidation_mode_job_id, run_script)
     backend.run_on_head(local_handle, run_script)
     return consolidation_mode_job_id, local_handle
 
sky/jobs/server/utils.py
CHANGED
@@ -11,7 +11,6 @@ logger = sky_logging.init_logger(__name__)
 
 def check_version_mismatch_and_non_terminal_jobs() -> None:
     """Check if controller has version mismatch and non-terminal jobs exist.
-
     Raises:
         ValueError: If there's a version mismatch and non-terminal jobs exist.
         sky.exceptions.ClusterNotUpError: If the controller is not accessible.
@@ -59,7 +58,8 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     job_table_payload = output_parts[1]
 
     # Process locally: check version match and filter non-terminal jobs
-    version_matches = controller_version == local_version
+    version_matches = (controller_version == local_version or
+                       int(controller_version) > 17)
 
     # Load and filter jobs locally using existing method
     jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
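
The relaxed check means any controller reporting a numeric version above 17 is treated as compatible even when it differs from the local version. A sketch of the semantics; note that controller_version is a string here, and int() will raise ValueError for non-numeric version strings, just as in the shipped check:

def version_matches(controller_version: str, local_version: str) -> bool:
    # Exact match, or any controller version newer than 17, passes.
    return (controller_version == local_version or
            int(controller_version) > 17)

assert version_matches('18', '20')   # newer-than-17 controllers pass
assert version_matches('17', '17')   # exact match still passes
assert not version_matches('16', '17')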