skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -4
- sky/backends/backend_utils.py +35 -1
- sky/backends/cloud_vm_ray_backend.py +2 -2
- sky/client/sdk.py +20 -0
- sky/client/sdk_async.py +18 -16
- sky/clouds/aws.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-1d7e11230da3ca89.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +5 -1
- sky/execution.py +21 -14
- sky/jobs/constants.py +3 -0
- sky/jobs/controller.py +732 -310
- sky/jobs/recovery_strategy.py +251 -129
- sky/jobs/scheduler.py +247 -174
- sky/jobs/server/core.py +20 -4
- sky/jobs/server/utils.py +2 -2
- sky/jobs/state.py +702 -511
- sky/jobs/utils.py +94 -39
- sky/provision/aws/config.py +4 -1
- sky/provision/gcp/config.py +6 -1
- sky/provision/kubernetes/utils.py +17 -8
- sky/provision/provisioner.py +1 -0
- sky/serve/replica_managers.py +0 -7
- sky/serve/serve_utils.py +5 -0
- sky/serve/server/impl.py +1 -2
- sky/serve/service.py +0 -2
- sky/server/common.py +8 -3
- sky/server/config.py +43 -24
- sky/server/constants.py +1 -0
- sky/server/daemons.py +7 -11
- sky/server/requests/serializers/encoders.py +1 -1
- sky/server/server.py +8 -1
- sky/setup_files/dependencies.py +4 -2
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/constants.py +3 -1
- sky/skylet/events.py +2 -10
- sky/utils/command_runner.pyi +3 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/controller_utils.py +5 -0
- sky/utils/db/db_utils.py +31 -2
- sky/utils/rich_utils.py +3 -1
- sky/utils/subprocess_utils.py +9 -0
- sky/volumes/volume.py +2 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +39 -37
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +67 -67
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED

@@ -4,9 +4,11 @@ NOTE: whenever an API change is made in this file, we need to bump the
 jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
 ManagedJobCodeGen.
 """
+import asyncio
 import collections
 import datetime
 import enum
+import logging
 import os
 import pathlib
 import shlex
@@ -14,11 +16,11 @@ import textwrap
 import time
 import traceback
 import typing
-from typing import Any, Deque, Dict, List, Optional, Set, TextIO, Tuple, Union
+from typing import (Any, Deque, Dict, List, Literal, Optional, Set, TextIO,
+                    Tuple, Union)
 
 import colorama
 import filelock
-from typing_extensions import Literal
 
 from sky import backends
 from sky import exceptions
@@ -37,6 +39,7 @@ from sky.usage import usage_lib
 from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import infra_utils
 from sky.utils import log_utils
@@ -56,9 +59,9 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
-SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
 # Controller checks its job's status every this many seconds.
-JOB_STATUS_CHECK_GAP_SECONDS =
+# This is a tradeoff between the latency and the resource usage.
+JOB_STATUS_CHECK_GAP_SECONDS = 15
 
 # Controller checks if its job has started every this many seconds.
 JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
@@ -82,7 +85,7 @@ _JOB_CANCELLED_MESSAGE = (
 # blocking for a long time. This should be significantly longer than the
 # JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
 # update the state.
-_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS =
+_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
 
 
 class ManagedJobQueueResultType(enum.Enum):
@@ -99,7 +102,11 @@ class UserSignal(enum.Enum):
 
 
 # ====== internal functions ======
-def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
+def terminate_cluster(
+    cluster_name: str,
+    max_retry: int = 6,
+    _logger: logging.Logger = logger,  # pylint: disable=invalid-name
+) -> None:
     """Terminate the cluster."""
     from sky import core  # pylint: disable=import-outside-toplevel
     retry_cnt = 0
@@ -122,18 +129,18 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
             return
         except exceptions.ClusterDoesNotExist:
             # The cluster is already down.
-            logger.debug(f'The cluster {cluster_name} is already down.')
+            _logger.debug(f'The cluster {cluster_name} is already down.')
             return
         except Exception as e:  # pylint: disable=broad-except
             retry_cnt += 1
             if retry_cnt >= max_retry:
                 raise RuntimeError(
                     f'Failed to terminate the cluster {cluster_name}.') from e
-            logger.error(
+            _logger.error(
                 f'Failed to terminate the cluster {cluster_name}. Retrying.'
                 f'Details: {common_utils.format_exception(e)}')
             with ux_utils.enable_traceback():
-                logger.error(f'  Traceback: {traceback.format_exc()}')
+                _logger.error(f'  Traceback: {traceback.format_exc()}')
             time.sleep(backoff.current_backoff())
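The new `_logger` parameter lets each job controller thread its own logger through cluster termination, so retry noise lands in that job's log rather than in the shared module log. A minimal sketch of the injectable-logger pattern; `make_job_logger` and the log path are hypothetical, not taken from this diff:

```python
# Sketch: build a per-job logger and pass it to helpers that default to the
# module-level logger. Names here are illustrative only.
import logging


def make_job_logger(job_id: int, log_path: str) -> logging.Logger:
    job_logger = logging.getLogger(f'sky.jobs.controller.{job_id}')
    handler = logging.FileHandler(log_path)
    handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
    job_logger.addHandler(handler)
    job_logger.setLevel(logging.INFO)
    return job_logger


def do_work(_logger: logging.Logger = logging.getLogger(__name__)) -> None:
    # Defaults to the module logger, but callers can inject their own.
    _logger.info('working...')


do_work(make_job_logger(1, '/tmp/job-1.log'))
```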
@@ -183,6 +190,9 @@ def _validate_consolidation_mode_config(
 # Use LRU Cache so that the check is only done once.
 @annotations.lru_cache(scope='request', maxsize=1)
 def is_consolidation_mode() -> bool:
+    if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
+        return True
+
     consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
     # We should only do this check on API server, as the controller will not
@@ -199,6 +209,7 @@ def ha_recovery_for_consolidation_mode():
     # already has all runtime installed. Directly start jobs recovery here.
     # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
     runner = command_runner.LocalProcessCommandRunner()
+    scheduler.maybe_start_controllers()
     with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format('jobs_'),
               'w',
               encoding='utf-8') as f:
@@ -214,7 +225,7 @@ def ha_recovery_for_consolidation_mode():
             # just keep running.
             if controller_pid is not None:
                 try:
-                    if _controller_process_alive(controller_pid, job_id):
+                    if controller_process_alive(controller_pid, job_id):
                         f.write(f'Controller pid {controller_pid} for '
                                 f'job {job_id} is still running. '
                                 'Skipping recovery.\n')
@@ -227,7 +238,7 @@ def ha_recovery_for_consolidation_mode():
 
             if job['schedule_state'] not in [
                     managed_job_state.ManagedJobScheduleState.DONE,
-                    managed_job_state.ManagedJobScheduleState.WAITING
+                    managed_job_state.ManagedJobScheduleState.WAITING,
             ]:
                 script = managed_job_state.get_ha_recovery_script(job_id)
                 if script is None:
@@ -242,56 +253,66 @@ def ha_recovery_for_consolidation_mode():
         f.write(f'Total recovery time: {time.time() - start} seconds\n')
 
 
-def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
-                   job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
+async def get_job_status(
+        backend: 'backends.CloudVmRayBackend', cluster_name: str,
+        job_id: Optional[int],
+        job_logger: logging.Logger) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
 
     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
     """
-    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    # TODO(luca) make this async
+    handle = await context_utils.to_thread(
+        global_user_state.get_handle_from_cluster_name, cluster_name)
     if handle is None:
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
-        logger.info(f'Cluster {cluster_name} not found.')
+        job_logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
     for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
         try:
-            logger.info('=== Checking the job status... ===')
-            statuses = backend.get_job_status(handle,
-                                              job_ids=job_ids,
-                                              stream_logs=False)
+            job_logger.info('=== Checking the job status... ===')
+            statuses = await context_utils.to_thread(backend.get_job_status,
+                                                     handle,
+                                                     job_ids=job_ids,
+                                                     stream_logs=False)
             status = list(statuses.values())[0]
             if status is None:
-                logger.info('No job found.')
+                job_logger.info('No job found.')
             else:
-                logger.info(f'Job status: {status}')
-                logger.info('=' * 34)
+                job_logger.info(f'Job status: {status}')
+                job_logger.info('=' * 34)
             return status
         except exceptions.CommandError as e:
             # Retry on k8s transient network errors. This is useful when using
             # coreweave which may have transient network issue sometimes.
             if (e.detailed_reason is not None and
                     _JOB_K8S_TRANSIENT_NW_MSG in e.detailed_reason):
-                logger.info('Failed to connect to the cluster. Retrying '
-                            f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
-                logger.info('=' * 34)
-                time.sleep(1)
+                job_logger.info('Failed to connect to the cluster. Retrying '
+                                f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
+                job_logger.info('=' * 34)
+                await asyncio.sleep(1)
             else:
-                logger.info(f'Failed to get job status: {e.detailed_reason}')
-                logger.info('=' * 34)
+                job_logger.info(
+                    f'Failed to get job status: {e.detailed_reason}')
+                job_logger.info('=' * 34)
                 return None
     return None
 
 
-def _controller_process_alive(pid: int, job_id: int) -> bool:
+def controller_process_alive(pid: int, job_id: int) -> bool:
     """Check if the controller process is alive."""
     try:
+        if pid < 0:
+            # new job controller process will always be negative
+            pid = -pid
         process = psutil.Process(pid)
         cmd_str = ' '.join(process.cmdline())
-        return process.is_running() and f'--job-id {job_id}' in cmd_str
+        return process.is_running() and ((f'--job-id {job_id}' in cmd_str) or
+                                         ('controller' in cmd_str))
     except psutil.NoSuchProcess:
         return False
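`get_job_status` is now a coroutine: the blocking state lookup and the `backend.get_job_status` call are pushed onto worker threads via `context_utils.to_thread`, and the retry sleep becomes `await asyncio.sleep(1)`. A sketch of the offloading idea, assuming `context_utils.to_thread` behaves like the standard `asyncio.to_thread`:

```python
# Sketch: blocking status queries run in worker threads so a single event
# loop can poll many jobs concurrently.
import asyncio
import time


def blocking_status_query(cluster_name: str) -> str:
    time.sleep(1)  # stand-in for a slow RPC against the cluster
    return 'RUNNING'


async def poll(cluster_name: str) -> str:
    # The event loop stays responsive while the query runs in a thread.
    return await asyncio.to_thread(blocking_status_query, cluster_name)


async def main() -> None:
    statuses = await asyncio.gather(*(poll(f'cluster-{i}') for i in range(10)))
    print(statuses)  # ten results in about one second, not ten


asyncio.run(main())
```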
@@ -466,7 +487,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
             failure_reason = f'No controller pid set for {schedule_state.value}'
         else:
             logger.debug(f'Checking controller pid {pid}')
-            if _controller_process_alive(pid, job_id):
+            if controller_process_alive(pid, job_id):
                 # The controller is still running, so this job is fine.
                 continue
@@ -604,7 +625,17 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
                     f'Bash:{event_callback},log_path:{log_path},result:{result}')
         logger.info(f'=== END: event callback for {status!r} ===')
 
-    return callback_func
+    try:
+        asyncio.get_running_loop()
+
+        # In async context
+        async def async_callback_func(status: str):
+            return await context_utils.to_thread(callback_func, status)
+
+        return async_callback_func
+    except RuntimeError:
+        # Not in async context
+        return callback_func
 
 
 # ======== user functions ========
@@ -651,16 +682,41 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
             logger.info(f'Job {job_id} is already in terminal state '
                         f'{job_status.value}. Skipped.')
             continue
+        elif job_status == managed_job_state.ManagedJobStatus.PENDING:
+            # the if is a short circuit, this will be atomic.
+            cancelled = managed_job_state.set_pending_cancelled(job_id)
+            if cancelled:
+                cancelled_job_ids.append(job_id)
+                continue
 
         update_managed_jobs_statuses(job_id)
 
+        job_controller_pid = managed_job_state.get_job_controller_pid(job_id)
+        if job_controller_pid is not None and job_controller_pid < 0:
+            # This is a consolidated job controller, so we need to cancel the
+            # job with the controller server API.
+            try:
+                # we create a file as a signal to the controller server
+                signal_file = pathlib.Path(
+                    managed_job_constants.CONSOLIDATED_SIGNAL_PATH, f'{job_id}')
+                signal_file.touch()
+                cancelled_job_ids.append(job_id)
+            except OSError as e:
+                logger.error(f'Failed to cancel job {job_id} '
+                             f'with controller server: {e}')
+                # don't add it to the to be cancelled job ids, since we don't
+                # know for sure yet.
+                continue
+            continue
+
         job_workspace = managed_job_state.get_workspace(job_id)
         if current_workspace is not None and job_workspace != current_workspace:
             wrong_workspace_job_ids.append(job_id)
             continue
 
         # Send the signal to the jobs controller.
-        signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
+        signal_file = (pathlib.Path(
+            managed_job_constants.SIGNAL_FILE_PREFIX.format(job_id)))
         # Filelock is needed to prevent race condition between signal
         # check/removal and signal writing.
         with filelock.FileLock(str(signal_file) + '.lock'):
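Consolidated controllers are recorded with a negative pid, and cancelling such a job is done by touching a per-job signal file that the controller server watches, rather than signalling a process. A hypothetical sketch of both halves of that protocol (the directory name is made up; the real path is `managed_job_constants.CONSOLIDATED_SIGNAL_PATH`):

```python
# Sketch: the client touches <signal_dir>/<job_id>; the controller polls for
# the file and consumes it. The path below is illustrative only.
import pathlib

SIGNAL_DIR = pathlib.Path('/tmp/managed_jobs_signals')  # made-up path


def request_cancel(job_id: int) -> None:
    SIGNAL_DIR.mkdir(parents=True, exist_ok=True)
    (SIGNAL_DIR / str(job_id)).touch()


def check_cancelled(job_id: int) -> bool:
    signal_file = SIGNAL_DIR / str(job_id)
    if signal_file.exists():
        signal_file.unlink()  # consume the signal exactly once
        return True
    return False
```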
@@ -1159,8 +1215,7 @@ def dump_managed_job_queue(
                 # It's possible for a WAITING/ALIVE_WAITING job to be ready to
                 # launch, but the scheduler just hasn't run yet.
                 managed_job_state.ManagedJobScheduleState.WAITING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING
-        ):
+                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
             # This job will not block others.
             continue
@@ -1370,12 +1425,12 @@ def load_managed_job_queue(
     """Load job queue from json string."""
     result = message_utils.decode_payload(payload)
     result_type = ManagedJobQueueResultType.DICT
-    status_counts = {}
+    status_counts: Dict[str, int] = {}
     if isinstance(result, dict):
-        jobs = result['jobs']
-        total = result['total']
+        jobs: List[Dict[str, Any]] = result['jobs']
+        total: int = result['total']
         status_counts = result.get('status_counts', {})
-        total_no_filter = result.get('total_no_filter', total)
+        total_no_filter: int = result.get('total_no_filter', total)
     else:
         jobs = result
         total = len(jobs)
sky/provision/aws/config.py
CHANGED

@@ -305,7 +305,10 @@ def _get_route_tables(ec2: 'mypy_boto3_ec2.ServiceResource',
     Returns:
         A list of route tables associated with the options VPC and region
     """
-    filters = [{'Name': 'association.main', 'Values': [str(main).lower()]}]
+    filters: List['ec2_type_defs.FilterTypeDef'] = [{
+        'Name': 'association.main',
+        'Values': [str(main).lower()],
+    }]
     if vpc_id is not None:
         filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
     logger.debug(
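The `filters` list gains a `FilterTypeDef` annotation from the mypy-boto3-ec2 stubs, so type checkers can validate the filter shape before it ever reaches EC2. A sketch of how such typed filters are used, assuming boto3 and the stubs are installed; at runtime the annotation is just a list of plain dicts:

```python
# Sketch: typed EC2 filters with the mypy-boto3-ec2 stubs.
from typing import TYPE_CHECKING, List

import boto3

if TYPE_CHECKING:
    from mypy_boto3_ec2.type_defs import FilterTypeDef


def main_route_tables(region: str, vpc_id: str):
    ec2 = boto3.client('ec2', region_name=region)
    filters: List['FilterTypeDef'] = [
        {'Name': 'association.main', 'Values': ['true']},
        {'Name': 'vpc-id', 'Values': [vpc_id]},
    ]
    # describe_route_tables takes the same Filters shape as most EC2 calls.
    return ec2.describe_route_tables(Filters=filters)['RouteTables']
```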
sky/provision/gcp/config.py
CHANGED

@@ -5,6 +5,8 @@ import time
 import typing
 from typing import Any, Dict, List, Set, Tuple
 
+from typing_extensions import TypedDict
+
 from sky.adaptors import gcp
 from sky.clouds.utils import gcp_utils
 from sky.provision import common

@@ -415,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     return iam_role
 
 
+AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
+
+
 def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
                           compute):
     """Check if the firewall rules in the VPC are sufficient."""

@@ -466,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
     }
     """
     source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
-    source2allowed_list: Dict[Tuple[str, str], List[
+    source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
     for rule in rules:
         # Rules applied to specific VM (targetTags) may not work for the
         # current VM, so should be skipped.
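`AllowedList` uses the functional `TypedDict` form to describe one entry of a GCP firewall rule's `allowed` list. A small sketch of what that declaration buys; the class-based form would work too since both keys are valid identifiers, but the functional form keeps the declaration to one line:

```python
# Sketch: the checker can now verify dict literals shaped like a GCP
# firewall rule's `allowed` entries.
from typing import List

from typing_extensions import TypedDict

AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})

rule: AllowedList = {'IPProtocol': 'tcp', 'ports': ['22', '443']}
# rule_bad: AllowedList = {'IPProtocol': 'tcp', 'ports': 22}   # mypy error
```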
sky/provision/kubernetes/utils.py
CHANGED

@@ -451,6 +451,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     LABEL_KEY = 'gpu.nvidia.com/class'
 
+    # TODO (kyuds): fill in more label values for different accelerators.
+    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
+
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY

@@ -469,7 +472,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
-        return value
+        # return original label value if not found in mappings.
+        return cls.ACC_VALUE_MAPPINGS.get(value, value)
 
 
 class GKELabelFormatter(GPULabelFormatter):

@@ -1012,15 +1016,16 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
+            raw_value = accelerator['acceleratorType']
             node_accelerator_type = (
-                GKELabelFormatter.get_accelerator_from_label_value(
-                    accelerator['acceleratorType']))
+                GKELabelFormatter.get_accelerator_from_label_value(raw_value))
             # handle heterogenous nodes.
             if not node_accelerator_type:
                 continue
             node_accelerator_count = accelerator['acceleratorCount']
-            if (requested_gpu_type.lower() == node_accelerator_type.lower() and
-                    int(node_accelerator_count) >= requested_gpu_count):
+            viable_names = [node_accelerator_type.lower(), raw_value.lower()]
+            if (requested_gpu_type.lower() in viable_names and
+                    int(node_accelerator_count) >= requested_gpu_count):
                 return True
         return False

@@ -1448,9 +1453,13 @@ def get_accelerator_label_key_values(
                 if is_multi_host_tpu(node_metadata_labels):
                     continue
                 for label, value in label_list:
-                    if (label_formatter.match_label_key(label) and
-                            label_formatter.get_accelerator_from_label_value(
-                                value) == acc_type):
+                    if label_formatter.match_label_key(label):
+                        # match either canonicalized name or raw name
+                        accelerator = (label_formatter.
+                                       get_accelerator_from_label_value(value))
+                        viable = [value.lower(), accelerator.lower()]
+                        if acc_type.lower() not in viable:
+                            continue
                         if is_tpu_on_gke(acc_type):
                             assert isinstance(label_formatter,
                                               GKELabelFormatter)
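CoreWeave publishes raw label values like `H100_NVLINK_80GB`, which `ACC_VALUE_MAPPINGS` canonicalizes to `H100`; the matching code then accepts either spelling. A standalone sketch of that two-way matching:

```python
# Sketch: a request matches a node if it names either the canonical
# accelerator or the raw label value.
ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}


def canonicalize(value: str) -> str:
    # Fall back to the original label value when no mapping exists.
    return ACC_VALUE_MAPPINGS.get(value, value)


def matches(requested: str, raw_label_value: str) -> bool:
    viable = {raw_label_value.lower(), canonicalize(raw_label_value).lower()}
    return requested.lower() in viable


assert matches('H100', 'H100_NVLINK_80GB')
assert matches('h100_nvlink_80gb', 'H100_NVLINK_80GB')
```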
sky/provision/provisioner.py
CHANGED
sky/serve/replica_managers.py
CHANGED

@@ -22,7 +22,6 @@ from sky import global_user_state
 from sky import sky_logging
 from sky import task as task_lib
 from sky.backends import backend_utils
-from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils

@@ -1052,7 +1051,6 @@ class SkyPilotReplicaManager(ReplicaManager):
                 self._service_name, replica_id)
             assert info is not None, replica_id
             error_in_sky_launch = False
-            schedule_next_jobs = False
             if info.status == serve_state.ReplicaStatus.PENDING:
                 # sky.launch not started yet
                 if controller_utils.can_provision():

@@ -1080,7 +1078,6 @@ class SkyPilotReplicaManager(ReplicaManager):
                 else:
                     info.status_property.sky_launch_status = (
                         common_utils.ProcessStatus.SUCCEEDED)
-                    schedule_next_jobs = True
             if self._spot_placer is not None and info.is_spot:
                 # TODO(tian): Currently, we set the location to
                 # preemptive if the launch process failed. This is

@@ -1100,16 +1097,12 @@ class SkyPilotReplicaManager(ReplicaManager):
                     self._spot_placer.set_active(location)
             serve_state.add_or_update_replica(self._service_name,
                                               replica_id, info)
-            if schedule_next_jobs and self._is_pool:
-                jobs_scheduler.maybe_schedule_next_jobs()
             if error_in_sky_launch:
                 # Teardown after update replica info since
                 # _terminate_replica will update the replica info too.
                 self._terminate_replica(replica_id,
                                         sync_down_logs=True,
                                         replica_drain_delay_seconds=0)
-            # Try schedule next job after acquiring the lock.
-            jobs_scheduler.maybe_schedule_next_jobs()
         down_process_pool_snapshot = list(self._down_process_pool.items())
         for replica_id, p in down_process_pool_snapshot:
             if p.is_alive():
sky/serve/serve_utils.py
CHANGED

@@ -294,6 +294,11 @@ def is_consolidation_mode(pool: bool = False) -> bool:
     # We should only do this check on API server, as the controller will not
     # have related config and will always seemingly disabled for consolidation
     # mode. Check #6611 for more details.
+    if (os.environ.get(skylet_constants.OVERRIDE_CONSOLIDATION_MODE) is not None
+            and controller.controller_type == 'jobs'):
+        # if we are in the job controller, we must always be in consolidation
+        # mode.
+        return True
     if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
         _validate_consolidation_mode_config(consolidation_mode, pool)
     return consolidation_mode
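Consolidation mode can now be forced on the jobs controller through the `OVERRIDE_CONSOLIDATION_MODE` environment variable, a presence-based flag: only whether it is set matters, not its value. A sketch of that pattern with an illustrative variable name:

```python
# Sketch: presence-based env flag. The variable name here is illustrative,
# not the real constant's value.
import os

OVERRIDE_FLAG = 'SKYPILOT_OVERRIDE_CONSOLIDATION_MODE'  # illustrative


def is_consolidation_forced() -> bool:
    return os.environ.get(OVERRIDE_FLAG) is not None


os.environ[OVERRIDE_FLAG] = ''  # even an empty value counts as "set"
assert is_consolidation_forced()
```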
sky/serve/server/impl.py
CHANGED

@@ -280,8 +280,7 @@ def up(
         ]
         run_script = '\n'.join(env_cmds + [run_script])
         # Dump script for high availability recovery.
-        if controller_utils.high_availability_specified(controller_name):
-            serve_state.set_ha_recovery_script(service_name, run_script)
+        serve_state.set_ha_recovery_script(service_name, run_script)
         backend.run_on_head(controller_handle, run_script)
 
         style = colorama.Style
sky/serve/service.py
CHANGED

@@ -21,7 +21,6 @@ from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.data import data_utils
-from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants
 from sky.serve import controller
 from sky.serve import load_balancer

@@ -278,7 +277,6 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
             pool=service_spec.pool,
             controller_pid=os.getpid(),
             entrypoint=entrypoint)
-        jobs_scheduler.maybe_schedule_next_jobs()
         # Directly throw an error here. See sky/serve/api.py::up
         # for more details.
         if not success:
sky/server/common.py
CHANGED

@@ -538,12 +538,17 @@ def _start_api_server(deploy: bool = False,
 
     # Check available memory before starting the server.
     avail_mem_size_gb: float = common_utils.get_mem_size_gb()
-    if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    if avail_mem_size_gb <= max_memory:
         logger.warning(
             f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
             f'has {avail_mem_size_gb:.1f}GB memory available. '
-            f'At least {server_constants.MIN_AVAIL_MEM_GB}GB is recommended '
-            'to support higher load with better performance.'
+            f'At least {max_memory}GB is recommended to support higher '
+            'load with better performance.'
             f'{colorama.Style.RESET_ALL}')
 
     args = [sys.executable, *API_SERVER_CMD.split()]
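The memory floor now depends on consolidation mode (4GB instead of 2GB), with `sky.jobs.utils` imported inside the function to avoid an import cycle. A sketch of the tiered check, assuming an availability probe in the spirit of `common_utils.get_mem_size_gb` (psutil-based here):

```python
# Sketch: warn when available memory falls below a mode-dependent floor.
import psutil

MIN_AVAIL_MEM_GB = 2
MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE = 4


def warn_if_low_memory(consolidation_mode: bool) -> None:
    avail_gb = psutil.virtual_memory().available / (1024 ** 3)
    floor = (MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
             if consolidation_mode else MIN_AVAIL_MEM_GB)
    if avail_gb <= floor:
        print(f'Only {avail_gb:.1f}GB available; at least {floor}GB is '
              'recommended for better performance.')
```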
sky/server/config.py
CHANGED

@@ -19,8 +19,9 @@ from sky.utils import common_utils
 # TODO(aylei): maintaining these constants is error-prone, we may need to
 # automatically tune parallelism at runtime according to system usage stats
 # in the future.
-_LONG_WORKER_MEM_GB = 0.4
-_SHORT_WORKER_MEM_GB = 0.25
+# TODO(luca): The future is now! ^^^
+LONG_WORKER_MEM_GB = 0.4
+SHORT_WORKER_MEM_GB = 0.25
 # To control the number of long workers.
 _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
 # Limit the number of long workers of local API server, since local server is

@@ -75,8 +76,8 @@ class ServerConfig:
 
 
 def compute_server_config(deploy: bool,
-                          max_db_connections: Optional[int] = None
-                         ) -> ServerConfig:
+                          max_db_connections: Optional[int] = None,
+                          quiet: bool = False) -> ServerConfig:
     """Compute the server config based on environment.
 
     We have different assumptions for the resources in different deployment

@@ -140,7 +141,12 @@ def compute_server_config(deploy: bool,
         burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
     # Runs in low resource mode if the available memory is less than
     # server_constants.MIN_AVAIL_MEM_GB.
-    if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    if not deploy and mem_size_gb < max_memory:
         # Permanent worker process may have significant memory consumption
         # (~350MB per worker) after running commands like `sky check`, so we
         # don't start any permanent workers in low resource local mode. This

@@ -151,25 +157,29 @@ def compute_server_config(deploy: bool,
         # permanently because it never exits.
         max_parallel_for_long = 0
         max_parallel_for_short = 0
-        logger.warning(
-            'SkyPilot API server will run in low resource mode because '
-            'the available memory is less than '
-            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+        if not quiet:
+            logger.warning(
+                'SkyPilot API server will run in low resource mode because '
+                'the available memory is less than '
+                f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
     elif max_db_connections is not None:
         if max_parallel_all_workers > max_db_connections:
-            logger.warning(
-                f'Max parallel all workers ({max_parallel_all_workers}) '
-                'is greater than max db connections '
-                f'({max_db_connections}). Increase the number of max db '
-                'connections for optimal performance.')
+            if not quiet:
+                logger.warning(
+                    f'Max parallel all workers ({max_parallel_all_workers}) '
+                    'is greater than max db connections '
+                    f'({max_db_connections}). Increase the number of max db '
+                    f'connections to at least {max_parallel_all_workers} for '
+                    'optimal performance.')
     else:
         num_db_connections_per_worker = 1
 
-    logger.info(
-        f'SkyPilot API server will start {num_server_workers} server '
-        f'processes with {max_parallel_for_long} background workers for '
-        f'long requests and will allow at max {max_parallel_for_short} '
-        'short requests in parallel.')
+    if not quiet:
+        logger.info(
+            f'SkyPilot API server will start {num_server_workers} server '
+            f'processes with {max_parallel_for_long} background workers for '
+            f'long requests and will allow at max {max_parallel_for_short} '
+            'short requests in parallel.')
     return ServerConfig(
         num_server_workers=num_server_workers,
         queue_backend=queue_backend,

@@ -190,10 +200,15 @@ def _max_long_worker_parallism(cpu_count: int,
                                local=False) -> int:
     """Max parallelism for long workers."""
     # Reserve min available memory to avoid OOM.
-    available_mem = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    available_mem = max(0, mem_size_gb - max_memory)
     cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
     mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
-                                 _LONG_WORKER_MEM_GB)
+                                 LONG_WORKER_MEM_GB)
     n = max(_MIN_LONG_WORKERS,
             min(cpu_based_max_parallel, mem_based_max_parallel))
     if local:

@@ -205,8 +220,12 @@ def _max_short_worker_parallism(mem_size_gb: float,
                                 long_worker_parallism: int) -> int:
     """Max parallelism for short workers."""
     # Reserve memory for long workers and min available memory.
-    reserved_mem = (server_constants.MIN_AVAIL_MEM_GB +
-                    long_worker_parallism * _LONG_WORKER_MEM_GB)
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
     available_mem = max(0, mem_size_gb - reserved_mem)
-    n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
+    n = max(_MIN_SHORT_WORKERS, int(available_mem / SHORT_WORKER_MEM_GB))
     return n
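The renamed `LONG_WORKER_MEM_GB`/`SHORT_WORKER_MEM_GB` constants drive the sizing formulas above: long workers are capped by both `cpu_count * 2` and a memory budget divided by 0.4GB, and short workers get whatever memory remains divided by 0.25GB. A worked sketch; the minima and the blocking-memory percentage are placeholders, since this diff does not show their values:

```python
# Sketch of the worker-sizing math. Placeholders are marked; the per-worker
# memory figures and the CPU multiplier come from the diff itself.
MIN_LONG_WORKERS = 1                 # placeholder
MIN_SHORT_WORKERS = 2                # placeholder
MAX_MEM_PERCENT_FOR_BLOCKING = 0.5   # placeholder
CPU_MULTIPLIER_FOR_LONG_WORKERS = 2  # from the diff
LONG_WORKER_MEM_GB = 0.4             # from the diff
SHORT_WORKER_MEM_GB = 0.25           # from the diff
MIN_AVAIL_MEM_GB = 2                 # from sky/server/constants.py


def worker_counts(cpu_count: int, mem_size_gb: float):
    available = max(0, mem_size_gb - MIN_AVAIL_MEM_GB)
    long_n = max(MIN_LONG_WORKERS,
                 min(cpu_count * CPU_MULTIPLIER_FOR_LONG_WORKERS,
                     int(available * MAX_MEM_PERCENT_FOR_BLOCKING /
                         LONG_WORKER_MEM_GB)))
    # Short workers divide whatever memory the long workers leave behind.
    reserved = MIN_AVAIL_MEM_GB + long_n * LONG_WORKER_MEM_GB
    short_n = max(MIN_SHORT_WORKERS,
                  int(max(0, mem_size_gb - reserved) / SHORT_WORKER_MEM_GB))
    return long_n, short_n


# e.g. 8 CPUs and 16GB: 14GB budget -> long capped at min(16, 17) = 16
print(worker_counts(8, 16.0))  # (16, 30)
```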
sky/server/constants.py
CHANGED

@@ -34,6 +34,7 @@ VERSION_HEADER = 'X-SkyPilot-Version'
 REQUEST_NAME_PREFIX = 'sky.'
 # The memory (GB) that SkyPilot tries to not use to prevent OOM.
 MIN_AVAIL_MEM_GB = 2
+MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE = 4
 # Default encoder/decoder handler name.
 DEFAULT_HANDLER_NAME = 'default'
 # The path to the API request database.