skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

This diff reflects the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (68)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +125 -22
  3. sky/backends/cloud_vm_ray_backend.py +224 -72
  4. sky/catalog/__init__.py +7 -0
  5. sky/catalog/aws_catalog.py +4 -0
  6. sky/catalog/common.py +18 -0
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +2 -71
  9. sky/client/sdk_async.py +5 -2
  10. sky/clouds/aws.py +23 -5
  11. sky/clouds/cloud.py +8 -0
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
  14. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
  15. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  16. sky/dashboard/out/clusters/[cluster].html +1 -1
  17. sky/dashboard/out/clusters.html +1 -1
  18. sky/dashboard/out/config.html +1 -1
  19. sky/dashboard/out/index.html +1 -1
  20. sky/dashboard/out/infra/[context].html +1 -1
  21. sky/dashboard/out/infra.html +1 -1
  22. sky/dashboard/out/jobs/[job].html +1 -1
  23. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/volumes.html +1 -1
  27. sky/dashboard/out/workspace/new.html +1 -1
  28. sky/dashboard/out/workspaces/[name].html +1 -1
  29. sky/dashboard/out/workspaces.html +1 -1
  30. sky/global_user_state.py +34 -0
  31. sky/jobs/client/sdk_async.py +4 -2
  32. sky/jobs/controller.py +4 -2
  33. sky/jobs/recovery_strategy.py +1 -1
  34. sky/jobs/state.py +26 -16
  35. sky/jobs/utils.py +6 -11
  36. sky/logs/agent.py +10 -2
  37. sky/provision/kubernetes/config.py +7 -2
  38. sky/provision/kubernetes/instance.py +84 -41
  39. sky/provision/vast/instance.py +1 -1
  40. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  41. sky/server/config.py +14 -5
  42. sky/server/metrics.py +41 -8
  43. sky/server/requests/executor.py +41 -4
  44. sky/server/server.py +1 -0
  45. sky/server/uvicorn.py +11 -5
  46. sky/skylet/constants.py +12 -7
  47. sky/skylet/log_lib.py +11 -0
  48. sky/skylet/log_lib.pyi +9 -0
  49. sky/task.py +62 -0
  50. sky/templates/kubernetes-ray.yml.j2 +120 -3
  51. sky/utils/accelerator_registry.py +3 -1
  52. sky/utils/command_runner.py +35 -11
  53. sky/utils/command_runner.pyi +22 -0
  54. sky/utils/context_utils.py +15 -2
  55. sky/utils/db/migration_utils.py +1 -1
  56. sky/utils/git.py +559 -1
  57. sky/utils/resource_checker.py +8 -7
  58. sky/workspaces/core.py +57 -21
  59. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +33 -33
  60. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +66 -66
  61. sky/client/cli/git.py +0 -549
  62. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  63. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
  64. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
  65. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
  66. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
  67. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
  68. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspaces/[name].html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1d7e11230da3ca89.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1836-37fede578e2da5f8.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-0487dfbf149d9e53.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-408ed10b2f9fce17.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-86cabed5d4669ad0.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-af76bb06dbb3954f.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"3SYxqNGnvvPS8h3gdD2T7","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e8a0c4c3c6f408fb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1836-37fede578e2da5f8.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-0487dfbf149d9e53.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-408ed10b2f9fce17.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-86cabed5d4669ad0.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-af76bb06dbb3954f.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"DAiq7V2xJnO1LSfmunZl6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1d7e11230da3ca89.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7598c33a746cdc91.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"3SYxqNGnvvPS8h3gdD2T7","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e8a0c4c3c6f408fb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7598c33a746cdc91.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"DAiq7V2xJnO1LSfmunZl6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py CHANGED
@@ -118,6 +118,9 @@ cluster_table = sqlalchemy.Table(
      sqlalchemy.Column('provision_log_path',
                        sqlalchemy.Text,
                        server_default=None),
+     sqlalchemy.Column('skylet_ssh_tunnel_metadata',
+                       sqlalchemy.LargeBinary,
+                       server_default=None),
  )
 
  storage_table = sqlalchemy.Table(
@@ -1170,6 +1173,37 @@ def set_cluster_storage_mounts_metadata(
          raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+ @_init_db
+ @metrics_lib.time_me
+ def get_cluster_skylet_ssh_tunnel_metadata(
+         cluster_name: str) -> Optional[Tuple[int, int]]:
+     assert _SQLALCHEMY_ENGINE is not None
+     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+         row = session.query(cluster_table).filter_by(name=cluster_name).first()
+         if row is None or row.skylet_ssh_tunnel_metadata is None:
+             return None
+         return pickle.loads(row.skylet_ssh_tunnel_metadata)
+
+
+ @_init_db
+ @metrics_lib.time_me
+ def set_cluster_skylet_ssh_tunnel_metadata(
+         cluster_name: str,
+         skylet_ssh_tunnel_metadata: Optional[Tuple[int, int]]) -> None:
+     assert _SQLALCHEMY_ENGINE is not None
+     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+         value = pickle.dumps(
+             skylet_ssh_tunnel_metadata
+         ) if skylet_ssh_tunnel_metadata is not None else None
+         count = session.query(cluster_table).filter_by(
+             name=cluster_name).update(
+                 {cluster_table.c.skylet_ssh_tunnel_metadata: value})
+         session.commit()
+         assert count <= 1, count
+         if count == 0:
+             raise ValueError(f'Cluster {cluster_name} not found.')
+
+
  @_init_db
  @metrics_lib.time_me
  def _get_cluster_usage_intervals(
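
For context, a minimal sketch (not part of the package) of how the two new helpers above could be exercised. Treating the (int, int) tuple as a (port, pid) pair for the skylet SSH tunnel is an assumption here; the diff does not spell out its meaning.

    from sky import global_user_state

    # Record tunnel metadata for a cluster; the helper pickles the tuple into
    # the new skylet_ssh_tunnel_metadata LargeBinary column.
    global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
        'my-cluster', (10022, 4321))  # hypothetical (port, pid)

    # Read it back; returns None when no metadata has been stored.
    meta = global_user_state.get_cluster_skylet_ssh_tunnel_metadata('my-cluster')
    print(meta)  # -> (10022, 4321)

    # Passing None stores SQL NULL rather than a pickled None; an unknown
    # cluster name raises ValueError.
    global_user_state.set_cluster_skylet_ssh_tunnel_metadata('my-cluster', None)
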
sky/jobs/client/sdk_async.py CHANGED
@@ -28,6 +28,8 @@ logger = sky_logging.init_logger(__name__)
  async def launch(
      task: Union['sky.Task', 'sky.Dag'],
      name: Optional[str] = None,
+     pool: Optional[str] = None,
+     num_jobs: Optional[int] = None,
      # Internal only:
      # pylint: disable=invalid-name
      _need_confirmation: bool = False,
@@ -35,8 +37,8 @@ async def launch(
          sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG,
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
      """Async version of launch() that launches a managed job."""
-     request_id = await context_utils.to_thread(sdk.launch, task, name,
-                                                _need_confirmation)
+     request_id = await context_utils.to_thread(sdk.launch, task, name, pool,
+                                                num_jobs, _need_confirmation)
      if stream_logs is not None:
          return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
      else:
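
A hedged usage sketch of the widened async signature; the pool name and the exact semantics of pool/num_jobs are assumptions here, mirroring the synchronous managed-jobs launch.

    import asyncio

    import sky
    from sky.jobs.client import sdk_async as jobs_sdk_async

    async def main():
        task = sky.Task(run='echo hello')
        # New in this release: pool and num_jobs are forwarded to sdk.launch.
        job_id, handle = await jobs_sdk_async.launch(
            task, name='demo', pool='my-pool', num_jobs=2)  # 'my-pool' is hypothetical
        print(job_id, handle)

    asyncio.run(main())
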
sky/jobs/controller.py CHANGED
@@ -781,7 +781,7 @@ class JobsController:
  class Controller:
      """Controller for managing jobs."""
 
-     def __init__(self):
+     def __init__(self) -> None:
          # Global state for active jobs
          self.job_tasks: Dict[int, asyncio.Task] = {}
          self.starting: Set[int] = set()
@@ -984,12 +984,14 @@ class Controller:
                  job_logger.info(
                      f'Cluster of managed job {job_id} has been cleaned up.')
              except Exception as e:  # pylint: disable=broad-except
+                 failure_reason = ('Failed to clean up: '
+                                   f'{common_utils.format_exception(e)}')
                  await managed_job_state.set_failed_async(
                      job_id,
                      task_id=None,
                      failure_type=managed_job_state.ManagedJobStatus.
                      FAILED_CONTROLLER,
-                     failure_reason=e,
+                     failure_reason=failure_reason,
                      override_terminal=True)
 
          if cancelling:
sky/jobs/recovery_strategy.py CHANGED
@@ -543,7 +543,7 @@ class StrategyExecutor:
 
              except exceptions.NoClusterLaunchedError:
                  # Update the status to PENDING during backoff.
-                 state.set_backoff_pending_async(self.job_id, self.task_id)
+                 await state.set_backoff_pending_async(self.job_id, self.task_id)
                  # Calculate the backoff time and sleep.
                  gap_seconds = (backoff.current_backoff()
                                 if self.pool is None else 1)
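
The one-line fix above adds a missing await. A small self-contained illustration (toy stand-in, not the real StrategyExecutor) of why that matters:

    import asyncio

    async def set_backoff_pending_async(job_id: int, task_id: int) -> None:
        # Toy stand-in for the real state update.
        print(f'job {job_id}, task {task_id} marked PENDING')

    async def without_await():
        # Creates a coroutine object that never runs; Python warns
        # "coroutine ... was never awaited" and the state never changes.
        set_backoff_pending_async(1, 0)

    async def with_await():
        await set_backoff_pending_async(1, 0)  # the update actually executes

    asyncio.run(with_await())
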
sky/jobs/state.py CHANGED
@@ -238,6 +238,7 @@ def _init_db_async(func):
                  last_exc = e
                  logger.debug(f'DB error: {last_exc}')
                  await asyncio.sleep(backoff.current_backoff())
+         assert last_exc is not None
          raise last_exc
 
      return wrapper
@@ -266,6 +267,7 @@ def _init_db(func):
                  last_exc = e
                  logger.debug(f'DB error: {last_exc}')
                  time.sleep(backoff.current_backoff())
+         assert last_exc is not None
          raise last_exc
 
      return wrapper
@@ -735,16 +737,21 @@ def set_pending_cancelled(job_id: int):
          # Subquery to get the spot_job_ids that match the joined condition
          subquery = session.query(spot_table.c.job_id).join(
              job_info_table,
-             spot_table.c.spot_job_id == job_info_table.c.spot_job_id).filter(
-                 spot_table.c.spot_job_id == job_id,
-                 spot_table.c.status == ManagedJobStatus.PENDING.value,
-                 sqlalchemy.or_(
-                     job_info_table.c.schedule_state ==
-                     ManagedJobScheduleState.WAITING.value,
-                     job_info_table.c.schedule_state ==
-                     ManagedJobScheduleState.INACTIVE.value,
-                 ),
-             ).subquery()
+             spot_table.c.spot_job_id == job_info_table.c.spot_job_id
+         ).filter(
+             spot_table.c.spot_job_id == job_id,
+             spot_table.c.status == ManagedJobStatus.PENDING.value,
+             # Note: it's possible that a WAITING job actually needs to be
+             # cleaned up, if we are in the middle of an upgrade/recovery and
+             # the job is waiting to be reclaimed by a new controller. But,
+             # in this case the status will not be PENDING.
+             sqlalchemy.or_(
+                 job_info_table.c.schedule_state ==
+                 ManagedJobScheduleState.WAITING.value,
+                 job_info_table.c.schedule_state ==
+                 ManagedJobScheduleState.INACTIVE.value,
+             ),
+         ).subquery()
 
          count = session.query(spot_table).filter(
              spot_table.c.job_id.in_(subquery)).update(
@@ -1105,8 +1112,11 @@ async def set_job_id_on_pool_cluster_async(job_id: int,
      """Set the job id on the pool cluster for a job."""
      assert _SQLALCHEMY_ENGINE_ASYNC is not None
      async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
-         await session.execute(job_info_table.c.spot_job_id == job_id).update(
-             {job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster})
+         await session.execute(
+             sqlalchemy.update(job_info_table).
+             where(job_info_table.c.spot_job_id == job_id).values({
+                 job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
+             }))
          await session.commit()
 
 
@@ -1130,12 +1140,12 @@ async def get_pool_submit_info_async(
          job_id: int) -> Tuple[Optional[str], Optional[int]]:
      """Get the cluster name and job id on the pool from the managed job id."""
      assert _SQLALCHEMY_ENGINE_ASYNC is not None
-     async with orm.Session(_SQLALCHEMY_ENGINE_ASYNC) as session:
-         info = await session.execute(
+     async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+         result = await session.execute(
              sqlalchemy.select(job_info_table.c.current_cluster_name,
                                job_info_table.c.job_id_on_pool_cluster).where(
-                                   job_info_table.c.spot_job_id == job_id)
-         ).fetchone()
+                                   job_info_table.c.spot_job_id == job_id))
+         info = result.fetchone()
      if info is None:
          return None, None
      return info[0], info[1]
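
The set_job_id_on_pool_cluster_async fix above switches to SQLAlchemy's update()/where()/values() construct executed on an async session. A standalone sketch of that pattern (assumes the aiosqlite driver is installed; the table and values are illustrative):

    import asyncio

    import sqlalchemy
    from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

    metadata = sqlalchemy.MetaData()
    job_info = sqlalchemy.Table(
        'job_info', metadata,
        sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, primary_key=True),
        sqlalchemy.Column('job_id_on_pool_cluster', sqlalchemy.Integer))

    async def main():
        engine = create_async_engine('sqlite+aiosqlite:///:memory:')
        async with engine.begin() as conn:
            await conn.run_sync(metadata.create_all)
            await conn.execute(sqlalchemy.insert(job_info).values(spot_job_id=7))
        async with async_sessionmaker(engine)() as session:
            # The executable statement is built first, then passed to execute();
            # the old code passed a bare column comparison, which is not runnable.
            await session.execute(
                sqlalchemy.update(job_info).where(
                    job_info.c.spot_job_id == 7).values(job_id_on_pool_cluster=3))
            await session.commit()

    asyncio.run(main())
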
sky/jobs/utils.py CHANGED
@@ -586,7 +586,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
          raise
 
 
- def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
+ def event_callback_func(
+         job_id: int, task_id: Optional[int],
+         task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
      """Run event callback for the task."""
 
      def callback_func(status: str):
@@ -625,17 +627,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
              f'Bash:{event_callback},log_path:{log_path},result:{result}')
          logger.info(f'=== END: event callback for {status!r} ===')
 
-     try:
-         asyncio.get_running_loop()
-
-         # In async context
-         async def async_callback_func(status: str):
-             return await context_utils.to_thread(callback_func, status)
+     async def async_callback_func(status: str):
+         return await context_utils.to_thread(callback_func, status)
 
-         return async_callback_func
-     except RuntimeError:
-         # Not in async context
-         return callback_func
+     return async_callback_func
 
 
  # ======== user functions ========
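
event_callback_func now always returns an async wrapper that pushes the blocking callback onto a worker thread. A minimal illustration of the same pattern, using the standard-library asyncio.to_thread in place of context_utils.to_thread:

    import asyncio
    import time

    def callback_func(status: str) -> None:
        time.sleep(0.1)  # stand-in for running the event-callback shell command
        print(f'callback ran for status={status}')

    async def async_callback_func(status: str) -> None:
        # The blocking callback runs in a worker thread, so the controller's
        # event loop stays responsive while it executes.
        return await asyncio.to_thread(callback_func, status)

    asyncio.run(async_callback_func('RUNNING'))
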
sky/logs/agent.py CHANGED
@@ -35,9 +35,17 @@ class FluentbitAgent(LoggingAgent):
                   cluster_name: resources_utils.ClusterName) -> str:
          install_cmd = (
              'if ! command -v fluent-bit >/dev/null 2>&1; then '
-             'sudo apt-get install -y gnupg; '
+             'sudo apt-get update; sudo apt-get install -y gnupg; '
              # pylint: disable=line-too-long
-             'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh; '
+             'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
+             # pylint: disable=line-too-long
+             'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
+             # pylint: disable=line-too-long
+             'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
+             # pylint: disable=line-too-long
+             'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
+             'sudo apt-get update; '
+             'sudo apt-get install -y fluent-bit; '
              'fi')
          cfg = self.fluentbit_config(cluster_name)
          cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
sky/provision/kubernetes/config.py CHANGED
@@ -3,7 +3,7 @@ import copy
  import logging
  import math
  import os
- from typing import Any, Dict, Optional, Union
+ from typing import Any, Dict, List, Optional, Union
 
  from sky.adaptors import kubernetes
  from sky.provision import common
@@ -666,4 +666,9 @@ def _configure_services(namespace: str, context: Optional[str],
 
 
  class KubernetesError(Exception):
-     pass
+
+     def __init__(self,
+                  *args,
+                  insufficent_resources: Optional[List[str]] = None):
+         self.insufficent_resources = insufficent_resources
+         super().__init__(*args)
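
With the change above, callers can inspect which resources were exhausted. A self-contained sketch (the class is re-declared locally here; the attribute name insufficent_resources, spelled as in the diff, is what the package exposes):

    from typing import List, Optional

    class KubernetesError(Exception):

        def __init__(self, *args,
                     insufficent_resources: Optional[List[str]] = None):
            self.insufficent_resources = insufficent_resources
            super().__init__(*args)

    try:
        raise KubernetesError('Pod is pending',
                              insufficent_resources=['CPUs', 'Memory'])
    except KubernetesError as e:
        # Downstream logic (e.g. failover) can branch on the exhausted resources.
        if e.insufficent_resources:
            print('Out of:', ', '.join(e.insufficent_resources))
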
sky/provision/kubernetes/instance.py CHANGED
@@ -3,6 +3,7 @@ import copy
  import datetime
  import json
  import re
+ import sys
  import time
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
@@ -191,14 +192,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                  break
      if event_message is not None:
          if pod_status == 'Pending':
-             logger.info(event_message)
+             out_of = {}
+             # key: resource name, value: (extra message, nice name)
              if 'Insufficient cpu' in event_message:
-                 raise config_lib.KubernetesError(
-                     _lack_resource_msg('CPU', pod, details=event_message))
+                 out_of['CPU'] = (': Run \'kubectl get nodes -o '
+                                  'custom-columns=NAME:.metadata.name,'
+                                  'CPU:.status.allocatable.cpu\' to check '
+                                  'the available CPUs on the node.', 'CPUs')
              if 'Insufficient memory' in event_message:
-                 raise config_lib.KubernetesError(
-                     _lack_resource_msg('memory', pod,
-                                        details=event_message))
+                 out_of['memory'] = (': Run \'kubectl get nodes -o '
+                                     'custom-columns=NAME:.metadata.name,'
+                                     'MEMORY:.status.allocatable.memory\' '
+                                     'to check the available memory on the '
+                                     'node.', 'Memory')
+
              # TODO(aylei): after switching from smarter-device-manager to
              # fusermount-server, we need a new way to check whether the
              # fusermount-server daemonset is ready.
@@ -206,41 +213,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                  key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
                  for key in lf.get_label_keys()
              ]
-             if pod.spec.node_selector:
-                 for label_key in pod.spec.node_selector.keys():
-                     if label_key in gpu_lf_keys:
-                         # TODO(romilb): We may have additional node
-                         # affinity selectors in the future - in that
-                         # case we will need to update this logic.
-                         # TODO(Doyoung): Update the error message raised
-                         # with the multi-host TPU support.
-                         gpu_resource_key = kubernetes_utils.get_gpu_resource_key(context)  # pylint: disable=line-too-long
-                         if 'Insufficient google.com/tpu' in event_message:
-                             extra_msg = (
-                                 f'Verify if '
-                                 f'{pod.spec.node_selector[label_key]}'
-                                 ' is available in the cluster. Note '
-                                 'that multi-host TPU podslices are '
-                                 'currently not unsupported.')
-                             raise config_lib.KubernetesError(
-                                 _lack_resource_msg('TPU',
-                                                    pod,
-                                                    extra_msg,
-                                                    details=event_message))
-                         elif ((f'Insufficient {gpu_resource_key}'
-                                in event_message) or
-                               ('didn\'t match Pod\'s node affinity/selector'
-                                in event_message)):
-                             extra_msg = (
-                                 f'Verify if any node matching label '
-                                 f'{pod.spec.node_selector[label_key]} and '
-                                 f'sufficient resource {gpu_resource_key} '
-                                 f'is available in the cluster.')
-                             raise config_lib.KubernetesError(
-                                 _lack_resource_msg('GPU',
-                                                    pod,
-                                                    extra_msg,
-                                                    details=event_message))
+             for label_key in gpu_lf_keys:
+                 # TODO(romilb): We may have additional node
+                 # affinity selectors in the future - in that
+                 # case we will need to update this logic.
+                 # TODO(Doyoung): Update the error message raised
+                 # with the multi-host TPU support.
+                 gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+                     context)  # pylint: disable=line-too-long
+                 if ((f'Insufficient {gpu_resource_key}' in event_message) or
+                     ('didn\'t match Pod\'s node affinity/selector'
+                      in event_message) and pod.spec.node_selector):
+                     if 'gpu' in gpu_resource_key.lower():
+                         info_msg = (
+                             ': Run \'sky show-gpus --infra kubernetes\' to '
+                             'see the available GPUs.')
+                     else:
+                         info_msg = ': '
+                     if (pod.spec.node_selector and
+                             label_key in pod.spec.node_selector):
+                         extra_msg = (
+                             f'Verify if any node matching label '
+                             f'{pod.spec.node_selector[label_key]} and '
+                             f'sufficient resource {gpu_resource_key} '
+                             f'is available in the cluster.')
+                         extra_msg = info_msg + ' ' + extra_msg
+                     else:
+                         extra_msg = info_msg
+                     if gpu_resource_key not in out_of or len(
+                             out_of[gpu_resource_key][0]) < len(extra_msg):
+                         out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+             if len(out_of) > 0:
+                 # We are out of some resources. We should raise an error.
+                 rsrc_err_msg = 'Insufficient resource capacity on the '
+                 rsrc_err_msg += 'cluster:\n'
+                 out_of_keys = list(out_of.keys())
+                 for i in range(len(out_of_keys)):
+                     rsrc = out_of_keys[i]
+                     (extra_msg, nice_name) = out_of[rsrc]
+                     extra_msg = extra_msg if extra_msg else ''
+                     if i == len(out_of_keys) - 1:
+                         indent = '└──'
+                     else:
+                         indent = '├──'
+                     rsrc_err_msg += (f'{indent} Cluster does not have '
+                                      f'sufficient {nice_name} for your request'
+                                      f'{extra_msg}')
+                     if i != len(out_of_keys) - 1:
+                         rsrc_err_msg += '\n'
+
+                 # Emit the error message without logging prefixes for better UX.
+                 tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+                 tmp_handler.flush = sys.stdout.flush
+                 tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+                 tmp_handler.setLevel(sky_logging.ERROR)
+                 prev_propagate = logger.propagate
+                 try:
+                     logger.addHandler(tmp_handler)
+                     logger.propagate = False
+                     logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+                 finally:
+                     logger.removeHandler(tmp_handler)
+                     logger.propagate = prev_propagate
+                 nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+                 raise config_lib.KubernetesError(
+                     f'{timeout_err_msg} '
+                     f'Pod status: {pod_status} '
+                     f'Details: \'{event_message}\' ',
+                     insufficent_resources=nice_names,
+                 )
+
          raise config_lib.KubernetesError(f'{timeout_err_msg} '
                                           f'Pod status: {pod_status} '
                                           f'Details: \'{event_message}\' ')
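
An illustrative re-creation (toy data, not the package's exact output) of how the new out_of bookkeeping is rendered into the tree-style error message:

    out_of = {
        'CPU': (": Run 'kubectl get nodes ...' to check the available CPUs.",
                'CPUs'),
        'memory': (": Run 'kubectl get nodes ...' to check the available memory.",
                   'Memory'),
    }

    lines = []
    keys = list(out_of)
    for i, rsrc in enumerate(keys):
        extra_msg, nice_name = out_of[rsrc]
        indent = '└──' if i == len(keys) - 1 else '├──'
        lines.append(f'{indent} Cluster does not have sufficient {nice_name} '
                     f'for your request{extra_msg}')
    print('Insufficient resource capacity on the cluster:\n' + '\n'.join(lines))
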
sky/provision/vast/instance.py CHANGED
@@ -39,7 +39,7 @@ def _filter_instances(cluster_name_on_cloud: str,
 
  def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
      for inst_id, inst in instances.items():
-         if inst['name'].endswith('-head'):
+         if inst.get('name') and inst['name'].endswith('-head'):
              return inst_id
      return None
 
sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py ADDED
@@ -0,0 +1,34 @@
+ """Add skylet_ssh_tunnel_metadata to clusters.
+
+ Revision ID: 008
+ Revises: 007
+ Create Date: 2025-09-09
+
+ """
+ # pylint: disable=invalid-name
+ from typing import Sequence, Union
+
+ from alembic import op
+ import sqlalchemy as sa
+
+ from sky.utils.db import db_utils
+
+ # revision identifiers, used by Alembic.
+ revision: str = '008'
+ down_revision: Union[str, Sequence[str], None] = '007'
+ branch_labels: Union[str, Sequence[str], None] = None
+ depends_on: Union[str, Sequence[str], None] = None
+
+
+ def upgrade():
+     """Add skylet_ssh_tunnel_metadata column to clusters."""
+     with op.get_context().autocommit_block():
+         db_utils.add_column_to_table_alembic('clusters',
+                                              'skylet_ssh_tunnel_metadata',
+                                              sa.LargeBinary(),
+                                              server_default=None)
+
+
+ def downgrade():
+     """No-op for backward compatibility."""
+     pass
sky/server/config.py CHANGED
@@ -6,6 +6,7 @@ from typing import Optional
 
  from sky import sky_logging
  from sky.server import constants as server_constants
+ from sky.server import daemons
  from sky.utils import common_utils
 
  # Constants based on profiling the peak memory usage while serving various
@@ -21,7 +22,7 @@ from sky.utils import common_utils
  # in the future.
  # TODO(luca): The future is now! ^^^
  LONG_WORKER_MEM_GB = 0.4
- SHORT_WORKER_MEM_GB = 0.25
+ SHORT_WORKER_MEM_GB = 0.3
  # To control the number of long workers.
  _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
  # Limit the number of long workers of local API server, since local server is
@@ -36,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
  _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
  # Minimal number of long workers to ensure responsiveness.
  _MIN_LONG_WORKERS = 1
- # Minimal number of short workers, there is a daemon task running on short
- # workers so at least 2 workers are needed to ensure responsiveness.
- _MIN_SHORT_WORKERS = 2
+ # Minimal number of idle short workers to ensure responsiveness.
+ _MIN_IDLE_SHORT_WORKERS = 1
 
  # Default number of burstable workers for local API server. A heuristic number
  # that is large enough for most local cases.
@@ -216,6 +216,15 @@ def _max_long_worker_parallism(cpu_count: int,
      return n
 
 
+ def _get_min_short_workers() -> int:
+     """Min number of short workers."""
+     daemon_count = 0
+     for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
+         if not daemon.should_skip():
+             daemon_count += 1
+     return _MIN_IDLE_SHORT_WORKERS + daemon_count
+
+
  def _max_short_worker_parallism(mem_size_gb: float,
                                  long_worker_parallism: int) -> int:
      """Max parallelism for short workers."""
@@ -227,5 +236,5 @@ def _max_short_worker_parallism(mem_size_gb: float,
                    server_constants.MIN_AVAIL_MEM_GB)
      reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
      available_mem = max(0, mem_size_gb - reserved_mem)
-     n = max(_MIN_SHORT_WORKERS, int(available_mem / SHORT_WORKER_MEM_GB))
+     n = max(_get_min_short_workers(), int(available_mem / SHORT_WORKER_MEM_GB))
      return n
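
A rough sketch of the revised short-worker sizing with illustrative numbers; reserved_gb stands in for the max_memory term computed elsewhere in config.py, and the daemon count depends on which internal daemons are enabled:

    LONG_WORKER_MEM_GB = 0.4
    SHORT_WORKER_MEM_GB = 0.3
    MIN_IDLE_SHORT_WORKERS = 1

    def max_short_workers(mem_size_gb: float, long_workers: int,
                          active_daemons: int, reserved_gb: float) -> int:
        reserved_mem = reserved_gb + long_workers * LONG_WORKER_MEM_GB
        available_mem = max(0, mem_size_gb - reserved_mem)
        min_short = MIN_IDLE_SHORT_WORKERS + active_daemons
        return max(min_short, int(available_mem / SHORT_WORKER_MEM_GB))

    # e.g. 8 GB of memory, 4 long workers, 3 active daemons, ~2 GB otherwise reserved:
    print(max_short_workers(8.0, 4, 3, 2.0))  # -> 14 short workers
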
sky/server/metrics.py CHANGED
@@ -4,6 +4,7 @@ import contextlib
  import functools
  import multiprocessing
  import os
+ import threading
  import time
 
  import fastapi
@@ -21,6 +22,24 @@ from sky.skylet import constants
  METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
                                   'false').lower() == 'true'
 
+ _KB = 2**10
+ _MB = 2**20
+ _MEM_BUCKETS = [
+     _KB,
+     256 * _KB,
+     512 * _KB,
+     _MB,
+     2 * _MB,
+     4 * _MB,
+     8 * _MB,
+     16 * _MB,
+     32 * _MB,
+     64 * _MB,
+     128 * _MB,
+     256 * _MB,
+     float('inf'),
+ ]
+
  logger = sky_logging.init_logger(__name__)
 
  # Total number of API server requests, grouped by path, method, and status.
@@ -92,6 +111,16 @@ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
      ['pid', 'type', 'mode'],
  )
 
+ SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
+     'sky_apiserver_request_memory_usage_bytes',
+     'Peak memory usage of requests', ['name'],
+     buckets=_MEM_BUCKETS)
+
+ SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
+     'sky_apiserver_request_rss_incr_bytes',
+     'RSS increment after requests', ['name'],
+     buckets=_MEM_BUCKETS)
+
  metrics_app = fastapi.FastAPI()
 
 
@@ -208,19 +237,23 @@ def time_me_async(func):
      return async_wrapper
 
 
- def process_monitor(process_type: str):
+ peak_rss_bytes = 0
+
+
+ def process_monitor(process_type: str, stop: threading.Event):
      pid = multiprocessing.current_process().pid
      proc = psutil.Process(pid)
-     peak_rss = 0
      last_bucket_end = time.time()
-     while True:
+     bucket_peak = 0
+     global peak_rss_bytes
+     while not stop.is_set():
          if time.time() - last_bucket_end >= 30:
-             # Reset peak RSS every 30 seconds.
+             # Reset peak RSS for the next time bucket.
              last_bucket_end = time.time()
-             peak_rss = 0
-         peak_rss = max(peak_rss, proc.memory_info().rss)
-         SKY_APISERVER_PROCESS_PEAK_RSS.labels(pid=pid,
-                                               type=process_type).set(peak_rss)
+             bucket_peak = 0
+         peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
+         SKY_APISERVER_PROCESS_PEAK_RSS.labels(
+             pid=pid, type=process_type).set(peak_rss_bytes)
          ctimes = proc.cpu_times()
          SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
                                                 type=process_type,