skypilot-nightly 1.0.0.dev20250827__py3-none-any.whl → 1.0.0.dev20250829__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

The registry has flagged this release of skypilot-nightly as potentially problematic.

Files changed (86)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +11 -10
  3. sky/authentication.py +1 -1
  4. sky/backends/backend.py +3 -5
  5. sky/backends/backend_utils.py +140 -52
  6. sky/backends/cloud_vm_ray_backend.py +30 -25
  7. sky/backends/local_docker_backend.py +3 -8
  8. sky/backends/wheel_utils.py +35 -8
  9. sky/client/cli/command.py +41 -9
  10. sky/client/sdk.py +23 -8
  11. sky/client/sdk_async.py +6 -2
  12. sky/clouds/aws.py +118 -1
  13. sky/core.py +1 -4
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  16. sky/dashboard/out/clusters/[cluster].html +1 -1
  17. sky/dashboard/out/clusters.html +1 -1
  18. sky/dashboard/out/config.html +1 -1
  19. sky/dashboard/out/index.html +1 -1
  20. sky/dashboard/out/infra/[context].html +1 -1
  21. sky/dashboard/out/infra.html +1 -1
  22. sky/dashboard/out/jobs/[job].html +1 -1
  23. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/volumes.html +1 -1
  27. sky/dashboard/out/workspace/new.html +1 -1
  28. sky/dashboard/out/workspaces/[name].html +1 -1
  29. sky/dashboard/out/workspaces.html +1 -1
  30. sky/global_user_state.py +82 -22
  31. sky/jobs/client/sdk.py +5 -2
  32. sky/jobs/recovery_strategy.py +9 -4
  33. sky/jobs/server/server.py +2 -1
  34. sky/logs/agent.py +2 -2
  35. sky/logs/aws.py +6 -3
  36. sky/provision/aws/config.py +78 -3
  37. sky/provision/aws/instance.py +45 -6
  38. sky/provision/do/utils.py +2 -1
  39. sky/provision/kubernetes/instance.py +55 -11
  40. sky/provision/kubernetes/utils.py +11 -2
  41. sky/provision/nebius/utils.py +36 -2
  42. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  43. sky/serve/client/impl.py +5 -4
  44. sky/serve/replica_managers.py +4 -3
  45. sky/serve/serve_utils.py +2 -2
  46. sky/serve/server/impl.py +3 -2
  47. sky/serve/server/server.py +2 -1
  48. sky/server/auth/oauth2_proxy.py +10 -4
  49. sky/server/common.py +4 -4
  50. sky/server/daemons.py +16 -5
  51. sky/server/requests/executor.py +5 -3
  52. sky/server/requests/payloads.py +3 -1
  53. sky/server/requests/preconditions.py +3 -2
  54. sky/server/requests/requests.py +121 -19
  55. sky/server/server.py +85 -60
  56. sky/server/stream_utils.py +7 -5
  57. sky/setup_files/dependencies.py +6 -1
  58. sky/sky_logging.py +28 -0
  59. sky/skylet/constants.py +6 -0
  60. sky/skylet/events.py +2 -3
  61. sky/skypilot_config.py +10 -10
  62. sky/task.py +1 -1
  63. sky/templates/aws-ray.yml.j2 +1 -0
  64. sky/templates/nebius-ray.yml.j2 +4 -8
  65. sky/usage/usage_lib.py +3 -2
  66. sky/utils/annotations.py +8 -2
  67. sky/utils/cluster_utils.py +3 -3
  68. sky/utils/common_utils.py +0 -72
  69. sky/utils/controller_utils.py +4 -3
  70. sky/utils/dag_utils.py +4 -4
  71. sky/utils/db/db_utils.py +11 -0
  72. sky/utils/db/migration_utils.py +1 -1
  73. sky/utils/kubernetes/config_map_utils.py +3 -3
  74. sky/utils/kubernetes_enums.py +1 -0
  75. sky/utils/lock_events.py +94 -0
  76. sky/utils/schemas.py +3 -0
  77. sky/utils/timeline.py +24 -93
  78. sky/utils/yaml_utils.py +77 -10
  79. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/METADATA +8 -2
  80. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/RECORD +86 -84
  81. /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_buildManifest.js +0 -0
  82. /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_ssgManifest.js +0 -0
  83. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/WHEEL +0 -0
  84. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/entry_points.txt +0 -0
  85. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/licenses/LICENSE +0 -0
  86. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py CHANGED
@@ -1,5 +1,6 @@
 """Kubernetes instance provisioning."""
 import copy
+import datetime
 import json
 import re
 import time
@@ -1254,9 +1255,11 @@ def get_cluster_info(
                                             provider_config=provider_config)
 
 
-def _get_pod_termination_reason(pod: Any) -> str:
+def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
+    """Get pod termination reason and write to cluster events."""
     reasons = []
-    if pod.status.container_statuses:
+    latest_timestamp = pod.status.start_time or datetime.datetime.min
+    if pod.status and pod.status.container_statuses:
         for container_status in pod.status.container_statuses:
             terminated = container_status.state.terminated
             if terminated:
@@ -1264,20 +1267,38 @@ def _get_pod_termination_reason(pod: Any) -> str:
                 reason = terminated.reason
                 if exit_code == 0:
                     # skip exit 0 (non-failed) just for sanity
+                    logger.debug(f'{pod.metadata.name}/{container_status.name} '
+                                 'had exit code 0. Skipping.')
                     continue
                 if reason is None:
                     # just in-case reason is None, have default for debugging
                     reason = f'exit({exit_code})'
                 reasons.append(reason)
+                if terminated.finished_at > latest_timestamp:
+                    latest_timestamp = terminated.finished_at
+
     # TODO (kyuds): later, if needed, query `last_state` too.
 
+    if not reasons:
+        return ''
+
     # Normally we will have a single container per pod for skypilot
     # but doing this just in-case there are multiple containers.
-    return ' | '.join(reasons)
+    pod_reason = ' | '.join(reasons)
+
+    global_user_state.add_cluster_event(
+        cluster_name,
+        None,
+        f'[kubernetes pod {pod.metadata.name} terminated] {pod_reason}',
+        global_user_state.ClusterEventType.DEBUG,
+        transitioned_at=int(latest_timestamp.timestamp()),
+    )
+    return pod_reason
 
 
 def _get_pod_missing_reason(context: Optional[str], namespace: str,
                             cluster_name: str, pod_name: str) -> Optional[str]:
+    """Get events for missing pod and write to cluster events."""
     logger.debug(f'Analyzing events for pod {pod_name}')
     pod_field_selector = (
         f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
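For orientation, here is a minimal, self-contained sketch of the reason-collection loop introduced above, assuming a V1Pod-shaped object from the official Kubernetes Python client. The helper name and the timezone-aware floor value are illustrative, not SkyPilot's actual code:

    import datetime
    from typing import Any, List

    # Kubernetes timestamps are timezone-aware, so compare against an
    # aware minimum rather than the naive datetime.min.
    _EPOCH = datetime.datetime.min.replace(tzinfo=datetime.timezone.utc)

    def collect_termination_reasons(pod: Any) -> str:
        reasons: List[str] = []
        latest = (pod.status.start_time if pod.status else None) or _EPOCH
        if pod.status and pod.status.container_statuses:
            for cs in pod.status.container_statuses:
                term = cs.state.terminated
                if term is None or term.exit_code == 0:
                    continue  # still running, waiting, or exited cleanly
                # Fall back to the exit code when no reason string is set.
                reasons.append(term.reason or f'exit({term.exit_code})')
                if term.finished_at and term.finished_at > latest:
                    latest = term.finished_at
        # One container per pod is the norm; join defensively anyway.
        return ' | '.join(reasons)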
@@ -1293,6 +1314,8 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
     last_scheduled_node = None
     insert_new_pod_event = True
     new_event_inserted = False
+    inserted_pod_events = 0
+
     for event in pod_events:
         if event.reason == 'Scheduled':
             pattern = r'Successfully assigned (\S+) to (\S+)'
@@ -1313,10 +1336,18 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
                     transitioned_at=int(
                         event.metadata.creation_timestamp.timestamp()),
                     expose_duplicate_error=True)
+                logger.debug(f'[pod {pod_name}] encountered new pod event: '
+                             f'{event.metadata.creation_timestamp} '
+                             f'{event.reason} {event.message}')
             except db_utils.UniqueConstraintViolationError:
                 insert_new_pod_event = False
             else:
                 new_event_inserted = True
+                inserted_pod_events += 1
+
+    logger.debug(f'[pod {pod_name}] processed {len(pod_events)} pod events and '
+                 f'inserted {inserted_pod_events} new pod events '
+                 'previously unseen')
 
     if last_scheduled_node is not None:
         node_field_selector = ('involvedObject.kind=Node,'
@@ -1331,6 +1362,7 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
             # latest event appears first
             reverse=True)
         insert_new_node_event = True
+        inserted_node_events = 0
         for event in node_events:
             if insert_new_node_event:
                 # Try inserting the latest events first. If the event is a
@@ -1345,10 +1377,23 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
                         transitioned_at=int(
                             event.metadata.creation_timestamp.timestamp()),
                         expose_duplicate_error=True)
+                    logger.debug(
+                        f'[pod {pod_name}] encountered new node event: '
+                        f'{event.metadata.creation_timestamp} '
+                        f'{event.reason} {event.message}')
                 except db_utils.UniqueConstraintViolationError:
                     insert_new_node_event = False
                 else:
                     new_event_inserted = True
+                    inserted_node_events += 1
+
+        logger.debug(f'[pod {pod_name}: node {last_scheduled_node}] '
+                     f'processed {len(node_events)} node events and '
+                     f'inserted {inserted_node_events} new node events '
+                     'previously unseen')
+    else:
+        logger.debug(f'[pod {pod_name}] could not determine the node '
+                     'the pod was scheduled to')
 
     if not new_event_inserted:
         # If new event is not inserted, there is no useful information to
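The insertion loops above lean on a database uniqueness violation to avoid re-recording history: events are tried newest-first, and the first duplicate implies everything older is already stored. A self-contained sketch of that idea using sqlite3 (the table schema is illustrative, not SkyPilot's actual cluster_events schema):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE events (name TEXT, ts INTEGER, message TEXT, '
                 'UNIQUE(name, ts, message))')

    def record_events(events):
        """events is a list of (name, ts, message), sorted newest-first."""
        inserted = 0
        for name, ts, message in events:
            try:
                with conn:  # commits on success, rolls back on error
                    conn.execute('INSERT INTO events VALUES (?, ?, ?)',
                                 (name, ts, message))
            except sqlite3.IntegrityError:
                # Newest-first ordering: the first duplicate means all
                # older events were recorded on a previous pass.
                break
            inserted += 1
        return inserted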
@@ -1390,13 +1435,15 @@ def query_instances(
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    # Mapping from pod phase to skypilot status. These are the only valid pod
+    # phases.
+    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
     status_map = {
         'Pending': status_lib.ClusterStatus.INIT,
         'Running': status_lib.ClusterStatus.UP,
         'Failed': status_lib.ClusterStatus.INIT,
         'Unknown': None,
         'Succeeded': None,
-        'Terminating': None,
     }
 
     assert provider_config is not None
@@ -1440,18 +1487,15 @@ def query_instances(
     for pod in pods:
         phase = pod.status.phase
         pod_status = status_map[phase]
+        reason = None
+        if phase in ('Failed', 'Unknown'):
+            reason = _get_pod_termination_reason(pod, cluster_name)
+            logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
         if non_terminated_only and pod_status is None:
             logger.debug(f'Pod {pod.metadata.name} is terminated, but '
                          'query_instances is called with '
                          f'non_terminated_only=True. Phase: {phase}')
-            if phase == 'Failed':
-                reason_for_debug = _get_pod_termination_reason(pod)
-                logger.debug(f'Termination reason: {reason_for_debug}')
             continue
-        reason = None
-        if phase == 'Failed':
-            reason = _get_pod_termination_reason(pod)
-            logger.debug(f'Pod Status Reason(s): {reason}')
         pod_name = pod.metadata.name
         reason = f'{pod_name}: {reason}' if reason is not None else None
         cluster_status[pod_name] = (pod_status, reason)
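Dropping 'Terminating' here matches the Kubernetes pod lifecycle: Pending, Running, Succeeded, Failed, and Unknown are the only phases a pod reports; "Terminating" is a kubectl display state derived from a deletion timestamp, not a phase. A self-contained sketch of the same mapping with an explicit guard (stand-in names, not SkyPilot's status_lib):

    from enum import Enum
    from typing import Optional

    class ClusterStatus(Enum):  # stand-in for status_lib.ClusterStatus
        INIT = 'INIT'
        UP = 'UP'

    # Pending/Running/Succeeded/Failed/Unknown is the complete phase set.
    STATUS_MAP = {
        'Pending': ClusterStatus.INIT,
        'Running': ClusterStatus.UP,
        'Failed': ClusterStatus.INIT,
        'Unknown': None,
        'Succeeded': None,
    }

    def to_cluster_status(phase: str) -> Optional[ClusterStatus]:
        if phase not in STATUS_MAP:
            raise ValueError(f'Unexpected pod phase: {phase}')
        return STATUS_MAP[phase]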
sky/provision/kubernetes/utils.py CHANGED
@@ -1082,6 +1082,14 @@ class KarpenterAutoscaler(Autoscaler):
     can_query_backend: bool = False
 
 
+class CoreweaveAutoscaler(Autoscaler):
+    """CoreWeave autoscaler
+    """
+
+    label_formatter: Any = CoreWeaveLabelFormatter
+    can_query_backend: bool = False
+
+
 class GenericAutoscaler(Autoscaler):
     """Generic autoscaler
     """
@@ -1094,6 +1102,7 @@ class GenericAutoscaler(Autoscaler):
 AUTOSCALER_TYPE_TO_AUTOSCALER = {
     kubernetes_enums.KubernetesAutoscalerType.GKE: GKEAutoscaler,
     kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.COREWEAVE: CoreweaveAutoscaler,
     kubernetes_enums.KubernetesAutoscalerType.GENERIC: GenericAutoscaler,
 }
 
@@ -2782,7 +2791,7 @@ def combine_pod_config_fields(
                                       kubernetes_config)
 
     # Write the updated YAML back to the file
-    common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
+    yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
 
 
 def combine_metadata_fields(cluster_yaml_path: str,
@@ -2834,7 +2843,7 @@ def combine_metadata_fields(cluster_yaml_path: str,
     config_utils.merge_k8s_configs(destination, custom_metadata)
 
     # Write the updated YAML back to the file
-    common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
+    yaml_utils.dump_yaml(cluster_yaml_path, yaml_obj)
 
 
 def merge_custom_metadata(
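A recurring change across this release moves YAML helpers from common_utils into a dedicated yaml_utils module (note sky/utils/common_utils.py +0 -72 and sky/utils/yaml_utils.py +77 -10 in the file list). The diff does not show yaml_utils itself; a plausible minimal equivalent of the three helpers these hunks call would be:

    import yaml

    def read_yaml(path: str):
        with open(path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

    def dump_yaml(path: str, obj) -> None:
        with open(path, 'w', encoding='utf-8') as f:
            yaml.safe_dump(obj, f)

    def dump_yaml_str(obj) -> str:
        return yaml.safe_dump(obj)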
sky/provision/nebius/utils.py CHANGED
@@ -14,6 +14,8 @@ logger = sky_logging.init_logger(__name__)
 
 POLL_INTERVAL = 5
 
+_MAX_OPERATIONS_TO_FETCH = 1000
+
 
 def retry(func):
     """Decorator to retry a function."""
@@ -321,11 +323,43 @@ def launch(cluster_name_on_cloud: str,
                     parent_id=project_id,
                     name=instance_name,
                 )))
+        instance_id = instance.metadata.id
         if instance.status.state.name == 'STARTING':
-            instance_id = instance.metadata.id
             break
+
+        # All Instances initially have state=STOPPED and reconciling=True,
+        # so we need to wait until reconciling is False.
+        if instance.status.state.name == 'STOPPED' and \
+            not instance.status.reconciling:
+            next_token = ''
+            total_operations = 0
+            while True:
+                operations_response = nebius.sync_call(
+                    service.list_operations_by_parent(
+                        nebius.compute().ListOperationsByParentRequest(
+                            parent_id=project_id,
+                            page_size=100,
+                            page_token=next_token,
+                        )))
+                total_operations += len(operations_response.operations)
+                for operation in operations_response.operations:
+                    # Find the most recent operation for the instance.
+                    if operation.resource_id == instance_id:
+                        error_msg = operation.description
+                        if operation.status:
+                            error_msg += f' {operation.status.message}'
+                        raise RuntimeError(error_msg)
+                # If we've fetched too many operations, or there are no more
+                # operations to fetch, just raise a generic error.
+                if total_operations > _MAX_OPERATIONS_TO_FETCH or \
+                    not operations_response.next_page_token:
+                    raise RuntimeError(
+                        f'Instance {instance_name} failed to start.')
+                next_token = operations_response.next_page_token
         time.sleep(POLL_INTERVAL)
-        logger.debug(f'Waiting for instance {instance_name} start running.')
+        logger.debug(f'Waiting for instance {instance_name} to start running. '
+                     f'State: {instance.status.state.name}, '
+                     f'Reconciling: {instance.status.reconciling}')
         retry_count += 1
 
     if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
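The new failure-diagnosis path is a capped pagination scan: walk the project's operations 100 at a time, surface the first operation that touched the instance as the error message, and give up with a generic error once 1000 operations have been fetched or the pages run out. A self-contained sketch of the shape of that loop, with a fake client standing in for the Nebius SDK (method and field names are illustrative):

    MAX_ITEMS = 1000

    def find_failure_description(client, parent_id, resource_id):
        """Return the description of the newest operation on resource_id,
        or None if not found within the fetch cap."""
        token, fetched = '', 0
        while True:
            page = client.list_operations(parent_id=parent_id,
                                          page_size=100,
                                          page_token=token)
            fetched += len(page.operations)
            for op in page.operations:
                # Pages are assumed newest-first, so the first hit wins.
                if op.resource_id == resource_id:
                    return op.description
            if fetched > MAX_ITEMS or not page.next_page_token:
                return None  # cap reached or no more pages
            token = page.next_page_token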
sky/schemas/db/global_user_state/007_cluster_event_request_id.py ADDED
@@ -0,0 +1,34 @@
+"""Add request_id to cluster_events.
+
+Revision ID: 007
+Revises: 006
+Create Date: 2025-08-28
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '007'
+down_revision: Union[str, Sequence[str], None] = '006'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add request_id column to cluster_events."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('cluster_events',
+                                             'request_id',
+                                             sa.Text(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
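Without SkyPilot's db_utils wrapper (which presumably adds safety around re-running the migration on a table that already has the column), the equivalent raw Alembic operation would look roughly like this:

    import sqlalchemy as sa
    from alembic import op

    def upgrade():
        # Plain Alembic column add; fails if the column already exists.
        op.add_column(
            'cluster_events',
            sa.Column('request_id', sa.Text(), server_default=None))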
sky/serve/client/impl.py CHANGED
@@ -224,10 +224,11 @@ def tail_logs(service_name: str,
                                            stream=True)
     request_id: server_common.RequestId[None] = server_common.get_request_id(
         response)
-    return sdk.stream_response(request_id=request_id,
-                               response=response,
-                               output_stream=output_stream,
-                               resumable=True)
+    sdk.stream_response(request_id=request_id,
+                        response=response,
+                        output_stream=output_stream,
+                        resumable=True,
+                        get_result=follow)
 
 
 def sync_down_logs(service_name: str,
sky/serve/replica_managers.py CHANGED
@@ -37,6 +37,7 @@ from sky.utils import env_options
 from sky.utils import resources_utils
 from sky.utils import status_lib
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 
 if typing.TYPE_CHECKING:
     from sky.serve import service_spec
@@ -79,7 +80,7 @@ def launch_cluster(replica_id: int,
                  f'{cluster_name} with resources override: '
                  f'{resources_override}')
     try:
-        config = common_utils.read_yaml(
+        config = yaml_utils.read_yaml(
             os.path.expanduser(service_task_yaml_path))
         task = task_lib.Task.from_yaml_config(config)
         if resources_override is not None:
@@ -1397,7 +1398,7 @@ class SkyPilotReplicaManager(ReplicaManager):
             # the latest version. This can significantly improve the speed
             # for updating an existing service with only config changes to the
             # service specs, e.g. scale down the service.
-            new_config = common_utils.read_yaml(
+            new_config = yaml_utils.read_yaml(
                 os.path.expanduser(service_task_yaml_path))
             # Always create new replicas and scale down old ones when file_mounts
             # are not empty.
@@ -1414,7 +1415,7 @@ class SkyPilotReplicaManager(ReplicaManager):
             old_service_task_yaml_path = (
                 serve_utils.generate_task_yaml_file_name(
                     self._service_name, info.version))
-            old_config = common_utils.read_yaml(
+            old_config = yaml_utils.read_yaml(
                 os.path.expanduser(old_service_task_yaml_path))
             for key in ['service', 'pool', '_user_specified_yaml']:
                 old_config.pop(key, None)
sky/serve/serve_utils.py CHANGED
@@ -699,7 +699,7 @@ def _get_service_status(
     if record['pool']:
         latest_yaml_path = generate_task_yaml_file_name(service_name,
                                                         record['version'])
-        raw_yaml_config = common_utils.read_yaml(latest_yaml_path)
+        raw_yaml_config = yaml_utils.read_yaml(latest_yaml_path)
         original_config = raw_yaml_config.get('_user_specified_yaml')
         if original_config is None:
             # Fall back to old display format.
@@ -711,7 +711,7 @@ def _get_service_status(
             original_config['pool'] = svc  # Add pool to root config
         else:
             original_config = yaml_utils.safe_load(original_config)
-        record['pool_yaml'] = common_utils.dump_yaml_str(original_config)
+        record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
 
     record['target_num_replicas'] = 0
     try:
sky/serve/server/impl.py CHANGED
@@ -34,6 +34,7 @@ from sky.utils import dag_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -179,7 +180,7 @@ def up(
     controller = controller_utils.get_controller_for_pool(pool)
     controller_name = controller.value.cluster_name
     task_config = task.to_yaml_config()
-    common_utils.dump_yaml(service_file.name, task_config)
+    yaml_utils.dump_yaml(service_file.name, task_config)
     remote_tmp_task_yaml_path = (
         serve_utils.generate_remote_tmp_task_yaml_file_name(service_name))
     remote_config_yaml_path = (
@@ -531,7 +532,7 @@ def update(
             prefix=f'{service_name}-v{current_version}',
             mode='w') as service_file:
         task_config = task.to_yaml_config()
-        common_utils.dump_yaml(service_file.name, task_config)
+        yaml_utils.dump_yaml(service_file.name, task_config)
     remote_task_yaml_path = serve_utils.generate_task_yaml_file_name(
         service_name, current_version, expand_user=False)
 
sky/serve/server/server.py CHANGED
@@ -107,7 +107,8 @@ async def tail_logs(
         request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
     )
 
-    request_task = api_requests.get_request(request.state.request_id)
+    request_task = await api_requests.get_request_async(request.state.request_id
+                                                       )
 
     return stream_utils.stream_response(
         request_id=request_task.request_id,
sky/server/auth/oauth2_proxy.py CHANGED
@@ -4,6 +4,7 @@ import asyncio
 import hashlib
 import http
 import os
+import traceback
 from typing import Optional
 import urllib
 
@@ -109,8 +110,8 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         try:
             return await self._authenticate(request, call_next, session)
         except (aiohttp.ClientError, asyncio.TimeoutError) as e:
-            logger.error(f'Error communicating with OAuth2 proxy: {e}')
-            # Fail open or closed based on your security requirements
+            logger.error(f'Error communicating with OAuth2 proxy: {e}'
+                         f'{traceback.format_exc()}')
             return fastapi.responses.JSONResponse(
                 status_code=http.HTTPStatus.BAD_GATEWAY,
                 content={'detail': 'oauth2-proxy service unavailable'})
@@ -120,10 +121,15 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         forwarded_headers = dict(request.headers)
         auth_url = f'{self.proxy_base}/oauth2/auth'
         forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
-        logger.debug(f'authenticate request: {request.url.path}')
+        # Remove content-length and content-type headers and drop request body
+        # to reduce the auth overhead.
+        forwarded_headers.pop('content-length', None)
+        forwarded_headers.pop('content-type', None)
+        logger.debug(f'authenticate request: {auth_url}, '
+                     f'headers: {forwarded_headers}')
 
         async with session.request(
-                method=request.method,
+                method='GET',
                 url=auth_url,
                 headers=forwarded_headers,
                 cookies=request.cookies,
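This change turns every auth check into a bodyless GET subrequest against oauth2-proxy's /oauth2/auth endpoint, so large uploads no longer have to be replayed just to authenticate. A rough aiohttp sketch of the pattern (error handling elided; oauth2-proxy documents a 202 response for authorized sessions):

    import aiohttp

    async def check_auth(session: aiohttp.ClientSession, proxy_base: str,
                         original_headers: dict, original_url: str,
                         cookies: dict) -> bool:
        headers = dict(original_headers)
        headers['X-Forwarded-Uri'] = original_url.rstrip('/')
        # Drop body-describing headers since no body is forwarded.
        headers.pop('content-length', None)
        headers.pop('content-type', None)
        async with session.request(method='GET',
                                   url=f'{proxy_base}/oauth2/auth',
                                   headers=headers,
                                   cookies=cookies,
                                   allow_redirects=False) as resp:
            return resp.status == 202  # 202 Accepted means authorized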
sky/server/common.py CHANGED
@@ -41,6 +41,7 @@ from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import rich_utils
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 
 if typing.TYPE_CHECKING:
     import aiohttp
@@ -816,7 +817,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
         return str(client_file_mounts_dir /
                    file_mounts_mapping[original_path].lstrip('/'))
 
-    task_configs = common_utils.read_yaml_all(str(client_task_path))
+    task_configs = yaml_utils.read_yaml_all(str(client_task_path))
     for task_config in task_configs:
         if task_config is None:
             continue
@@ -869,7 +870,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
     # We can switch to using string, but this is to make it easier to debug, by
     # persisting the translated task yaml file.
     translated_client_task_path = client_dir / f'{task_id}_translated.yaml'
-    common_utils.dump_yaml(str(translated_client_task_path), task_configs)
+    yaml_utils.dump_yaml(str(translated_client_task_path), task_configs)
 
     dag = dag_utils.load_chain_dag_from_yaml(str(translated_client_task_path))
     return dag
@@ -910,8 +911,7 @@ def reload_for_new_request(client_entrypoint: Optional[str],
 
     # Clear cache should be called before reload_logger and usage reset,
     # otherwise, the latest env var will not be used.
-    for func in annotations.FUNCTIONS_NEED_RELOAD_CACHE:
-        func.cache_clear()
+    annotations.clear_request_level_cache()
 
     # We need to reset usage message, so that the message is up-to-date with the
     # latest information in the context, e.g. client entrypoint and run id.
sky/server/daemons.py CHANGED
@@ -7,8 +7,10 @@ from typing import Callable
 from sky import sky_logging
 from sky import skypilot_config
 from sky.server import constants as server_constants
+from sky.utils import annotations
 from sky.utils import common
 from sky.utils import env_options
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 logger = sky_logging.init_logger(__name__)
@@ -67,6 +69,10 @@ class InternalRequestDaemon:
                 sky_logging.reload_logger()
                 level = self.refresh_log_level()
                 self.event_fn()
+                # Clear request level cache after each run to avoid
+                # using too much memory.
+                annotations.clear_request_level_cache()
+                timeline.save_timeline()
             except Exception:  # pylint: disable=broad-except
                 # It is OK to fail to run the event, as the event is not
                 # critical, but we should log the error.
@@ -191,23 +197,28 @@ INTERNAL_REQUEST_DAEMONS = [
     # set to updated status automatically, without showing users the hint of
     # cluster being stopped or down when `sky status -r` is called.
     InternalRequestDaemon(id='skypilot-status-refresh-daemon',
-                          name='status',
+                          name='status-refresh',
                           event_fn=refresh_cluster_status_event,
                           default_log_level='DEBUG'),
     # Volume status refresh daemon to update the volume status periodically.
     InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
-                          name='volume',
+                          name='volume-refresh',
                           event_fn=refresh_volume_status_event),
     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                          name='managed-job-status',
+                          name='managed-job-status-refresh',
                           event_fn=managed_job_status_refresh_event,
                           should_skip=should_skip_managed_job_status_refresh),
     InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
-                          name='sky-serve-status',
+                          name='sky-serve-status-refresh',
                           event_fn=sky_serve_status_refresh_event,
                           should_skip=should_skip_sky_serve_status_refresh),
     InternalRequestDaemon(id='pool-status-refresh-daemon',
-                          name='pool-status',
+                          name='pool-status-refresh',
                           event_fn=pool_status_refresh_event,
                           should_skip=should_skip_pool_status_refresh),
 ]
+
+
+def is_daemon_request_id(request_id: str) -> bool:
+    """Returns whether a specific request_id is an internal daemon."""
+    return any([d.id == request_id for d in INTERNAL_REQUEST_DAEMONS])
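A self-contained equivalent of the new helper; a generator expression would also work and avoids materializing the intermediate list that any([...]) builds:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Daemon:  # stand-in for InternalRequestDaemon
        id: str

    DAEMONS: List[Daemon] = [
        Daemon('skypilot-status-refresh-daemon'),
        Daemon('skypilot-volume-status-refresh-daemon'),
    ]

    def is_daemon_request_id(request_id: str) -> bool:
        # Simple membership test over the fixed daemon registry.
        return any(d.id == request_id for d in DAEMONS)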
sky/server/requests/executor.py CHANGED
@@ -55,6 +55,7 @@ from sky.utils import context_utils
 from sky.utils import subprocess_utils
 from sky.utils import tempstore
 from sky.utils import timeline
+from sky.utils import yaml_utils
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
@@ -382,12 +383,13 @@ def _request_execution_wrapper(request_id: str,
     # config, as there can be some logs during override that needs to be
     # captured in the log file.
     try:
-        with override_request_env_and_config(request_body, request_id), \
+        with sky_logging.add_debug_log_handler(request_id), \
+                override_request_env_and_config(request_body, request_id), \
                 tempstore.tempdir():
             if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
                 config = skypilot_config.to_dict()
                 logger.debug(f'request config: \n'
-                             f'{common_utils.dump_yaml_str(dict(config))}')
+                             f'{yaml_utils.dump_yaml_str(dict(config))}')
             return_value = func(**request_body.to_kwargs())
             f.flush()
     except KeyboardInterrupt:
@@ -451,7 +453,7 @@ async def execute_request_coroutine(request: api_requests.Request):
                                         **request_body.to_kwargs())
 
     async def poll_task(request_id: str) -> bool:
-        request = api_requests.get_request(request_id)
+        request = await api_requests.get_request_async(request_id)
         if request is None:
             raise RuntimeError('Request not found')
 
sky/server/requests/payloads.py CHANGED
@@ -71,7 +71,9 @@ EXTERNAL_LOCAL_ENV_VARS = [
 def request_body_env_vars() -> dict:
     env_vars = {}
     for env_var in os.environ:
-        if env_var.startswith(constants.SKYPILOT_ENV_VAR_PREFIX):
+        if (env_var.startswith(constants.SKYPILOT_ENV_VAR_PREFIX) and
+                not env_var.startswith(
+                    constants.SKYPILOT_SERVER_ENV_VAR_PREFIX)):
             env_vars[env_var] = os.environ[env_var]
         if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
             env_vars[env_var] = os.environ[env_var]
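Standalone sketch of the filtering rule above: forward SKYPILOT_* variables to the request body while holding back server-internal ones. The literal prefix strings are assumptions inferred from the constant names, not verified against the source:

    import os

    CLIENT_PREFIX = 'SKYPILOT_'          # assumed SKYPILOT_ENV_VAR_PREFIX
    SERVER_PREFIX = 'SKYPILOT_SERVER_'   # assumed SKYPILOT_SERVER_ENV_VAR_PREFIX

    def request_env_vars() -> dict:
        # Server-internal variables match both prefixes, so the negative
        # check must come after the positive one.
        return {
            k: v for k, v in os.environ.items()
            if k.startswith(CLIENT_PREFIX) and not k.startswith(SERVER_PREFIX)
        }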
sky/server/requests/preconditions.py CHANGED
@@ -98,7 +98,7 @@ class Precondition(abc.ABC):
             return False
 
         # Check if the request has been cancelled
-        request = api_requests.get_request(self.request_id)
+        request = await api_requests.get_request_async(self.request_id)
         if request is None:
             logger.error(f'Request {self.request_id} not found')
             return False
@@ -112,7 +112,8 @@ class Precondition(abc.ABC):
             return True
         if status_msg is not None and status_msg != last_status_msg:
             # Update the status message if it has changed.
-            with api_requests.update_request(self.request_id) as req:
+            async with api_requests.update_request_async(
+                    self.request_id) as req:
                 assert req is not None, self.request_id
                 req.status_msg = status_msg
                 last_status_msg = status_msg
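The async variants adopted in this hunk (and in server.py and executor.py above) keep the same read-modify-write shape as their sync counterparts. A minimal sketch of what an update_request_async-style helper plausibly looks like; the DB calls here are hypothetical, not SkyPilot's actual API:

    import contextlib

    @contextlib.asynccontextmanager
    async def update_record(db, record_id):
        # Load the row, hand it to the caller for mutation, then persist
        # whatever state it holds on exit.
        record = await db.get(record_id)  # hypothetical async DB helper
        try:
            yield record
        finally:
            if record is not None:
                await db.save(record)     # hypothetical async DB helper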