skypilot-nightly 1.0.0.dev20251013__py3-none-any.whl → 1.0.0.dev20251014__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/authentication.py +9 -2
- sky/backends/backend_utils.py +33 -25
- sky/backends/cloud_vm_ray_backend.py +3 -5
- sky/catalog/kubernetes_catalog.py +19 -25
- sky/client/cli/command.py +53 -19
- sky/client/sdk.py +13 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-ac3a34c8f9fef041.js → webpack-66f23594d38c7f16.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +122 -145
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +5 -5
- sky/jobs/state.py +65 -21
- sky/jobs/utils.py +58 -22
- sky/metrics/utils.py +27 -6
- sky/provision/kubernetes/utils.py +44 -39
- sky/server/common.py +4 -2
- sky/server/requests/executor.py +3 -1
- sky/server/server.py +5 -0
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +22 -5
- sky/skylet/log_lib.py +0 -1
- sky/skylet/log_lib.pyi +1 -1
- sky/utils/common.py +2 -0
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +2 -2
- sky/utils/controller_utils.py +35 -8
- sky/utils/locks.py +20 -5
- sky/utils/subprocess_utils.py +4 -3
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/RECORD +52 -52
- /sky/dashboard/out/_next/static/{MtlDUf-nH1hhcy7xwbCj3 → 9Fek73R28lDp1A5J4N7g7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{MtlDUf-nH1hhcy7xwbCj3 → 9Fek73R28lDp1A5J4N7g7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -8,7 +8,6 @@ import asyncio
 import collections
 import datetime
 import enum
-import logging
 import os
 import pathlib
 import re
@@ -84,6 +83,7 @@ _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
 _JOB_STATUS_FETCH_MAX_RETRIES = 3
 _JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
+_JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
 
 _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
     'Waiting for task to start[/]'
@@ -101,6 +101,13 @@ _JOB_CANCELLED_MESSAGE = (
 # update the state.
 _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
 
+# After enabling consolidation mode, we need to restart the API server to get
+# the jobs refresh deamon and correct number of executors. We use this file to
+# indicate that the API server has been restarted after enabling consolidation
+# mode.
+_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
+    '~/.sky/.jobs_controller_consolidation_reloaded_signal')
+
 
 class ManagedJobQueueResultType(enum.Enum):
     """The type of the managed job queue result."""
@@ -117,9 +124,8 @@ class UserSignal(enum.Enum):
 
 # ====== internal functions ======
 def terminate_cluster(
-    _logger: logging.Logger = logger,  # pylint: disable=invalid-name
+    cluster_name: str,
+    max_retry: int = 6,
 ) -> None:
     """Terminate the cluster."""
     from sky import core  # pylint: disable=import-outside-toplevel
@@ -143,18 +149,18 @@ def terminate_cluster(
             return
         except exceptions.ClusterDoesNotExist:
             # The cluster is already down.
+            logger.debug(f'The cluster {cluster_name} is already down.')
             return
         except Exception as e:  # pylint: disable=broad-except
             retry_cnt += 1
             if retry_cnt >= max_retry:
                 raise RuntimeError(
                     f'Failed to terminate the cluster {cluster_name}.') from e
+            logger.error(
                 f'Failed to terminate the cluster {cluster_name}. Retrying.'
                 f'Details: {common_utils.format_exception(e)}')
             with ux_utils.enable_traceback():
+                logger.error(f' Traceback: {traceback.format_exc()}')
             time.sleep(backoff.current_backoff())
@@ -202,13 +208,39 @@ def _validate_consolidation_mode_config(
 # API Server. Under the hood, we submit the job monitoring logic as processes
 # directly in the API Server.
 # Use LRU Cache so that the check is only done once.
-@annotations.lru_cache(scope='request', maxsize=
-def is_consolidation_mode() -> bool:
+@annotations.lru_cache(scope='request', maxsize=2)
+def is_consolidation_mode(on_api_restart: bool = False) -> bool:
     if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
         return True
 
+    config_consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+
+    signal_file = pathlib.Path(
+        _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()
+
+    restart_signal_file_exists = signal_file.exists()
+    consolidation_mode = (config_consolidation_mode and
+                          restart_signal_file_exists)
+
+    if on_api_restart:
+        if config_consolidation_mode:
+            signal_file.touch()
+    else:
+        if not restart_signal_file_exists:
+            if config_consolidation_mode:
+                logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
+                               'managed jobs is enabled in the server config, '
+                               'but the API server has not been restarted yet. '
+                               'Please restart the API server to enable it.'
+                               f'{colorama.Style.RESET_ALL}')
+            return False
+        elif not config_consolidation_mode:
+            # Cleanup the signal file if the consolidation mode is disabled in
+            # the config. This allow the user to disable the consolidation mode
+            # without restarting the API server.
+            signal_file.unlink()
+
     # We should only do this check on API server, as the controller will not
     # have related config and will always seemingly disabled for consolidation
     # mode. Check #6611 for more details.
@@ -269,8 +301,7 @@ def ha_recovery_for_consolidation_mode():
 
 async def get_job_status(
         backend: 'backends.CloudVmRayBackend', cluster_name: str,
-        job_id: Optional[int]
-        job_logger: logging.Logger) -> Optional['job_lib.JobStatus']:
+        job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
 
     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
@@ -282,26 +313,28 @@ async def get_job_status(
     if handle is None:
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
+        logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
     for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
         try:
-            statuses = await
+            logger.info('=== Checking the job status... ===')
+            statuses = await asyncio.wait_for(
+                context_utils.to_thread(backend.get_job_status,
+                                        handle,
+                                        job_ids=job_ids,
+                                        stream_logs=False),
+                timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
             status = list(statuses.values())[0]
             if status is None:
+                logger.info('No job found.')
             else:
+                logger.info(f'Job status: {status}')
+                logger.info('=' * 34)
             return status
         except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
-                ValueError, TypeError) as e:
+                ValueError, TypeError, asyncio.TimeoutError) as e:
            # Note: Each of these exceptions has some additional conditions to
            # limit how we handle it and whether or not we catch it.
            # Retry on k8s transient network errors. This is useful when using
@@ -322,6 +355,9 @@ async def get_job_status(
                 is_transient_error = True
             elif isinstance(e, grpc.FutureTimeoutError):
                 detailed_reason = 'Timeout'
+            elif isinstance(e, asyncio.TimeoutError):
+                detailed_reason = ('Job status check timed out after '
+                                   f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
             # TODO(cooperc): Gracefully handle these exceptions in the backend.
             elif isinstance(e, ValueError):
                 # If the cluster yaml is deleted in the middle of getting the
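Note: the new _JOB_STATUS_FETCH_TIMEOUT_SECONDS bound above works by running the blocking status fetch in a worker thread and racing it against a timeout, with asyncio.TimeoutError treated like other transient errors. A minimal sketch of the same pattern using only the standard library (the package itself goes through its own context_utils.to_thread helper; the function names and the 30-second value below are illustrative):

import asyncio
import time
from typing import Optional

STATUS_FETCH_TIMEOUT_SECONDS = 30  # mirrors _JOB_STATUS_FETCH_TIMEOUT_SECONDS

def fetch_status_blocking() -> str:
    """Placeholder for a blocking backend status call."""
    time.sleep(1)
    return 'RUNNING'

async def get_status_with_timeout() -> Optional[str]:
    try:
        # Run the blocking call in a worker thread and bound how long we wait.
        return await asyncio.wait_for(asyncio.to_thread(fetch_status_blocking),
                                      timeout=STATUS_FETCH_TIMEOUT_SECONDS)
    except asyncio.TimeoutError:
        # Surfaced to the caller's retry loop like other transient errors.
        return None

if __name__ == '__main__':
    print(asyncio.run(get_status_with_timeout()))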
sky/metrics/utils.py
CHANGED
@@ -48,8 +48,15 @@ SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
     'sky_apiserver_code_duration_seconds',
     'Time spent processing code',
     ['name', 'group'],
-    buckets=(0.
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
             960.0, 980.0, 1000.0, float('inf')),
 )
 
 # Total number of API server requests, grouped by path, method, and status.
@@ -65,16 +72,30 @@ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
     'sky_apiserver_request_duration_seconds',
     'Time spent processing API server requests',
     ['path', 'method', 'status'],
-    buckets=(0.
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
             960.0, 980.0, 1000.0, float('inf')),
 )
 
 SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
     'sky_apiserver_event_loop_lag_seconds',
     'Scheduling delay of the server event loop',
     ['pid'],
-    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
             960.0, 980.0, 1000.0, float('inf')),
 )
 
 SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
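Note: explicit buckets like the widened lists above are passed straight to prometheus_client's Histogram constructor, and each observation falls into the matching cumulative bucket. A small stand-alone sketch of the same mechanism (the metric name and coarse buckets here are illustrative, not the package's):

from prometheus_client import Histogram, generate_latest

# Coarse example buckets; the real metrics above use a much finer list
# ranging from 1 ms up to 1000 s plus +Inf.
REQUEST_SECONDS = Histogram(
    'example_request_duration_seconds',
    'Time spent processing requests',
    ['path', 'method'],
    buckets=(0.01, 0.1, 1, 10, float('inf')),
)

REQUEST_SECONDS.labels(path='/api/v1/jobs', method='GET').observe(0.42)
print(generate_latest().decode())  # exposition-format dump of the histogram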
sky/provision/kubernetes/utils.py
CHANGED
@@ -1299,30 +1299,52 @@ class V1Pod:
 
 
 @_retry_on_error(resource_type='pod')
-def 
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
     """
     if context is None:
         context = get_current_kube_config_context_name()
+    non_included_pod_statuses = POD_STATUSES.copy()
+    status_filters = ['Running', 'Pending']
+    if status_filters is not None:
+        non_included_pod_statuses -= set(status_filters)
+    field_selector = ','.join(
+        [f'status.phase!={status}' for status in non_included_pod_statuses])
 
     # Return raw urllib3.HTTPResponse object so that we can parse the json
     # more efficiently.
     response = kubernetes.core_api(context).list_pod_for_all_namespaces(
-        _request_timeout=kubernetes.API_TIMEOUT,
+        _request_timeout=kubernetes.API_TIMEOUT,
+        _preload_content=False,
+        field_selector=field_selector)
     try:
+        allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        for item_dict in ijson.items(response,
+                                     'items.item',
+                                     buf_size=IJSON_BUFFER_SIZE):
+            pod = V1Pod.from_dict(item_dict)
+            if should_exclude_pod_from_gpu_allocation(pod):
+                logger.debug(
+                    f'Excluding pod {pod.metadata.name} from GPU count '
+                    f'calculations on node {pod.spec.node_name}')
+                continue
+            # Iterate over all the containers in the pod and sum the
+            # GPU requests
+            pod_allocated_qty = 0
+            for container in pod.spec.containers:
+                if container.resources.requests:
+                    pod_allocated_qty += get_node_accelerator_count(
+                        context, container.resources.requests)
+            if pod_allocated_qty > 0 and pod.spec.node_name:
+                allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
+        return allocated_qty_by_node
     finally:
         response.release_conn()
 
-    return pods
-
 
 def check_instance_fits(context: Optional[str],
                         instance: str) -> Tuple[bool, Optional[str]]:
@@ -3006,41 +3028,24 @@ def get_kubernetes_node_info(
     label_keys = lf.get_label_keys()
 
     # Check if all nodes have no accelerators to avoid fetching pods
+    has_accelerator_nodes = False
     for node in nodes:
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
         if accelerator_count > 0:
+            has_accelerator_nodes = True
             break
 
-    # Get the
-    pods = None
+    # Get the allocated GPU quantity by each node
     allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+    error_on_get_allocated_gpu_qty_by_node = False
+    if has_accelerator_nodes:
         try:
-            for pod in pods:
-                if pod.status.phase in ['Running', 'Pending']:
-                    # Skip pods that should not count against GPU count
-                    if should_exclude_pod_from_gpu_allocation(pod):
-                        logger.debug(f'Excluding low priority pod '
-                                     f'{pod.metadata.name} from GPU allocation '
-                                     f'calculations')
-                        continue
-                    # Iterate over all the containers in the pod and sum the
-                    # GPU requests
-                    pod_allocated_qty = 0
-                    for container in pod.spec.containers:
-                        if container.resources.requests:
-                            pod_allocated_qty += get_node_accelerator_count(
-                                context, container.resources.requests)
-                    if pod_allocated_qty > 0:
-                        allocated_qty_by_node[
-                            pod.spec.node_name] += pod_allocated_qty
+            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+                context=context)
         except kubernetes.api_exception() as e:
             if e.status == 403:
+                error_on_get_allocated_gpu_qty_by_node = True
                 pass
             else:
                 raise
@@ -3085,7 +3090,7 @@ def get_kubernetes_node_info(
                 ip_address=node_ip)
             continue
 
-        if
+        if not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
            accelerators_available = -1
        else:
            allocated_qty = allocated_qty_by_node[node.metadata.name]
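Note: the refactor above moves the per-node aggregation into get_allocated_gpu_qty_by_node and filters pods server-side with a field selector on status.phase; the core of it is a defaultdict sum over container resource requests. A simplified sketch of that accumulation over plain pod dictionaries (the 'nvidia.com/gpu' key and the sample data are illustrative, not taken from the package):

import collections
from typing import Dict, List

GPU_RESOURCE_KEY = 'nvidia.com/gpu'  # assumed resource name for this sketch

def allocated_gpus_by_node(pods: List[dict]) -> Dict[str, int]:
    """Sum GPU requests of Running/Pending pods, grouped by node name."""
    allocated: Dict[str, int] = collections.defaultdict(int)
    for pod in pods:
        if pod['status']['phase'] not in ('Running', 'Pending'):
            continue
        qty = sum(
            int(c.get('resources', {}).get('requests', {}).get(GPU_RESOURCE_KEY, 0))
            for c in pod['spec']['containers'])
        if qty > 0 and pod['spec'].get('nodeName'):
            allocated[pod['spec']['nodeName']] += qty
    return allocated

print(allocated_gpus_by_node([{
    'status': {'phase': 'Running'},
    'spec': {'nodeName': 'node-1',
             'containers': [{'resources': {'requests': {GPU_RESOURCE_KEY: '2'}}}]},
}]))  # -> {'node-1': 2}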
sky/server/common.py
CHANGED
@@ -554,8 +554,8 @@ def _start_api_server(deploy: bool = False,
     # pylint: disable=import-outside-toplevel
     import sky.jobs.utils as job_utils
     max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
-                  if job_utils.is_consolidation_mode()
-                  server_constants.MIN_AVAIL_MEM_GB)
+                  if job_utils.is_consolidation_mode(on_api_restart=True)
+                  else server_constants.MIN_AVAIL_MEM_GB)
     if avail_mem_size_gb <= max_memory:
         logger.warning(
             f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
@@ -571,6 +571,8 @@ def _start_api_server(deploy: bool = False,
     args += [f'--host={host}']
     if metrics_port is not None:
         args += [f'--metrics-port={metrics_port}']
+    # Use this argument to disable the internal signal file check.
+    args += ['--start-with-python']
 
     if foreground:
         # Replaces the current process with the API server
sky/server/requests/executor.py
CHANGED
@@ -424,6 +424,7 @@ def _request_execution_wrapper(request_id: str,
         os.close(original_stderr)
         original_stderr = None
 
+    request_name = None
     try:
         # As soon as the request is updated with the executor PID, we can
         # receive SIGTERM from cancellation. So, we update the request inside
@@ -515,7 +516,8 @@ def _request_execution_wrapper(request_id: str,
             annotations.clear_request_level_cache()
             with metrics_utils.time_it(name='release_memory', group='internal'):
                 common_utils.release_memory()
+            if request_name is not None:
+                _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
         except Exception as e:  # pylint: disable=broad-except
             logger.error(f'Failed to record memory metrics: '
                          f'{common_utils.format_exception(e)}')
sky/server/server.py
CHANGED
@@ -1968,6 +1968,7 @@ if __name__ == '__main__':
     # Serve metrics on a separate port to isolate it from the application APIs:
     # metrics port will not be exposed to the public network typically.
     parser.add_argument('--metrics-port', default=9090, type=int)
+    parser.add_argument('--start-with-python', action='store_true')
     cmd_args = parser.parse_args()
     if cmd_args.port == cmd_args.metrics_port:
         logger.error('port and metrics-port cannot be the same, exiting.')
@@ -1982,6 +1983,10 @@ if __name__ == '__main__':
         logger.error(f'Port {cmd_args.port} is not available, exiting.')
         raise RuntimeError(f'Port {cmd_args.port} is not available')
 
+    if not cmd_args.start_with_python:
+        # Maybe touch the signal file on API server startup.
+        managed_job_utils.is_consolidation_mode(on_api_restart=True)
+
     # Show the privacy policy if it is not already shown. We place it here so
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
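Note: taken together with the sky/jobs/utils.py change, the --start-with-python flag decides who touches the consolidation-mode signal file: a launch driven through the Python entry point skips the check (the caller already did it), while a plain server start records that a restart happened. A rough sketch of that signal-file handshake with pathlib (the path, function name, and simplified return logic below are placeholders, not the package's exact behavior):

import pathlib

SIGNAL_FILE = pathlib.Path('~/.sky/.example_consolidation_reloaded').expanduser()

def consolidation_enabled(config_enabled: bool, on_api_restart: bool = False) -> bool:
    """Enable consolidation only after the server restarted with it configured."""
    if on_api_restart:
        if config_enabled:
            SIGNAL_FILE.parent.mkdir(parents=True, exist_ok=True)
            SIGNAL_FILE.touch()          # remember that a restart happened
    elif not config_enabled and SIGNAL_FILE.exists():
        SIGNAL_FILE.unlink()             # config turned off: clear the marker
    return config_enabled and SIGNAL_FILE.exists()

# On server startup (restart path):
consolidation_enabled(config_enabled=True, on_api_restart=True)
# Later checks see it enabled:
print(consolidation_enabled(config_enabled=True))  # True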
sky/sky_logging.py
CHANGED
@@ -109,7 +109,6 @@ def _setup_logger():
     global _default_handler
     if _default_handler is None:
         _default_handler = EnvAwareHandler(sys.stdout)
-        _default_handler.flush = sys.stdout.flush  # type: ignore
         if env_options.Options.SHOW_DEBUG_INFO.get():
             _default_handler.setLevel(logging.DEBUG)
         else:
@@ -129,7 +128,6 @@ def _setup_logger():
     for logger_name in _SENSITIVE_LOGGER:
         logger = logging.getLogger(logger_name)
         handler_to_logger = EnvAwareHandler(sys.stdout, sensitive=True)
-        handler_to_logger.flush = sys.stdout.flush  # type: ignore
         logger.addHandler(handler_to_logger)
         logger.setLevel(logging.INFO)
         if _show_logging_prefix():
sky/skylet/constants.py
CHANGED
@@ -226,7 +226,9 @@ RAY_INSTALLATION_COMMANDS = (
     f'{SKY_UV_PIP_CMD} list | grep "ray " | '
     f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
     f'|| {RAY_STATUS} || '
+    # The pydantic-core==2.41.3 for arm seems corrupted
+    # so we need to avoid that specific version.
+    f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; '  # pylint: disable=line-too-long
     # In some envs, e.g. pip does not have permission to write under /opt/conda
     # ray package will be installed under ~/.local/bin. If the user's PATH does
     # not include ~/.local/bin (the pip install will have the output: `WARNING:
@@ -402,10 +404,25 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
 ]
 # When overriding the SkyPilot configs on the API server with the client one,
 # we skip the following keys because they are meant to be client-side configs.
+# Also, we skip the consolidation mode config as those should be only set on
+# the API server side.
+SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
+    ('api_server',),
+    ('allowed_clouds',),
+    ('workspaces',),
+    ('db',),
+    ('daemons',),
+    # TODO(kevin,tian): Override the whole controller config once our test
+    # infrastructure supports setting dynamic server side configs.
+    # Tests that are affected:
+    # - test_managed_jobs_ha_kill_starting
+    # - test_managed_jobs_ha_kill_running
+    # - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
+    #   LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
+    #   but the configs won't be applied)
+    ('jobs', 'controller', 'consolidation_mode'),
+    ('serve', 'controller', 'consolidation_mode'),
+]
 
 # Constants for Azure blob storage
 WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
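Note: SKIPPED_CLIENT_OVERRIDE_KEYS lists nested config paths (as key tuples) that a client-supplied config must not override on the server. A minimal illustration of filtering such key tuples out of a nested dict before merging (the helper name and sample data are hypothetical, not the package's merge logic):

import copy
from typing import Dict, List, Tuple

SKIPPED_KEYS: List[Tuple[str, ...]] = [
    ('api_server',),
    ('jobs', 'controller', 'consolidation_mode'),
]

def drop_skipped_keys(config: Dict, skipped: List[Tuple[str, ...]]) -> Dict:
    """Return a copy of the nested config without the skipped key paths."""
    result = copy.deepcopy(config)
    for path in skipped:
        node = result
        for key in path[:-1]:
            node = node.get(key, {})
            if not isinstance(node, dict):
                break
        else:
            node.pop(path[-1], None)
    return result

client_config = {'api_server': {'endpoint': 'http://x'},
                 'jobs': {'controller': {'consolidation_mode': True,
                                         'resources': {'cpus': 2}}}}
print(drop_skipped_keys(client_config, SKIPPED_KEYS))
# -> {'jobs': {'controller': {'resources': {'cpus': 2}}}}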
sky/skylet/log_lib.py
CHANGED
@@ -271,7 +271,6 @@ def run_with_log(
         stdout, stderr = context_utils.pipe_and_wait_process(
             ctx,
             proc,
-            cancel_callback=subprocess_utils.kill_children_processes,
             stdout_stream_handler=stdout_stream_handler,
             stderr_stream_handler=stderr_stream_handler)
     elif process_stream:
sky/skylet/log_lib.pyi
CHANGED
sky/utils/common.py
CHANGED
sky/utils/context.py
CHANGED
@@ -5,13 +5,12 @@ from collections.abc import Mapping
 import contextvars
 import copy
 import functools
-import inspect
 import os
 import pathlib
 import subprocess
 import sys
-from typing import (Callable, Dict, Iterator, MutableMapping,
-                    TYPE_CHECKING, TypeVar)
+from typing import (Any, Callable, Coroutine, Dict, Iterator, MutableMapping,
+                    Optional, TextIO, TYPE_CHECKING, TypeVar)
 
 from typing_extensions import ParamSpec
 
@@ -19,7 +18,7 @@ if TYPE_CHECKING:
     from sky.skypilot_config import ConfigContext
 
 
-class 
+class SkyPilotContext(object):
     """SkyPilot typed context vars for threads and coroutines.
 
     This is a wrapper around `contextvars.ContextVar` that provides a typed
@@ -114,7 +113,14 @@ class Context(object):
         self._log_file_handle.close()
         self._log_file_handle = None
 
-    def 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        del exc_type, exc_val, exc_tb
+        self.cleanup()
+
+    def copy(self) -> 'SkyPilotContext':
         """Create a copy of the context.
 
         Changes to the current context after this call will not affect the copy.
@@ -123,18 +129,18 @@ class Context(object):
         The new context will get an independent copy of the config context.
         Cancellation of the current context will not be propagated to the copy.
         """
-        new_context = 
+        new_context = SkyPilotContext()
         new_context.redirect_log(self._log_file)
         new_context.env_overrides = self.env_overrides.copy()
         new_context.config_context = copy.deepcopy(self.config_context)
         return new_context
 
 
-_CONTEXT = contextvars.ContextVar[Optional[
+_CONTEXT = contextvars.ContextVar[Optional[SkyPilotContext]]('sky_context',
                                                              default=None)
 
 
-def get() -> Optional[
+def get() -> Optional[SkyPilotContext]:
     """Get the current SkyPilot context.
 
     If the context is not initialized, get() will return None. This helps
@@ -200,7 +206,7 @@ class ContextualEnviron(MutableMapping[str, str]):
 
     def __iter__(self) -> Iterator[str]:
 
-        def iter_from_context(ctx: 
+        def iter_from_context(ctx: SkyPilotContext) -> Iterator[str]:
            deleted_keys = set()
            for key, value in ctx.env_overrides.items():
                if value is None:
@@ -311,56 +317,56 @@ def contextual(func: Callable[P, T]) -> Callable[P, T]:
     context that inherits the values from the existing context.
     """
 
+    def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
+        # Within the new contextvars Context, set up the SkyPilotContext.
+        original_ctx = get()
+        with initialize(original_ctx):
+            return func(*args, **kwargs)
+
     @functools.wraps(func)
     def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
+        # Create a copy of the current contextvars Context so that setting the
+        # SkyPilotContext does not affect the caller's context in async
+        # environments.
+        context = contextvars.copy_context()
+        return context.run(run_in_context, *args, **kwargs)
+
+    return wrapper
+
+
+def contextual_async(
+    func: Callable[P, Coroutine[Any, Any, T]]
+) -> Callable[P, Coroutine[Any, Any, T]]:
+    """Decorator to initialize a context before executing the function.
+
+    If a context is already initialized, this decorator will create a new
+    context that inherits the values from the existing context.
+    """
+
+    async def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
+        # Within the new contextvars Context, set up the SkyPilotContext.
         original_ctx = get()
-        initialize(original_ctx)
-        # ValueError: <Token ... at ...> was created in a different
-        # Context
-        # We must make sure this happens because otherwise we may try to
-        # write to the wrong log.
-        _CONTEXT.set(original_ctx)
-
-        # There are two cases:
-        # 1. The function is synchronous (that is, return type is not awaitable)
-        #    In this case, we use a finally block to cleanup the context.
-        # 2. The function is asynchronous (that is, return type is awaitable)
-        #    In this case, we need to construct an async def wrapper and await
-        #    the value, then call the cleanup function in the finally block.
-
-        async def await_with_cleanup(awaitable):
-            try:
-                return await awaitable
-            finally:
-                cleanup()
-
-        try:
-            ret = func(*args, **kwargs)
-            if inspect.isawaitable(ret):
-                cleanup_after_await = True
-                return await_with_cleanup(ret)
-            else:
-                return ret
-        finally:
-            if not cleanup_after_await:
-                cleanup()
+        with initialize(original_ctx):
+            return await func(*args, **kwargs)
+
+    @functools.wraps(func)
+    async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
+        # Create a copy of the current contextvars Context so that setting the
+        # SkyPilotContext does not affect the caller's context in async
+        # environments.
+        context = contextvars.copy_context()
+        return await context.run(run_in_context, *args, **kwargs)
 
     return wrapper
 
 
-def initialize(
+def initialize(
+        base_context: Optional[SkyPilotContext] = None) -> SkyPilotContext:
     """Initialize the current SkyPilot context."""
-    new_context = base_context.copy(
+    new_context = base_context.copy(
+    ) if base_context is not None else SkyPilotContext()
     _CONTEXT.set(new_context)
+    return new_context
 
 
 class _ContextualStream:
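Note: the rewritten decorators lean on contextvars.copy_context(): running the wrapped function inside a copied Context means any ContextVar it sets (including the SkyPilot context) is invisible to the caller. A tiny standalone demonstration of that isolation property, independent of SkyPilot's classes:

import contextvars

_VAR: contextvars.ContextVar[str] = contextvars.ContextVar('var', default='outer')

def run_isolated() -> str:
    # Mutations made here only affect the copied Context we run in.
    _VAR.set('inner')
    return _VAR.get()

ctx = contextvars.copy_context()
print(ctx.run(run_isolated))  # 'inner' inside the copy
print(_VAR.get())             # still 'outer' in the caller's context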
|