skypilot-nightly 1.0.0.dev20251013__py3-none-any.whl → 1.0.0.dev20251015__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic according to the registry's scanner.

Files changed (57)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +9 -2
  3. sky/backends/backend_utils.py +62 -40
  4. sky/backends/cloud_vm_ray_backend.py +8 -6
  5. sky/catalog/kubernetes_catalog.py +19 -25
  6. sky/client/cli/command.py +53 -19
  7. sky/client/sdk.py +13 -1
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  10. sky/dashboard/out/clusters/[cluster].html +1 -1
  11. sky/dashboard/out/clusters.html +1 -1
  12. sky/dashboard/out/config.html +1 -1
  13. sky/dashboard/out/index.html +1 -1
  14. sky/dashboard/out/infra/[context].html +1 -1
  15. sky/dashboard/out/infra.html +1 -1
  16. sky/dashboard/out/jobs/[job].html +1 -1
  17. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  18. sky/dashboard/out/jobs.html +1 -1
  19. sky/dashboard/out/users.html +1 -1
  20. sky/dashboard/out/volumes.html +1 -1
  21. sky/dashboard/out/workspace/new.html +1 -1
  22. sky/dashboard/out/workspaces/[name].html +1 -1
  23. sky/dashboard/out/workspaces.html +1 -1
  24. sky/jobs/controller.py +122 -145
  25. sky/jobs/recovery_strategy.py +59 -82
  26. sky/jobs/scheduler.py +5 -5
  27. sky/jobs/state.py +65 -21
  28. sky/jobs/utils.py +58 -22
  29. sky/metrics/utils.py +27 -6
  30. sky/provision/common.py +2 -0
  31. sky/provision/instance_setup.py +10 -2
  32. sky/provision/kubernetes/instance.py +34 -10
  33. sky/provision/kubernetes/utils.py +53 -39
  34. sky/server/common.py +4 -2
  35. sky/server/requests/executor.py +3 -1
  36. sky/server/requests/preconditions.py +2 -4
  37. sky/server/requests/requests.py +13 -23
  38. sky/server/server.py +5 -0
  39. sky/sky_logging.py +0 -2
  40. sky/skylet/constants.py +22 -5
  41. sky/skylet/log_lib.py +0 -1
  42. sky/skylet/log_lib.pyi +1 -1
  43. sky/utils/asyncio_utils.py +18 -0
  44. sky/utils/common.py +2 -0
  45. sky/utils/context.py +57 -51
  46. sky/utils/context_utils.py +2 -2
  47. sky/utils/controller_utils.py +35 -8
  48. sky/utils/locks.py +20 -5
  49. sky/utils/subprocess_utils.py +4 -3
  50. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/METADATA +38 -37
  51. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/RECORD +57 -56
  52. /sky/dashboard/out/_next/static/{MtlDUf-nH1hhcy7xwbCj3 → -bih7JVStsXyeasac-dvQ}/_buildManifest.js +0 -0
  53. /sky/dashboard/out/_next/static/{MtlDUf-nH1hhcy7xwbCj3 → -bih7JVStsXyeasac-dvQ}/_ssgManifest.js +0 -0
  54. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/WHEEL +0 -0
  55. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/entry_points.txt +0 -0
  56. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/licenses/LICENSE +0 -0
  57. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -8,7 +8,6 @@ import asyncio
 import collections
 import datetime
 import enum
-import logging
 import os
 import pathlib
 import re
@@ -84,6 +83,7 @@ _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5

 _JOB_STATUS_FETCH_MAX_RETRIES = 3
 _JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
+_JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30

 _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
     'Waiting for task to start[/]'
@@ -101,6 +101,13 @@ _JOB_CANCELLED_MESSAGE = (
 # update the state.
 _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120

+# After enabling consolidation mode, we need to restart the API server to get
+# the jobs refresh daemon and the correct number of executors. We use this
+# file to indicate that the API server has been restarted after enabling
+# consolidation mode.
+_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
+    '~/.sky/.jobs_controller_consolidation_reloaded_signal')
+

 class ManagedJobQueueResultType(enum.Enum):
     """The type of the managed job queue result."""
@@ -117,9 +124,8 @@ class UserSignal(enum.Enum):

 # ====== internal functions ======
 def terminate_cluster(
-    cluster_name: str,
-    max_retry: int = 6,
-    _logger: logging.Logger = logger,  # pylint: disable=invalid-name
+    cluster_name: str,
+    max_retry: int = 6,
 ) -> None:
     """Terminate the cluster."""
     from sky import core  # pylint: disable=import-outside-toplevel
@@ -143,18 +149,18 @@ def terminate_cluster(
             return
         except exceptions.ClusterDoesNotExist:
             # The cluster is already down.
-            _logger.debug(f'The cluster {cluster_name} is already down.')
+            logger.debug(f'The cluster {cluster_name} is already down.')
             return
         except Exception as e:  # pylint: disable=broad-except
             retry_cnt += 1
             if retry_cnt >= max_retry:
                 raise RuntimeError(
                     f'Failed to terminate the cluster {cluster_name}.') from e
-            _logger.error(
+            logger.error(
                 f'Failed to terminate the cluster {cluster_name}. Retrying.'
                 f'Details: {common_utils.format_exception(e)}')
             with ux_utils.enable_traceback():
-                _logger.error(f'  Traceback: {traceback.format_exc()}')
+                logger.error(f'  Traceback: {traceback.format_exc()}')
             time.sleep(backoff.current_backoff())

@@ -202,13 +208,39 @@ def _validate_consolidation_mode_config(
 # API Server. Under the hood, we submit the job monitoring logic as processes
 # directly in the API Server.
 # Use LRU Cache so that the check is only done once.
-@annotations.lru_cache(scope='request', maxsize=1)
-def is_consolidation_mode() -> bool:
+@annotations.lru_cache(scope='request', maxsize=2)
+def is_consolidation_mode(on_api_restart: bool = False) -> bool:
     if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
         return True

-    consolidation_mode = skypilot_config.get_nested(
+    config_consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+
+    signal_file = pathlib.Path(
+        _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()
+
+    restart_signal_file_exists = signal_file.exists()
+    consolidation_mode = (config_consolidation_mode and
+                          restart_signal_file_exists)
+
+    if on_api_restart:
+        if config_consolidation_mode:
+            signal_file.touch()
+    else:
+        if not restart_signal_file_exists:
+            if config_consolidation_mode:
+                logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
+                               'managed jobs is enabled in the server config, '
+                               'but the API server has not been restarted yet. '
+                               'Please restart the API server to enable it.'
+                               f'{colorama.Style.RESET_ALL}')
+            return False
+        elif not config_consolidation_mode:
+            # Clean up the signal file if consolidation mode is disabled in
+            # the config. This allows the user to disable consolidation mode
+            # without restarting the API server.
+            signal_file.unlink()
+
     # We should only do this check on the API server, as the controller will
     # not have the related config and will always appear to be disabled for
     # consolidation mode. Check #6611 for more details.
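For orientation, here is a minimal standalone sketch of the restart-signal gating that the new is_consolidation_mode logic implements. The function name and the bare config_on flag are hypothetical simplifications, not SkyPilot APIs:

import pathlib

_SIGNAL = pathlib.Path(
    '~/.sky/.jobs_controller_consolidation_reloaded_signal').expanduser()

def consolidation_active(config_on: bool, on_api_restart: bool = False) -> bool:
    """Hypothetical distillation of the gating above."""
    if on_api_restart:
        if config_on:
            _SIGNAL.touch()  # record that a restart happened with the flag on
        return config_on
    if not _SIGNAL.exists():
        # Enabled in config but the server was never restarted: stay off.
        return False
    if not config_on:
        _SIGNAL.unlink()  # disabling takes effect without a restart
        return False
    return True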
@@ -269,8 +301,7 @@ def ha_recovery_for_consolidation_mode():

 async def get_job_status(
         backend: 'backends.CloudVmRayBackend', cluster_name: str,
-        job_id: Optional[int],
-        job_logger: logging.Logger) -> Optional['job_lib.JobStatus']:
+        job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.

     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
@@ -282,26 +313,28 @@ async def get_job_status(
     if handle is None:
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
-        job_logger.info(f'Cluster {cluster_name} not found.')
+        logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
     for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
         try:
-            job_logger.info('=== Checking the job status... ===')
-            statuses = await context_utils.to_thread(backend.get_job_status,
-                                                     handle,
-                                                     job_ids=job_ids,
-                                                     stream_logs=False)
+            logger.info('=== Checking the job status... ===')
+            statuses = await asyncio.wait_for(
+                context_utils.to_thread(backend.get_job_status,
+                                        handle,
+                                        job_ids=job_ids,
+                                        stream_logs=False),
+                timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
             status = list(statuses.values())[0]
             if status is None:
-                job_logger.info('No job found.')
+                logger.info('No job found.')
             else:
-                job_logger.info(f'Job status: {status}')
-                job_logger.info('=' * 34)
+                logger.info(f'Job status: {status}')
+                logger.info('=' * 34)
             return status
         except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
-                ValueError, TypeError) as e:
+                ValueError, TypeError, asyncio.TimeoutError) as e:
             # Note: Each of these exceptions has some additional conditions to
             # limit how we handle it and whether or not we catch it.
             # Retry on k8s transient network errors. This is useful when using
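The timeout pattern above — a blocking backend call moved to a worker thread and bounded with asyncio.wait_for — can be reproduced in isolation. A minimal sketch, with a made-up slow_status_query standing in for backend.get_job_status:

import asyncio
import time

def slow_status_query() -> str:
    time.sleep(60)  # simulates a status fetch that hangs
    return 'RUNNING'

async def fetch_with_timeout(timeout: float = 30):
    try:
        # Note: on timeout the awaiting task is cancelled, but the worker
        # thread itself keeps running to completion in the background.
        return await asyncio.wait_for(asyncio.to_thread(slow_status_query),
                                      timeout=timeout)
    except asyncio.TimeoutError:
        return None  # treated as a transient error and retried, as above

print(asyncio.run(fetch_with_timeout(timeout=0.1)))  # -> None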
@@ -322,6 +355,9 @@ async def get_job_status(
                 is_transient_error = True
             elif isinstance(e, grpc.FutureTimeoutError):
                 detailed_reason = 'Timeout'
+            elif isinstance(e, asyncio.TimeoutError):
+                detailed_reason = ('Job status check timed out after '
+                                   f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
             # TODO(cooperc): Gracefully handle these exceptions in the backend.
             elif isinstance(e, ValueError):
                 # If the cluster yaml is deleted in the middle of getting the
sky/metrics/utils.py CHANGED
@@ -48,8 +48,15 @@ SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
     'sky_apiserver_code_duration_seconds',
     'Time spent processing code',
     ['name', 'group'],
-    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
-             60.0, 120.0, float('inf')),
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
 )

 # Total number of API server requests, grouped by path, method, and status.
@@ -65,16 +72,30 @@ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
     'sky_apiserver_request_duration_seconds',
     'Time spent processing API server requests',
     ['path', 'method', 'status'],
-    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
-             60.0, 120.0, float('inf')),
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
 )

 SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
     'sky_apiserver_event_loop_lag_seconds',
     'Scheduling delay of the server event loop',
     ['pid'],
-    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
-             60.0, float('inf')),
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
 )

 SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
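These changes replace the coarse latency buckets with a much finer ladder out to 1000 s. As a sketch of the mechanism (prometheus_client is the library aliased as prom here; the metric name below is made up), each observation is counted into every bucket whose upper bound is at least the observed value:

import prometheus_client as prom

DEMO_LATENCY = prom.Histogram(
    'demo_request_duration_seconds',
    'Demo request latency',
    buckets=(0.001, 0.01, 0.1, 1.0, 10.0, 60.0, float('inf')),
)

DEMO_LATENCY.observe(0.004)  # first matching bucket is le="0.01"

Finer buckets improve quantile estimates computed from histogram_quantile-style queries, at the cost of more time series per label combination.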
sky/provision/common.py CHANGED
@@ -97,6 +97,8 @@ class InstanceInfo:
     external_ip: Optional[str]
     tags: Dict[str, str]
     ssh_port: int = 22
+    # The internal service address of the instance on Kubernetes.
+    internal_svc: Optional[str] = None

     def get_feasible_ip(self) -> str:
         """Get the most feasible IPs of the instance. This function returns
sky/provision/instance_setup.py CHANGED
@@ -434,8 +434,16 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
     # use the external IP of the head node.
     use_external_ip = cluster_info.custom_ray_options.pop(
         'use_external_ip', False)
-    head_ip = (head_instance.internal_ip
-               if not use_external_ip else head_instance.external_ip)
+
+    if use_external_ip:
+        head_ip = head_instance.external_ip
+    else:
+        # For Kubernetes, use the internal service address of the head node.
+        # Keep this consistent with the logic in kubernetes-ray.yml.j2.
+        if head_instance.internal_svc:
+            head_ip = head_instance.internal_svc
+        else:
+            head_ip = head_instance.internal_ip

     ray_cmd = ray_worker_start_command(custom_resource,
                                        cluster_info.custom_ray_options,
sky/provision/kubernetes/instance.py CHANGED
@@ -959,12 +959,19 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,

     def _create_resource_thread(i: int):
         pod_spec_copy = copy.deepcopy(pod_spec)
-        if head_pod_name is None and i == 0:
-            # First pod should be head if no head exists
-            pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
-            head_selector = _head_service_selector(cluster_name_on_cloud)
-            pod_spec_copy['metadata']['labels'].update(head_selector)
-            pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
+        # 0 is for the head pod, while 1+ is for worker pods.
+        if i == 0:
+            if head_pod_name is None:
+                # First pod should be head if no head exists.
+                pod_spec_copy['metadata']['labels'].update(
+                    constants.HEAD_NODE_TAGS)
+                head_selector = _head_service_selector(cluster_name_on_cloud)
+                pod_spec_copy['metadata']['labels'].update(head_selector)
+                pod_spec_copy['metadata'][
+                    'name'] = f'{cluster_name_on_cloud}-head'
+            else:
+                # If the head pod already exists, skip creating it.
+                return
         else:
             # Worker pods
             pod_spec_copy['metadata']['labels'].update(
@@ -1105,9 +1112,16 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
                        'and then up the cluster again.')
             raise exceptions.InconsistentHighAvailabilityError(message)

-    # Create pods in parallel
-    created_resources = subprocess_utils.run_in_parallel(
-        _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
+    created_resources = []
+    if to_start_count > 0:
+        # Create pods in parallel.
+        # Use `config.count` instead of `to_start_count` to keep the index of
+        # the pods consistent, especially for the case where some pods are
+        # down due to node failure or manual termination, etc., and the
+        # cluster is then launched again to recreate them.
+        # The existing pods will be skipped in _create_resource_thread.
+        created_resources = subprocess_utils.run_in_parallel(
+            _create_resource_thread, list(range(config.count)), _NUM_THREADS)

     if to_create_deployment:
         deployments = copy.deepcopy(created_resources)
@@ -1350,6 +1364,9 @@ def get_cluster_info(
                 external_ip=None,
                 ssh_port=port,
                 tags=pod.metadata.labels,
+                # TODO(hailong): `cluster.local` may need to be configurable.
+                # The service name is the same as the pod name for now.
+                internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
             )
         ]
         if _is_head(pod):
@@ -1388,6 +1405,13 @@ def get_cluster_info(
     logger.debug(
         f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

+    # cpu_request may be a string like `100m`; parse and convert it to a float.
+    num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
+    # 'num-cpus' for Ray must be an integer, but we should not set it to 0 if
+    # cpus is <1.
+    # Keep consistent with the logic in clouds/kubernetes.py.
+    str_cpus = str(max(int(num_cpus), 1))
+
     return common.ClusterInfo(
         instances=pods,
         head_instance_id=head_pod_name,
@@ -1397,7 +1421,7 @@
         # problems for other pods.
         custom_ray_options={
             'object-store-memory': 500000000,
-            'num-cpus': cpu_request,
+            'num-cpus': str_cpus,
         },
         provider_name='kubernetes',
         provider_config=provider_config)
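The internal_svc value follows the standard Kubernetes DNS form for a pod reached through a service, <service>.<namespace>.svc.<cluster-domain>. A small illustrative helper (hypothetical, assuming the default cluster.local domain flagged in the TODO above):

def pod_service_fqdn(pod_name: str, namespace: str,
                     cluster_domain: str = 'cluster.local') -> str:
    # The service is named after the pod, per the comment in the diff.
    return f'{pod_name}.{namespace}.svc.{cluster_domain}'

assert (pod_service_fqdn('mycluster-head', 'default') ==
        'mycluster-head.default.svc.cluster.local')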
sky/provision/kubernetes/utils.py CHANGED
@@ -1299,30 +1299,52 @@ class V1Pod:


 @_retry_on_error(resource_type='pod')
-def get_all_pods_in_kubernetes_cluster(*,
-                                       context: Optional[str] = None
-                                      ) -> List[V1Pod]:
-    """Gets pods in all namespaces in kubernetes cluster indicated by context.
-
-    Used for computing cluster resource usage.
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in the kubernetes cluster indicated by context.
     """
     if context is None:
         context = get_current_kube_config_context_name()
+    non_included_pod_statuses = POD_STATUSES.copy()
+    status_filters = ['Running', 'Pending']
+    if status_filters is not None:
+        non_included_pod_statuses -= set(status_filters)
+    field_selector = ','.join(
+        [f'status.phase!={status}' for status in non_included_pod_statuses])

     # Return raw urllib3.HTTPResponse object so that we can parse the json
     # more efficiently.
     response = kubernetes.core_api(context).list_pod_for_all_namespaces(
-        _request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
+        _request_timeout=kubernetes.API_TIMEOUT,
+        _preload_content=False,
+        field_selector=field_selector)
     try:
-        pods = [
-            V1Pod.from_dict(item_dict) for item_dict in ijson.items(
-                response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
-        ]
+        allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        for item_dict in ijson.items(response,
+                                     'items.item',
+                                     buf_size=IJSON_BUFFER_SIZE):
+            pod = V1Pod.from_dict(item_dict)
+            if should_exclude_pod_from_gpu_allocation(pod):
+                logger.debug(
+                    f'Excluding pod {pod.metadata.name} from GPU count '
+                    f'calculations on node {pod.spec.node_name}')
+                continue
+            # Iterate over all the containers in the pod and sum the
+            # GPU requests.
+            pod_allocated_qty = 0
+            for container in pod.spec.containers:
+                if container.resources.requests:
+                    pod_allocated_qty += get_node_accelerator_count(
+                        context, container.resources.requests)
+            if pod_allocated_qty > 0 and pod.spec.node_name:
+                allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
+        return allocated_qty_by_node
     finally:
         response.release_conn()

-    return pods
-

 def check_instance_fits(context: Optional[str],
                         instance: str) -> Tuple[bool, Optional[str]]:
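The field selector built above pushes the phase filtering to the Kubernetes API server, so only Running and Pending pods are returned at all. A self-contained sketch of the string it produces (POD_STATUSES here mirrors the module-level set the code assumes):

POD_STATUSES = {'Pending', 'Running', 'Succeeded', 'Failed', 'Unknown'}

excluded = POD_STATUSES - {'Running', 'Pending'}
field_selector = ','.join(f'status.phase!={phase}' for phase in sorted(excluded))
print(field_selector)
# status.phase!=Failed,status.phase!=Succeeded,status.phase!=Unknown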
@@ -2219,6 +2241,15 @@ def get_kube_config_context_namespace(
     return DEFAULT_NAMESPACE


+def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
+    if not resource_str:
+        return 0.0
+    if resource_str[-1] == 'm':
+        return float(resource_str[:-1]) / 1000
+    else:
+        return float(resource_str)
+
+
 def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
     resource_str = str(resource_qty_str)
     if resource_str[-1] == 'm':
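The new helper treats the Kubernetes millicore suffix m as thousandths of a core. Expected behavior, shown with an inline copy for illustration:

def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
    # Mirrors the helper added above.
    if not resource_str:
        return 0.0
    if resource_str[-1] == 'm':
        return float(resource_str[:-1]) / 1000  # millicores -> cores
    return float(resource_str)

assert parse_cpu_or_gpu_resource_to_float('100m') == 0.1
assert parse_cpu_or_gpu_resource_to_float('2') == 2.0
assert parse_cpu_or_gpu_resource_to_float('') == 0.0
# Downstream, Ray's integer `num-cpus` is floored but never set to 0:
assert str(max(int(parse_cpu_or_gpu_resource_to_float('100m')), 1)) == '1'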
@@ -3006,41 +3037,24 @@ def get_kubernetes_node_info(
     label_keys = lf.get_label_keys()

     # Check if all nodes have no accelerators to avoid fetching pods
-    any_node_has_accelerators = False
+    has_accelerator_nodes = False
     for node in nodes:
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
         if accelerator_count > 0:
-            any_node_has_accelerators = True
+            has_accelerator_nodes = True
             break

-    # Get the pods to get the real-time resource usage
-    pods = None
+    # Get the allocated GPU quantity by each node
     allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
-    if any_node_has_accelerators:
+    error_on_get_allocated_gpu_qty_by_node = False
+    if has_accelerator_nodes:
         try:
-            pods = get_all_pods_in_kubernetes_cluster(context=context)
-            # Pre-compute allocated accelerator count per node
-            for pod in pods:
-                if pod.status.phase in ['Running', 'Pending']:
-                    # Skip pods that should not count against GPU count
-                    if should_exclude_pod_from_gpu_allocation(pod):
-                        logger.debug(f'Excluding low priority pod '
-                                     f'{pod.metadata.name} from GPU allocation '
-                                     f'calculations')
-                        continue
-                    # Iterate over all the containers in the pod and sum the
-                    # GPU requests
-                    pod_allocated_qty = 0
-                    for container in pod.spec.containers:
-                        if container.resources.requests:
-                            pod_allocated_qty += get_node_accelerator_count(
-                                context, container.resources.requests)
-                    if pod_allocated_qty > 0:
-                        allocated_qty_by_node[
-                            pod.spec.node_name] += pod_allocated_qty
+            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+                context=context)
         except kubernetes.api_exception() as e:
             if e.status == 403:
+                error_on_get_allocated_gpu_qty_by_node = True
                 pass
             else:
                 raise
@@ -3085,7 +3099,7 @@
                 ip_address=node_ip)
             continue

-        if pods is None:
+        if not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
             accelerators_available = -1
         else:
             allocated_qty = allocated_qty_by_node[node.metadata.name]
sky/server/common.py CHANGED
@@ -554,8 +554,8 @@ def _start_api_server(deploy: bool = False,
         # pylint: disable=import-outside-toplevel
         import sky.jobs.utils as job_utils
         max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
-                      if job_utils.is_consolidation_mode() else
-                      server_constants.MIN_AVAIL_MEM_GB)
+                      if job_utils.is_consolidation_mode(on_api_restart=True)
+                      else server_constants.MIN_AVAIL_MEM_GB)
         if avail_mem_size_gb <= max_memory:
             logger.warning(
                 f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
@@ -571,6 +571,8 @@ def _start_api_server(deploy: bool = False,
             args += [f'--host={host}']
         if metrics_port is not None:
             args += [f'--metrics-port={metrics_port}']
+        # Use this argument to disable the internal signal file check.
+        args += ['--start-with-python']

         if foreground:
             # Replaces the current process with the API server
sky/server/requests/executor.py CHANGED
@@ -424,6 +424,7 @@ def _request_execution_wrapper(request_id: str,
         os.close(original_stderr)
         original_stderr = None

+    request_name = None
     try:
         # As soon as the request is updated with the executor PID, we can
         # receive SIGTERM from cancellation. So, we update the request inside
@@ -515,7 +516,8 @@
             annotations.clear_request_level_cache()
             with metrics_utils.time_it(name='release_memory', group='internal'):
                 common_utils.release_memory()
-            _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
+            if request_name is not None:
+                _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
         except Exception as e:  # pylint: disable=broad-except
             logger.error(f'Failed to record memory metrics: '
                          f'{common_utils.format_exception(e)}')
sky/server/requests/preconditions.py CHANGED
@@ -112,10 +112,8 @@ class Precondition(abc.ABC):
                     return True
                 if status_msg is not None and status_msg != last_status_msg:
                     # Update the status message if it has changed.
-                    async with api_requests.update_request_async(
-                            self.request_id) as req:
-                        assert req is not None, self.request_id
-                        req.status_msg = status_msg
+                    await api_requests.update_status_msg_async(
+                        self.request_id, status_msg)
                     last_status_msg = status_msg
             except (Exception, SystemExit, KeyboardInterrupt) as e:  # pylint: disable=broad-except
                 api_requests.set_request_failed(self.request_id, e)
sky/server/requests/requests.py CHANGED
@@ -14,8 +14,8 @@ import sqlite3
 import threading
 import time
 import traceback
-from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
-                    NamedTuple, Optional, Tuple)
+from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
+                    Tuple)

 import anyio
 import colorama
@@ -32,6 +32,7 @@ from sky.server import daemons
 from sky.server.requests import payloads
 from sky.server.requests.serializers import decoders
 from sky.server.requests.serializers import encoders
+from sky.utils import asyncio_utils
 from sky.utils import common_utils
 from sky.utils import ux_utils
 from sky.utils.db import db_utils
@@ -578,27 +579,14 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:

 @init_db
 @metrics_lib.time_me
-def update_request_async(
-        request_id: str) -> AsyncContextManager[Optional[Request]]:
-    """Async version of update_request.
-
-    Returns an async context manager that yields the request record and
-    persists any in-place updates upon exit.
-    """
-
-    @contextlib.asynccontextmanager
-    async def _cm():
-        # Acquire the lock to avoid race conditions between multiple request
-        # operations, e.g. execute and cancel.
-        async with filelock.AsyncFileLock(request_lock_path(request_id)):
-            request = await _get_request_no_lock_async(request_id)
-            try:
-                yield request
-            finally:
-                if request is not None:
-                    await _add_or_update_request_no_lock_async(request)
-
-    return _cm()
+@asyncio_utils.shield
+async def update_status_msg_async(request_id: str, status_msg: str) -> None:
+    """Update the status message of a request."""
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request = await _get_request_no_lock_async(request_id)
+        if request is not None:
+            request.status_msg = status_msg
+            await _add_or_update_request_no_lock_async(request)


 _get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
@@ -651,6 +639,7 @@ def get_request(request_id: str) -> Optional[Request]:

 @init_db_async
 @metrics_lib.time_me_async
+@asyncio_utils.shield
 async def get_request_async(request_id: str) -> Optional[Request]:
     """Async version of get_request."""
     async with filelock.AsyncFileLock(request_lock_path(request_id)):
@@ -704,6 +693,7 @@ def create_if_not_exists(request: Request) -> bool:

 @init_db_async
 @metrics_lib.time_me_async
+@asyncio_utils.shield
 async def create_if_not_exists_async(request: Request) -> bool:
     """Async version of create_if_not_exists."""
     async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
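The new @asyncio_utils.shield decorator comes from sky/utils/asyncio_utils.py (+18 lines in this diff), which is not shown here. As a plausible minimal sketch only, such a decorator could wrap the coroutine in a task guarded by asyncio.shield, so that caller cancellation cannot interrupt the lock-and-write critical section midway:

import asyncio
import functools

def shield(func):
    """Sketch: keep `func` running to completion even if the awaiting
    caller is cancelled (the caller still sees CancelledError)."""
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        task = asyncio.get_running_loop().create_task(func(*args, **kwargs))
        return await asyncio.shield(task)
    return wrapper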
sky/server/server.py CHANGED
@@ -1968,6 +1968,7 @@ if __name__ == '__main__':
     # Serve metrics on a separate port to isolate it from the application APIs:
     # metrics port will not be exposed to the public network typically.
     parser.add_argument('--metrics-port', default=9090, type=int)
+    parser.add_argument('--start-with-python', action='store_true')
     cmd_args = parser.parse_args()
     if cmd_args.port == cmd_args.metrics_port:
         logger.error('port and metrics-port cannot be the same, exiting.')
@@ -1982,6 +1983,10 @@
         logger.error(f'Port {cmd_args.port} is not available, exiting.')
         raise RuntimeError(f'Port {cmd_args.port} is not available')

+    if not cmd_args.start_with_python:
+        # Maybe touch the signal file on API server startup.
+        managed_job_utils.is_consolidation_mode(on_api_restart=True)
+
     # Show the privacy policy if it is not already shown. We place it here so
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
sky/sky_logging.py CHANGED
@@ -109,7 +109,6 @@ def _setup_logger():
     global _default_handler
     if _default_handler is None:
         _default_handler = EnvAwareHandler(sys.stdout)
-        _default_handler.flush = sys.stdout.flush  # type: ignore
         if env_options.Options.SHOW_DEBUG_INFO.get():
             _default_handler.setLevel(logging.DEBUG)
         else:
@@ -129,7 +128,6 @@ def _setup_logger():
     for logger_name in _SENSITIVE_LOGGER:
         logger = logging.getLogger(logger_name)
         handler_to_logger = EnvAwareHandler(sys.stdout, sensitive=True)
-        handler_to_logger.flush = sys.stdout.flush  # type: ignore
         logger.addHandler(handler_to_logger)
         logger.setLevel(logging.INFO)
         if _show_logging_prefix():