PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20250918__py3-none-any.whl → 1.0.0.dev20250922__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20250918__py3-none-any.whl → 1.0.0.dev20250922__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (52) hide show

sky/server/metrics.py CHANGED Viewed

@@ -1,11 +1,11 @@
 """Instrumentation for the API server."""
-import contextlib
-import functools
+import asyncio
 import multiprocessing
 import os
 import threading
 import time
+from typing import List
 import fastapi
 from prometheus_client import generate_latest
@@ -15,112 +15,12 @@ import psutil
 import starlette.middleware.base
 import uvicorn
+from sky import core
 from sky import sky_logging
-from sky.skylet import constants
-# Whether the metrics are enabled, cannot be changed at runtime.
-METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
-                                 'false').lower() == 'true'
-_KB = 2**10
-_MB = 2**20
-_MEM_BUCKETS = [
-    _KB,
-    256 * _KB,
-    512 * _KB,
-    _MB,
-    2 * _MB,
-    4 * _MB,
-    8 * _MB,
-    16 * _MB,
-    32 * _MB,
-    64 * _MB,
-    128 * _MB,
-    256 * _MB,
-    float('inf'),
-]
+from sky.metrics import utils as metrics_utils
 logger = sky_logging.init_logger(__name__)
-# Total number of API server requests, grouped by path, method, and status.
-SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
-    'sky_apiserver_requests_total',
-    'Total number of API server requests',
-    ['path', 'method', 'status'],
-)
-# Time spent processing API server requests, grouped by path, method, and
-# status.
-SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
-    'sky_apiserver_request_duration_seconds',
-    'Time spent processing API server requests',
-    ['path', 'method', 'status'],
-    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
-             60.0, 120.0, float('inf')),
-)
-# Time spent processing a piece of code, refer to time_it().
-SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
-    'sky_apiserver_code_duration_seconds',
-    'Time spent processing code',
-    ['name', 'group'],
-    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
-             60.0, 120.0, float('inf')),
-)
-SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
-    'sky_apiserver_event_loop_lag_seconds',
-    'Scheduling delay of the server event loop',
-    ['pid'],
-    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
-             60.0, float('inf')),
-)
-SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
-    'sky_apiserver_websocket_connections',
-    'Number of websocket connections',
-    ['pid'],
-    multiprocess_mode='livesum',
-)
-SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
-    'sky_apiserver_websocket_closed_total',
-    'Number of websocket closed',
-    ['pid', 'reason'],
-)
-# The number of execution starts in each worker process, we do not record
-# histogram here as the duration has been measured in
-# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
-# Recording histogram WITH worker label will cause high cardinality.
-SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
-    'sky_apiserver_process_execution_start_total',
-    'Total number of execution starts in each worker process',
-    ['request', 'pid'],
-)
-SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
-    'sky_apiserver_process_peak_rss',
-    'Peak RSS we saw in each process in last 30 seconds',
-    ['pid', 'type'],
-)
-SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
-    'sky_apiserver_process_cpu_total',
-    'Total CPU times a worker process has been running',
-    ['pid', 'type', 'mode'],
-)
-SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
-    'sky_apiserver_request_memory_usage_bytes',
-    'Peak memory usage of requests', ['name'],
-    buckets=_MEM_BUCKETS)
-SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
-    'sky_apiserver_request_rss_incr_bytes',
-    'RSS increment after requests', ['name'],
-    buckets=_MEM_BUCKETS)
 metrics_app = fastapi.FastAPI()
@@ -139,6 +39,42 @@ async def metrics() -> fastapi.Response:
                             headers={'Cache-Control': 'no-cache'})
+@metrics_app.get('/gpu-metrics')
+async def gpu_metrics() -> fastapi.Response:
+    """Gets the GPU metrics from multiple external k8s clusters"""
+    contexts = core.get_all_contexts()
+    all_metrics: List[str] = []
+    successful_contexts = 0
+    tasks = [
+        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
+        for context in contexts
+        if context != 'in-cluster'
+    ]
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+    for i, result in enumerate(results):
+        if isinstance(result, Exception):
+            logger.error(
+                f'Failed to get metrics for context {contexts[i]}: {result}')
+        elif isinstance(result, BaseException):
+            # Avoid changing behavior for non-Exception BaseExceptions
+            # like KeyboardInterrupt/SystemExit: re-raise them.
+            raise result
+        else:
+            metrics_text = result
+            all_metrics.append(metrics_text)
+            successful_contexts += 1
+    combined_metrics = '\n\n'.join(all_metrics)
+    # Return as plain text for Prometheus compatibility
+    return fastapi.Response(
+        content=combined_metrics,
+        media_type='text/plain; version=0.0.4; charset=utf-8')
 def build_metrics_server(host: str, port: int) -> uvicorn.Server:
     metrics_config = uvicorn.Config(
         'sky.server.metrics:metrics_app',
@@ -182,61 +118,17 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
             status_code_group = '5xx'
             raise
         finally:
-            SKY_APISERVER_REQUESTS_TOTAL.labels(path=path,
-                                                method=method,
-                                                status=status_code_group).inc()
+            metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
+                path=path, method=method, status=status_code_group).inc()
             if not streaming:
                 duration = time.time() - start_time
-                SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
+                metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
                     path=path, method=method,
                     status=status_code_group).observe(duration)
         return response
-@contextlib.contextmanager
-def time_it(name: str, group: str = 'default'):
-    """Context manager to measure and record code execution duration."""
-    if not METRICS_ENABLED:
-        yield
-    else:
-        start_time = time.time()
-        try:
-            yield
-        finally:
-            duration = time.time() - start_time
-            SKY_APISERVER_CODE_DURATION_SECONDS.labels(
-                name=name, group=group).observe(duration)
-def time_me(func):
-    """Measure the duration of decorated function."""
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        if not METRICS_ENABLED:
-            return func(*args, **kwargs)
-        name = f'{func.__module__}/{func.__name__}'
-        with time_it(name, group='function'):
-            return func(*args, **kwargs)
-    return wrapper
-def time_me_async(func):
-    """Measure the duration of decorated async function."""
-    @functools.wraps(func)
-    async def async_wrapper(*args, **kwargs):
-        if not METRICS_ENABLED:
-            return await func(*args, **kwargs)
-        name = f'{func.__module__}/{func.__name__}'
-        with time_it(name, group='function'):
-            return await func(*args, **kwargs)
-    return async_wrapper
 peak_rss_bytes = 0
@@ -252,13 +144,15 @@ def process_monitor(process_type: str, stop: threading.Event):
             last_bucket_end = time.time()
             bucket_peak = 0
         peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
-        SKY_APISERVER_PROCESS_PEAK_RSS.labels(
+        metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
             pid=pid, type=process_type).set(peak_rss_bytes)
         ctimes = proc.cpu_times()
-        SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
-                                               type=process_type,
-                                               mode='user').set(ctimes.user)
-        SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
-                                               type=process_type,
-                                               mode='system').set(ctimes.system)
+        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+                                                             type=process_type,
+                                                             mode='user').set(
+                                                                 ctimes.user)
+        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+                                                             type=process_type,
+                                                             mode='system').set(
+                                                                 ctimes.system)
         time.sleep(1)

sky/server/requests/executor.py CHANGED Viewed

@@ -39,6 +39,7 @@ from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
+from sky.metrics import utils as metrics_utils
 from sky.server import common as server_common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
@@ -422,10 +423,10 @@ def _request_execution_wrapper(request_id: str,
                     config = skypilot_config.to_dict()
                     logger.debug(f'request config: \n'
                                  f'{yaml_utils.dump_yaml_str(dict(config))}')
-                metrics_lib.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.labels(
-                    request=request_name, pid=pid).inc()
-                with metrics_lib.time_it(name=request_name,
-                                         group='request_execution'):
+                (metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
+                 labels(request=request_name, pid=pid).inc())
+                with metrics_utils.time_it(name=request_name,
+                                           group='request_execution'):
                     return_value = func(**request_body.to_kwargs())
                 f.flush()
         except KeyboardInterrupt:
@@ -468,8 +469,8 @@ def _request_execution_wrapper(request_id: str,
                 # Clear request level cache to release all memory used by
                 # the request.
                 annotations.clear_request_level_cache()
-                with metrics_lib.time_it(name='release_memory',
-                                         group='internal'):
+                with metrics_utils.time_it(name='release_memory',
+                                           group='internal'):
                     common_utils.release_memory()
                 _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
             except Exception as e:  # pylint: disable=broad-except
@@ -493,11 +494,11 @@ def _record_memory_metrics(request_name: str, proc: psutil.Process,
     rss_end = proc.memory_info().rss
     # Answer "how much RSS this request contributed?"
-    metrics_lib.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
+    metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
         name=request_name).observe(max(rss_end - rss_begin, 0))
     # Estimate the memory usage by the request by capturing the
     # peak memory delta during the request execution.
-    metrics_lib.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
+    metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
         name=request_name).observe(max(peak_rss - rss_begin, 0))

sky/server/requests/payloads.py CHANGED Viewed

@@ -792,6 +792,12 @@ class GetConfigBody(RequestBody):
 class CostReportBody(RequestBody):
     """The request body for the cost report endpoint."""
     days: Optional[int] = 30
+    # we use hashes instead of names to avoid the case where
+    # the name is not unique
+    cluster_hashes: Optional[List[str]] = None
+    # Only return fields that are needed for the dashboard
+    # summary page
+    dashboard_summary_response: bool = False
 class RequestPayload(BasePayload):

sky/server/requests/requests.py CHANGED Viewed

@@ -25,10 +25,10 @@ from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
 from sky import skypilot_config
+from sky.metrics import utils as metrics_lib
 from sky.server import common as server_common
 from sky.server import constants as server_constants
 from sky.server import daemons
-from sky.server import metrics as metrics_lib
 from sky.server.requests import payloads
 from sky.server.requests.serializers import decoders
 from sky.server.requests.serializers import encoders

sky/server/requests/serializers/encoders.py CHANGED Viewed

@@ -185,8 +185,9 @@ def encode_cost_report(
     for cluster_report in cost_report:
         if cluster_report['status'] is not None:
             cluster_report['status'] = cluster_report['status'].value
-        cluster_report['resources'] = pickle_and_encode(
-            cluster_report['resources'])
+        if 'resources' in cluster_report:
+            cluster_report['resources'] = pickle_and_encode(
+                cluster_report['resources'])
     return cost_report

sky/server/server.py CHANGED Viewed

@@ -437,7 +437,7 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
         if lag_threshold is not None and lag > lag_threshold:
             logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
                            f'{lag_threshold} seconds.')
-        metrics.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
+        metrics_utils.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
             pid=pid).observe(lag)
         target = now + interval
         loop.call_at(target, tick)
@@ -470,7 +470,7 @@ async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-nam
             # can safely ignore the error if the task is already scheduled.
             logger.debug(f'Request {event.id} already exists.')
     asyncio.create_task(cleanup_upload_ids())
-    if metrics.METRICS_ENABLED:
+    if metrics_utils.METRICS_ENABLED:
         # Start monitoring the event loop lag in each server worker
         # event loop (process).
         asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
@@ -1743,7 +1743,7 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
             return
     logger.info(f'Starting port-forward to local port: {local_port}')
-    conn_gauge = metrics.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
+    conn_gauge = metrics_utils.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
         pid=os.getpid())
     ssh_failed = False
     websocket_closed = False
@@ -1807,14 +1807,14 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
                          'ssh websocket connection was closed. Remaining '
                          f'output: {str(stdout)}')
             reason = 'KubectlPortForwardExit'
-            metrics.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+            metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
                 pid=os.getpid(), reason='KubectlPortForwardExit').inc()
         else:
             if ssh_failed:
                 reason = 'SSHToPodDisconnected'
             else:
                 reason = 'ClientClosed'
-        metrics.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+        metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
             pid=os.getpid(), reason=reason).inc()
@@ -1831,42 +1831,6 @@ async def all_contexts(request: fastapi.Request) -> None:
     )
-@app.get('/gpu-metrics')
-async def gpu_metrics() -> fastapi.Response:
-    """Gets the GPU metrics from multiple external k8s clusters"""
-    contexts = core.get_all_contexts()
-    all_metrics: List[str] = []
-    successful_contexts = 0
-    tasks = [
-        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
-        for context in contexts
-        if context != 'in-cluster'
-    ]
-    results = await asyncio.gather(*tasks, return_exceptions=True)
-    for i, result in enumerate(results):
-        if isinstance(result, Exception):
-            logger.error(
-                f'Failed to get metrics for context {contexts[i]}: {result}')
-        elif isinstance(result, BaseException):
-            # Avoid changing behavior for non-Exception BaseExceptions
-            # like KeyboardInterrupt/SystemExit: re-raise them.
-            raise result
-        else:
-            metrics_text = result
-            all_metrics.append(metrics_text)
-            successful_contexts += 1
-    combined_metrics = '\n\n'.join(all_metrics)
-    # Return as plain text for Prometheus compatibility
-    return fastapi.Response(
-        content=combined_metrics,
-        media_type='text/plain; version=0.0.4; charset=utf-8')
 # === Internal APIs ===
 @app.get('/api/completion/cluster_name')
 async def complete_cluster_name(incomplete: str,) -> List[str]:

sky/setup_files/dependencies.py CHANGED Viewed

@@ -49,8 +49,15 @@ install_requires = [
     # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
     'pyyaml > 3.13, != 5.4.*',
     'requests',
+    # SkyPilot inherits from uvicorn.Server to customize the behavior of
+    # uvicorn, so we need to pin uvicorn version to avoid potential
+    # breaking changes.
+    # Notes for current version check:
+    # - uvicorn 0.33.0 is the latest version that supports Python 3.8
+    # - uvicorn 0.36.0 removes setup_event_loop thus breaks SkyPilot's custom
+    #   behavior.
+    'uvicorn[standard] >=0.33.0, <0.36.0',
     'fastapi',
-    'uvicorn[standard]',
     # Some pydantic versions are not compatible with ray. Adopted from ray's
     # setup.py:
     # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L254

sky/skylet/constants.py CHANGED Viewed

@@ -29,6 +29,7 @@ SKY_REMOTE_RAY_PORT_FILE = '~/.sky/ray_port.json'
 SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
 SKY_REMOTE_RAY_VERSION = '2.9.3'
+SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
 # We store the absolute path of the python executable (/opt/conda/bin/python3)
 # in this file, so that any future internal commands that need to use python
 # can use this path. This is useful for the case where the user has a custom
@@ -40,7 +41,7 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
                            f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
                            'which python3')
 # Python executable, e.g., /opt/conda/bin/python3
-SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
+SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
 # Prefer SKY_UV_PIP_CMD, which is faster.
 # TODO(cooperc): remove remaining usage (GCP TPU setup).
 SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
@@ -56,14 +57,15 @@ SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
 ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
 # uv is used for venv and pip, much faster than python implementations.
 SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
-SKY_UV_CMD = f'UV_SYSTEM_PYTHON=false {SKY_UV_INSTALL_DIR}/uv'
+SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
+              f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
 # This won't reinstall uv if it's already installed, so it's safe to re-run.
 SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
                       'curl -LsSf https://astral.sh/uv/install.sh '
                       f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
 SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
-SKY_UV_RUN_CMD: str = (
-    f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run --active')
+SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
+                       '--no-project --no-config')
 # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
 # VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
 # not work when conda is used.

sky/skylet/job_lib.py CHANGED Viewed

@@ -559,21 +559,20 @@ def get_jobs_info(user_hash: Optional[str] = None,
     jobs_info = []
     for job in jobs:
         jobs_info.append(
-            jobsv1_pb2.JobInfo(
-                job_id=job['job_id'],
-                job_name=job['job_name'],
-                username=job['username'],
-                submitted_at=job['submitted_at'],
-                status=job['status'].to_protobuf(),
-                run_timestamp=job['run_timestamp'],
-                start_at=job['start_at']
-                if job['start_at'] is not None else -1.0,
-                end_at=job['end_at'] if job['end_at'] is not None else 0.0,
-                resources=job['resources'] or '',
-                pid=job['pid'],
-                log_path=os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                      job['run_timestamp']),
-                metadata=json.dumps(job['metadata'])))
+            jobsv1_pb2.JobInfo(job_id=job['job_id'],
+                               job_name=job['job_name'],
+                               username=job['username'],
+                               submitted_at=job['submitted_at'],
+                               status=job['status'].to_protobuf(),
+                               run_timestamp=job['run_timestamp'],
+                               start_at=job['start_at'],
+                               end_at=job['end_at'],
+                               resources=job['resources'],
+                               pid=job['pid'],
+                               log_path=os.path.join(
+                                   constants.SKY_LOGS_DIRECTORY,
+                                   job['run_timestamp']),
+                               metadata=json.dumps(job['metadata'])))
     return jobs_info

sky/utils/locks.py CHANGED Viewed

@@ -11,6 +11,7 @@ import time
 from typing import Any, Optional
 import filelock
+import psycopg2
 import sqlalchemy
 from sky import global_user_state
@@ -197,6 +198,7 @@ class PostgresLock(DistributedLock):
         if engine.dialect.name != db_utils.SQLAlchemyDialect.POSTGRESQL.value:
             raise ValueError('PostgresLock requires PostgreSQL database. '
                              f'Current dialect: {engine.dialect.name}')
+        # Borrow a dedicated connection from the pool.
         return engine.raw_connection()
     def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
@@ -233,9 +235,7 @@ class PostgresLock(DistributedLock):
                 time.sleep(self.poll_interval)
         except Exception:
-            if self._connection:
-                self._connection.close()
-                self._connection = None
+            self._close_connection()
             raise
     def release(self) -> None:
@@ -248,27 +248,58 @@ class PostgresLock(DistributedLock):
             cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
             self._connection.commit()
             self._acquired = False
+        except psycopg2.OperationalError as e:
+            # Lost connection to the database, likely the lock is force unlocked
+            # by other routines.
+            logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
         finally:
-            if self._connection:
-                self._connection.close()
-                self._connection = None
+            self._close_connection()
     def force_unlock(self) -> None:
         """Force unlock the postgres advisory lock."""
         try:
-            if not self._connection:
+            # The lock is held by current routine, gracefully unlock it
+            if self._acquired:
+                self.release()
+                return
+            # The lock is held by another routine, force unlock it.
+            if self._connection is None:
                 self._connection = self._get_connection()
             cursor = self._connection.cursor()
             cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
-            self._connection.commit()
+            result = cursor.fetchone()[0]
+            if result:
+                # The lock is held by current routine and unlock succeeded
+                self._connection.commit()
+                self._acquired = False
+                return
+            cursor.execute(
+                ('SELECT pid FROM pg_locks WHERE locktype = \'advisory\' '
+                 'AND ((classid::bigint << 32) | objid::bigint) = %s'),
+                (self._lock_key,))
+            row = cursor.fetchone()
+            if row:
+                # The lock is still held by another routine, force unlock it
+                # by killing the PG connection of that routine.
+                cursor.execute('SELECT pg_terminate_backend(%s)', (row[0],))
+                self._connection.commit()
+                return
         except Exception as e:
             raise RuntimeError(
                 f'Failed to force unlock postgres lock {self.lock_id}: {e}'
             ) from e
         finally:
-            if self._connection:
+            self._close_connection()
+    def _close_connection(self) -> None:
+        """Close the postgres connection."""
+        if self._connection:
+            try:
                 self._connection.close()
-                self._connection = None
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug(f'Failed to close postgres connection: {e}')
+            self._connection = None
     def is_locked(self) -> bool:
         """Check if the postgres advisory lock is acquired."""

skypilot-nightly 1.0.0.dev20250918__py3-none-any.whl → 1.0.0.dev20250922__py3-none-any.whl

Potentially problematic release.

skypilot-nightly 1.0.0.dev20250918__py3-none-any.whl → 1.0.0.dev20250922__py3-none-any.whl