skypilot-nightly 1.0.0.dev20250918__py3-none-any.whl → 1.0.0.dev20250922__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic.

Files changed (52)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +12 -15
  3. sky/core.py +67 -45
  4. sky/dashboard/out/404.html +1 -1
  5. sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → KP6HCNMqb_bnJB17oplgW}/_buildManifest.js +1 -1
  6. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +1 -0
  8. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +1 -0
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +16 -0
  10. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0b4b35dc1dfe046c.js → [cluster]-9525660179df3605.js} +1 -1
  11. sky/dashboard/out/_next/static/chunks/{webpack-487697b47d8c5e50.js → webpack-26167a9e6d91fa51.js} +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  21. sky/dashboard/out/jobs.html +1 -1
  22. sky/dashboard/out/users.html +1 -1
  23. sky/dashboard/out/volumes.html +1 -1
  24. sky/dashboard/out/workspace/new.html +1 -1
  25. sky/dashboard/out/workspaces/[name].html +1 -1
  26. sky/dashboard/out/workspaces.html +1 -1
  27. sky/global_user_state.py +90 -56
  28. sky/metrics/utils.py +174 -8
  29. sky/schemas/generated/jobsv1_pb2.py +40 -40
  30. sky/serve/serve_utils.py +0 -4
  31. sky/server/auth/oauth2_proxy.py +2 -2
  32. sky/server/metrics.py +52 -158
  33. sky/server/requests/executor.py +9 -8
  34. sky/server/requests/payloads.py +6 -0
  35. sky/server/requests/requests.py +1 -1
  36. sky/server/requests/serializers/encoders.py +3 -2
  37. sky/server/server.py +5 -41
  38. sky/setup_files/dependencies.py +8 -1
  39. sky/skylet/constants.py +6 -4
  40. sky/skylet/job_lib.py +14 -15
  41. sky/utils/locks.py +41 -10
  42. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/METADATA +35 -35
  43. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/RECORD +48 -48
  44. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  45. sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +0 -1
  46. sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +0 -1
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  48. /sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → KP6HCNMqb_bnJB17oplgW}/_ssgManifest.js +0 -0
  49. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/WHEEL +0 -0
  50. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/entry_points.txt +0 -0
  51. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/licenses/LICENSE +0 -0
  52. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/top_level.txt +0 -0
sky/server/metrics.py CHANGED
@@ -1,11 +1,11 @@
 """Instrumentation for the API server."""
 
-import contextlib
-import functools
+import asyncio
 import multiprocessing
 import os
 import threading
 import time
+from typing import List
 
 import fastapi
 from prometheus_client import generate_latest
@@ -15,112 +15,12 @@ import psutil
 import starlette.middleware.base
 import uvicorn
 
+from sky import core
 from sky import sky_logging
-from sky.skylet import constants
-
-# Whether the metrics are enabled, cannot be changed at runtime.
-METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
-                                 'false').lower() == 'true'
-
-_KB = 2**10
-_MB = 2**20
-_MEM_BUCKETS = [
-    _KB,
-    256 * _KB,
-    512 * _KB,
-    _MB,
-    2 * _MB,
-    4 * _MB,
-    8 * _MB,
-    16 * _MB,
-    32 * _MB,
-    64 * _MB,
-    128 * _MB,
-    256 * _MB,
-    float('inf'),
-]
+from sky.metrics import utils as metrics_utils
 
 logger = sky_logging.init_logger(__name__)
 
-# Total number of API server requests, grouped by path, method, and status.
-SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
-    'sky_apiserver_requests_total',
-    'Total number of API server requests',
-    ['path', 'method', 'status'],
-)
-
-# Time spent processing API server requests, grouped by path, method, and
-# status.
-SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
-    'sky_apiserver_request_duration_seconds',
-    'Time spent processing API server requests',
-    ['path', 'method', 'status'],
-    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
-             60.0, 120.0, float('inf')),
-)
-
-# Time spent processing a piece of code, refer to time_it().
-SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
-    'sky_apiserver_code_duration_seconds',
-    'Time spent processing code',
-    ['name', 'group'],
-    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
-             60.0, 120.0, float('inf')),
-)
-
-SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
-    'sky_apiserver_event_loop_lag_seconds',
-    'Scheduling delay of the server event loop',
-    ['pid'],
-    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
-             60.0, float('inf')),
-)
-
-SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
-    'sky_apiserver_websocket_connections',
-    'Number of websocket connections',
-    ['pid'],
-    multiprocess_mode='livesum',
-)
-
-SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
-    'sky_apiserver_websocket_closed_total',
-    'Number of websocket closed',
-    ['pid', 'reason'],
-)
-
-# The number of execution starts in each worker process, we do not record
-# histogram here as the duration has been measured in
-# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
-# Recording histogram WITH worker label will cause high cardinality.
-SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
-    'sky_apiserver_process_execution_start_total',
-    'Total number of execution starts in each worker process',
-    ['request', 'pid'],
-)
-
-SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
-    'sky_apiserver_process_peak_rss',
-    'Peak RSS we saw in each process in last 30 seconds',
-    ['pid', 'type'],
-)
-
-SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
-    'sky_apiserver_process_cpu_total',
-    'Total CPU times a worker process has been running',
-    ['pid', 'type', 'mode'],
-)
-
-SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
-    'sky_apiserver_request_memory_usage_bytes',
-    'Peak memory usage of requests', ['name'],
-    buckets=_MEM_BUCKETS)
-
-SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
-    'sky_apiserver_request_rss_incr_bytes',
-    'RSS increment after requests', ['name'],
-    buckets=_MEM_BUCKETS)
-
 metrics_app = fastapi.FastAPI()
 
 
@@ -139,6 +39,42 @@ async def metrics() -> fastapi.Response:
                             headers={'Cache-Control': 'no-cache'})
 
 
+@metrics_app.get('/gpu-metrics')
+async def gpu_metrics() -> fastapi.Response:
+    """Gets the GPU metrics from multiple external k8s clusters"""
+    contexts = core.get_all_contexts()
+    all_metrics: List[str] = []
+    successful_contexts = 0
+
+    tasks = [
+        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
+        for context in contexts
+        if context != 'in-cluster'
+    ]
+
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    for i, result in enumerate(results):
+        if isinstance(result, Exception):
+            logger.error(
+                f'Failed to get metrics for context {contexts[i]}: {result}')
+        elif isinstance(result, BaseException):
+            # Avoid changing behavior for non-Exception BaseExceptions
+            # like KeyboardInterrupt/SystemExit: re-raise them.
+            raise result
+        else:
+            metrics_text = result
+            all_metrics.append(metrics_text)
+            successful_contexts += 1
+
+    combined_metrics = '\n\n'.join(all_metrics)
+
+    # Return as plain text for Prometheus compatibility
+    return fastapi.Response(
+        content=combined_metrics,
+        media_type='text/plain; version=0.0.4; charset=utf-8')
+
+
 def build_metrics_server(host: str, port: int) -> uvicorn.Server:
     metrics_config = uvicorn.Config(
         'sky.server.metrics:metrics_app',
@@ -182,61 +118,17 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
                 status_code_group = '5xx'
             raise
         finally:
-            SKY_APISERVER_REQUESTS_TOTAL.labels(path=path,
-                                                method=method,
-                                                status=status_code_group).inc()
+            metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
+                path=path, method=method, status=status_code_group).inc()
             if not streaming:
                 duration = time.time() - start_time
-                SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
+                metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
                     path=path, method=method,
                     status=status_code_group).observe(duration)
 
         return response
 
 
-@contextlib.contextmanager
-def time_it(name: str, group: str = 'default'):
-    """Context manager to measure and record code execution duration."""
-    if not METRICS_ENABLED:
-        yield
-    else:
-        start_time = time.time()
-        try:
-            yield
-        finally:
-            duration = time.time() - start_time
-            SKY_APISERVER_CODE_DURATION_SECONDS.labels(
-                name=name, group=group).observe(duration)
-
-
-def time_me(func):
-    """Measure the duration of decorated function."""
-
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        if not METRICS_ENABLED:
-            return func(*args, **kwargs)
-        name = f'{func.__module__}/{func.__name__}'
-        with time_it(name, group='function'):
-            return func(*args, **kwargs)
-
-    return wrapper
-
-
-def time_me_async(func):
-    """Measure the duration of decorated async function."""
-
-    @functools.wraps(func)
-    async def async_wrapper(*args, **kwargs):
-        if not METRICS_ENABLED:
-            return await func(*args, **kwargs)
-        name = f'{func.__module__}/{func.__name__}'
-        with time_it(name, group='function'):
-            return await func(*args, **kwargs)
-
-    return async_wrapper
-
-
 peak_rss_bytes = 0
 
 
@@ -252,13 +144,15 @@ def process_monitor(process_type: str, stop: threading.Event):
             last_bucket_end = time.time()
             bucket_peak = 0
         peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
-        SKY_APISERVER_PROCESS_PEAK_RSS.labels(
+        metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
            pid=pid, type=process_type).set(peak_rss_bytes)
         ctimes = proc.cpu_times()
-        SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
-                                               type=process_type,
-                                               mode='user').set(ctimes.user)
-        SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
-                                               type=process_type,
-                                               mode='system').set(ctimes.system)
+        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+                                                             type=process_type,
+                                                             mode='user').set(
+                                                                 ctimes.user)
+        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
                                                              type=process_type,
+                                                             mode='system').set(
+                                                                 ctimes.system)
         time.sleep(1)
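
The relocated /gpu-metrics endpoint fans out one fetch per Kubernetes context and aggregates whatever succeeds. A minimal sketch of that asyncio.gather fan-out pattern, with a hypothetical fetch_metrics() standing in for metrics_utils.get_metrics_for_context():

import asyncio
from typing import List


async def fetch_metrics(context: str) -> str:
    # Hypothetical per-cluster scrape; the real helper queries the
    # cluster's metrics endpoint.
    await asyncio.sleep(0.1)
    if context == 'bad-cluster':
        raise RuntimeError('cluster unreachable')
    return f'# metrics from {context}'


async def gather_all(contexts: List[str]) -> str:
    tasks = [
        asyncio.create_task(fetch_metrics(c))
        for c in contexts
        if c != 'in-cluster'
    ]
    # return_exceptions=True keeps one failing cluster from failing the
    # whole scrape; failures come back as exception objects in results.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    ok = [r for r in results if isinstance(r, str)]
    return '\n\n'.join(ok)


print(asyncio.run(gather_all(['ctx-a', 'bad-cluster', 'in-cluster', 'ctx-b'])))
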
sky/server/requests/executor.py CHANGED
@@ -39,6 +39,7 @@ from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
+from sky.metrics import utils as metrics_utils
 from sky.server import common as server_common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
@@ -422,10 +423,10 @@ def _request_execution_wrapper(request_id: str,
                 config = skypilot_config.to_dict()
                 logger.debug(f'request config: \n'
                              f'{yaml_utils.dump_yaml_str(dict(config))}')
-            metrics_lib.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.labels(
-                request=request_name, pid=pid).inc()
-            with metrics_lib.time_it(name=request_name,
-                                     group='request_execution'):
+            (metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
+             labels(request=request_name, pid=pid).inc())
+            with metrics_utils.time_it(name=request_name,
+                                       group='request_execution'):
                 return_value = func(**request_body.to_kwargs())
             f.flush()
     except KeyboardInterrupt:
@@ -468,8 +469,8 @@ def _request_execution_wrapper(request_id: str,
         # Clear request level cache to release all memory used by
         # the request.
         annotations.clear_request_level_cache()
-        with metrics_lib.time_it(name='release_memory',
-                                 group='internal'):
+        with metrics_utils.time_it(name='release_memory',
+                                   group='internal'):
            common_utils.release_memory()
        _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
    except Exception as e:  # pylint: disable=broad-except
@@ -493,11 +494,11 @@ def _record_memory_metrics(request_name: str, proc: psutil.Process,
     rss_end = proc.memory_info().rss
 
     # Answer "how much RSS this request contributed?"
-    metrics_lib.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
+    metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
        name=request_name).observe(max(rss_end - rss_begin, 0))
     # Estimate the memory usage by the request by capturing the
     # peak memory delta during the request execution.
-    metrics_lib.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
+    metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
        name=request_name).observe(max(peak_rss - rss_begin, 0))
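
The executor now reaches these timers through sky/metrics/utils instead of sky/server/metrics. A tiny, self-contained sketch of the context-manager pattern behind time_it(), assuming the relocated helper keeps the semantics of the code removed from sky/server/metrics.py above (a print stands in for the Prometheus histogram):

import contextlib
import time


@contextlib.contextmanager
def time_it(name: str, group: str = 'default'):
    # Measure wall-clock duration and record it on exit, even on error.
    start = time.time()
    try:
        yield
    finally:
        duration = time.time() - start
        print(f'{group}/{name}: {duration:.3f}s')  # histogram.observe() in the real helper


with time_it('sky.launch', group='request_execution'):
    time.sleep(0.05)
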
sky/server/requests/payloads.py CHANGED
@@ -792,6 +792,12 @@ class GetConfigBody(RequestBody):
 class CostReportBody(RequestBody):
     """The request body for the cost report endpoint."""
     days: Optional[int] = 30
+    # we use hashes instead of names to avoid the case where
+    # the name is not unique
+    cluster_hashes: Optional[List[str]] = None
+    # Only return fields that are needed for the dashboard
+    # summary page
+    dashboard_summary_response: bool = False
 
 
 class RequestPayload(BasePayload):
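
How a client might populate the extended body; a hedged sketch in which RequestBody is simplified to pydantic.BaseModel (the real base class carries request-context fields not shown here):

from typing import List, Optional

import pydantic


class CostReportBody(pydantic.BaseModel):
    days: Optional[int] = 30
    # Hashes identify clusters unambiguously even when names collide.
    cluster_hashes: Optional[List[str]] = None
    # Ask the server to trim the report to dashboard-summary fields.
    dashboard_summary_response: bool = False


body = CostReportBody(days=7, cluster_hashes=['abc123'])  # hash illustrative
print(body)
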
sky/server/requests/requests.py CHANGED
@@ -25,10 +25,10 @@ from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
 from sky import skypilot_config
+from sky.metrics import utils as metrics_lib
 from sky.server import common as server_common
 from sky.server import constants as server_constants
 from sky.server import daemons
-from sky.server import metrics as metrics_lib
 from sky.server.requests import payloads
 from sky.server.requests.serializers import decoders
 from sky.server.requests.serializers import encoders
sky/server/requests/serializers/encoders.py CHANGED
@@ -185,8 +185,9 @@ def encode_cost_report(
     for cluster_report in cost_report:
         if cluster_report['status'] is not None:
             cluster_report['status'] = cluster_report['status'].value
-        cluster_report['resources'] = pickle_and_encode(
-            cluster_report['resources'])
+        if 'resources' in cluster_report:
+            cluster_report['resources'] = pickle_and_encode(
+                cluster_report['resources'])
     return cost_report
sky/server/server.py CHANGED
@@ -437,7 +437,7 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
         if lag_threshold is not None and lag > lag_threshold:
             logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
                            f'{lag_threshold} seconds.')
-        metrics.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
+        metrics_utils.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
             pid=pid).observe(lag)
         target = now + interval
         loop.call_at(target, tick)
@@ -470,7 +470,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
             # can safely ignore the error if the task is already scheduled.
             logger.debug(f'Request {event.id} already exists.')
     asyncio.create_task(cleanup_upload_ids())
-    if metrics.METRICS_ENABLED:
+    if metrics_utils.METRICS_ENABLED:
         # Start monitoring the event loop lag in each server worker
         # event loop (process).
         asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
@@ -1743,7 +1743,7 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
         return
 
     logger.info(f'Starting port-forward to local port: {local_port}')
-    conn_gauge = metrics.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
+    conn_gauge = metrics_utils.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
         pid=os.getpid())
     ssh_failed = False
     websocket_closed = False
@@ -1807,14 +1807,14 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
                 'ssh websocket connection was closed. Remaining '
                 f'output: {str(stdout)}')
             reason = 'KubectlPortForwardExit'
-            metrics.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+            metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
                 pid=os.getpid(), reason='KubectlPortForwardExit').inc()
         else:
             if ssh_failed:
                 reason = 'SSHToPodDisconnected'
             else:
                 reason = 'ClientClosed'
-            metrics.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+            metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
                 pid=os.getpid(), reason=reason).inc()
 
 
@@ -1831,42 +1831,6 @@ async def all_contexts(request: fastapi.Request) -> None:
     )
 
 
-@app.get('/gpu-metrics')
-async def gpu_metrics() -> fastapi.Response:
-    """Gets the GPU metrics from multiple external k8s clusters"""
-    contexts = core.get_all_contexts()
-    all_metrics: List[str] = []
-    successful_contexts = 0
-
-    tasks = [
-        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
-        for context in contexts
-        if context != 'in-cluster'
-    ]
-
-    results = await asyncio.gather(*tasks, return_exceptions=True)
-
-    for i, result in enumerate(results):
-        if isinstance(result, Exception):
-            logger.error(
-                f'Failed to get metrics for context {contexts[i]}: {result}')
-        elif isinstance(result, BaseException):
-            # Avoid changing behavior for non-Exception BaseExceptions
-            # like KeyboardInterrupt/SystemExit: re-raise them.
-            raise result
-        else:
-            metrics_text = result
-            all_metrics.append(metrics_text)
-            successful_contexts += 1
-
-    combined_metrics = '\n\n'.join(all_metrics)
-
-    # Return as plain text for Prometheus compatibility
-    return fastapi.Response(
-        content=combined_metrics,
-        media_type='text/plain; version=0.0.4; charset=utf-8')
-
-
 # === Internal APIs ===
 @app.get('/api/completion/cluster_name')
 async def complete_cluster_name(incomplete: str,) -> List[str]:
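
For context on the loop_lag_monitor() call sites above: the monitor schedules a callback for a target time and reports how late the event loop actually ran it. A self-contained sketch of that measurement (the interval and the blocking sleep are illustrative, not SkyPilot's values):

import asyncio
import time


def start_lag_monitor(loop: asyncio.AbstractEventLoop,
                      interval: float = 0.5) -> None:
    target = loop.time() + interval

    def tick() -> None:
        nonlocal target
        now = loop.time()
        lag = now - target  # how late the loop woke us up
        print(f'event loop lag: {lag * 1000:.1f} ms')
        target = now + interval
        loop.call_at(target, tick)

    loop.call_at(target, tick)


async def main() -> None:
    start_lag_monitor(asyncio.get_running_loop())
    await asyncio.sleep(0.6)
    time.sleep(0.3)  # blocking work starves the loop; the next sample shows it
    await asyncio.sleep(1.0)


asyncio.run(main())
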
sky/setup_files/dependencies.py CHANGED
@@ -49,8 +49,15 @@ install_requires = [
     # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
     'pyyaml > 3.13, != 5.4.*',
     'requests',
+    # SkyPilot inherits from uvicorn.Server to customize the behavior of
+    # uvicorn, so we need to pin the uvicorn version to avoid potential
+    # breaking changes.
+    # Notes for current version check:
+    # - uvicorn 0.33.0 is the latest version that supports Python 3.8
+    # - uvicorn 0.36.0 removes setup_event_loop thus breaks SkyPilot's custom
+    #   behavior.
+    'uvicorn[standard] >=0.33.0, <0.36.0',
     'fastapi',
-    'uvicorn[standard]',
     # Some pydantic versions are not compatible with ray. Adopted from ray's
     # setup.py:
     # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L254
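
A quick way to check that an environment satisfies the new pin; the packaging library is assumed available (it ships alongside pip/setuptools in most environments):

from importlib import metadata

from packaging import version

v = version.parse(metadata.version('uvicorn'))
assert version.parse('0.33.0') <= v < version.parse('0.36.0'), f'unsupported uvicorn {v}'
print(f'uvicorn {v} is within [0.33.0, 0.36.0)')
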
sky/skylet/constants.py CHANGED
@@ -29,6 +29,7 @@ SKY_REMOTE_RAY_PORT_FILE = '~/.sky/ray_port.json'
 SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
 SKY_REMOTE_RAY_VERSION = '2.9.3'
 
+SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
 # We store the absolute path of the python executable (/opt/conda/bin/python3)
 # in this file, so that any future internal commands that need to use python
 # can use this path. This is useful for the case where the user has a custom
@@ -40,7 +41,7 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
                            f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
                            'which python3')
 # Python executable, e.g., /opt/conda/bin/python3
-SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
+SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
 # Prefer SKY_UV_PIP_CMD, which is faster.
 # TODO(cooperc): remove remaining usage (GCP TPU setup).
 SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
@@ -56,14 +57,15 @@ SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
 ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
 # uv is used for venv and pip, much faster than python implementations.
 SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
-SKY_UV_CMD = f'UV_SYSTEM_PYTHON=false {SKY_UV_INSTALL_DIR}/uv'
+SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
+              f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
 # This won't reinstall uv if it's already installed, so it's safe to re-run.
 SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
                       'curl -LsSf https://astral.sh/uv/install.sh '
                       f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
 SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
-SKY_UV_RUN_CMD: str = (
-    f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run --active')
+SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
+                       '--no-project --no-config')
 # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
 # VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
 # not work when conda is used.
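
What the new SKY_UNSET_PYTHONPATH prefix buys: `env -u PYTHONPATH cmd` runs cmd with PYTHONPATH removed, so a user's site customizations cannot shadow the packages SkyPilot's own python/uv invocations expect. A small demonstration (the paths are illustrative):

import subprocess

out = subprocess.run(
    ['env', '-u', 'PYTHONPATH', 'python3', '-c',
     'import os; print(os.environ.get("PYTHONPATH"))'],
    env={'PATH': '/usr/local/bin:/usr/bin:/bin',
         'PYTHONPATH': '/tmp/user-site'},
    capture_output=True, text=True, check=True)
print(out.stdout.strip())  # prints "None": the child saw no PYTHONPATH
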
sky/skylet/job_lib.py CHANGED
@@ -559,21 +559,20 @@ def get_jobs_info(user_hash: Optional[str] = None,
     jobs_info = []
     for job in jobs:
         jobs_info.append(
-            jobsv1_pb2.JobInfo(
-                job_id=job['job_id'],
-                job_name=job['job_name'],
-                username=job['username'],
-                submitted_at=job['submitted_at'],
-                status=job['status'].to_protobuf(),
-                run_timestamp=job['run_timestamp'],
-                start_at=job['start_at']
-                if job['start_at'] is not None else -1.0,
-                end_at=job['end_at'] if job['end_at'] is not None else 0.0,
-                resources=job['resources'] or '',
-                pid=job['pid'],
-                log_path=os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                      job['run_timestamp']),
-                metadata=json.dumps(job['metadata'])))
+            jobsv1_pb2.JobInfo(job_id=job['job_id'],
+                               job_name=job['job_name'],
+                               username=job['username'],
+                               submitted_at=job['submitted_at'],
+                               status=job['status'].to_protobuf(),
+                               run_timestamp=job['run_timestamp'],
+                               start_at=job['start_at'],
+                               end_at=job['end_at'],
+                               resources=job['resources'],
+                               pid=job['pid'],
+                               log_path=os.path.join(
+                                   constants.SKY_LOGS_DIRECTORY,
+                                   job['run_timestamp']),
+                               metadata=json.dumps(job['metadata'])))
     return jobs_info
sky/utils/locks.py CHANGED
@@ -11,6 +11,7 @@ import time
 from typing import Any, Optional
 
 import filelock
+import psycopg2
 import sqlalchemy
 
 from sky import global_user_state
@@ -197,6 +198,7 @@ class PostgresLock(DistributedLock):
         if engine.dialect.name != db_utils.SQLAlchemyDialect.POSTGRESQL.value:
             raise ValueError('PostgresLock requires PostgreSQL database. '
                              f'Current dialect: {engine.dialect.name}')
+        # Borrow a dedicated connection from the pool.
         return engine.raw_connection()
 
     def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
@@ -233,9 +235,7 @@ class PostgresLock(DistributedLock):
                 time.sleep(self.poll_interval)
 
         except Exception:
-            if self._connection:
-                self._connection.close()
-                self._connection = None
+            self._close_connection()
             raise
 
     def release(self) -> None:
@@ -248,27 +248,58 @@ class PostgresLock(DistributedLock):
             cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
             self._connection.commit()
             self._acquired = False
+        except psycopg2.OperationalError as e:
+            # Lost connection to the database, likely the lock was force
+            # unlocked by another routine.
+            logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
         finally:
-            if self._connection:
-                self._connection.close()
-                self._connection = None
+            self._close_connection()
 
     def force_unlock(self) -> None:
         """Force unlock the postgres advisory lock."""
         try:
-            if not self._connection:
+            # The lock is held by the current routine, gracefully unlock it.
+            if self._acquired:
+                self.release()
+                return
+
+            # The lock is held by another routine, force unlock it.
+            if self._connection is None:
                 self._connection = self._get_connection()
             cursor = self._connection.cursor()
             cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
-            self._connection.commit()
+            result = cursor.fetchone()[0]
+            if result:
+                # The lock was held by the current routine and unlock
+                # succeeded.
+                self._connection.commit()
+                self._acquired = False
+                return
+            cursor.execute(
+                ('SELECT pid FROM pg_locks WHERE locktype = \'advisory\' '
+                 'AND ((classid::bigint << 32) | objid::bigint) = %s'),
+                (self._lock_key,))
+            row = cursor.fetchone()
+            if row:
+                # The lock is still held by another routine, force unlock it
+                # by killing the PG connection of that routine.
+                cursor.execute('SELECT pg_terminate_backend(%s)', (row[0],))
+                self._connection.commit()
+                return
         except Exception as e:
             raise RuntimeError(
                 f'Failed to force unlock postgres lock {self.lock_id}: {e}'
             ) from e
         finally:
-            if self._connection:
+            self._close_connection()
+
+    def _close_connection(self) -> None:
+        """Close the postgres connection."""
+        if self._connection:
+            try:
                 self._connection.close()
-                self._connection = None
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug(f'Failed to close postgres connection: {e}')
+            self._connection = None
 
     def is_locked(self) -> bool:
         """Check if the postgres advisory lock is acquired."""