skypilot-nightly 1.0.0.dev20250918__py3-none-any.whl → 1.0.0.dev20250922__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +12 -15
- sky/core.py +67 -45
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → KP6HCNMqb_bnJB17oplgW}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0b4b35dc1dfe046c.js → [cluster]-9525660179df3605.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-487697b47d8c5e50.js → webpack-26167a9e6d91fa51.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +90 -56
- sky/metrics/utils.py +174 -8
- sky/schemas/generated/jobsv1_pb2.py +40 -40
- sky/serve/serve_utils.py +0 -4
- sky/server/auth/oauth2_proxy.py +2 -2
- sky/server/metrics.py +52 -158
- sky/server/requests/executor.py +9 -8
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/requests.py +1 -1
- sky/server/requests/serializers/encoders.py +3 -2
- sky/server/server.py +5 -41
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +6 -4
- sky/skylet/job_lib.py +14 -15
- sky/utils/locks.py +41 -10
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/METADATA +35 -35
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/RECORD +48 -48
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- /sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → KP6HCNMqb_bnJB17oplgW}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/top_level.txt +0 -0
sky/server/metrics.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
"""Instrumentation for the API server."""
|
|
2
2
|
|
|
3
|
-
import
|
|
4
|
-
import functools
|
|
3
|
+
import asyncio
|
|
5
4
|
import multiprocessing
|
|
6
5
|
import os
|
|
7
6
|
import threading
|
|
8
7
|
import time
|
|
8
|
+
from typing import List
|
|
9
9
|
|
|
10
10
|
import fastapi
|
|
11
11
|
from prometheus_client import generate_latest
|
|
@@ -15,112 +15,12 @@ import psutil
|
|
|
15
15
|
import starlette.middleware.base
|
|
16
16
|
import uvicorn
|
|
17
17
|
|
|
18
|
+
from sky import core
|
|
18
19
|
from sky import sky_logging
|
|
19
|
-
from sky.
|
|
20
|
-
|
|
21
|
-
# Whether the metrics are enabled, cannot be changed at runtime.
|
|
22
|
-
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
23
|
-
'false').lower() == 'true'
|
|
24
|
-
|
|
25
|
-
_KB = 2**10
|
|
26
|
-
_MB = 2**20
|
|
27
|
-
_MEM_BUCKETS = [
|
|
28
|
-
_KB,
|
|
29
|
-
256 * _KB,
|
|
30
|
-
512 * _KB,
|
|
31
|
-
_MB,
|
|
32
|
-
2 * _MB,
|
|
33
|
-
4 * _MB,
|
|
34
|
-
8 * _MB,
|
|
35
|
-
16 * _MB,
|
|
36
|
-
32 * _MB,
|
|
37
|
-
64 * _MB,
|
|
38
|
-
128 * _MB,
|
|
39
|
-
256 * _MB,
|
|
40
|
-
float('inf'),
|
|
41
|
-
]
|
|
20
|
+
from sky.metrics import utils as metrics_utils
|
|
42
21
|
|
|
43
22
|
logger = sky_logging.init_logger(__name__)
|
|
44
23
|
|
|
45
|
-
# Total number of API server requests, grouped by path, method, and status.
|
|
46
|
-
SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
|
|
47
|
-
'sky_apiserver_requests_total',
|
|
48
|
-
'Total number of API server requests',
|
|
49
|
-
['path', 'method', 'status'],
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
# Time spent processing API server requests, grouped by path, method, and
|
|
53
|
-
# status.
|
|
54
|
-
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
|
|
55
|
-
'sky_apiserver_request_duration_seconds',
|
|
56
|
-
'Time spent processing API server requests',
|
|
57
|
-
['path', 'method', 'status'],
|
|
58
|
-
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
59
|
-
60.0, 120.0, float('inf')),
|
|
60
|
-
)
|
|
61
|
-
|
|
62
|
-
# Time spent processing a piece of code, refer to time_it().
|
|
63
|
-
SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
|
|
64
|
-
'sky_apiserver_code_duration_seconds',
|
|
65
|
-
'Time spent processing code',
|
|
66
|
-
['name', 'group'],
|
|
67
|
-
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
68
|
-
60.0, 120.0, float('inf')),
|
|
69
|
-
)
|
|
70
|
-
|
|
71
|
-
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
|
|
72
|
-
'sky_apiserver_event_loop_lag_seconds',
|
|
73
|
-
'Scheduling delay of the server event loop',
|
|
74
|
-
['pid'],
|
|
75
|
-
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
|
|
76
|
-
60.0, float('inf')),
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
|
|
80
|
-
'sky_apiserver_websocket_connections',
|
|
81
|
-
'Number of websocket connections',
|
|
82
|
-
['pid'],
|
|
83
|
-
multiprocess_mode='livesum',
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
|
|
87
|
-
'sky_apiserver_websocket_closed_total',
|
|
88
|
-
'Number of websocket closed',
|
|
89
|
-
['pid', 'reason'],
|
|
90
|
-
)
|
|
91
|
-
|
|
92
|
-
# The number of execution starts in each worker process, we do not record
|
|
93
|
-
# histogram here as the duration has been measured in
|
|
94
|
-
# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
|
|
95
|
-
# Recording histogram WITH worker label will cause high cardinality.
|
|
96
|
-
SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
|
|
97
|
-
'sky_apiserver_process_execution_start_total',
|
|
98
|
-
'Total number of execution starts in each worker process',
|
|
99
|
-
['request', 'pid'],
|
|
100
|
-
)
|
|
101
|
-
|
|
102
|
-
SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
|
|
103
|
-
'sky_apiserver_process_peak_rss',
|
|
104
|
-
'Peak RSS we saw in each process in last 30 seconds',
|
|
105
|
-
['pid', 'type'],
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
|
|
109
|
-
'sky_apiserver_process_cpu_total',
|
|
110
|
-
'Total CPU times a worker process has been running',
|
|
111
|
-
['pid', 'type', 'mode'],
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
|
|
115
|
-
'sky_apiserver_request_memory_usage_bytes',
|
|
116
|
-
'Peak memory usage of requests', ['name'],
|
|
117
|
-
buckets=_MEM_BUCKETS)
|
|
118
|
-
|
|
119
|
-
SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
|
|
120
|
-
'sky_apiserver_request_rss_incr_bytes',
|
|
121
|
-
'RSS increment after requests', ['name'],
|
|
122
|
-
buckets=_MEM_BUCKETS)
|
|
123
|
-
|
|
124
24
|
metrics_app = fastapi.FastAPI()
|
|
125
25
|
|
|
126
26
|
|
|
@@ -139,6 +39,42 @@ async def metrics() -> fastapi.Response:
|
|
|
139
39
|
headers={'Cache-Control': 'no-cache'})
|
|
140
40
|
|
|
141
41
|
|
|
42
|
+
@metrics_app.get('/gpu-metrics')
|
|
43
|
+
async def gpu_metrics() -> fastapi.Response:
|
|
44
|
+
"""Gets the GPU metrics from multiple external k8s clusters"""
|
|
45
|
+
contexts = core.get_all_contexts()
|
|
46
|
+
all_metrics: List[str] = []
|
|
47
|
+
successful_contexts = 0
|
|
48
|
+
|
|
49
|
+
tasks = [
|
|
50
|
+
asyncio.create_task(metrics_utils.get_metrics_for_context(context))
|
|
51
|
+
for context in contexts
|
|
52
|
+
if context != 'in-cluster'
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
56
|
+
|
|
57
|
+
for i, result in enumerate(results):
|
|
58
|
+
if isinstance(result, Exception):
|
|
59
|
+
logger.error(
|
|
60
|
+
f'Failed to get metrics for context {contexts[i]}: {result}')
|
|
61
|
+
elif isinstance(result, BaseException):
|
|
62
|
+
# Avoid changing behavior for non-Exception BaseExceptions
|
|
63
|
+
# like KeyboardInterrupt/SystemExit: re-raise them.
|
|
64
|
+
raise result
|
|
65
|
+
else:
|
|
66
|
+
metrics_text = result
|
|
67
|
+
all_metrics.append(metrics_text)
|
|
68
|
+
successful_contexts += 1
|
|
69
|
+
|
|
70
|
+
combined_metrics = '\n\n'.join(all_metrics)
|
|
71
|
+
|
|
72
|
+
# Return as plain text for Prometheus compatibility
|
|
73
|
+
return fastapi.Response(
|
|
74
|
+
content=combined_metrics,
|
|
75
|
+
media_type='text/plain; version=0.0.4; charset=utf-8')
|
|
76
|
+
|
|
77
|
+
|
|
142
78
|
def build_metrics_server(host: str, port: int) -> uvicorn.Server:
|
|
143
79
|
metrics_config = uvicorn.Config(
|
|
144
80
|
'sky.server.metrics:metrics_app',
|
|
@@ -182,61 +118,17 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
182
118
|
status_code_group = '5xx'
|
|
183
119
|
raise
|
|
184
120
|
finally:
|
|
185
|
-
SKY_APISERVER_REQUESTS_TOTAL.labels(
|
|
186
|
-
|
|
187
|
-
status=status_code_group).inc()
|
|
121
|
+
metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
|
|
122
|
+
path=path, method=method, status=status_code_group).inc()
|
|
188
123
|
if not streaming:
|
|
189
124
|
duration = time.time() - start_time
|
|
190
|
-
SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
|
|
125
|
+
metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
|
|
191
126
|
path=path, method=method,
|
|
192
127
|
status=status_code_group).observe(duration)
|
|
193
128
|
|
|
194
129
|
return response
|
|
195
130
|
|
|
196
131
|
|
|
197
|
-
@contextlib.contextmanager
|
|
198
|
-
def time_it(name: str, group: str = 'default'):
|
|
199
|
-
"""Context manager to measure and record code execution duration."""
|
|
200
|
-
if not METRICS_ENABLED:
|
|
201
|
-
yield
|
|
202
|
-
else:
|
|
203
|
-
start_time = time.time()
|
|
204
|
-
try:
|
|
205
|
-
yield
|
|
206
|
-
finally:
|
|
207
|
-
duration = time.time() - start_time
|
|
208
|
-
SKY_APISERVER_CODE_DURATION_SECONDS.labels(
|
|
209
|
-
name=name, group=group).observe(duration)
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
def time_me(func):
|
|
213
|
-
"""Measure the duration of decorated function."""
|
|
214
|
-
|
|
215
|
-
@functools.wraps(func)
|
|
216
|
-
def wrapper(*args, **kwargs):
|
|
217
|
-
if not METRICS_ENABLED:
|
|
218
|
-
return func(*args, **kwargs)
|
|
219
|
-
name = f'{func.__module__}/{func.__name__}'
|
|
220
|
-
with time_it(name, group='function'):
|
|
221
|
-
return func(*args, **kwargs)
|
|
222
|
-
|
|
223
|
-
return wrapper
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
def time_me_async(func):
|
|
227
|
-
"""Measure the duration of decorated async function."""
|
|
228
|
-
|
|
229
|
-
@functools.wraps(func)
|
|
230
|
-
async def async_wrapper(*args, **kwargs):
|
|
231
|
-
if not METRICS_ENABLED:
|
|
232
|
-
return await func(*args, **kwargs)
|
|
233
|
-
name = f'{func.__module__}/{func.__name__}'
|
|
234
|
-
with time_it(name, group='function'):
|
|
235
|
-
return await func(*args, **kwargs)
|
|
236
|
-
|
|
237
|
-
return async_wrapper
|
|
238
|
-
|
|
239
|
-
|
|
240
132
|
peak_rss_bytes = 0
|
|
241
133
|
|
|
242
134
|
|
|
@@ -252,13 +144,15 @@ def process_monitor(process_type: str, stop: threading.Event):
|
|
|
252
144
|
last_bucket_end = time.time()
|
|
253
145
|
bucket_peak = 0
|
|
254
146
|
peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
|
|
255
|
-
SKY_APISERVER_PROCESS_PEAK_RSS.labels(
|
|
147
|
+
metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
|
|
256
148
|
pid=pid, type=process_type).set(peak_rss_bytes)
|
|
257
149
|
ctimes = proc.cpu_times()
|
|
258
|
-
SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
150
|
+
metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
|
|
151
|
+
type=process_type,
|
|
152
|
+
mode='user').set(
|
|
153
|
+
ctimes.user)
|
|
154
|
+
metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
|
|
155
|
+
type=process_type,
|
|
156
|
+
mode='system').set(
|
|
157
|
+
ctimes.system)
|
|
264
158
|
time.sleep(1)
|
sky/server/requests/executor.py
CHANGED
|
@@ -39,6 +39,7 @@ from sky import global_user_state
|
|
|
39
39
|
from sky import models
|
|
40
40
|
from sky import sky_logging
|
|
41
41
|
from sky import skypilot_config
|
|
42
|
+
from sky.metrics import utils as metrics_utils
|
|
42
43
|
from sky.server import common as server_common
|
|
43
44
|
from sky.server import config as server_config
|
|
44
45
|
from sky.server import constants as server_constants
|
|
@@ -422,10 +423,10 @@ def _request_execution_wrapper(request_id: str,
|
|
|
422
423
|
config = skypilot_config.to_dict()
|
|
423
424
|
logger.debug(f'request config: \n'
|
|
424
425
|
f'{yaml_utils.dump_yaml_str(dict(config))}')
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
with
|
|
428
|
-
|
|
426
|
+
(metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
|
|
427
|
+
labels(request=request_name, pid=pid).inc())
|
|
428
|
+
with metrics_utils.time_it(name=request_name,
|
|
429
|
+
group='request_execution'):
|
|
429
430
|
return_value = func(**request_body.to_kwargs())
|
|
430
431
|
f.flush()
|
|
431
432
|
except KeyboardInterrupt:
|
|
@@ -468,8 +469,8 @@ def _request_execution_wrapper(request_id: str,
|
|
|
468
469
|
# Clear request level cache to release all memory used by
|
|
469
470
|
# the request.
|
|
470
471
|
annotations.clear_request_level_cache()
|
|
471
|
-
with
|
|
472
|
-
|
|
472
|
+
with metrics_utils.time_it(name='release_memory',
|
|
473
|
+
group='internal'):
|
|
473
474
|
common_utils.release_memory()
|
|
474
475
|
_record_memory_metrics(request_name, proc, rss_begin, peak_rss)
|
|
475
476
|
except Exception as e: # pylint: disable=broad-except
|
|
@@ -493,11 +494,11 @@ def _record_memory_metrics(request_name: str, proc: psutil.Process,
|
|
|
493
494
|
rss_end = proc.memory_info().rss
|
|
494
495
|
|
|
495
496
|
# Answer "how much RSS this request contributed?"
|
|
496
|
-
|
|
497
|
+
metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
|
|
497
498
|
name=request_name).observe(max(rss_end - rss_begin, 0))
|
|
498
499
|
# Estimate the memory usage by the request by capturing the
|
|
499
500
|
# peak memory delta during the request execution.
|
|
500
|
-
|
|
501
|
+
metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
|
|
501
502
|
name=request_name).observe(max(peak_rss - rss_begin, 0))
|
|
502
503
|
|
|
503
504
|
|
sky/server/requests/payloads.py
CHANGED
|
@@ -792,6 +792,12 @@ class GetConfigBody(RequestBody):
|
|
|
792
792
|
class CostReportBody(RequestBody):
|
|
793
793
|
"""The request body for the cost report endpoint."""
|
|
794
794
|
days: Optional[int] = 30
|
|
795
|
+
# we use hashes instead of names to avoid the case where
|
|
796
|
+
# the name is not unique
|
|
797
|
+
cluster_hashes: Optional[List[str]] = None
|
|
798
|
+
# Only return fields that are needed for the dashboard
|
|
799
|
+
# summary page
|
|
800
|
+
dashboard_summary_response: bool = False
|
|
795
801
|
|
|
796
802
|
|
|
797
803
|
class RequestPayload(BasePayload):
|
sky/server/requests/requests.py
CHANGED
|
@@ -25,10 +25,10 @@ from sky import exceptions
|
|
|
25
25
|
from sky import global_user_state
|
|
26
26
|
from sky import sky_logging
|
|
27
27
|
from sky import skypilot_config
|
|
28
|
+
from sky.metrics import utils as metrics_lib
|
|
28
29
|
from sky.server import common as server_common
|
|
29
30
|
from sky.server import constants as server_constants
|
|
30
31
|
from sky.server import daemons
|
|
31
|
-
from sky.server import metrics as metrics_lib
|
|
32
32
|
from sky.server.requests import payloads
|
|
33
33
|
from sky.server.requests.serializers import decoders
|
|
34
34
|
from sky.server.requests.serializers import encoders
|
|
@@ -185,8 +185,9 @@ def encode_cost_report(
|
|
|
185
185
|
for cluster_report in cost_report:
|
|
186
186
|
if cluster_report['status'] is not None:
|
|
187
187
|
cluster_report['status'] = cluster_report['status'].value
|
|
188
|
-
|
|
189
|
-
cluster_report['resources']
|
|
188
|
+
if 'resources' in cluster_report:
|
|
189
|
+
cluster_report['resources'] = pickle_and_encode(
|
|
190
|
+
cluster_report['resources'])
|
|
190
191
|
return cost_report
|
|
191
192
|
|
|
192
193
|
|
sky/server/server.py
CHANGED
|
@@ -437,7 +437,7 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
|
|
|
437
437
|
if lag_threshold is not None and lag > lag_threshold:
|
|
438
438
|
logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
|
|
439
439
|
f'{lag_threshold} seconds.')
|
|
440
|
-
|
|
440
|
+
metrics_utils.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
|
|
441
441
|
pid=pid).observe(lag)
|
|
442
442
|
target = now + interval
|
|
443
443
|
loop.call_at(target, tick)
|
|
@@ -470,7 +470,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
470
470
|
# can safely ignore the error if the task is already scheduled.
|
|
471
471
|
logger.debug(f'Request {event.id} already exists.')
|
|
472
472
|
asyncio.create_task(cleanup_upload_ids())
|
|
473
|
-
if
|
|
473
|
+
if metrics_utils.METRICS_ENABLED:
|
|
474
474
|
# Start monitoring the event loop lag in each server worker
|
|
475
475
|
# event loop (process).
|
|
476
476
|
asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
|
|
@@ -1743,7 +1743,7 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1743
1743
|
return
|
|
1744
1744
|
|
|
1745
1745
|
logger.info(f'Starting port-forward to local port: {local_port}')
|
|
1746
|
-
conn_gauge =
|
|
1746
|
+
conn_gauge = metrics_utils.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
|
|
1747
1747
|
pid=os.getpid())
|
|
1748
1748
|
ssh_failed = False
|
|
1749
1749
|
websocket_closed = False
|
|
@@ -1807,14 +1807,14 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1807
1807
|
'ssh websocket connection was closed. Remaining '
|
|
1808
1808
|
f'output: {str(stdout)}')
|
|
1809
1809
|
reason = 'KubectlPortForwardExit'
|
|
1810
|
-
|
|
1810
|
+
metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
|
|
1811
1811
|
pid=os.getpid(), reason='KubectlPortForwardExit').inc()
|
|
1812
1812
|
else:
|
|
1813
1813
|
if ssh_failed:
|
|
1814
1814
|
reason = 'SSHToPodDisconnected'
|
|
1815
1815
|
else:
|
|
1816
1816
|
reason = 'ClientClosed'
|
|
1817
|
-
|
|
1817
|
+
metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
|
|
1818
1818
|
pid=os.getpid(), reason=reason).inc()
|
|
1819
1819
|
|
|
1820
1820
|
|
|
@@ -1831,42 +1831,6 @@ async def all_contexts(request: fastapi.Request) -> None:
|
|
|
1831
1831
|
)
|
|
1832
1832
|
|
|
1833
1833
|
|
|
1834
|
-
@app.get('/gpu-metrics')
|
|
1835
|
-
async def gpu_metrics() -> fastapi.Response:
|
|
1836
|
-
"""Gets the GPU metrics from multiple external k8s clusters"""
|
|
1837
|
-
contexts = core.get_all_contexts()
|
|
1838
|
-
all_metrics: List[str] = []
|
|
1839
|
-
successful_contexts = 0
|
|
1840
|
-
|
|
1841
|
-
tasks = [
|
|
1842
|
-
asyncio.create_task(metrics_utils.get_metrics_for_context(context))
|
|
1843
|
-
for context in contexts
|
|
1844
|
-
if context != 'in-cluster'
|
|
1845
|
-
]
|
|
1846
|
-
|
|
1847
|
-
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
1848
|
-
|
|
1849
|
-
for i, result in enumerate(results):
|
|
1850
|
-
if isinstance(result, Exception):
|
|
1851
|
-
logger.error(
|
|
1852
|
-
f'Failed to get metrics for context {contexts[i]}: {result}')
|
|
1853
|
-
elif isinstance(result, BaseException):
|
|
1854
|
-
# Avoid changing behavior for non-Exception BaseExceptions
|
|
1855
|
-
# like KeyboardInterrupt/SystemExit: re-raise them.
|
|
1856
|
-
raise result
|
|
1857
|
-
else:
|
|
1858
|
-
metrics_text = result
|
|
1859
|
-
all_metrics.append(metrics_text)
|
|
1860
|
-
successful_contexts += 1
|
|
1861
|
-
|
|
1862
|
-
combined_metrics = '\n\n'.join(all_metrics)
|
|
1863
|
-
|
|
1864
|
-
# Return as plain text for Prometheus compatibility
|
|
1865
|
-
return fastapi.Response(
|
|
1866
|
-
content=combined_metrics,
|
|
1867
|
-
media_type='text/plain; version=0.0.4; charset=utf-8')
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
1834
|
# === Internal APIs ===
|
|
1871
1835
|
@app.get('/api/completion/cluster_name')
|
|
1872
1836
|
async def complete_cluster_name(incomplete: str,) -> List[str]:
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -49,8 +49,15 @@ install_requires = [
|
|
|
49
49
|
# <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
|
|
50
50
|
'pyyaml > 3.13, != 5.4.*',
|
|
51
51
|
'requests',
|
|
52
|
+
# SkyPilot inherits from uvicorn.Server to customize the behavior of
|
|
53
|
+
# uvicorn, so we need to pin uvicorn version to avoid potential breaking
|
|
54
|
+
# changes.
|
|
55
|
+
# Notes for current version check:
|
|
56
|
+
# - uvicorn 0.33.0 is the latest version that supports Python 3.8
|
|
57
|
+
# - uvicorn 0.36.0 removes setup_event_loop thus breaks SkyPilot's custom
|
|
58
|
+
# behavior.
|
|
59
|
+
'uvicorn[standard] >=0.33.0, <0.36.0',
|
|
52
60
|
'fastapi',
|
|
53
|
-
'uvicorn[standard]',
|
|
54
61
|
# Some pydantic versions are not compatible with ray. Adopted from ray's
|
|
55
62
|
# setup.py:
|
|
56
63
|
# https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L254
|
sky/skylet/constants.py
CHANGED
|
@@ -29,6 +29,7 @@ SKY_REMOTE_RAY_PORT_FILE = '~/.sky/ray_port.json'
|
|
|
29
29
|
SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
|
|
30
30
|
SKY_REMOTE_RAY_VERSION = '2.9.3'
|
|
31
31
|
|
|
32
|
+
SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
|
|
32
33
|
# We store the absolute path of the python executable (/opt/conda/bin/python3)
|
|
33
34
|
# in this file, so that any future internal commands that need to use python
|
|
34
35
|
# can use this path. This is useful for the case where the user has a custom
|
|
@@ -40,7 +41,7 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
|
|
|
40
41
|
f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
|
|
41
42
|
'which python3')
|
|
42
43
|
# Python executable, e.g., /opt/conda/bin/python3
|
|
43
|
-
SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
|
|
44
|
+
SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
|
|
44
45
|
# Prefer SKY_UV_PIP_CMD, which is faster.
|
|
45
46
|
# TODO(cooperc): remove remaining usage (GCP TPU setup).
|
|
46
47
|
SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
|
|
@@ -56,14 +57,15 @@ SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
|
|
|
56
57
|
ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
|
|
57
58
|
# uv is used for venv and pip, much faster than python implementations.
|
|
58
59
|
SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
|
|
59
|
-
SKY_UV_CMD =
|
|
60
|
+
SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
|
|
61
|
+
f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
|
|
60
62
|
# This won't reinstall uv if it's already installed, so it's safe to re-run.
|
|
61
63
|
SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
|
|
62
64
|
'curl -LsSf https://astral.sh/uv/install.sh '
|
|
63
65
|
f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
|
|
64
66
|
SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
|
|
65
|
-
SKY_UV_RUN_CMD: str = (
|
|
66
|
-
|
|
67
|
+
SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
|
|
68
|
+
'--no-project --no-config')
|
|
67
69
|
# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
|
|
68
70
|
# VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
|
|
69
71
|
# not work when conda is used.
|
sky/skylet/job_lib.py
CHANGED
|
@@ -559,21 +559,20 @@ def get_jobs_info(user_hash: Optional[str] = None,
|
|
|
559
559
|
jobs_info = []
|
|
560
560
|
for job in jobs:
|
|
561
561
|
jobs_info.append(
|
|
562
|
-
jobsv1_pb2.JobInfo(
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
metadata=json.dumps(job['metadata'])))
|
|
562
|
+
jobsv1_pb2.JobInfo(job_id=job['job_id'],
|
|
563
|
+
job_name=job['job_name'],
|
|
564
|
+
username=job['username'],
|
|
565
|
+
submitted_at=job['submitted_at'],
|
|
566
|
+
status=job['status'].to_protobuf(),
|
|
567
|
+
run_timestamp=job['run_timestamp'],
|
|
568
|
+
start_at=job['start_at'],
|
|
569
|
+
end_at=job['end_at'],
|
|
570
|
+
resources=job['resources'],
|
|
571
|
+
pid=job['pid'],
|
|
572
|
+
log_path=os.path.join(
|
|
573
|
+
constants.SKY_LOGS_DIRECTORY,
|
|
574
|
+
job['run_timestamp']),
|
|
575
|
+
metadata=json.dumps(job['metadata'])))
|
|
577
576
|
return jobs_info
|
|
578
577
|
|
|
579
578
|
|
sky/utils/locks.py
CHANGED
|
@@ -11,6 +11,7 @@ import time
|
|
|
11
11
|
from typing import Any, Optional
|
|
12
12
|
|
|
13
13
|
import filelock
|
|
14
|
+
import psycopg2
|
|
14
15
|
import sqlalchemy
|
|
15
16
|
|
|
16
17
|
from sky import global_user_state
|
|
@@ -197,6 +198,7 @@ class PostgresLock(DistributedLock):
|
|
|
197
198
|
if engine.dialect.name != db_utils.SQLAlchemyDialect.POSTGRESQL.value:
|
|
198
199
|
raise ValueError('PostgresLock requires PostgreSQL database. '
|
|
199
200
|
f'Current dialect: {engine.dialect.name}')
|
|
201
|
+
# Borrow a dedicated connection from the pool.
|
|
200
202
|
return engine.raw_connection()
|
|
201
203
|
|
|
202
204
|
def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
|
|
@@ -233,9 +235,7 @@ class PostgresLock(DistributedLock):
|
|
|
233
235
|
time.sleep(self.poll_interval)
|
|
234
236
|
|
|
235
237
|
except Exception:
|
|
236
|
-
|
|
237
|
-
self._connection.close()
|
|
238
|
-
self._connection = None
|
|
238
|
+
self._close_connection()
|
|
239
239
|
raise
|
|
240
240
|
|
|
241
241
|
def release(self) -> None:
|
|
@@ -248,27 +248,58 @@ class PostgresLock(DistributedLock):
|
|
|
248
248
|
cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
|
|
249
249
|
self._connection.commit()
|
|
250
250
|
self._acquired = False
|
|
251
|
+
except psycopg2.OperationalError as e:
|
|
252
|
+
# Lost connection to the database, likely the lock is force unlocked
|
|
253
|
+
# by other routines.
|
|
254
|
+
logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
|
|
251
255
|
finally:
|
|
252
|
-
|
|
253
|
-
self._connection.close()
|
|
254
|
-
self._connection = None
|
|
256
|
+
self._close_connection()
|
|
255
257
|
|
|
256
258
|
def force_unlock(self) -> None:
|
|
257
259
|
"""Force unlock the postgres advisory lock."""
|
|
258
260
|
try:
|
|
259
|
-
|
|
261
|
+
# The lock is held by current routine, gracefully unlock it
|
|
262
|
+
if self._acquired:
|
|
263
|
+
self.release()
|
|
264
|
+
return
|
|
265
|
+
|
|
266
|
+
# The lock is held by another routine, force unlock it.
|
|
267
|
+
if self._connection is None:
|
|
260
268
|
self._connection = self._get_connection()
|
|
261
269
|
cursor = self._connection.cursor()
|
|
262
270
|
cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
|
|
263
|
-
|
|
271
|
+
result = cursor.fetchone()[0]
|
|
272
|
+
if result:
|
|
273
|
+
# The lock is held by current routine and unlock succeeded
|
|
274
|
+
self._connection.commit()
|
|
275
|
+
self._acquired = False
|
|
276
|
+
return
|
|
277
|
+
cursor.execute(
|
|
278
|
+
('SELECT pid FROM pg_locks WHERE locktype = \'advisory\' '
|
|
279
|
+
'AND ((classid::bigint << 32) | objid::bigint) = %s'),
|
|
280
|
+
(self._lock_key,))
|
|
281
|
+
row = cursor.fetchone()
|
|
282
|
+
if row:
|
|
283
|
+
# The lock is still held by another routine, force unlock it
|
|
284
|
+
# by killing the PG connection of that routine.
|
|
285
|
+
cursor.execute('SELECT pg_terminate_backend(%s)', (row[0],))
|
|
286
|
+
self._connection.commit()
|
|
287
|
+
return
|
|
264
288
|
except Exception as e:
|
|
265
289
|
raise RuntimeError(
|
|
266
290
|
f'Failed to force unlock postgres lock {self.lock_id}: {e}'
|
|
267
291
|
) from e
|
|
268
292
|
finally:
|
|
269
|
-
|
|
293
|
+
self._close_connection()
|
|
294
|
+
|
|
295
|
+
def _close_connection(self) -> None:
|
|
296
|
+
"""Close the postgres connection."""
|
|
297
|
+
if self._connection:
|
|
298
|
+
try:
|
|
270
299
|
self._connection.close()
|
|
271
|
-
|
|
300
|
+
except Exception as e: # pylint: disable=broad-except
|
|
301
|
+
logger.debug(f'Failed to close postgres connection: {e}')
|
|
302
|
+
self._connection = None
|
|
272
303
|
|
|
273
304
|
def is_locked(self) -> bool:
|
|
274
305
|
"""Check if the postgres advisory lock is acquired."""
|