skypilot-nightly 1.0.0.dev20250918__py3-none-any.whl → 1.0.0.dev20250919__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/core.py +67 -45
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → VvaUqYDvHOcHZRnvMBmax}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0b4b35dc1dfe046c.js → [cluster]-9525660179df3605.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-487697b47d8c5e50.js → webpack-b2a3938c22b6647b.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +83 -54
- sky/metrics/utils.py +174 -8
- sky/schemas/generated/jobsv1_pb2.py +40 -40
- sky/server/metrics.py +52 -158
- sky/server/requests/executor.py +9 -8
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/requests.py +1 -1
- sky/server/requests/serializers/encoders.py +3 -2
- sky/server/server.py +5 -41
- sky/skylet/constants.py +6 -3
- sky/skylet/job_lib.py +14 -15
- sky/utils/locks.py +41 -10
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/METADATA +32 -32
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/RECORD +44 -44
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- /sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → VvaUqYDvHOcHZRnvMBmax}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/top_level.txt +0 -0
sky/metrics/utils.py
CHANGED
|
@@ -1,11 +1,165 @@
|
|
|
1
1
|
"""Utilities for processing GPU metrics from Kubernetes clusters."""
|
|
2
|
+
import contextlib
|
|
3
|
+
import functools
|
|
2
4
|
import os
|
|
3
5
|
import re
|
|
6
|
+
import select
|
|
4
7
|
import subprocess
|
|
5
8
|
import time
|
|
6
9
|
from typing import List, Optional, Tuple
|
|
7
10
|
|
|
8
11
|
import httpx
|
|
12
|
+
import prometheus_client as prom
|
|
13
|
+
|
|
14
|
+
from sky.skylet import constants
|
|
15
|
+
from sky.utils import context_utils
|
|
16
|
+
|
|
17
|
+
_SELECT_TIMEOUT = 1
|
|
18
|
+
_SELECT_BUFFER_SIZE = 4096
|
|
19
|
+
|
|
20
|
+
_KB = 2**10
|
|
21
|
+
_MB = 2**20
|
|
22
|
+
_MEM_BUCKETS = [
|
|
23
|
+
_KB,
|
|
24
|
+
256 * _KB,
|
|
25
|
+
512 * _KB,
|
|
26
|
+
_MB,
|
|
27
|
+
2 * _MB,
|
|
28
|
+
4 * _MB,
|
|
29
|
+
8 * _MB,
|
|
30
|
+
16 * _MB,
|
|
31
|
+
32 * _MB,
|
|
32
|
+
64 * _MB,
|
|
33
|
+
128 * _MB,
|
|
34
|
+
256 * _MB,
|
|
35
|
+
float('inf'),
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
# Whether the metrics are enabled, cannot be changed at runtime.
|
|
39
|
+
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
40
|
+
'false').lower() == 'true'
|
|
41
|
+
|
|
42
|
+
# Time spent processing a piece of code, refer to time_it().
|
|
43
|
+
SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
|
|
44
|
+
'sky_apiserver_code_duration_seconds',
|
|
45
|
+
'Time spent processing code',
|
|
46
|
+
['name', 'group'],
|
|
47
|
+
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
48
|
+
60.0, 120.0, float('inf')),
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Total number of API server requests, grouped by path, method, and status.
|
|
52
|
+
SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
|
|
53
|
+
'sky_apiserver_requests_total',
|
|
54
|
+
'Total number of API server requests',
|
|
55
|
+
['path', 'method', 'status'],
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Time spent processing API server requests, grouped by path, method, and
|
|
59
|
+
# status.
|
|
60
|
+
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
|
|
61
|
+
'sky_apiserver_request_duration_seconds',
|
|
62
|
+
'Time spent processing API server requests',
|
|
63
|
+
['path', 'method', 'status'],
|
|
64
|
+
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
65
|
+
60.0, 120.0, float('inf')),
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
|
|
69
|
+
'sky_apiserver_event_loop_lag_seconds',
|
|
70
|
+
'Scheduling delay of the server event loop',
|
|
71
|
+
['pid'],
|
|
72
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
|
|
73
|
+
60.0, float('inf')),
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
|
|
77
|
+
'sky_apiserver_websocket_connections',
|
|
78
|
+
'Number of websocket connections',
|
|
79
|
+
['pid'],
|
|
80
|
+
multiprocess_mode='livesum',
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
|
|
84
|
+
'sky_apiserver_websocket_closed_total',
|
|
85
|
+
'Number of websocket closed',
|
|
86
|
+
['pid', 'reason'],
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# The number of execution starts in each worker process, we do not record
|
|
90
|
+
# histogram here as the duration has been measured in
|
|
91
|
+
# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
|
|
92
|
+
# Recording histogram WITH worker label will cause high cardinality.
|
|
93
|
+
SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
|
|
94
|
+
'sky_apiserver_process_execution_start_total',
|
|
95
|
+
'Total number of execution starts in each worker process',
|
|
96
|
+
['request', 'pid'],
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
|
|
100
|
+
'sky_apiserver_process_peak_rss',
|
|
101
|
+
'Peak RSS we saw in each process in last 30 seconds',
|
|
102
|
+
['pid', 'type'],
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
|
|
106
|
+
'sky_apiserver_process_cpu_total',
|
|
107
|
+
'Total CPU times a worker process has been running',
|
|
108
|
+
['pid', 'type', 'mode'],
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
|
|
112
|
+
'sky_apiserver_request_memory_usage_bytes',
|
|
113
|
+
'Peak memory usage of requests', ['name'],
|
|
114
|
+
buckets=_MEM_BUCKETS)
|
|
115
|
+
|
|
116
|
+
SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
|
|
117
|
+
'sky_apiserver_request_rss_incr_bytes',
|
|
118
|
+
'RSS increment after requests', ['name'],
|
|
119
|
+
buckets=_MEM_BUCKETS)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@contextlib.contextmanager
|
|
123
|
+
def time_it(name: str, group: str = 'default'):
|
|
124
|
+
"""Context manager to measure and record code execution duration."""
|
|
125
|
+
if not METRICS_ENABLED:
|
|
126
|
+
yield
|
|
127
|
+
else:
|
|
128
|
+
start_time = time.time()
|
|
129
|
+
try:
|
|
130
|
+
yield
|
|
131
|
+
finally:
|
|
132
|
+
duration = time.time() - start_time
|
|
133
|
+
SKY_APISERVER_CODE_DURATION_SECONDS.labels(
|
|
134
|
+
name=name, group=group).observe(duration)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def time_me(func):
|
|
138
|
+
"""Measure the duration of decorated function."""
|
|
139
|
+
|
|
140
|
+
@functools.wraps(func)
|
|
141
|
+
def wrapper(*args, **kwargs):
|
|
142
|
+
if not METRICS_ENABLED:
|
|
143
|
+
return func(*args, **kwargs)
|
|
144
|
+
name = f'{func.__module__}/{func.__name__}'
|
|
145
|
+
with time_it(name, group='function'):
|
|
146
|
+
return func(*args, **kwargs)
|
|
147
|
+
|
|
148
|
+
return wrapper
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def time_me_async(func):
|
|
152
|
+
"""Measure the duration of decorated async function."""
|
|
153
|
+
|
|
154
|
+
@functools.wraps(func)
|
|
155
|
+
async def async_wrapper(*args, **kwargs):
|
|
156
|
+
if not METRICS_ENABLED:
|
|
157
|
+
return await func(*args, **kwargs)
|
|
158
|
+
name = f'{func.__module__}/{func.__name__}'
|
|
159
|
+
with time_it(name, group='function'):
|
|
160
|
+
return await func(*args, **kwargs)
|
|
161
|
+
|
|
162
|
+
return async_wrapper
|
|
9
163
|
|
|
10
164
|
|
|
11
165
|
def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
@@ -44,6 +198,7 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
44
198
|
local_port = None
|
|
45
199
|
start_time = time.time()
|
|
46
200
|
|
|
201
|
+
buffer = ''
|
|
47
202
|
# wait for the port forward to start and extract the local port
|
|
48
203
|
while time.time() - start_time < start_port_forward_timeout:
|
|
49
204
|
if port_forward_process.poll() is not None:
|
|
@@ -56,10 +211,16 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
56
211
|
|
|
57
212
|
# read output line by line to find the local port
|
|
58
213
|
if port_forward_process.stdout:
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
214
|
+
# Wait up to 1s for data to be available without blocking
|
|
215
|
+
r, _, _ = select.select([port_forward_process.stdout], [], [],
|
|
216
|
+
_SELECT_TIMEOUT)
|
|
217
|
+
if r:
|
|
218
|
+
# Read available bytes from the FD without blocking
|
|
219
|
+
fd = port_forward_process.stdout.fileno()
|
|
220
|
+
raw = os.read(fd, _SELECT_BUFFER_SIZE)
|
|
221
|
+
chunk = raw.decode(errors='ignore')
|
|
222
|
+
buffer += chunk
|
|
223
|
+
match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
|
|
63
224
|
if match:
|
|
64
225
|
local_port = int(match.group(1))
|
|
65
226
|
break
|
|
@@ -122,8 +283,8 @@ async def send_metrics_request_with_port_forward(
|
|
|
122
283
|
port_forward_process = None
|
|
123
284
|
try:
|
|
124
285
|
# Start port forward
|
|
125
|
-
port_forward_process, local_port = start_svc_port_forward(
|
|
126
|
-
context, namespace, service, service_port)
|
|
286
|
+
port_forward_process, local_port = await context_utils.to_thread(
|
|
287
|
+
start_svc_port_forward, context, namespace, service, service_port)
|
|
127
288
|
|
|
128
289
|
# Build endpoint URL
|
|
129
290
|
endpoint = f'http://localhost:{local_port}{endpoint_path}'
|
|
@@ -143,7 +304,8 @@ async def send_metrics_request_with_port_forward(
|
|
|
143
304
|
finally:
|
|
144
305
|
# Always clean up port forward
|
|
145
306
|
if port_forward_process:
|
|
146
|
-
stop_svc_port_forward(port_forward_process)
|
|
307
|
+
await context_utils.to_thread(stop_svc_port_forward,
|
|
308
|
+
port_forward_process)
|
|
147
309
|
|
|
148
310
|
|
|
149
311
|
async def add_cluster_name_label(metrics_text: str, context: str) -> str:
|
|
@@ -193,7 +355,11 @@ async def get_metrics_for_context(context: str) -> str:
|
|
|
193
355
|
"""
|
|
194
356
|
# Query both DCGM metrics and kube_pod_labels metrics
|
|
195
357
|
# This ensures the dashboard can perform joins to filter by skypilot cluster
|
|
196
|
-
match_patterns = [
|
|
358
|
+
match_patterns = [
|
|
359
|
+
'{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}', # pylint: disable=line-too-long
|
|
360
|
+
'kube_pod_labels',
|
|
361
|
+
'node_cpu_seconds_total{mode="idle"}'
|
|
362
|
+
]
|
|
197
363
|
|
|
198
364
|
# TODO(rohan): don't hardcode the namespace and service name
|
|
199
365
|
metrics_text = await send_metrics_request_with_port_forward(
|
|
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\
|
|
17
|
+
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n 
\x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 
\x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
|
|
18
18
|
|
|
19
19
|
_globals = globals()
|
|
20
20
|
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
|
@@ -25,8 +25,8 @@ if not _descriptor._USE_C_DESCRIPTORS:
|
|
|
25
25
|
_globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_options = b'8\001'
|
|
26
26
|
_globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._loaded_options = None
|
|
27
27
|
_globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_options = b'8\001'
|
|
28
|
-
_globals['_JOBSTATUS']._serialized_start=
|
|
29
|
-
_globals['_JOBSTATUS']._serialized_end=
|
|
28
|
+
_globals['_JOBSTATUS']._serialized_start=2185
|
|
29
|
+
_globals['_JOBSTATUS']._serialized_end=2454
|
|
30
30
|
_globals['_ADDJOBREQUEST']._serialized_start=48
|
|
31
31
|
_globals['_ADDJOBREQUEST']._serialized_end=181
|
|
32
32
|
_globals['_ADDJOBRESPONSE']._serialized_start=183
|
|
@@ -46,41 +46,41 @@ if not _descriptor._USE_C_DESCRIPTORS:
|
|
|
46
46
|
_globals['_GETJOBQUEUEREQUEST']._serialized_start=718
|
|
47
47
|
_globals['_GETJOBQUEUEREQUEST']._serialized_end=794
|
|
48
48
|
_globals['_JOBINFO']._serialized_start=797
|
|
49
|
-
_globals['_JOBINFO']._serialized_end=
|
|
50
|
-
_globals['_GETJOBQUEUERESPONSE']._serialized_start=
|
|
51
|
-
_globals['_GETJOBQUEUERESPONSE']._serialized_end=
|
|
52
|
-
_globals['_CANCELJOBSREQUEST']._serialized_start=
|
|
53
|
-
_globals['_CANCELJOBSREQUEST']._serialized_end=
|
|
54
|
-
_globals['_CANCELJOBSRESPONSE']._serialized_start=
|
|
55
|
-
_globals['_CANCELJOBSRESPONSE']._serialized_end=
|
|
56
|
-
_globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=
|
|
57
|
-
_globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=
|
|
58
|
-
_globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=
|
|
59
|
-
_globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=
|
|
60
|
-
_globals['_TAILLOGSREQUEST']._serialized_start=
|
|
61
|
-
_globals['_TAILLOGSREQUEST']._serialized_end=
|
|
62
|
-
_globals['_TAILLOGSRESPONSE']._serialized_start=
|
|
63
|
-
_globals['_TAILLOGSRESPONSE']._serialized_end=
|
|
64
|
-
_globals['_GETJOBSTATUSREQUEST']._serialized_start=
|
|
65
|
-
_globals['_GETJOBSTATUSREQUEST']._serialized_end=
|
|
66
|
-
_globals['_GETJOBSTATUSRESPONSE']._serialized_start=
|
|
67
|
-
_globals['_GETJOBSTATUSRESPONSE']._serialized_end=
|
|
68
|
-
_globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=
|
|
69
|
-
_globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=
|
|
70
|
-
_globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=
|
|
71
|
-
_globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=
|
|
72
|
-
_globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=
|
|
73
|
-
_globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=
|
|
74
|
-
_globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=
|
|
75
|
-
_globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=
|
|
76
|
-
_globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=
|
|
77
|
-
_globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=
|
|
78
|
-
_globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=
|
|
79
|
-
_globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=
|
|
80
|
-
_globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=
|
|
81
|
-
_globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=
|
|
82
|
-
_globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=
|
|
83
|
-
_globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=
|
|
84
|
-
_globals['_JOBSSERVICE']._serialized_start=
|
|
85
|
-
_globals['_JOBSSERVICE']._serialized_end=
|
|
49
|
+
_globals['_JOBINFO']._serialized_end=1088
|
|
50
|
+
_globals['_GETJOBQUEUERESPONSE']._serialized_start=1090
|
|
51
|
+
_globals['_GETJOBQUEUERESPONSE']._serialized_end=1143
|
|
52
|
+
_globals['_CANCELJOBSREQUEST']._serialized_start=1145
|
|
53
|
+
_globals['_CANCELJOBSREQUEST']._serialized_end=1239
|
|
54
|
+
_globals['_CANCELJOBSRESPONSE']._serialized_start=1241
|
|
55
|
+
_globals['_CANCELJOBSRESPONSE']._serialized_end=1288
|
|
56
|
+
_globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1290
|
|
57
|
+
_globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1320
|
|
58
|
+
_globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1322
|
|
59
|
+
_globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1353
|
|
60
|
+
_globals['_TAILLOGSREQUEST']._serialized_start=1355
|
|
61
|
+
_globals['_TAILLOGSREQUEST']._serialized_end=1482
|
|
62
|
+
_globals['_TAILLOGSRESPONSE']._serialized_start=1484
|
|
63
|
+
_globals['_TAILLOGSRESPONSE']._serialized_end=1539
|
|
64
|
+
_globals['_GETJOBSTATUSREQUEST']._serialized_start=1541
|
|
65
|
+
_globals['_GETJOBSTATUSREQUEST']._serialized_end=1579
|
|
66
|
+
_globals['_GETJOBSTATUSRESPONSE']._serialized_start=1582
|
|
67
|
+
_globals['_GETJOBSTATUSRESPONSE']._serialized_end=1746
|
|
68
|
+
_globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1676
|
|
69
|
+
_globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1746
|
|
70
|
+
_globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1748
|
|
71
|
+
_globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1813
|
|
72
|
+
_globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1815
|
|
73
|
+
_globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1868
|
|
74
|
+
_globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1870
|
|
75
|
+
_globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1931
|
|
76
|
+
_globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1933
|
|
77
|
+
_globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=1982
|
|
78
|
+
_globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=1984
|
|
79
|
+
_globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=2027
|
|
80
|
+
_globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=2030
|
|
81
|
+
_globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2182
|
|
82
|
+
_globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2133
|
|
83
|
+
_globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2182
|
|
84
|
+
_globals['_JOBSSERVICE']._serialized_start=2457
|
|
85
|
+
_globals['_JOBSSERVICE']._serialized_end=3370
|
|
86
86
|
# @@protoc_insertion_point(module_scope)
|
sky/server/metrics.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
"""Instrumentation for the API server."""
|
|
2
2
|
|
|
3
|
-
import contextlib
|
|
4
|
-
import functools
|
|
3
|
+
import asyncio
|
|
5
4
|
import multiprocessing
|
|
6
5
|
import os
|
|
7
6
|
import threading
|
|
8
7
|
import time
|
|
8
|
+
from typing import List
|
|
9
9
|
|
|
10
10
|
import fastapi
|
|
11
11
|
from prometheus_client import generate_latest
|
|
@@ -15,112 +15,12 @@ import psutil
|
|
|
15
15
|
import starlette.middleware.base
|
|
16
16
|
import uvicorn
|
|
17
17
|
|
|
18
|
+
from sky import core
|
|
18
19
|
from sky import sky_logging
|
|
19
|
-
from sky.skylet import constants
|
|
20
|
-
|
|
21
|
-
# Whether the metrics are enabled, cannot be changed at runtime.
|
|
22
|
-
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
23
|
-
'false').lower() == 'true'
|
|
24
|
-
|
|
25
|
-
_KB = 2**10
|
|
26
|
-
_MB = 2**20
|
|
27
|
-
_MEM_BUCKETS = [
|
|
28
|
-
_KB,
|
|
29
|
-
256 * _KB,
|
|
30
|
-
512 * _KB,
|
|
31
|
-
_MB,
|
|
32
|
-
2 * _MB,
|
|
33
|
-
4 * _MB,
|
|
34
|
-
8 * _MB,
|
|
35
|
-
16 * _MB,
|
|
36
|
-
32 * _MB,
|
|
37
|
-
64 * _MB,
|
|
38
|
-
128 * _MB,
|
|
39
|
-
256 * _MB,
|
|
40
|
-
float('inf'),
|
|
41
|
-
]
|
|
20
|
+
from sky.metrics import utils as metrics_utils
|
|
42
21
|
|
|
43
22
|
logger = sky_logging.init_logger(__name__)
|
|
44
23
|
|
|
45
|
-
# Total number of API server requests, grouped by path, method, and status.
|
|
46
|
-
SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
|
|
47
|
-
'sky_apiserver_requests_total',
|
|
48
|
-
'Total number of API server requests',
|
|
49
|
-
['path', 'method', 'status'],
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
# Time spent processing API server requests, grouped by path, method, and
|
|
53
|
-
# status.
|
|
54
|
-
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
|
|
55
|
-
'sky_apiserver_request_duration_seconds',
|
|
56
|
-
'Time spent processing API server requests',
|
|
57
|
-
['path', 'method', 'status'],
|
|
58
|
-
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
59
|
-
60.0, 120.0, float('inf')),
|
|
60
|
-
)
|
|
61
|
-
|
|
62
|
-
# Time spent processing a piece of code, refer to time_it().
|
|
63
|
-
SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
|
|
64
|
-
'sky_apiserver_code_duration_seconds',
|
|
65
|
-
'Time spent processing code',
|
|
66
|
-
['name', 'group'],
|
|
67
|
-
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
|
|
68
|
-
60.0, 120.0, float('inf')),
|
|
69
|
-
)
|
|
70
|
-
|
|
71
|
-
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
|
|
72
|
-
'sky_apiserver_event_loop_lag_seconds',
|
|
73
|
-
'Scheduling delay of the server event loop',
|
|
74
|
-
['pid'],
|
|
75
|
-
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
|
|
76
|
-
60.0, float('inf')),
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
|
|
80
|
-
'sky_apiserver_websocket_connections',
|
|
81
|
-
'Number of websocket connections',
|
|
82
|
-
['pid'],
|
|
83
|
-
multiprocess_mode='livesum',
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
|
|
87
|
-
'sky_apiserver_websocket_closed_total',
|
|
88
|
-
'Number of websocket closed',
|
|
89
|
-
['pid', 'reason'],
|
|
90
|
-
)
|
|
91
|
-
|
|
92
|
-
# The number of execution starts in each worker process, we do not record
|
|
93
|
-
# histogram here as the duration has been measured in
|
|
94
|
-
# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
|
|
95
|
-
# Recording histogram WITH worker label will cause high cardinality.
|
|
96
|
-
SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
|
|
97
|
-
'sky_apiserver_process_execution_start_total',
|
|
98
|
-
'Total number of execution starts in each worker process',
|
|
99
|
-
['request', 'pid'],
|
|
100
|
-
)
|
|
101
|
-
|
|
102
|
-
SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
|
|
103
|
-
'sky_apiserver_process_peak_rss',
|
|
104
|
-
'Peak RSS we saw in each process in last 30 seconds',
|
|
105
|
-
['pid', 'type'],
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
|
|
109
|
-
'sky_apiserver_process_cpu_total',
|
|
110
|
-
'Total CPU times a worker process has been running',
|
|
111
|
-
['pid', 'type', 'mode'],
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
|
|
115
|
-
'sky_apiserver_request_memory_usage_bytes',
|
|
116
|
-
'Peak memory usage of requests', ['name'],
|
|
117
|
-
buckets=_MEM_BUCKETS)
|
|
118
|
-
|
|
119
|
-
SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
|
|
120
|
-
'sky_apiserver_request_rss_incr_bytes',
|
|
121
|
-
'RSS increment after requests', ['name'],
|
|
122
|
-
buckets=_MEM_BUCKETS)
|
|
123
|
-
|
|
124
24
|
metrics_app = fastapi.FastAPI()
|
|
125
25
|
|
|
126
26
|
|
|
@@ -139,6 +39,42 @@ async def metrics() -> fastapi.Response:
|
|
|
139
39
|
headers={'Cache-Control': 'no-cache'})
|
|
140
40
|
|
|
141
41
|
|
|
42
|
+
@metrics_app.get('/gpu-metrics')
|
|
43
|
+
async def gpu_metrics() -> fastapi.Response:
|
|
44
|
+
"""Gets the GPU metrics from multiple external k8s clusters"""
|
|
45
|
+
contexts = core.get_all_contexts()
|
|
46
|
+
all_metrics: List[str] = []
|
|
47
|
+
successful_contexts = 0
|
|
48
|
+
|
|
49
|
+
tasks = [
|
|
50
|
+
asyncio.create_task(metrics_utils.get_metrics_for_context(context))
|
|
51
|
+
for context in contexts
|
|
52
|
+
if context != 'in-cluster'
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
56
|
+
|
|
57
|
+
for i, result in enumerate(results):
|
|
58
|
+
if isinstance(result, Exception):
|
|
59
|
+
logger.error(
|
|
60
|
+
f'Failed to get metrics for context {contexts[i]}: {result}')
|
|
61
|
+
elif isinstance(result, BaseException):
|
|
62
|
+
# Avoid changing behavior for non-Exception BaseExceptions
|
|
63
|
+
# like KeyboardInterrupt/SystemExit: re-raise them.
|
|
64
|
+
raise result
|
|
65
|
+
else:
|
|
66
|
+
metrics_text = result
|
|
67
|
+
all_metrics.append(metrics_text)
|
|
68
|
+
successful_contexts += 1
|
|
69
|
+
|
|
70
|
+
combined_metrics = '\n\n'.join(all_metrics)
|
|
71
|
+
|
|
72
|
+
# Return as plain text for Prometheus compatibility
|
|
73
|
+
return fastapi.Response(
|
|
74
|
+
content=combined_metrics,
|
|
75
|
+
media_type='text/plain; version=0.0.4; charset=utf-8')
|
|
76
|
+
|
|
77
|
+
|
|
142
78
|
def build_metrics_server(host: str, port: int) -> uvicorn.Server:
|
|
143
79
|
metrics_config = uvicorn.Config(
|
|
144
80
|
'sky.server.metrics:metrics_app',
|
|
@@ -182,61 +118,17 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
182
118
|
status_code_group = '5xx'
|
|
183
119
|
raise
|
|
184
120
|
finally:
|
|
185
|
-
SKY_APISERVER_REQUESTS_TOTAL.labels(
|
|
186
|
-
|
|
187
|
-
status=status_code_group).inc()
|
|
121
|
+
metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
|
|
122
|
+
path=path, method=method, status=status_code_group).inc()
|
|
188
123
|
if not streaming:
|
|
189
124
|
duration = time.time() - start_time
|
|
190
|
-
SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
|
|
125
|
+
metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
|
|
191
126
|
path=path, method=method,
|
|
192
127
|
status=status_code_group).observe(duration)
|
|
193
128
|
|
|
194
129
|
return response
|
|
195
130
|
|
|
196
131
|
|
|
197
|
-
@contextlib.contextmanager
|
|
198
|
-
def time_it(name: str, group: str = 'default'):
|
|
199
|
-
"""Context manager to measure and record code execution duration."""
|
|
200
|
-
if not METRICS_ENABLED:
|
|
201
|
-
yield
|
|
202
|
-
else:
|
|
203
|
-
start_time = time.time()
|
|
204
|
-
try:
|
|
205
|
-
yield
|
|
206
|
-
finally:
|
|
207
|
-
duration = time.time() - start_time
|
|
208
|
-
SKY_APISERVER_CODE_DURATION_SECONDS.labels(
|
|
209
|
-
name=name, group=group).observe(duration)
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
def time_me(func):
|
|
213
|
-
"""Measure the duration of decorated function."""
|
|
214
|
-
|
|
215
|
-
@functools.wraps(func)
|
|
216
|
-
def wrapper(*args, **kwargs):
|
|
217
|
-
if not METRICS_ENABLED:
|
|
218
|
-
return func(*args, **kwargs)
|
|
219
|
-
name = f'{func.__module__}/{func.__name__}'
|
|
220
|
-
with time_it(name, group='function'):
|
|
221
|
-
return func(*args, **kwargs)
|
|
222
|
-
|
|
223
|
-
return wrapper
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
def time_me_async(func):
|
|
227
|
-
"""Measure the duration of decorated async function."""
|
|
228
|
-
|
|
229
|
-
@functools.wraps(func)
|
|
230
|
-
async def async_wrapper(*args, **kwargs):
|
|
231
|
-
if not METRICS_ENABLED:
|
|
232
|
-
return await func(*args, **kwargs)
|
|
233
|
-
name = f'{func.__module__}/{func.__name__}'
|
|
234
|
-
with time_it(name, group='function'):
|
|
235
|
-
return await func(*args, **kwargs)
|
|
236
|
-
|
|
237
|
-
return async_wrapper
|
|
238
|
-
|
|
239
|
-
|
|
240
132
|
peak_rss_bytes = 0
|
|
241
133
|
|
|
242
134
|
|
|
@@ -252,13 +144,15 @@ def process_monitor(process_type: str, stop: threading.Event):
|
|
|
252
144
|
last_bucket_end = time.time()
|
|
253
145
|
bucket_peak = 0
|
|
254
146
|
peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
|
|
255
|
-
SKY_APISERVER_PROCESS_PEAK_RSS.labels(
|
|
147
|
+
metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
|
|
256
148
|
pid=pid, type=process_type).set(peak_rss_bytes)
|
|
257
149
|
ctimes = proc.cpu_times()
|
|
258
|
-
SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
150
|
+
metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
|
|
151
|
+
type=process_type,
|
|
152
|
+
mode='user').set(
|
|
153
|
+
ctimes.user)
|
|
154
|
+
metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
|
|
155
|
+
type=process_type,
|
|
156
|
+
mode='system').set(
|
|
157
|
+
ctimes.system)
|
|
264
158
|
time.sleep(1)
|
sky/server/requests/executor.py
CHANGED
|
@@ -39,6 +39,7 @@ from sky import global_user_state
|
|
|
39
39
|
from sky import models
|
|
40
40
|
from sky import sky_logging
|
|
41
41
|
from sky import skypilot_config
|
|
42
|
+
from sky.metrics import utils as metrics_utils
|
|
42
43
|
from sky.server import common as server_common
|
|
43
44
|
from sky.server import config as server_config
|
|
44
45
|
from sky.server import constants as server_constants
|
|
@@ -422,10 +423,10 @@ def _request_execution_wrapper(request_id: str,
|
|
|
422
423
|
config = skypilot_config.to_dict()
|
|
423
424
|
logger.debug(f'request config: \n'
|
|
424
425
|
f'{yaml_utils.dump_yaml_str(dict(config))}')
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
with
|
|
428
|
-
|
|
426
|
+
(metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
|
|
427
|
+
labels(request=request_name, pid=pid).inc())
|
|
428
|
+
with metrics_utils.time_it(name=request_name,
|
|
429
|
+
group='request_execution'):
|
|
429
430
|
return_value = func(**request_body.to_kwargs())
|
|
430
431
|
f.flush()
|
|
431
432
|
except KeyboardInterrupt:
|
|
@@ -468,8 +469,8 @@ def _request_execution_wrapper(request_id: str,
|
|
|
468
469
|
# Clear request level cache to release all memory used by
|
|
469
470
|
# the request.
|
|
470
471
|
annotations.clear_request_level_cache()
|
|
471
|
-
with
|
|
472
|
-
|
|
472
|
+
with metrics_utils.time_it(name='release_memory',
|
|
473
|
+
group='internal'):
|
|
473
474
|
common_utils.release_memory()
|
|
474
475
|
_record_memory_metrics(request_name, proc, rss_begin, peak_rss)
|
|
475
476
|
except Exception as e: # pylint: disable=broad-except
|
|
@@ -493,11 +494,11 @@ def _record_memory_metrics(request_name: str, proc: psutil.Process,
|
|
|
493
494
|
rss_end = proc.memory_info().rss
|
|
494
495
|
|
|
495
496
|
# Answer "how much RSS this request contributed?"
|
|
496
|
-
|
|
497
|
+
metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
|
|
497
498
|
name=request_name).observe(max(rss_end - rss_begin, 0))
|
|
498
499
|
# Estimate the memory usage by the request by capturing the
|
|
499
500
|
# peak memory delta during the request execution.
|
|
500
|
-
|
|
501
|
+
metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
|
|
501
502
|
name=request_name).observe(max(peak_rss - rss_begin, 0))
|
|
502
503
|
|
|
503
504
|
|
sky/server/requests/payloads.py
CHANGED
|
@@ -792,6 +792,12 @@ class GetConfigBody(RequestBody):
|
|
|
792
792
|
class CostReportBody(RequestBody):
|
|
793
793
|
"""The request body for the cost report endpoint."""
|
|
794
794
|
days: Optional[int] = 30
|
|
795
|
+
# we use hashes instead of names to avoid the case where
|
|
796
|
+
# the name is not unique
|
|
797
|
+
cluster_hashes: Optional[List[str]] = None
|
|
798
|
+
# Only return fields that are needed for the dashboard
|
|
799
|
+
# summary page
|
|
800
|
+
dashboard_summary_response: bool = False
|
|
795
801
|
|
|
796
802
|
|
|
797
803
|
class RequestPayload(BasePayload):
|