skypilot-nightly 1.0.0.dev20250918__py3-none-any.whl → 1.0.0.dev20250919__py3-none-any.whl

This diff shows the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.

Potentially problematic release: the registry flags this version of skypilot-nightly as possibly problematic; see the registry listing for details.

Files changed (48)
  1. sky/__init__.py +2 -2
  2. sky/core.py +67 -45
  3. sky/dashboard/out/404.html +1 -1
  4. sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → VvaUqYDvHOcHZRnvMBmax}/_buildManifest.js +1 -1
  5. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +1 -0
  6. sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +1 -0
  8. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +16 -0
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0b4b35dc1dfe046c.js → [cluster]-9525660179df3605.js} +1 -1
  10. sky/dashboard/out/_next/static/chunks/{webpack-487697b47d8c5e50.js → webpack-b2a3938c22b6647b.js} +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/dashboard/out/users.html +1 -1
  22. sky/dashboard/out/volumes.html +1 -1
  23. sky/dashboard/out/workspace/new.html +1 -1
  24. sky/dashboard/out/workspaces/[name].html +1 -1
  25. sky/dashboard/out/workspaces.html +1 -1
  26. sky/global_user_state.py +83 -54
  27. sky/metrics/utils.py +174 -8
  28. sky/schemas/generated/jobsv1_pb2.py +40 -40
  29. sky/server/metrics.py +52 -158
  30. sky/server/requests/executor.py +9 -8
  31. sky/server/requests/payloads.py +6 -0
  32. sky/server/requests/requests.py +1 -1
  33. sky/server/requests/serializers/encoders.py +3 -2
  34. sky/server/server.py +5 -41
  35. sky/skylet/constants.py +6 -3
  36. sky/skylet/job_lib.py +14 -15
  37. sky/utils/locks.py +41 -10
  38. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/METADATA +32 -32
  39. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/RECORD +44 -44
  40. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  41. sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +0 -1
  42. sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +0 -1
  43. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  44. /sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → VvaUqYDvHOcHZRnvMBmax}/_ssgManifest.js +0 -0
  45. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/WHEEL +0 -0
  46. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/entry_points.txt +0 -0
  47. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/licenses/LICENSE +0 -0
  48. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/top_level.txt +0 -0
sky/metrics/utils.py CHANGED
@@ -1,11 +1,165 @@
 """Utilities for processing GPU metrics from Kubernetes clusters."""
+import contextlib
+import functools
 import os
 import re
+import select
 import subprocess
 import time
 from typing import List, Optional, Tuple
 
 import httpx
+import prometheus_client as prom
+
+from sky.skylet import constants
+from sky.utils import context_utils
+
+_SELECT_TIMEOUT = 1
+_SELECT_BUFFER_SIZE = 4096
+
+_KB = 2**10
+_MB = 2**20
+_MEM_BUCKETS = [
+    _KB,
+    256 * _KB,
+    512 * _KB,
+    _MB,
+    2 * _MB,
+    4 * _MB,
+    8 * _MB,
+    16 * _MB,
+    32 * _MB,
+    64 * _MB,
+    128 * _MB,
+    256 * _MB,
+    float('inf'),
+]
+
+# Whether the metrics are enabled, cannot be changed at runtime.
+METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
+                                 'false').lower() == 'true'
+
+# Time spent processing a piece of code, refer to time_it().
+SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
+    'sky_apiserver_code_duration_seconds',
+    'Time spent processing code',
+    ['name', 'group'],
+    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
+             60.0, 120.0, float('inf')),
+)
+
+# Total number of API server requests, grouped by path, method, and status.
+SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
+    'sky_apiserver_requests_total',
+    'Total number of API server requests',
+    ['path', 'method', 'status'],
+)
+
+# Time spent processing API server requests, grouped by path, method, and
+# status.
+SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
+    'sky_apiserver_request_duration_seconds',
+    'Time spent processing API server requests',
+    ['path', 'method', 'status'],
+    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
+             60.0, 120.0, float('inf')),
+)
+
+SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
+    'sky_apiserver_event_loop_lag_seconds',
+    'Scheduling delay of the server event loop',
+    ['pid'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
+             60.0, float('inf')),
+)
+
+SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
+    'sky_apiserver_websocket_connections',
+    'Number of websocket connections',
+    ['pid'],
+    multiprocess_mode='livesum',
+)
+
+SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
+    'sky_apiserver_websocket_closed_total',
+    'Number of websocket closed',
+    ['pid', 'reason'],
+)
+
+# The number of execution starts in each worker process, we do not record
+# histogram here as the duration has been measured in
+# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
+# Recording histogram WITH worker label will cause high cardinality.
+SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
+    'sky_apiserver_process_execution_start_total',
+    'Total number of execution starts in each worker process',
+    ['request', 'pid'],
+)
+
+SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
+    'sky_apiserver_process_peak_rss',
+    'Peak RSS we saw in each process in last 30 seconds',
+    ['pid', 'type'],
+)
+
+SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
+    'sky_apiserver_process_cpu_total',
+    'Total CPU times a worker process has been running',
+    ['pid', 'type', 'mode'],
+)
+
+SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
+    'sky_apiserver_request_memory_usage_bytes',
+    'Peak memory usage of requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
+    'sky_apiserver_request_rss_incr_bytes',
+    'RSS increment after requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+
+@contextlib.contextmanager
+def time_it(name: str, group: str = 'default'):
+    """Context manager to measure and record code execution duration."""
+    if not METRICS_ENABLED:
+        yield
+    else:
+        start_time = time.time()
+        try:
+            yield
+        finally:
+            duration = time.time() - start_time
+            SKY_APISERVER_CODE_DURATION_SECONDS.labels(
+                name=name, group=group).observe(duration)
+
+
+def time_me(func):
+    """Measure the duration of decorated function."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        if not METRICS_ENABLED:
+            return func(*args, **kwargs)
+        name = f'{func.__module__}/{func.__name__}'
+        with time_it(name, group='function'):
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+def time_me_async(func):
+    """Measure the duration of decorated async function."""
+
+    @functools.wraps(func)
+    async def async_wrapper(*args, **kwargs):
+        if not METRICS_ENABLED:
+            return await func(*args, **kwargs)
+        name = f'{func.__module__}/{func.__name__}'
+        with time_it(name, group='function'):
+            return await func(*args, **kwargs)
+
+    return async_wrapper
 
 
 def start_svc_port_forward(context: str, namespace: str, service: str,
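
Note: a minimal sketch of how the relocated timing helpers are meant to be consumed. The function names below are hypothetical, and timings are only recorded when metrics are enabled via the environment variable referenced by constants.ENV_VAR_SERVER_METRICS_ENABLED.

# Illustrative sketch only; the decorated/timed functions are made up.
import time

from sky.metrics import utils as metrics_utils


@metrics_utils.time_me
def refresh_records():
    # Observed as '<module>/refresh_records' in the 'function' group
    # when metrics are enabled; a plain call otherwise.
    time.sleep(0.1)


def handle_request():
    # Time an arbitrary block under an explicit name and group.
    with metrics_utils.time_it(name='db_query', group='request_execution'):
        time.sleep(0.05)
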
@@ -44,6 +198,7 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     local_port = None
     start_time = time.time()
 
+    buffer = ''
     # wait for the port forward to start and extract the local port
     while time.time() - start_time < start_port_forward_timeout:
         if port_forward_process.poll() is not None:
@@ -56,10 +211,16 @@
 
         # read output line by line to find the local port
         if port_forward_process.stdout:
-            line = port_forward_process.stdout.readline()
-            if line:
-                # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
-                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
+            # Wait up to 1s for data to be available without blocking
+            r, _, _ = select.select([port_forward_process.stdout], [], [],
+                                    _SELECT_TIMEOUT)
+            if r:
+                # Read available bytes from the FD without blocking
+                fd = port_forward_process.stdout.fileno()
+                raw = os.read(fd, _SELECT_BUFFER_SIZE)
+                chunk = raw.decode(errors='ignore')
+                buffer += chunk
+                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
                 if match:
                     local_port = int(match.group(1))
                     break
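
Note: the hunk above replaces a blocking readline() with a select()-based poll and accumulates raw chunks in a buffer, so a "Forwarding from ..." line split across reads is still matched. A standalone, POSIX-only sketch of the same pattern; `echo` stands in for `kubectl port-forward`, and in the real code the poll is repeated inside a timeout loop.

# Standalone sketch of the non-blocking read pattern used above.
import os
import re
import select
import subprocess

proc = subprocess.Popen(['echo', 'Forwarding from 127.0.0.1:54321 -> 9090'],
                        stdout=subprocess.PIPE)
buffer = ''
# Wait up to 1 second for output without blocking on a full line.
readable, _, _ = select.select([proc.stdout], [], [], 1)
if readable:
    buffer += os.read(proc.stdout.fileno(), 4096).decode(errors='ignore')
match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
print(match.group(1) if match else 'no port yet')
proc.wait()
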
@@ -122,8 +283,8 @@ async def send_metrics_request_with_port_forward(
     port_forward_process = None
     try:
         # Start port forward
-        port_forward_process, local_port = start_svc_port_forward(
-            context, namespace, service, service_port)
+        port_forward_process, local_port = await context_utils.to_thread(
+            start_svc_port_forward, context, namespace, service, service_port)
 
         # Build endpoint URL
         endpoint = f'http://localhost:{local_port}{endpoint_path}'
@@ -143,7 +304,8 @@
     finally:
         # Always clean up port forward
         if port_forward_process:
-            stop_svc_port_forward(port_forward_process)
+            await context_utils.to_thread(stop_svc_port_forward,
+                                          port_forward_process)
 
 
 async def add_cluster_name_label(metrics_text: str, context: str) -> str:
@@ -193,7 +355,11 @@ async def get_metrics_for_context(context: str) -> str:
     """
     # Query both DCGM metrics and kube_pod_labels metrics
    # This ensures the dashboard can perform joins to filter by skypilot cluster
-    match_patterns = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']
+    match_patterns = [
+        '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}',  # pylint: disable=line-too-long
+        'kube_pod_labels',
+        'node_cpu_seconds_total{mode="idle"}'
+    ]
 
     # TODO(rohan): don't hardcode the namespace and service name
     metrics_text = await send_metrics_request_with_port_forward(
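
Note: the selectors added above are PromQL series selectors of the kind a Prometheus /federate endpoint accepts as repeated match[] parameters; whether this code path actually targets /federate is not shown in the diff. A hedged sketch of such a query, assuming a Prometheus server reachable at localhost:9090:

# Hedged sketch: sending series selectors as repeated match[] parameters.
import httpx

match_patterns = [
    '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}',
    'kube_pod_labels',
    'node_cpu_seconds_total{mode="idle"}',
]
params = [('match[]', pattern) for pattern in match_patterns]
resp = httpx.get('http://localhost:9090/federate', params=params, timeout=30)
print(resp.text[:200])
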
sky/schemas/generated/jobsv1_pb2.py CHANGED
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
 
 
 
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xf4\x01\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x10\n\x08start_at\x18\x07 \x01(\x01\x12\x0e\n\x06\x65nd_at\x18\x08 \x01(\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x0b\n\x03pid\x18\n \x01(\x03\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\t\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 
\x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 
\x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -25,8 +25,8 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_options = b'8\001'
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._loaded_options = None
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_options = b'8\001'
-  _globals['_JOBSTATUS']._serialized_start=2138
-  _globals['_JOBSTATUS']._serialized_end=2407
+  _globals['_JOBSTATUS']._serialized_start=2185
+  _globals['_JOBSTATUS']._serialized_end=2454
   _globals['_ADDJOBREQUEST']._serialized_start=48
   _globals['_ADDJOBREQUEST']._serialized_end=181
   _globals['_ADDJOBRESPONSE']._serialized_start=183
@@ -46,41 +46,41 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETJOBQUEUEREQUEST']._serialized_start=718
   _globals['_GETJOBQUEUEREQUEST']._serialized_end=794
   _globals['_JOBINFO']._serialized_start=797
-  _globals['_JOBINFO']._serialized_end=1041
-  _globals['_GETJOBQUEUERESPONSE']._serialized_start=1043
-  _globals['_GETJOBQUEUERESPONSE']._serialized_end=1096
-  _globals['_CANCELJOBSREQUEST']._serialized_start=1098
-  _globals['_CANCELJOBSREQUEST']._serialized_end=1192
-  _globals['_CANCELJOBSRESPONSE']._serialized_start=1194
-  _globals['_CANCELJOBSRESPONSE']._serialized_end=1241
-  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1243
-  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1273
-  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1275
-  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1306
-  _globals['_TAILLOGSREQUEST']._serialized_start=1308
-  _globals['_TAILLOGSREQUEST']._serialized_end=1435
-  _globals['_TAILLOGSRESPONSE']._serialized_start=1437
-  _globals['_TAILLOGSRESPONSE']._serialized_end=1492
-  _globals['_GETJOBSTATUSREQUEST']._serialized_start=1494
-  _globals['_GETJOBSTATUSREQUEST']._serialized_end=1532
-  _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1535
-  _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1699
-  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1629
-  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1699
-  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1701
-  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1766
-  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1768
-  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1821
-  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1823
-  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1884
-  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1886
-  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=1935
-  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=1937
-  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=1980
-  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=1983
-  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2135
-  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2086
-  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2135
-  _globals['_JOBSSERVICE']._serialized_start=2410
-  _globals['_JOBSSERVICE']._serialized_end=3323
+  _globals['_JOBINFO']._serialized_end=1088
+  _globals['_GETJOBQUEUERESPONSE']._serialized_start=1090
+  _globals['_GETJOBQUEUERESPONSE']._serialized_end=1143
+  _globals['_CANCELJOBSREQUEST']._serialized_start=1145
+  _globals['_CANCELJOBSREQUEST']._serialized_end=1239
+  _globals['_CANCELJOBSRESPONSE']._serialized_start=1241
+  _globals['_CANCELJOBSRESPONSE']._serialized_end=1288
+  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1290
+  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1320
+  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1322
+  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1353
+  _globals['_TAILLOGSREQUEST']._serialized_start=1355
+  _globals['_TAILLOGSREQUEST']._serialized_end=1482
+  _globals['_TAILLOGSRESPONSE']._serialized_start=1484
+  _globals['_TAILLOGSRESPONSE']._serialized_end=1539
+  _globals['_GETJOBSTATUSREQUEST']._serialized_start=1541
+  _globals['_GETJOBSTATUSREQUEST']._serialized_end=1579
+  _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1582
+  _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1746
+  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1676
+  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1746
+  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1748
+  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1813
+  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1815
+  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1868
+  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1870
+  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1931
+  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1933
+  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=1982
+  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=1984
+  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=2027
+  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=2030
+  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2182
+  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2133
+  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2182
+  _globals['_JOBSSERVICE']._serialized_start=2457
+  _globals['_JOBSSERVICE']._serialized_end=3370
 # @@protoc_insertion_point(module_scope)
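
Note: the regenerated descriptor mostly shifts serialized offsets; the substantive schema change is that JobInfo's start_at, end_at, and pid fields are now declared optional (note the new _start_at/_end_at/_pid presence wrappers in the serialized blob), so "unset" can be distinguished from 0. A hedged sketch of a presence check on the regenerated message:

# Hedged sketch: presence checks on proto3 `optional` fields of JobInfo.
from sky.schemas.generated import jobsv1_pb2

job = jobsv1_pb2.JobInfo(job_id=1, job_name='train')
assert not job.HasField('start_at')  # unset, not merely 0.0
job.start_at = 1726700000.0
assert job.HasField('start_at')
job.ClearField('start_at')
assert not job.HasField('start_at')
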
sky/server/metrics.py CHANGED
@@ -1,11 +1,11 @@
 """Instrumentation for the API server."""
 
-import contextlib
-import functools
+import asyncio
 import multiprocessing
 import os
 import threading
 import time
+from typing import List
 
 import fastapi
 from prometheus_client import generate_latest
@@ -15,112 +15,12 @@ import psutil
 import starlette.middleware.base
 import uvicorn
 
+from sky import core
 from sky import sky_logging
-from sky.skylet import constants
-
-# Whether the metrics are enabled, cannot be changed at runtime.
-METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
-                                 'false').lower() == 'true'
-
-_KB = 2**10
-_MB = 2**20
-_MEM_BUCKETS = [
-    _KB,
-    256 * _KB,
-    512 * _KB,
-    _MB,
-    2 * _MB,
-    4 * _MB,
-    8 * _MB,
-    16 * _MB,
-    32 * _MB,
-    64 * _MB,
-    128 * _MB,
-    256 * _MB,
-    float('inf'),
-]
+from sky.metrics import utils as metrics_utils
 
 logger = sky_logging.init_logger(__name__)
 
-# Total number of API server requests, grouped by path, method, and status.
-SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
-    'sky_apiserver_requests_total',
-    'Total number of API server requests',
-    ['path', 'method', 'status'],
-)
-
-# Time spent processing API server requests, grouped by path, method, and
-# status.
-SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
-    'sky_apiserver_request_duration_seconds',
-    'Time spent processing API server requests',
-    ['path', 'method', 'status'],
-    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
-             60.0, 120.0, float('inf')),
-)
-
-# Time spent processing a piece of code, refer to time_it().
-SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
-    'sky_apiserver_code_duration_seconds',
-    'Time spent processing code',
-    ['name', 'group'],
-    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
-             60.0, 120.0, float('inf')),
-)
-
-SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
-    'sky_apiserver_event_loop_lag_seconds',
-    'Scheduling delay of the server event loop',
-    ['pid'],
-    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
-             60.0, float('inf')),
-)
-
-SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
-    'sky_apiserver_websocket_connections',
-    'Number of websocket connections',
-    ['pid'],
-    multiprocess_mode='livesum',
-)
-
-SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
-    'sky_apiserver_websocket_closed_total',
-    'Number of websocket closed',
-    ['pid', 'reason'],
-)
-
-# The number of execution starts in each worker process, we do not record
-# histogram here as the duration has been measured in
-# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
-# Recording histogram WITH worker label will cause high cardinality.
-SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
-    'sky_apiserver_process_execution_start_total',
-    'Total number of execution starts in each worker process',
-    ['request', 'pid'],
-)
-
-SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
-    'sky_apiserver_process_peak_rss',
-    'Peak RSS we saw in each process in last 30 seconds',
-    ['pid', 'type'],
-)
-
-SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
-    'sky_apiserver_process_cpu_total',
-    'Total CPU times a worker process has been running',
-    ['pid', 'type', 'mode'],
-)
-
-SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
-    'sky_apiserver_request_memory_usage_bytes',
-    'Peak memory usage of requests', ['name'],
-    buckets=_MEM_BUCKETS)
-
-SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
-    'sky_apiserver_request_rss_incr_bytes',
-    'RSS increment after requests', ['name'],
-    buckets=_MEM_BUCKETS)
-
 metrics_app = fastapi.FastAPI()
 
 
@@ -139,6 +39,42 @@ async def metrics() -> fastapi.Response:
                             headers={'Cache-Control': 'no-cache'})
 
 
+@metrics_app.get('/gpu-metrics')
+async def gpu_metrics() -> fastapi.Response:
+    """Gets the GPU metrics from multiple external k8s clusters"""
+    contexts = core.get_all_contexts()
+    all_metrics: List[str] = []
+    successful_contexts = 0
+
+    tasks = [
+        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
+        for context in contexts
+        if context != 'in-cluster'
+    ]
+
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    for i, result in enumerate(results):
+        if isinstance(result, Exception):
+            logger.error(
+                f'Failed to get metrics for context {contexts[i]}: {result}')
+        elif isinstance(result, BaseException):
+            # Avoid changing behavior for non-Exception BaseExceptions
+            # like KeyboardInterrupt/SystemExit: re-raise them.
+            raise result
+        else:
+            metrics_text = result
+            all_metrics.append(metrics_text)
+            successful_contexts += 1
+
+    combined_metrics = '\n\n'.join(all_metrics)
+
+    # Return as plain text for Prometheus compatibility
+    return fastapi.Response(
+        content=combined_metrics,
+        media_type='text/plain; version=0.0.4; charset=utf-8')
+
+
 def build_metrics_server(host: str, port: int) -> uvicorn.Server:
     metrics_config = uvicorn.Config(
         'sky.server.metrics:metrics_app',
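
Note: a hedged sketch of scraping the new endpoint. The host and port below are assumptions, not taken from the diff; they depend on how build_metrics_server() is bound in a given deployment.

# Hedged sketch: fetch the aggregated text exposition from /gpu-metrics.
import httpx

resp = httpx.get('http://localhost:9090/gpu-metrics', timeout=60)
resp.raise_for_status()
for line in resp.text.splitlines():
    if line.startswith('DCGM_'):
        print(line)
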
@@ -182,61 +118,17 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
             status_code_group = '5xx'
             raise
         finally:
-            SKY_APISERVER_REQUESTS_TOTAL.labels(path=path,
-                                                method=method,
-                                                status=status_code_group).inc()
+            metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
+                path=path, method=method, status=status_code_group).inc()
             if not streaming:
                 duration = time.time() - start_time
-                SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
+                metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
                     path=path, method=method,
                     status=status_code_group).observe(duration)
 
         return response
 
 
-@contextlib.contextmanager
-def time_it(name: str, group: str = 'default'):
-    """Context manager to measure and record code execution duration."""
-    if not METRICS_ENABLED:
-        yield
-    else:
-        start_time = time.time()
-        try:
-            yield
-        finally:
-            duration = time.time() - start_time
-            SKY_APISERVER_CODE_DURATION_SECONDS.labels(
-                name=name, group=group).observe(duration)
-
-
-def time_me(func):
-    """Measure the duration of decorated function."""
-
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        if not METRICS_ENABLED:
-            return func(*args, **kwargs)
-        name = f'{func.__module__}/{func.__name__}'
-        with time_it(name, group='function'):
-            return func(*args, **kwargs)
-
-    return wrapper
-
-
-def time_me_async(func):
-    """Measure the duration of decorated async function."""
-
-    @functools.wraps(func)
-    async def async_wrapper(*args, **kwargs):
-        if not METRICS_ENABLED:
-            return await func(*args, **kwargs)
-        name = f'{func.__module__}/{func.__name__}'
-        with time_it(name, group='function'):
-            return await func(*args, **kwargs)
-
-    return async_wrapper
-
-
 peak_rss_bytes = 0
 
 
@@ -252,13 +144,15 @@ def process_monitor(process_type: str, stop: threading.Event):
             last_bucket_end = time.time()
             bucket_peak = 0
         peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
-        SKY_APISERVER_PROCESS_PEAK_RSS.labels(
+        metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
            pid=pid, type=process_type).set(peak_rss_bytes)
         ctimes = proc.cpu_times()
-        SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
-                                               type=process_type,
-                                               mode='user').set(ctimes.user)
-        SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
-                                               type=process_type,
-                                               mode='system').set(ctimes.system)
+        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+                                                             type=process_type,
+                                                             mode='user').set(
+                                                                 ctimes.user)
+        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+                                                             type=process_type,
+                                                             mode='system').set(
+                                                                 ctimes.system)
         time.sleep(1)
sky/server/requests/executor.py CHANGED
@@ -39,6 +39,7 @@ from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
+from sky.metrics import utils as metrics_utils
 from sky.server import common as server_common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
@@ -422,10 +423,10 @@ def _request_execution_wrapper(request_id: str,
         config = skypilot_config.to_dict()
         logger.debug(f'request config: \n'
                      f'{yaml_utils.dump_yaml_str(dict(config))}')
-        metrics_lib.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.labels(
-            request=request_name, pid=pid).inc()
-        with metrics_lib.time_it(name=request_name,
-                                 group='request_execution'):
+        (metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
+         labels(request=request_name, pid=pid).inc())
+        with metrics_utils.time_it(name=request_name,
+                                   group='request_execution'):
             return_value = func(**request_body.to_kwargs())
         f.flush()
     except KeyboardInterrupt:
@@ -468,8 +469,8 @@ def _request_execution_wrapper(request_id: str,
         # Clear request level cache to release all memory used by
         # the request.
         annotations.clear_request_level_cache()
-        with metrics_lib.time_it(name='release_memory',
-                                 group='internal'):
+        with metrics_utils.time_it(name='release_memory',
+                                   group='internal'):
             common_utils.release_memory()
         _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
     except Exception as e:  # pylint: disable=broad-except
@@ -493,11 +494,11 @@ def _record_memory_metrics(request_name: str, proc: psutil.Process,
     rss_end = proc.memory_info().rss
 
     # Answer "how much RSS this request contributed?"
-    metrics_lib.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
+    metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
         name=request_name).observe(max(rss_end - rss_begin, 0))
     # Estimate the memory usage by the request by capturing the
     # peak memory delta during the request execution.
-    metrics_lib.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
+    metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
         name=request_name).observe(max(peak_rss - rss_begin, 0))
 
 
sky/server/requests/payloads.py CHANGED
@@ -792,6 +792,12 @@ class GetConfigBody(RequestBody):
 class CostReportBody(RequestBody):
     """The request body for the cost report endpoint."""
     days: Optional[int] = 30
+    # we use hashes instead of names to avoid the case where
+    # the name is not unique
+    cluster_hashes: Optional[List[str]] = None
+    # Only return fields that are needed for the dashboard
+    # summary page
+    dashboard_summary_response: bool = False
 
 
 class RequestPayload(BasePayload):
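
Note: the new CostReportBody fields all have defaults, so existing clients remain compatible. A standalone sketch that only mirrors the fields shown above; it deliberately does not import the real class, whose base-class fields are not visible in this diff.

# Standalone sketch mirroring the new fields; not the real CostReportBody.
from typing import List, Optional

import pydantic


class CostReportBodySketch(pydantic.BaseModel):
    days: Optional[int] = 30
    # Cluster hashes are used instead of names, since names may collide.
    cluster_hashes: Optional[List[str]] = None
    # Trim the response to what the dashboard summary page needs.
    dashboard_summary_response: bool = False


body = CostReportBodySketch(cluster_hashes=['abc123'],
                            dashboard_summary_response=True)
print(body)
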