skypilot-nightly 1.0.0.dev20250918__py3-none-any.whl → 1.0.0.dev20250919__py3-none-any.whl

This diff shows the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.

Potentially problematic release: the registry flags this version of skypilot-nightly as possibly problematic; see the registry listing for details.

Files changed (48)
  1. sky/__init__.py +2 -2
  2. sky/core.py +67 -45
  3. sky/dashboard/out/404.html +1 -1
  4. sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → VvaUqYDvHOcHZRnvMBmax}/_buildManifest.js +1 -1
  5. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +1 -0
  6. sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +1 -0
  8. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +16 -0
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0b4b35dc1dfe046c.js → [cluster]-9525660179df3605.js} +1 -1
  10. sky/dashboard/out/_next/static/chunks/{webpack-487697b47d8c5e50.js → webpack-b2a3938c22b6647b.js} +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/dashboard/out/users.html +1 -1
  22. sky/dashboard/out/volumes.html +1 -1
  23. sky/dashboard/out/workspace/new.html +1 -1
  24. sky/dashboard/out/workspaces/[name].html +1 -1
  25. sky/dashboard/out/workspaces.html +1 -1
  26. sky/global_user_state.py +83 -54
  27. sky/metrics/utils.py +174 -8
  28. sky/schemas/generated/jobsv1_pb2.py +40 -40
  29. sky/server/metrics.py +52 -158
  30. sky/server/requests/executor.py +9 -8
  31. sky/server/requests/payloads.py +6 -0
  32. sky/server/requests/requests.py +1 -1
  33. sky/server/requests/serializers/encoders.py +3 -2
  34. sky/server/server.py +5 -41
  35. sky/skylet/constants.py +6 -3
  36. sky/skylet/job_lib.py +14 -15
  37. sky/utils/locks.py +41 -10
  38. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/METADATA +32 -32
  39. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/RECORD +44 -44
  40. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  41. sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +0 -1
  42. sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +0 -1
  43. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  44. /sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → VvaUqYDvHOcHZRnvMBmax}/_ssgManifest.js +0 -0
  45. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/WHEEL +0 -0
  46. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/entry_points.txt +0 -0
  47. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/licenses/LICENSE +0 -0
  48. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/top_level.txt +0 -0
sky/metrics/utils.py CHANGED
@@ -1,11 +1,165 @@
 """Utilities for processing GPU metrics from Kubernetes clusters."""
+import contextlib
+import functools
 import os
 import re
+import select
 import subprocess
 import time
 from typing import List, Optional, Tuple
 
 import httpx
+import prometheus_client as prom
+
+from sky.skylet import constants
+from sky.utils import context_utils
+
+_SELECT_TIMEOUT = 1
+_SELECT_BUFFER_SIZE = 4096
+
+_KB = 2**10
+_MB = 2**20
+_MEM_BUCKETS = [
+    _KB,
+    256 * _KB,
+    512 * _KB,
+    _MB,
+    2 * _MB,
+    4 * _MB,
+    8 * _MB,
+    16 * _MB,
+    32 * _MB,
+    64 * _MB,
+    128 * _MB,
+    256 * _MB,
+    float('inf'),
+]
+
+# Whether the metrics are enabled, cannot be changed at runtime.
+METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
+                                 'false').lower() == 'true'
+
+# Time spent processing a piece of code, refer to time_it().
+SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
+    'sky_apiserver_code_duration_seconds',
+    'Time spent processing code',
+    ['name', 'group'],
+    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
+             60.0, 120.0, float('inf')),
+)
+
+# Total number of API server requests, grouped by path, method, and status.
+SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
+    'sky_apiserver_requests_total',
+    'Total number of API server requests',
+    ['path', 'method', 'status'],
+)
+
+# Time spent processing API server requests, grouped by path, method, and
+# status.
+SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
+    'sky_apiserver_request_duration_seconds',
+    'Time spent processing API server requests',
+    ['path', 'method', 'status'],
+    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
+             60.0, 120.0, float('inf')),
+)
+
+SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
+    'sky_apiserver_event_loop_lag_seconds',
+    'Scheduling delay of the server event loop',
+    ['pid'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
+             60.0, float('inf')),
+)
+
+SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
+    'sky_apiserver_websocket_connections',
+    'Number of websocket connections',
+    ['pid'],
+    multiprocess_mode='livesum',
+)
+
+SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
+    'sky_apiserver_websocket_closed_total',
+    'Number of websocket closed',
+    ['pid', 'reason'],
+)
+
+# The number of execution starts in each worker process, we do not record
+# histogram here as the duration has been measured in
+# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
+# Recording histogram WITH worker label will cause high cardinality.
+SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
+    'sky_apiserver_process_execution_start_total',
+    'Total number of execution starts in each worker process',
+    ['request', 'pid'],
+)
+
+SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
+    'sky_apiserver_process_peak_rss',
+    'Peak RSS we saw in each process in last 30 seconds',
+    ['pid', 'type'],
+)
+
+SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
+    'sky_apiserver_process_cpu_total',
+    'Total CPU times a worker process has been running',
+    ['pid', 'type', 'mode'],
+)
+
+SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
+    'sky_apiserver_request_memory_usage_bytes',
+    'Peak memory usage of requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
+    'sky_apiserver_request_rss_incr_bytes',
+    'RSS increment after requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+
+@contextlib.contextmanager
+def time_it(name: str, group: str = 'default'):
+    """Context manager to measure and record code execution duration."""
+    if not METRICS_ENABLED:
+        yield
+    else:
+        start_time = time.time()
+        try:
+            yield
+        finally:
+            duration = time.time() - start_time
+            SKY_APISERVER_CODE_DURATION_SECONDS.labels(
+                name=name, group=group).observe(duration)
+
+
+def time_me(func):
+    """Measure the duration of decorated function."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        if not METRICS_ENABLED:
+            return func(*args, **kwargs)
+        name = f'{func.__module__}/{func.__name__}'
+        with time_it(name, group='function'):
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+def time_me_async(func):
+    """Measure the duration of decorated async function."""
+
+    @functools.wraps(func)
+    async def async_wrapper(*args, **kwargs):
+        if not METRICS_ENABLED:
+            return await func(*args, **kwargs)
+        name = f'{func.__module__}/{func.__name__}'
+        with time_it(name, group='function'):
+            return await func(*args, **kwargs)
+
+    return async_wrapper
 
 
 def start_svc_port_forward(context: str, namespace: str, service: str,
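
Note: a minimal sketch of how the relocated timing helpers are meant to be consumed. The function names below are hypothetical, and timings are only recorded when metrics are enabled via the environment variable referenced by constants.ENV_VAR_SERVER_METRICS_ENABLED.

# Illustrative sketch only; the decorated/timed functions are made up.
import time

from sky.metrics import utils as metrics_utils


@metrics_utils.time_me
def refresh_records():
    # Observed as '<module>/refresh_records' in the 'function' group
    # when metrics are enabled; a plain call otherwise.
    time.sleep(0.1)


def handle_request():
    # Time an arbitrary block under an explicit name and group.
    with metrics_utils.time_it(name='db_query', group='request_execution'):
        time.sleep(0.05)
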
@@ -44,6 +198,7 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     local_port = None
     start_time = time.time()
 
+    buffer = ''
     # wait for the port forward to start and extract the local port
     while time.time() - start_time < start_port_forward_timeout:
         if port_forward_process.poll() is not None:
@@ -56,10 +211,16 @@
 
         # read output line by line to find the local port
         if port_forward_process.stdout:
-            line = port_forward_process.stdout.readline()
-            if line:
-                # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
-                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
+            # Wait up to 1s for data to be available without blocking
+            r, _, _ = select.select([port_forward_process.stdout], [], [],
+                                    _SELECT_TIMEOUT)
+            if r:
+                # Read available bytes from the FD without blocking
+                fd = port_forward_process.stdout.fileno()
+                raw = os.read(fd, _SELECT_BUFFER_SIZE)
+                chunk = raw.decode(errors='ignore')
+                buffer += chunk
+                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
                 if match:
                     local_port = int(match.group(1))
                     break
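
Note: the hunk above replaces a blocking readline() with a select()-based poll and accumulates raw chunks in a buffer, so a "Forwarding from ..." line split across reads is still matched. A standalone, POSIX-only sketch of the same pattern; `echo` stands in for `kubectl port-forward`, and in the real code the poll is repeated inside a timeout loop.

# Standalone sketch of the non-blocking read pattern used above.
import os
import re
import select
import subprocess

proc = subprocess.Popen(['echo', 'Forwarding from 127.0.0.1:54321 -> 9090'],
                        stdout=subprocess.PIPE)
buffer = ''
# Wait up to 1 second for output without blocking on a full line.
readable, _, _ = select.select([proc.stdout], [], [], 1)
if readable:
    buffer += os.read(proc.stdout.fileno(), 4096).decode(errors='ignore')
match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
print(match.group(1) if match else 'no port yet')
proc.wait()
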
@@ -122,8 +283,8 @@ async def send_metrics_request_with_port_forward(
     port_forward_process = None
     try:
         # Start port forward
-        port_forward_process, local_port = start_svc_port_forward(
-            context, namespace, service, service_port)
+        port_forward_process, local_port = await context_utils.to_thread(
+            start_svc_port_forward, context, namespace, service, service_port)
 
         # Build endpoint URL
         endpoint = f'http://localhost:{local_port}{endpoint_path}'
@@ -143,7 +304,8 @@
     finally:
         # Always clean up port forward
         if port_forward_process:
-            stop_svc_port_forward(port_forward_process)
+            await context_utils.to_thread(stop_svc_port_forward,
+                                          port_forward_process)
 
 
 async def add_cluster_name_label(metrics_text: str, context: str) -> str:
@@ -193,7 +355,11 @@ async def get_metrics_for_context(context: str) -> str:
     """
     # Query both DCGM metrics and kube_pod_labels metrics
    # This ensures the dashboard can perform joins to filter by skypilot cluster
-    match_patterns = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']
+    match_patterns = [
+        '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}',  # pylint: disable=line-too-long
+        'kube_pod_labels',
+        'node_cpu_seconds_total{mode="idle"}'
+    ]
 
     # TODO(rohan): don't hardcode the namespace and service name
     metrics_text = await send_metrics_request_with_port_forward(
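
Note: the selectors added above are PromQL series selectors of the kind a Prometheus /federate endpoint accepts as repeated match[] parameters; whether this code path actually targets /federate is not shown in the diff. A hedged sketch of such a query, assuming a Prometheus server reachable at localhost:9090:

# Hedged sketch: sending series selectors as repeated match[] parameters.
import httpx

match_patterns = [
    '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}',
    'kube_pod_labels',
    'node_cpu_seconds_total{mode="idle"}',
]
params = [('match[]', pattern) for pattern in match_patterns]
resp = httpx.get('http://localhost:9090/federate', params=params, timeout=30)
print(resp.text[:200])
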
sky/schemas/generated/jobsv1_pb2.py CHANGED
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
 
 
 
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xf4\x01\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x10\n\x08start_at\x18\x07 \x01(\x01\x12\x0e\n\x06\x65nd_at\x18\x08 \x01(\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x0b\n\x03pid\x18\n \x01(\x03\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\t\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 
\x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 
\x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -25,8 +25,8 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_options = b'8\001'
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._loaded_options = None
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_options = b'8\001'
-  _globals['_JOBSTATUS']._serialized_start=2138
-  _globals['_JOBSTATUS']._serialized_end=2407
+  _globals['_JOBSTATUS']._serialized_start=2185
+  _globals['_JOBSTATUS']._serialized_end=2454
   _globals['_ADDJOBREQUEST']._serialized_start=48
   _globals['_ADDJOBREQUEST']._serialized_end=181
   _globals['_ADDJOBRESPONSE']._serialized_start=183
@@ -46,41 +46,41 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETJOBQUEUEREQUEST']._serialized_start=718
   _globals['_GETJOBQUEUEREQUEST']._serialized_end=794
   _globals['_JOBINFO']._serialized_start=797
-  _globals['_JOBINFO']._serialized_end=1041
-  _globals['_GETJOBQUEUERESPONSE']._serialized_start=1043
-  _globals['_GETJOBQUEUERESPONSE']._serialized_end=1096
-  _globals['_CANCELJOBSREQUEST']._serialized_start=1098
-  _globals['_CANCELJOBSREQUEST']._serialized_end=1192
-  _globals['_CANCELJOBSRESPONSE']._serialized_start=1194
-  _globals['_CANCELJOBSRESPONSE']._serialized_end=1241
-  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1243
-  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1273
-  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1275
-  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1306
-  _globals['_TAILLOGSREQUEST']._serialized_start=1308
-  _globals['_TAILLOGSREQUEST']._serialized_end=1435
-  _globals['_TAILLOGSRESPONSE']._serialized_start=1437
-  _globals['_TAILLOGSRESPONSE']._serialized_end=1492
-  _globals['_GETJOBSTATUSREQUEST']._serialized_start=1494
-  _globals['_GETJOBSTATUSREQUEST']._serialized_end=1532
-  _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1535
-  _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1699
-  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1629
-  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1699
-  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1701
-  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1766
-  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1768
-  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1821
-  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1823
-  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1884
-  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1886
-  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=1935
-  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=1937
-  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=1980
-  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=1983
-  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2135
-  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2086
-  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2135
-  _globals['_JOBSSERVICE']._serialized_start=2410
-  _globals['_JOBSSERVICE']._serialized_end=3323
+  _globals['_JOBINFO']._serialized_end=1088
+  _globals['_GETJOBQUEUERESPONSE']._serialized_start=1090
+  _globals['_GETJOBQUEUERESPONSE']._serialized_end=1143
+  _globals['_CANCELJOBSREQUEST']._serialized_start=1145
+  _globals['_CANCELJOBSREQUEST']._serialized_end=1239
+  _globals['_CANCELJOBSRESPONSE']._serialized_start=1241
+  _globals['_CANCELJOBSRESPONSE']._serialized_end=1288
+  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1290
+  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1320
+  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1322
+  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1353
+  _globals['_TAILLOGSREQUEST']._serialized_start=1355
+  _globals['_TAILLOGSREQUEST']._serialized_end=1482
+  _globals['_TAILLOGSRESPONSE']._serialized_start=1484
+  _globals['_TAILLOGSRESPONSE']._serialized_end=1539
+  _globals['_GETJOBSTATUSREQUEST']._serialized_start=1541
+  _globals['_GETJOBSTATUSREQUEST']._serialized_end=1579
+  _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1582
+  _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1746
+  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1676
+  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1746
+  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1748
+  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1813
+  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1815
+  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1868
+  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1870
+  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1931
+  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1933
+  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=1982
+  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=1984
+  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=2027
+  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=2030
+  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2182
+  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2133
+  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2182
+  _globals['_JOBSSERVICE']._serialized_start=2457
+  _globals['_JOBSSERVICE']._serialized_end=3370
 # @@protoc_insertion_point(module_scope)
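
Note: the regenerated descriptor mostly shifts serialized offsets; the substantive schema change is that JobInfo's start_at, end_at, and pid fields are now declared optional (note the new _start_at/_end_at/_pid presence wrappers in the serialized blob), so "unset" can be distinguished from 0. A hedged sketch of a presence check on the regenerated message:

# Hedged sketch: presence checks on proto3 `optional` fields of JobInfo.
from sky.schemas.generated import jobsv1_pb2

job = jobsv1_pb2.JobInfo(job_id=1, job_name='train')
assert not job.HasField('start_at')  # unset, not merely 0.0
job.start_at = 1726700000.0
assert job.HasField('start_at')
job.ClearField('start_at')
assert not job.HasField('start_at')
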
sky/server/metrics.py CHANGED
@@ -1,11 +1,11 @@
 """Instrumentation for the API server."""
 
-import contextlib
-import functools
+import asyncio
 import multiprocessing
 import os
 import threading
 import time
+from typing import List
 
 import fastapi
 from prometheus_client import generate_latest
@@ -15,112 +15,12 @@ import psutil
 import starlette.middleware.base
 import uvicorn
 
+from sky import core
 from sky import sky_logging
-from sky.skylet import constants
-
-# Whether the metrics are enabled, cannot be changed at runtime.
-METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
-                                 'false').lower() == 'true'
-
-_KB = 2**10
-_MB = 2**20
-_MEM_BUCKETS = [
-    _KB,
-    256 * _KB,
-    512 * _KB,
-    _MB,
-    2 * _MB,
-    4 * _MB,
-    8 * _MB,
-    16 * _MB,
-    32 * _MB,
-    64 * _MB,
-    128 * _MB,
-    256 * _MB,
-    float('inf'),
-]
+from sky.metrics import utils as metrics_utils
 
 logger = sky_logging.init_logger(__name__)
 
-# Total number of API server requests, grouped by path, method, and status.
-SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
-    'sky_apiserver_requests_total',
-    'Total number of API server requests',
-    ['path', 'method', 'status'],
-)
-
-# Time spent processing API server requests, grouped by path, method, and
-# status.
-SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
-    'sky_apiserver_request_duration_seconds',
-    'Time spent processing API server requests',
-    ['path', 'method', 'status'],
-    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
-             60.0, 120.0, float('inf')),
-)
-
-# Time spent processing a piece of code, refer to time_it().
-SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
-    'sky_apiserver_code_duration_seconds',
-    'Time spent processing code',
-    ['name', 'group'],
-    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
-             60.0, 120.0, float('inf')),
-)
-
-SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
-    'sky_apiserver_event_loop_lag_seconds',
-    'Scheduling delay of the server event loop',
-    ['pid'],
-    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
-             60.0, float('inf')),
-)
-
-SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
-    'sky_apiserver_websocket_connections',
-    'Number of websocket connections',
-    ['pid'],
-    multiprocess_mode='livesum',
-)
-
-SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
-    'sky_apiserver_websocket_closed_total',
-    'Number of websocket closed',
-    ['pid', 'reason'],
-)
-
-# The number of execution starts in each worker process, we do not record
-# histogram here as the duration has been measured in
-# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
-# Recording histogram WITH worker label will cause high cardinality.
-SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
-    'sky_apiserver_process_execution_start_total',
-    'Total number of execution starts in each worker process',
-    ['request', 'pid'],
-)
-
-SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
-    'sky_apiserver_process_peak_rss',
-    'Peak RSS we saw in each process in last 30 seconds',
-    ['pid', 'type'],
-)
-
-SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
-    'sky_apiserver_process_cpu_total',
-    'Total CPU times a worker process has been running',
-    ['pid', 'type', 'mode'],
-)
-
-SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
-    'sky_apiserver_request_memory_usage_bytes',
-    'Peak memory usage of requests', ['name'],
-    buckets=_MEM_BUCKETS)
-
-SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
-    'sky_apiserver_request_rss_incr_bytes',
-    'RSS increment after requests', ['name'],
-    buckets=_MEM_BUCKETS)
-
 metrics_app = fastapi.FastAPI()
 
 
@@ -139,6 +39,42 @@ async def metrics() -> fastapi.Response:
                             headers={'Cache-Control': 'no-cache'})
 
 
+@metrics_app.get('/gpu-metrics')
+async def gpu_metrics() -> fastapi.Response:
+    """Gets the GPU metrics from multiple external k8s clusters"""
+    contexts = core.get_all_contexts()
+    all_metrics: List[str] = []
+    successful_contexts = 0
+
+    tasks = [
+        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
+        for context in contexts
+        if context != 'in-cluster'
+    ]
+
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    for i, result in enumerate(results):
+        if isinstance(result, Exception):
+            logger.error(
+                f'Failed to get metrics for context {contexts[i]}: {result}')
+        elif isinstance(result, BaseException):
+            # Avoid changing behavior for non-Exception BaseExceptions
+            # like KeyboardInterrupt/SystemExit: re-raise them.
+            raise result
+        else:
+            metrics_text = result
+            all_metrics.append(metrics_text)
+            successful_contexts += 1
+
+    combined_metrics = '\n\n'.join(all_metrics)
+
+    # Return as plain text for Prometheus compatibility
+    return fastapi.Response(
+        content=combined_metrics,
+        media_type='text/plain; version=0.0.4; charset=utf-8')
+
+
 def build_metrics_server(host: str, port: int) -> uvicorn.Server:
     metrics_config = uvicorn.Config(
         'sky.server.metrics:metrics_app',
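
Note: a hedged sketch of scraping the new endpoint. The host and port below are assumptions, not taken from the diff; they depend on how build_metrics_server() is bound in a given deployment.

# Hedged sketch: fetch the aggregated text exposition from /gpu-metrics.
import httpx

resp = httpx.get('http://localhost:9090/gpu-metrics', timeout=60)
resp.raise_for_status()
for line in resp.text.splitlines():
    if line.startswith('DCGM_'):
        print(line)
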
@@ -182,61 +118,17 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
             status_code_group = '5xx'
             raise
         finally:
-            SKY_APISERVER_REQUESTS_TOTAL.labels(path=path,
-                                                method=method,
-                                                status=status_code_group).inc()
+            metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
+                path=path, method=method, status=status_code_group).inc()
             if not streaming:
                 duration = time.time() - start_time
-                SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
+                metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
                     path=path, method=method,
                     status=status_code_group).observe(duration)
 
         return response
 
 
-@contextlib.contextmanager
-def time_it(name: str, group: str = 'default'):
-    """Context manager to measure and record code execution duration."""
-    if not METRICS_ENABLED:
-        yield
-    else:
-        start_time = time.time()
-        try:
-            yield
-        finally:
-            duration = time.time() - start_time
-            SKY_APISERVER_CODE_DURATION_SECONDS.labels(
-                name=name, group=group).observe(duration)
-
-
-def time_me(func):
-    """Measure the duration of decorated function."""
-
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        if not METRICS_ENABLED:
-            return func(*args, **kwargs)
-        name = f'{func.__module__}/{func.__name__}'
-        with time_it(name, group='function'):
-            return func(*args, **kwargs)
-
-    return wrapper
-
-
-def time_me_async(func):
-    """Measure the duration of decorated async function."""
-
-    @functools.wraps(func)
-    async def async_wrapper(*args, **kwargs):
-        if not METRICS_ENABLED:
-            return await func(*args, **kwargs)
-        name = f'{func.__module__}/{func.__name__}'
-        with time_it(name, group='function'):
-            return await func(*args, **kwargs)
-
-    return async_wrapper
-
-
 peak_rss_bytes = 0
 
 
@@ -252,13 +144,15 @@ def process_monitor(process_type: str, stop: threading.Event):
             last_bucket_end = time.time()
             bucket_peak = 0
         peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
-        SKY_APISERVER_PROCESS_PEAK_RSS.labels(
+        metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
            pid=pid, type=process_type).set(peak_rss_bytes)
         ctimes = proc.cpu_times()
-        SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
-                                               type=process_type,
-                                               mode='user').set(ctimes.user)
-        SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
-                                               type=process_type,
-                                               mode='system').set(ctimes.system)
+        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+                                                             type=process_type,
+                                                             mode='user').set(
+                                                                 ctimes.user)
+        metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+                                                             type=process_type,
+                                                             mode='system').set(
+                                                                 ctimes.system)
         time.sleep(1)
sky/server/requests/executor.py CHANGED
@@ -39,6 +39,7 @@ from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
+from sky.metrics import utils as metrics_utils
 from sky.server import common as server_common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
@@ -422,10 +423,10 @@ def _request_execution_wrapper(request_id: str,
         config = skypilot_config.to_dict()
         logger.debug(f'request config: \n'
                      f'{yaml_utils.dump_yaml_str(dict(config))}')
-        metrics_lib.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.labels(
-            request=request_name, pid=pid).inc()
-        with metrics_lib.time_it(name=request_name,
-                                 group='request_execution'):
+        (metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
+         labels(request=request_name, pid=pid).inc())
+        with metrics_utils.time_it(name=request_name,
+                                   group='request_execution'):
             return_value = func(**request_body.to_kwargs())
         f.flush()
     except KeyboardInterrupt:
@@ -468,8 +469,8 @@ def _request_execution_wrapper(request_id: str,
         # Clear request level cache to release all memory used by
         # the request.
         annotations.clear_request_level_cache()
-        with metrics_lib.time_it(name='release_memory',
-                                 group='internal'):
+        with metrics_utils.time_it(name='release_memory',
+                                   group='internal'):
             common_utils.release_memory()
         _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
     except Exception as e:  # pylint: disable=broad-except
@@ -493,11 +494,11 @@ def _record_memory_metrics(request_name: str, proc: psutil.Process,
     rss_end = proc.memory_info().rss
 
     # Answer "how much RSS this request contributed?"
-    metrics_lib.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
+    metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
         name=request_name).observe(max(rss_end - rss_begin, 0))
     # Estimate the memory usage by the request by capturing the
     # peak memory delta during the request execution.
-    metrics_lib.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
+    metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
         name=request_name).observe(max(peak_rss - rss_begin, 0))
 
 
sky/server/requests/payloads.py CHANGED
@@ -792,6 +792,12 @@ class GetConfigBody(RequestBody):
 class CostReportBody(RequestBody):
     """The request body for the cost report endpoint."""
     days: Optional[int] = 30
+    # we use hashes instead of names to avoid the case where
+    # the name is not unique
+    cluster_hashes: Optional[List[str]] = None
+    # Only return fields that are needed for the dashboard
+    # summary page
+    dashboard_summary_response: bool = False
 
 
 class RequestPayload(BasePayload):
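
Note: the new CostReportBody fields all have defaults, so existing clients remain compatible. A standalone sketch that only mirrors the fields shown above; it deliberately does not import the real class, whose base-class fields are not visible in this diff.

# Standalone sketch mirroring the new fields; not the real CostReportBody.
from typing import List, Optional

import pydantic


class CostReportBodySketch(pydantic.BaseModel):
    days: Optional[int] = 30
    # Cluster hashes are used instead of names, since names may collide.
    cluster_hashes: Optional[List[str]] = None
    # Trim the response to what the dashboard summary page needs.
    dashboard_summary_response: bool = False


body = CostReportBodySketch(cluster_hashes=['abc123'],
                            dashboard_summary_response=True)
print(body)
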