skypilot-nightly 1.0.0.dev20250918__py3-none-any.whl → 1.0.0.dev20250922__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.
Files changed (52)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +12 -15
  3. sky/core.py +67 -45
  4. sky/dashboard/out/404.html +1 -1
  5. sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → KP6HCNMqb_bnJB17oplgW}/_buildManifest.js +1 -1
  6. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +1 -0
  8. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +1 -0
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +16 -0
  10. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0b4b35dc1dfe046c.js → [cluster]-9525660179df3605.js} +1 -1
  11. sky/dashboard/out/_next/static/chunks/{webpack-487697b47d8c5e50.js → webpack-26167a9e6d91fa51.js} +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  21. sky/dashboard/out/jobs.html +1 -1
  22. sky/dashboard/out/users.html +1 -1
  23. sky/dashboard/out/volumes.html +1 -1
  24. sky/dashboard/out/workspace/new.html +1 -1
  25. sky/dashboard/out/workspaces/[name].html +1 -1
  26. sky/dashboard/out/workspaces.html +1 -1
  27. sky/global_user_state.py +90 -56
  28. sky/metrics/utils.py +174 -8
  29. sky/schemas/generated/jobsv1_pb2.py +40 -40
  30. sky/serve/serve_utils.py +0 -4
  31. sky/server/auth/oauth2_proxy.py +2 -2
  32. sky/server/metrics.py +52 -158
  33. sky/server/requests/executor.py +9 -8
  34. sky/server/requests/payloads.py +6 -0
  35. sky/server/requests/requests.py +1 -1
  36. sky/server/requests/serializers/encoders.py +3 -2
  37. sky/server/server.py +5 -41
  38. sky/setup_files/dependencies.py +8 -1
  39. sky/skylet/constants.py +6 -4
  40. sky/skylet/job_lib.py +14 -15
  41. sky/utils/locks.py +41 -10
  42. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/METADATA +35 -35
  43. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/RECORD +48 -48
  44. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  45. sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +0 -1
  46. sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +0 -1
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  48. /sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → KP6HCNMqb_bnJB17oplgW}/_ssgManifest.js +0 -0
  49. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/WHEEL +0 -0
  50. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/entry_points.txt +0 -0
  51. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/licenses/LICENSE +0 -0
  52. {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -29,7 +29,7 @@ from sqlalchemy.ext import declarative
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
-from sky.server import metrics as metrics_lib
+from sky.metrics import utils as metrics_lib
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import context_utils
@@ -1238,14 +1238,15 @@ def _get_cluster_usage_intervals(
 
 
 def _get_cluster_launch_time(
-        usage_intervals: List[Tuple[int, Optional[int]]]) -> Optional[int]:
+        usage_intervals: Optional[List[Tuple[int,
+                                             Optional[int]]]]) -> Optional[int]:
     if usage_intervals is None:
         return None
     return usage_intervals[0][0]
 
 
 def _get_cluster_duration(
-        usage_intervals: List[Tuple[int, Optional[int]]]) -> int:
+        usage_intervals: Optional[List[Tuple[int, Optional[int]]]]) -> int:
     total_duration = 0
 
     if usage_intervals is None:
@@ -1408,6 +1409,7 @@ def get_clusters(
     exclude_managed_clusters: bool = False,
     workspaces_filter: Optional[Set[str]] = None,
     user_hashes_filter: Optional[Set[str]] = None,
+    cluster_names: Optional[List[str]] = None,
 ) -> List[Dict[str, Any]]:
     """Get clusters from the database.
 
@@ -1418,6 +1420,8 @@ def get_clusters(
             that has workspace field set to one of the values.
         user_hashes_filter: If specified, only include clusters
             that has user_hash field set to one of the values.
+        cluster_names: If specified, only include clusters
+            that has name field set to one of the values.
     """
     # is a cluster has a null user_hash,
     # we treat it as belonging to the current user.
@@ -1436,11 +1440,13 @@ def get_clusters(
                 # If current_user_hash is in user_hashes_filter, we include
                 # clusters that have a null user_hash.
                 query = query.filter(
-                    cluster_table.c.user_hash.in_(user_hashes_filter) |
-                    (cluster_table.c.user_hash is None))
+                    (cluster_table.c.user_hash.in_(user_hashes_filter) |
+                     (cluster_table.c.user_hash is None)))
             else:
                 query = query.filter(
                     cluster_table.c.user_hash.in_(user_hashes_filter))
+        if cluster_names is not None:
+            query = query.filter(cluster_table.c.name.in_(cluster_names))
         query = query.order_by(sqlalchemy.desc(cluster_table.c.launched_at))
         rows = query.all()
         records = []
@@ -1500,7 +1506,9 @@ def get_clusters(
 @_init_db
 @metrics_lib.time_me
 def get_clusters_from_history(
-        days: Optional[int] = None) -> List[Dict[str, Any]]:
+        days: Optional[int] = None,
+        abbreviate_response: bool = False,
+        cluster_hashes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
     """Get cluster reports from history.
 
     Args:
@@ -1513,68 +1521,61 @@ def get_clusters_from_history(
         List of cluster records with history information.
     """
     assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        # Explicitly select columns from both tables to avoid ambiguity
-        query = session.query(
-            cluster_history_table.c.cluster_hash, cluster_history_table.c.name,
-            cluster_history_table.c.num_nodes,
-            cluster_history_table.c.requested_resources,
-            cluster_history_table.c.launched_resources,
-            cluster_history_table.c.usage_intervals,
-            cluster_history_table.c.user_hash,
-            cluster_history_table.c.last_creation_yaml,
-            cluster_history_table.c.last_creation_command,
-            cluster_history_table.c.workspace.label('history_workspace'),
-            cluster_table.c.status, cluster_table.c.workspace,
-            cluster_table.c.status_updated_at).select_from(
-                cluster_history_table.join(cluster_table,
-                                           cluster_history_table.c.cluster_hash
-                                           == cluster_table.c.cluster_hash,
-                                           isouter=True))
 
-        rows = query.all()
+    current_user_hash = common_utils.get_user_hash()
 
     # Prepare filtering parameters
     cutoff_time = None
     if days is not None:
         cutoff_time = int(time.time()) - (days * 24 * 60 * 60)
 
-    current_user_hash = common_utils.get_user_hash()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Explicitly select columns from both tables to avoid ambiguity
+        if abbreviate_response:
+            query = session.query(
+                cluster_history_table.c.cluster_hash,
+                cluster_history_table.c.name, cluster_history_table.c.num_nodes,
+                cluster_history_table.c.launched_resources,
+                cluster_history_table.c.usage_intervals,
+                cluster_history_table.c.user_hash,
+                cluster_history_table.c.workspace.label('history_workspace'),
+                cluster_table.c.status, cluster_table.c.workspace)
+        else:
+            query = session.query(
+                cluster_history_table.c.cluster_hash,
+                cluster_history_table.c.name, cluster_history_table.c.num_nodes,
+                cluster_history_table.c.launched_resources,
+                cluster_history_table.c.usage_intervals,
+                cluster_history_table.c.user_hash,
+                cluster_history_table.c.last_creation_yaml,
+                cluster_history_table.c.last_creation_command,
+                cluster_history_table.c.workspace.label('history_workspace'),
+                cluster_table.c.status, cluster_table.c.workspace)
+
+        query = query.select_from(
+            cluster_history_table.join(cluster_table,
+                                       cluster_history_table.c.cluster_hash ==
+                                       cluster_table.c.cluster_hash,
+                                       isouter=True))
+        if cluster_hashes is not None:
+            query = query.filter(
+                cluster_history_table.c.cluster_hash.in_(cluster_hashes))
+        rows = query.all()
 
-    row_to_user_hash = {}
+    filtered_rows = []
     usage_intervals_dict = {}
+    row_to_user_hash = {}
     for row in rows:
-        user_hash = (row.user_hash
-                     if row.user_hash is not None else current_user_hash)
-        row_to_user_hash[row.cluster_hash] = user_hash
+        row_usage_intervals: List[Tuple[int, Optional[int]]] = []
         if row.usage_intervals:
             try:
-                usage_intervals_dict[row.cluster_hash] = pickle.loads(
-                    row.usage_intervals)
+                row_usage_intervals = pickle.loads(row.usage_intervals)
             except (pickle.PickleError, AttributeError):
-                usage_intervals_dict[row.cluster_hash] = []
-    user_hashes = set(row_to_user_hash.values())
-    user_hash_to_user = _get_users(user_hashes)
-
-    cluster_hashes = set(row_to_user_hash.keys())
-    last_cluster_event_dict = _get_last_cluster_event_multiple(
-        cluster_hashes, ClusterEventType.STATUS_CHANGE)
-
-    records = []
-    for row in rows:
-        user_hash = row_to_user_hash[row.cluster_hash]
-        user = user_hash_to_user.get(user_hash, None)
-        user_name = user.name if user is not None else None
-        last_event = last_cluster_event_dict.get(row.cluster_hash, None)
-        usage_intervals = usage_intervals_dict.get(row.cluster_hash, None)
-        launched_at = _get_cluster_launch_time(usage_intervals)
-        duration = _get_cluster_duration(usage_intervals)
-
+                pass
         # Parse status
         status = None
         if row.status:
             status = status_lib.ClusterStatus[row.status]
-
         # Apply filtering: always include active clusters, filter historical
         # ones by time
         if cutoff_time is not None and status is None:  # Historical cluster
@@ -1583,10 +1584,10 @@ def get_clusters_from_history(
             # last use
             # Find the most recent activity time from usage_intervals
             last_activity_time = None
-            if usage_intervals:
+            if row_usage_intervals:
                 # Get the end time of the last interval (or start time if
                 # still running)
-                last_interval = usage_intervals[-1]
+                last_interval = row_usage_intervals[-1]
                 last_activity_time = (last_interval[1] if last_interval[1]
                                       is not None else last_interval[0])
 
@@ -1594,6 +1595,38 @@ def get_clusters_from_history(
             if last_activity_time is None or last_activity_time < cutoff_time:
                 continue
 
+        filtered_rows.append(row)
+        usage_intervals_dict[row.cluster_hash] = row_usage_intervals
+        user_hash = (row.user_hash
+                     if row.user_hash is not None else current_user_hash)
+        row_to_user_hash[row.cluster_hash] = user_hash
+
+    rows = filtered_rows
+    user_hashes = set(row_to_user_hash.values())
+    user_hash_to_user = _get_users(user_hashes)
+    cluster_hashes = set(row_to_user_hash.keys())
+    if not abbreviate_response:
+        last_cluster_event_dict = _get_last_cluster_event_multiple(
+            cluster_hashes, ClusterEventType.STATUS_CHANGE)
+
+    records = []
+    for row in rows:
+        user_hash = row_to_user_hash[row.cluster_hash]
+        user = user_hash_to_user.get(user_hash, None)
+        user_name = user.name if user is not None else None
+        if not abbreviate_response:
+            last_event = last_cluster_event_dict.get(row.cluster_hash, None)
+        usage_intervals: Optional[List[Tuple[
+            int,
+            Optional[int]]]] = usage_intervals_dict.get(row.cluster_hash, None)
+        launched_at = _get_cluster_launch_time(usage_intervals)
+        duration = _get_cluster_duration(usage_intervals)
+
+        # Parse status
+        status = None
+        if row.status:
+            status = status_lib.ClusterStatus[row.status]
+
 
         # Parse launched resources safely
         launched_resources = None
@@ -1617,10 +1650,11 @@ def get_clusters_from_history(
             'user_hash': user_hash,
             'user_name': user_name,
             'workspace': workspace,
-            'last_creation_yaml': row.last_creation_yaml,
-            'last_creation_command': row.last_creation_command,
-            'last_event': last_event,
         }
+        if not abbreviate_response:
+            record['last_creation_yaml'] = row.last_creation_yaml
+            record['last_creation_command'] = row.last_creation_command
+            record['last_event'] = last_event
 
         records.append(record)
 
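Note: this diff adds a cluster_names filter to get_clusters and abbreviate_response/cluster_hashes parameters to get_clusters_from_history. A minimal sketch of how a caller might use the new parameters (the argument values below are hypothetical, not taken from this release):

    from sky import global_user_state

    # Fetch only the named clusters instead of scanning every row.
    records = global_user_state.get_clusters(
        exclude_managed_clusters=False,
        cluster_names=['my-cluster'])

    # Abbreviated history records: skips last_creation_yaml,
    # last_creation_command, and the per-cluster last_event lookup.
    history = global_user_state.get_clusters_from_history(
        days=30, abbreviate_response=True)
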
sky/metrics/utils.py CHANGED
@@ -1,11 +1,165 @@
 """Utilities for processing GPU metrics from Kubernetes clusters."""
+import contextlib
+import functools
 import os
 import re
+import select
 import subprocess
 import time
 from typing import List, Optional, Tuple
 
 import httpx
+import prometheus_client as prom
+
+from sky.skylet import constants
+from sky.utils import context_utils
+
+_SELECT_TIMEOUT = 1
+_SELECT_BUFFER_SIZE = 4096
+
+_KB = 2**10
+_MB = 2**20
+_MEM_BUCKETS = [
+    _KB,
+    256 * _KB,
+    512 * _KB,
+    _MB,
+    2 * _MB,
+    4 * _MB,
+    8 * _MB,
+    16 * _MB,
+    32 * _MB,
+    64 * _MB,
+    128 * _MB,
+    256 * _MB,
+    float('inf'),
+]
+
+# Whether the metrics are enabled, cannot be changed at runtime.
+METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
+                                 'false').lower() == 'true'
+
+# Time spent processing a piece of code, refer to time_it().
+SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
+    'sky_apiserver_code_duration_seconds',
+    'Time spent processing code',
+    ['name', 'group'],
+    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
+             60.0, 120.0, float('inf')),
+)
+
+# Total number of API server requests, grouped by path, method, and status.
+SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
+    'sky_apiserver_requests_total',
+    'Total number of API server requests',
+    ['path', 'method', 'status'],
+)
+
+# Time spent processing API server requests, grouped by path, method, and
+# status.
+SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
+    'sky_apiserver_request_duration_seconds',
+    'Time spent processing API server requests',
+    ['path', 'method', 'status'],
+    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
+             60.0, 120.0, float('inf')),
+)
+
+SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
+    'sky_apiserver_event_loop_lag_seconds',
+    'Scheduling delay of the server event loop',
+    ['pid'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
+             60.0, float('inf')),
+)
+
+SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
+    'sky_apiserver_websocket_connections',
+    'Number of websocket connections',
+    ['pid'],
+    multiprocess_mode='livesum',
+)
+
+SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
+    'sky_apiserver_websocket_closed_total',
+    'Number of websocket closed',
+    ['pid', 'reason'],
+)
+
+# The number of execution starts in each worker process, we do not record
+# histogram here as the duration has been measured in
+# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
+# Recording histogram WITH worker label will cause high cardinality.
+SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
+    'sky_apiserver_process_execution_start_total',
+    'Total number of execution starts in each worker process',
+    ['request', 'pid'],
+)
+
+SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
+    'sky_apiserver_process_peak_rss',
+    'Peak RSS we saw in each process in last 30 seconds',
+    ['pid', 'type'],
+)
+
+SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
+    'sky_apiserver_process_cpu_total',
+    'Total CPU times a worker process has been running',
+    ['pid', 'type', 'mode'],
+)
+
+SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
+    'sky_apiserver_request_memory_usage_bytes',
+    'Peak memory usage of requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
+    'sky_apiserver_request_rss_incr_bytes',
+    'RSS increment after requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+
+@contextlib.contextmanager
+def time_it(name: str, group: str = 'default'):
+    """Context manager to measure and record code execution duration."""
+    if not METRICS_ENABLED:
+        yield
+    else:
+        start_time = time.time()
+        try:
+            yield
+        finally:
+            duration = time.time() - start_time
+            SKY_APISERVER_CODE_DURATION_SECONDS.labels(
+                name=name, group=group).observe(duration)
+
+
+def time_me(func):
+    """Measure the duration of decorated function."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        if not METRICS_ENABLED:
+            return func(*args, **kwargs)
+        name = f'{func.__module__}/{func.__name__}'
+        with time_it(name, group='function'):
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+def time_me_async(func):
+    """Measure the duration of decorated async function."""
+
+    @functools.wraps(func)
+    async def async_wrapper(*args, **kwargs):
+        if not METRICS_ENABLED:
+            return await func(*args, **kwargs)
+        name = f'{func.__module__}/{func.__name__}'
+        with time_it(name, group='function'):
+            return await func(*args, **kwargs)
+
+    return async_wrapper
 
 
 def start_svc_port_forward(context: str, namespace: str, service: str,
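Note: the timing helpers above are no-ops unless server metrics are enabled; METRICS_ENABLED is read once at import time and cannot be toggled at runtime. A minimal usage sketch (the function and label names here are illustrative, not from this release):

    from sky.metrics import utils as metrics_lib

    @metrics_lib.time_me
    def refresh_clusters():
        ...  # duration observed under name='<module>/refresh_clusters'

    # Ad-hoc timing of an arbitrary block:
    with metrics_lib.time_it('refresh/db-pass', group='maintenance'):
        ...  # recorded in the sky_apiserver_code_duration_seconds histogram
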
@@ -44,6 +198,7 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     local_port = None
     start_time = time.time()
 
+    buffer = ''
     # wait for the port forward to start and extract the local port
     while time.time() - start_time < start_port_forward_timeout:
         if port_forward_process.poll() is not None:
@@ -56,10 +211,16 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
 
         # read output line by line to find the local port
         if port_forward_process.stdout:
-            line = port_forward_process.stdout.readline()
-            if line:
-                # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
-                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
+            # Wait up to 1s for data to be available without blocking
+            r, _, _ = select.select([port_forward_process.stdout], [], [],
+                                    _SELECT_TIMEOUT)
+            if r:
+                # Read available bytes from the FD without blocking
+                fd = port_forward_process.stdout.fileno()
+                raw = os.read(fd, _SELECT_BUFFER_SIZE)
+                chunk = raw.decode(errors='ignore')
+                buffer += chunk
+                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
             if match:
                 local_port = int(match.group(1))
                 break
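Note: the previous readline() call could block past the loop's timeout if kubectl never emitted a complete line; the rewrite polls with select() and drains whatever bytes are available, matching against the accumulated buffer. The same pattern in isolation (a sketch with a hypothetical service and timeout, not the exact SkyPilot code):

    import os
    import re
    import select
    import subprocess
    import time

    proc = subprocess.Popen(
        ['kubectl', 'port-forward', 'svc/demo', ':8080'],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    buffer = ''
    deadline = time.time() + 30
    while time.time() < deadline:
        # Block for at most 1 second; r is non-empty only when data is ready.
        r, _, _ = select.select([proc.stdout], [], [], 1)
        if r:
            # os.read() returns whatever is buffered (up to 4096 bytes)
            # instead of waiting for a newline the way readline() does.
            buffer += os.read(proc.stdout.fileno(), 4096).decode(errors='ignore')
            match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
            if match:
                local_port = int(match.group(1))
                break
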
@@ -122,8 +283,8 @@ async def send_metrics_request_with_port_forward(
     port_forward_process = None
     try:
         # Start port forward
-        port_forward_process, local_port = start_svc_port_forward(
-            context, namespace, service, service_port)
+        port_forward_process, local_port = await context_utils.to_thread(
+            start_svc_port_forward, context, namespace, service, service_port)
 
         # Build endpoint URL
         endpoint = f'http://localhost:{local_port}{endpoint_path}'
@@ -143,7 +304,8 @@ async def send_metrics_request_with_port_forward(
     finally:
         # Always clean up port forward
         if port_forward_process:
-            stop_svc_port_forward(port_forward_process)
+            await context_utils.to_thread(stop_svc_port_forward,
+                                          port_forward_process)
 
 
 async def add_cluster_name_label(metrics_text: str, context: str) -> str:
@@ -193,7 +355,11 @@ async def get_metrics_for_context(context: str) -> str:
     """
     # Query both DCGM metrics and kube_pod_labels metrics
     # This ensures the dashboard can perform joins to filter by skypilot cluster
-    match_patterns = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']
+    match_patterns = [
+        '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}',  # pylint: disable=line-too-long
+        'kube_pod_labels',
+        'node_cpu_seconds_total{mode="idle"}'
+    ]
 
     # TODO(rohan): don't hardcode the namespace and service name
     metrics_text = await send_metrics_request_with_port_forward(
sky/schemas/generated/jobsv1_pb2.py CHANGED
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
 
 
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xf4\x01\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x10\n\x08start_at\x18\x07 \x01(\x01\x12\x0e\n\x06\x65nd_at\x18\x08 \x01(\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x0b\n\x03pid\x18\n \x01(\x03\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\t\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -25,8 +25,8 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_options = b'8\001'
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._loaded_options = None
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_options = b'8\001'
-  _globals['_JOBSTATUS']._serialized_start=2138
-  _globals['_JOBSTATUS']._serialized_end=2407
+  _globals['_JOBSTATUS']._serialized_start=2185
+  _globals['_JOBSTATUS']._serialized_end=2454
   _globals['_ADDJOBREQUEST']._serialized_start=48
   _globals['_ADDJOBREQUEST']._serialized_end=181
   _globals['_ADDJOBRESPONSE']._serialized_start=183
@@ -46,41 +46,41 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETJOBQUEUEREQUEST']._serialized_start=718
   _globals['_GETJOBQUEUEREQUEST']._serialized_end=794
   _globals['_JOBINFO']._serialized_start=797
-  _globals['_JOBINFO']._serialized_end=1041
-  _globals['_GETJOBQUEUERESPONSE']._serialized_start=1043
-  _globals['_GETJOBQUEUERESPONSE']._serialized_end=1096
-  _globals['_CANCELJOBSREQUEST']._serialized_start=1098
-  _globals['_CANCELJOBSREQUEST']._serialized_end=1192
-  _globals['_CANCELJOBSRESPONSE']._serialized_start=1194
-  _globals['_CANCELJOBSRESPONSE']._serialized_end=1241
-  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1243
-  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1273
-  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1275
-  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1306
-  _globals['_TAILLOGSREQUEST']._serialized_start=1308
-  _globals['_TAILLOGSREQUEST']._serialized_end=1435
-  _globals['_TAILLOGSRESPONSE']._serialized_start=1437
-  _globals['_TAILLOGSRESPONSE']._serialized_end=1492
-  _globals['_GETJOBSTATUSREQUEST']._serialized_start=1494
-  _globals['_GETJOBSTATUSREQUEST']._serialized_end=1532
-  _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1535
-  _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1699
-  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1629
-  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1699
-  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1701
-  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1766
-  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1768
-  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1821
-  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1823
-  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1884
-  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1886
-  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=1935
-  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=1937
-  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=1980
-  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=1983
-  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2135
-  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2086
-  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2135
-  _globals['_JOBSSERVICE']._serialized_start=2410
-  _globals['_JOBSSERVICE']._serialized_end=3323
+  _globals['_JOBINFO']._serialized_end=1088
+  _globals['_GETJOBQUEUERESPONSE']._serialized_start=1090
+  _globals['_GETJOBQUEUERESPONSE']._serialized_end=1143
+  _globals['_CANCELJOBSREQUEST']._serialized_start=1145
+  _globals['_CANCELJOBSREQUEST']._serialized_end=1239
+  _globals['_CANCELJOBSRESPONSE']._serialized_start=1241
+  _globals['_CANCELJOBSRESPONSE']._serialized_end=1288
+  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1290
+  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1320
+  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1322
+  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1353
+  _globals['_TAILLOGSREQUEST']._serialized_start=1355
+  _globals['_TAILLOGSREQUEST']._serialized_end=1482
+  _globals['_TAILLOGSRESPONSE']._serialized_start=1484
+  _globals['_TAILLOGSRESPONSE']._serialized_end=1539
+  _globals['_GETJOBSTATUSREQUEST']._serialized_start=1541
+  _globals['_GETJOBSTATUSREQUEST']._serialized_end=1579
+  _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1582
+  _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1746
+  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1676
+  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1746
+  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1748
+  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1813
+  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1815
+  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1868
+  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1870
+  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1931
+  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1933
+  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=1982
+  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=1984
+  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=2027
+  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=2030
+  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2182
+  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2133
+  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2182
+  _globals['_JOBSSERVICE']._serialized_start=2457
+  _globals['_JOBSSERVICE']._serialized_end=3370
 # @@protoc_insertion_point(module_scope)
sky/serve/serve_utils.py CHANGED
@@ -1329,10 +1329,6 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
             print(line, end='', flush=True)
         return ''
 
-    # For pools, we don't stream the job logs as the run section is ignored.
-    if pool:
-        return ''
-
     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)
sky/server/auth/oauth2_proxy.py CHANGED
@@ -37,8 +37,8 @@ OAUTH2_PROXY_ENABLED_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_ENABLED'
 class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to handle authentication by delegating to OAuth2 Proxy."""
 
-    def __init__(self, application: fastapi.FastAPI):
-        super().__init__(application)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.enabled: bool = (os.getenv(OAUTH2_PROXY_ENABLED_ENV_VAR,
                                         'false') == 'true')
         self.proxy_base: str = ''
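
Note: Starlette instantiates middleware classes itself, passing the wrapped app plus any keyword options given at registration, so the looser *args/**kwargs signature keeps __init__ compatible with whatever the framework forwards (e.g. a dispatch callable). A sketch of the registration side (illustrative only, not the package's actual wiring):

    import fastapi

    app = fastapi.FastAPI()
    # Starlette constructs the middleware as cls(app, **options) at startup,
    # so any options passed here flow through to __init__ via **kwargs.
    app.add_middleware(OAuth2ProxyMiddleware)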