skypilot-nightly 1.0.0.dev20250918-py3-none-any.whl → 1.0.0.dev20250922-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +12 -15
- sky/core.py +67 -45
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → KP6HCNMqb_bnJB17oplgW}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0b4b35dc1dfe046c.js → [cluster]-9525660179df3605.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-487697b47d8c5e50.js → webpack-26167a9e6d91fa51.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +90 -56
- sky/metrics/utils.py +174 -8
- sky/schemas/generated/jobsv1_pb2.py +40 -40
- sky/serve/serve_utils.py +0 -4
- sky/server/auth/oauth2_proxy.py +2 -2
- sky/server/metrics.py +52 -158
- sky/server/requests/executor.py +9 -8
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/requests.py +1 -1
- sky/server/requests/serializers/encoders.py +3 -2
- sky/server/server.py +5 -41
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +6 -4
- sky/skylet/job_lib.py +14 -15
- sky/utils/locks.py +41 -10
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/METADATA +35 -35
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/RECORD +48 -48
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- /sky/dashboard/out/_next/static/{k1mo5xWZrV9djgjd0moOT → KP6HCNMqb_bnJB17oplgW}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250918.dist-info → skypilot_nightly-1.0.0.dev20250922.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -29,7 +29,7 @@ from sqlalchemy.ext import declarative
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
-from sky.
+from sky.metrics import utils as metrics_lib
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import context_utils
@@ -1238,14 +1238,15 @@ def _get_cluster_usage_intervals(
 
 
 def _get_cluster_launch_time(
-        usage_intervals: List[Tuple[int, Optional[int]]]) -> Optional[int]:
+        usage_intervals: Optional[List[Tuple[int,
+                                             Optional[int]]]]) -> Optional[int]:
     if usage_intervals is None:
         return None
     return usage_intervals[0][0]
 
 
 def _get_cluster_duration(
-        usage_intervals: List[Tuple[int, Optional[int]]]) -> int:
+        usage_intervals: Optional[List[Tuple[int, Optional[int]]]]) -> int:
     total_duration = 0
 
     if usage_intervals is None:
@@ -1408,6 +1409,7 @@ def get_clusters(
     exclude_managed_clusters: bool = False,
     workspaces_filter: Optional[Set[str]] = None,
     user_hashes_filter: Optional[Set[str]] = None,
+    cluster_names: Optional[List[str]] = None,
 ) -> List[Dict[str, Any]]:
     """Get clusters from the database.
 
@@ -1418,6 +1420,8 @@ def get_clusters(
             that has workspace field set to one of the values.
         user_hashes_filter: If specified, only include clusters
             that has user_hash field set to one of the values.
+        cluster_names: If specified, only include clusters
+            that has name field set to one of the values.
     """
     # is a cluster has a null user_hash,
     # we treat it as belonging to the current user.
@@ -1436,11 +1440,13 @@ def get_clusters(
             # If current_user_hash is in user_hashes_filter, we include
             # clusters that have a null user_hash.
             query = query.filter(
-                cluster_table.c.user_hash.in_(user_hashes_filter) |
-                (cluster_table.c.user_hash is None))
+                (cluster_table.c.user_hash.in_(user_hashes_filter) |
+                 (cluster_table.c.user_hash is None)))
         else:
             query = query.filter(
                 cluster_table.c.user_hash.in_(user_hashes_filter))
+    if cluster_names is not None:
+        query = query.filter(cluster_table.c.name.in_(cluster_names))
     query = query.order_by(sqlalchemy.desc(cluster_table.c.launched_at))
     rows = query.all()
     records = []
@@ -1500,7 +1506,9 @@ def get_clusters(
 @_init_db
 @metrics_lib.time_me
 def get_clusters_from_history(
-        days: Optional[int] = None
+        days: Optional[int] = None,
+        abbreviate_response: bool = False,
+        cluster_hashes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
     """Get cluster reports from history.
 
     Args:
@@ -1513,68 +1521,61 @@ def get_clusters_from_history(
         List of cluster records with history information.
     """
     assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        # Explicitly select columns from both tables to avoid ambiguity
-        query = session.query(
-            cluster_history_table.c.cluster_hash, cluster_history_table.c.name,
-            cluster_history_table.c.num_nodes,
-            cluster_history_table.c.requested_resources,
-            cluster_history_table.c.launched_resources,
-            cluster_history_table.c.usage_intervals,
-            cluster_history_table.c.user_hash,
-            cluster_history_table.c.last_creation_yaml,
-            cluster_history_table.c.last_creation_command,
-            cluster_history_table.c.workspace.label('history_workspace'),
-            cluster_table.c.status, cluster_table.c.workspace,
-            cluster_table.c.status_updated_at).select_from(
-                cluster_history_table.join(cluster_table,
-                                           cluster_history_table.c.cluster_hash
-                                           == cluster_table.c.cluster_hash,
-                                           isouter=True))
 
-        current_user_hash = common_utils.get_user_hash()
+    current_user_hash = common_utils.get_user_hash()
 
     # Prepare filtering parameters
     cutoff_time = None
     if days is not None:
         cutoff_time = int(time.time()) - (days * 24 * 60 * 60)
 
-        rows = query.all()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Explicitly select columns from both tables to avoid ambiguity
+        if abbreviate_response:
+            query = session.query(
+                cluster_history_table.c.cluster_hash,
+                cluster_history_table.c.name, cluster_history_table.c.num_nodes,
+                cluster_history_table.c.launched_resources,
+                cluster_history_table.c.usage_intervals,
+                cluster_history_table.c.user_hash,
+                cluster_history_table.c.workspace.label('history_workspace'),
+                cluster_table.c.status, cluster_table.c.workspace)
+        else:
+            query = session.query(
+                cluster_history_table.c.cluster_hash,
+                cluster_history_table.c.name, cluster_history_table.c.num_nodes,
+                cluster_history_table.c.launched_resources,
+                cluster_history_table.c.usage_intervals,
+                cluster_history_table.c.user_hash,
+                cluster_history_table.c.last_creation_yaml,
+                cluster_history_table.c.last_creation_command,
+                cluster_history_table.c.workspace.label('history_workspace'),
+                cluster_table.c.status, cluster_table.c.workspace)
+
+        query = query.select_from(
+            cluster_history_table.join(cluster_table,
+                                       cluster_history_table.c.cluster_hash ==
+                                       cluster_table.c.cluster_hash,
+                                       isouter=True))
+        if cluster_hashes is not None:
+            query = query.filter(
+                cluster_history_table.c.cluster_hash.in_(cluster_hashes))
+        rows = query.all()
 
-
+    filtered_rows = []
     usage_intervals_dict = {}
+    row_to_user_hash = {}
     for row in rows:
-        user_hash = (row.user_hash
-                     if row.user_hash is not None else current_user_hash)
-        row_to_user_hash[row.cluster_hash] = user_hash
+        row_usage_intervals: List[Tuple[int, Optional[int]]] = []
         if row.usage_intervals:
             try:
-                usage_intervals_dict[row.cluster_hash] = pickle.loads(
-                    row.usage_intervals)
+                row_usage_intervals = pickle.loads(row.usage_intervals)
             except (pickle.PickleError, AttributeError):
-
-        user_hashes = set(row_to_user_hash.values())
-        user_hash_to_user = _get_users(user_hashes)
-
-        cluster_hashes = set(row_to_user_hash.keys())
-        last_cluster_event_dict = _get_last_cluster_event_multiple(
-            cluster_hashes, ClusterEventType.STATUS_CHANGE)
-
-        records = []
-        for row in rows:
-            user_hash = row_to_user_hash[row.cluster_hash]
-            user = user_hash_to_user.get(user_hash, None)
-            user_name = user.name if user is not None else None
-            last_event = last_cluster_event_dict.get(row.cluster_hash, None)
-            usage_intervals = usage_intervals_dict.get(row.cluster_hash, None)
-            launched_at = _get_cluster_launch_time(usage_intervals)
-            duration = _get_cluster_duration(usage_intervals)
-
+                pass
         # Parse status
         status = None
         if row.status:
             status = status_lib.ClusterStatus[row.status]
-
         # Apply filtering: always include active clusters, filter historical
         # ones by time
         if cutoff_time is not None and status is None:  # Historical cluster
@@ -1583,10 +1584,10 @@ def get_clusters_from_history(
             # last use
             # Find the most recent activity time from usage_intervals
             last_activity_time = None
-            if usage_intervals:
+            if row_usage_intervals:
                 # Get the end time of the last interval (or start time if
                 # still running)
-                last_interval = usage_intervals[-1]
+                last_interval = row_usage_intervals[-1]
                 last_activity_time = (last_interval[1] if last_interval[1]
                                       is not None else last_interval[0])
 
@@ -1594,6 +1595,38 @@ def get_clusters_from_history(
         if last_activity_time is None or last_activity_time < cutoff_time:
             continue
 
+        filtered_rows.append(row)
+        usage_intervals_dict[row.cluster_hash] = row_usage_intervals
+        user_hash = (row.user_hash
+                     if row.user_hash is not None else current_user_hash)
+        row_to_user_hash[row.cluster_hash] = user_hash
+
+    rows = filtered_rows
+    user_hashes = set(row_to_user_hash.values())
+    user_hash_to_user = _get_users(user_hashes)
+    cluster_hashes = set(row_to_user_hash.keys())
+    if not abbreviate_response:
+        last_cluster_event_dict = _get_last_cluster_event_multiple(
+            cluster_hashes, ClusterEventType.STATUS_CHANGE)
+
+    records = []
+    for row in rows:
+        user_hash = row_to_user_hash[row.cluster_hash]
+        user = user_hash_to_user.get(user_hash, None)
+        user_name = user.name if user is not None else None
+        if not abbreviate_response:
+            last_event = last_cluster_event_dict.get(row.cluster_hash, None)
+        usage_intervals: Optional[List[Tuple[
+            int,
+            Optional[int]]]] = usage_intervals_dict.get(row.cluster_hash, None)
+        launched_at = _get_cluster_launch_time(usage_intervals)
+        duration = _get_cluster_duration(usage_intervals)
+
+        # Parse status
+        status = None
+        if row.status:
+            status = status_lib.ClusterStatus[row.status]
+
         # Parse launched resources safely
         launched_resources = None
         if row.launched_resources:
@@ -1617,10 +1650,11 @@ def get_clusters_from_history(
             'user_hash': user_hash,
             'user_name': user_name,
             'workspace': workspace,
-            'last_creation_yaml': row.last_creation_yaml,
-            'last_creation_command': row.last_creation_command,
-            'last_event': last_event,
         }
+        if not abbreviate_response:
+            record['last_creation_yaml'] = row.last_creation_yaml
+            record['last_creation_command'] = row.last_creation_command
+            record['last_event'] = last_event
 
         records.append(record)
 
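The get_clusters_from_history() change above boils down to two patterns: select the heavyweight columns (last_creation_yaml, last_creation_command) and the per-cluster event lookup only when the caller did not ask for an abbreviated response, and push identity filters into SQL with IN clauses instead of filtering rows in Python. Below is a minimal, self-contained sketch of the same pattern; the table and column names are hypothetical stand-ins, not SkyPilot's actual schema.

import sqlalchemy
from sqlalchemy import orm

metadata = sqlalchemy.MetaData()
# Hypothetical stand-in for SkyPilot's cluster_history_table.
history_table = sqlalchemy.Table(
    'cluster_history', metadata,
    sqlalchemy.Column('cluster_hash', sqlalchemy.String, primary_key=True),
    sqlalchemy.Column('name', sqlalchemy.String),
    sqlalchemy.Column('last_creation_yaml', sqlalchemy.Text))


def get_history(session: orm.Session,
                abbreviate_response: bool = False,
                cluster_hashes=None):
    # Select only lightweight columns for abbreviated responses.
    columns = [history_table.c.cluster_hash, history_table.c.name]
    if not abbreviate_response:
        columns.append(history_table.c.last_creation_yaml)
    query = session.query(*columns)
    # Apply the hash filter in SQL rather than post-filtering in Python.
    if cluster_hashes is not None:
        query = query.filter(
            history_table.c.cluster_hash.in_(cluster_hashes))
    return query.all()


engine = sqlalchemy.create_engine('sqlite://')
metadata.create_all(engine)
with orm.Session(engine) as session:
    session.execute(history_table.insert(), [
        {'cluster_hash': 'h1', 'name': 'dev', 'last_creation_yaml': '...'},
        {'cluster_hash': 'h2', 'name': 'train', 'last_creation_yaml': '...'},
    ])
    print(get_history(session, abbreviate_response=True, cluster_hashes=['h1']))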
sky/metrics/utils.py
CHANGED
@@ -1,11 +1,165 @@
 """Utilities for processing GPU metrics from Kubernetes clusters."""
+import contextlib
+import functools
 import os
 import re
+import select
 import subprocess
 import time
 from typing import List, Optional, Tuple
 
 import httpx
+import prometheus_client as prom
+
+from sky.skylet import constants
+from sky.utils import context_utils
+
+_SELECT_TIMEOUT = 1
+_SELECT_BUFFER_SIZE = 4096
+
+_KB = 2**10
+_MB = 2**20
+_MEM_BUCKETS = [
+    _KB,
+    256 * _KB,
+    512 * _KB,
+    _MB,
+    2 * _MB,
+    4 * _MB,
+    8 * _MB,
+    16 * _MB,
+    32 * _MB,
+    64 * _MB,
+    128 * _MB,
+    256 * _MB,
+    float('inf'),
+]
+
+# Whether the metrics are enabled, cannot be changed at runtime.
+METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
+                                 'false').lower() == 'true'
+
+# Time spent processing a piece of code, refer to time_it().
+SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
+    'sky_apiserver_code_duration_seconds',
+    'Time spent processing code',
+    ['name', 'group'],
+    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
+             60.0, 120.0, float('inf')),
+)
+
+# Total number of API server requests, grouped by path, method, and status.
+SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
+    'sky_apiserver_requests_total',
+    'Total number of API server requests',
+    ['path', 'method', 'status'],
+)
+
+# Time spent processing API server requests, grouped by path, method, and
+# status.
+SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
+    'sky_apiserver_request_duration_seconds',
+    'Time spent processing API server requests',
+    ['path', 'method', 'status'],
+    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
+             60.0, 120.0, float('inf')),
+)
+
+SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
+    'sky_apiserver_event_loop_lag_seconds',
+    'Scheduling delay of the server event loop',
+    ['pid'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
+             60.0, float('inf')),
+)
+
+SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
+    'sky_apiserver_websocket_connections',
+    'Number of websocket connections',
+    ['pid'],
+    multiprocess_mode='livesum',
+)
+
+SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
+    'sky_apiserver_websocket_closed_total',
+    'Number of websocket closed',
+    ['pid', 'reason'],
+)
+
+# The number of execution starts in each worker process, we do not record
+# histogram here as the duration has been measured in
+# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
+# Recording histogram WITH worker label will cause high cardinality.
+SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
+    'sky_apiserver_process_execution_start_total',
+    'Total number of execution starts in each worker process',
+    ['request', 'pid'],
+)
+
+SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
+    'sky_apiserver_process_peak_rss',
+    'Peak RSS we saw in each process in last 30 seconds',
+    ['pid', 'type'],
+)
+
+SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
+    'sky_apiserver_process_cpu_total',
+    'Total CPU times a worker process has been running',
+    ['pid', 'type', 'mode'],
+)
+
+SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
+    'sky_apiserver_request_memory_usage_bytes',
+    'Peak memory usage of requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
+    'sky_apiserver_request_rss_incr_bytes',
+    'RSS increment after requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+
+@contextlib.contextmanager
+def time_it(name: str, group: str = 'default'):
+    """Context manager to measure and record code execution duration."""
+    if not METRICS_ENABLED:
+        yield
+    else:
+        start_time = time.time()
+        try:
+            yield
+        finally:
+            duration = time.time() - start_time
+            SKY_APISERVER_CODE_DURATION_SECONDS.labels(
+                name=name, group=group).observe(duration)
+
+
+def time_me(func):
+    """Measure the duration of decorated function."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        if not METRICS_ENABLED:
+            return func(*args, **kwargs)
+        name = f'{func.__module__}/{func.__name__}'
+        with time_it(name, group='function'):
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+def time_me_async(func):
+    """Measure the duration of decorated async function."""
+
+    @functools.wraps(func)
+    async def async_wrapper(*args, **kwargs):
+        if not METRICS_ENABLED:
+            return await func(*args, **kwargs)
+        name = f'{func.__module__}/{func.__name__}'
+        with time_it(name, group='function'):
+            return await func(*args, **kwargs)
+
+    return async_wrapper
 
 
 def start_svc_port_forward(context: str, namespace: str, service: str,
@@ -44,6 +198,7 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     local_port = None
     start_time = time.time()
 
+    buffer = ''
     # wait for the port forward to start and extract the local port
     while time.time() - start_time < start_port_forward_timeout:
         if port_forward_process.poll() is not None:
@@ -56,10 +211,16 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
 
         # read output line by line to find the local port
         if port_forward_process.stdout:
-
-
-
-
+            # Wait up to 1s for data to be available without blocking
+            r, _, _ = select.select([port_forward_process.stdout], [], [],
+                                    _SELECT_TIMEOUT)
+            if r:
+                # Read available bytes from the FD without blocking
+                fd = port_forward_process.stdout.fileno()
+                raw = os.read(fd, _SELECT_BUFFER_SIZE)
+                chunk = raw.decode(errors='ignore')
+                buffer += chunk
+            match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
             if match:
                 local_port = int(match.group(1))
                 break
@@ -122,8 +283,8 @@ async def send_metrics_request_with_port_forward(
     port_forward_process = None
     try:
         # Start port forward
-        port_forward_process, local_port = start_svc_port_forward(
-            context, namespace, service, service_port)
+        port_forward_process, local_port = await context_utils.to_thread(
+            start_svc_port_forward, context, namespace, service, service_port)
 
         # Build endpoint URL
        endpoint = f'http://localhost:{local_port}{endpoint_path}'
@@ -143,7 +304,8 @@ async def send_metrics_request_with_port_forward(
     finally:
         # Always clean up port forward
         if port_forward_process:
-            stop_svc_port_forward
+            await context_utils.to_thread(stop_svc_port_forward,
+                                          port_forward_process)
 
 
 async def add_cluster_name_label(metrics_text: str, context: str) -> str:
@@ -193,7 +355,11 @@ async def get_metrics_for_context(context: str) -> str:
     """
     # Query both DCGM metrics and kube_pod_labels metrics
     # This ensures the dashboard can perform joins to filter by skypilot cluster
-    match_patterns = [
+    match_patterns = [
+        '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}',  # pylint: disable=line-too-long
+        'kube_pod_labels',
+        'node_cpu_seconds_total{mode="idle"}'
+    ]
 
     # TODO(rohan): don't hardcode the namespace and service name
     metrics_text = await send_metrics_request_with_port_forward(
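The time_it()/time_me()/time_me_async() helpers added above give the API server one uniform way to record durations into the sky_apiserver_code_duration_seconds histogram, and they deliberately become no-ops when metrics are disabled (METRICS_ENABLED is read once, at import time, from the environment variable named by constants.ENV_VAR_SERVER_METRICS_ENABLED). A short sketch of how calling code might use them; the function names below are illustrative, not from the diff.

from sky.metrics import utils as metrics_lib


# Decorator form: records under name='<module>/<function>', group='function'.
@metrics_lib.time_me
def refresh_cluster_records():
    ...  # some expensive, synchronous work


# Context-manager form: explicit name and group labels.
def handle_status_request():
    with metrics_lib.time_it('status/refresh', group='handler'):
        refresh_cluster_records()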
sky/schemas/generated/jobsv1_pb2.py
CHANGED
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
 
 
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\x89\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTaskB\x07\n\x05_pool\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -25,8 +25,8 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_options = b'8\001'
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._loaded_options = None
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_options = b'8\001'
-  _globals['_JOBSTATUS']._serialized_start=
-  _globals['_JOBSTATUS']._serialized_end=
+  _globals['_JOBSTATUS']._serialized_start=2185
+  _globals['_JOBSTATUS']._serialized_end=2454
   _globals['_ADDJOBREQUEST']._serialized_start=48
   _globals['_ADDJOBREQUEST']._serialized_end=181
   _globals['_ADDJOBRESPONSE']._serialized_start=183
@@ -46,41 +46,41 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETJOBQUEUEREQUEST']._serialized_start=718
   _globals['_GETJOBQUEUEREQUEST']._serialized_end=794
   _globals['_JOBINFO']._serialized_start=797
-  _globals['_JOBINFO']._serialized_end=
-  _globals['_GETJOBQUEUERESPONSE']._serialized_start=
-  _globals['_GETJOBQUEUERESPONSE']._serialized_end=
-  _globals['_CANCELJOBSREQUEST']._serialized_start=
-  _globals['_CANCELJOBSREQUEST']._serialized_end=
-  _globals['_CANCELJOBSRESPONSE']._serialized_start=
-  _globals['_CANCELJOBSRESPONSE']._serialized_end=
-  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=
-  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=
-  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=
-  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=
-  _globals['_TAILLOGSREQUEST']._serialized_start=
-  _globals['_TAILLOGSREQUEST']._serialized_end=
-  _globals['_TAILLOGSRESPONSE']._serialized_start=
-  _globals['_TAILLOGSRESPONSE']._serialized_end=
-  _globals['_GETJOBSTATUSREQUEST']._serialized_start=
-  _globals['_GETJOBSTATUSREQUEST']._serialized_end=
-  _globals['_GETJOBSTATUSRESPONSE']._serialized_start=
-  _globals['_GETJOBSTATUSRESPONSE']._serialized_end=
-  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=
-  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=
-  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=
-  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=
-  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=
-  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=
-  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=
-  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=
-  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=
-  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=
-  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=
-  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=
-  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=
-  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=
-  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=
-  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=
-  _globals['_JOBSSERVICE']._serialized_start=
-  _globals['_JOBSSERVICE']._serialized_end=
+  _globals['_JOBINFO']._serialized_end=1088
+  _globals['_GETJOBQUEUERESPONSE']._serialized_start=1090
+  _globals['_GETJOBQUEUERESPONSE']._serialized_end=1143
+  _globals['_CANCELJOBSREQUEST']._serialized_start=1145
+  _globals['_CANCELJOBSREQUEST']._serialized_end=1239
+  _globals['_CANCELJOBSRESPONSE']._serialized_start=1241
+  _globals['_CANCELJOBSRESPONSE']._serialized_end=1288
+  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_start=1290
+  _globals['_FAILALLINPROGRESSJOBSREQUEST']._serialized_end=1320
+  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_start=1322
+  _globals['_FAILALLINPROGRESSJOBSRESPONSE']._serialized_end=1353
+  _globals['_TAILLOGSREQUEST']._serialized_start=1355
+  _globals['_TAILLOGSREQUEST']._serialized_end=1482
+  _globals['_TAILLOGSRESPONSE']._serialized_start=1484
+  _globals['_TAILLOGSRESPONSE']._serialized_end=1539
+  _globals['_GETJOBSTATUSREQUEST']._serialized_start=1541
+  _globals['_GETJOBSTATUSREQUEST']._serialized_end=1579
+  _globals['_GETJOBSTATUSRESPONSE']._serialized_start=1582
+  _globals['_GETJOBSTATUSRESPONSE']._serialized_end=1746
+  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_start=1676
+  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_end=1746
+  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_start=1748
+  _globals['_GETJOBSUBMITTEDTIMESTAMPREQUEST']._serialized_end=1813
+  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_start=1815
+  _globals['_GETJOBSUBMITTEDTIMESTAMPRESPONSE']._serialized_end=1868
+  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_start=1870
+  _globals['_GETJOBENDEDTIMESTAMPREQUEST']._serialized_end=1931
+  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_start=1933
+  _globals['_GETJOBENDEDTIMESTAMPRESPONSE']._serialized_end=1982
+  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_start=1984
+  _globals['_GETLOGDIRSFORJOBSREQUEST']._serialized_end=2027
+  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_start=2030
+  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2182
+  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2133
+  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2182
+  _globals['_JOBSSERVICE']._serialized_start=2457
+  _globals['_JOBSSERVICE']._serialized_end=3370
 # @@protoc_insertion_point(module_scope)
sky/serve/serve_utils.py
CHANGED
@@ -1329,10 +1329,6 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
             print(line, end='', flush=True)
         return ''
 
-    # For pools, we don't stream the job logs as the run section is ignored.
-    if pool:
-        return ''
-
     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)
sky/server/auth/oauth2_proxy.py
CHANGED
@@ -37,8 +37,8 @@ OAUTH2_PROXY_ENABLED_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_ENABLED'
 class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to handle authentication by delegating to OAuth2 Proxy."""
 
-    def __init__(self,
-        super().__init__(
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.enabled: bool = (os.getenv(OAUTH2_PROXY_ENABLED_ENV_VAR,
                                         'false') == 'true')
         self.proxy_base: str = ''
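The oauth2_proxy.py change replaces a hand-written constructor signature with a straight passthrough, so the middleware keeps working regardless of which positional or keyword arguments Starlette's BaseHTTPMiddleware expects (its __init__ takes the ASGI app and an optional dispatch callable). A minimal sketch of the same passthrough pattern; the class name and EXAMPLE_FLAG variable are hypothetical, not part of this diff.

import os

import starlette.applications
import starlette.middleware.base


class FlagGatedMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Hypothetical middleware using the same *args/**kwargs passthrough."""

    def __init__(self, *args, **kwargs):
        # Forward whatever Starlette passes (app, dispatch=None) unchanged.
        super().__init__(*args, **kwargs)
        self.enabled: bool = os.getenv('EXAMPLE_FLAG', 'false') == 'true'

    async def dispatch(self, request, call_next):
        if not self.enabled:
            return await call_next(request)
        # Real logic (e.g. delegating auth to OAuth2 Proxy) would go here.
        return await call_next(request)


app = starlette.applications.Starlette()
app.add_middleware(FlagGatedMiddleware)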