skypilot-nightly 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250702__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/catalog/__init__.py +1 -1
- sky/client/cli/command.py +60 -21
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{Md3rlE87jmL5uv7gSo8mR → N5IdFnjR1RaPGBAVYeTIr}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/9984.b56614f3c4c5961d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1159f362b960e2b8.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-d427db53e54de9ce.js → webpack-9a81ea998672c303.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/metrics/utils.py +210 -0
- sky/optimizer.py +1 -1
- sky/resources.py +145 -7
- sky/server/server.py +80 -7
- sky/setup_files/MANIFEST.in +1 -0
- sky/skylet/constants.py +3 -0
- sky/skypilot_config.py +62 -53
- sky/utils/accelerator_registry.py +28 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +9 -4
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/RECORD +39 -38
- sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b8e1114e6d38218c.js +0 -6
- /sky/dashboard/out/_next/static/{Md3rlE87jmL5uv7gSo8mR → N5IdFnjR1RaPGBAVYeTIr}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-909d53833da080cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"N5IdFnjR1RaPGBAVYeTIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"N5IdFnjR1RaPGBAVYeTIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/metrics/utils.py
ADDED
@@ -0,0 +1,210 @@
|
|
1
|
+
"""Utilities for processing GPU metrics from Kubernetes clusters."""
|
2
|
+
import os
|
3
|
+
import re
|
4
|
+
import subprocess
|
5
|
+
import time
|
6
|
+
from typing import List, Optional, Tuple
|
7
|
+
|
8
|
+
import httpx
|
9
|
+
|
10
|
+
|
11
|
+
def start_svc_port_forward(context: str, namespace: str, service: str,
                           service_port: int) -> Tuple[subprocess.Popen, int]:
    """Starts a port forward to a service in a Kubernetes cluster.

    Launches ``kubectl port-forward`` as a child process and scans its
    combined stdout/stderr for the 'Forwarding from 127.0.0.1:<port>' line
    to learn which local port kubectl picked.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to

    Returns:
        Tuple of (subprocess.Popen process, local_port assigned). The caller
        owns the process and must stop it when done.

    Raises:
        RuntimeError: If kubectl exits with a non-zero code, or if no local
            port could be extracted from its output.
    """
    start_port_forward_timeout = 10  # 10 second timeout
    terminate_port_forward_timeout = 5  # 5 second timeout

    # Use ':service_port' to let kubectl choose the local port
    cmd = [
        'kubectl', '--context', context, '-n', namespace, 'port-forward',
        f'service/{service}', f':{service_port}'
    ]

    env = os.environ.copy()
    if 'KUBECONFIG' not in env:
        env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')

    # start the port forward process
    port_forward_process = subprocess.Popen(cmd,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.STDOUT,
                                            text=True,
                                            env=env)

    local_port = None
    start_time = time.time()

    # wait for the port forward to start and extract the local port
    while time.time() - start_time < start_port_forward_timeout:
        if port_forward_process.poll() is not None:
            # port forward process has terminated
            if port_forward_process.returncode != 0:
                # The process is already reaped; release the pipe before
                # reporting the failure.
                if port_forward_process.stdout:
                    port_forward_process.stdout.close()
                raise RuntimeError(
                    f'Port forward failed for service {service} in namespace '
                    f'{namespace} on context {context}')
            break

        # read output line by line to find the local port
        # NOTE: readline() blocks until kubectl emits a full line or closes
        # the pipe, so the timeout above is only re-checked between lines.
        if port_forward_process.stdout:
            line = port_forward_process.stdout.readline()
            if line:
                # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
                if match:
                    local_port = int(match.group(1))
                    break

        # sleep for 100ms to avoid busy-waiting
        time.sleep(0.1)

    if local_port is None:
        try:
            port_forward_process.terminate()
            port_forward_process.wait(timeout=terminate_port_forward_timeout)
        except subprocess.TimeoutExpired:
            port_forward_process.kill()
            port_forward_process.wait()
        finally:
            # Only release resources here. Raising from `finally` would
            # replace any exception already propagating (for example a
            # KeyboardInterrupt received while waiting).
            if port_forward_process.stdout:
                port_forward_process.stdout.close()
        raise RuntimeError(
            f'Failed to extract local port for service {service} in '
            f'namespace {namespace} on context {context}')

    return port_forward_process, local_port
|
83
|
+
|
84
|
+
|
85
|
+
def stop_svc_port_forward(port_forward_process: subprocess.Popen) -> None:
    """Stops a port forward to a service in a Kubernetes cluster.

    Asks the process to terminate, escalates to kill if it has not exited
    within 5 seconds, and then closes any pipes attached to it so their file
    descriptors are not left open.

    Args:
        port_forward_process: The subprocess.Popen process to terminate
    """
    try:
        port_forward_process.terminate()
        port_forward_process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        port_forward_process.kill()
        port_forward_process.wait()
    finally:
        # Pipes requested at Popen() time stay open after the child is
        # reaped unless closed explicitly.
        for stream in (port_forward_process.stdin,
                       port_forward_process.stdout,
                       port_forward_process.stderr):
            if stream is not None:
                stream.close()
|
96
|
+
|
97
|
+
|
98
|
+
async def send_metrics_request_with_port_forward(
        context: str,
        namespace: str,
        service: str,
        service_port: int,
        endpoint_path: str = '/federate',
        match_patterns: Optional[List[str]] = None,
        timeout: float = 30.0) -> str:
    """Fetches metrics text from a service reached through a port forward.

    A port forward to the service is opened, a GET request is issued against
    ``http://localhost:<local_port><endpoint_path>``, and the port forward is
    stopped again whether or not the request succeeded.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to
        endpoint_path: Path appended to the localhost endpoint (e.g.,
            '/federate')
        match_patterns: Metric patterns sent as repeated 'match[]' query
            parameters; omitted from the request when empty or None
        timeout: Timeout in seconds handed to the HTTP client

    Returns:
        The response body as text.

    Raises:
        RuntimeError: If the port forward cannot be started.
        Exceptions raised by the HTTP client (including the one from
        ``raise_for_status()`` on an error status) propagate unchanged.
    """
    forwarder = None
    try:
        forwarder, forwarded_port = start_svc_port_forward(
            context, namespace, service, service_port)
        url = f'http://localhost:{forwarded_port}{endpoint_path}'

        async with httpx.AsyncClient(timeout=timeout) as client:
            request_kwargs = {}
            if match_patterns:
                # One ('match[]', pattern) pair per requested selector.
                request_kwargs['params'] = [
                    ('match[]', pattern) for pattern in match_patterns
                ]
            response = await client.get(url, **request_kwargs)
            response.raise_for_status()
            return response.text
    finally:
        # The forward is torn down on every exit path once it was started.
        if forwarder:
            stop_svc_port_forward(forwarder)
|
147
|
+
|
148
|
+
|
149
|
+
async def add_cluster_name_label(metrics_text: str, context: str) -> str:
    """Adds a ``cluster`` label to each metric sample line.

    Comment lines (starting with '#') and blank lines are passed through
    untouched. Samples that already carry a ``{...}`` label set get the
    cluster label prepended to it; samples without any label set get a new
    ``{cluster="..."}`` inserted right after the metric name. Lines that match
    neither shape are kept as-is.

    Args:
        metrics_text: The text containing the metrics
        context: The cluster name, used as the label value

    Returns:
        The metrics text with the cluster label added, lines joined by
        newlines (surrounding whitespace of the input is stripped).
    """
    # Backslash, double quote and newline must be escaped inside a label
    # value, otherwise an unusual context name yields unparsable output.
    escaped_context = (context.replace('\\', '\\\\').replace(
        '"', '\\"').replace('\n', '\\n'))
    cluster_label = f'cluster="{escaped_context}"'

    modified_lines = []
    for line in metrics_text.strip().split('\n'):
        # keep comment lines and empty lines as-is
        if line.startswith('#') or not line.strip():
            modified_lines.append(line)
            continue

        brace_start = line.find('{')
        brace_end = line.find('}')
        if brace_start != -1 and brace_end != -1:
            # metric line with labels: prepend the cluster label
            metric_name = line[:brace_start]
            existing_labels = line[brace_start + 1:brace_end]
            rest_of_line = line[brace_end + 1:]
            if existing_labels:
                new_labels = f'{cluster_label},{existing_labels}'
            else:
                new_labels = cluster_label
            modified_lines.append(
                f'{metric_name}{{{new_labels}}}{rest_of_line}')
            continue

        if brace_start == -1 and brace_end == -1:
            # metric line without any label set: '<name> <value> [...]'
            bare_sample = re.match(r'([A-Za-z_:][A-Za-z0-9_:]*)(\s.*)$', line)
            if bare_sample:
                modified_lines.append(f'{bare_sample.group(1)}'
                                      f'{{{cluster_label}}}'
                                      f'{bare_sample.group(2)}')
                continue

        # keep anything else (e.g. malformed lines) as-is
        modified_lines.append(line)

    return '\n'.join(modified_lines)
|
183
|
+
|
184
|
+
|
185
|
+
async def get_metrics_for_context(context: str) -> str:
    """Collects GPU metrics for one Kubernetes context.

    The metrics are requested from the '/federate' endpoint of the
    'skypilot-prometheus-server' service (namespace 'skypilot', port 80) via
    a port forward, then every metric line is tagged with the context as its
    cluster label.

    Args:
        context: Kubernetes context name

    Returns:
        The labelled metrics text.

    Raises:
        Exception: Any failure from the port forward or the metrics request
            propagates to the caller.
    """
    # DCGM metrics plus kube_pod_labels, so the dashboard can perform joins
    # to filter by skypilot cluster.
    federate_selectors = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']

    # TODO(rohan): don't hardcode the namespace and service name
    raw_metrics = await send_metrics_request_with_port_forward(
        context=context,
        namespace='skypilot',
        service='skypilot-prometheus-server',
        service_port=80,
        endpoint_path='/federate',
        match_patterns=federate_selectors)

    # tag each metric line with the cluster it came from
    return await add_cluster_name_label(raw_metrics, context)
|
sky/optimizer.py
CHANGED
@@ -1375,7 +1375,7 @@ def _fill_in_launchable_resources(
|
|
1375
1375
|
num_node_str = ''
|
1376
1376
|
if task.num_nodes > 1:
|
1377
1377
|
num_node_str = f'{task.num_nodes}x '
|
1378
|
-
if not quiet:
|
1378
|
+
if not (quiet or resources.no_missing_accel_warnings):
|
1379
1379
|
logger.info(
|
1380
1380
|
f'No resource satisfying {num_node_str}'
|
1381
1381
|
f'{resources.repr_with_region_zone} on {clouds_str}.')
|
sky/resources.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
"""Resources: compute requirements of Tasks."""
|
2
|
+
import collections
|
2
3
|
import dataclasses
|
3
4
|
import math
|
5
|
+
import re
|
4
6
|
import textwrap
|
5
7
|
import typing
|
6
8
|
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
|
@@ -41,6 +43,20 @@ RESOURCE_CONFIG_ALIASES = {
|
|
41
43
|
'gpus': 'accelerators',
|
42
44
|
}
|
43
45
|
|
46
|
+
# Byte multipliers for memory size unit suffixes. Keys are lowercase; both
# the short ('g') and long ('gb') spellings map to the same binary
# (power-of-1024) multiplier.
MEMORY_SIZE_UNITS = {
    'b': 1,
    'k': 2**10,
    'kb': 2**10,
    'm': 2**20,
    'mb': 2**20,
    'g': 2**30,
    'gb': 2**30,
    't': 2**40,
    'tb': 2**40,
    'p': 2**50,
    'pb': 2**50,
}
|
59
|
+
|
44
60
|
|
45
61
|
@dataclasses.dataclass
|
46
62
|
class AutostopConfig:
|
@@ -110,7 +126,7 @@ class Resources:
|
|
110
126
|
"""
|
111
127
|
# If any fields changed, increment the version. For backward compatibility,
|
112
128
|
# modify the __setstate__ method to handle the old version.
|
113
|
-
_VERSION =
|
129
|
+
_VERSION = 28
|
114
130
|
|
115
131
|
def __init__(
|
116
132
|
self,
|
@@ -142,6 +158,7 @@ class Resources:
|
|
142
158
|
_is_image_managed: Optional[bool] = None,
|
143
159
|
_requires_fuse: Optional[bool] = None,
|
144
160
|
_cluster_config_overrides: Optional[Dict[str, Any]] = None,
|
161
|
+
_no_missing_accel_warnings: Optional[bool] = None,
|
145
162
|
):
|
146
163
|
"""Initialize a Resources object.
|
147
164
|
|
@@ -366,6 +383,7 @@ class Resources:
|
|
366
383
|
|
367
384
|
self._cluster_config_overrides = _cluster_config_overrides
|
368
385
|
self._cached_repr: Optional[str] = None
|
386
|
+
self._no_missing_accel_warnings = _no_missing_accel_warnings
|
369
387
|
|
370
388
|
# Initialize _priority before calling the setter
|
371
389
|
self._priority: Optional[int] = None
|
@@ -649,6 +667,13 @@ class Resources:
|
|
649
667
|
return False
|
650
668
|
return self._requires_fuse
|
651
669
|
|
670
|
+
@property
def no_missing_accel_warnings(self) -> bool:
    """Whether the missing-accelerator message is suppressed for this resource.

    An unset (None) flag is reported as False; otherwise the stored value is
    returned unchanged.
    """
    flag = self._no_missing_accel_warnings
    return False if flag is None else flag
|
676
|
+
|
652
677
|
def set_requires_fuse(self, value: bool) -> None:
|
653
678
|
"""Sets whether this resource requires FUSE mounting support.
|
654
679
|
|
@@ -754,6 +779,8 @@ class Resources:
|
|
754
779
|
if ':' not in accelerators:
|
755
780
|
accelerators = {accelerators: 1}
|
756
781
|
else:
|
782
|
+
assert isinstance(accelerators,
|
783
|
+
str), (type(accelerators), accelerators)
|
757
784
|
splits = accelerators.split(':')
|
758
785
|
parse_error = ('The "accelerators" field as a str '
|
759
786
|
'should be <name> or <name>:<cnt>. '
|
@@ -1778,6 +1805,8 @@ class Resources:
|
|
1778
1805
|
self._is_image_managed),
|
1779
1806
|
_requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
|
1780
1807
|
_cluster_config_overrides=override_configs,
|
1808
|
+
_no_missing_accel_warnings=override.pop(
|
1809
|
+
'no_missing_accel_warnings', self._no_missing_accel_warnings),
|
1781
1810
|
)
|
1782
1811
|
assert not override
|
1783
1812
|
return resources
|
@@ -1843,10 +1872,75 @@ class Resources:
|
|
1843
1872
|
config[canonical] = config[alias]
|
1844
1873
|
del config[alias]
|
1845
1874
|
|
1875
|
+
@classmethod
def _parse_accelerators_from_str(
        cls, accelerators: str) -> List[Tuple[str, bool]]:
    """Parse accelerators string into a list of possible accelerators.

    Accepted memory-based forms (memory is digits + MB/GB/TB in any case,
    with an optional trailing '+'):
        '<manufacturer>:<memory>:<count>', '<memory>:<count>',
        '<manufacturer>:<memory>' and '<memory>'.
    Any other string is treated as a plain accelerator spec and returned
    unchanged.

    Args:
        accelerators: The accelerator spec string to parse.

    Returns:
        A list of possible accelerators. Each element is a tuple of
        (accelerator_name, was_user_specified). was_user_specified is True
        if the accelerator was directly named by the user (for example
        "H100:2" would be True, but "80GB+" would be False since it doesn't
        mention the name of the accelerator).

    Raises:
        AssertionError: If the input is not a str, or a three-part spec has
            a middle part that is not a memory size.
        ValueError: If the count part is not an integer.
    """
    # sanity check
    assert isinstance(accelerators, str), accelerators

    # Compiled once instead of repeating the literal in every branch.
    memory_pattern = re.compile(r'^[0-9]+[GgMmTt][Bb]\+?$')

    manufacturer = None
    memory = None
    count = 1

    split = accelerators.split(':')
    if len(split) == 3:
        manufacturer, memory, count_str = split
        count = int(count_str)
        assert memory_pattern.match(memory), (
            'If specifying a GPU manufacturer, you must also '
            'specify the memory size')
    elif len(split) == 2 and memory_pattern.match(split[0]):
        memory = split[0]
        count = int(split[1])
    elif len(split) == 2 and memory_pattern.match(split[1]):
        manufacturer, memory = split
    elif len(split) == 1 and memory_pattern.match(split[0]):
        memory = split[0]
    else:
        # it is just an accelerator name, not a memory size
        return [(accelerators, True)]

    # we know we have some case of manufacturer, memory, count, now we
    # need to convert that to a list of possible accelerators
    memory_parsed = resources_utils.parse_memory_resource(memory,
                                                          'accelerators',
                                                          allow_plus=True)
    # A trailing '+' on the parsed value means "at least this much memory".
    plus = memory_parsed[-1] == '+'
    if plus:
        memory_parsed = memory_parsed[:-1]
    # presumably the parsed value is in GB, as the name suggests; confirm
    # against resources_utils.parse_memory_resource
    memory_gb = int(memory_parsed)

    # Kept in a separate name so the str parameter is not rebound to a list.
    candidates = [
        (f'{device}:{count}', False)
        for device in accelerator_registry.get_devices_by_memory(
            memory_gb, plus, manufacturer=manufacturer)
    ]

    return candidates
|
1929
|
+
|
1846
1930
|
@classmethod
|
1847
1931
|
def from_yaml_config(
|
1848
1932
|
cls, config: Optional[Dict[str, Any]]
|
1849
1933
|
) -> Union[Set['Resources'], List['Resources']]:
|
1934
|
+
"""Creates Resources objects from a YAML config.
|
1935
|
+
|
1936
|
+
Args:
|
1937
|
+
config: A dict of resource config.
|
1938
|
+
|
1939
|
+
Returns:
|
1940
|
+
A set of Resources objects if any_of is specified, otherwise a list
|
1941
|
+
of Resources objects if ordered is specified, otherwise a set with
|
1942
|
+
a single Resources object.
|
1943
|
+
"""
|
1850
1944
|
if config is None:
|
1851
1945
|
return {Resources()}
|
1852
1946
|
|
@@ -1903,13 +1997,48 @@ class Resources:
|
|
1903
1997
|
accelerators = config.get('accelerators')
|
1904
1998
|
if config and accelerators is not None:
|
1905
1999
|
if isinstance(accelerators, str):
|
1906
|
-
|
2000
|
+
accelerators_list = cls._parse_accelerators_from_str(
|
2001
|
+
accelerators)
|
1907
2002
|
elif isinstance(accelerators, dict):
|
1908
|
-
|
2003
|
+
accelerator_names = [
|
1909
2004
|
f'{k}:{v}' if v is not None else f'{k}'
|
1910
2005
|
for k, v in accelerators.items()
|
1911
2006
|
]
|
1912
|
-
|
2007
|
+
accelerators_list = []
|
2008
|
+
for accel_name in accelerator_names:
|
2009
|
+
parsed_accels = cls._parse_accelerators_from_str(accel_name)
|
2010
|
+
accelerators_list.extend(parsed_accels)
|
2011
|
+
elif isinstance(accelerators, list) or isinstance(
|
2012
|
+
accelerators, set):
|
2013
|
+
accelerators_list = []
|
2014
|
+
for accel_name in accelerators:
|
2015
|
+
parsed_accels = cls._parse_accelerators_from_str(accel_name)
|
2016
|
+
accelerators_list.extend(parsed_accels)
|
2017
|
+
else:
|
2018
|
+
assert False, ('Invalid accelerators type:'
|
2019
|
+
f'{type(accelerators)}')
|
2020
|
+
# now that accelerators is a list, we need to decide which to
|
2021
|
+
# include in the final set, however, there may be multiple copies
|
2022
|
+
# of the same accelerator, some given by name by the user and the
|
2023
|
+
# other copy being given by memory size. In this case, we only care
|
2024
|
+
# about the user specified ones (so we can give a warning if it
|
2025
|
+
# doesn't exist).
|
2026
|
+
accel_to_user_specified: Dict[str, bool] = collections.OrderedDict()
|
2027
|
+
for accel, user_specified in accelerators_list:
|
2028
|
+
# If this accelerator is not in dict yet, or if current one is
|
2029
|
+
# user specified and existing one is not, update the entry
|
2030
|
+
accel_to_user_specified[accel] = (user_specified or
|
2031
|
+
accel_to_user_specified.get(
|
2032
|
+
accel, False))
|
2033
|
+
|
2034
|
+
# only time we care about ordered is when we are given a list,
|
2035
|
+
# otherwise we default to a set
|
2036
|
+
accelerators_type = list if isinstance(accelerators, list) else set
|
2037
|
+
accelerators = accelerators_type([
|
2038
|
+
(accel, user_specified)
|
2039
|
+
for accel, user_specified in accel_to_user_specified.items()
|
2040
|
+
])
|
2041
|
+
|
1913
2042
|
if len(accelerators) > 1 and ordered_configs:
|
1914
2043
|
with ux_utils.print_exception_no_traceback():
|
1915
2044
|
raise ValueError(
|
@@ -1935,20 +2064,20 @@ class Resources:
|
|
1935
2064
|
# In Task, we store a list of resources, each with 1 accelerator.
|
1936
2065
|
# This for loop is for format conversion.
|
1937
2066
|
tmp_resources_list = []
|
1938
|
-
for acc in accelerators:
|
2067
|
+
for acc, user_specified in accelerators:
|
1939
2068
|
tmp_resource = config.copy()
|
1940
2069
|
tmp_resource['accelerators'] = acc
|
2070
|
+
if not user_specified:
|
2071
|
+
tmp_resource['_no_missing_accel_warnings'] = True
|
1941
2072
|
tmp_resources_list.append(
|
1942
2073
|
Resources._from_yaml_config_single(tmp_resource))
|
1943
2074
|
|
1944
2075
|
assert isinstance(accelerators, (list, set)), accelerators
|
1945
2076
|
return type(accelerators)(tmp_resources_list)
|
1946
|
-
|
1947
2077
|
return {Resources._from_yaml_config_single(config)}
|
1948
2078
|
|
1949
2079
|
@classmethod
|
1950
2080
|
def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
|
1951
|
-
|
1952
2081
|
resources_fields: Dict[str, Any] = {}
|
1953
2082
|
|
1954
2083
|
# Extract infra field if present
|
@@ -2010,6 +2139,8 @@ class Resources:
|
|
2010
2139
|
# although it will end up being an int, we don't know at this point
|
2011
2140
|
# if it has units or not, so we store it as a string
|
2012
2141
|
resources_fields['disk_size'] = str(resources_fields['disk_size'])
|
2142
|
+
resources_fields['_no_missing_accel_warnings'] = config.pop(
|
2143
|
+
'_no_missing_accel_warnings', None)
|
2013
2144
|
|
2014
2145
|
assert not config, f'Invalid resource args: {config.keys()}'
|
2015
2146
|
return Resources(**resources_fields)
|
@@ -2060,6 +2191,9 @@ class Resources:
|
|
2060
2191
|
config['volumes'] = volumes
|
2061
2192
|
if self._autostop_config is not None:
|
2062
2193
|
config['autostop'] = self._autostop_config.to_yaml_config()
|
2194
|
+
|
2195
|
+
add_if_not_none('_no_missing_accel_warnings',
|
2196
|
+
self._no_missing_accel_warnings)
|
2063
2197
|
add_if_not_none('priority', self.priority)
|
2064
2198
|
if self._docker_login_config is not None:
|
2065
2199
|
config['_docker_login_config'] = dataclasses.asdict(
|
@@ -2232,6 +2366,10 @@ class Resources:
|
|
2232
2366
|
if version < 27:
|
2233
2367
|
self._priority = None
|
2234
2368
|
|
2369
|
+
if version < 28:
|
2370
|
+
self._no_missing_accel_warnings = state.get(
|
2371
|
+
'_no_missing_accel_warnings', None)
|
2372
|
+
|
2235
2373
|
self.__dict__.update(state)
|
2236
2374
|
|
2237
2375
|
|
sky/server/server.py
CHANGED
@@ -39,6 +39,7 @@ from sky import models
|
|
39
39
|
from sky import sky_logging
|
40
40
|
from sky.data import storage_utils
|
41
41
|
from sky.jobs.server import server as jobs_rest
|
42
|
+
from sky.metrics import utils as metrics_utils
|
42
43
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
43
44
|
from sky.serve.server import server as serve_rest
|
44
45
|
from sky.server import common
|
@@ -218,14 +219,26 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
218
219
|
|
219
220
|
|
220
221
|
def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
|
221
|
-
|
222
|
+
header_name = os.environ.get(constants.ENV_VAR_SERVER_AUTH_USER_HEADER,
|
223
|
+
'X-Auth-Request-Email')
|
224
|
+
if header_name not in request.headers:
|
222
225
|
return None
|
223
|
-
user_name = request.headers[
|
226
|
+
user_name = request.headers[header_name]
|
224
227
|
user_hash = hashlib.md5(
|
225
228
|
user_name.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
|
226
229
|
return models.User(id=user_hash, name=user_name)
|
227
230
|
|
228
231
|
|
232
|
+
class InitializeRequestAuthUserMiddleware(
        starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware that defaults request.state.auth_user to None."""

    async def dispatch(self, request: fastapi.Request, call_next):
        # Guarantee the attribute exists before anything downstream reads
        # it; otherwise, we may get an error while trying to read it.
        setattr(request.state, 'auth_user', None)
        response = await call_next(request)
        return response
|
240
|
+
|
241
|
+
|
229
242
|
class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
230
243
|
"""Middleware to handle HTTP Basic Auth."""
|
231
244
|
|
@@ -406,6 +419,18 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
406
419
|
async def dispatch(self, request: fastapi.Request, call_next):
|
407
420
|
auth_user = _get_auth_user_header(request)
|
408
421
|
|
422
|
+
if request.state.auth_user is not None:
|
423
|
+
# Previous middleware is trusted more than this middleware. For
|
424
|
+
# instance, a client could set the Authorization and the
|
425
|
+
# X-Auth-Request-Email header. In that case, the auth proxy will be
|
426
|
+
# skipped and we should rely on the Bearer token to authenticate the
|
427
|
+
# user - but that means the user could set X-Auth-Request-Email to
|
428
|
+
# whatever the user wants. We should thus ignore it.
|
429
|
+
if auth_user is not None:
|
430
|
+
logger.debug('Warning: ignoring auth proxy header since the '
|
431
|
+
'auth user was already set.')
|
432
|
+
return await call_next(request)
|
433
|
+
|
409
434
|
# Add user to database if auth_user is present
|
410
435
|
if auth_user is not None:
|
411
436
|
newly_added = global_user_state.add_or_update_user(auth_user)
|
@@ -416,8 +441,6 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
416
441
|
# Store user info in request.state for access by GET endpoints
|
417
442
|
if auth_user is not None:
|
418
443
|
request.state.auth_user = auth_user
|
419
|
-
else:
|
420
|
-
request.state.auth_user = None
|
421
444
|
|
422
445
|
await _override_user_info_in_request_body(request, auth_user)
|
423
446
|
return await call_next(request)
|
@@ -536,10 +559,17 @@ class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
536
559
|
|
537
560
|
|
538
561
|
app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
|
562
|
+
# Middleware wraps in the order defined here. E.g., given
|
563
|
+
# app.add_middleware(Middleware1)
|
564
|
+
# app.add_middleware(Middleware2)
|
565
|
+
# app.add_middleware(Middleware3)
|
566
|
+
# The effect will be like:
|
567
|
+
# Middleware3(Middleware2(Middleware1(request)))
|
568
|
+
# If MiddlewareN does something like print(n); call_next(); print(n), you'll get
|
569
|
+
# 3; 2; 1; <request>; 1; 2; 3
|
539
570
|
# Use environment variable to make the metrics middleware optional.
|
540
571
|
if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
|
541
572
|
app.add_middleware(metrics.PrometheusMiddleware)
|
542
|
-
app.add_middleware(RBACMiddleware)
|
543
573
|
app.add_middleware(InternalDashboardPrefixMiddleware)
|
544
574
|
app.add_middleware(GracefulShutdownMiddleware)
|
545
575
|
app.add_middleware(PathCleanMiddleware)
|
@@ -552,15 +582,26 @@ app.add_middleware(
|
|
552
582
|
allow_credentials=True,
|
553
583
|
allow_methods=['*'],
|
554
584
|
allow_headers=['*'],
|
555
|
-
# TODO(syang): remove X-Request-ID when v0.10.0 is released.
|
585
|
+
# TODO(syang): remove X-Request-ID when v0.10.0 is released.
|
556
586
|
expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
|
587
|
+
# The order of all the authentication-related middleware is important.
|
588
|
+
# RBACMiddleware must precede all the auth middleware, so it can access
|
589
|
+
# request.state.auth_user.
|
590
|
+
app.add_middleware(RBACMiddleware)
|
591
|
+
# AuthProxyMiddleware should precede BasicAuthMiddleware and
|
592
|
+
# BearerTokenMiddleware, since it should be skipped if either of those set the
|
593
|
+
# auth user.
|
594
|
+
app.add_middleware(AuthProxyMiddleware)
|
557
595
|
enable_basic_auth = os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false')
|
558
596
|
if str(enable_basic_auth).lower() == 'true':
|
559
597
|
app.add_middleware(BasicAuthMiddleware)
|
560
598
|
# Bearer token middleware should always be present to handle service account
|
561
599
|
# authentication
|
562
600
|
app.add_middleware(BearerTokenMiddleware)
|
563
|
-
|
601
|
+
# InitializeRequestAuthUserMiddleware must be the last added middleware so that
|
602
|
+
# request.state.auth_user is always set, but can be overridden by the auth
|
603
|
+
# middleware above.
|
604
|
+
app.add_middleware(InitializeRequestAuthUserMiddleware)
|
564
605
|
app.add_middleware(RequestIDMiddleware)
|
565
606
|
app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
|
566
607
|
app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
|
@@ -1576,6 +1617,38 @@ async def all_contexts(request: fastapi.Request) -> None:
|
|
1576
1617
|
)
|
1577
1618
|
|
1578
1619
|
|
1620
|
+
@app.get('/gpu-metrics')
async def gpu_metrics() -> fastapi.Response:
    """Gets the GPU metrics from multiple external k8s clusters.

    Metrics are fetched concurrently for every context except 'in-cluster'.
    A context whose collection fails is logged and left out; the remaining
    results are concatenated and returned as plain text.

    Returns:
        A plain-text response holding the combined metrics of all contexts
        that were collected successfully (empty if none were).
    """
    # Filter before creating tasks so that each result can be paired with
    # the exact context it belongs to.
    contexts = [
        context for context in core.get_all_contexts()
        if context != 'in-cluster'
    ]

    tasks = [
        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
        for context in contexts
    ]

    results = await asyncio.gather(*tasks, return_exceptions=True)

    all_metrics = []
    for context, result in zip(contexts, results):
        # BaseException also covers asyncio.CancelledError, which is not an
        # Exception subclass on newer Python versions and would otherwise be
        # passed on to str.join below.
        if isinstance(result, BaseException):
            logger.error(
                f'Failed to get metrics for context {context}: {result}')
        else:
            all_metrics.append(result)

    combined_metrics = '\n\n'.join(all_metrics)

    # Return as plain text for Prometheus compatibility
    return fastapi.Response(
        content=combined_metrics,
        media_type='text/plain; version=0.0.4; charset=utf-8')
|
1650
|
+
|
1651
|
+
|
1579
1652
|
# === Internal APIs ===
|
1580
1653
|
@app.get('/api/completion/cluster_name')
|
1581
1654
|
async def complete_cluster_name(incomplete: str,) -> List[str]:
|
sky/setup_files/MANIFEST.in
CHANGED