skypilot-nightly 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250702__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. sky/__init__.py +2 -2
  2. sky/catalog/__init__.py +1 -1
  3. sky/client/cli/command.py +60 -21
  4. sky/dashboard/out/404.html +1 -1
  5. sky/dashboard/out/_next/static/{Md3rlE87jmL5uv7gSo8mR → N5IdFnjR1RaPGBAVYeTIr}/_buildManifest.js +1 -1
  6. sky/dashboard/out/_next/static/chunks/9984.b56614f3c4c5961d.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1159f362b960e2b8.js +6 -0
  8. sky/dashboard/out/_next/static/chunks/{webpack-d427db53e54de9ce.js → webpack-9a81ea998672c303.js} +1 -1
  9. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  10. sky/dashboard/out/clusters/[cluster].html +1 -1
  11. sky/dashboard/out/clusters.html +1 -1
  12. sky/dashboard/out/config.html +1 -1
  13. sky/dashboard/out/index.html +1 -1
  14. sky/dashboard/out/infra/[context].html +1 -1
  15. sky/dashboard/out/infra.html +1 -1
  16. sky/dashboard/out/jobs/[job].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/dashboard/out/users.html +1 -1
  19. sky/dashboard/out/volumes.html +1 -1
  20. sky/dashboard/out/workspace/new.html +1 -1
  21. sky/dashboard/out/workspaces/[name].html +1 -1
  22. sky/dashboard/out/workspaces.html +1 -1
  23. sky/metrics/utils.py +210 -0
  24. sky/optimizer.py +1 -1
  25. sky/resources.py +145 -7
  26. sky/server/server.py +80 -7
  27. sky/setup_files/MANIFEST.in +1 -0
  28. sky/skylet/constants.py +3 -0
  29. sky/skypilot_config.py +62 -53
  30. sky/utils/accelerator_registry.py +28 -1
  31. sky/utils/schemas.py +3 -0
  32. sky/utils/ux_utils.py +9 -4
  33. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/METADATA +1 -1
  34. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/RECORD +39 -38
  35. sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +0 -1
  36. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b8e1114e6d38218c.js +0 -6
  37. /sky/dashboard/out/_next/static/{Md3rlE87jmL5uv7gSo8mR → N5IdFnjR1RaPGBAVYeTIr}/_ssgManifest.js +0 -0
  38. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/WHEEL +0 -0
  39. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/entry_points.txt +0 -0
  40. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/licenses/LICENSE +0 -0
  41. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-d427db53e54de9ce.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-909d53833da080cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"Md3rlE87jmL5uv7gSo8mR","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-909d53833da080cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"N5IdFnjR1RaPGBAVYeTIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-d427db53e54de9ce.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"Md3rlE87jmL5uv7gSo8mR","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"N5IdFnjR1RaPGBAVYeTIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/metrics/utils.py ADDED
@@ -0,0 +1,210 @@
1
+ """Utilities for processing GPU metrics from Kubernetes clusters."""
2
+ import os
3
+ import re
4
+ import subprocess
5
+ import time
6
+ from typing import List, Optional, Tuple
7
+
8
+ import httpx
9
+
10
+
11
def start_svc_port_forward(context: str, namespace: str, service: str,
                           service_port: int) -> Tuple[subprocess.Popen, int]:
    """Starts a port forward to a service in a Kubernetes cluster.

    kubectl is asked to choose a free local port; the chosen port is parsed
    from the 'Forwarding from 127.0.0.1:<port>' line it prints on startup.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to

    Returns:
        Tuple of (subprocess.Popen process, local_port assigned)

    Raises:
        RuntimeError: If kubectl exits with a non-zero status before
            reporting a port, or if no port is reported within the startup
            timeout.
    """
    # Function-scope imports: only this function needs them.
    import queue  # pylint: disable=import-outside-toplevel
    import threading  # pylint: disable=import-outside-toplevel

    start_port_forward_timeout = 10  # seconds
    terminate_port_forward_timeout = 5  # seconds

    # Use ':service_port' to let kubectl choose the local port
    cmd = [
        'kubectl', '--context', context, '-n', namespace, 'port-forward',
        f'service/{service}', f':{service_port}'
    ]

    env = os.environ.copy()
    if 'KUBECONFIG' not in env:
        env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')

    port_forward_process = subprocess.Popen(cmd,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.STDOUT,
                                            text=True,
                                            env=env)

    # Reading the pipe directly would block until kubectl prints a full line,
    # which makes the startup timeout unenforceable if kubectl hangs silently.
    # A reader thread feeds lines into a queue so the wait below can time out.
    output_lines: 'queue.Queue[Optional[str]]' = queue.Queue()
    port_found = threading.Event()

    def _pump_output() -> None:
        stream = port_forward_process.stdout
        try:
            if stream is not None:
                for output_line in stream:
                    # Once the port is known, keep draining the pipe but stop
                    # buffering, so the queue cannot grow without bound.
                    if not port_found.is_set():
                        output_lines.put(output_line)
        except ValueError:
            # The stream was closed while we were reading from it.
            pass
        finally:
            # None marks end-of-output for the waiting caller.
            output_lines.put(None)

    threading.Thread(target=_pump_output, daemon=True).start()

    local_port = None
    deadline = time.monotonic() + start_port_forward_timeout

    # wait for the port forward to start and extract the local port
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        try:
            line = output_lines.get(timeout=remaining)
        except queue.Empty:
            break
        if line is None:
            # kubectl closed its output without announcing a port; give it a
            # moment to exit so its return code can be inspected below.
            try:
                port_forward_process.wait(
                    timeout=terminate_port_forward_timeout)
            except subprocess.TimeoutExpired:
                pass
            break
        # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
        match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
        if match:
            local_port = int(match.group(1))
            break

    if local_port is not None:
        port_found.set()
        return port_forward_process, local_port

    if port_forward_process.poll() is None:
        # Still running but never reported a port: shut it down before
        # reporting the failure, escalating to kill if it ignores terminate.
        port_forward_process.terminate()
        try:
            port_forward_process.wait(timeout=terminate_port_forward_timeout)
        except subprocess.TimeoutExpired:
            port_forward_process.kill()
            port_forward_process.wait()
    elif port_forward_process.returncode != 0:
        raise RuntimeError(
            f'Port forward failed for service {service} in namespace '
            f'{namespace} on context {context}')

    raise RuntimeError(
        f'Failed to extract local port for service {service} in '
        f'namespace {namespace} on context {context}')
83
+
84
+
85
def stop_svc_port_forward(port_forward_process: subprocess.Popen) -> None:
    """Stops a port forward to a service in a Kubernetes cluster.

    Sends terminate, waits up to 5 seconds, then falls back to kill. The
    pipes opened for the process are closed afterwards so their file
    descriptors are released immediately instead of at garbage collection.
    Calling this on a process that has already exited is a no-op apart from
    closing the pipes.

    Args:
        port_forward_process: The subprocess.Popen process to terminate
    """
    try:
        if port_forward_process.poll() is None:
            port_forward_process.terminate()
            try:
                port_forward_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # The process ignored terminate; force it down.
                port_forward_process.kill()
                port_forward_process.wait()
    finally:
        # Closing an already-closed stream is harmless, so this stays
        # idempotent across repeated calls.
        for stream in (port_forward_process.stdin,
                       port_forward_process.stdout,
                       port_forward_process.stderr):
            if stream is not None:
                stream.close()
96
+
97
+
98
async def send_metrics_request_with_port_forward(
        context: str,
        namespace: str,
        service: str,
        service_port: int,
        endpoint_path: str = '/federate',
        match_patterns: Optional[List[str]] = None,
        timeout: float = 30.0) -> str:
    """Fetches metrics text from a service through a temporary port forward.

    A port forward is opened with start_svc_port_forward(), a single GET is
    issued against http://localhost:<local_port><endpoint_path>, and the port
    forward is torn down again regardless of the outcome.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to
        endpoint_path: Path to append to the localhost endpoint (e.g.,
            '/federate')
        match_patterns: Metric patterns sent as repeated 'match[]' query
            parameters; when empty or None no query parameters are sent
        timeout: Timeout in seconds handed to the httpx client

    Returns:
        Response text containing the metrics

    Raises:
        RuntimeError: If the port forward cannot be started.
        Exceptions raised by httpx for the request itself, including the one
        from response.raise_for_status(), propagate unchanged.
    """
    forwarder = None
    try:
        # NOTE(review): start_svc_port_forward is a synchronous call made from
        # a coroutine, so the event loop waits on it.
        forwarder, local_port = start_svc_port_forward(context, namespace,
                                                       service, service_port)
        url = f'http://localhost:{local_port}{endpoint_path}'

        async with httpx.AsyncClient(timeout=timeout) as client:
            if not match_patterns:
                response = await client.get(url)
            else:
                # For federate endpoint, add match[] parameters
                query = [('match[]', pattern) for pattern in match_patterns]
                response = await client.get(url, params=query)

            response.raise_for_status()
            return response.text
    finally:
        # Always clean up port forward
        if forwarder:
            stop_svc_port_forward(forwarder)
147
+
148
+
149
def _find_label_set_end(line: str, brace_start: int) -> int:
    """Returns the index of the '}' closing the label set, or -1.

    Braces inside double-quoted label values are skipped, and backslash
    escapes inside quotes are honoured.
    """
    in_quotes = False
    escaped = False
    for idx in range(brace_start + 1, len(line)):
        char = line[idx]
        if escaped:
            escaped = False
        elif in_quotes and char == '\\':
            escaped = True
        elif char == '"':
            in_quotes = not in_quotes
        elif char == '}' and not in_quotes:
            return idx
    return -1


async def add_cluster_name_label(metrics_text: str, context: str) -> str:
    """Adds a cluster="<context>" label to each metric sample line.

    Comment lines (starting with '#') and blank lines are passed through
    unchanged. Samples with a label set get the cluster label prepended to
    it; samples without a label set get a new one. Lines that cannot be
    recognised as samples (e.g. unbalanced braces) are left as-is.

    Args:
        metrics_text: The text containing the metrics
        context: The cluster name, used as the label value

    Returns:
        The metrics text with the cluster label added.
    """
    # Escape the characters that would otherwise terminate or corrupt a
    # double-quoted label value.
    escaped_context = (context.replace('\\', '\\\\').replace(
        '"', '\\"').replace('\n', '\\n'))
    cluster_label = f'cluster="{escaped_context}"'

    modified_lines = []
    for line in metrics_text.strip().split('\n'):
        # keep comment lines and empty lines as-is
        if line.startswith('#') or not line.strip():
            modified_lines.append(line)
            continue

        brace_start = line.find('{')
        if brace_start != -1:
            # A '}' may legally appear inside a quoted label value, so the
            # closing brace is located with a quote-aware scan.
            brace_end = _find_label_set_end(line, brace_start)
            if brace_end == -1:
                modified_lines.append(line)
                continue
            metric_name = line[:brace_start]
            existing_labels = line[brace_start + 1:brace_end]
            rest_of_line = line[brace_end + 1:]
            if existing_labels.strip():
                new_labels = f'{cluster_label},{existing_labels}'
            else:
                new_labels = cluster_label
            modified_lines.append(
                f'{metric_name}{{{new_labels}}}{rest_of_line}')
            continue

        # No label set: '<name> <value> [timestamp]'.
        parts = line.split(None, 1)
        if len(parts) == 2 and '}' not in line:
            modified_lines.append(
                f'{parts[0]}{{{cluster_label}}} {parts[1]}')
        else:
            # keep other lines as-is
            modified_lines.append(line)

    return '\n'.join(modified_lines)
183
+
184
+
185
async def get_metrics_for_context(
        context: str,
        namespace: str = 'skypilot',
        service: str = 'skypilot-prometheus-server',
        service_port: int = 80) -> str:
    """Get GPU metrics for a single Kubernetes context.

    Queries the '/federate' endpoint of the given service through a
    temporary port forward and tags every returned sample with the context
    as its cluster label.

    Args:
        context: Kubernetes context name
        namespace: Namespace of the Prometheus service
        service: Name of the Prometheus service
        service_port: Port exposed by the Prometheus service

    Returns:
        metrics_text: String containing the metrics

    Raises:
        Exception: If metrics collection fails for any reason
    """
    # Query both DCGM metrics and kube_pod_labels metrics
    # This ensures the dashboard can perform joins to filter by skypilot cluster
    match_patterns = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']

    # TODO(rohan): source the namespace/service defaults from configuration
    # instead of hardcoding them in the signature.
    metrics_text = await send_metrics_request_with_port_forward(
        context=context,
        namespace=namespace,
        service=service,
        service_port=service_port,
        endpoint_path='/federate',
        match_patterns=match_patterns)

    # add cluster name as a label to each metric line
    return await add_cluster_name_label(metrics_text, context)
sky/optimizer.py CHANGED
@@ -1375,7 +1375,7 @@ def _fill_in_launchable_resources(
1375
1375
  num_node_str = ''
1376
1376
  if task.num_nodes > 1:
1377
1377
  num_node_str = f'{task.num_nodes}x '
1378
- if not quiet:
1378
+ if not (quiet or resources.no_missing_accel_warnings):
1379
1379
  logger.info(
1380
1380
  f'No resource satisfying {num_node_str}'
1381
1381
  f'{resources.repr_with_region_zone} on {clouds_str}.')
sky/resources.py CHANGED
@@ -1,6 +1,8 @@
1
1
  """Resources: compute requirements of Tasks."""
2
+ import collections
2
3
  import dataclasses
3
4
  import math
5
+ import re
4
6
  import textwrap
5
7
  import typing
6
8
  from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
@@ -41,6 +43,20 @@ RESOURCE_CONFIG_ALIASES = {
41
43
  'gpus': 'accelerators',
42
44
  }
43
45
 
46
+ MEMORY_SIZE_UNITS = {
47
+ 'b': 1,
48
+ 'k': 2**10,
49
+ 'kb': 2**10,
50
+ 'm': 2**20,
51
+ 'mb': 2**20,
52
+ 'g': 2**30,
53
+ 'gb': 2**30,
54
+ 't': 2**40,
55
+ 'tb': 2**40,
56
+ 'p': 2**50,
57
+ 'pb': 2**50,
58
+ }
59
+
44
60
 
45
61
  @dataclasses.dataclass
46
62
  class AutostopConfig:
@@ -110,7 +126,7 @@ class Resources:
110
126
  """
111
127
  # If any fields changed, increment the version. For backward compatibility,
112
128
  # modify the __setstate__ method to handle the old version.
113
- _VERSION = 27
129
+ _VERSION = 28
114
130
 
115
131
  def __init__(
116
132
  self,
@@ -142,6 +158,7 @@ class Resources:
142
158
  _is_image_managed: Optional[bool] = None,
143
159
  _requires_fuse: Optional[bool] = None,
144
160
  _cluster_config_overrides: Optional[Dict[str, Any]] = None,
161
+ _no_missing_accel_warnings: Optional[bool] = None,
145
162
  ):
146
163
  """Initialize a Resources object.
147
164
 
@@ -366,6 +383,7 @@ class Resources:
366
383
 
367
384
  self._cluster_config_overrides = _cluster_config_overrides
368
385
  self._cached_repr: Optional[str] = None
386
+ self._no_missing_accel_warnings = _no_missing_accel_warnings
369
387
 
370
388
  # Initialize _priority before calling the setter
371
389
  self._priority: Optional[int] = None
@@ -649,6 +667,13 @@ class Resources:
649
667
  return False
650
668
  return self._requires_fuse
651
669
 
670
@property
def no_missing_accel_warnings(self) -> bool:
    """Returns the stored no-missing-accelerator-warnings flag.

    An unset flag (None) is reported as False.
    """
    flag = self._no_missing_accel_warnings
    return False if flag is None else flag
676
+
652
677
  def set_requires_fuse(self, value: bool) -> None:
653
678
  """Sets whether this resource requires FUSE mounting support.
654
679
 
@@ -754,6 +779,8 @@ class Resources:
754
779
  if ':' not in accelerators:
755
780
  accelerators = {accelerators: 1}
756
781
  else:
782
+ assert isinstance(accelerators,
783
+ str), (type(accelerators), accelerators)
757
784
  splits = accelerators.split(':')
758
785
  parse_error = ('The "accelerators" field as a str '
759
786
  'should be <name> or <name>:<cnt>. '
@@ -1778,6 +1805,8 @@ class Resources:
1778
1805
  self._is_image_managed),
1779
1806
  _requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
1780
1807
  _cluster_config_overrides=override_configs,
1808
+ _no_missing_accel_warnings=override.pop(
1809
+ 'no_missing_accel_warnings', self._no_missing_accel_warnings),
1781
1810
  )
1782
1811
  assert not override
1783
1812
  return resources
@@ -1843,10 +1872,75 @@ class Resources:
1843
1872
  config[canonical] = config[alias]
1844
1873
  del config[alias]
1845
1874
 
1875
@classmethod
def _parse_accelerators_from_str(
        cls, accelerators: str) -> List[Tuple[str, bool]]:
    """Parse accelerators string into a list of possible accelerators.

    Recognised memory-based forms, where <memory> is digits followed by
    GB/MB/TB (any case) and an optional trailing '+':
        <manufacturer>:<memory>:<count>
        <memory>:<count>
        <manufacturer>:<memory>
        <memory>
    Anything else is treated as a plain accelerator name and returned
    unchanged.

    Returns:
        A list of possible accelerators. Each element is a tuple of
        (accelerator_name, was_user_specified). was_user_specified is True
        if the accelerator was directly named by the user (for example
        "H100:2" would be True, but "80GB+" would be False since it doesn't
        mention the name of the accelerator).

    Raises:
        ValueError: If a three-part spec does not carry a memory size in
            its second field, or a count field is not an integer.
    """
    # sanity check
    assert isinstance(accelerators, str), accelerators

    memory_spec = re.compile(r'^[0-9]+[GgMmTt][Bb]\+?$')

    manufacturer = None
    memory = None
    count = 1

    split = accelerators.split(':')
    if len(split) == 3:
        manufacturer, memory, count_str = split
        count = int(count_str)
        if not memory_spec.match(memory):
            # Raised explicitly rather than asserted: this validates user
            # input and must survive `python -O`.
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    'If specifying a GPU manufacturer, you must also '
                    'specify the memory size')
    elif len(split) == 2 and memory_spec.match(split[0]):
        memory = split[0]
        count = int(split[1])
    elif len(split) == 2 and memory_spec.match(split[1]):
        manufacturer, memory = split
    elif len(split) == 1 and memory_spec.match(split[0]):
        memory = split[0]
    else:
        # it is just an accelerator name, not a memory size
        return [(accelerators, True)]

    # we know we have some case of manufacturer, memory, count, now we
    # need to convert that to a list of possible accelerators
    memory_parsed = resources_utils.parse_memory_resource(memory,
                                                          'accelerators',
                                                          allow_plus=True)
    # A trailing '+' is stripped and passed on as a separate flag.
    plus = memory_parsed.endswith('+')
    if plus:
        memory_parsed = memory_parsed[:-1]
    memory_gb = int(memory_parsed)

    return [(f'{device}:{count}', False)
            for device in accelerator_registry.get_devices_by_memory(
                memory_gb, plus, manufacturer=manufacturer)]
1929
+
1846
1930
  @classmethod
1847
1931
  def from_yaml_config(
1848
1932
  cls, config: Optional[Dict[str, Any]]
1849
1933
  ) -> Union[Set['Resources'], List['Resources']]:
1934
+ """Creates Resources objects from a YAML config.
1935
+
1936
+ Args:
1937
+ config: A dict of resource config.
1938
+
1939
+ Returns:
1940
+ A set of Resources objects if any_of is specified, otherwise a list
1941
+ of Resources objects if ordered is specified, otherwise a set with
1942
+ a single Resources object.
1943
+ """
1850
1944
  if config is None:
1851
1945
  return {Resources()}
1852
1946
 
@@ -1903,13 +1997,48 @@ class Resources:
1903
1997
  accelerators = config.get('accelerators')
1904
1998
  if config and accelerators is not None:
1905
1999
  if isinstance(accelerators, str):
1906
- accelerators = {accelerators}
2000
+ accelerators_list = cls._parse_accelerators_from_str(
2001
+ accelerators)
1907
2002
  elif isinstance(accelerators, dict):
1908
- accelerators = [
2003
+ accelerator_names = [
1909
2004
  f'{k}:{v}' if v is not None else f'{k}'
1910
2005
  for k, v in accelerators.items()
1911
2006
  ]
1912
- accelerators = set(accelerators)
2007
+ accelerators_list = []
2008
+ for accel_name in accelerator_names:
2009
+ parsed_accels = cls._parse_accelerators_from_str(accel_name)
2010
+ accelerators_list.extend(parsed_accels)
2011
+ elif isinstance(accelerators, list) or isinstance(
2012
+ accelerators, set):
2013
+ accelerators_list = []
2014
+ for accel_name in accelerators:
2015
+ parsed_accels = cls._parse_accelerators_from_str(accel_name)
2016
+ accelerators_list.extend(parsed_accels)
2017
+ else:
2018
+ assert False, ('Invalid accelerators type:'
2019
+ f'{type(accelerators)}')
2020
+ # now that accelerators is a list, we need to decide which to
2021
+ # include in the final set, however, there may be multiple copies
2022
+ # of the same accelerator, some given by name by the user and the
2023
+ # other copy being given by memory size. In this case, we only care
2024
+ # about the user specified ones (so we can give a warning if it
2025
+ # doesn't exist).
2026
+ accel_to_user_specified: Dict[str, bool] = collections.OrderedDict()
2027
+ for accel, user_specified in accelerators_list:
2028
+ # If this accelerator is not in dict yet, or if current one is
2029
+ # user specified and existing one is not, update the entry
2030
+ accel_to_user_specified[accel] = (user_specified or
2031
+ accel_to_user_specified.get(
2032
+ accel, False))
2033
+
2034
+ # only time we care about ordered is when we are given a list,
2035
+ # otherwise we default to a set
2036
+ accelerators_type = list if isinstance(accelerators, list) else set
2037
+ accelerators = accelerators_type([
2038
+ (accel, user_specified)
2039
+ for accel, user_specified in accel_to_user_specified.items()
2040
+ ])
2041
+
1913
2042
  if len(accelerators) > 1 and ordered_configs:
1914
2043
  with ux_utils.print_exception_no_traceback():
1915
2044
  raise ValueError(
@@ -1935,20 +2064,20 @@ class Resources:
1935
2064
  # In Task, we store a list of resources, each with 1 accelerator.
1936
2065
  # This for loop is for format conversion.
1937
2066
  tmp_resources_list = []
1938
- for acc in accelerators:
2067
+ for acc, user_specified in accelerators:
1939
2068
  tmp_resource = config.copy()
1940
2069
  tmp_resource['accelerators'] = acc
2070
+ if not user_specified:
2071
+ tmp_resource['_no_missing_accel_warnings'] = True
1941
2072
  tmp_resources_list.append(
1942
2073
  Resources._from_yaml_config_single(tmp_resource))
1943
2074
 
1944
2075
  assert isinstance(accelerators, (list, set)), accelerators
1945
2076
  return type(accelerators)(tmp_resources_list)
1946
-
1947
2077
  return {Resources._from_yaml_config_single(config)}
1948
2078
 
1949
2079
  @classmethod
1950
2080
  def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
1951
-
1952
2081
  resources_fields: Dict[str, Any] = {}
1953
2082
 
1954
2083
  # Extract infra field if present
@@ -2010,6 +2139,8 @@ class Resources:
2010
2139
  # although it will end up being an int, we don't know at this point
2011
2140
  # if it has units or not, so we store it as a string
2012
2141
  resources_fields['disk_size'] = str(resources_fields['disk_size'])
2142
+ resources_fields['_no_missing_accel_warnings'] = config.pop(
2143
+ '_no_missing_accel_warnings', None)
2013
2144
 
2014
2145
  assert not config, f'Invalid resource args: {config.keys()}'
2015
2146
  return Resources(**resources_fields)
@@ -2060,6 +2191,9 @@ class Resources:
2060
2191
  config['volumes'] = volumes
2061
2192
  if self._autostop_config is not None:
2062
2193
  config['autostop'] = self._autostop_config.to_yaml_config()
2194
+
2195
+ add_if_not_none('_no_missing_accel_warnings',
2196
+ self._no_missing_accel_warnings)
2063
2197
  add_if_not_none('priority', self.priority)
2064
2198
  if self._docker_login_config is not None:
2065
2199
  config['_docker_login_config'] = dataclasses.asdict(
@@ -2232,6 +2366,10 @@ class Resources:
2232
2366
  if version < 27:
2233
2367
  self._priority = None
2234
2368
 
2369
+ if version < 28:
2370
+ self._no_missing_accel_warnings = state.get(
2371
+ '_no_missing_accel_warnings', None)
2372
+
2235
2373
  self.__dict__.update(state)
2236
2374
 
2237
2375
 
sky/server/server.py CHANGED
@@ -39,6 +39,7 @@ from sky import models
39
39
  from sky import sky_logging
40
40
  from sky.data import storage_utils
41
41
  from sky.jobs.server import server as jobs_rest
42
+ from sky.metrics import utils as metrics_utils
42
43
  from sky.provision.kubernetes import utils as kubernetes_utils
43
44
  from sky.serve.server import server as serve_rest
44
45
  from sky.server import common
@@ -218,14 +219,26 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
218
219
 
219
220
 
220
221
  def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
221
- if 'X-Auth-Request-Email' not in request.headers:
222
+ header_name = os.environ.get(constants.ENV_VAR_SERVER_AUTH_USER_HEADER,
223
+ 'X-Auth-Request-Email')
224
+ if header_name not in request.headers:
222
225
  return None
223
- user_name = request.headers['X-Auth-Request-Email']
226
+ user_name = request.headers[header_name]
224
227
  user_hash = hashlib.md5(
225
228
  user_name.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
226
229
  return models.User(id=user_hash, name=user_name)
227
230
 
228
231
 
232
class InitializeRequestAuthUserMiddleware(
        starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware that defaults request.state.auth_user to None."""

    async def dispatch(self, request: fastapi.Request, call_next):
        # Guarantee the attribute exists before anything downstream reads
        # it; otherwise reading request.state.auth_user may fail.
        request.state.auth_user = None
        response = await call_next(request)
        return response
240
+
241
+
229
242
  class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
230
243
  """Middleware to handle HTTP Basic Auth."""
231
244
 
@@ -406,6 +419,18 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
406
419
  async def dispatch(self, request: fastapi.Request, call_next):
407
420
  auth_user = _get_auth_user_header(request)
408
421
 
422
+ if request.state.auth_user is not None:
423
+ # Previous middleware is trusted more than this middleware. For
424
+ # instance, a client could set the Authorization and the
425
+ # X-Auth-Request-Email header. In that case, the auth proxy will be
426
+ # skipped and we should rely on the Bearer token to authenticate the
427
+ # user - but that means the user could set X-Auth-Request-Email to
428
+ # whatever the user wants. We should thus ignore it.
429
+ if auth_user is not None:
430
+ logger.debug('Warning: ignoring auth proxy header since the '
431
+ 'auth user was already set.')
432
+ return await call_next(request)
433
+
409
434
  # Add user to database if auth_user is present
410
435
  if auth_user is not None:
411
436
  newly_added = global_user_state.add_or_update_user(auth_user)
@@ -416,8 +441,6 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
416
441
  # Store user info in request.state for access by GET endpoints
417
442
  if auth_user is not None:
418
443
  request.state.auth_user = auth_user
419
- else:
420
- request.state.auth_user = None
421
444
 
422
445
  await _override_user_info_in_request_body(request, auth_user)
423
446
  return await call_next(request)
@@ -536,10 +559,17 @@ class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
536
559
 
537
560
 
538
561
  app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
562
+ # Middleware wraps in the order defined here. E.g., given
563
+ # app.add_middleware(Middleware1)
564
+ # app.add_middleware(Middleware2)
565
+ # app.add_middleware(Middleware3)
566
+ # The effect will be like:
567
+ # Middleware3(Middleware2(Middleware1(request)))
568
+ # If MiddlewareN does something like print(n); call_next(); print(n), you'll get
569
+ # 3; 2; 1; <request>; 1; 2; 3
539
570
  # Use environment variable to make the metrics middleware optional.
540
571
  if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
541
572
  app.add_middleware(metrics.PrometheusMiddleware)
542
- app.add_middleware(RBACMiddleware)
543
573
  app.add_middleware(InternalDashboardPrefixMiddleware)
544
574
  app.add_middleware(GracefulShutdownMiddleware)
545
575
  app.add_middleware(PathCleanMiddleware)
@@ -552,15 +582,26 @@ app.add_middleware(
552
582
  allow_credentials=True,
553
583
  allow_methods=['*'],
554
584
  allow_headers=['*'],
555
- # TODO(syang): remove X-Request-ID when v0.10.0 is released.
585
+ # TODO(syang): remove X-Request-ID when v0.10.0 is released.
556
586
  expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
587
+ # The order of all the authentication-related middleware is important.
588
+ # RBACMiddleware must precede all the auth middleware, so it can access
589
+ # request.state.auth_user.
590
+ app.add_middleware(RBACMiddleware)
591
+ # AuthProxyMiddleware should precede BasicAuthMiddleware and
592
+ # BearerTokenMiddleware, since it should be skipped if either of those set the
593
+ # auth user.
594
+ app.add_middleware(AuthProxyMiddleware)
557
595
  enable_basic_auth = os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false')
558
596
  if str(enable_basic_auth).lower() == 'true':
559
597
  app.add_middleware(BasicAuthMiddleware)
560
598
  # Bearer token middleware should always be present to handle service account
561
599
  # authentication
562
600
  app.add_middleware(BearerTokenMiddleware)
563
- app.add_middleware(AuthProxyMiddleware)
601
+ # InitializeRequestAuthUserMiddleware must be the last added middleware so that
602
+ # request.state.auth_user is always set, but can be overridden by the auth
603
+ # middleware above.
604
+ app.add_middleware(InitializeRequestAuthUserMiddleware)
564
605
  app.add_middleware(RequestIDMiddleware)
565
606
  app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
566
607
  app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
@@ -1576,6 +1617,38 @@ async def all_contexts(request: fastapi.Request) -> None:
1576
1617
  )
1577
1618
 
1578
1619
 
1620
@app.get('/gpu-metrics')
async def gpu_metrics() -> fastapi.Response:
    """Gets the GPU metrics from multiple external k8s clusters.

    Metrics are requested concurrently for every context returned by
    ``core.get_all_contexts()`` except ``'in-cluster'``. A context whose
    request fails is logged and left out, so the response contains whatever
    could be collected.

    Returns:
        A plain-text response whose body is the per-context metrics joined
        by blank lines (empty when no context produced metrics).
    """
    # Filter once and keep the filtered list: gather() returns results in the
    # order of the awaitables it was given, so failures have to be reported
    # against this list rather than the unfiltered one.
    external_contexts = [
        context for context in core.get_all_contexts()
        if context != 'in-cluster'
    ]
    tasks = [
        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
        for context in external_contexts
    ]

    results = await asyncio.gather(*tasks, return_exceptions=True)

    all_metrics = []
    for context, result in zip(external_contexts, results):
        # BaseException rather than Exception: a cancelled task is returned
        # as a CancelledError instance, which must not reach the join below.
        if isinstance(result, BaseException):
            logger.error(
                f'Failed to get metrics for context {context}: {result}')
        else:
            all_metrics.append(result)

    combined_metrics = '\n\n'.join(all_metrics)

    # Return as plain text for Prometheus compatibility
    return fastapi.Response(
        content=combined_metrics,
        media_type='text/plain; version=0.0.4; charset=utf-8')
1650
+
1651
+
1579
1652
  # === Internal APIs ===
1580
1653
  @app.get('/api/completion/cluster_name')
1581
1654
  async def complete_cluster_name(incomplete: str,) -> List[str]:
@@ -17,3 +17,4 @@ include sky/utils/kubernetes/*
17
17
  include sky/server/html/*
18
18
  recursive-include sky/dashboard/out *
19
19
  include sky/users/*.conf
20
+ include sky/metrics/*