skypilot-nightly 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250703__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/catalog/__init__.py +1 -1
- sky/client/cli/command.py +60 -21
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{Md3rlE87jmL5uv7gSo8mR → A-fbCEgJE_q2cV8biIOIr}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/9984.b56614f3c4c5961d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1159f362b960e2b8.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-d427db53e54de9ce.js → webpack-9a81ea998672c303.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/server/core.py +13 -10
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +210 -0
- sky/optimizer.py +1 -1
- sky/resources.py +145 -7
- sky/server/common.py +13 -2
- sky/server/server.py +91 -7
- sky/skylet/constants.py +3 -0
- sky/skypilot_config.py +62 -53
- sky/utils/accelerator_registry.py +28 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +9 -4
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/RECORD +41 -39
- sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b8e1114e6d38218c.js +0 -6
- /sky/dashboard/out/_next/static/{Md3rlE87jmL5uv7gSo8mR → A-fbCEgJE_q2cV8biIOIr}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-909d53833da080cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"A-fbCEgJE_q2cV8biIOIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"A-fbCEgJE_q2cV8biIOIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/jobs/server/core.py
CHANGED
@@ -45,18 +45,21 @@ if typing.TYPE_CHECKING:
|
|
45
45
|
logger = sky_logging.init_logger(__name__)
|
46
46
|
|
47
47
|
|
48
|
-
def
|
49
|
-
"""
|
48
|
+
def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
|
49
|
+
"""Upload files to the controller.
|
50
50
|
|
51
|
-
In consolidation mode, we
|
52
|
-
|
51
|
+
In consolidation mode, we still need to upload files to the controller as
|
52
|
+
we should keep a separate workdir for each jobs. Assuming two jobs using
|
53
|
+
the same workdir, if there are some modifications to the workdir after job 1
|
54
|
+
is submitted, on recovery of job 1, the modifications should not be applied.
|
53
55
|
"""
|
54
56
|
local_to_controller_file_mounts: Dict[str, str] = {}
|
55
57
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
58
|
+
# For consolidation mode, we don't need to use cloud storage,
|
59
|
+
# as uploading to the controller is only a local copy.
|
60
|
+
storage_clouds = (
|
61
|
+
storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
|
62
|
+
if not managed_job_utils.is_consolidation_mode() and storage_clouds:
|
60
63
|
for task_ in dag.tasks:
|
61
64
|
controller_utils.maybe_translate_local_file_mounts_and_sync_up(
|
62
65
|
task_, task_type='jobs')
|
@@ -67,7 +70,7 @@ def _maybe_upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
|
|
67
70
|
# directly to the controller, because the controller may not
|
68
71
|
# even be up yet.
|
69
72
|
for task_ in dag.tasks:
|
70
|
-
if task_.storage_mounts:
|
73
|
+
if task_.storage_mounts and not storage_clouds:
|
71
74
|
# Technically, we could convert COPY storage_mounts that
|
72
75
|
# have a local source and do not specify `store`, but we
|
73
76
|
# will not do that for now. Only plain file_mounts are
|
@@ -242,7 +245,7 @@ def launch(
|
|
242
245
|
f'with:\n\n`sky down {cluster_name} --purge`\n\n'
|
243
246
|
f'Reason: {common_utils.format_exception(e)}')
|
244
247
|
|
245
|
-
local_to_controller_file_mounts =
|
248
|
+
local_to_controller_file_mounts = _upload_files_to_controller(dag)
|
246
249
|
|
247
250
|
# Has to use `\` to avoid yapf issue.
|
248
251
|
with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
|
sky/metrics/__init__.py
ADDED
File without changes
|
sky/metrics/utils.py
ADDED
@@ -0,0 +1,210 @@
|
|
1
|
+
"""Utilities for processing GPU metrics from Kubernetes clusters."""
|
2
|
+
import os
|
3
|
+
import re
|
4
|
+
import subprocess
|
5
|
+
import time
|
6
|
+
from typing import List, Optional, Tuple
|
7
|
+
|
8
|
+
import httpx
|
9
|
+
|
10
|
+
|
11
|
+
def start_svc_port_forward(context: str, namespace: str, service: str,
                           service_port: int) -> Tuple[subprocess.Popen, int]:
    """Starts a port forward to a service in a Kubernetes cluster.

    Launches `kubectl port-forward` with an empty local port so that kubectl
    picks a free one, then watches kubectl's output for the
    'Forwarding from 127.0.0.1:<port>' line to learn which port was chosen.

    kubectl's output is read on a daemon thread so that the startup timeout
    is honored even when kubectl prints nothing (e.g. an unreachable API
    server). The thread keeps draining the pipe after this function returns
    and ends when kubectl exits.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to
    Returns:
        Tuple of (subprocess.Popen process, local_port assigned)
    Raises:
        RuntimeError: If kubectl cannot be launched, exits before reporting a
            local port, or does not report one within the startup timeout.
    """
    # Only this helper needs these modules, so import them locally.
    import queue  # pylint: disable=import-outside-toplevel
    import threading  # pylint: disable=import-outside-toplevel

    start_port_forward_timeout = 10  # 10 second timeout
    terminate_port_forward_timeout = 5  # 5 second timeout

    target = (f'service {service} in namespace {namespace} on context '
              f'{context}')

    # Use ':service_port' to let kubectl choose the local port
    cmd = [
        'kubectl', '--context', context, '-n', namespace, 'port-forward',
        f'service/{service}', f':{service_port}'
    ]

    env = os.environ.copy()
    env.setdefault('KUBECONFIG', os.path.expanduser('~/.kube/config'))

    # start the port forward process
    try:
        port_forward_process = subprocess.Popen(cmd,
                                                stdout=subprocess.PIPE,
                                                stderr=subprocess.STDOUT,
                                                text=True,
                                                env=env)
    except OSError as e:
        # e.g. kubectl is not installed or not executable.
        raise RuntimeError(
            f'Failed to launch kubectl port-forward for {target}: {e}') from e

    # Lines read from kubectl; None marks end of output.
    output_queue: 'queue.Queue[Optional[str]]' = queue.Queue()
    stop_buffering = threading.Event()

    def _pump_output() -> None:
        """Forwards kubectl output lines to the queue until EOF."""
        stream = port_forward_process.stdout
        try:
            if stream is not None:
                for out_line in stream:
                    # Once startup is over, keep draining the pipe (so kubectl
                    # never blocks on a full pipe) but stop buffering lines.
                    if not stop_buffering.is_set():
                        output_queue.put(out_line)
        except (OSError, ValueError):
            # The pipe was closed underneath us; treat it as end of output.
            pass
        finally:
            output_queue.put(None)

    threading.Thread(target=_pump_output, daemon=True).start()

    local_port = None
    saw_eof = False
    captured: List[str] = []
    deadline = time.monotonic() + start_port_forward_timeout

    # wait for the port forward to start and extract the local port
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        try:
            line = output_queue.get(timeout=remaining)
        except queue.Empty:
            break
        if line is None:
            # kubectl closed its output, i.e. it exited.
            saw_eof = True
            break
        captured.append(line.rstrip())
        # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
        match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
        if match:
            local_port = int(match.group(1))
            break
    stop_buffering.set()

    if local_port is not None:
        return port_forward_process, local_port

    if saw_eof:
        # Give the exited process a moment to be reaped so that its return
        # code is available for the error message below.
        try:
            port_forward_process.wait(timeout=terminate_port_forward_timeout)
        except subprocess.TimeoutExpired:
            pass

    # Record the exit status *before* we terminate kubectl ourselves, so a
    # signal-induced return code is not mistaken for a kubectl failure.
    returncode = port_forward_process.poll()
    if returncode is None:
        port_forward_process.terminate()
        try:
            port_forward_process.wait(timeout=terminate_port_forward_timeout)
        except subprocess.TimeoutExpired:
            port_forward_process.kill()
            port_forward_process.wait()

    if returncode:
        reason = f'Port forward failed for {target}'
    else:
        reason = f'Failed to extract local port for {target}'
    if captured:
        reason += '. kubectl output:\n' + '\n'.join(captured)
    raise RuntimeError(reason)
|
83
|
+
|
84
|
+
|
85
|
+
def stop_svc_port_forward(port_forward_process: subprocess.Popen,
                          timeout: float = 5) -> None:
    """Stops a port forward to a service in a Kubernetes cluster.

    Asks the process to terminate and waits for it to exit; if it is still
    running after `timeout` seconds it is killed. Calling this on a process
    that has already exited is a no-op.

    Args:
        port_forward_process: The subprocess.Popen process to terminate
        timeout: Seconds to wait after terminate() before falling back to
            kill(). Defaults to 5.
    """
    port_forward_process.terminate()
    try:
        port_forward_process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        # The process ignored the termination request; force it down and
        # reap it so no zombie is left behind.
        port_forward_process.kill()
        port_forward_process.wait()
|
96
|
+
|
97
|
+
|
98
|
+
async def send_metrics_request_with_port_forward(
        context: str,
        namespace: str,
        service: str,
        service_port: int,
        endpoint_path: str = '/federate',
        match_patterns: Optional[List[str]] = None,
        timeout: float = 30.0) -> str:
    """Sends a metrics request to a Prometheus endpoint via port forwarding.

    The port forward is started and stopped in the event loop's default
    executor, because both helpers block (they wait on a kubectl subprocess)
    and would otherwise stall every other coroutine on the loop.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to
        endpoint_path: Path to append to the localhost endpoint (e.g.,
            '/federate')
        match_patterns: List of metric patterns to match (for federate
            endpoint)
        timeout: Request timeout in seconds
    Returns:
        Response text containing the metrics
    Raises:
        RuntimeError: If the port forward fails to start
        httpx.HTTPError: If the HTTP request fails or returns an error status
    """
    # Only this coroutine needs asyncio, so import it locally.
    import asyncio  # pylint: disable=import-outside-toplevel

    loop = asyncio.get_running_loop()

    # Start port forward (blocking, so run it off the event loop).
    start_future = loop.run_in_executor(None, start_svc_port_forward, context,
                                        namespace, service, service_port)
    try:
        # shield() keeps the executor future alive if we are cancelled, so
        # the kubectl process it produces can still be cleaned up.
        port_forward_process, local_port = await asyncio.shield(start_future)
    except asyncio.CancelledError:

        def _stop_late_port_forward(done_future) -> None:
            """Stops a port forward that finished starting after cancel."""
            if done_future.cancelled() or done_future.exception() is not None:
                return
            stop_svc_port_forward(done_future.result()[0])

        start_future.add_done_callback(_stop_late_port_forward)
        raise

    try:
        # Build endpoint URL
        endpoint = f'http://localhost:{local_port}{endpoint_path}'

        # For federate endpoint, add match[] parameters
        params = None
        if match_patterns:
            params = [('match[]', pattern) for pattern in match_patterns]

        # Make HTTP request
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(endpoint, params=params)
            response.raise_for_status()
            return response.text
    finally:
        # Always clean up port forward (blocking, so run it off the loop).
        await loop.run_in_executor(None, stop_svc_port_forward,
                                   port_forward_process)
|
147
|
+
|
148
|
+
|
149
|
+
async def add_cluster_name_label(metrics_text: str, context: str) -> str:
    """Adds a cluster label to each metric line.

    Every sample line gets `cluster="<context>"` as its first label. Samples
    that carry no label set at all (`name value`) get one created. Comment
    lines (starting with '#') and blank lines are passed through unchanged.
    Leading/trailing whitespace of the whole text is stripped.

    Args:
        metrics_text: The text containing the metrics
        context: The cluster name
    Returns:
        The metrics text with the cluster label added.
    """
    # Escape the characters that are special inside a quoted label value so
    # an unusual context name cannot produce malformed output.
    escaped_context = (context.replace('\\', '\\\\').replace(
        '"', '\\"').replace('\n', '\\n'))
    cluster_label = f'cluster="{escaped_context}"'

    labeled_lines = []
    for line in metrics_text.strip().split('\n'):
        # keep comment lines and empty lines as-is
        if line.startswith('#') or not line.strip():
            labeled_lines.append(line)
            continue

        brace_start = line.find('{')
        # The sample value/timestamp after the label set contain no braces,
        # so the last '}' is the one closing the label set.
        brace_end = line.rfind('}')

        if 0 <= brace_start < brace_end:
            # metric line with a label set: prepend the cluster label
            existing_labels = line[brace_start + 1:brace_end]
            separator = ',' if existing_labels else ''
            labeled_lines.append(f'{line[:brace_start + 1]}{cluster_label}'
                                 f'{separator}{line[brace_start + 1:]}')
            continue

        fields = line.split(None, 1)
        if brace_start == -1 and brace_end == -1 and len(fields) == 2:
            # metric line without any labels: create the label set
            metric_name, rest_of_line = fields
            labeled_lines.append(
                f'{metric_name}{{{cluster_label}}} {rest_of_line}')
        else:
            # not recognizable as a sample; keep it as-is
            labeled_lines.append(line)

    return '\n'.join(labeled_lines)
|
183
|
+
|
184
|
+
|
185
|
+
async def get_metrics_for_context(
        context: str,
        namespace: str = 'skypilot',
        service: str = 'skypilot-prometheus-server',
        service_port: int = 80) -> str:
    """Get GPU metrics for a single Kubernetes context.

    Fetches the metrics from the Prometheus '/federate' endpoint through a
    temporary port forward, then tags every metric line with the context as
    its cluster label.

    Args:
        context: Kubernetes context name
        namespace: Namespace of the Prometheus service
        service: Name of the Prometheus service to port forward to
        service_port: Port on the Prometheus service
    Returns:
        metrics_text: String containing the metrics
    Raises:
        Exception: If metrics collection fails for any reason
    """
    # Query both DCGM metrics and kube_pod_labels metrics
    # This ensures the dashboard can perform joins to filter by skypilot cluster
    match_patterns = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']

    metrics_text = await send_metrics_request_with_port_forward(
        context=context,
        namespace=namespace,
        service=service,
        service_port=service_port,
        endpoint_path='/federate',
        match_patterns=match_patterns)

    # add cluster name as a label to each metric line
    return await add_cluster_name_label(metrics_text, context)
|
sky/optimizer.py
CHANGED
@@ -1375,7 +1375,7 @@ def _fill_in_launchable_resources(
|
|
1375
1375
|
num_node_str = ''
|
1376
1376
|
if task.num_nodes > 1:
|
1377
1377
|
num_node_str = f'{task.num_nodes}x '
|
1378
|
-
if not quiet:
|
1378
|
+
if not (quiet or resources.no_missing_accel_warnings):
|
1379
1379
|
logger.info(
|
1380
1380
|
f'No resource satisfying {num_node_str}'
|
1381
1381
|
f'{resources.repr_with_region_zone} on {clouds_str}.')
|
sky/resources.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
"""Resources: compute requirements of Tasks."""
|
2
|
+
import collections
|
2
3
|
import dataclasses
|
3
4
|
import math
|
5
|
+
import re
|
4
6
|
import textwrap
|
5
7
|
import typing
|
6
8
|
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
|
@@ -41,6 +43,20 @@ RESOURCE_CONFIG_ALIASES = {
|
|
41
43
|
'gpus': 'accelerators',
|
42
44
|
}
|
43
45
|
|
46
|
+
MEMORY_SIZE_UNITS = {
|
47
|
+
'b': 1,
|
48
|
+
'k': 2**10,
|
49
|
+
'kb': 2**10,
|
50
|
+
'm': 2**20,
|
51
|
+
'mb': 2**20,
|
52
|
+
'g': 2**30,
|
53
|
+
'gb': 2**30,
|
54
|
+
't': 2**40,
|
55
|
+
'tb': 2**40,
|
56
|
+
'p': 2**50,
|
57
|
+
'pb': 2**50,
|
58
|
+
}
|
59
|
+
|
44
60
|
|
45
61
|
@dataclasses.dataclass
|
46
62
|
class AutostopConfig:
|
@@ -110,7 +126,7 @@ class Resources:
|
|
110
126
|
"""
|
111
127
|
# If any fields changed, increment the version. For backward compatibility,
|
112
128
|
# modify the __setstate__ method to handle the old version.
|
113
|
-
_VERSION =
|
129
|
+
_VERSION = 28
|
114
130
|
|
115
131
|
def __init__(
|
116
132
|
self,
|
@@ -142,6 +158,7 @@ class Resources:
|
|
142
158
|
_is_image_managed: Optional[bool] = None,
|
143
159
|
_requires_fuse: Optional[bool] = None,
|
144
160
|
_cluster_config_overrides: Optional[Dict[str, Any]] = None,
|
161
|
+
_no_missing_accel_warnings: Optional[bool] = None,
|
145
162
|
):
|
146
163
|
"""Initialize a Resources object.
|
147
164
|
|
@@ -366,6 +383,7 @@ class Resources:
|
|
366
383
|
|
367
384
|
self._cluster_config_overrides = _cluster_config_overrides
|
368
385
|
self._cached_repr: Optional[str] = None
|
386
|
+
self._no_missing_accel_warnings = _no_missing_accel_warnings
|
369
387
|
|
370
388
|
# Initialize _priority before calling the setter
|
371
389
|
self._priority: Optional[int] = None
|
@@ -649,6 +667,13 @@ class Resources:
|
|
649
667
|
return False
|
650
668
|
return self._requires_fuse
|
651
669
|
|
670
|
+
@property
def no_missing_accel_warnings(self) -> bool:
    """Returns whether missing-accelerator warnings are suppressed.

    False when the underlying flag was never set (None).
    """
    # bool() maps the unset state (None) to False and guarantees the
    # declared return type.
    return bool(self._no_missing_accel_warnings)
|
676
|
+
|
652
677
|
def set_requires_fuse(self, value: bool) -> None:
|
653
678
|
"""Sets whether this resource requires FUSE mounting support.
|
654
679
|
|
@@ -754,6 +779,8 @@ class Resources:
|
|
754
779
|
if ':' not in accelerators:
|
755
780
|
accelerators = {accelerators: 1}
|
756
781
|
else:
|
782
|
+
assert isinstance(accelerators,
|
783
|
+
str), (type(accelerators), accelerators)
|
757
784
|
splits = accelerators.split(':')
|
758
785
|
parse_error = ('The "accelerators" field as a str '
|
759
786
|
'should be <name> or <name>:<cnt>. '
|
@@ -1778,6 +1805,8 @@ class Resources:
|
|
1778
1805
|
self._is_image_managed),
|
1779
1806
|
_requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
|
1780
1807
|
_cluster_config_overrides=override_configs,
|
1808
|
+
_no_missing_accel_warnings=override.pop(
|
1809
|
+
'no_missing_accel_warnings', self._no_missing_accel_warnings),
|
1781
1810
|
)
|
1782
1811
|
assert not override
|
1783
1812
|
return resources
|
@@ -1843,10 +1872,75 @@ class Resources:
|
|
1843
1872
|
config[canonical] = config[alias]
|
1844
1873
|
del config[alias]
|
1845
1874
|
|
1875
|
+
@classmethod
def _parse_accelerators_from_str(
        cls, accelerators: str) -> List[Tuple[str, bool]]:
    """Parse accelerators string into a list of possible accelerators.

    A memory size is a number followed by MB/GB/TB (any case) and an
    optional trailing '+'. Recognized memory-based forms are:
        '<memory>', '<memory>:<count>', '<manufacturer>:<memory>' and
        '<manufacturer>:<memory>:<count>'.
    Anything else is treated as a plain accelerator name (optionally with a
    count) and returned unchanged.

    Args:
        accelerators: The accelerator spec string to parse.

    Returns:
        A list of possible accelerators. Each element is a tuple of
        (accelerator_name, was_user_specified). was_user_specified is True
        if the accelerator was directly named by the user (for example
        "H100:2" would be True, but "80GB+" would be False since it doesn't
        mention the name of the accelerator).

    Raises:
        ValueError: If a three-part spec has a non-integer count.
        AssertionError: If a three-part spec's middle part is not a memory
            size.
    """
    # sanity check
    assert isinstance(accelerators, str), accelerators

    # Compiled once here instead of repeating the pattern in every branch.
    memory_pattern = re.compile(r'^[0-9]+[GgMmTt][Bb]\+?$')

    manufacturer = None
    memory = None
    count = 1

    split = accelerators.split(':')
    if len(split) == 3:
        manufacturer, memory, count_str = split
        count = int(count_str)
        assert memory_pattern.match(memory), (
            'If specifying a GPU manufacturer, you must also '
            'specify the memory size')
    elif len(split) == 2 and memory_pattern.match(split[0]):
        memory = split[0]
        count = int(split[1])
    elif len(split) == 2 and memory_pattern.match(split[1]):
        manufacturer, memory = split
    elif len(split) == 1 and memory_pattern.match(split[0]):
        memory = split[0]
    else:
        # it is just an accelerator name, not a memory size
        return [(accelerators, True)]

    # we know we have some case of manufacturer, memory, count, now we
    # need to convert that to a list of possible accelerators
    # NOTE(review): parse_memory_resource presumably normalizes the size to
    # a GB figure, keeping a trailing '+' when allow_plus is set - confirm.
    memory_parsed = resources_utils.parse_memory_resource(memory,
                                                          'accelerators',
                                                          allow_plus=True)
    plus = memory_parsed[-1] == '+'
    if plus:
        memory_parsed = memory_parsed[:-1]
    memory_gb = int(memory_parsed)

    # These devices were derived from the memory requirement rather than
    # named by the user, hence was_user_specified=False.
    matching_accelerators = [
        (f'{device}:{count}', False)
        for device in accelerator_registry.get_devices_by_memory(
            memory_gb, plus, manufacturer=manufacturer)
    ]

    return matching_accelerators
|
1929
|
+
|
1846
1930
|
@classmethod
|
1847
1931
|
def from_yaml_config(
|
1848
1932
|
cls, config: Optional[Dict[str, Any]]
|
1849
1933
|
) -> Union[Set['Resources'], List['Resources']]:
|
1934
|
+
"""Creates Resources objects from a YAML config.
|
1935
|
+
|
1936
|
+
Args:
|
1937
|
+
config: A dict of resource config.
|
1938
|
+
|
1939
|
+
Returns:
|
1940
|
+
A set of Resources objects if any_of is specified, otherwise a list
|
1941
|
+
of Resources objects if ordered is specified, otherwise a set with
|
1942
|
+
a single Resources object.
|
1943
|
+
"""
|
1850
1944
|
if config is None:
|
1851
1945
|
return {Resources()}
|
1852
1946
|
|
@@ -1903,13 +1997,48 @@ class Resources:
|
|
1903
1997
|
accelerators = config.get('accelerators')
|
1904
1998
|
if config and accelerators is not None:
|
1905
1999
|
if isinstance(accelerators, str):
|
1906
|
-
|
2000
|
+
accelerators_list = cls._parse_accelerators_from_str(
|
2001
|
+
accelerators)
|
1907
2002
|
elif isinstance(accelerators, dict):
|
1908
|
-
|
2003
|
+
accelerator_names = [
|
1909
2004
|
f'{k}:{v}' if v is not None else f'{k}'
|
1910
2005
|
for k, v in accelerators.items()
|
1911
2006
|
]
|
1912
|
-
|
2007
|
+
accelerators_list = []
|
2008
|
+
for accel_name in accelerator_names:
|
2009
|
+
parsed_accels = cls._parse_accelerators_from_str(accel_name)
|
2010
|
+
accelerators_list.extend(parsed_accels)
|
2011
|
+
elif isinstance(accelerators, list) or isinstance(
|
2012
|
+
accelerators, set):
|
2013
|
+
accelerators_list = []
|
2014
|
+
for accel_name in accelerators:
|
2015
|
+
parsed_accels = cls._parse_accelerators_from_str(accel_name)
|
2016
|
+
accelerators_list.extend(parsed_accels)
|
2017
|
+
else:
|
2018
|
+
assert False, ('Invalid accelerators type:'
|
2019
|
+
f'{type(accelerators)}')
|
2020
|
+
# now that accelerators is a list, we need to decide which to
|
2021
|
+
# include in the final set, however, there may be multiple copies
|
2022
|
+
# of the same accelerator, some given by name by the user and the
|
2023
|
+
# other copy being given by memory size. In this case, we only care
|
2024
|
+
# about the user specified ones (so we can give a warning if it
|
2025
|
+
# doesn't exist).
|
2026
|
+
accel_to_user_specified: Dict[str, bool] = collections.OrderedDict()
|
2027
|
+
for accel, user_specified in accelerators_list:
|
2028
|
+
# If this accelerator is not in dict yet, or if current one is
|
2029
|
+
# user specified and existing one is not, update the entry
|
2030
|
+
accel_to_user_specified[accel] = (user_specified or
|
2031
|
+
accel_to_user_specified.get(
|
2032
|
+
accel, False))
|
2033
|
+
|
2034
|
+
# only time we care about ordered is when we are given a list,
|
2035
|
+
# otherwise we default to a set
|
2036
|
+
accelerators_type = list if isinstance(accelerators, list) else set
|
2037
|
+
accelerators = accelerators_type([
|
2038
|
+
(accel, user_specified)
|
2039
|
+
for accel, user_specified in accel_to_user_specified.items()
|
2040
|
+
])
|
2041
|
+
|
1913
2042
|
if len(accelerators) > 1 and ordered_configs:
|
1914
2043
|
with ux_utils.print_exception_no_traceback():
|
1915
2044
|
raise ValueError(
|
@@ -1935,20 +2064,20 @@ class Resources:
|
|
1935
2064
|
# In Task, we store a list of resources, each with 1 accelerator.
|
1936
2065
|
# This for loop is for format conversion.
|
1937
2066
|
tmp_resources_list = []
|
1938
|
-
for acc in accelerators:
|
2067
|
+
for acc, user_specified in accelerators:
|
1939
2068
|
tmp_resource = config.copy()
|
1940
2069
|
tmp_resource['accelerators'] = acc
|
2070
|
+
if not user_specified:
|
2071
|
+
tmp_resource['_no_missing_accel_warnings'] = True
|
1941
2072
|
tmp_resources_list.append(
|
1942
2073
|
Resources._from_yaml_config_single(tmp_resource))
|
1943
2074
|
|
1944
2075
|
assert isinstance(accelerators, (list, set)), accelerators
|
1945
2076
|
return type(accelerators)(tmp_resources_list)
|
1946
|
-
|
1947
2077
|
return {Resources._from_yaml_config_single(config)}
|
1948
2078
|
|
1949
2079
|
@classmethod
|
1950
2080
|
def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
|
1951
|
-
|
1952
2081
|
resources_fields: Dict[str, Any] = {}
|
1953
2082
|
|
1954
2083
|
# Extract infra field if present
|
@@ -2010,6 +2139,8 @@ class Resources:
|
|
2010
2139
|
# although it will end up being an int, we don't know at this point
|
2011
2140
|
# if it has units or not, so we store it as a string
|
2012
2141
|
resources_fields['disk_size'] = str(resources_fields['disk_size'])
|
2142
|
+
resources_fields['_no_missing_accel_warnings'] = config.pop(
|
2143
|
+
'_no_missing_accel_warnings', None)
|
2013
2144
|
|
2014
2145
|
assert not config, f'Invalid resource args: {config.keys()}'
|
2015
2146
|
return Resources(**resources_fields)
|
@@ -2060,6 +2191,9 @@ class Resources:
|
|
2060
2191
|
config['volumes'] = volumes
|
2061
2192
|
if self._autostop_config is not None:
|
2062
2193
|
config['autostop'] = self._autostop_config.to_yaml_config()
|
2194
|
+
|
2195
|
+
add_if_not_none('_no_missing_accel_warnings',
|
2196
|
+
self._no_missing_accel_warnings)
|
2063
2197
|
add_if_not_none('priority', self.priority)
|
2064
2198
|
if self._docker_login_config is not None:
|
2065
2199
|
config['_docker_login_config'] = dataclasses.asdict(
|
@@ -2232,6 +2366,10 @@ class Resources:
|
|
2232
2366
|
if version < 27:
|
2233
2367
|
self._priority = None
|
2234
2368
|
|
2369
|
+
if version < 28:
|
2370
|
+
self._no_missing_accel_warnings = state.get(
|
2371
|
+
'_no_missing_accel_warnings', None)
|
2372
|
+
|
2235
2373
|
self.__dict__.update(state)
|
2236
2374
|
|
2237
2375
|
|
sky/server/common.py
CHANGED
@@ -165,14 +165,25 @@ def set_api_cookie_jar(cookie_jar: CookieJar,
|
|
165
165
|
if not cookie_path.parent.exists():
|
166
166
|
cookie_path.parent.mkdir(parents=True, exist_ok=True)
|
167
167
|
|
168
|
-
|
168
|
+
# Writing directly to the cookie jar path can race with other processes that
|
169
|
+
# are reading the cookie jar, making it look malformed. Instead, write to a
|
170
|
+
# temporary file and then move it to the final location.
|
171
|
+
# Avoid hardcoding the tmp file path, since it could cause a race with other
|
172
|
+
# processes that are also writing to the tmp file.
|
173
|
+
with tempfile.NamedTemporaryFile(dir=cookie_path.parent,
|
174
|
+
delete=False) as tmp_file:
|
175
|
+
tmp_cookie_path = tmp_file.name
|
176
|
+
file_cookie_jar = MozillaCookieJar(tmp_cookie_path)
|
169
177
|
if cookie_path.exists():
|
170
|
-
file_cookie_jar.load()
|
178
|
+
file_cookie_jar.load(str(cookie_path))
|
171
179
|
|
172
180
|
for cookie in cookie_jar:
|
173
181
|
file_cookie_jar.set_cookie(cookie)
|
174
182
|
file_cookie_jar.save()
|
175
183
|
|
184
|
+
# Move the temporary file to the final location.
|
185
|
+
os.replace(tmp_cookie_path, cookie_path)
|
186
|
+
|
176
187
|
|
177
188
|
def get_cookies_from_response(
|
178
189
|
response: 'requests.Response') -> requests.cookies.RequestsCookieJar:
|