skypilot-nightly 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250703__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. sky/__init__.py +2 -2
  2. sky/catalog/__init__.py +1 -1
  3. sky/client/cli/command.py +60 -21
  4. sky/dashboard/out/404.html +1 -1
  5. sky/dashboard/out/_next/static/{Md3rlE87jmL5uv7gSo8mR → A-fbCEgJE_q2cV8biIOIr}/_buildManifest.js +1 -1
  6. sky/dashboard/out/_next/static/chunks/9984.b56614f3c4c5961d.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1159f362b960e2b8.js +6 -0
  8. sky/dashboard/out/_next/static/chunks/{webpack-d427db53e54de9ce.js → webpack-9a81ea998672c303.js} +1 -1
  9. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  10. sky/dashboard/out/clusters/[cluster].html +1 -1
  11. sky/dashboard/out/clusters.html +1 -1
  12. sky/dashboard/out/config.html +1 -1
  13. sky/dashboard/out/index.html +1 -1
  14. sky/dashboard/out/infra/[context].html +1 -1
  15. sky/dashboard/out/infra.html +1 -1
  16. sky/dashboard/out/jobs/[job].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/dashboard/out/users.html +1 -1
  19. sky/dashboard/out/volumes.html +1 -1
  20. sky/dashboard/out/workspace/new.html +1 -1
  21. sky/dashboard/out/workspaces/[name].html +1 -1
  22. sky/dashboard/out/workspaces.html +1 -1
  23. sky/jobs/server/core.py +13 -10
  24. sky/metrics/__init__.py +0 -0
  25. sky/metrics/utils.py +210 -0
  26. sky/optimizer.py +1 -1
  27. sky/resources.py +145 -7
  28. sky/server/common.py +13 -2
  29. sky/server/server.py +91 -7
  30. sky/skylet/constants.py +3 -0
  31. sky/skypilot_config.py +62 -53
  32. sky/utils/accelerator_registry.py +28 -1
  33. sky/utils/schemas.py +3 -0
  34. sky/utils/ux_utils.py +9 -4
  35. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/METADATA +1 -1
  36. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/RECORD +41 -39
  37. sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +0 -1
  38. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b8e1114e6d38218c.js +0 -6
  39. /sky/dashboard/out/_next/static/{Md3rlE87jmL5uv7gSo8mR → A-fbCEgJE_q2cV8biIOIr}/_ssgManifest.js +0 -0
  40. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/WHEEL +0 -0
  41. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/entry_points.txt +0 -0
  42. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/licenses/LICENSE +0 -0
  43. {skypilot_nightly-1.0.0.dev20250701.dist-info → skypilot_nightly-1.0.0.dev20250703.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-d427db53e54de9ce.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-909d53833da080cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"Md3rlE87jmL5uv7gSo8mR","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-909d53833da080cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"A-fbCEgJE_q2cV8biIOIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-d427db53e54de9ce.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"Md3rlE87jmL5uv7gSo8mR","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"A-fbCEgJE_q2cV8biIOIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/jobs/server/core.py CHANGED
@@ -45,18 +45,21 @@ if typing.TYPE_CHECKING:
45
45
  logger = sky_logging.init_logger(__name__)
46
46
 
47
47
 
48
- def _maybe_upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
49
- """Maybe upload files to the controller.
48
+ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
49
+ """Upload files to the controller.
50
50
 
51
- In consolidation mode, we don't need to upload files to the controller as
52
- the API server and the controller are colocated.
51
+ In consolidation mode, we still need to upload files to the controller as
52
+ we should keep a separate workdir for each jobs. Assuming two jobs using
53
+ the same workdir, if there are some modifications to the workdir after job 1
54
+ is submitted, on recovery of job 1, the modifications should not be applied.
53
55
  """
54
56
  local_to_controller_file_mounts: Dict[str, str] = {}
55
57
 
56
- if managed_job_utils.is_consolidation_mode():
57
- return local_to_controller_file_mounts
58
-
59
- if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
58
+ # For consolidation mode, we don't need to use cloud storage,
59
+ # as uploading to the controller is only a local copy.
60
+ storage_clouds = (
61
+ storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
62
+ if not managed_job_utils.is_consolidation_mode() and storage_clouds:
60
63
  for task_ in dag.tasks:
61
64
  controller_utils.maybe_translate_local_file_mounts_and_sync_up(
62
65
  task_, task_type='jobs')
@@ -67,7 +70,7 @@ def _maybe_upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
67
70
  # directly to the controller, because the controller may not
68
71
  # even be up yet.
69
72
  for task_ in dag.tasks:
70
- if task_.storage_mounts:
73
+ if task_.storage_mounts and not storage_clouds:
71
74
  # Technically, we could convert COPY storage_mounts that
72
75
  # have a local source and do not specify `store`, but we
73
76
  # will not do that for now. Only plain file_mounts are
@@ -242,7 +245,7 @@ def launch(
242
245
  f'with:\n\n`sky down {cluster_name} --purge`\n\n'
243
246
  f'Reason: {common_utils.format_exception(e)}')
244
247
 
245
- local_to_controller_file_mounts = _maybe_upload_files_to_controller(dag)
248
+ local_to_controller_file_mounts = _upload_files_to_controller(dag)
246
249
 
247
250
  # Has to use `\` to avoid yapf issue.
248
251
  with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
File without changes
sky/metrics/utils.py ADDED
@@ -0,0 +1,210 @@
1
+ """Utilities for processing GPU metrics from Kubernetes clusters."""
2
+ import os
3
+ import re
4
+ import subprocess
5
+ import time
6
+ from typing import List, Optional, Tuple
7
+
8
+ import httpx
9
+
10
+
11
+ def start_svc_port_forward(context: str, namespace: str, service: str,
12
+ service_port: int) -> Tuple[subprocess.Popen, int]:
13
+ """Starts a port forward to a service in a Kubernetes cluster.
14
+ Args:
15
+ context: Kubernetes context name
16
+ namespace: Namespace where the service is located
17
+ service: Service name to port forward to
18
+ service_port: Port on the service to forward to
19
+ Returns:
20
+ Tuple of (subprocess.Popen process, local_port assigned)
21
+ Raises:
22
+ RuntimeError: If port forward fails to start
23
+ """
24
+ start_port_forward_timeout = 10 # 10 second timeout
25
+ terminate_port_forward_timeout = 5 # 5 second timeout
26
+
27
+ # Use ':service_port' to let kubectl choose the local port
28
+ cmd = [
29
+ 'kubectl', '--context', context, '-n', namespace, 'port-forward',
30
+ f'service/{service}', f':{service_port}'
31
+ ]
32
+
33
+ env = os.environ.copy()
34
+ if 'KUBECONFIG' not in env:
35
+ env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')
36
+
37
+ # start the port forward process
38
+ port_forward_process = subprocess.Popen(cmd,
39
+ stdout=subprocess.PIPE,
40
+ stderr=subprocess.STDOUT,
41
+ text=True,
42
+ env=env)
43
+
44
+ local_port = None
45
+ start_time = time.time()
46
+
47
+ # wait for the port forward to start and extract the local port
48
+ while time.time() - start_time < start_port_forward_timeout:
49
+ if port_forward_process.poll() is not None:
50
+ # port forward process has terminated
51
+ if port_forward_process.returncode != 0:
52
+ raise RuntimeError(
53
+ f'Port forward failed for service {service} in namespace '
54
+ f'{namespace} on context {context}')
55
+ break
56
+
57
+ # read output line by line to find the local port
58
+ if port_forward_process.stdout:
59
+ line = port_forward_process.stdout.readline()
60
+ if line:
61
+ # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
62
+ match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
63
+ if match:
64
+ local_port = int(match.group(1))
65
+ break
66
+
67
+ # sleep for 100ms to avoid busy-waiting
68
+ time.sleep(0.1)
69
+
70
+ if local_port is None:
71
+ try:
72
+ port_forward_process.terminate()
73
+ port_forward_process.wait(timeout=terminate_port_forward_timeout)
74
+ except subprocess.TimeoutExpired:
75
+ port_forward_process.kill()
76
+ port_forward_process.wait()
77
+ finally:
78
+ raise RuntimeError(
79
+ f'Failed to extract local port for service {service} in '
80
+ f'namespace {namespace} on context {context}')
81
+
82
+ return port_forward_process, local_port
83
+
84
+
85
+ def stop_svc_port_forward(port_forward_process: subprocess.Popen) -> None:
86
+ """Stops a port forward to a service in a Kubernetes cluster.
87
+ Args:
88
+ port_forward_process: The subprocess.Popen process to terminate
89
+ """
90
+ try:
91
+ port_forward_process.terminate()
92
+ port_forward_process.wait(timeout=5)
93
+ except subprocess.TimeoutExpired:
94
+ port_forward_process.kill()
95
+ port_forward_process.wait()
96
+
97
+
98
+ async def send_metrics_request_with_port_forward(
99
+ context: str,
100
+ namespace: str,
101
+ service: str,
102
+ service_port: int,
103
+ endpoint_path: str = '/federate',
104
+ match_patterns: Optional[List[str]] = None,
105
+ timeout: float = 30.0) -> str:
106
+ """Sends a metrics request to a Prometheus endpoint via port forwarding.
107
+ Args:
108
+ context: Kubernetes context name
109
+ namespace: Namespace where the service is located
110
+ service: Service name to port forward to
111
+ service_port: Port on the service to forward to
112
+ endpoint_path: Path to append to the localhost endpoint (e.g.,
113
+ '/federate')
114
+ match_patterns: List of metric patterns to match (for federate
115
+ endpoint)
116
+ timeout: Request timeout in seconds
117
+ Returns:
118
+ Response text containing the metrics
119
+ Raises:
120
+ RuntimeError: If port forward or HTTP request fails
121
+ """
122
+ port_forward_process = None
123
+ try:
124
+ # Start port forward
125
+ port_forward_process, local_port = start_svc_port_forward(
126
+ context, namespace, service, service_port)
127
+
128
+ # Build endpoint URL
129
+ endpoint = f'http://localhost:{local_port}{endpoint_path}'
130
+
131
+ # Make HTTP request
132
+ async with httpx.AsyncClient(timeout=timeout) as client:
133
+ if match_patterns:
134
+ # For federate endpoint, add match[] parameters
135
+ params = [('match[]', pattern) for pattern in match_patterns]
136
+ response = await client.get(endpoint, params=params)
137
+ else:
138
+ response = await client.get(endpoint)
139
+
140
+ response.raise_for_status()
141
+ return response.text
142
+
143
+ finally:
144
+ # Always clean up port forward
145
+ if port_forward_process:
146
+ stop_svc_port_forward(port_forward_process)
147
+
148
+
149
+ async def add_cluster_name_label(metrics_text: str, context: str) -> str:
150
+ """Adds a cluster_name label to each metric line.
151
+ Args:
152
+ metrics_text: The text containing the metrics
153
+ context: The cluster name
154
+ """
155
+ lines = metrics_text.strip().split('\n')
156
+ modified_lines = []
157
+
158
+ for line in lines:
159
+ # keep comment lines and empty lines as-is
160
+ if line.startswith('#') or not line.strip():
161
+ modified_lines.append(line)
162
+ continue
163
+ # if line is a metric line with labels, add cluster label
164
+ brace_start = line.find('{')
165
+ brace_end = line.find('}')
166
+ if brace_start != -1 and brace_end != -1:
167
+ metric_name = line[:brace_start]
168
+ existing_labels = line[brace_start + 1:brace_end]
169
+ rest_of_line = line[brace_end + 1:]
170
+
171
+ if existing_labels:
172
+ new_labels = f'cluster="{context}",{existing_labels}'
173
+ else:
174
+ new_labels = f'cluster="{context}"'
175
+
176
+ modified_line = f'{metric_name}{{{new_labels}}}{rest_of_line}'
177
+ modified_lines.append(modified_line)
178
+ else:
179
+ # keep other lines as-is
180
+ modified_lines.append(line)
181
+
182
+ return '\n'.join(modified_lines)
183
+
184
+
185
+ async def get_metrics_for_context(context: str) -> str:
186
+ """Get GPU metrics for a single Kubernetes context.
187
+ Args:
188
+ context: Kubernetes context name
189
+ Returns:
190
+ metrics_text: String containing the metrics
191
+ Raises:
192
+ Exception: If metrics collection fails for any reason
193
+ """
194
+ # Query both DCGM metrics and kube_pod_labels metrics
195
+ # This ensures the dashboard can perform joins to filter by skypilot cluster
196
+ match_patterns = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']
197
+
198
+ # TODO(rohan): don't hardcode the namespace and service name
199
+ metrics_text = await send_metrics_request_with_port_forward(
200
+ context=context,
201
+ namespace='skypilot',
202
+ service='skypilot-prometheus-server',
203
+ service_port=80,
204
+ endpoint_path='/federate',
205
+ match_patterns=match_patterns)
206
+
207
+ # add cluster name as a label to each metric line
208
+ metrics_text = await add_cluster_name_label(metrics_text, context)
209
+
210
+ return metrics_text
sky/optimizer.py CHANGED
@@ -1375,7 +1375,7 @@ def _fill_in_launchable_resources(
1375
1375
  num_node_str = ''
1376
1376
  if task.num_nodes > 1:
1377
1377
  num_node_str = f'{task.num_nodes}x '
1378
- if not quiet:
1378
+ if not (quiet or resources.no_missing_accel_warnings):
1379
1379
  logger.info(
1380
1380
  f'No resource satisfying {num_node_str}'
1381
1381
  f'{resources.repr_with_region_zone} on {clouds_str}.')
sky/resources.py CHANGED
@@ -1,6 +1,8 @@
1
1
  """Resources: compute requirements of Tasks."""
2
+ import collections
2
3
  import dataclasses
3
4
  import math
5
+ import re
4
6
  import textwrap
5
7
  import typing
6
8
  from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
@@ -41,6 +43,20 @@ RESOURCE_CONFIG_ALIASES = {
41
43
  'gpus': 'accelerators',
42
44
  }
43
45
 
46
+ MEMORY_SIZE_UNITS = {
47
+ 'b': 1,
48
+ 'k': 2**10,
49
+ 'kb': 2**10,
50
+ 'm': 2**20,
51
+ 'mb': 2**20,
52
+ 'g': 2**30,
53
+ 'gb': 2**30,
54
+ 't': 2**40,
55
+ 'tb': 2**40,
56
+ 'p': 2**50,
57
+ 'pb': 2**50,
58
+ }
59
+
44
60
 
45
61
  @dataclasses.dataclass
46
62
  class AutostopConfig:
@@ -110,7 +126,7 @@ class Resources:
110
126
  """
111
127
  # If any fields changed, increment the version. For backward compatibility,
112
128
  # modify the __setstate__ method to handle the old version.
113
- _VERSION = 27
129
+ _VERSION = 28
114
130
 
115
131
  def __init__(
116
132
  self,
@@ -142,6 +158,7 @@ class Resources:
142
158
  _is_image_managed: Optional[bool] = None,
143
159
  _requires_fuse: Optional[bool] = None,
144
160
  _cluster_config_overrides: Optional[Dict[str, Any]] = None,
161
+ _no_missing_accel_warnings: Optional[bool] = None,
145
162
  ):
146
163
  """Initialize a Resources object.
147
164
 
@@ -366,6 +383,7 @@ class Resources:
366
383
 
367
384
  self._cluster_config_overrides = _cluster_config_overrides
368
385
  self._cached_repr: Optional[str] = None
386
+ self._no_missing_accel_warnings = _no_missing_accel_warnings
369
387
 
370
388
  # Initialize _priority before calling the setter
371
389
  self._priority: Optional[int] = None
@@ -649,6 +667,13 @@ class Resources:
649
667
  return False
650
668
  return self._requires_fuse
651
669
 
670
@property
def no_missing_accel_warnings(self) -> bool:
    """Whether missing-accelerator warnings are suppressed for this resource.

    An unset (None) flag is reported as False.
    """
    flag = self._no_missing_accel_warnings
    return False if flag is None else flag
676
+
652
677
  def set_requires_fuse(self, value: bool) -> None:
653
678
  """Sets whether this resource requires FUSE mounting support.
654
679
 
@@ -754,6 +779,8 @@ class Resources:
754
779
  if ':' not in accelerators:
755
780
  accelerators = {accelerators: 1}
756
781
  else:
782
+ assert isinstance(accelerators,
783
+ str), (type(accelerators), accelerators)
757
784
  splits = accelerators.split(':')
758
785
  parse_error = ('The "accelerators" field as a str '
759
786
  'should be <name> or <name>:<cnt>. '
@@ -1778,6 +1805,8 @@ class Resources:
1778
1805
  self._is_image_managed),
1779
1806
  _requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
1780
1807
  _cluster_config_overrides=override_configs,
1808
+ _no_missing_accel_warnings=override.pop(
1809
+ 'no_missing_accel_warnings', self._no_missing_accel_warnings),
1781
1810
  )
1782
1811
  assert not override
1783
1812
  return resources
@@ -1843,10 +1872,75 @@ class Resources:
1843
1872
  config[canonical] = config[alias]
1844
1873
  del config[alias]
1845
1874
 
1875
@classmethod
def _parse_accelerators_from_str(
        cls, accelerators: str) -> List[Tuple[str, bool]]:
    """Parse accelerators string into a list of possible accelerators.

    Accepted memory-based forms are ``<manufacturer>:<memory>:<count>``,
    ``<memory>:<count>``, ``<manufacturer>:<memory>`` and ``<memory>``, where
    ``<memory>`` looks like ``80GB`` or ``80GB+``. Any other string is
    treated as a plain accelerator name and returned unchanged.

    Args:
        accelerators: The accelerator spec as written by the user.

    Returns:
        A list of possible accelerators. Each element is a tuple of
        (accelerator_name, was_user_specified). was_user_specified is True
        if the accelerator was directly named by the user (for example
        "H100:2" would be True, but "80GB+" would be False since it doesn't
        mention the name of the accelerator).

    Raises:
        ValueError: If a three-part spec does not have a memory size as its
            middle part, or if a count is not an integer.
    """
    # sanity check
    assert isinstance(accelerators, str), accelerators

    # A memory size such as '16GB', '512mb' or '80GB+'.
    memory_pattern = re.compile(r'^[0-9]+[GgMmTt][Bb]\+?$')

    manufacturer = None
    memory = None
    count = 1

    split = accelerators.split(':')
    if len(split) == 3:
        manufacturer, memory, count_str = split
        count = int(count_str)
        if not memory_pattern.match(memory):
            # Raised explicitly (not asserted) so the check survives
            # ``python -O`` and reads as a user error.
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    'If specifying a GPU manufacturer, you must also '
                    'specify the memory size')
    elif len(split) == 2 and memory_pattern.match(split[0]):
        memory = split[0]
        count = int(split[1])
    elif len(split) == 2 and memory_pattern.match(split[1]):
        manufacturer, memory = split
    elif len(split) == 1 and memory_pattern.match(split[0]):
        memory = split[0]
    else:
        # it is just an accelerator name, not a memory size
        return [(accelerators, True)]

    # we know we have some case of manufacturer, memory, count, now we
    # need to convert that to a list of possible accelerators
    memory_parsed = resources_utils.parse_memory_resource(memory,
                                                          'accelerators',
                                                          allow_plus=True)
    # A trailing '+' means "at least this much memory".
    plus = memory_parsed.endswith('+')
    if plus:
        memory_parsed = memory_parsed[:-1]
    memory_gb = int(memory_parsed)

    # These candidates were derived from a memory size, not named by the
    # user, hence was_user_specified=False.
    candidates = [
        (f'{device}:{count}', False)
        for device in accelerator_registry.get_devices_by_memory(
            memory_gb, plus, manufacturer=manufacturer)
    ]

    return candidates
1929
+
1846
1930
  @classmethod
1847
1931
  def from_yaml_config(
1848
1932
  cls, config: Optional[Dict[str, Any]]
1849
1933
  ) -> Union[Set['Resources'], List['Resources']]:
1934
+ """Creates Resources objects from a YAML config.
1935
+
1936
+ Args:
1937
+ config: A dict of resource config.
1938
+
1939
+ Returns:
1940
+ A set of Resources objects if any_of is specified, otherwise a list
1941
+ of Resources objects if ordered is specified, otherwise a set with
1942
+ a single Resources object.
1943
+ """
1850
1944
  if config is None:
1851
1945
  return {Resources()}
1852
1946
 
@@ -1903,13 +1997,48 @@ class Resources:
1903
1997
  accelerators = config.get('accelerators')
1904
1998
  if config and accelerators is not None:
1905
1999
  if isinstance(accelerators, str):
1906
- accelerators = {accelerators}
2000
+ accelerators_list = cls._parse_accelerators_from_str(
2001
+ accelerators)
1907
2002
  elif isinstance(accelerators, dict):
1908
- accelerators = [
2003
+ accelerator_names = [
1909
2004
  f'{k}:{v}' if v is not None else f'{k}'
1910
2005
  for k, v in accelerators.items()
1911
2006
  ]
1912
- accelerators = set(accelerators)
2007
+ accelerators_list = []
2008
+ for accel_name in accelerator_names:
2009
+ parsed_accels = cls._parse_accelerators_from_str(accel_name)
2010
+ accelerators_list.extend(parsed_accels)
2011
+ elif isinstance(accelerators, list) or isinstance(
2012
+ accelerators, set):
2013
+ accelerators_list = []
2014
+ for accel_name in accelerators:
2015
+ parsed_accels = cls._parse_accelerators_from_str(accel_name)
2016
+ accelerators_list.extend(parsed_accels)
2017
+ else:
2018
+ assert False, ('Invalid accelerators type:'
2019
+ f'{type(accelerators)}')
2020
+ # now that accelerators is a list, we need to decide which to
2021
+ # include in the final set, however, there may be multiple copies
2022
+ # of the same accelerator, some given by name by the user and the
2023
+ # other copy being given by memory size. In this case, we only care
2024
+ # about the user specified ones (so we can give a warning if it
2025
+ # doesn't exist).
2026
+ accel_to_user_specified: Dict[str, bool] = collections.OrderedDict()
2027
+ for accel, user_specified in accelerators_list:
2028
+ # If this accelerator is not in dict yet, or if current one is
2029
+ # user specified and existing one is not, update the entry
2030
+ accel_to_user_specified[accel] = (user_specified or
2031
+ accel_to_user_specified.get(
2032
+ accel, False))
2033
+
2034
+ # only time we care about ordered is when we are given a list,
2035
+ # otherwise we default to a set
2036
+ accelerators_type = list if isinstance(accelerators, list) else set
2037
+ accelerators = accelerators_type([
2038
+ (accel, user_specified)
2039
+ for accel, user_specified in accel_to_user_specified.items()
2040
+ ])
2041
+
1913
2042
  if len(accelerators) > 1 and ordered_configs:
1914
2043
  with ux_utils.print_exception_no_traceback():
1915
2044
  raise ValueError(
@@ -1935,20 +2064,20 @@ class Resources:
1935
2064
  # In Task, we store a list of resources, each with 1 accelerator.
1936
2065
  # This for loop is for format conversion.
1937
2066
  tmp_resources_list = []
1938
- for acc in accelerators:
2067
+ for acc, user_specified in accelerators:
1939
2068
  tmp_resource = config.copy()
1940
2069
  tmp_resource['accelerators'] = acc
2070
+ if not user_specified:
2071
+ tmp_resource['_no_missing_accel_warnings'] = True
1941
2072
  tmp_resources_list.append(
1942
2073
  Resources._from_yaml_config_single(tmp_resource))
1943
2074
 
1944
2075
  assert isinstance(accelerators, (list, set)), accelerators
1945
2076
  return type(accelerators)(tmp_resources_list)
1946
-
1947
2077
  return {Resources._from_yaml_config_single(config)}
1948
2078
 
1949
2079
  @classmethod
1950
2080
  def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
1951
-
1952
2081
  resources_fields: Dict[str, Any] = {}
1953
2082
 
1954
2083
  # Extract infra field if present
@@ -2010,6 +2139,8 @@ class Resources:
2010
2139
  # although it will end up being an int, we don't know at this point
2011
2140
  # if it has units or not, so we store it as a string
2012
2141
  resources_fields['disk_size'] = str(resources_fields['disk_size'])
2142
+ resources_fields['_no_missing_accel_warnings'] = config.pop(
2143
+ '_no_missing_accel_warnings', None)
2013
2144
 
2014
2145
  assert not config, f'Invalid resource args: {config.keys()}'
2015
2146
  return Resources(**resources_fields)
@@ -2060,6 +2191,9 @@ class Resources:
2060
2191
  config['volumes'] = volumes
2061
2192
  if self._autostop_config is not None:
2062
2193
  config['autostop'] = self._autostop_config.to_yaml_config()
2194
+
2195
+ add_if_not_none('_no_missing_accel_warnings',
2196
+ self._no_missing_accel_warnings)
2063
2197
  add_if_not_none('priority', self.priority)
2064
2198
  if self._docker_login_config is not None:
2065
2199
  config['_docker_login_config'] = dataclasses.asdict(
@@ -2232,6 +2366,10 @@ class Resources:
2232
2366
  if version < 27:
2233
2367
  self._priority = None
2234
2368
 
2369
+ if version < 28:
2370
+ self._no_missing_accel_warnings = state.get(
2371
+ '_no_missing_accel_warnings', None)
2372
+
2235
2373
  self.__dict__.update(state)
2236
2374
 
2237
2375
 
sky/server/common.py CHANGED
@@ -165,14 +165,25 @@ def set_api_cookie_jar(cookie_jar: CookieJar,
165
165
  if not cookie_path.parent.exists():
166
166
  cookie_path.parent.mkdir(parents=True, exist_ok=True)
167
167
 
168
- file_cookie_jar = MozillaCookieJar(cookie_path)
168
+ # Writing directly to the cookie jar path can race with other processes that
169
+ # are reading the cookie jar, making it look malformed. Instead, write to a
170
+ # temporary file and then move it to the final location.
171
+ # Avoid hardcoding the tmp file path, since it could cause a race with other
172
+ # processes that are also writing to the tmp file.
173
+ with tempfile.NamedTemporaryFile(dir=cookie_path.parent,
174
+ delete=False) as tmp_file:
175
+ tmp_cookie_path = tmp_file.name
176
+ file_cookie_jar = MozillaCookieJar(tmp_cookie_path)
169
177
  if cookie_path.exists():
170
- file_cookie_jar.load()
178
+ file_cookie_jar.load(str(cookie_path))
171
179
 
172
180
  for cookie in cookie_jar:
173
181
  file_cookie_jar.set_cookie(cookie)
174
182
  file_cookie_jar.save()
175
183
 
184
+ # Move the temporary file to the final location.
185
+ os.replace(tmp_cookie_path, cookie_path)
186
+
176
187
 
177
188
  def get_cookies_from_response(
178
189
  response: 'requests.Response') -> requests.cookies.RequestsCookieJar: