skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +26 -11
- sky/backends/cloud_vm_ray_backend.py +16 -5
- sky/client/cli/command.py +222 -4
- sky/client/sdk.py +110 -82
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +26 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +1 -0
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/{37-4650f214e2119168.js → 37-1f1e94f5a561202a.js} +2 -2
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/{856-bfddc18e16f3873c.js → 856-cdf66268ec878d0c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-ecc5a7003776cfa7.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +15 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +129 -0
- sky/jobs/client/sdk.py +13 -11
- sky/jobs/server/core.py +4 -0
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +16 -0
- sky/server/requests/requests.py +35 -1
- sky/server/rest.py +152 -0
- sky/server/server.py +66 -16
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +14 -3
- sky/task.py +141 -18
- sky/templates/kubernetes-ray.yml.j2 +30 -1
- sky/users/permission.py +2 -0
- sky/utils/context.py +3 -1
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +146 -3
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +123 -108
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-bde186946d353355.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-56412c7976b4655b.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/server/stream_utils.py
CHANGED
@@ -155,9 +155,14 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
155
155
|
if request_task.status > requests_lib.RequestStatus.RUNNING:
|
156
156
|
if (request_task.status ==
|
157
157
|
requests_lib.RequestStatus.CANCELLED):
|
158
|
-
|
159
|
-
|
160
|
-
|
158
|
+
if request_task.should_retry:
|
159
|
+
buffer.append(
|
160
|
+
message_utils.encode_payload(
|
161
|
+
rich_utils.Control.RETRY.encode('')))
|
162
|
+
else:
|
163
|
+
buffer.append(
|
164
|
+
f'{request_task.name!r} request {request_id}'
|
165
|
+
' cancelled\n')
|
161
166
|
break
|
162
167
|
if not follow:
|
163
168
|
break
|
sky/server/uvicorn.py
CHANGED
@@ -3,17 +3,165 @@
|
|
3
3
|
This module is a wrapper around uvicorn to customize the behavior of the
|
4
4
|
server.
|
5
5
|
"""
|
6
|
-
import
|
6
|
+
import asyncio
|
7
7
|
import os
|
8
|
+
import signal
|
8
9
|
import threading
|
9
|
-
|
10
|
+
import time
|
11
|
+
from types import FrameType
|
12
|
+
from typing import Optional, Union
|
10
13
|
|
14
|
+
import filelock
|
11
15
|
import uvicorn
|
12
16
|
from uvicorn.supervisors import multiprocess
|
13
17
|
|
18
|
+
from sky import sky_logging
|
19
|
+
from sky.server import state
|
20
|
+
from sky.server.requests import requests as requests_lib
|
21
|
+
from sky.skylet import constants
|
14
22
|
from sky.utils import context_utils
|
15
23
|
from sky.utils import subprocess_utils
|
16
24
|
|
25
|
+
logger = sky_logging.init_logger(__name__)
|
26
|
+
|
27
|
+
# File lock path for coordinating graceful shutdown across processes
|
28
|
+
_GRACEFUL_SHUTDOWN_LOCK_PATH = '/tmp/skypilot_graceful_shutdown.lock'
|
29
|
+
|
30
|
+
# Interval to check for on-going requests.
|
31
|
+
_WAIT_REQUESTS_INTERVAL_SECONDS = 5
|
32
|
+
|
33
|
+
# Timeout for waiting for on-going requests to finish.
|
34
|
+
try:
|
35
|
+
_WAIT_REQUESTS_TIMEOUT_SECONDS = int(
|
36
|
+
os.environ.get(constants.GRACE_PERIOD_SECONDS_ENV_VAR, '60'))
|
37
|
+
except ValueError:
|
38
|
+
_WAIT_REQUESTS_TIMEOUT_SECONDS = 60
|
39
|
+
|
40
|
+
# TODO(aylei): use decorator to register requests that need to be proactively
|
41
|
+
# cancelled instead of hardcoding here.
|
42
|
+
_RETRIABLE_REQUEST_NAMES = [
|
43
|
+
'sky.logs',
|
44
|
+
'sky.jobs.logs',
|
45
|
+
'sky.serve.logs',
|
46
|
+
]
|
47
|
+
|
48
|
+
|
49
|
+
class Server(uvicorn.Server):
|
50
|
+
"""Server wrapper for uvicorn.
|
51
|
+
|
52
|
+
Extended functionalities:
|
53
|
+
- Handle exit signal and perform custom graceful shutdown.
|
54
|
+
- Run the server process in a context-aware manner.
|
55
|
+
"""
|
56
|
+
|
57
|
+
def __init__(self, config: uvicorn.Config):
|
58
|
+
super().__init__(config=config)
|
59
|
+
self.exiting: bool = False
|
60
|
+
|
61
|
+
def handle_exit(self, sig: int, frame: Union[FrameType, None]) -> None:
|
62
|
+
"""Handle exit signal.
|
63
|
+
|
64
|
+
When a server process receives a SIGTERM or SIGINT signal, a graceful
|
65
|
+
shutdown will be initiated. If a SIGINT signal is received again, the
|
66
|
+
server will be forcefully shutdown.
|
67
|
+
"""
|
68
|
+
if self.exiting and sig == signal.SIGINT:
|
69
|
+
# The server has been signaled to exit and received a SIGINT again,
|
70
|
+
# do force shutdown.
|
71
|
+
logger.info('Force shutdown.')
|
72
|
+
self.should_exit = True
|
73
|
+
super().handle_exit(sig, frame)
|
74
|
+
return
|
75
|
+
if not self.exiting:
|
76
|
+
self.exiting = True
|
77
|
+
# Perform graceful shutdown in a separate thread to avoid blocking
|
78
|
+
# the main thread.
|
79
|
+
threading.Thread(target=self._graceful_shutdown,
|
80
|
+
args=(sig, frame),
|
81
|
+
daemon=True).start()
|
82
|
+
|
83
|
+
def _graceful_shutdown(self, sig: int, frame: Union[FrameType,
|
84
|
+
None]) -> None:
|
85
|
+
"""Perform graceful shutdown."""
|
86
|
+
# Block new requests so that we can wait until all on-going requests
|
87
|
+
# are finished. Note that /api/$verb operations are still allowed in
|
88
|
+
# this stage to ensure the client can still operate the on-going
|
89
|
+
# requests, e.g. /api/logs, /api/cancel, etc.
|
90
|
+
logger.info('Block new requests being submitted in worker '
|
91
|
+
f'{os.getpid()}.')
|
92
|
+
state.set_block_requests(True)
|
93
|
+
# Ensure shutting_down is set on all workers before the next step.
|
94
|
+
# TODO(aylei): hacky, need a reliable solution.
|
95
|
+
time.sleep(1)
|
96
|
+
|
97
|
+
lock = filelock.FileLock(_GRACEFUL_SHUTDOWN_LOCK_PATH)
|
98
|
+
# Elect a coordinator process to handle on-going requests check
|
99
|
+
with lock.acquire():
|
100
|
+
logger.info(f'Worker {os.getpid()} elected as shutdown coordinator')
|
101
|
+
self._wait_requests()
|
102
|
+
|
103
|
+
logger.info('Shutting down server...')
|
104
|
+
self.should_exit = True
|
105
|
+
super().handle_exit(sig, frame)
|
106
|
+
|
107
|
+
def _wait_requests(self) -> None:
|
108
|
+
"""Wait until all on-going requests are finished or cancelled."""
|
109
|
+
start_time = time.time()
|
110
|
+
while True:
|
111
|
+
statuses = [
|
112
|
+
requests_lib.RequestStatus.PENDING,
|
113
|
+
requests_lib.RequestStatus.RUNNING,
|
114
|
+
]
|
115
|
+
reqs = requests_lib.get_request_tasks(status=statuses)
|
116
|
+
if not reqs:
|
117
|
+
break
|
118
|
+
logger.info(f'{len(reqs)} on-going requests '
|
119
|
+
'found, waiting for them to finish...')
|
120
|
+
# Proactively cancel internal requests and logs requests since
|
121
|
+
# they can run for infinite time.
|
122
|
+
internal_request_ids = [
|
123
|
+
d.id for d in requests_lib.INTERNAL_REQUEST_DAEMONS
|
124
|
+
]
|
125
|
+
if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
|
126
|
+
logger.warning('Timeout waiting for on-going requests to '
|
127
|
+
'finish, cancelling all on-going requests.')
|
128
|
+
for req in reqs:
|
129
|
+
self.interrupt_request_for_retry(req.request_id)
|
130
|
+
break
|
131
|
+
interrupted = 0
|
132
|
+
for req in reqs:
|
133
|
+
if req.request_id in internal_request_ids:
|
134
|
+
self.interrupt_request_for_retry(req.request_id)
|
135
|
+
interrupted += 1
|
136
|
+
elif req.name in _RETRIABLE_REQUEST_NAMES:
|
137
|
+
self.interrupt_request_for_retry(req.request_id)
|
138
|
+
interrupted += 1
|
139
|
+
# TODO(aylei): interrupt pending requests to accelerate the
|
140
|
+
# shutdown.
|
141
|
+
# If some requests are not interrupted, wait for them to finish,
|
142
|
+
# otherwise we just check again immediately to accelerate the
|
143
|
+
# shutdown process.
|
144
|
+
if interrupted < len(reqs):
|
145
|
+
time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
|
146
|
+
|
147
|
+
def interrupt_request_for_retry(self, request_id: str) -> None:
|
148
|
+
"""Interrupt a request for retry."""
|
149
|
+
with requests_lib.update_request(request_id) as req:
|
150
|
+
if req is None:
|
151
|
+
return
|
152
|
+
if req.pid is not None:
|
153
|
+
os.kill(req.pid, signal.SIGTERM)
|
154
|
+
req.status = requests_lib.RequestStatus.CANCELLED
|
155
|
+
req.should_retry = True
|
156
|
+
logger.info(
|
157
|
+
f'Request {request_id} interrupted and will be retried by client.')
|
158
|
+
|
159
|
+
def run(self, *args, **kwargs):
|
160
|
+
"""Run the server process."""
|
161
|
+
context_utils.hijack_sys_attrs()
|
162
|
+
with self.capture_signals():
|
163
|
+
asyncio.run(self.serve(*args, **kwargs))
|
164
|
+
|
17
165
|
|
18
166
|
def run(config: uvicorn.Config):
|
19
167
|
"""Run unvicorn server."""
|
@@ -22,28 +170,20 @@ def run(config: uvicorn.Config):
|
|
22
170
|
# in uvicorn. Since we do not use reload now, simply
|
23
171
|
# guard by an exception.
|
24
172
|
raise ValueError('Reload is not supported yet.')
|
25
|
-
server =
|
26
|
-
run_server_process = functools.partial(_run_server_process, server)
|
173
|
+
server = Server(config=config)
|
27
174
|
try:
|
28
175
|
if config.workers is not None and config.workers > 1:
|
29
176
|
sock = config.bind_socket()
|
30
|
-
SlowStartMultiprocess(config,
|
31
|
-
target=run_server_process,
|
177
|
+
SlowStartMultiprocess(config, target=server.run,
|
32
178
|
sockets=[sock]).run()
|
33
179
|
else:
|
34
|
-
|
180
|
+
server.run()
|
35
181
|
finally:
|
36
182
|
# Copied from uvicorn.run()
|
37
183
|
if config.uds and os.path.exists(config.uds):
|
38
184
|
os.remove(config.uds)
|
39
185
|
|
40
186
|
|
41
|
-
def _run_server_process(server: uvicorn.Server, *args, **kwargs):
|
42
|
-
"""Run the server process with contextually aware."""
|
43
|
-
context_utils.hijack_sys_attrs()
|
44
|
-
server.run(*args, **kwargs)
|
45
|
-
|
46
|
-
|
47
187
|
class SlowStartMultiprocess(multiprocess.Multiprocess):
|
48
188
|
"""Uvicorn Multiprocess wrapper with slow start.
|
49
189
|
|
sky/setup_files/dependencies.py
CHANGED
sky/skylet/constants.py
CHANGED
@@ -413,6 +413,8 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
413
413
|
# Environment variable that is set to 'true' if this is a skypilot server.
|
414
414
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
415
415
|
|
416
|
+
# Environment variable that is set to 'true' if metrics are enabled.
|
417
|
+
ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
|
416
418
|
# Environment variable that is set to 'true' if basic
|
417
419
|
# authentication is enabled in the API server.
|
418
420
|
ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
|
@@ -449,20 +451,29 @@ TIME_PATTERN: str = (
|
|
449
451
|
|
450
452
|
MEMORY_SIZE_UNITS = {
|
451
453
|
'kb': 2**10,
|
454
|
+
'ki': 2**10,
|
452
455
|
'mb': 2**20,
|
456
|
+
'mi': 2**20,
|
453
457
|
'gb': 2**30,
|
458
|
+
'gi': 2**30,
|
454
459
|
'tb': 2**40,
|
460
|
+
'ti': 2**40,
|
455
461
|
'pb': 2**50,
|
462
|
+
'pi': 2**50,
|
456
463
|
}
|
457
464
|
|
458
465
|
MEMORY_SIZE_PATTERN = (
|
459
466
|
'^[0-9]+('
|
460
|
-
f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}'
|
461
|
-
')
|
462
|
-
|
467
|
+
f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}|'
|
468
|
+
f'{"|".join([unit.upper() for unit in MEMORY_SIZE_UNITS])}|'
|
469
|
+
f'{"|".join([unit[0].upper() + unit[1:] for unit in MEMORY_SIZE_UNITS if len(unit) > 1])}' # pylint: disable=line-too-long
|
470
|
+
')?$')
|
471
|
+
|
472
|
+
LAST_USE_TRUNC_LENGTH = 25
|
463
473
|
|
464
474
|
MIN_PRIORITY = -1000
|
465
475
|
MAX_PRIORITY = 1000
|
466
476
|
DEFAULT_PRIORITY = 0
|
467
477
|
|
478
|
+
GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
|
468
479
|
COST_REPORT_DEFAULT_DAYS = 30
|
sky/task.py
CHANGED
@@ -24,6 +24,7 @@ from sky.skylet import constants
|
|
24
24
|
from sky.utils import common_utils
|
25
25
|
from sky.utils import schemas
|
26
26
|
from sky.utils import ux_utils
|
27
|
+
from sky.volumes import volume as volume_lib
|
27
28
|
|
28
29
|
if typing.TYPE_CHECKING:
|
29
30
|
import yaml
|
@@ -246,12 +247,14 @@ class Task:
|
|
246
247
|
secrets: Optional[Dict[str, str]] = None,
|
247
248
|
workdir: Optional[str] = None,
|
248
249
|
num_nodes: Optional[int] = None,
|
250
|
+
volumes: Optional[Dict[str, str]] = None,
|
249
251
|
# Advanced:
|
250
252
|
docker_image: Optional[str] = None,
|
251
253
|
event_callback: Optional[str] = None,
|
252
254
|
blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
|
253
255
|
# Internal use only.
|
254
256
|
file_mounts_mapping: Optional[Dict[str, str]] = None,
|
257
|
+
volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
|
255
258
|
):
|
256
259
|
"""Initializes a Task.
|
257
260
|
|
@@ -319,6 +322,7 @@ class Task:
|
|
319
322
|
self.setup = setup
|
320
323
|
self._envs = envs or {}
|
321
324
|
self._secrets = secrets or {}
|
325
|
+
self._volumes = volumes or {}
|
322
326
|
|
323
327
|
# Validate Docker login configuration early if both envs and secrets
|
324
328
|
# contain Docker variables
|
@@ -361,7 +365,9 @@ class Task:
|
|
361
365
|
self.best_resources: Optional[sky.Resources] = None
|
362
366
|
|
363
367
|
# For internal use only.
|
364
|
-
self.file_mounts_mapping = file_mounts_mapping
|
368
|
+
self.file_mounts_mapping: Optional[Dict[str, str]] = file_mounts_mapping
|
369
|
+
self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
|
370
|
+
volume_mounts)
|
365
371
|
|
366
372
|
dag = sky.dag.get_current_dag()
|
367
373
|
if dag is not None:
|
@@ -442,12 +448,9 @@ class Task:
|
|
442
448
|
if self.file_mounts is None:
|
443
449
|
return
|
444
450
|
for target, source in self.file_mounts.items():
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
'File mount paths cannot end with a slash '
|
449
|
-
'(try "/mydir: /mydir" or "/myfile: /myfile"). '
|
450
|
-
f'Found: target={target} source={source}')
|
451
|
+
location = f'file_mounts.{target}: {source}'
|
452
|
+
self._validate_mount_path(target, location)
|
453
|
+
self._validate_path(source, location)
|
451
454
|
if data_utils.is_cloud_store_url(target):
|
452
455
|
with ux_utils.print_exception_no_traceback():
|
453
456
|
raise ValueError(
|
@@ -462,17 +465,25 @@ class Task:
|
|
462
465
|
f'File mount source {source!r} does not exist '
|
463
466
|
'locally. To fix: check if it exists, and correct '
|
464
467
|
'the path.')
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
468
|
+
|
469
|
+
def _validate_mount_path(self, path: str, location: str):
|
470
|
+
self._validate_path(path, location)
|
471
|
+
# TODO(zhwu): /home/username/sky_workdir as the target path need
|
472
|
+
# to be filtered out as well.
|
473
|
+
if (path == constants.SKY_REMOTE_WORKDIR and self.workdir is not None):
|
474
|
+
with ux_utils.print_exception_no_traceback():
|
475
|
+
raise ValueError(
|
476
|
+
f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
|
477
|
+
'destination path of a file mount, as it will be used '
|
478
|
+
'by the workdir. If uploading a file/folder to the '
|
479
|
+
'workdir is needed, please specify the full path to '
|
480
|
+
'the file/folder.')
|
481
|
+
|
482
|
+
def _validate_path(self, path: str, location: str):
|
483
|
+
if path.endswith('/'):
|
484
|
+
with ux_utils.print_exception_no_traceback():
|
485
|
+
raise ValueError('Mount paths cannot end with a slash '
|
486
|
+
f'Found: {path} in {location}')
|
476
487
|
|
477
488
|
def expand_and_validate_workdir(self):
|
478
489
|
"""Expand workdir to absolute path and validate it.
|
@@ -587,6 +598,7 @@ class Task:
|
|
587
598
|
secrets=config.pop('secrets', None),
|
588
599
|
event_callback=config.pop('event_callback', None),
|
589
600
|
file_mounts_mapping=config.pop('file_mounts_mapping', None),
|
601
|
+
volumes=config.pop('volumes', None),
|
590
602
|
)
|
591
603
|
|
592
604
|
# Create lists to store storage objects inlined in file_mounts.
|
@@ -711,6 +723,16 @@ class Task:
|
|
711
723
|
service = service_spec.SkyServiceSpec.from_yaml_config(service)
|
712
724
|
task.set_service(service)
|
713
725
|
|
726
|
+
volume_mounts = config.pop('volume_mounts', None)
|
727
|
+
if volume_mounts is not None:
|
728
|
+
task.volume_mounts = []
|
729
|
+
for vol in volume_mounts:
|
730
|
+
common_utils.validate_schema(vol,
|
731
|
+
schemas.get_volume_mount_schema(),
|
732
|
+
'Invalid volume mount config: ')
|
733
|
+
volume_mount = volume_lib.VolumeMount.from_yaml_config(vol)
|
734
|
+
task.volume_mounts.append(volume_mount)
|
735
|
+
|
714
736
|
assert not config, f'Invalid task args: {config.keys()}'
|
715
737
|
return task
|
716
738
|
|
@@ -745,6 +767,97 @@ class Task:
|
|
745
767
|
config = {}
|
746
768
|
return Task.from_yaml_config(config)
|
747
769
|
|
770
|
+
def resolve_and_validate_volumes(self) -> None:
|
771
|
+
"""Resolve volumes config to volume mounts and validate them.
|
772
|
+
|
773
|
+
Raises:
|
774
|
+
exceptions.VolumeNotFoundError: if any volume is not found.
|
775
|
+
exceptions.VolumeTopologyConflictError: if there is conflict in the
|
776
|
+
volumes and compute topology.
|
777
|
+
"""
|
778
|
+
# Volumes have already been resolved; a typical case is that the API server
|
779
|
+
# has resolved the volumes and the dag was then submitted to
|
780
|
+
# controllers.
|
781
|
+
if self.volume_mounts is not None:
|
782
|
+
return None
|
783
|
+
if not self._volumes:
|
784
|
+
return None
|
785
|
+
volume_mounts: List[volume_lib.VolumeMount] = []
|
786
|
+
for dst_path, vol in self._volumes.items():
|
787
|
+
self._validate_mount_path(dst_path, location='volumes')
|
788
|
+
# Shortcut for `dst_path: volume_name`
|
789
|
+
if isinstance(vol, str):
|
790
|
+
volume_mount = volume_lib.VolumeMount.resolve(dst_path, vol)
|
791
|
+
elif isinstance(vol, dict):
|
792
|
+
assert 'name' in vol, 'Volume name must be set.'
|
793
|
+
volume_mount = volume_lib.VolumeMount.resolve(
|
794
|
+
dst_path, vol['name'])
|
795
|
+
else:
|
796
|
+
raise ValueError(f'Invalid volume config: {dst_path}: {vol}')
|
797
|
+
volume_mounts.append(volume_mount)
|
798
|
+
# Disable certain access modes
|
799
|
+
disabled_modes = {}
|
800
|
+
if self.num_nodes > 1:
|
801
|
+
disabled_modes[
|
802
|
+
volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value] = (
|
803
|
+
'access mode ReadWriteOnce is not supported for '
|
804
|
+
'multi-node tasks.')
|
805
|
+
disabled_modes[
|
806
|
+
volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value] = (
|
807
|
+
'access mode ReadWriteOncePod is not supported for '
|
808
|
+
'multi-node tasks.')
|
809
|
+
# TODO(aylei): generalize access mode to all volume types
|
810
|
+
# Record the required topology and the volume that requires it, e.g.
|
811
|
+
# {'cloud': ('volume_name', 'aws')}
|
812
|
+
topology: Dict[str, Tuple[str, Optional[str]]] = {
|
813
|
+
'cloud': ('', None),
|
814
|
+
'region': ('', None),
|
815
|
+
'zone': ('', None),
|
816
|
+
}
|
817
|
+
for vol in volume_mounts:
|
818
|
+
# Check access mode
|
819
|
+
access_mode = vol.volume_config.config.get('access_mode', '')
|
820
|
+
if access_mode in disabled_modes:
|
821
|
+
raise ValueError(f'Volume {vol.volume_name} with '
|
822
|
+
f'{disabled_modes[access_mode]}')
|
823
|
+
# Check topology
|
824
|
+
for key, (vol_name, previous_req) in topology.items():
|
825
|
+
req = getattr(vol.volume_config, key)
|
826
|
+
if req is not None:
|
827
|
+
if previous_req is not None and req != previous_req:
|
828
|
+
raise exceptions.VolumeTopologyConflictError(
|
829
|
+
f'Volume {vol.volume_name} can only be attached on '
|
830
|
+
f'{key}:{req}, which conflicts with another volume '
|
831
|
+
f'{vol_name} that requires {key}:{previous_req}.'
|
832
|
+
f'Please use different volumes and retry.')
|
833
|
+
topology[key] = (vol_name, req)
|
834
|
+
# Now we have the topology requirements from the intersection of all
|
835
|
+
# volumes. Check if there is topology conflict with the resources.
|
836
|
+
# Volume must have no conflict with ALL resources even if user
|
837
|
+
# specifies 'any_of' resources to ensure no resources will conflict
|
838
|
+
# with the volumes during failover.
|
839
|
+
|
840
|
+
for res in self.resources:
|
841
|
+
for key, (vol_name, vol_req) in topology.items():
|
842
|
+
req = getattr(res, key)
|
843
|
+
if (req is not None and vol_req is not None and
|
844
|
+
str(req) != vol_req):
|
845
|
+
raise exceptions.VolumeTopologyConflictError(
|
846
|
+
f'The task requires {key}:{req}, which conflicts with '
|
847
|
+
f'the volume constraint {key}:{vol_req}. Please '
|
848
|
+
f'use different volumes and retry.')
|
849
|
+
# No topology conflict, we safely override the topology of resources to
|
850
|
+
# satisfy the volume constraints.
|
851
|
+
override_params = {}
|
852
|
+
for key, (vol_name, vol_req) in topology.items():
|
853
|
+
if vol_req is not None:
|
854
|
+
if key == 'cloud':
|
855
|
+
override_params[key] = sky.CLOUD_REGISTRY.from_str(vol_req)
|
856
|
+
else:
|
857
|
+
override_params[key] = vol_req
|
858
|
+
self.set_resources_override(override_params)
|
859
|
+
self.volume_mounts = volume_mounts
|
860
|
+
|
748
861
|
@property
|
749
862
|
def num_nodes(self) -> int:
|
750
863
|
return self._num_nodes
|
@@ -767,6 +880,10 @@ class Task:
|
|
767
880
|
def secrets(self) -> Dict[str, str]:
|
768
881
|
return self._secrets
|
769
882
|
|
883
|
+
@property
|
884
|
+
def volumes(self) -> Dict[str, str]:
|
885
|
+
return self._volumes
|
886
|
+
|
770
887
|
def update_envs(
|
771
888
|
self, envs: Union[None, List[Tuple[str, str]],
|
772
889
|
Dict[str, str]]) -> 'Task':
|
@@ -1453,6 +1570,12 @@ class Task:
|
|
1453
1570
|
})
|
1454
1571
|
|
1455
1572
|
add_if_not_none('file_mounts_mapping', self.file_mounts_mapping)
|
1573
|
+
add_if_not_none('volumes', self.volumes)
|
1574
|
+
if self.volume_mounts is not None:
|
1575
|
+
config['volume_mounts'] = [
|
1576
|
+
volume_mount.to_yaml_config()
|
1577
|
+
for volume_mount in self.volume_mounts
|
1578
|
+
]
|
1456
1579
|
return config
|
1457
1580
|
|
1458
1581
|
def get_required_cloud_features(
|
@@ -243,6 +243,22 @@ provider:
|
|
243
243
|
# This selector must match the head node pod's selector below.
|
244
244
|
selector:
|
245
245
|
component: {{cluster_name_on_cloud}}-head
|
246
|
+
# Headless service mapping hostnames to rest of the worker nodes
|
247
|
+
{% for worker_id in range(1, num_nodes) %}
|
248
|
+
- apiVersion: v1
|
249
|
+
kind: Service
|
250
|
+
metadata:
|
251
|
+
labels:
|
252
|
+
parent: skypilot
|
253
|
+
skypilot-cluster: {{cluster_name_on_cloud}}
|
254
|
+
skypilot-user: {{ user }}
|
255
|
+
name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
|
256
|
+
spec:
|
257
|
+
selector:
|
258
|
+
component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
|
259
|
+
clusterIP: None
|
260
|
+
{% endfor %}
|
261
|
+
|
246
262
|
|
247
263
|
# Specify the pod type for the ray head node (as configured below).
|
248
264
|
head_node_type: ray_head_default
|
@@ -255,7 +271,7 @@ available_node_types:
|
|
255
271
|
metadata:
|
256
272
|
# name will be filled in the provisioner
|
257
273
|
# head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
|
258
|
-
# service is required.
|
274
|
+
# service is required. Worker nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
|
259
275
|
labels:
|
260
276
|
parent: skypilot
|
261
277
|
# component will be set for the head node pod to be the same as the head node service selector above if a
|
@@ -287,6 +303,10 @@ available_node_types:
|
|
287
303
|
serviceAccountName: {{k8s_service_account_name}}
|
288
304
|
automountServiceAccountToken: {{k8s_automount_sa_token}}
|
289
305
|
restartPolicy: {{ "Always" if high_availability else "Never" }}
|
306
|
+
{% if volume_mounts %}
|
307
|
+
securityContext:
|
308
|
+
fsGroup: 1000
|
309
|
+
{% endif %}
|
290
310
|
|
291
311
|
# Add node selector if GPU/TPUs are requested:
|
292
312
|
{% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
|
@@ -365,6 +385,11 @@ available_node_types:
|
|
365
385
|
persistentVolumeClaim:
|
366
386
|
claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
|
367
387
|
{% endif %}
|
388
|
+
{% for volume_mount in volume_mounts %}
|
389
|
+
- name: {{volume_mount.name}}
|
390
|
+
persistentVolumeClaim:
|
391
|
+
claimName: {{volume_mount.volume_name_on_cloud}}
|
392
|
+
{% endfor %}
|
368
393
|
containers:
|
369
394
|
- name: ray-node
|
370
395
|
imagePullPolicy: IfNotPresent
|
@@ -734,6 +759,10 @@ available_node_types:
|
|
734
759
|
- name: fusermount-shared-dir
|
735
760
|
mountPath: {{k8s_fusermount_shared_dir}}
|
736
761
|
{% endif %}
|
762
|
+
{% for volume_mount in volume_mounts %}
|
763
|
+
- name: {{volume_mount.name}}
|
764
|
+
mountPath: {{volume_mount.path}}
|
765
|
+
{% endfor %}
|
737
766
|
resources:
|
738
767
|
requests:
|
739
768
|
cpu: {{cpus}}
|
sky/users/permission.py
CHANGED
@@ -18,6 +18,8 @@ from sky.utils import common_utils
|
|
18
18
|
|
19
19
|
logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
|
20
20
|
logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)
|
21
|
+
logging.getLogger('casbin.model').setLevel(sky_logging.ERROR)
|
22
|
+
logging.getLogger('casbin.rbac').setLevel(sky_logging.ERROR)
|
21
23
|
logger = sky_logging.init_logger(__name__)
|
22
24
|
|
23
25
|
# Filelocks for the policy update.
|
sky/utils/context.py
CHANGED
@@ -254,7 +254,9 @@ class Popen(subprocess.Popen):
|
|
254
254
|
def __init__(self, *args, **kwargs):
|
255
255
|
env = kwargs.pop('env', None)
|
256
256
|
if env is None:
|
257
|
-
|
257
|
+
# Pass a copy of current context.environ to avoid race condition
|
258
|
+
# when the context is updated after the Popen is created.
|
259
|
+
env = os.environ.copy()
|
258
260
|
super().__init__(*args, env=env, **kwargs)
|
259
261
|
|
260
262
|
|
sky/utils/resources_utils.py
CHANGED
@@ -8,6 +8,7 @@ import typing
|
|
8
8
|
from typing import Dict, List, Optional, Set, Union
|
9
9
|
|
10
10
|
from sky import skypilot_config
|
11
|
+
from sky.skylet import constants
|
11
12
|
from sky.utils import common_utils
|
12
13
|
from sky.utils import registry
|
13
14
|
from sky.utils import ux_utils
|
@@ -331,3 +332,68 @@ def make_launchables_for_valid_region_zones(
|
|
331
332
|
# Batch the requests at the granularity of a single region.
|
332
333
|
launchables.append(launchable_resources.copy(region=region.name))
|
333
334
|
return launchables
|
335
|
+
|
336
|
+
|
337
|
+
def parse_memory_resource(resource_qty_str: Union[str, int, float],
|
338
|
+
field_name: str,
|
339
|
+
ret_type: type = int,
|
340
|
+
unit: str = 'gb',
|
341
|
+
allow_plus: bool = False,
|
342
|
+
allow_x: bool = False,
|
343
|
+
allow_rounding: bool = False) -> str:
|
344
|
+
"""Returns memory size in chosen units given a resource quantity string.
|
345
|
+
|
346
|
+
Args:
|
347
|
+
resource_qty_str: Resource quantity string
|
348
|
+
unit: Unit to convert to
|
349
|
+
allow_plus: Whether to allow '+' prefix
|
350
|
+
allow_x: Whether to allow 'x' suffix
|
351
|
+
"""
|
352
|
+
assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'
|
353
|
+
|
354
|
+
error_msg = (f'"{field_name}" field should be a '
|
355
|
+
f'{constants.MEMORY_SIZE_PATTERN}+?,'
|
356
|
+
f' got {resource_qty_str}')
|
357
|
+
|
358
|
+
resource_str = str(resource_qty_str)
|
359
|
+
|
360
|
+
# Handle plus and x suffixes, x is only used internally for jobs controller
|
361
|
+
plus = ''
|
362
|
+
if resource_str.endswith('+'):
|
363
|
+
if allow_plus:
|
364
|
+
resource_str = resource_str[:-1]
|
365
|
+
plus = '+'
|
366
|
+
else:
|
367
|
+
raise ValueError(error_msg)
|
368
|
+
|
369
|
+
x = ''
|
370
|
+
if resource_str.endswith('x'):
|
371
|
+
if allow_x:
|
372
|
+
resource_str = resource_str[:-1]
|
373
|
+
x = 'x'
|
374
|
+
else:
|
375
|
+
raise ValueError(error_msg)
|
376
|
+
|
377
|
+
try:
|
378
|
+
# We assume it is already in the wanted units to maintain backwards
|
379
|
+
# compatibility
|
380
|
+
ret_type(resource_str)
|
381
|
+
return f'{resource_str}{plus}{x}'
|
382
|
+
except ValueError:
|
383
|
+
pass
|
384
|
+
|
385
|
+
resource_str = resource_str.lower()
|
386
|
+
for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
|
387
|
+
if resource_str.endswith(mem_unit):
|
388
|
+
try:
|
389
|
+
value = ret_type(resource_str[:-len(mem_unit)])
|
390
|
+
converted = (value * multiplier /
|
391
|
+
constants.MEMORY_SIZE_UNITS[unit])
|
392
|
+
if not allow_rounding and ret_type(converted) != converted:
|
393
|
+
raise ValueError(error_msg)
|
394
|
+
converted = ret_type(converted)
|
395
|
+
return f'{converted}{plus}{x}'
|
396
|
+
except ValueError:
|
397
|
+
continue
|
398
|
+
|
399
|
+
raise ValueError(error_msg)
|