skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +1 -6
- sky/backends/backend_utils.py +26 -11
- sky/backends/cloud_vm_ray_backend.py +16 -5
- sky/client/cli/command.py +232 -9
- sky/client/sdk.py +195 -91
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +26 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/ssh.py +36 -0
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +21 -0
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
- sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
- sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
- sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
- sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
- sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
- sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
- sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
- sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +15 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +129 -0
- sky/jobs/client/sdk.py +13 -11
- sky/jobs/server/core.py +4 -0
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +70 -4
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +16 -0
- sky/server/requests/requests.py +35 -1
- sky/server/rest.py +153 -0
- sky/server/server.py +70 -43
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -3
- sky/skypilot_config.py +3 -0
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +133 -0
- sky/ssh_node_pools/server.py +232 -0
- sky/task.py +141 -18
- sky/templates/kubernetes-ray.yml.j2 +30 -1
- sky/users/permission.py +2 -0
- sky/utils/context.py +3 -1
- sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +146 -3
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
- sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
- sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/server/uvicorn.py
CHANGED
@@ -3,17 +3,165 @@
|
|
3
3
|
This module is a wrapper around uvicorn to customize the behavior of the
|
4
4
|
server.
|
5
5
|
"""
|
6
|
-
import functools
|
6
|
+
import asyncio
|
7
7
|
import os
|
8
|
+
import signal
|
8
9
|
import threading
|
9
|
-
|
10
|
+
import time
|
11
|
+
from types import FrameType
|
12
|
+
from typing import Optional, Union
|
10
13
|
|
14
|
+
import filelock
|
11
15
|
import uvicorn
|
12
16
|
from uvicorn.supervisors import multiprocess
|
13
17
|
|
18
|
+
from sky import sky_logging
|
19
|
+
from sky.server import state
|
20
|
+
from sky.server.requests import requests as requests_lib
|
21
|
+
from sky.skylet import constants
|
14
22
|
from sky.utils import context_utils
|
15
23
|
from sky.utils import subprocess_utils
|
16
24
|
|
25
|
+
logger = sky_logging.init_logger(__name__)
|
26
|
+
|
27
|
+
# File lock path for coordinating graceful shutdown across processes
|
28
|
+
_GRACEFUL_SHUTDOWN_LOCK_PATH = '/tmp/skypilot_graceful_shutdown.lock'
|
29
|
+
|
30
|
+
# Interval to check for on-going requests.
|
31
|
+
_WAIT_REQUESTS_INTERVAL_SECONDS = 5
|
32
|
+
|
33
|
+
# Timeout for waiting for on-going requests to finish.
|
34
|
+
try:
|
35
|
+
_WAIT_REQUESTS_TIMEOUT_SECONDS = int(
|
36
|
+
os.environ.get(constants.GRACE_PERIOD_SECONDS_ENV_VAR, '60'))
|
37
|
+
except ValueError:
|
38
|
+
_WAIT_REQUESTS_TIMEOUT_SECONDS = 60
|
39
|
+
|
40
|
+
# TODO(aylei): use decorator to register requests that need to be proactively
|
41
|
+
# cancelled instead of hardcoding here.
|
42
|
+
_RETRIABLE_REQUEST_NAMES = [
|
43
|
+
'sky.logs',
|
44
|
+
'sky.jobs.logs',
|
45
|
+
'sky.serve.logs',
|
46
|
+
]
|
47
|
+
|
48
|
+
|
49
|
+
class Server(uvicorn.Server):
|
50
|
+
"""Server wrapper for uvicorn.
|
51
|
+
|
52
|
+
Extended functionalities:
|
53
|
+
- Handle exit signal and perform custom graceful shutdown.
|
54
|
+
- Run the server process in a context-aware manner.
|
55
|
+
"""
|
56
|
+
|
57
|
+
def __init__(self, config: uvicorn.Config):
|
58
|
+
super().__init__(config=config)
|
59
|
+
self.exiting: bool = False
|
60
|
+
|
61
|
+
def handle_exit(self, sig: int, frame: Union[FrameType, None]) -> None:
|
62
|
+
"""Handle exit signal.
|
63
|
+
|
64
|
+
When a server process receives a SIGTERM or SIGINT signal, a graceful
|
65
|
+
shutdown will be initiated. If a SIGINT signal is received again, the
|
66
|
+
server will be forcefully shutdown.
|
67
|
+
"""
|
68
|
+
if self.exiting and sig == signal.SIGINT:
|
69
|
+
# The server has been signaled to exit and received a SIGINT again,
|
70
|
+
# do force shutdown.
|
71
|
+
logger.info('Force shutdown.')
|
72
|
+
self.should_exit = True
|
73
|
+
super().handle_exit(sig, frame)
|
74
|
+
return
|
75
|
+
if not self.exiting:
|
76
|
+
self.exiting = True
|
77
|
+
# Perform graceful shutdown in a separate thread to avoid blocking
|
78
|
+
# the main thread.
|
79
|
+
threading.Thread(target=self._graceful_shutdown,
|
80
|
+
args=(sig, frame),
|
81
|
+
daemon=True).start()
|
82
|
+
|
83
|
+
def _graceful_shutdown(self, sig: int, frame: Union[FrameType,
|
84
|
+
None]) -> None:
|
85
|
+
"""Perform graceful shutdown."""
|
86
|
+
# Block new requests so that we can wait until all on-going requests
|
87
|
+
# are finished. Note that /api/$verb operations are still allowed in
|
88
|
+
# this stage to ensure the client can still operate the on-going
|
89
|
+
# requests, e.g. /api/logs, /api/cancel, etc.
|
90
|
+
logger.info('Block new requests being submitted in worker '
|
91
|
+
f'{os.getpid()}.')
|
92
|
+
state.set_block_requests(True)
|
93
|
+
# Ensure shutting_down is set on all workers before the next step.
|
94
|
+
# TODO(aylei): hacky, need a reliable solution.
|
95
|
+
time.sleep(1)
|
96
|
+
|
97
|
+
lock = filelock.FileLock(_GRACEFUL_SHUTDOWN_LOCK_PATH)
|
98
|
+
# Elect a coordinator process to handle on-going requests check
|
99
|
+
with lock.acquire():
|
100
|
+
logger.info(f'Worker {os.getpid()} elected as shutdown coordinator')
|
101
|
+
self._wait_requests()
|
102
|
+
|
103
|
+
logger.info('Shutting down server...')
|
104
|
+
self.should_exit = True
|
105
|
+
super().handle_exit(sig, frame)
|
106
|
+
|
107
|
+
def _wait_requests(self) -> None:
|
108
|
+
"""Wait until all on-going requests are finished or cancelled."""
|
109
|
+
start_time = time.time()
|
110
|
+
while True:
|
111
|
+
statuses = [
|
112
|
+
requests_lib.RequestStatus.PENDING,
|
113
|
+
requests_lib.RequestStatus.RUNNING,
|
114
|
+
]
|
115
|
+
reqs = requests_lib.get_request_tasks(status=statuses)
|
116
|
+
if not reqs:
|
117
|
+
break
|
118
|
+
logger.info(f'{len(reqs)} on-going requests '
|
119
|
+
'found, waiting for them to finish...')
|
120
|
+
# Proactively cancel internal requests and logs requests since
|
121
|
+
# they can run for infinite time.
|
122
|
+
internal_request_ids = [
|
123
|
+
d.id for d in requests_lib.INTERNAL_REQUEST_DAEMONS
|
124
|
+
]
|
125
|
+
if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
|
126
|
+
logger.warning('Timeout waiting for on-going requests to '
|
127
|
+
'finish, cancelling all on-going requests.')
|
128
|
+
for req in reqs:
|
129
|
+
self.interrupt_request_for_retry(req.request_id)
|
130
|
+
break
|
131
|
+
interrupted = 0
|
132
|
+
for req in reqs:
|
133
|
+
if req.request_id in internal_request_ids:
|
134
|
+
self.interrupt_request_for_retry(req.request_id)
|
135
|
+
interrupted += 1
|
136
|
+
elif req.name in _RETRIABLE_REQUEST_NAMES:
|
137
|
+
self.interrupt_request_for_retry(req.request_id)
|
138
|
+
interrupted += 1
|
139
|
+
# TODO(aylei): interrupt pending requests to accelerate the
|
140
|
+
# shutdown.
|
141
|
+
# If some requests are not interrupted, wait for them to finish,
|
142
|
+
# otherwise we just check again immediately to accelerate the
|
143
|
+
# shutdown process.
|
144
|
+
if interrupted < len(reqs):
|
145
|
+
time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
|
146
|
+
|
147
|
+
def interrupt_request_for_retry(self, request_id: str) -> None:
|
148
|
+
"""Interrupt a request for retry."""
|
149
|
+
with requests_lib.update_request(request_id) as req:
|
150
|
+
if req is None:
|
151
|
+
return
|
152
|
+
if req.pid is not None:
|
153
|
+
os.kill(req.pid, signal.SIGTERM)
|
154
|
+
req.status = requests_lib.RequestStatus.CANCELLED
|
155
|
+
req.should_retry = True
|
156
|
+
logger.info(
|
157
|
+
f'Request {request_id} interrupted and will be retried by client.')
|
158
|
+
|
159
|
+
def run(self, *args, **kwargs):
|
160
|
+
"""Run the server process."""
|
161
|
+
context_utils.hijack_sys_attrs()
|
162
|
+
with self.capture_signals():
|
163
|
+
asyncio.run(self.serve(*args, **kwargs))
|
164
|
+
|
17
165
|
|
18
166
|
def run(config: uvicorn.Config):
|
19
167
|
"""Run unvicorn server."""
|
@@ -22,28 +170,20 @@ def run(config: uvicorn.Config):
|
|
22
170
|
# in uvicorn. Since we do not use reload now, simply
|
23
171
|
# guard by an exception.
|
24
172
|
raise ValueError('Reload is not supported yet.')
|
25
|
-
server = uvicorn.Server(config=config)
|
26
|
-
run_server_process = functools.partial(_run_server_process, server)
|
173
|
+
server = Server(config=config)
|
27
174
|
try:
|
28
175
|
if config.workers is not None and config.workers > 1:
|
29
176
|
sock = config.bind_socket()
|
30
|
-
SlowStartMultiprocess(config,
|
31
|
-
target=run_server_process,
|
177
|
+
SlowStartMultiprocess(config, target=server.run,
|
32
178
|
sockets=[sock]).run()
|
33
179
|
else:
|
34
|
-
|
180
|
+
server.run()
|
35
181
|
finally:
|
36
182
|
# Copied from uvicorn.run()
|
37
183
|
if config.uds and os.path.exists(config.uds):
|
38
184
|
os.remove(config.uds)
|
39
185
|
|
40
186
|
|
41
|
-
def _run_server_process(server: uvicorn.Server, *args, **kwargs):
|
42
|
-
"""Run the server process with contextually aware."""
|
43
|
-
context_utils.hijack_sys_attrs()
|
44
|
-
server.run(*args, **kwargs)
|
45
|
-
|
46
|
-
|
47
187
|
class SlowStartMultiprocess(multiprocess.Multiprocess):
|
48
188
|
"""Uvicorn Multiprocess wrapper with slow start.
|
49
189
|
|
sky/setup_files/dependencies.py
CHANGED
sky/skylet/constants.py
CHANGED
@@ -413,6 +413,13 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
413
413
|
# Environment variable that is set to 'true' if this is a skypilot server.
|
414
414
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
415
415
|
|
416
|
+
# Environment variable that is set to 'true' if metrics are enabled.
|
417
|
+
ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
|
418
|
+
|
419
|
+
# Environment variable that is used as the DB connection string for the
|
420
|
+
# skypilot server.
|
421
|
+
ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
|
422
|
+
|
416
423
|
# Environment variable that is set to 'true' if basic
|
417
424
|
# authentication is enabled in the API server.
|
418
425
|
ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
|
@@ -449,20 +456,29 @@ TIME_PATTERN: str = (
|
|
449
456
|
|
450
457
|
MEMORY_SIZE_UNITS = {
|
451
458
|
'kb': 2**10,
|
459
|
+
'ki': 2**10,
|
452
460
|
'mb': 2**20,
|
461
|
+
'mi': 2**20,
|
453
462
|
'gb': 2**30,
|
463
|
+
'gi': 2**30,
|
454
464
|
'tb': 2**40,
|
465
|
+
'ti': 2**40,
|
455
466
|
'pb': 2**50,
|
467
|
+
'pi': 2**50,
|
456
468
|
}
|
457
469
|
|
458
470
|
MEMORY_SIZE_PATTERN = (
|
459
471
|
'^[0-9]+('
|
460
|
-
f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}'
|
461
|
-
')
|
462
|
-
|
472
|
+
f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}|'
|
473
|
+
f'{"|".join([unit.upper() for unit in MEMORY_SIZE_UNITS])}|'
|
474
|
+
f'{"|".join([unit[0].upper() + unit[1:] for unit in MEMORY_SIZE_UNITS if len(unit) > 1])}' # pylint: disable=line-too-long
|
475
|
+
')?$')
|
476
|
+
|
477
|
+
LAST_USE_TRUNC_LENGTH = 25
|
463
478
|
|
464
479
|
MIN_PRIORITY = -1000
|
465
480
|
MAX_PRIORITY = 1000
|
466
481
|
DEFAULT_PRIORITY = 0
|
467
482
|
|
483
|
+
GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
|
468
484
|
COST_REPORT_DEFAULT_DAYS = 30
|
sky/skypilot_config.py
CHANGED
@@ -564,7 +564,10 @@ def _reload_config_as_server() -> None:
|
|
564
564
|
_set_loaded_config_path(None)
|
565
565
|
|
566
566
|
server_config_path = _resolve_server_config_path()
|
567
|
+
db_url_from_env = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
|
567
568
|
server_config = _get_config_from_path(server_config_path)
|
569
|
+
if db_url_from_env:
|
570
|
+
server_config.set_nested(('db',), db_url_from_env)
|
568
571
|
|
569
572
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
570
573
|
logger.debug(f'server config: \n'
|
@@ -0,0 +1 @@
|
|
1
|
+
"""SSH Node Pool management package."""
|
@@ -0,0 +1,133 @@
|
|
1
|
+
"""SSH Node Pool management core functionality."""
|
2
|
+
import os
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Any, Dict, List
|
5
|
+
|
6
|
+
import yaml
|
7
|
+
|
8
|
+
|
9
|
+
class SSHNodePoolManager:
|
10
|
+
"""Manager for SSH Node Pool configurations."""
|
11
|
+
|
12
|
+
def __init__(self):
|
13
|
+
self.config_path = Path.home() / '.sky' / 'ssh_node_pools.yaml'
|
14
|
+
self.keys_dir = Path.home() / '.sky' / 'ssh_keys'
|
15
|
+
self.keys_dir.mkdir(parents=True, exist_ok=True)
|
16
|
+
|
17
|
+
def get_all_pools(self) -> Dict[str, Any]:
|
18
|
+
"""Read all SSH Node Pool configurations from YAML file."""
|
19
|
+
if not self.config_path.exists():
|
20
|
+
return {}
|
21
|
+
|
22
|
+
try:
|
23
|
+
with open(self.config_path, 'r', encoding='utf-8') as f:
|
24
|
+
return yaml.safe_load(f) or {}
|
25
|
+
except Exception as e:
|
26
|
+
raise RuntimeError(
|
27
|
+
f'Failed to read SSH Node Pool config: {e}') from e
|
28
|
+
|
29
|
+
def save_all_pools(self, pools_config: Dict[str, Any]) -> None:
|
30
|
+
"""Write SSH Node Pool configurations to YAML file."""
|
31
|
+
try:
|
32
|
+
self.config_path.parent.mkdir(parents=True, exist_ok=True)
|
33
|
+
with open(self.config_path, 'w', encoding='utf-8') as f:
|
34
|
+
yaml.dump(pools_config, f, default_flow_style=False)
|
35
|
+
except Exception as e:
|
36
|
+
raise RuntimeError(
|
37
|
+
f'Failed to save SSH Node Pool config: {e}') from e
|
38
|
+
|
39
|
+
def update_pools(self, pools_config: Dict[str, Any]) -> None:
|
40
|
+
"""Update SSH Node Pool configurations."""
|
41
|
+
all_pools = self.get_all_pools()
|
42
|
+
all_pools.update(pools_config)
|
43
|
+
self.save_all_pools(all_pools)
|
44
|
+
|
45
|
+
def add_or_update_pool(self, pool_name: str,
|
46
|
+
pool_config: Dict[str, Any]) -> None:
|
47
|
+
"""Add or update a single SSH Node Pool configuration."""
|
48
|
+
# Validate pool configuration
|
49
|
+
self._validate_pool_config(pool_config)
|
50
|
+
|
51
|
+
all_pools = self.get_all_pools()
|
52
|
+
all_pools[pool_name] = pool_config
|
53
|
+
self.save_all_pools(all_pools)
|
54
|
+
|
55
|
+
def delete_pool(self, pool_name: str) -> bool:
|
56
|
+
"""Delete a SSH Node Pool configuration."""
|
57
|
+
all_pools = self.get_all_pools()
|
58
|
+
if pool_name in all_pools:
|
59
|
+
del all_pools[pool_name]
|
60
|
+
self.save_all_pools(all_pools)
|
61
|
+
return True
|
62
|
+
return False
|
63
|
+
|
64
|
+
def save_ssh_key(self, key_name: str, key_content: str) -> str:
|
65
|
+
"""Save SSH private key to ~/.sky/ssh_keys/ directory."""
|
66
|
+
# Validate key name
|
67
|
+
if not key_name or '/' in key_name or key_name.startswith('.'):
|
68
|
+
raise ValueError('Invalid key name')
|
69
|
+
|
70
|
+
key_path = self.keys_dir / key_name
|
71
|
+
try:
|
72
|
+
with open(key_path, 'w', encoding='utf-8') as f:
|
73
|
+
f.write(key_content)
|
74
|
+
os.chmod(key_path, 0o600) # Set secure permissions
|
75
|
+
return str(key_path)
|
76
|
+
except Exception as e:
|
77
|
+
raise RuntimeError(f'Failed to save SSH key: {e}') from e
|
78
|
+
|
79
|
+
def list_ssh_keys(self) -> List[str]:
|
80
|
+
"""List available SSH key files."""
|
81
|
+
if not self.keys_dir.exists():
|
82
|
+
return []
|
83
|
+
try:
|
84
|
+
return [f.name for f in self.keys_dir.iterdir() if f.is_file()]
|
85
|
+
except Exception: # pylint: disable=broad-except
|
86
|
+
return []
|
87
|
+
|
88
|
+
def _validate_pool_config(self, config: Dict[str, Any]) -> None:
|
89
|
+
"""Validate SSH Node Pool configuration."""
|
90
|
+
if 'hosts' not in config:
|
91
|
+
raise ValueError('Pool configuration must include `hosts`')
|
92
|
+
|
93
|
+
if not isinstance(config['hosts'], list) or not config['hosts']:
|
94
|
+
raise ValueError('`hosts` must be a non-empty list')
|
95
|
+
|
96
|
+
# Validate user field
|
97
|
+
if not config.get('user', '').strip():
|
98
|
+
raise ValueError('Pool configuration must include `user`')
|
99
|
+
|
100
|
+
# Validate authentication - must have either identity_file or password
|
101
|
+
if not config.get('identity_file') and not config.get('password'):
|
102
|
+
raise ValueError('Pool configuration must include '
|
103
|
+
'either `identity_file` or `password`')
|
104
|
+
|
105
|
+
|
106
|
+
def get_all_pools() -> Dict[str, Any]:
|
107
|
+
"""Get all SSH Node Pool configurations."""
|
108
|
+
manager = SSHNodePoolManager()
|
109
|
+
return manager.get_all_pools()
|
110
|
+
|
111
|
+
|
112
|
+
def update_pools(pools_config: Dict[str, Any]) -> None:
|
113
|
+
"""Update SSH Node Pool configurations."""
|
114
|
+
manager = SSHNodePoolManager()
|
115
|
+
manager.update_pools(pools_config)
|
116
|
+
|
117
|
+
|
118
|
+
def delete_pool(pool_name: str) -> bool:
|
119
|
+
"""Delete a SSH Node Pool configuration."""
|
120
|
+
manager = SSHNodePoolManager()
|
121
|
+
return manager.delete_pool(pool_name)
|
122
|
+
|
123
|
+
|
124
|
+
def upload_ssh_key(key_name: str, key_content: str) -> str:
|
125
|
+
"""Upload SSH private key."""
|
126
|
+
manager = SSHNodePoolManager()
|
127
|
+
return manager.save_ssh_key(key_name, key_content)
|
128
|
+
|
129
|
+
|
130
|
+
def list_ssh_keys() -> List[str]:
|
131
|
+
"""List available SSH keys."""
|
132
|
+
manager = SSHNodePoolManager()
|
133
|
+
return manager.list_ssh_keys()
|
@@ -0,0 +1,232 @@
|
|
1
|
+
"""SSH Node Pool management API endpoints."""
|
2
|
+
import re
|
3
|
+
from typing import Any, Dict, List
|
4
|
+
|
5
|
+
import fastapi
|
6
|
+
|
7
|
+
from sky import core as sky_core
|
8
|
+
from sky.server.requests import executor
|
9
|
+
from sky.server.requests import payloads
|
10
|
+
from sky.server.requests import requests as requests_lib
|
11
|
+
from sky.ssh_node_pools import core as ssh_node_pools_core
|
12
|
+
from sky.utils import common_utils
|
13
|
+
|
14
|
+
router = fastapi.APIRouter()
|
15
|
+
|
16
|
+
|
17
|
+
@router.get('')
|
18
|
+
async def get_ssh_node_pools() -> Dict[str, Any]:
|
19
|
+
"""Get all SSH Node Pool configurations."""
|
20
|
+
try:
|
21
|
+
return ssh_node_pools_core.get_all_pools()
|
22
|
+
except Exception as e:
|
23
|
+
raise fastapi.HTTPException(
|
24
|
+
status_code=500,
|
25
|
+
detail=
|
26
|
+
f'Failed to get SSH Node Pools: {common_utils.format_exception(e)}')
|
27
|
+
|
28
|
+
|
29
|
+
@router.post('')
|
30
|
+
async def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
|
31
|
+
"""Update SSH Node Pool configurations."""
|
32
|
+
try:
|
33
|
+
ssh_node_pools_core.update_pools(pools_config)
|
34
|
+
return {'status': 'success'}
|
35
|
+
except Exception as e:
|
36
|
+
raise fastapi.HTTPException(status_code=400,
|
37
|
+
detail=f'Failed to update SSH Node Pools:'
|
38
|
+
f' {common_utils.format_exception(e)}')
|
39
|
+
|
40
|
+
|
41
|
+
@router.delete('/{pool_name}')
|
42
|
+
async def delete_ssh_node_pool(pool_name: str) -> Dict[str, str]:
|
43
|
+
"""Delete a SSH Node Pool configuration."""
|
44
|
+
try:
|
45
|
+
if ssh_node_pools_core.delete_pool(pool_name):
|
46
|
+
return {'status': 'success'}
|
47
|
+
else:
|
48
|
+
raise fastapi.HTTPException(
|
49
|
+
status_code=404,
|
50
|
+
detail=f'SSH Node Pool `{pool_name}` not found')
|
51
|
+
except fastapi.HTTPException:
|
52
|
+
raise
|
53
|
+
except Exception as e:
|
54
|
+
raise fastapi.HTTPException(status_code=500,
|
55
|
+
detail='Failed to delete SSH Node Pool: '
|
56
|
+
f'{common_utils.format_exception(e)}')
|
57
|
+
|
58
|
+
|
59
|
+
@router.post('/keys')
|
60
|
+
async def upload_ssh_key(request: fastapi.Request) -> Dict[str, str]:
|
61
|
+
"""Upload SSH private key."""
|
62
|
+
try:
|
63
|
+
form = await request.form()
|
64
|
+
key_name = form.get('key_name')
|
65
|
+
key_file = form.get('key_file')
|
66
|
+
|
67
|
+
if not key_name or not key_file:
|
68
|
+
raise fastapi.HTTPException(status_code=400,
|
69
|
+
detail='Missing key_name or key_file')
|
70
|
+
|
71
|
+
key_content = await key_file.read()
|
72
|
+
key_path = ssh_node_pools_core.upload_ssh_key(key_name,
|
73
|
+
key_content.decode())
|
74
|
+
|
75
|
+
return {'status': 'success', 'key_path': key_path}
|
76
|
+
except fastapi.HTTPException:
|
77
|
+
raise
|
78
|
+
except Exception as e:
|
79
|
+
raise fastapi.HTTPException(
|
80
|
+
status_code=500,
|
81
|
+
detail=
|
82
|
+
f'Failed to upload SSH key: {common_utils.format_exception(e)}')
|
83
|
+
|
84
|
+
|
85
|
+
@router.get('/keys')
|
86
|
+
async def list_ssh_keys() -> List[str]:
|
87
|
+
"""List available SSH keys."""
|
88
|
+
try:
|
89
|
+
return ssh_node_pools_core.list_ssh_keys()
|
90
|
+
except Exception as e:
|
91
|
+
exception_msg = common_utils.format_exception(e)
|
92
|
+
raise fastapi.HTTPException(
|
93
|
+
status_code=500, detail=f'Failed to list SSH keys: {exception_msg}')
|
94
|
+
|
95
|
+
|
96
|
+
@router.post('/{pool_name}/deploy')
|
97
|
+
async def deploy_ssh_node_pool(request: fastapi.Request,
|
98
|
+
pool_name: str) -> Dict[str, str]:
|
99
|
+
"""Deploy SSH Node Pool using existing ssh_up functionality."""
|
100
|
+
try:
|
101
|
+
ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
|
102
|
+
executor.schedule_request(
|
103
|
+
request_id=request.state.request_id,
|
104
|
+
request_name='ssh_up',
|
105
|
+
request_body=ssh_up_body,
|
106
|
+
func=sky_core.ssh_up,
|
107
|
+
schedule_type=requests_lib.ScheduleType.LONG,
|
108
|
+
)
|
109
|
+
|
110
|
+
return {
|
111
|
+
'status': 'success',
|
112
|
+
'request_id': request.state.request_id,
|
113
|
+
'message': f'SSH Node Pool `{pool_name}` deployment started'
|
114
|
+
}
|
115
|
+
except Exception as e:
|
116
|
+
raise fastapi.HTTPException(status_code=500,
|
117
|
+
detail=f'Failed to deploy SSH Node Pool: '
|
118
|
+
f'{common_utils.format_exception(e)}')
|
119
|
+
|
120
|
+
|
121
|
+
@router.post('/deploy')
|
122
|
+
async def deploy_ssh_node_pool_general(
|
123
|
+
request: fastapi.Request,
|
124
|
+
ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
|
125
|
+
"""Deploys all SSH Node Pools."""
|
126
|
+
try:
|
127
|
+
executor.schedule_request(
|
128
|
+
request_id=request.state.request_id,
|
129
|
+
request_name='ssh_up',
|
130
|
+
request_body=ssh_up_body,
|
131
|
+
func=sky_core.ssh_up,
|
132
|
+
schedule_type=requests_lib.ScheduleType.LONG,
|
133
|
+
)
|
134
|
+
|
135
|
+
pool_name = ssh_up_body.infra or 'default'
|
136
|
+
return {
|
137
|
+
'status': 'success',
|
138
|
+
'request_id': request.state.request_id,
|
139
|
+
'message': f'SSH Node Pool `{pool_name}` deployment started'
|
140
|
+
}
|
141
|
+
except Exception as e:
|
142
|
+
raise fastapi.HTTPException(status_code=500,
|
143
|
+
detail=f'Failed to deploy SSH Node Pool: '
|
144
|
+
f'{common_utils.format_exception(e)}')
|
145
|
+
|
146
|
+
|
147
|
+
@router.post('/{pool_name}/down')
|
148
|
+
async def down_ssh_node_pool(request: fastapi.Request,
|
149
|
+
pool_name: str) -> Dict[str, str]:
|
150
|
+
"""Cleans up a SSH Node Pools."""
|
151
|
+
try:
|
152
|
+
ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
|
153
|
+
executor.schedule_request(
|
154
|
+
request_id=request.state.request_id,
|
155
|
+
request_name='ssh_down',
|
156
|
+
request_body=ssh_up_body,
|
157
|
+
func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
|
158
|
+
schedule_type=requests_lib.ScheduleType.LONG,
|
159
|
+
)
|
160
|
+
|
161
|
+
return {
|
162
|
+
'status': 'success',
|
163
|
+
'request_id': request.state.request_id,
|
164
|
+
'message': f'SSH Node Pool `{pool_name}` teardown started'
|
165
|
+
}
|
166
|
+
except Exception as e:
|
167
|
+
raise fastapi.HTTPException(
|
168
|
+
status_code=500,
|
169
|
+
detail=f'Failed to tear down SSH Node Pool: '
|
170
|
+
f'{common_utils.format_exception(e)}')
|
171
|
+
|
172
|
+
|
173
|
+
@router.post('/down')
|
174
|
+
async def down_ssh_node_pool_general(
|
175
|
+
request: fastapi.Request,
|
176
|
+
ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
|
177
|
+
"""Cleans up all SSH Node Pools."""
|
178
|
+
try:
|
179
|
+
# Set cleanup=True for down operation
|
180
|
+
ssh_up_body.cleanup = True
|
181
|
+
executor.schedule_request(
|
182
|
+
request_id=request.state.request_id,
|
183
|
+
request_name='ssh_down',
|
184
|
+
request_body=ssh_up_body,
|
185
|
+
func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
|
186
|
+
schedule_type=requests_lib.ScheduleType.LONG,
|
187
|
+
)
|
188
|
+
|
189
|
+
pool_name = ssh_up_body.infra or 'default'
|
190
|
+
return {
|
191
|
+
'status': 'success',
|
192
|
+
'request_id': request.state.request_id,
|
193
|
+
'message': f'SSH Node Pool `{pool_name}` teardown started'
|
194
|
+
}
|
195
|
+
except Exception as e:
|
196
|
+
raise fastapi.HTTPException(
|
197
|
+
status_code=500,
|
198
|
+
detail=f'Failed to tear down SSH Node Pool: '
|
199
|
+
f'{common_utils.format_exception(e)}')
|
200
|
+
|
201
|
+
|
202
|
+
@router.get('/{pool_name}/status')
|
203
|
+
async def get_ssh_node_pool_status(pool_name: str) -> Dict[str, str]:
|
204
|
+
"""Get the status of a specific SSH Node Pool."""
|
205
|
+
try:
|
206
|
+
# Call ssh_status to check the context
|
207
|
+
context_name = f'ssh-{pool_name}'
|
208
|
+
is_ready, reason = sky_core.ssh_status(context_name)
|
209
|
+
|
210
|
+
# Strip ANSI escape codes from the reason text
|
211
|
+
def strip_ansi_codes(text):
|
212
|
+
if not text:
|
213
|
+
return text
|
214
|
+
# Remove ANSI escape sequences (color codes, formatting, etc.)
|
215
|
+
text = re.sub(r'\x1b\[[0-9;]*m', '', text)
|
216
|
+
# Remove 'disabled. Reason: ' prefix if present
|
217
|
+
text = text.replace('disabled. Reason: ', '')
|
218
|
+
return text
|
219
|
+
|
220
|
+
cleaned_reason = strip_ansi_codes(reason) if reason else reason
|
221
|
+
|
222
|
+
return {
|
223
|
+
'pool_name': pool_name,
|
224
|
+
'context_name': context_name,
|
225
|
+
'status': 'Ready' if is_ready else 'Not Ready',
|
226
|
+
'reason': cleaned_reason
|
227
|
+
}
|
228
|
+
except Exception as e:
|
229
|
+
raise fastapi.HTTPException(
|
230
|
+
status_code=500,
|
231
|
+
detail=f'Failed to get SSH Node Pool status: '
|
232
|
+
f'{common_utils.format_exception(e)}')
|