skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +26 -11
- sky/backends/cloud_vm_ray_backend.py +16 -5
- sky/client/cli/command.py +222 -4
- sky/client/sdk.py +110 -82
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +26 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +1 -0
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/{37-4650f214e2119168.js → 37-1f1e94f5a561202a.js} +2 -2
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/{856-bfddc18e16f3873c.js → 856-cdf66268ec878d0c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-ecc5a7003776cfa7.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +15 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +129 -0
- sky/jobs/client/sdk.py +13 -11
- sky/jobs/server/core.py +4 -0
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +16 -0
- sky/server/requests/requests.py +35 -1
- sky/server/rest.py +152 -0
- sky/server/server.py +66 -16
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +14 -3
- sky/task.py +141 -18
- sky/templates/kubernetes-ray.yml.j2 +30 -1
- sky/users/permission.py +2 -0
- sky/utils/context.py +3 -1
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +146 -3
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +123 -108
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-bde186946d353355.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-56412c7976b4655b.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/server/metrics.py
ADDED
@@ -0,0 +1,105 @@
+"""Instrumentation for the API server."""
+
+import asyncio
+import os
+import time
+
+import fastapi
+from prometheus_client import generate_latest
+from prometheus_client import multiprocess
+import prometheus_client as prom
+import starlette.middleware.base
+import uvicorn
+
+from sky import sky_logging
+
+logger = sky_logging.init_logger(__name__)
+
+# Total number of API server requests, grouped by path, method, and status.
+sky_apiserver_requests_total = prom.Counter(
+    'sky_apiserver_requests_total',
+    'Total number of API server requests',
+    ['path', 'method', 'status'],
+)
+
+# Time spent processing API server requests, grouped by path, method, and
+# status.
+sky_apiserver_request_duration_seconds = prom.Histogram(
+    'sky_apiserver_request_duration_seconds',
+    'Time spent processing API server requests',
+    ['path', 'method', 'status'],
+    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
+             float('inf')),
+)
+
+metrics_app = fastapi.FastAPI()
+
+
+@metrics_app.get('/metrics')
+async def metrics() -> fastapi.Response:
+    """Expose aggregated Prometheus metrics from all worker processes."""
+    if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
+        # In multiprocess mode, we need to collect metrics from all processes.
+        registry = prom.CollectorRegistry()
+        multiprocess.MultiProcessCollector(registry)
+        data = generate_latest(registry)
+    else:
+        data = generate_latest()
+    return fastapi.Response(content=data,
+                            media_type=prom.CONTENT_TYPE_LATEST,
+                            headers={'Cache-Control': 'no-cache'})
+
+
+def run_metrics_server(host: str, port: int):
+    metrics_config = uvicorn.Config(
+        'sky.server.metrics:metrics_app',
+        host=host,
+        port=port,
+        workers=1,
+    )
+    metrics_server_instance = uvicorn.Server(metrics_config)
+    asyncio.run(metrics_server_instance.serve())
+
+
+def _get_status_code_group(status_code: int) -> str:
+    """Group status codes into classes (2xx, 5xx) to reduce cardinality."""
+    return f'{status_code // 100}xx'
+
+
+def _is_streaming_api(path: str) -> bool:
+    """Check if the path is a streaming API."""
+    path = path.rstrip('/')
+    return path.endswith('/logs') or path.endswith('/api/stream')
+
+
+class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to collect Prometheus metrics for HTTP requests."""
+
+    async def dispatch(self, request: fastapi.Request, call_next):
+        path = request.url.path
+        logger.info(f'PROM Middleware Request: {request}, {request.url.path}')
+        streaming = _is_streaming_api(path)
+        if not streaming:
+            # Exclude streaming APIs, the duration is not meaningful.
+            # TODO(aylei): measure the duration of async execution instead.
+            start_time = time.time()
+        method = request.method
+        status_code_group = ''
+
+        try:
+            response = await call_next(request)
+            status_code_group = _get_status_code_group(response.status_code)
+        except Exception:  # pylint: disable=broad-except
+            status_code_group = '5xx'
+            raise
+        finally:
+            sky_apiserver_requests_total.labels(path=path,
+                                                method=method,
+                                                status=status_code_group).inc()
+            if not streaming:
+                duration = time.time() - start_time
+                sky_apiserver_request_duration_seconds.labels(
+                    path=path, method=method,
+                    status=status_code_group).observe(duration)
+
+        return response
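For reference, the new endpoint can be scraped like any Prometheus target. A minimal sketch, assuming the API server above was started with metrics enabled and the `--metrics-port` default of 9090 (the exact environment variable value behind `constants.ENV_VAR_SERVER_METRICS_ENABLED` lives in sky/server/constants.py, which is not expanded in this diff):

# Sketch: scrape the aggregated metrics served by metrics_app above.
import requests

resp = requests.get('http://127.0.0.1:9090/metrics', timeout=5)
resp.raise_for_status()
# Print only the request-count series defined in this module.
for line in resp.text.splitlines():
    if line.startswith('sky_apiserver_requests_total'):
        print(line)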
sky/server/requests/executor.py
CHANGED
@@ -149,10 +149,25 @@ class RequestWorker:
         self.schedule_type = schedule_type
         self.garanteed_parallelism = config.garanteed_parallelism
         self.burstable_parallelism = config.burstable_parallelism
+        self._thread: Optional[threading.Thread] = None
+        self._cancel_event = threading.Event()
 
     def __str__(self) -> str:
         return f'Worker(schedule_type={self.schedule_type.value})'
 
+    def run_in_background(self) -> None:
+        # Thread dispatcher is sufficient for current scale, refer to
+        # tests/load_tests/test_queue_dispatcher.py for more details.
+        # Use daemon thread for automatic cleanup.
+        thread = threading.Thread(target=self.run, daemon=True)
+        thread.start()
+        self._thread = thread
+
+    def cancel(self) -> None:
+        if self._thread is not None:
+            self._cancel_event.set()
+            self._thread.join()
+
     def process_request(self, executor: process.BurstableExecutor,
                         queue: RequestQueue) -> None:
         try:
@@ -219,7 +234,7 @@ class RequestWorker:
                 burst_workers=self.burstable_parallelism,
                 initializer=executor_initializer,
                 initargs=(proc_group,))
-            while True:
+            while not self._cancel_event.is_set():
                 self.process_request(executor, queue)
         # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
         except KeyboardInterrupt:
@@ -539,15 +554,21 @@ def schedule_request(request_id: str,
     enqueue()
 
 
-def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
+def start(
+    config: server_config.ServerConfig
+) -> Tuple[Optional[multiprocessing.Process], List[RequestWorker]]:
     """Start the request workers.
 
     Request workers run in background, schedule the requests and delegate the
     request execution to executor processes.
+
+    Returns:
+        A tuple of the queue server process and the list of request worker
+        threads.
     """
     global queue_backend
     queue_backend = config.queue_backend
-    sub_procs = []
+    queue_server = None
     # Setup the queues.
     if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
         logger.info('Creating shared request queues')
@@ -564,7 +585,6 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
         queue_server = multiprocessing.Process(
             target=mp_queue.start_queue_manager, args=(queue_names, port))
         queue_server.start()
-        sub_procs.append(queue_server)
         mp_queue.wait_for_queues_to_be_ready(queue_names,
                                              queue_server,
                                              port=port)
@@ -577,20 +597,16 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
 
     logger.info('Request queues created')
 
-    def run_worker_in_background(worker: RequestWorker):
-        # Thread dispatcher is sufficient for current scale, refer to
-        # tests/load_tests/test_queue_dispatcher.py for more details.
-        # Use daemon thread for automatic cleanup.
-        thread = threading.Thread(target=worker.run, daemon=True)
-        thread.start()
-
+    workers = []
     # Start a worker for long requests.
     long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
                                 config=config.long_worker_config)
-    run_worker_in_background(long_worker)
+    long_worker.run_in_background()
+    workers.append(long_worker)
 
     # Start a worker for short requests.
     short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
                                  config=config.short_worker_config)
-    run_worker_in_background(short_worker)
-    return sub_procs
+    short_worker.run_in_background()
+    workers.append(short_worker)
+    return queue_server, workers
sky/server/requests/payloads.py
CHANGED
@@ -368,6 +368,22 @@ class StorageBody(RequestBody):
     name: str
 
 
+class VolumeApplyBody(RequestBody):
+    """The request body for the volume apply endpoint."""
+    name: str
+    volume_type: str
+    cloud: str
+    region: Optional[str] = None
+    zone: Optional[str] = None
+    size: Optional[str] = None
+    config: Optional[Dict[str, Any]] = None
+
+
+class VolumeDeleteBody(RequestBody):
+    """The request body for the volume delete endpoint."""
+    names: List[str]
+
+
 class EndpointsBody(RequestBody):
     """The request body for the endpoint."""
     cluster: str
sky/server/requests/requests.py
CHANGED
@@ -38,6 +38,7 @@ REQUEST_TABLE = 'requests'
 COL_CLUSTER_NAME = 'cluster_name'
 COL_USER_ID = 'user_id'
 COL_STATUS_MSG = 'status_msg'
+COL_SHOULD_RETRY = 'should_retry'
 REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
 
 # TODO(zhwu): For scalability, there are several TODOs:
@@ -86,6 +87,7 @@ REQUEST_COLUMNS = [
     'schedule_type',
     COL_USER_ID,
     COL_STATUS_MSG,
+    COL_SHOULD_RETRY,
 ]
 
 
@@ -115,6 +117,7 @@ class RequestPayload:
     # Resources the request operates on.
     cluster_name: Optional[str] = None
     status_msg: Optional[str] = None
+    should_retry: bool = False
 
 
 @dataclasses.dataclass
@@ -137,6 +140,8 @@ class Request:
     cluster_name: Optional[str] = None
     # Status message of the request, indicates the reason of current status.
     status_msg: Optional[str] = None
+    # Whether the request should be retried.
+    should_retry: bool = False
 
     @property
     def log_path(self) -> pathlib.Path:
@@ -222,6 +227,7 @@ class Request:
             user_name=user_name,
             cluster_name=self.cluster_name,
             status_msg=self.status_msg,
+            should_retry=self.should_retry,
         )
 
     def encode(self) -> RequestPayload:
@@ -243,6 +249,7 @@ class Request:
                 user_id=self.user_id,
                 cluster_name=self.cluster_name,
                 status_msg=self.status_msg,
+                should_retry=self.should_retry,
             )
         except (TypeError, ValueError) as e:
             # The error is unexpected, so we don't suppress the stack trace.
@@ -274,6 +281,7 @@ class Request:
                 user_id=payload.user_id,
                 cluster_name=payload.cluster_name,
                 status_msg=payload.status_msg,
+                should_retry=payload.should_retry,
             )
         except (TypeError, ValueError) as e:
             logger.error(
@@ -327,6 +335,24 @@ def refresh_cluster_status_event():
         time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
 
 
+def refresh_volume_status_event():
+    """Periodically refresh the volume status."""
+    # pylint: disable=import-outside-toplevel
+    from sky.volumes.server import core
+
+    # Disable logging for periodic refresh to avoid the usage message being
+    # sent multiple times.
+    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
+
+    while True:
+        logger.info('=== Refreshing volume status ===')
+        core.volume_refresh()
+        logger.info('Volume status refreshed. Sleeping '
+                    f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
+                    ' seconds for the next refresh...\n')
+        time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
+
+
 def managed_job_status_refresh_event():
     """Refresh the managed job status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
@@ -362,6 +388,10 @@ INTERNAL_REQUEST_DAEMONS = [
     InternalRequestDaemon(id='skypilot-status-refresh-daemon',
                           name='status',
                           event_fn=refresh_cluster_status_event),
+    # Volume status refresh daemon to update the volume status periodically.
+    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
+                          name='volume',
+                          event_fn=refresh_volume_status_event),
     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
                           name='managed-job-status',
                           event_fn=managed_job_status_refresh_event),
@@ -446,10 +476,14 @@ def create_table(cursor, conn):
         {COL_CLUSTER_NAME} TEXT,
         schedule_type TEXT,
         {COL_USER_ID} TEXT,
-        {COL_STATUS_MSG} TEXT)""")
+        {COL_STATUS_MSG} TEXT,
+        {COL_SHOULD_RETRY} INTEGER
+        )""")
 
     db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
                                  'TEXT')
+    db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
+                                 'INTEGER')
 
 
 _DB = None
sky/server/rest.py
ADDED
@@ -0,0 +1,152 @@
+"""REST API client of SkyPilot API server"""
+
+import contextlib
+import contextvars
+import functools
+import time
+import typing
+from typing import Any, Callable, cast, Optional, TypeVar
+
+import colorama
+
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import common as adaptors_common
+from sky.utils import common_utils
+from sky.utils import rich_utils
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+if typing.TYPE_CHECKING:
+    import requests
+
+else:
+    requests = adaptors_common.LazyImport('requests')
+
+F = TypeVar('F', bound=Callable[..., Any])
+
+_RETRY_CONTEXT = contextvars.ContextVar('retry_context', default=None)
+
+
+class RetryContext:
+
+    def __init__(self):
+        self.line_processed = 0
+
+
+def retry_on_server_unavailable(max_wait_seconds: int = 600,
+                                initial_backoff: float = 5.0,
+                                max_backoff_factor: int = 5):
+    """Decorator that retries a function when ServerTemporarilyUnavailableError
+    is caught.
+
+    Args:
+        max_wait_seconds: Maximum number of seconds to wait for the server to
+            be healthy
+        initial_backoff: Initial backoff time in seconds
+        max_backoff_factor: Maximum backoff factor for exponential backoff
+
+    Notes(dev):
+        This decorator is mainly used in two scenarios:
+        1. Decorate a Restful API call to make the API call wait for server
+           recovery when server is temporarily unavailable. APIs like /api/get
+           and /api/stream should not be retried since sending them to a new
+           replica of API server will not work.
+        2. Decorate a SDK function to make the entire SDK function call get
+           retried when /api/get or /logs raises a retryable error. This
+           is typically triggered by a graceful upgrade of the API server,
+           where the pending requests and logs requests will be interrupted.
+    """
+
+    def decorator(func: F) -> F:
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs) -> Any:
+            msg = (
+                f'{colorama.Fore.YELLOW}API server is temporarily: upgrade in '
+                f'progress. Waiting to resume...{colorama.Style.RESET_ALL}')
+            backoff = common_utils.Backoff(
+                initial_backoff=initial_backoff,
+                max_backoff_factor=max_backoff_factor)
+            start_time = time.time()
+            attempt = 0
+
+            with _retry_in_context():
+                while True:
+                    attempt += 1
+                    try:
+                        return func(*args, **kwargs)
+                    except exceptions.ServerTemporarilyUnavailableError as e:
+                        # This will cause the status spinner being stopped and
+                        # restarted in every retry loop. But it is necessary to
+                        # stop the status spinner before retrying func() to
+                        # avoid the status spinner get stuck if the func() runs
+                        # for a long time without update status, e.g. sky logs.
+                        with rich_utils.client_status(msg):
+                            if time.time() - start_time > max_wait_seconds:
+                                # pylint: disable=line-too-long
+                                raise exceptions.ServerTemporarilyUnavailableError(
+                                    'Timeout waiting for the API server to be '
+                                    f'available after {max_wait_seconds}s.') \
+                                    from e
+
+                            sleep_time = backoff.current_backoff()
+                            time.sleep(sleep_time)
+                            logger.debug('The API server is unavailable. '
+                                         f'Retrying {func.__name__} '
+                                         f'(attempt {attempt}, '
+                                         f'backoff {sleep_time}s).')
+
+        return cast(F, wrapper)
+
+    return decorator
+
+
+@contextlib.contextmanager
+def _retry_in_context():
+    token = _RETRY_CONTEXT.set(RetryContext())
+    try:
+        yield
+    finally:
+        _RETRY_CONTEXT.reset(token)
+
+
+def get_retry_context() -> Optional[RetryContext]:
+    return _RETRY_CONTEXT.get()
+
+
+def handle_server_unavailable(response: 'requests.Response') -> None:
+    if response.status_code == 503:
+        # TODO(aylei): Hacky, depends on how nginx controller handles backends
+        # with no ready endpoints. Should use self-defined status code or header
+        # to distinguish retryable server error from general 503 errors.
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ServerTemporarilyUnavailableError(
+                'SkyPilot API server is temporarily unavailable. '
+                'Please try again later.')
+
+
+@retry_on_server_unavailable()
+def post(url, data=None, json=None, **kwargs) -> 'requests.Response':
+    """Send a POST request to the API server, retry on server temporarily
+    unavailable."""
+    response = requests.post(url, data=data, json=json, **kwargs)
+    handle_server_unavailable(response)
+    return response
+
+
+@retry_on_server_unavailable()
+def get(url, params=None, **kwargs) -> 'requests.Response':
+    """Send a GET request to the API server, retry on server temporarily
+    unavailable."""
+    response = requests.get(url, params=params, **kwargs)
+    handle_server_unavailable(response)
+    return response
+
+
+def get_without_retry(url, params=None, **kwargs) -> 'requests.Response':
+    """Send a GET request to the API server without retry."""
+    response = requests.get(url, params=params, **kwargs)
+    handle_server_unavailable(response)
+    return response
sky/server/server.py
CHANGED
@@ -16,6 +16,7 @@ import posixpath
 import re
 import shutil
 import sys
+import threading
 from typing import Any, Dict, List, Literal, Optional, Set, Tuple
 import uuid
 import zipfile
@@ -43,6 +44,8 @@ from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
+from sky.server import metrics
+from sky.server import state
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
@@ -61,6 +64,7 @@ from sky.utils import dag_utils
 from sky.utils import env_options
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
+from sky.volumes.server import server as volumes_rest
 from sky.workspaces import server as workspaces_rest
 
 # pylint: disable=ungrouped-imports
@@ -378,9 +382,32 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         return await call_next(request)
 
 
+class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to control requests when server is shutting down."""
+
+    async def dispatch(self, request: fastapi.Request, call_next):
+        if state.get_block_requests():
+            # Allow /api/ paths to continue, which are critical to operate
+            # on-going requests but will not submit new requests.
+            if not request.url.path.startswith('/api/'):
+                # Client will retry on 503 error.
+                return fastapi.responses.JSONResponse(
+                    status_code=503,
+                    content={
+                        'detail': 'Server is shutting down, '
+                                  'please try again later.'
+                    })
+
+        return await call_next(request)
+
+
 app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
+# Use environment variable to make the metrics middleware optional.
+if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
+    app.add_middleware(metrics.PrometheusMiddleware)
 app.add_middleware(RBACMiddleware)
 app.add_middleware(InternalDashboardPrefixMiddleware)
+app.add_middleware(GracefulShutdownMiddleware)
 app.add_middleware(PathCleanMiddleware)
 app.add_middleware(CacheControlStaticMiddleware)
 app.add_middleware(
@@ -404,6 +431,7 @@ app.include_router(users_rest.router, prefix='/users', tags=['users'])
 app.include_router(workspaces_rest.router,
                    prefix='/workspaces',
                    tags=['workspaces'])
+app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
 
 
 @app.get('/token')
@@ -564,6 +592,8 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
         ctx.override_envs(validate_body.env_vars)
 
     def validate_dag(dag: dag_utils.dag_lib.Dag):
+        # Resolve the volumes before admin policy and validation.
+        dag.resolve_and_validate_volumes()
         # TODO: Admin policy may contain arbitrary code, which may be expensive
         # to run and may block the server thread. However, moving it into the
         # executor adds a ~150ms penalty on the local API server because of
@@ -826,6 +856,10 @@ async def status(
     status_body: payloads.StatusBody = payloads.StatusBody()
 ) -> None:
     """Gets cluster statuses."""
+    if state.get_block_requests():
+        raise fastapi.HTTPException(
+            status_code=503,
+            detail='Server is shutting down, please try again later.')
     executor.schedule_request(
         request_id=request.state.request_id,
         request_name='status',
@@ -1145,6 +1179,10 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
         raise fastapi.HTTPException(
            status_code=404, detail=f'Request {request_id!r} not found')
     if request_task.status > requests_lib.RequestStatus.RUNNING:
+        if request_task.should_retry:
+            raise fastapi.HTTPException(
+                status_code=503,
+                detail=f'Request {request_id!r} should be retried')
         request_error = request_task.get_error()
         if request_error is not None:
             raise fastapi.HTTPException(status_code=500,
@@ -1435,6 +1473,11 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
     return global_user_state.get_storage_names_start_with(incomplete)
 
 
+@app.get('/api/completion/volume_name')
+async def complete_volume_name(incomplete: str,) -> List[str]:
+    return global_user_state.get_volume_names_start_with(incomplete)
+
+
 @app.get('/dashboard/{full_path:path}')
 async def serve_dashboard(full_path: str):
     """Serves the Next.js dashboard application.
@@ -1461,6 +1504,7 @@ async def serve_dashboard(full_path: str):
     try:
         with open(index_path, 'r', encoding='utf-8') as f:
             content = f.read()
+
         return fastapi.responses.HTMLResponse(content=content)
     except Exception as e:
         logger.error(f'Error serving dashboard: {e}')
@@ -1484,7 +1528,13 @@ if __name__ == '__main__':
     parser.add_argument('--host', default='127.0.0.1')
     parser.add_argument('--port', default=46580, type=int)
     parser.add_argument('--deploy', action='store_true')
+    # Serve metrics on a separate port to isolate it from the application APIs:
+    # metrics port will not be exposed to the public network typically.
+    parser.add_argument('--metrics-port', default=9090, type=int)
     cmd_args = parser.parse_args()
+    if cmd_args.port == cmd_args.metrics_port:
+        raise ValueError('port and metrics-port cannot be the same')
+
     # Show the privacy policy if it is not already shown. We place it here so
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
@@ -1492,9 +1542,17 @@
     config = server_config.compute_server_config(cmd_args.deploy)
     num_workers = config.num_server_workers
 
-    sub_procs = []
+    queue_server: Optional[multiprocessing.Process] = None
+    workers: List[executor.RequestWorker] = []
     try:
-        sub_procs = executor.start(config)
+        if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
+            metrics_thread = threading.Thread(target=metrics.run_metrics_server,
                                              args=(cmd_args.host,
                                                    cmd_args.metrics_port),
                                              daemon=True)
+            metrics_thread.start()
+        queue_server, workers = executor.start(config)
+
         logger.info(f'Starting SkyPilot API server, workers={num_workers}')
         # We don't support reload for now, since it may cause leakage of request
         # workers or interrupt running requests.
@@ -1510,17 +1568,9 @@
     finally:
         logger.info('Shutting down SkyPilot API server...')
 
-        def cleanup(proc: multiprocessing.Process) -> None:
-            proc.terminate()
-            proc.join()
-            proc.close()
-
-        # Terminate processes in reverse order in case dependency, especially
-        # queue server. Terminate queue server first does not affect the
-        # correctness of cleanup but introduce redundant error messages.
-        subprocess_utils.run_in_parallel(cleanup,
-                                         list(reversed(sub_procs)),
-                                         num_threads=len(sub_procs))
+        subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
+                                         workers,
+                                         num_threads=len(workers))
+        if queue_server is not None:
+            queue_server.kill()
+            queue_server.join()
sky/server/state.py
ADDED
@@ -0,0 +1,20 @@
+"""State for API server process."""
+
+# This state is used to block requests except /api operations, which is useful
+# when a server is shutting down: new requests will be blocked, but existing
+# requests will be allowed to finish and be operated via /api operations, e.g.
+# /api/logs, /api/cancel, etc.
+_block_requests = False
+
+
+# TODO(aylei): refactor, state should be a instance property of API server app
+# instead of a global variable.
+def get_block_requests() -> bool:
+    """Whether block requests except /api operations."""
+    return _block_requests
+
+
+def set_block_requests(shutting_down: bool) -> None:
+    """Set the API server to block requests except /api operations."""
+    global _block_requests
+    _block_requests = shutting_down