skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +38 -11
- sky/backends/cloud_vm_ray_backend.py +52 -18
- sky/client/cli/command.py +264 -25
- sky/client/sdk.py +119 -85
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +27 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +89 -15
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +26 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +263 -20
- sky/jobs/client/sdk.py +13 -12
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +121 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +22 -3
- sky/server/requests/requests.py +59 -2
- sky/server/rest.py +152 -0
- sky/server/server.py +70 -19
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -14
- sky/task.py +141 -43
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +31 -2
- sky/users/permission.py +2 -0
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/context.py +3 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +180 -38
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/server/server.py
CHANGED
@@ -16,6 +16,7 @@ import posixpath
|
|
16
16
|
import re
|
17
17
|
import shutil
|
18
18
|
import sys
|
19
|
+
import threading
|
19
20
|
from typing import Any, Dict, List, Literal, Optional, Set, Tuple
|
20
21
|
import uuid
|
21
22
|
import zipfile
|
@@ -43,6 +44,8 @@ from sky.serve.server import server as serve_rest
|
|
43
44
|
from sky.server import common
|
44
45
|
from sky.server import config as server_config
|
45
46
|
from sky.server import constants as server_constants
|
47
|
+
from sky.server import metrics
|
48
|
+
from sky.server import state
|
46
49
|
from sky.server import stream_utils
|
47
50
|
from sky.server.requests import executor
|
48
51
|
from sky.server.requests import payloads
|
@@ -61,6 +64,7 @@ from sky.utils import dag_utils
|
|
61
64
|
from sky.utils import env_options
|
62
65
|
from sky.utils import status_lib
|
63
66
|
from sky.utils import subprocess_utils
|
67
|
+
from sky.volumes.server import server as volumes_rest
|
64
68
|
from sky.workspaces import server as workspaces_rest
|
65
69
|
|
66
70
|
# pylint: disable=ungrouped-imports
|
@@ -378,9 +382,32 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
378
382
|
return await call_next(request)
|
379
383
|
|
380
384
|
|
385
|
+
class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware that turns away new requests while the server shuts down.

    When `state.get_block_requests()` is true, every request whose path does
    not start with '/api/' is answered with HTTP 503; all other requests are
    forwarded to the next handler unchanged.
    """

    async def dispatch(self, request: fastapi.Request, call_next):
        # /api/ paths are always let through: they are needed to operate
        # on-going requests and do not submit new ones.
        passthrough = (not state.get_block_requests() or
                       request.url.path.startswith('/api/'))
        if passthrough:
            return await call_next(request)

        # Client will retry on 503 error.
        body = {'detail': 'Server is shutting down, please try again later.'}
        return fastapi.responses.JSONResponse(status_code=503, content=body)
|
402
|
+
|
403
|
+
|
381
404
|
app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
|
405
|
+
# Use environment variable to make the metrics middleware optional.
|
406
|
+
if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
|
407
|
+
app.add_middleware(metrics.PrometheusMiddleware)
|
382
408
|
app.add_middleware(RBACMiddleware)
|
383
409
|
app.add_middleware(InternalDashboardPrefixMiddleware)
|
410
|
+
app.add_middleware(GracefulShutdownMiddleware)
|
384
411
|
app.add_middleware(PathCleanMiddleware)
|
385
412
|
app.add_middleware(CacheControlStaticMiddleware)
|
386
413
|
app.add_middleware(
|
@@ -404,6 +431,7 @@ app.include_router(users_rest.router, prefix='/users', tags=['users'])
|
|
404
431
|
app.include_router(workspaces_rest.router,
|
405
432
|
prefix='/workspaces',
|
406
433
|
tags=['workspaces'])
|
434
|
+
app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
|
407
435
|
|
408
436
|
|
409
437
|
@app.get('/token')
|
@@ -564,6 +592,8 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
564
592
|
ctx.override_envs(validate_body.env_vars)
|
565
593
|
|
566
594
|
def validate_dag(dag: dag_utils.dag_lib.Dag):
|
595
|
+
# Resolve the volumes before admin policy and validation.
|
596
|
+
dag.resolve_and_validate_volumes()
|
567
597
|
# TODO: Admin policy may contain arbitrary code, which may be expensive
|
568
598
|
# to run and may block the server thread. However, moving it into the
|
569
599
|
# executor adds a ~150ms penalty on the local API server because of
|
@@ -826,6 +856,10 @@ async def status(
|
|
826
856
|
status_body: payloads.StatusBody = payloads.StatusBody()
|
827
857
|
) -> None:
|
828
858
|
"""Gets cluster statuses."""
|
859
|
+
if state.get_block_requests():
|
860
|
+
raise fastapi.HTTPException(
|
861
|
+
status_code=503,
|
862
|
+
detail='Server is shutting down, please try again later.')
|
829
863
|
executor.schedule_request(
|
830
864
|
request_id=request.state.request_id,
|
831
865
|
request_name='status',
|
@@ -1044,13 +1078,14 @@ async def download(download_body: payloads.DownloadBody) -> None:
|
|
1044
1078
|
detail=f'Error creating zip file: {str(e)}')
|
1045
1079
|
|
1046
1080
|
|
1047
|
-
@app.
|
1048
|
-
async def cost_report(request: fastapi.Request
|
1081
|
+
@app.post('/cost_report')
|
1082
|
+
async def cost_report(request: fastapi.Request,
|
1083
|
+
cost_report_body: payloads.CostReportBody) -> None:
|
1049
1084
|
"""Gets the cost report of a cluster."""
|
1050
1085
|
executor.schedule_request(
|
1051
1086
|
request_id=request.state.request_id,
|
1052
1087
|
request_name='cost_report',
|
1053
|
-
request_body=
|
1088
|
+
request_body=cost_report_body,
|
1054
1089
|
func=core.cost_report,
|
1055
1090
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
1056
1091
|
)
|
@@ -1144,6 +1179,10 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
|
|
1144
1179
|
raise fastapi.HTTPException(
|
1145
1180
|
status_code=404, detail=f'Request {request_id!r} not found')
|
1146
1181
|
if request_task.status > requests_lib.RequestStatus.RUNNING:
|
1182
|
+
if request_task.should_retry:
|
1183
|
+
raise fastapi.HTTPException(
|
1184
|
+
status_code=503,
|
1185
|
+
detail=f'Request {request_id!r} should be retried')
|
1147
1186
|
request_error = request_task.get_error()
|
1148
1187
|
if request_error is not None:
|
1149
1188
|
raise fastapi.HTTPException(status_code=500,
|
@@ -1434,6 +1473,11 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
|
|
1434
1473
|
return global_user_state.get_storage_names_start_with(incomplete)
|
1435
1474
|
|
1436
1475
|
|
1476
|
+
@app.get('/api/completion/volume_name')
async def complete_volume_name(incomplete: str,) -> List[str]:
    """Completion endpoint for volume names.

    Delegates to `global_user_state.get_volume_names_start_with` with the
    partially typed name and returns its result as-is.
    """
    matching_names = global_user_state.get_volume_names_start_with(incomplete)
    return matching_names
|
1479
|
+
|
1480
|
+
|
1437
1481
|
@app.get('/dashboard/{full_path:path}')
|
1438
1482
|
async def serve_dashboard(full_path: str):
|
1439
1483
|
"""Serves the Next.js dashboard application.
|
@@ -1460,6 +1504,7 @@ async def serve_dashboard(full_path: str):
|
|
1460
1504
|
try:
|
1461
1505
|
with open(index_path, 'r', encoding='utf-8') as f:
|
1462
1506
|
content = f.read()
|
1507
|
+
|
1463
1508
|
return fastapi.responses.HTMLResponse(content=content)
|
1464
1509
|
except Exception as e:
|
1465
1510
|
logger.error(f'Error serving dashboard: {e}')
|
@@ -1483,7 +1528,13 @@ if __name__ == '__main__':
|
|
1483
1528
|
parser.add_argument('--host', default='127.0.0.1')
|
1484
1529
|
parser.add_argument('--port', default=46580, type=int)
|
1485
1530
|
parser.add_argument('--deploy', action='store_true')
|
1531
|
+
# Serve metrics on a separate port to isolate it from the application APIs:
|
1532
|
+
# metrics port will not be exposed to the public network typically.
|
1533
|
+
parser.add_argument('--metrics-port', default=9090, type=int)
|
1486
1534
|
cmd_args = parser.parse_args()
|
1535
|
+
if cmd_args.port == cmd_args.metrics_port:
|
1536
|
+
raise ValueError('port and metrics-port cannot be the same')
|
1537
|
+
|
1487
1538
|
# Show the privacy policy if it is not already shown. We place it here so
|
1488
1539
|
# that it is shown only when the API server is started.
|
1489
1540
|
usage_lib.maybe_show_privacy_policy()
|
@@ -1491,9 +1542,17 @@ if __name__ == '__main__':
|
|
1491
1542
|
config = server_config.compute_server_config(cmd_args.deploy)
|
1492
1543
|
num_workers = config.num_server_workers
|
1493
1544
|
|
1494
|
-
|
1545
|
+
queue_server: Optional[multiprocessing.Process] = None
|
1546
|
+
workers: List[executor.RequestWorker] = []
|
1495
1547
|
try:
|
1496
|
-
|
1548
|
+
if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
|
1549
|
+
metrics_thread = threading.Thread(target=metrics.run_metrics_server,
|
1550
|
+
args=(cmd_args.host,
|
1551
|
+
cmd_args.metrics_port),
|
1552
|
+
daemon=True)
|
1553
|
+
metrics_thread.start()
|
1554
|
+
queue_server, workers = executor.start(config)
|
1555
|
+
|
1497
1556
|
logger.info(f'Starting SkyPilot API server, workers={num_workers}')
|
1498
1557
|
# We don't support reload for now, since it may cause leakage of request
|
1499
1558
|
# workers or interrupt running requests.
|
@@ -1509,17 +1568,9 @@ if __name__ == '__main__':
|
|
1509
1568
|
finally:
|
1510
1569
|
logger.info('Shutting down SkyPilot API server...')
|
1511
1570
|
|
1512
|
-
|
1513
|
-
|
1514
|
-
|
1515
|
-
|
1516
|
-
|
1517
|
-
|
1518
|
-
proc.close()
|
1519
|
-
|
1520
|
-
# Terminate processes in reverse order in case dependency, especially
|
1521
|
-
# queue server. Terminate queue server first does not affect the
|
1522
|
-
# correctness of cleanup but introduce redundant error messages.
|
1523
|
-
subprocess_utils.run_in_parallel(cleanup,
|
1524
|
-
list(reversed(sub_procs)),
|
1525
|
-
num_threads=len(sub_procs))
|
1571
|
+
subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
|
1572
|
+
workers,
|
1573
|
+
num_threads=len(workers))
|
1574
|
+
if queue_server is not None:
|
1575
|
+
queue_server.kill()
|
1576
|
+
queue_server.join()
|
sky/server/state.py
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
"""State for API server process."""
|
2
|
+
|
3
|
+
# Process-wide flag consulted while a server is shutting down: when it is set,
# new requests are rejected, while requests already in flight may finish and
# can still be operated through the /api operations (e.g. /api/logs,
# /api/cancel).
_block_requests = False


# TODO(aylei): refactor, state should be an instance property of API server app
# instead of a global variable.
def get_block_requests() -> bool:
    """Return the current value of the block-requests flag."""
    return _block_requests


def set_block_requests(shutting_down: bool) -> None:
    """Update the block-requests flag.

    Args:
        shutting_down: New value of the flag; True makes the API server block
            requests except /api operations.
    """
    global _block_requests
    _block_requests = shutting_down
|
sky/server/stream_utils.py
CHANGED
@@ -155,9 +155,14 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
155
155
|
if request_task.status > requests_lib.RequestStatus.RUNNING:
|
156
156
|
if (request_task.status ==
|
157
157
|
requests_lib.RequestStatus.CANCELLED):
|
158
|
-
|
159
|
-
|
160
|
-
|
158
|
+
if request_task.should_retry:
|
159
|
+
buffer.append(
|
160
|
+
message_utils.encode_payload(
|
161
|
+
rich_utils.Control.RETRY.encode('')))
|
162
|
+
else:
|
163
|
+
buffer.append(
|
164
|
+
f'{request_task.name!r} request {request_id}'
|
165
|
+
' cancelled\n')
|
161
166
|
break
|
162
167
|
if not follow:
|
163
168
|
break
|
sky/server/uvicorn.py
CHANGED
@@ -3,17 +3,165 @@
|
|
3
3
|
This module is a wrapper around uvicorn to customize the behavior of the
|
4
4
|
server.
|
5
5
|
"""
|
6
|
-
import
|
6
|
+
import asyncio
|
7
7
|
import os
|
8
|
+
import signal
|
8
9
|
import threading
|
9
|
-
|
10
|
+
import time
|
11
|
+
from types import FrameType
|
12
|
+
from typing import Optional, Union
|
10
13
|
|
14
|
+
import filelock
|
11
15
|
import uvicorn
|
12
16
|
from uvicorn.supervisors import multiprocess
|
13
17
|
|
18
|
+
from sky import sky_logging
|
19
|
+
from sky.server import state
|
20
|
+
from sky.server.requests import requests as requests_lib
|
21
|
+
from sky.skylet import constants
|
14
22
|
from sky.utils import context_utils
|
15
23
|
from sky.utils import subprocess_utils
|
16
24
|
|
25
|
+
logger = sky_logging.init_logger(__name__)
|
26
|
+
|
27
|
+
# Path of the file lock that server processes take while coordinating a
# graceful shutdown.
_GRACEFUL_SHUTDOWN_LOCK_PATH = '/tmp/skypilot_graceful_shutdown.lock'

# Seconds to sleep between two checks for on-going requests.
_WAIT_REQUESTS_INTERVAL_SECONDS = 5

# Maximum number of seconds to wait for on-going requests to finish. Read from
# the grace-period environment variable; falls back to 60 when the variable is
# unset or is not a valid integer.
_grace_period = os.environ.get(constants.GRACE_PERIOD_SECONDS_ENV_VAR, '60')
try:
    _WAIT_REQUESTS_TIMEOUT_SECONDS = int(_grace_period)
except ValueError:
    _WAIT_REQUESTS_TIMEOUT_SECONDS = 60

# TODO(aylei): use decorator to register requests that need to be proactively
# cancelled instead of hardcoding here.
_RETRIABLE_REQUEST_NAMES = [
    'sky.logs',
    'sky.jobs.logs',
    'sky.serve.logs',
]
|
47
|
+
|
48
|
+
|
49
|
+
class Server(uvicorn.Server):
    """Server wrapper for uvicorn.

    Extended functionalities:
    - Handle exit signal and perform custom graceful shutdown.
    - Run the server process in a context-aware manner.
    """

    def __init__(self, config: uvicorn.Config):
        super().__init__(config=config)
        # Becomes True once the first exit signal has been handled.
        self.exiting: bool = False

    def handle_exit(self, sig: int, frame: Union[FrameType, None]) -> None:
        """Handle exit signal.

        When a server process receives a SIGTERM or SIGINT signal, a graceful
        shutdown will be initiated. If a SIGINT signal is received again, the
        server will be forcefully shutdown.

        Args:
            sig: Number of the received signal.
            frame: Stack frame passed by the signal machinery; forwarded to
                the parent implementation.
        """
        if self.exiting and sig == signal.SIGINT:
            # The server has been signaled to exit and received a SIGINT
            # again, do force shutdown.
            logger.info('Force shutdown.')
            self.should_exit = True
            super().handle_exit(sig, frame)
            return
        if not self.exiting:
            self.exiting = True
            # Perform graceful shutdown in a separate thread to avoid blocking
            # the main thread.
            threading.Thread(target=self._graceful_shutdown,
                             args=(sig, frame),
                             daemon=True).start()

    def _graceful_shutdown(self, sig: int, frame: Union[FrameType,
                                                        None]) -> None:
        """Perform graceful shutdown.

        Blocks new requests, waits (under a cross-process file lock) for the
        on-going requests, then hands over to uvicorn's own exit handling.
        The final hand-over always happens, even if waiting fails.
        """
        # Block new requests so that we can wait until all on-going requests
        # are finished. Note that /api/$verb operations are still allowed in
        # this stage to ensure the client can still operate the on-going
        # requests, e.g. /api/logs, /api/cancel, etc.
        logger.info('Block new requests being submitted in worker '
                    f'{os.getpid()}.')
        state.set_block_requests(True)
        # Ensure the shutting_down are set on all workers before next step.
        # TODO(aylei): hacky, need a reliable solution.
        time.sleep(1)

        lock = filelock.FileLock(_GRACEFUL_SHUTDOWN_LOCK_PATH)
        try:
            # Elect a coordinator process to handle on-going requests check
            with lock.acquire():
                logger.info(
                    f'Worker {os.getpid()} elected as shutdown coordinator')
                self._wait_requests()
        except Exception as e:  # pylint: disable=broad-except
            # A failure while waiting must not leave this worker alive and
            # rejecting every request forever; log it and proceed to exit.
            logger.warning('Failed to wait for on-going requests, '
                           f'shutting down anyway: {e}')

        logger.info('Shutting down server...')
        self.should_exit = True
        super().handle_exit(sig, frame)

    def _wait_requests(self) -> None:
        """Wait until all on-going requests are finished or cancelled."""
        start_time = time.time()
        # Loop-invariant: ids of the internal daemon requests.
        internal_request_ids = {
            d.id for d in requests_lib.INTERNAL_REQUEST_DAEMONS
        }
        while True:
            statuses = [
                requests_lib.RequestStatus.PENDING,
                requests_lib.RequestStatus.RUNNING,
            ]
            reqs = requests_lib.get_request_tasks(status=statuses)
            if not reqs:
                break
            logger.info(f'{len(reqs)} on-going requests '
                        'found, waiting for them to finish...')
            if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
                logger.warning('Timeout waiting for on-going requests to '
                               'finish, cancelling all on-going requests.')
                for req in reqs:
                    self.interrupt_request_for_retry(req.request_id)
                break
            # Proactively cancel internal requests and logs requests since
            # they can run for infinite time.
            interrupted = 0
            for req in reqs:
                if (req.request_id in internal_request_ids or
                        req.name in _RETRIABLE_REQUEST_NAMES):
                    self.interrupt_request_for_retry(req.request_id)
                    interrupted += 1
                # TODO(aylei): interrupt pending requests to accelerate the
                # shutdown.
            # If some requests are not interrupted, wait for them to finish,
            # otherwise we just check again immediately to accelerate the
            # shutdown process.
            if interrupted < len(reqs):
                time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)

    def interrupt_request_for_retry(self, request_id: str) -> None:
        """Interrupt a request for retry.

        Sends SIGTERM to the request's process (if it has one), then marks
        the request as CANCELLED with `should_retry` set. Does nothing when
        the request cannot be found.

        Args:
            request_id: Id of the request to interrupt.
        """
        with requests_lib.update_request(request_id) as req:
            if req is None:
                return
            if req.pid is not None:
                try:
                    os.kill(req.pid, signal.SIGTERM)
                except ProcessLookupError:
                    # The process exited between listing the request and
                    # signalling it; there is nothing left to interrupt.
                    logger.info(f'Process {req.pid} of request {request_id} '
                                'has already exited.')
            req.status = requests_lib.RequestStatus.CANCELLED
            req.should_retry = True
        logger.info(
            f'Request {request_id} interrupted and will be retried by client.')

    def run(self, *args, **kwargs):
        """Run the server process."""
        context_utils.hijack_sys_attrs()
        with self.capture_signals():
            asyncio.run(self.serve(*args, **kwargs))
|
164
|
+
|
17
165
|
|
18
166
|
def run(config: uvicorn.Config):
|
19
167
|
"""Run unvicorn server."""
|
@@ -22,28 +170,20 @@ def run(config: uvicorn.Config):
|
|
22
170
|
# in uvicorn. Since we do not use reload now, simply
|
23
171
|
# guard by an exception.
|
24
172
|
raise ValueError('Reload is not supported yet.')
|
25
|
-
server =
|
26
|
-
run_server_process = functools.partial(_run_server_process, server)
|
173
|
+
server = Server(config=config)
|
27
174
|
try:
|
28
175
|
if config.workers is not None and config.workers > 1:
|
29
176
|
sock = config.bind_socket()
|
30
|
-
SlowStartMultiprocess(config,
|
31
|
-
target=run_server_process,
|
177
|
+
SlowStartMultiprocess(config, target=server.run,
|
32
178
|
sockets=[sock]).run()
|
33
179
|
else:
|
34
|
-
|
180
|
+
server.run()
|
35
181
|
finally:
|
36
182
|
# Copied from unvicorn.run()
|
37
183
|
if config.uds and os.path.exists(config.uds):
|
38
184
|
os.remove(config.uds)
|
39
185
|
|
40
186
|
|
41
|
-
def _run_server_process(server: uvicorn.Server, *args, **kwargs):
|
42
|
-
"""Run the server process with contextually aware."""
|
43
|
-
context_utils.hijack_sys_attrs()
|
44
|
-
server.run(*args, **kwargs)
|
45
|
-
|
46
|
-
|
47
187
|
class SlowStartMultiprocess(multiprocess.Multiprocess):
|
48
188
|
"""Uvicorn Multiprocess wrapper with slow start.
|
49
189
|
|
sky/setup_files/dependencies.py
CHANGED
sky/skylet/constants.py
CHANGED
@@ -401,6 +401,8 @@ PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
|
|
401
401
|
PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
|
402
402
|
'~/.sky/.controller_recovery_restarting_signal')
|
403
403
|
|
404
|
+
HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/ha_recovery.log'
|
405
|
+
|
404
406
|
# The placeholder for the local skypilot config path in file mounts for
|
405
407
|
# controllers.
|
406
408
|
LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
|
@@ -411,6 +413,8 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
411
413
|
# Environment variable that is set to 'true' if this is a skypilot server.
|
412
414
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
413
415
|
|
416
|
+
# Environment variable that is set to 'true' if metrics are enabled.
|
417
|
+
ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
|
414
418
|
# Environment variable that is set to 'true' if basic
|
415
419
|
# authentication is enabled in the API server.
|
416
420
|
ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
|
@@ -436,39 +440,40 @@ LOGGING_CONFIG_DIR = '~/.sky/logging'
|
|
436
440
|
|
437
441
|
# Resources constants
|
438
442
|
TIME_UNITS = {
|
439
|
-
's': 1 / 60,
|
440
|
-
'sec': 1 / 60,
|
441
443
|
'm': 1,
|
442
|
-
'min': 1,
|
443
444
|
'h': 60,
|
444
|
-
'hr': 60,
|
445
445
|
'd': 24 * 60,
|
446
|
-
'
|
446
|
+
'w': 7 * 24 * 60,
|
447
447
|
}
|
448
448
|
|
449
449
|
TIME_PATTERN: str = (
|
450
450
|
f'^[0-9]+({"|".join([unit.lower() for unit in TIME_UNITS])})?$/i')
|
451
451
|
|
452
452
|
MEMORY_SIZE_UNITS = {
|
453
|
-
'b': 1,
|
454
|
-
'k': 2**10,
|
455
453
|
'kb': 2**10,
|
456
|
-
'
|
454
|
+
'ki': 2**10,
|
457
455
|
'mb': 2**20,
|
458
|
-
'
|
456
|
+
'mi': 2**20,
|
459
457
|
'gb': 2**30,
|
460
|
-
'
|
458
|
+
'gi': 2**30,
|
461
459
|
'tb': 2**40,
|
462
|
-
'
|
460
|
+
'ti': 2**40,
|
463
461
|
'pb': 2**50,
|
462
|
+
'pi': 2**50,
|
464
463
|
}
|
465
464
|
|
466
465
|
MEMORY_SIZE_PATTERN = (
|
467
466
|
'^[0-9]+('
|
468
|
-
f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}'
|
469
|
-
')
|
470
|
-
|
467
|
+
f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}|'
|
468
|
+
f'{"|".join([unit.upper() for unit in MEMORY_SIZE_UNITS])}|'
|
469
|
+
f'{"|".join([unit[0].upper() + unit[1:] for unit in MEMORY_SIZE_UNITS if len(unit) > 1])}' # pylint: disable=line-too-long
|
470
|
+
')?$')
|
471
|
+
|
472
|
+
LAST_USE_TRUNC_LENGTH = 25
|
471
473
|
|
472
474
|
MIN_PRIORITY = -1000
|
473
475
|
MAX_PRIORITY = 1000
|
474
476
|
DEFAULT_PRIORITY = 0
|
477
|
+
|
478
|
+
GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
|
479
|
+
COST_REPORT_DEFAULT_DAYS = 30
|