skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +38 -11
  5. sky/backends/cloud_vm_ray_backend.py +52 -18
  6. sky/client/cli/command.py +264 -25
  7. sky/client/sdk.py +119 -85
  8. sky/clouds/aws.py +10 -7
  9. sky/clouds/azure.py +10 -7
  10. sky/clouds/cloud.py +2 -0
  11. sky/clouds/cudo.py +2 -0
  12. sky/clouds/do.py +10 -7
  13. sky/clouds/fluidstack.py +2 -0
  14. sky/clouds/gcp.py +10 -7
  15. sky/clouds/hyperbolic.py +10 -7
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +27 -9
  18. sky/clouds/lambda_cloud.py +10 -7
  19. sky/clouds/nebius.py +10 -7
  20. sky/clouds/oci.py +10 -7
  21. sky/clouds/paperspace.py +10 -7
  22. sky/clouds/runpod.py +10 -7
  23. sky/clouds/scp.py +10 -7
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +89 -15
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  32. sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  41. sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  53. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
  58. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  60. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  61. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  62. sky/dashboard/out/clusters/[cluster].html +1 -1
  63. sky/dashboard/out/clusters.html +1 -1
  64. sky/dashboard/out/config.html +1 -1
  65. sky/dashboard/out/index.html +1 -1
  66. sky/dashboard/out/infra/[context].html +1 -1
  67. sky/dashboard/out/infra.html +1 -1
  68. sky/dashboard/out/jobs/[job].html +1 -1
  69. sky/dashboard/out/jobs.html +1 -1
  70. sky/dashboard/out/users.html +1 -1
  71. sky/dashboard/out/volumes.html +1 -0
  72. sky/dashboard/out/workspace/new.html +1 -1
  73. sky/dashboard/out/workspaces/[name].html +1 -1
  74. sky/dashboard/out/workspaces.html +1 -1
  75. sky/data/storage_utils.py +2 -4
  76. sky/exceptions.py +26 -0
  77. sky/execution.py +5 -0
  78. sky/global_user_state.py +263 -20
  79. sky/jobs/client/sdk.py +13 -12
  80. sky/jobs/controller.py +5 -1
  81. sky/jobs/scheduler.py +4 -3
  82. sky/jobs/server/core.py +121 -51
  83. sky/jobs/state.py +15 -0
  84. sky/jobs/utils.py +114 -8
  85. sky/models.py +16 -0
  86. sky/provision/__init__.py +26 -0
  87. sky/provision/kubernetes/__init__.py +3 -0
  88. sky/provision/kubernetes/instance.py +38 -77
  89. sky/provision/kubernetes/utils.py +52 -2
  90. sky/provision/kubernetes/volume.py +147 -0
  91. sky/resources.py +20 -76
  92. sky/serve/client/sdk.py +13 -13
  93. sky/serve/server/core.py +5 -1
  94. sky/server/common.py +40 -5
  95. sky/server/constants.py +5 -1
  96. sky/server/metrics.py +105 -0
  97. sky/server/requests/executor.py +30 -14
  98. sky/server/requests/payloads.py +22 -3
  99. sky/server/requests/requests.py +59 -2
  100. sky/server/rest.py +152 -0
  101. sky/server/server.py +70 -19
  102. sky/server/state.py +20 -0
  103. sky/server/stream_utils.py +8 -3
  104. sky/server/uvicorn.py +153 -13
  105. sky/setup_files/dependencies.py +2 -0
  106. sky/skylet/constants.py +19 -14
  107. sky/task.py +141 -43
  108. sky/templates/jobs-controller.yaml.j2 +12 -1
  109. sky/templates/kubernetes-ray.yml.j2 +31 -2
  110. sky/users/permission.py +2 -0
  111. sky/utils/admin_policy_utils.py +5 -1
  112. sky/utils/cli_utils/status_utils.py +25 -17
  113. sky/utils/command_runner.py +118 -12
  114. sky/utils/command_runner.pyi +57 -0
  115. sky/utils/common_utils.py +9 -1
  116. sky/utils/context.py +3 -1
  117. sky/utils/controller_utils.py +1 -2
  118. sky/utils/resources_utils.py +66 -0
  119. sky/utils/rich_utils.py +6 -0
  120. sky/utils/schemas.py +180 -38
  121. sky/utils/status_lib.py +10 -0
  122. sky/utils/validator.py +11 -1
  123. sky/volumes/__init__.py +0 -0
  124. sky/volumes/client/__init__.py +0 -0
  125. sky/volumes/client/sdk.py +64 -0
  126. sky/volumes/server/__init__.py +0 -0
  127. sky/volumes/server/core.py +199 -0
  128. sky/volumes/server/server.py +85 -0
  129. sky/volumes/utils.py +158 -0
  130. sky/volumes/volume.py +198 -0
  131. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  132. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
  133. sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  136. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  140. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  148. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
  153. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  156. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  158. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  159. /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  160. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
  161. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
  162. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  163. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  164. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  165. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -16,6 +16,7 @@ import posixpath
16
16
  import re
17
17
  import shutil
18
18
  import sys
19
+ import threading
19
20
  from typing import Any, Dict, List, Literal, Optional, Set, Tuple
20
21
  import uuid
21
22
  import zipfile
@@ -43,6 +44,8 @@ from sky.serve.server import server as serve_rest
43
44
  from sky.server import common
44
45
  from sky.server import config as server_config
45
46
  from sky.server import constants as server_constants
47
+ from sky.server import metrics
48
+ from sky.server import state
46
49
  from sky.server import stream_utils
47
50
  from sky.server.requests import executor
48
51
  from sky.server.requests import payloads
@@ -61,6 +64,7 @@ from sky.utils import dag_utils
61
64
  from sky.utils import env_options
62
65
  from sky.utils import status_lib
63
66
  from sky.utils import subprocess_utils
67
+ from sky.volumes.server import server as volumes_rest
64
68
  from sky.workspaces import server as workspaces_rest
65
69
 
66
70
  # pylint: disable=ungrouped-imports
@@ -378,9 +382,32 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
378
382
  return await call_next(request)
379
383
 
380
384
 
385
+ class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
386
+ """Middleware to control requests when server is shutting down."""
387
+
388
+ async def dispatch(self, request: fastapi.Request, call_next):
389
+ if state.get_block_requests():
390
+ # Allow /api/ paths to continue, which are critical to operate
391
+ # on-going requests but will not submit new requests.
392
+ if not request.url.path.startswith('/api/'):
393
+ # Client will retry on 503 error.
394
+ return fastapi.responses.JSONResponse(
395
+ status_code=503,
396
+ content={
397
+ 'detail': 'Server is shutting down, '
398
+ 'please try again later.'
399
+ })
400
+
401
+ return await call_next(request)
402
+
403
+
381
404
  app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
405
+ # Use environment variable to make the metrics middleware optional.
406
+ if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
407
+ app.add_middleware(metrics.PrometheusMiddleware)
382
408
  app.add_middleware(RBACMiddleware)
383
409
  app.add_middleware(InternalDashboardPrefixMiddleware)
410
+ app.add_middleware(GracefulShutdownMiddleware)
384
411
  app.add_middleware(PathCleanMiddleware)
385
412
  app.add_middleware(CacheControlStaticMiddleware)
386
413
  app.add_middleware(
@@ -404,6 +431,7 @@ app.include_router(users_rest.router, prefix='/users', tags=['users'])
404
431
  app.include_router(workspaces_rest.router,
405
432
  prefix='/workspaces',
406
433
  tags=['workspaces'])
434
+ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
407
435
 
408
436
 
409
437
  @app.get('/token')
@@ -564,6 +592,8 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
564
592
  ctx.override_envs(validate_body.env_vars)
565
593
 
566
594
  def validate_dag(dag: dag_utils.dag_lib.Dag):
595
+ # Resolve the volumes before admin policy and validation.
596
+ dag.resolve_and_validate_volumes()
567
597
  # TODO: Admin policy may contain arbitrary code, which may be expensive
568
598
  # to run and may block the server thread. However, moving it into the
569
599
  # executor adds a ~150ms penalty on the local API server because of
@@ -826,6 +856,10 @@ async def status(
826
856
  status_body: payloads.StatusBody = payloads.StatusBody()
827
857
  ) -> None:
828
858
  """Gets cluster statuses."""
859
+ if state.get_block_requests():
860
+ raise fastapi.HTTPException(
861
+ status_code=503,
862
+ detail='Server is shutting down, please try again later.')
829
863
  executor.schedule_request(
830
864
  request_id=request.state.request_id,
831
865
  request_name='status',
@@ -1044,13 +1078,14 @@ async def download(download_body: payloads.DownloadBody) -> None:
1044
1078
  detail=f'Error creating zip file: {str(e)}')
1045
1079
 
1046
1080
 
1047
- @app.get('/cost_report')
1048
- async def cost_report(request: fastapi.Request) -> None:
1081
+ @app.post('/cost_report')
1082
+ async def cost_report(request: fastapi.Request,
1083
+ cost_report_body: payloads.CostReportBody) -> None:
1049
1084
  """Gets the cost report of a cluster."""
1050
1085
  executor.schedule_request(
1051
1086
  request_id=request.state.request_id,
1052
1087
  request_name='cost_report',
1053
- request_body=payloads.RequestBody(),
1088
+ request_body=cost_report_body,
1054
1089
  func=core.cost_report,
1055
1090
  schedule_type=requests_lib.ScheduleType.SHORT,
1056
1091
  )
@@ -1144,6 +1179,10 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
1144
1179
  raise fastapi.HTTPException(
1145
1180
  status_code=404, detail=f'Request {request_id!r} not found')
1146
1181
  if request_task.status > requests_lib.RequestStatus.RUNNING:
1182
+ if request_task.should_retry:
1183
+ raise fastapi.HTTPException(
1184
+ status_code=503,
1185
+ detail=f'Request {request_id!r} should be retried')
1147
1186
  request_error = request_task.get_error()
1148
1187
  if request_error is not None:
1149
1188
  raise fastapi.HTTPException(status_code=500,
@@ -1434,6 +1473,11 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
1434
1473
  return global_user_state.get_storage_names_start_with(incomplete)
1435
1474
 
1436
1475
 
1476
+ @app.get('/api/completion/volume_name')
1477
+ async def complete_volume_name(incomplete: str,) -> List[str]:
1478
+ return global_user_state.get_volume_names_start_with(incomplete)
1479
+
1480
+
1437
1481
  @app.get('/dashboard/{full_path:path}')
1438
1482
  async def serve_dashboard(full_path: str):
1439
1483
  """Serves the Next.js dashboard application.
@@ -1460,6 +1504,7 @@ async def serve_dashboard(full_path: str):
1460
1504
  try:
1461
1505
  with open(index_path, 'r', encoding='utf-8') as f:
1462
1506
  content = f.read()
1507
+
1463
1508
  return fastapi.responses.HTMLResponse(content=content)
1464
1509
  except Exception as e:
1465
1510
  logger.error(f'Error serving dashboard: {e}')
@@ -1483,7 +1528,13 @@ if __name__ == '__main__':
1483
1528
  parser.add_argument('--host', default='127.0.0.1')
1484
1529
  parser.add_argument('--port', default=46580, type=int)
1485
1530
  parser.add_argument('--deploy', action='store_true')
1531
+ # Serve metrics on a separate port to isolate it from the application APIs:
1532
+ # metrics port will not be exposed to the public network typically.
1533
+ parser.add_argument('--metrics-port', default=9090, type=int)
1486
1534
  cmd_args = parser.parse_args()
1535
+ if cmd_args.port == cmd_args.metrics_port:
1536
+ raise ValueError('port and metrics-port cannot be the same')
1537
+
1487
1538
  # Show the privacy policy if it is not already shown. We place it here so
1488
1539
  # that it is shown only when the API server is started.
1489
1540
  usage_lib.maybe_show_privacy_policy()
@@ -1491,9 +1542,17 @@ if __name__ == '__main__':
1491
1542
  config = server_config.compute_server_config(cmd_args.deploy)
1492
1543
  num_workers = config.num_server_workers
1493
1544
 
1494
- sub_procs = []
1545
+ queue_server: Optional[multiprocessing.Process] = None
1546
+ workers: List[executor.RequestWorker] = []
1495
1547
  try:
1496
- sub_procs = executor.start(config)
1548
+ if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
1549
+ metrics_thread = threading.Thread(target=metrics.run_metrics_server,
1550
+ args=(cmd_args.host,
1551
+ cmd_args.metrics_port),
1552
+ daemon=True)
1553
+ metrics_thread.start()
1554
+ queue_server, workers = executor.start(config)
1555
+
1497
1556
  logger.info(f'Starting SkyPilot API server, workers={num_workers}')
1498
1557
  # We don't support reload for now, since it may cause leakage of request
1499
1558
  # workers or interrupt running requests.
@@ -1509,17 +1568,9 @@ if __name__ == '__main__':
1509
1568
  finally:
1510
1569
  logger.info('Shutting down SkyPilot API server...')
1511
1570
 
1512
- def cleanup(proc: multiprocessing.Process) -> None:
1513
- try:
1514
- proc.terminate()
1515
- proc.join()
1516
- finally:
1517
- # The process may not be started yet, close it anyway.
1518
- proc.close()
1519
-
1520
- # Terminate processes in reverse order in case dependency, especially
1521
- # queue server. Terminate queue server first does not affect the
1522
- # correctness of cleanup but introduce redundant error messages.
1523
- subprocess_utils.run_in_parallel(cleanup,
1524
- list(reversed(sub_procs)),
1525
- num_threads=len(sub_procs))
1571
+ subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
1572
+ workers,
1573
+ num_threads=len(workers))
1574
+ if queue_server is not None:
1575
+ queue_server.kill()
1576
+ queue_server.join()
sky/server/state.py ADDED
@@ -0,0 +1,20 @@
1
+ """State for API server process."""
2
+
3
+ # This state is used to block requests except /api operations, which is useful
4
+ # when a server is shutting down: new requests will be blocked, but existing
5
+ # requests will be allowed to finish and be operated via /api operations, e.g.
6
+ # /api/logs, /api/cancel, etc.
7
+ _block_requests = False
8
+
9
+
10
+ # TODO(aylei): refactor, state should be an instance property of API server app
11
+ # instead of a global variable.
12
+ def get_block_requests() -> bool:
13
+ """Whether to block requests except /api operations."""
14
+ return _block_requests
15
+
16
+
17
+ def set_block_requests(shutting_down: bool) -> None:
18
+ """Set the API server to block requests except /api operations."""
19
+ global _block_requests
20
+ _block_requests = shutting_down
@@ -155,9 +155,14 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
155
155
  if request_task.status > requests_lib.RequestStatus.RUNNING:
156
156
  if (request_task.status ==
157
157
  requests_lib.RequestStatus.CANCELLED):
158
- buffer.append(
159
- f'{request_task.name!r} request {request_id}'
160
- ' cancelled\n')
158
+ if request_task.should_retry:
159
+ buffer.append(
160
+ message_utils.encode_payload(
161
+ rich_utils.Control.RETRY.encode('')))
162
+ else:
163
+ buffer.append(
164
+ f'{request_task.name!r} request {request_id}'
165
+ ' cancelled\n')
161
166
  break
162
167
  if not follow:
163
168
  break
sky/server/uvicorn.py CHANGED
@@ -3,17 +3,165 @@
3
3
  This module is a wrapper around uvicorn to customize the behavior of the
4
4
  server.
5
5
  """
6
- import functools
6
+ import asyncio
7
7
  import os
8
+ import signal
8
9
  import threading
9
- from typing import Optional
10
+ import time
11
+ from types import FrameType
12
+ from typing import Optional, Union
10
13
 
14
+ import filelock
11
15
  import uvicorn
12
16
  from uvicorn.supervisors import multiprocess
13
17
 
18
+ from sky import sky_logging
19
+ from sky.server import state
20
+ from sky.server.requests import requests as requests_lib
21
+ from sky.skylet import constants
14
22
  from sky.utils import context_utils
15
23
  from sky.utils import subprocess_utils
16
24
 
25
+ logger = sky_logging.init_logger(__name__)
26
+
27
+ # File lock path for coordinating graceful shutdown across processes
28
+ _GRACEFUL_SHUTDOWN_LOCK_PATH = '/tmp/skypilot_graceful_shutdown.lock'
29
+
30
+ # Interval to check for on-going requests.
31
+ _WAIT_REQUESTS_INTERVAL_SECONDS = 5
32
+
33
+ # Timeout for waiting for on-going requests to finish.
34
+ try:
35
+ _WAIT_REQUESTS_TIMEOUT_SECONDS = int(
36
+ os.environ.get(constants.GRACE_PERIOD_SECONDS_ENV_VAR, '60'))
37
+ except ValueError:
38
+ _WAIT_REQUESTS_TIMEOUT_SECONDS = 60
39
+
40
+ # TODO(aylei): use decorator to register requests that need to be proactively
41
+ # cancelled instead of hardcoding here.
42
+ _RETRIABLE_REQUEST_NAMES = [
43
+ 'sky.logs',
44
+ 'sky.jobs.logs',
45
+ 'sky.serve.logs',
46
+ ]
47
+
48
+
49
+ class Server(uvicorn.Server):
50
+ """Server wrapper for uvicorn.
51
+
52
+ Extended functionalities:
53
+ - Handle exit signal and perform custom graceful shutdown.
54
+ - Run the server process in a context-aware manner.
55
+ """
56
+
57
+ def __init__(self, config: uvicorn.Config):
58
+ super().__init__(config=config)
59
+ self.exiting: bool = False
60
+
61
+ def handle_exit(self, sig: int, frame: Union[FrameType, None]) -> None:
62
+ """Handle exit signal.
63
+
64
+ When a server process receives a SIGTERM or SIGINT signal, a graceful
65
+ shutdown will be initiated. If a SIGINT signal is received again, the
66
+ server will be forcefully shutdown.
67
+ """
68
+ if self.exiting and sig == signal.SIGINT:
69
+ # The server has been signaled to exit and received a SIGINT again,
70
+ # do force shutdown.
71
+ logger.info('Force shutdown.')
72
+ self.should_exit = True
73
+ super().handle_exit(sig, frame)
74
+ return
75
+ if not self.exiting:
76
+ self.exiting = True
77
+ # Perform graceful shutdown in a separate thread to avoid blocking
78
+ # the main thread.
79
+ threading.Thread(target=self._graceful_shutdown,
80
+ args=(sig, frame),
81
+ daemon=True).start()
82
+
83
+ def _graceful_shutdown(self, sig: int, frame: Union[FrameType,
84
+ None]) -> None:
85
+ """Perform graceful shutdown."""
86
+ # Block new requests so that we can wait until all on-going requests
87
+ # are finished. Note that /api/$verb operations are still allowed in
88
+ # this stage to ensure the client can still operate the on-going
89
+ # requests, e.g. /api/logs, /api/cancel, etc.
90
+ logger.info('Block new requests being submitted in worker '
91
+ f'{os.getpid()}.')
92
+ state.set_block_requests(True)
93
+ # Ensure the shutting_down flag is set on all workers before the next step.
94
+ # TODO(aylei): hacky, need a reliable solution.
95
+ time.sleep(1)
96
+
97
+ lock = filelock.FileLock(_GRACEFUL_SHUTDOWN_LOCK_PATH)
98
+ # Elect a coordinator process to handle on-going requests check
99
+ with lock.acquire():
100
+ logger.info(f'Worker {os.getpid()} elected as shutdown coordinator')
101
+ self._wait_requests()
102
+
103
+ logger.info('Shutting down server...')
104
+ self.should_exit = True
105
+ super().handle_exit(sig, frame)
106
+
107
+ def _wait_requests(self) -> None:
108
+ """Wait until all on-going requests are finished or cancelled."""
109
+ start_time = time.time()
110
+ while True:
111
+ statuses = [
112
+ requests_lib.RequestStatus.PENDING,
113
+ requests_lib.RequestStatus.RUNNING,
114
+ ]
115
+ reqs = requests_lib.get_request_tasks(status=statuses)
116
+ if not reqs:
117
+ break
118
+ logger.info(f'{len(reqs)} on-going requests '
119
+ 'found, waiting for them to finish...')
120
+ # Proactively cancel internal requests and logs requests since
121
+ # they can run for infinite time.
122
+ internal_request_ids = [
123
+ d.id for d in requests_lib.INTERNAL_REQUEST_DAEMONS
124
+ ]
125
+ if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
126
+ logger.warning('Timeout waiting for on-going requests to '
127
+ 'finish, cancelling all on-going requests.')
128
+ for req in reqs:
129
+ self.interrupt_request_for_retry(req.request_id)
130
+ break
131
+ interrupted = 0
132
+ for req in reqs:
133
+ if req.request_id in internal_request_ids:
134
+ self.interrupt_request_for_retry(req.request_id)
135
+ interrupted += 1
136
+ elif req.name in _RETRIABLE_REQUEST_NAMES:
137
+ self.interrupt_request_for_retry(req.request_id)
138
+ interrupted += 1
139
+ # TODO(aylei): interrupt pending requests to accelerate the
140
+ # shutdown.
141
+ # If some requests are not interrupted, wait for them to finish,
142
+ # otherwise we just check again immediately to accelerate the
143
+ # shutdown process.
144
+ if interrupted < len(reqs):
145
+ time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
146
+
147
+ def interrupt_request_for_retry(self, request_id: str) -> None:
148
+ """Interrupt a request for retry."""
149
+ with requests_lib.update_request(request_id) as req:
150
+ if req is None:
151
+ return
152
+ if req.pid is not None:
153
+ os.kill(req.pid, signal.SIGTERM)
154
+ req.status = requests_lib.RequestStatus.CANCELLED
155
+ req.should_retry = True
156
+ logger.info(
157
+ f'Request {request_id} interrupted and will be retried by client.')
158
+
159
+ def run(self, *args, **kwargs):
160
+ """Run the server process."""
161
+ context_utils.hijack_sys_attrs()
162
+ with self.capture_signals():
163
+ asyncio.run(self.serve(*args, **kwargs))
164
+
17
165
 
18
166
  def run(config: uvicorn.Config):
19
167
  """Run uvicorn server."""
@@ -22,28 +170,20 @@ def run(config: uvicorn.Config):
22
170
  # in uvicorn. Since we do not use reload now, simply
23
171
  # guard by an exception.
24
172
  raise ValueError('Reload is not supported yet.')
25
- server = uvicorn.Server(config=config)
26
- run_server_process = functools.partial(_run_server_process, server)
173
+ server = Server(config=config)
27
174
  try:
28
175
  if config.workers is not None and config.workers > 1:
29
176
  sock = config.bind_socket()
30
- SlowStartMultiprocess(config,
31
- target=run_server_process,
177
+ SlowStartMultiprocess(config, target=server.run,
32
178
  sockets=[sock]).run()
33
179
  else:
34
- run_server_process()
180
+ server.run()
35
181
  finally:
36
182
  # Copied from uvicorn.run()
37
183
  if config.uds and os.path.exists(config.uds):
38
184
  os.remove(config.uds)
39
185
 
40
186
 
41
- def _run_server_process(server: uvicorn.Server, *args, **kwargs):
42
- """Run the server process with contextually aware."""
43
- context_utils.hijack_sys_attrs()
44
- server.run(*args, **kwargs)
45
-
46
-
47
187
  class SlowStartMultiprocess(multiprocess.Multiprocess):
48
188
  """Uvicorn Multiprocess wrapper with slow start.
49
189
 
@@ -62,6 +62,8 @@ install_requires = [
62
62
  # the client-side actually not importing them.
63
63
  'casbin',
64
64
  'sqlalchemy_adapter',
65
+ # Required for API server metrics
66
+ 'prometheus_client>=0.8.0',
65
67
  'passlib',
66
68
  ]
67
69
 
sky/skylet/constants.py CHANGED
@@ -401,6 +401,8 @@ PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
401
401
  PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
402
402
  '~/.sky/.controller_recovery_restarting_signal')
403
403
 
404
+ HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/ha_recovery.log'
405
+
404
406
  # The placeholder for the local skypilot config path in file mounts for
405
407
  # controllers.
406
408
  LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
@@ -411,6 +413,8 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
411
413
  # Environment variable that is set to 'true' if this is a skypilot server.
412
414
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
413
415
 
416
+ # Environment variable that is set to 'true' if metrics are enabled.
417
+ ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
414
418
  # Environment variable that is set to 'true' if basic
415
419
  # authentication is enabled in the API server.
416
420
  ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
@@ -436,39 +440,40 @@ LOGGING_CONFIG_DIR = '~/.sky/logging'
436
440
 
437
441
  # Resources constants
438
442
  TIME_UNITS = {
439
- 's': 1 / 60,
440
- 'sec': 1 / 60,
441
443
  'm': 1,
442
- 'min': 1,
443
444
  'h': 60,
444
- 'hr': 60,
445
445
  'd': 24 * 60,
446
- 'day': 24 * 60,
446
+ 'w': 7 * 24 * 60,
447
447
  }
448
448
 
449
449
  TIME_PATTERN: str = (
450
450
  f'^[0-9]+({"|".join([unit.lower() for unit in TIME_UNITS])})?$/i')
451
451
 
452
452
  MEMORY_SIZE_UNITS = {
453
- 'b': 1,
454
- 'k': 2**10,
455
453
  'kb': 2**10,
456
- 'm': 2**20,
454
+ 'ki': 2**10,
457
455
  'mb': 2**20,
458
- 'g': 2**30,
456
+ 'mi': 2**20,
459
457
  'gb': 2**30,
460
- 't': 2**40,
458
+ 'gi': 2**30,
461
459
  'tb': 2**40,
462
- 'p': 2**50,
460
+ 'ti': 2**40,
463
461
  'pb': 2**50,
462
+ 'pi': 2**50,
464
463
  }
465
464
 
466
465
  MEMORY_SIZE_PATTERN = (
467
466
  '^[0-9]+('
468
- f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}'
469
- ')?$/i')
470
- MEMORY_SIZE_PLUS_PATTERN = f'{MEMORY_SIZE_PATTERN[:-3]}+?$/i'
467
+ f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}|'
468
+ f'{"|".join([unit.upper() for unit in MEMORY_SIZE_UNITS])}|'
469
+ f'{"|".join([unit[0].upper() + unit[1:] for unit in MEMORY_SIZE_UNITS if len(unit) > 1])}' # pylint: disable=line-too-long
470
+ ')?$')
471
+
472
+ LAST_USE_TRUNC_LENGTH = 25
471
473
 
472
474
  MIN_PRIORITY = -1000
473
475
  MAX_PRIORITY = 1000
474
476
  DEFAULT_PRIORITY = 0
477
+
478
+ GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
479
+ COST_REPORT_DEFAULT_DAYS = 30