skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +26 -11
  3. sky/backends/cloud_vm_ray_backend.py +16 -5
  4. sky/client/cli/command.py +222 -4
  5. sky/client/sdk.py +110 -82
  6. sky/clouds/aws.py +10 -7
  7. sky/clouds/azure.py +10 -7
  8. sky/clouds/cloud.py +2 -0
  9. sky/clouds/cudo.py +2 -0
  10. sky/clouds/do.py +10 -7
  11. sky/clouds/fluidstack.py +2 -0
  12. sky/clouds/gcp.py +10 -7
  13. sky/clouds/hyperbolic.py +10 -7
  14. sky/clouds/ibm.py +2 -0
  15. sky/clouds/kubernetes.py +26 -9
  16. sky/clouds/lambda_cloud.py +10 -7
  17. sky/clouds/nebius.py +10 -7
  18. sky/clouds/oci.py +10 -7
  19. sky/clouds/paperspace.py +10 -7
  20. sky/clouds/runpod.py +10 -7
  21. sky/clouds/scp.py +10 -7
  22. sky/clouds/vast.py +10 -7
  23. sky/clouds/vsphere.py +2 -0
  24. sky/core.py +1 -0
  25. sky/dag.py +14 -0
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  30. sky/dashboard/out/_next/static/chunks/{37-4650f214e2119168.js → 37-1f1e94f5a561202a.js} +2 -2
  31. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  32. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  37. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  38. sky/dashboard/out/_next/static/chunks/{856-bfddc18e16f3873c.js → 856-cdf66268ec878d0c.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-0ef7418d1a3822f3.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-ecc5a7003776cfa7.js → [name]-0b4c662a25e4747a.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  54. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  55. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  56. sky/dashboard/out/clusters/[cluster].html +1 -1
  57. sky/dashboard/out/clusters.html +1 -1
  58. sky/dashboard/out/config.html +1 -1
  59. sky/dashboard/out/index.html +1 -1
  60. sky/dashboard/out/infra/[context].html +1 -1
  61. sky/dashboard/out/infra.html +1 -1
  62. sky/dashboard/out/jobs/[job].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -0
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage_utils.py +2 -4
  70. sky/exceptions.py +15 -0
  71. sky/execution.py +5 -0
  72. sky/global_user_state.py +129 -0
  73. sky/jobs/client/sdk.py +13 -11
  74. sky/jobs/server/core.py +4 -0
  75. sky/models.py +16 -0
  76. sky/provision/__init__.py +26 -0
  77. sky/provision/kubernetes/__init__.py +3 -0
  78. sky/provision/kubernetes/instance.py +38 -77
  79. sky/provision/kubernetes/utils.py +52 -2
  80. sky/provision/kubernetes/volume.py +147 -0
  81. sky/resources.py +20 -76
  82. sky/serve/client/sdk.py +13 -13
  83. sky/serve/server/core.py +5 -1
  84. sky/server/common.py +40 -5
  85. sky/server/constants.py +5 -1
  86. sky/server/metrics.py +105 -0
  87. sky/server/requests/executor.py +30 -14
  88. sky/server/requests/payloads.py +16 -0
  89. sky/server/requests/requests.py +35 -1
  90. sky/server/rest.py +152 -0
  91. sky/server/server.py +66 -16
  92. sky/server/state.py +20 -0
  93. sky/server/stream_utils.py +8 -3
  94. sky/server/uvicorn.py +153 -13
  95. sky/setup_files/dependencies.py +2 -0
  96. sky/skylet/constants.py +14 -3
  97. sky/task.py +141 -18
  98. sky/templates/kubernetes-ray.yml.j2 +30 -1
  99. sky/users/permission.py +2 -0
  100. sky/utils/context.py +3 -1
  101. sky/utils/resources_utils.py +66 -0
  102. sky/utils/rich_utils.py +6 -0
  103. sky/utils/schemas.py +146 -3
  104. sky/utils/status_lib.py +10 -0
  105. sky/utils/validator.py +11 -1
  106. sky/volumes/__init__.py +0 -0
  107. sky/volumes/client/__init__.py +0 -0
  108. sky/volumes/client/sdk.py +64 -0
  109. sky/volumes/server/__init__.py +0 -0
  110. sky/volumes/server/core.py +199 -0
  111. sky/volumes/server/server.py +85 -0
  112. sky/volumes/utils.py +158 -0
  113. sky/volumes/volume.py +198 -0
  114. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  115. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +123 -108
  116. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  119. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  124. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  125. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  126. sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  131. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  136. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  137. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  138. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  139. /sky/dashboard/out/_next/static/chunks/{843-bde186946d353355.js → 843-07d25a7e64462fd8.js} +0 -0
  140. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  141. /sky/dashboard/out/_next/static/chunks/{973-56412c7976b4655b.js → 973-5b5019ba333e8d62.js} +0 -0
  142. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  143. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  144. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  145. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/server/metrics.py ADDED
@@ -0,0 +1,105 @@
1
+ """Instrumentation for the API server."""
2
+
3
+ import asyncio
4
+ import os
5
+ import time
6
+
7
+ import fastapi
8
+ from prometheus_client import generate_latest
9
+ from prometheus_client import multiprocess
10
+ import prometheus_client as prom
11
+ import starlette.middleware.base
12
+ import uvicorn
13
+
14
+ from sky import sky_logging
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
# Counter of all API server requests, labeled by path, method, and
# status-code class (e.g. '2xx') to keep label cardinality bounded.
sky_apiserver_requests_total = prom.Counter(
    'sky_apiserver_requests_total',
    'Total number of API server requests',
    ['path', 'method', 'status'],
)

# Latency histogram for non-streaming API requests, with the same labels
# as the counter above.
sky_apiserver_request_duration_seconds = prom.Histogram(
    'sky_apiserver_request_duration_seconds',
    'Time spent processing API server requests',
    ['path', 'method', 'status'],
    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
             float('inf')),
)

# Dedicated FastAPI app that serves only the /metrics endpoint, run on a
# separate port from the main API server.
metrics_app = fastapi.FastAPI()
36
+
37
+
38
@metrics_app.get('/metrics')
async def metrics() -> fastapi.Response:
    """Expose aggregated Prometheus metrics from all worker processes."""
    multiproc_dir = os.environ.get('PROMETHEUS_MULTIPROC_DIR')
    if multiproc_dir:
        # Multiprocess mode: aggregate metrics across all worker processes
        # via a fresh registry backed by the shared metrics directory.
        registry = prom.CollectorRegistry()
        multiprocess.MultiProcessCollector(registry)
        payload = generate_latest(registry)
    else:
        # Single-process mode: use the default global registry.
        payload = generate_latest()
    return fastapi.Response(content=payload,
                            media_type=prom.CONTENT_TYPE_LATEST,
                            headers={'Cache-Control': 'no-cache'})
51
+
52
+
53
def run_metrics_server(host: str, port: int):
    """Run the standalone metrics server; blocks until the server exits.

    Args:
        host: Address to bind the metrics endpoint to.
        port: Port to serve /metrics on (separate from the API port).
    """
    server = uvicorn.Server(
        uvicorn.Config(
            'sky.server.metrics:metrics_app',
            host=host,
            port=port,
            workers=1,
        ))
    asyncio.run(server.serve())
62
+
63
+
64
+ def _get_status_code_group(status_code: int) -> str:
65
+ """Group status codes into classes (2xx, 5xx) to reduce cardinality."""
66
+ return f'{status_code // 100}xx'
67
+
68
+
69
+ def _is_streaming_api(path: str) -> bool:
70
+ """Check if the path is a streaming API."""
71
+ path = path.rstrip('/')
72
+ return path.endswith('/logs') or path.endswith('/api/stream')
73
+
74
+
75
class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware to collect Prometheus metrics for HTTP requests.

    Increments the request counter for every request and records the
    request duration for non-streaming requests. Status is recorded as a
    class ('2xx', '5xx', ...) to bound label cardinality; an uncaught
    exception is recorded as '5xx' and re-raised.
    """

    async def dispatch(self, request: fastapi.Request, call_next):
        path = request.url.path
        # Fix: this runs on every request — use debug level with lazy
        # %-formatting instead of an info-level f-string, which spammed the
        # log and formatted the message even when not emitted.
        logger.debug('PROM middleware request: %s %s', request.method, path)
        streaming = _is_streaming_api(path)
        if not streaming:
            # Exclude streaming APIs, the duration is not meaningful.
            # TODO(aylei): measure the duration of async execution instead.
            start_time = time.time()
        method = request.method
        status_code_group = ''

        try:
            response = await call_next(request)
            status_code_group = _get_status_code_group(response.status_code)
        except Exception:  # pylint: disable=broad-except
            status_code_group = '5xx'
            raise
        finally:
            # Counter is always updated, even when the handler raised.
            sky_apiserver_requests_total.labels(
                path=path, method=method, status=status_code_group).inc()
            if not streaming:
                duration = time.time() - start_time
                sky_apiserver_request_duration_seconds.labels(
                    path=path, method=method,
                    status=status_code_group).observe(duration)

        return response
@@ -149,10 +149,25 @@ class RequestWorker:
149
149
  self.schedule_type = schedule_type
150
150
  self.garanteed_parallelism = config.garanteed_parallelism
151
151
  self.burstable_parallelism = config.burstable_parallelism
152
+ self._thread: Optional[threading.Thread] = None
153
+ self._cancel_event = threading.Event()
152
154
 
153
155
  def __str__(self) -> str:
154
156
  return f'Worker(schedule_type={self.schedule_type.value})'
155
157
 
158
+ def run_in_background(self) -> None:
159
+ # Thread dispatcher is sufficient for current scale, refer to
160
+ # tests/load_tests/test_queue_dispatcher.py for more details.
161
+ # Use daemon thread for automatic cleanup.
162
+ thread = threading.Thread(target=self.run, daemon=True)
163
+ thread.start()
164
+ self._thread = thread
165
+
166
+ def cancel(self) -> None:
167
+ if self._thread is not None:
168
+ self._cancel_event.set()
169
+ self._thread.join()
170
+
156
171
  def process_request(self, executor: process.BurstableExecutor,
157
172
  queue: RequestQueue) -> None:
158
173
  try:
@@ -219,7 +234,7 @@ class RequestWorker:
219
234
  burst_workers=self.burstable_parallelism,
220
235
  initializer=executor_initializer,
221
236
  initargs=(proc_group,))
222
- while True:
237
+ while not self._cancel_event.is_set():
223
238
  self.process_request(executor, queue)
224
239
  # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
225
240
  except KeyboardInterrupt:
@@ -539,15 +554,21 @@ def schedule_request(request_id: str,
539
554
  enqueue()
540
555
 
541
556
 
542
- def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
557
+ def start(
558
+ config: server_config.ServerConfig
559
+ ) -> Tuple[Optional[multiprocessing.Process], List[RequestWorker]]:
543
560
  """Start the request workers.
544
561
 
545
562
  Request workers run in background, schedule the requests and delegate the
546
563
  request execution to executor processes.
564
+
565
+ Returns:
566
+ A tuple of the queue server process and the list of request worker
567
+ threads.
547
568
  """
548
569
  global queue_backend
549
570
  queue_backend = config.queue_backend
550
- sub_procs = []
571
+ queue_server = None
551
572
  # Setup the queues.
552
573
  if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
553
574
  logger.info('Creating shared request queues')
@@ -564,7 +585,6 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
564
585
  queue_server = multiprocessing.Process(
565
586
  target=mp_queue.start_queue_manager, args=(queue_names, port))
566
587
  queue_server.start()
567
- sub_procs.append(queue_server)
568
588
  mp_queue.wait_for_queues_to_be_ready(queue_names,
569
589
  queue_server,
570
590
  port=port)
@@ -577,20 +597,16 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
577
597
 
578
598
  logger.info('Request queues created')
579
599
 
580
- def run_worker_in_background(worker: RequestWorker):
581
- # Thread dispatcher is sufficient for current scale, refer to
582
- # tests/load_tests/test_queue_dispatcher.py for more details.
583
- # Use daemon thread for automatic cleanup.
584
- thread = threading.Thread(target=worker.run, daemon=True)
585
- thread.start()
586
-
600
+ workers = []
587
601
  # Start a worker for long requests.
588
602
  long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
589
603
  config=config.long_worker_config)
590
- run_worker_in_background(long_worker)
604
+ long_worker.run_in_background()
605
+ workers.append(long_worker)
591
606
 
592
607
  # Start a worker for short requests.
593
608
  short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
594
609
  config=config.short_worker_config)
595
- run_worker_in_background(short_worker)
596
- return sub_procs
610
+ short_worker.run_in_background()
611
+ workers.append(short_worker)
612
+ return queue_server, workers
@@ -368,6 +368,22 @@ class StorageBody(RequestBody):
368
368
  name: str
369
369
 
370
370
 
371
class VolumeApplyBody(RequestBody):
    """The request body for the volume apply endpoint."""
    # Identifier of the volume to create or update.
    name: str
    # Kind of volume being provisioned.
    volume_type: str
    # Target cloud provider.
    cloud: str
    # Optional placement constraints.
    region: Optional[str] = None
    zone: Optional[str] = None
    # Requested size; optional — presumably parsed by the server. Verify.
    size: Optional[str] = None
    # Extra cloud/volume-specific configuration.
    config: Optional[Dict[str, Any]] = None
380
+
381
+
382
class VolumeDeleteBody(RequestBody):
    """The request body for the volume delete endpoint."""
    # Names of the volumes to delete.
    names: List[str]
385
+
386
+
371
387
  class EndpointsBody(RequestBody):
372
388
  """The request body for the endpoint."""
373
389
  cluster: str
@@ -38,6 +38,7 @@ REQUEST_TABLE = 'requests'
38
38
  COL_CLUSTER_NAME = 'cluster_name'
39
39
  COL_USER_ID = 'user_id'
40
40
  COL_STATUS_MSG = 'status_msg'
41
+ COL_SHOULD_RETRY = 'should_retry'
41
42
  REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
42
43
 
43
44
  # TODO(zhwu): For scalability, there are several TODOs:
@@ -86,6 +87,7 @@ REQUEST_COLUMNS = [
86
87
  'schedule_type',
87
88
  COL_USER_ID,
88
89
  COL_STATUS_MSG,
90
+ COL_SHOULD_RETRY,
89
91
  ]
90
92
 
91
93
 
@@ -115,6 +117,7 @@ class RequestPayload:
115
117
  # Resources the request operates on.
116
118
  cluster_name: Optional[str] = None
117
119
  status_msg: Optional[str] = None
120
+ should_retry: bool = False
118
121
 
119
122
 
120
123
  @dataclasses.dataclass
@@ -137,6 +140,8 @@ class Request:
137
140
  cluster_name: Optional[str] = None
138
141
  # Status message of the request, indicates the reason of current status.
139
142
  status_msg: Optional[str] = None
143
+ # Whether the request should be retried.
144
+ should_retry: bool = False
140
145
 
141
146
  @property
142
147
  def log_path(self) -> pathlib.Path:
@@ -222,6 +227,7 @@ class Request:
222
227
  user_name=user_name,
223
228
  cluster_name=self.cluster_name,
224
229
  status_msg=self.status_msg,
230
+ should_retry=self.should_retry,
225
231
  )
226
232
 
227
233
  def encode(self) -> RequestPayload:
@@ -243,6 +249,7 @@ class Request:
243
249
  user_id=self.user_id,
244
250
  cluster_name=self.cluster_name,
245
251
  status_msg=self.status_msg,
252
+ should_retry=self.should_retry,
246
253
  )
247
254
  except (TypeError, ValueError) as e:
248
255
  # The error is unexpected, so we don't suppress the stack trace.
@@ -274,6 +281,7 @@ class Request:
274
281
  user_id=payload.user_id,
275
282
  cluster_name=payload.cluster_name,
276
283
  status_msg=payload.status_msg,
284
+ should_retry=payload.should_retry,
277
285
  )
278
286
  except (TypeError, ValueError) as e:
279
287
  logger.error(
@@ -327,6 +335,24 @@ def refresh_cluster_status_event():
327
335
  time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
328
336
 
329
337
 
338
def refresh_volume_status_event():
    """Periodically refresh the volume status (runs forever)."""
    # pylint: disable=import-outside-toplevel
    from sky.volumes.server import core

    # Disable logging for periodic refresh to avoid the usage message being
    # sent multiple times.
    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'

    interval = server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS
    while True:
        logger.info('=== Refreshing volume status ===')
        core.volume_refresh()
        logger.info(f'Volume status refreshed. Sleeping {interval}'
                    ' seconds for the next refresh...\n')
        time.sleep(interval)
354
+
355
+
330
356
  def managed_job_status_refresh_event():
331
357
  """Refresh the managed job status for controller consolidation mode."""
332
358
  # pylint: disable=import-outside-toplevel
@@ -362,6 +388,10 @@ INTERNAL_REQUEST_DAEMONS = [
362
388
  InternalRequestDaemon(id='skypilot-status-refresh-daemon',
363
389
  name='status',
364
390
  event_fn=refresh_cluster_status_event),
391
+ # Volume status refresh daemon to update the volume status periodically.
392
+ InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
393
+ name='volume',
394
+ event_fn=refresh_volume_status_event),
365
395
  InternalRequestDaemon(id='managed-job-status-refresh-daemon',
366
396
  name='managed-job-status',
367
397
  event_fn=managed_job_status_refresh_event),
@@ -446,10 +476,14 @@ def create_table(cursor, conn):
446
476
  {COL_CLUSTER_NAME} TEXT,
447
477
  schedule_type TEXT,
448
478
  {COL_USER_ID} TEXT,
449
- {COL_STATUS_MSG} TEXT)""")
479
+ {COL_STATUS_MSG} TEXT,
480
+ {COL_SHOULD_RETRY} INTEGER
481
+ )""")
450
482
 
451
483
  db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
452
484
  'TEXT')
485
+ db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
486
+ 'INTEGER')
453
487
 
454
488
 
455
489
  _DB = None
sky/server/rest.py ADDED
@@ -0,0 +1,152 @@
1
+ """REST API client of SkyPilot API server"""
2
+
3
+ import contextlib
4
+ import contextvars
5
+ import functools
6
+ import time
7
+ import typing
8
+ from typing import Any, Callable, cast, Optional, TypeVar
9
+
10
+ import colorama
11
+
12
+ from sky import exceptions
13
+ from sky import sky_logging
14
+ from sky.adaptors import common as adaptors_common
15
+ from sky.utils import common_utils
16
+ from sky.utils import rich_utils
17
+ from sky.utils import ux_utils
18
+
19
+ logger = sky_logging.init_logger(__name__)
20
+
21
+ if typing.TYPE_CHECKING:
22
+ import requests
23
+
24
+ else:
25
+ requests = adaptors_common.LazyImport('requests')
26
+
27
+ F = TypeVar('F', bound=Callable[..., Any])
28
+
29
+ _RETRY_CONTEXT = contextvars.ContextVar('retry_context', default=None)
30
+
31
+
32
class RetryContext:
    """Mutable state shared across attempts of one retry loop."""

    def __init__(self):
        # Presumably counts log lines already consumed before an
        # interruption, so a retried stream can resume — TODO confirm with
        # the stream consumers.
        self.line_processed = 0
36
+
37
+
38
def retry_on_server_unavailable(max_wait_seconds: int = 600,
                                initial_backoff: float = 5.0,
                                max_backoff_factor: int = 5):
    """Decorator that retries a function when ServerTemporarilyUnavailableError
    is caught.

    Args:
        max_wait_seconds: Maximum number of seconds to wait for the server to
            be healthy
        initial_backoff: Initial backoff time in seconds
        max_backoff_factor: Maximum backoff factor for exponential backoff

    Notes(dev):
        This decorator is mainly used in two scenarios:
        1. Decorate a Restful API call to make the API call wait for server
           recovery when server is temporarily unavailable. APIs like /api/get
           and /api/stream should not be retried since sending them to a new
           replica of API server will not work.
        2. Decorate a SDK function to make the entire SDK function call get
           retried when /api/get or /logs raises a retryable error. This
           is typically triggered by a graceful upgrade of the API server,
           where the pending requests and logs requests will be interrupted.
    """

    def decorator(func: F) -> F:

        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            # Fix: the message previously read 'API server is temporarily:
            # upgrade in progress' — the word 'unavailable' was missing.
            msg = (f'{colorama.Fore.YELLOW}API server is temporarily '
                   'unavailable: upgrade in progress. Waiting to resume...'
                   f'{colorama.Style.RESET_ALL}')
            backoff = common_utils.Backoff(
                initial_backoff=initial_backoff,
                max_backoff_factor=max_backoff_factor)
            start_time = time.time()
            attempt = 0

            with _retry_in_context():
                while True:
                    attempt += 1
                    try:
                        return func(*args, **kwargs)
                    except exceptions.ServerTemporarilyUnavailableError as e:
                        # This will cause the status spinner being stopped and
                        # restarted in every retry loop. But it is necessary to
                        # stop the status spinner before retrying func() to
                        # avoid the status spinner get stuck if the func() runs
                        # for a long time without update status, e.g. sky logs.
                        with rich_utils.client_status(msg):
                            if time.time() - start_time > max_wait_seconds:
                                # pylint: disable=line-too-long
                                raise exceptions.ServerTemporarilyUnavailableError(
                                    'Timeout waiting for the API server to be '
                                    f'available after {max_wait_seconds}s.') \
                                    from e

                            sleep_time = backoff.current_backoff()
                            time.sleep(sleep_time)
                            logger.debug('The API server is unavailable. '
                                         f'Retrying {func.__name__} '
                                         f'(attempt {attempt}, '
                                         f'backoff {sleep_time}s).')

        return cast(F, wrapper)

    return decorator
104
+
105
+
106
@contextlib.contextmanager
def _retry_in_context():
    """Install a fresh RetryContext for the duration of a retry loop.

    The context variable is always restored on exit, even if the body
    raises.
    """
    token = _RETRY_CONTEXT.set(RetryContext())
    try:
        yield
    finally:
        _RETRY_CONTEXT.reset(token)
113
+
114
+
115
def get_retry_context() -> Optional[RetryContext]:
    """Return the active RetryContext, or None outside a retry loop."""
    return _RETRY_CONTEXT.get()
117
+
118
+
119
def handle_server_unavailable(response: 'requests.Response') -> None:
    """Raise ServerTemporarilyUnavailableError if the response is a 503."""
    if response.status_code != 503:
        return
    # TODO(aylei): Hacky, depends on how nginx controller handles backends
    # with no ready endpoints. Should use self-defined status code or header
    # to distinguish retryable server error from general 503 errors.
    with ux_utils.print_exception_no_traceback():
        raise exceptions.ServerTemporarilyUnavailableError(
            'SkyPilot API server is temporarily unavailable. '
            'Please try again later.')
128
+
129
+
130
@retry_on_server_unavailable()
def post(url, data=None, json=None, **kwargs) -> 'requests.Response':
    """Send a POST request to the API server, retry on server temporarily
    unavailable."""
    resp = requests.post(url, data=data, json=json, **kwargs)
    # Converts a 503 into a retryable exception for the decorator.
    handle_server_unavailable(resp)
    return resp
137
+
138
+
139
@retry_on_server_unavailable()
def get(url, params=None, **kwargs) -> 'requests.Response':
    """Send a GET request to the API server, retry on server temporarily
    unavailable."""
    resp = requests.get(url, params=params, **kwargs)
    # Converts a 503 into a retryable exception for the decorator.
    handle_server_unavailable(resp)
    return resp
146
+
147
+
148
def get_without_retry(url, params=None, **kwargs) -> 'requests.Response':
    """Send a GET request to the API server without retry.

    Still raises ServerTemporarilyUnavailableError on a 503, but does not
    loop — callers handle the error themselves.
    """
    resp = requests.get(url, params=params, **kwargs)
    handle_server_unavailable(resp)
    return resp
sky/server/server.py CHANGED
@@ -16,6 +16,7 @@ import posixpath
16
16
  import re
17
17
  import shutil
18
18
  import sys
19
+ import threading
19
20
  from typing import Any, Dict, List, Literal, Optional, Set, Tuple
20
21
  import uuid
21
22
  import zipfile
@@ -43,6 +44,8 @@ from sky.serve.server import server as serve_rest
43
44
  from sky.server import common
44
45
  from sky.server import config as server_config
45
46
  from sky.server import constants as server_constants
47
+ from sky.server import metrics
48
+ from sky.server import state
46
49
  from sky.server import stream_utils
47
50
  from sky.server.requests import executor
48
51
  from sky.server.requests import payloads
@@ -61,6 +64,7 @@ from sky.utils import dag_utils
61
64
  from sky.utils import env_options
62
65
  from sky.utils import status_lib
63
66
  from sky.utils import subprocess_utils
67
+ from sky.volumes.server import server as volumes_rest
64
68
  from sky.workspaces import server as workspaces_rest
65
69
 
66
70
  # pylint: disable=ungrouped-imports
@@ -378,9 +382,32 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
378
382
  return await call_next(request)
379
383
 
380
384
 
385
class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware to control requests when server is shutting down."""

    async def dispatch(self, request: fastapi.Request, call_next):
        if not state.get_block_requests():
            return await call_next(request)
        # /api/ paths stay open: they are needed to operate on-going
        # requests (poll, cancel, stream) but do not submit new work.
        if request.url.path.startswith('/api/'):
            return await call_next(request)
        # Client will retry on 503 error.
        return fastapi.responses.JSONResponse(
            status_code=503,
            content={
                'detail': 'Server is shutting down, '
                          'please try again later.'
            })
402
+
403
+
381
404
  app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
405
+ # Use environment variable to make the metrics middleware optional.
406
+ if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
407
+ app.add_middleware(metrics.PrometheusMiddleware)
382
408
  app.add_middleware(RBACMiddleware)
383
409
  app.add_middleware(InternalDashboardPrefixMiddleware)
410
+ app.add_middleware(GracefulShutdownMiddleware)
384
411
  app.add_middleware(PathCleanMiddleware)
385
412
  app.add_middleware(CacheControlStaticMiddleware)
386
413
  app.add_middleware(
@@ -404,6 +431,7 @@ app.include_router(users_rest.router, prefix='/users', tags=['users'])
404
431
  app.include_router(workspaces_rest.router,
405
432
  prefix='/workspaces',
406
433
  tags=['workspaces'])
434
+ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
407
435
 
408
436
 
409
437
  @app.get('/token')
@@ -564,6 +592,8 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
564
592
  ctx.override_envs(validate_body.env_vars)
565
593
 
566
594
  def validate_dag(dag: dag_utils.dag_lib.Dag):
595
+ # Resolve the volumes before admin policy and validation.
596
+ dag.resolve_and_validate_volumes()
567
597
  # TODO: Admin policy may contain arbitrary code, which may be expensive
568
598
  # to run and may block the server thread. However, moving it into the
569
599
  # executor adds a ~150ms penalty on the local API server because of
@@ -826,6 +856,10 @@ async def status(
826
856
  status_body: payloads.StatusBody = payloads.StatusBody()
827
857
  ) -> None:
828
858
  """Gets cluster statuses."""
859
+ if state.get_block_requests():
860
+ raise fastapi.HTTPException(
861
+ status_code=503,
862
+ detail='Server is shutting down, please try again later.')
829
863
  executor.schedule_request(
830
864
  request_id=request.state.request_id,
831
865
  request_name='status',
@@ -1145,6 +1179,10 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
1145
1179
  raise fastapi.HTTPException(
1146
1180
  status_code=404, detail=f'Request {request_id!r} not found')
1147
1181
  if request_task.status > requests_lib.RequestStatus.RUNNING:
1182
+ if request_task.should_retry:
1183
+ raise fastapi.HTTPException(
1184
+ status_code=503,
1185
+ detail=f'Request {request_id!r} should be retried')
1148
1186
  request_error = request_task.get_error()
1149
1187
  if request_error is not None:
1150
1188
  raise fastapi.HTTPException(status_code=500,
@@ -1435,6 +1473,11 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
1435
1473
  return global_user_state.get_storage_names_start_with(incomplete)
1436
1474
 
1437
1475
 
1476
@app.get('/api/completion/volume_name')
async def complete_volume_name(incomplete: str,) -> List[str]:
    """Shell-completion helper: volume names starting with `incomplete`."""
    return global_user_state.get_volume_names_start_with(incomplete)
1479
+
1480
+
1438
1481
  @app.get('/dashboard/{full_path:path}')
1439
1482
  async def serve_dashboard(full_path: str):
1440
1483
  """Serves the Next.js dashboard application.
@@ -1461,6 +1504,7 @@ async def serve_dashboard(full_path: str):
1461
1504
  try:
1462
1505
  with open(index_path, 'r', encoding='utf-8') as f:
1463
1506
  content = f.read()
1507
+
1464
1508
  return fastapi.responses.HTMLResponse(content=content)
1465
1509
  except Exception as e:
1466
1510
  logger.error(f'Error serving dashboard: {e}')
@@ -1484,7 +1528,13 @@ if __name__ == '__main__':
1484
1528
  parser.add_argument('--host', default='127.0.0.1')
1485
1529
  parser.add_argument('--port', default=46580, type=int)
1486
1530
  parser.add_argument('--deploy', action='store_true')
1531
+ # Serve metrics on a separate port to isolate it from the application APIs:
1532
+ # metrics port will not be exposed to the public network typically.
1533
+ parser.add_argument('--metrics-port', default=9090, type=int)
1487
1534
  cmd_args = parser.parse_args()
1535
+ if cmd_args.port == cmd_args.metrics_port:
1536
+ raise ValueError('port and metrics-port cannot be the same')
1537
+
1488
1538
  # Show the privacy policy if it is not already shown. We place it here so
1489
1539
  # that it is shown only when the API server is started.
1490
1540
  usage_lib.maybe_show_privacy_policy()
@@ -1492,9 +1542,17 @@ if __name__ == '__main__':
1492
1542
  config = server_config.compute_server_config(cmd_args.deploy)
1493
1543
  num_workers = config.num_server_workers
1494
1544
 
1495
- sub_procs = []
1545
+ queue_server: Optional[multiprocessing.Process] = None
1546
+ workers: List[executor.RequestWorker] = []
1496
1547
  try:
1497
- sub_procs = executor.start(config)
1548
+ if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
1549
+ metrics_thread = threading.Thread(target=metrics.run_metrics_server,
1550
+ args=(cmd_args.host,
1551
+ cmd_args.metrics_port),
1552
+ daemon=True)
1553
+ metrics_thread.start()
1554
+ queue_server, workers = executor.start(config)
1555
+
1498
1556
  logger.info(f'Starting SkyPilot API server, workers={num_workers}')
1499
1557
  # We don't support reload for now, since it may cause leakage of request
1500
1558
  # workers or interrupt running requests.
@@ -1510,17 +1568,9 @@ if __name__ == '__main__':
1510
1568
  finally:
1511
1569
  logger.info('Shutting down SkyPilot API server...')
1512
1570
 
1513
- def cleanup(proc: multiprocessing.Process) -> None:
1514
- try:
1515
- proc.terminate()
1516
- proc.join()
1517
- finally:
1518
- # The process may not be started yet, close it anyway.
1519
- proc.close()
1520
-
1521
- # Terminate processes in reverse order in case dependency, especially
1522
- # queue server. Terminate queue server first does not affect the
1523
- # correctness of cleanup but introduce redundant error messages.
1524
- subprocess_utils.run_in_parallel(cleanup,
1525
- list(reversed(sub_procs)),
1526
- num_threads=len(sub_procs))
1571
+ subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
1572
+ workers,
1573
+ num_threads=len(workers))
1574
+ if queue_server is not None:
1575
+ queue_server.kill()
1576
+ queue_server.join()
sky/server/state.py ADDED
@@ -0,0 +1,20 @@
1
"""State for API server process."""

# When True, every endpoint except /api operations is rejected. Used while
# a server is shutting down: new requests are blocked, but in-flight ones
# can still be operated on via /api/logs, /api/cancel, etc.
_block_requests = False


# TODO(aylei): refactor, state should be a instance property of API server app
# instead of a global variable.
def get_block_requests() -> bool:
    """Whether block requests except /api operations."""
    return _block_requests


def set_block_requests(shutting_down: bool) -> None:
    """Set the API server to block requests except /api operations."""
    global _block_requests
    _block_requests = shutting_down