skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +1 -6
  3. sky/backends/backend_utils.py +26 -11
  4. sky/backends/cloud_vm_ray_backend.py +16 -5
  5. sky/client/cli/command.py +232 -9
  6. sky/client/sdk.py +195 -91
  7. sky/clouds/aws.py +10 -7
  8. sky/clouds/azure.py +10 -7
  9. sky/clouds/cloud.py +2 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +10 -7
  12. sky/clouds/fluidstack.py +2 -0
  13. sky/clouds/gcp.py +10 -7
  14. sky/clouds/hyperbolic.py +10 -7
  15. sky/clouds/ibm.py +2 -0
  16. sky/clouds/kubernetes.py +26 -9
  17. sky/clouds/lambda_cloud.py +10 -7
  18. sky/clouds/nebius.py +10 -7
  19. sky/clouds/oci.py +10 -7
  20. sky/clouds/paperspace.py +10 -7
  21. sky/clouds/runpod.py +10 -7
  22. sky/clouds/scp.py +10 -7
  23. sky/clouds/ssh.py +36 -0
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +21 -0
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
  31. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
  39. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  40. sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
  43. sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
  59. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
  62. sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
  63. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  64. sky/dashboard/out/clusters/[cluster].html +1 -1
  65. sky/dashboard/out/clusters.html +1 -1
  66. sky/dashboard/out/config.html +1 -1
  67. sky/dashboard/out/index.html +1 -1
  68. sky/dashboard/out/infra/[context].html +1 -1
  69. sky/dashboard/out/infra.html +1 -1
  70. sky/dashboard/out/jobs/[job].html +1 -1
  71. sky/dashboard/out/jobs.html +1 -1
  72. sky/dashboard/out/users.html +1 -1
  73. sky/dashboard/out/volumes.html +1 -0
  74. sky/dashboard/out/workspace/new.html +1 -1
  75. sky/dashboard/out/workspaces/[name].html +1 -1
  76. sky/dashboard/out/workspaces.html +1 -1
  77. sky/data/storage_utils.py +2 -4
  78. sky/exceptions.py +15 -0
  79. sky/execution.py +5 -0
  80. sky/global_user_state.py +129 -0
  81. sky/jobs/client/sdk.py +13 -11
  82. sky/jobs/server/core.py +4 -0
  83. sky/models.py +16 -0
  84. sky/provision/__init__.py +26 -0
  85. sky/provision/kubernetes/__init__.py +3 -0
  86. sky/provision/kubernetes/instance.py +38 -77
  87. sky/provision/kubernetes/utils.py +70 -4
  88. sky/provision/kubernetes/volume.py +147 -0
  89. sky/resources.py +20 -76
  90. sky/serve/client/sdk.py +13 -13
  91. sky/serve/server/core.py +5 -1
  92. sky/server/common.py +40 -5
  93. sky/server/constants.py +5 -1
  94. sky/server/metrics.py +105 -0
  95. sky/server/requests/executor.py +30 -14
  96. sky/server/requests/payloads.py +16 -0
  97. sky/server/requests/requests.py +35 -1
  98. sky/server/rest.py +153 -0
  99. sky/server/server.py +70 -43
  100. sky/server/state.py +20 -0
  101. sky/server/stream_utils.py +8 -3
  102. sky/server/uvicorn.py +153 -13
  103. sky/setup_files/dependencies.py +2 -0
  104. sky/skylet/constants.py +19 -3
  105. sky/skypilot_config.py +3 -0
  106. sky/ssh_node_pools/__init__.py +1 -0
  107. sky/ssh_node_pools/core.py +133 -0
  108. sky/ssh_node_pools/server.py +232 -0
  109. sky/task.py +141 -18
  110. sky/templates/kubernetes-ray.yml.j2 +30 -1
  111. sky/users/permission.py +2 -0
  112. sky/utils/context.py +3 -1
  113. sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
  114. sky/utils/kubernetes/ssh_utils.py +221 -0
  115. sky/utils/resources_utils.py +66 -0
  116. sky/utils/rich_utils.py +6 -0
  117. sky/utils/schemas.py +146 -3
  118. sky/utils/status_lib.py +10 -0
  119. sky/utils/validator.py +11 -1
  120. sky/volumes/__init__.py +0 -0
  121. sky/volumes/client/__init__.py +0 -0
  122. sky/volumes/client/sdk.py +64 -0
  123. sky/volumes/server/__init__.py +0 -0
  124. sky/volumes/server/core.py +199 -0
  125. sky/volumes/server/server.py +85 -0
  126. sky/volumes/utils.py +158 -0
  127. sky/volumes/volume.py +198 -0
  128. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
  130. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  134. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  139. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
  141. sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  145. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  156. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  157. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  158. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
  159. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  160. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
  161. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
  162. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
  163. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/server/requests/requests.py CHANGED
@@ -38,6 +38,7 @@ REQUEST_TABLE = 'requests'
 COL_CLUSTER_NAME = 'cluster_name'
 COL_USER_ID = 'user_id'
 COL_STATUS_MSG = 'status_msg'
+COL_SHOULD_RETRY = 'should_retry'
 REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
 
 # TODO(zhwu): For scalability, there are several TODOs:
@@ -86,6 +87,7 @@ REQUEST_COLUMNS = [
     'schedule_type',
     COL_USER_ID,
     COL_STATUS_MSG,
+    COL_SHOULD_RETRY,
 ]
 
 
@@ -115,6 +117,7 @@ class RequestPayload:
     # Resources the request operates on.
     cluster_name: Optional[str] = None
     status_msg: Optional[str] = None
+    should_retry: bool = False
 
 
 @dataclasses.dataclass
@@ -137,6 +140,8 @@ class Request:
     cluster_name: Optional[str] = None
     # Status message of the request, indicates the reason of current status.
    status_msg: Optional[str] = None
+    # Whether the request should be retried.
+    should_retry: bool = False
 
     @property
     def log_path(self) -> pathlib.Path:
@@ -222,6 +227,7 @@ class Request:
            user_name=user_name,
            cluster_name=self.cluster_name,
            status_msg=self.status_msg,
+           should_retry=self.should_retry,
         )
 
     def encode(self) -> RequestPayload:
@@ -243,6 +249,7 @@ class Request:
                user_id=self.user_id,
                cluster_name=self.cluster_name,
                status_msg=self.status_msg,
+               should_retry=self.should_retry,
             )
         except (TypeError, ValueError) as e:
             # The error is unexpected, so we don't suppress the stack trace.
@@ -274,6 +281,7 @@ class Request:
                user_id=payload.user_id,
                cluster_name=payload.cluster_name,
                status_msg=payload.status_msg,
+               should_retry=payload.should_retry,
             )
         except (TypeError, ValueError) as e:
             logger.error(
@@ -327,6 +335,24 @@ def refresh_cluster_status_event():
         time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
 
 
+def refresh_volume_status_event():
+    """Periodically refresh the volume status."""
+    # pylint: disable=import-outside-toplevel
+    from sky.volumes.server import core
+
+    # Disable logging for periodic refresh to avoid the usage message being
+    # sent multiple times.
+    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
+
+    while True:
+        logger.info('=== Refreshing volume status ===')
+        core.volume_refresh()
+        logger.info('Volume status refreshed. Sleeping '
+                    f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
+                    ' seconds for the next refresh...\n')
+        time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
+
+
 def managed_job_status_refresh_event():
     """Refresh the managed job status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
@@ -362,6 +388,10 @@ INTERNAL_REQUEST_DAEMONS = [
     InternalRequestDaemon(id='skypilot-status-refresh-daemon',
                           name='status',
                           event_fn=refresh_cluster_status_event),
+    # Volume status refresh daemon to update the volume status periodically.
+    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
+                          name='volume',
+                          event_fn=refresh_volume_status_event),
     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
                           name='managed-job-status',
                           event_fn=managed_job_status_refresh_event),
@@ -446,10 +476,14 @@ def create_table(cursor, conn):
         {COL_CLUSTER_NAME} TEXT,
         schedule_type TEXT,
         {COL_USER_ID} TEXT,
-        {COL_STATUS_MSG} TEXT)""")
+        {COL_STATUS_MSG} TEXT,
+        {COL_SHOULD_RETRY} INTEGER
+        )""")
 
     db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
                                  'TEXT')
+    db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
+                                 'INTEGER')
 
 
 _DB = None
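
Note on the schema change above: `create_table` now includes the `should_retry` column for freshly created databases, while `db_utils.add_column_to_table` upgrades existing ones in place. The snippet below is a minimal standalone sketch of that idempotent add-column pattern using plain sqlite3; it is illustrative only and is not SkyPilot's actual `db_utils` helper.

# Illustrative only: an idempotent "add column if missing" helper in plain
# sqlite3, mirroring the pattern above (CREATE TABLE for fresh databases plus
# an add-column call for existing ones). Not SkyPilot's db_utils.
import sqlite3


def add_column_if_missing(conn: sqlite3.Connection, table: str, column: str,
                          col_type: str) -> None:
    """Add `column` to `table` unless it already exists."""
    cols = {row[1] for row in conn.execute(f'PRAGMA table_info({table})')}
    if column not in cols:
        conn.execute(f'ALTER TABLE {table} ADD COLUMN {column} {col_type}')
        conn.commit()


conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE requests (request_id TEXT, status_msg TEXT)')
# Simulate upgrading an existing database to the new schema.
add_column_if_missing(conn, 'requests', 'should_retry', 'INTEGER')
add_column_if_missing(conn, 'requests', 'should_retry', 'INTEGER')  # No-op.
print([row[1] for row in conn.execute('PRAGMA table_info(requests)')])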
sky/server/rest.py ADDED
@@ -0,0 +1,153 @@
+"""REST API client of SkyPilot API server"""
+
+import contextlib
+import contextvars
+import functools
+import time
+import typing
+from typing import Any, Callable, cast, Optional, TypeVar
+
+import colorama
+
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import common as adaptors_common
+from sky.utils import common_utils
+from sky.utils import rich_utils
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+if typing.TYPE_CHECKING:
+    import requests
+
+else:
+    requests = adaptors_common.LazyImport('requests')
+
+F = TypeVar('F', bound=Callable[..., Any])
+
+_RETRY_CONTEXT = contextvars.ContextVar('retry_context', default=None)
+
+
+class RetryContext:
+
+    def __init__(self):
+        self.line_processed = 0
+
+
+def retry_on_server_unavailable(max_wait_seconds: int = 600,
+                                initial_backoff: float = 5.0,
+                                max_backoff_factor: int = 5):
+    """Decorator that retries a function when ServerTemporarilyUnavailableError
+    is caught.
+
+    Args:
+        max_wait_seconds: Maximum number of seconds to wait for the server to
+            be healthy
+        initial_backoff: Initial backoff time in seconds
+        max_backoff_factor: Maximum backoff factor for exponential backoff
+
+    Notes(dev):
+        This decorator is mainly used in two scenarios:
+        1. Decorate a Restful API call to make the API call wait for server
+           recovery when server is temporarily unavailable. APIs like /api/get
+           and /api/stream should not be retried since sending them to a new
+           replica of API server will not work.
+        2. Decorate a SDK function to make the entire SDK function call get
+           retried when /api/get or /logs raises a retryable error. This
+           is typically triggered by a graceful upgrade of the API server,
+           where the pending requests and logs requests will be interrupted.
+    """
+
+    def decorator(func: F) -> F:
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs) -> Any:
+            msg = (
+                f'{colorama.Fore.YELLOW}API server is temporarily unavailable: '
+                'upgrade in progress. Waiting to resume...'
+                f'{colorama.Style.RESET_ALL}')
+            backoff = common_utils.Backoff(
+                initial_backoff=initial_backoff,
+                max_backoff_factor=max_backoff_factor)
+            start_time = time.time()
+            attempt = 0
+
+            with _retry_in_context():
+                while True:
+                    attempt += 1
+                    try:
+                        return func(*args, **kwargs)
+                    except exceptions.ServerTemporarilyUnavailableError as e:
+                        # This will cause the status spinner being stopped and
+                        # restarted in every retry loop. But it is necessary to
+                        # stop the status spinner before retrying func() to
+                        # avoid the status spinner get stuck if the func() runs
+                        # for a long time without update status, e.g. sky logs.
+                        with rich_utils.client_status(msg):
+                            if time.time() - start_time > max_wait_seconds:
+                                # pylint: disable=line-too-long
+                                raise exceptions.ServerTemporarilyUnavailableError(
+                                    'Timeout waiting for the API server to be '
+                                    f'available after {max_wait_seconds}s.') \
+                                    from e
+
+                            sleep_time = backoff.current_backoff()
+                            time.sleep(sleep_time)
+                            logger.debug('The API server is unavailable. '
+                                         f'Retrying {func.__name__} '
+                                         f'(attempt {attempt}, '
+                                         f'backoff {sleep_time}s).')
+
+        return cast(F, wrapper)
+
+    return decorator
+
+
+@contextlib.contextmanager
+def _retry_in_context():
+    token = _RETRY_CONTEXT.set(RetryContext())
+    try:
+        yield
+    finally:
+        _RETRY_CONTEXT.reset(token)
+
+
+def get_retry_context() -> Optional[RetryContext]:
+    return _RETRY_CONTEXT.get()
+
+
+def handle_server_unavailable(response: 'requests.Response') -> None:
+    if response.status_code == 503:
+        # TODO(aylei): Hacky, depends on how nginx controller handles backends
+        # with no ready endpoints. Should use self-defined status code or header
+        # to distinguish retryable server error from general 503 errors.
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ServerTemporarilyUnavailableError(
+                'SkyPilot API server is temporarily unavailable. '
+                'Please try again later.')
+
+
+@retry_on_server_unavailable()
+def post(url, data=None, json=None, **kwargs) -> 'requests.Response':
+    """Send a POST request to the API server, retry on server temporarily
+    unavailable."""
+    response = requests.post(url, data=data, json=json, **kwargs)
+    handle_server_unavailable(response)
+    return response
+
+
+@retry_on_server_unavailable()
+def get(url, params=None, **kwargs) -> 'requests.Response':
+    """Send a GET request to the API server, retry on server temporarily
+    unavailable."""
+    response = requests.get(url, params=params, **kwargs)
+    handle_server_unavailable(response)
+    return response
+
+
+def get_without_retry(url, params=None, **kwargs) -> 'requests.Response':
+    """Send a GET request to the API server without retry."""
+    response = requests.get(url, params=params, **kwargs)
+    handle_server_unavailable(response)
+    return response
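
The new module above wraps `requests.get`/`requests.post` so that a 503 from the API server (e.g. during a rolling upgrade) becomes `ServerTemporarilyUnavailableError` and the call is retried with exponential backoff for up to `max_wait_seconds`. Below is a minimal standalone sketch of the same pattern, assuming only the third-party `requests` library and a made-up exception class; it omits the spinner and the contextvars-based `RetryContext` and is not the SkyPilot implementation.

# Minimal sketch: retry a request-issuing function while the server answers 503.
import functools
import time

import requests


class ServerTemporarilyUnavailable(Exception):
    pass


def retry_on_503(max_wait_seconds: float = 600, initial_backoff: float = 5.0):

    def decorator(func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start, backoff = time.time(), initial_backoff
            while True:
                try:
                    return func(*args, **kwargs)
                except ServerTemporarilyUnavailable:
                    if time.time() - start > max_wait_seconds:
                        raise
                    time.sleep(backoff)
                    # Cap the exponential backoff at 32x the initial value.
                    backoff = min(backoff * 2, initial_backoff * 32)

        return wrapper

    return decorator


@retry_on_503(max_wait_seconds=60)
def get(url: str, **kwargs) -> requests.Response:
    response = requests.get(url, **kwargs)
    if response.status_code == 503:
        raise ServerTemporarilyUnavailable(f'{url} returned 503')
    return response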
sky/server/server.py CHANGED
@@ -16,6 +16,7 @@ import posixpath
 import re
 import shutil
 import sys
+import threading
 from typing import Any, Dict, List, Literal, Optional, Set, Tuple
 import uuid
 import zipfile
@@ -43,12 +44,15 @@ from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
+from sky.server import metrics
+from sky.server import state
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
+from sky.ssh_node_pools import server as ssh_node_pools_rest
 from sky.usage import usage_lib
 from sky.users import permission
 from sky.users import server as users_rest
@@ -61,6 +65,7 @@ from sky.utils import dag_utils
 from sky.utils import env_options
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
+from sky.volumes.server import server as volumes_rest
 from sky.workspaces import server as workspaces_rest
 
 # pylint: disable=ungrouped-imports
@@ -378,9 +383,32 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         return await call_next(request)
 
 
+class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to control requests when server is shutting down."""
+
+    async def dispatch(self, request: fastapi.Request, call_next):
+        if state.get_block_requests():
+            # Allow /api/ paths to continue, which are critical to operate
+            # on-going requests but will not submit new requests.
+            if not request.url.path.startswith('/api/'):
+                # Client will retry on 503 error.
+                return fastapi.responses.JSONResponse(
+                    status_code=503,
+                    content={
+                        'detail': 'Server is shutting down, '
+                                  'please try again later.'
+                    })
+
+        return await call_next(request)
+
+
 app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
+# Use environment variable to make the metrics middleware optional.
+if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
+    app.add_middleware(metrics.PrometheusMiddleware)
 app.add_middleware(RBACMiddleware)
 app.add_middleware(InternalDashboardPrefixMiddleware)
+app.add_middleware(GracefulShutdownMiddleware)
 app.add_middleware(PathCleanMiddleware)
 app.add_middleware(CacheControlStaticMiddleware)
 app.add_middleware(
@@ -404,6 +432,10 @@ app.include_router(users_rest.router, prefix='/users', tags=['users'])
 app.include_router(workspaces_rest.router,
                    prefix='/workspaces',
                    tags=['workspaces'])
+app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
+app.include_router(ssh_node_pools_rest.router,
+                   prefix='/ssh_node_pools',
+                   tags=['ssh_node_pools'])
 
 
 @app.get('/token')
@@ -564,6 +596,8 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
         ctx.override_envs(validate_body.env_vars)
 
     def validate_dag(dag: dag_utils.dag_lib.Dag):
+        # Resolve the volumes before admin policy and validation.
+        dag.resolve_and_validate_volumes()
         # TODO: Admin policy may contain arbitrary code, which may be expensive
         # to run and may block the server thread. However, moving it into the
         # executor adds a ~150ms penalty on the local API server because of
@@ -826,6 +860,10 @@ async def status(
     status_body: payloads.StatusBody = payloads.StatusBody()
 ) -> None:
     """Gets cluster statuses."""
+    if state.get_block_requests():
+        raise fastapi.HTTPException(
+            status_code=503,
+            detail='Server is shutting down, please try again later.')
     executor.schedule_request(
         request_id=request.state.request_id,
         request_name='status',
@@ -1107,33 +1145,6 @@ async def local_down(request: fastapi.Request) -> None:
     )
 
 
-@app.post('/ssh_up')
-async def ssh_up(request: fastapi.Request,
-                 ssh_up_body: payloads.SSHUpBody) -> None:
-    """Deploys a Kubernetes cluster on SSH targets."""
-    executor.schedule_request(
-        request_id=request.state.request_id,
-        request_name='ssh_up',
-        request_body=ssh_up_body,
-        func=core.ssh_up,
-        schedule_type=requests_lib.ScheduleType.LONG,
-    )
-
-
-@app.post('/ssh_down')
-async def ssh_down(request: fastapi.Request,
-                   ssh_up_body: payloads.SSHUpBody) -> None:
-    """Tears down a Kubernetes cluster on SSH targets."""
-    # We still call ssh_up but with cleanup=True
-    executor.schedule_request(
-        request_id=request.state.request_id,
-        request_name='ssh_down',
-        request_body=ssh_up_body,
-        func=core.ssh_up,  # Reuse ssh_up function with cleanup=True
-        schedule_type=requests_lib.ScheduleType.LONG,
-    )
-
-
 # === API server related APIs ===
 @app.get('/api/get')
 async def api_get(request_id: str) -> requests_lib.RequestPayload:
@@ -1145,6 +1156,10 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
         raise fastapi.HTTPException(
             status_code=404, detail=f'Request {request_id!r} not found')
     if request_task.status > requests_lib.RequestStatus.RUNNING:
+        if request_task.should_retry:
+            raise fastapi.HTTPException(
+                status_code=503,
+                detail=f'Request {request_id!r} should be retried')
         request_error = request_task.get_error()
         if request_error is not None:
             raise fastapi.HTTPException(status_code=500,
@@ -1435,6 +1450,11 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
     return global_user_state.get_storage_names_start_with(incomplete)
 
 
+@app.get('/api/completion/volume_name')
+async def complete_volume_name(incomplete: str,) -> List[str]:
+    return global_user_state.get_volume_names_start_with(incomplete)
+
+
 @app.get('/dashboard/{full_path:path}')
 async def serve_dashboard(full_path: str):
     """Serves the Next.js dashboard application.
@@ -1461,6 +1481,7 @@ async def serve_dashboard(full_path: str):
     try:
         with open(index_path, 'r', encoding='utf-8') as f:
             content = f.read()
+
         return fastapi.responses.HTMLResponse(content=content)
     except Exception as e:
         logger.error(f'Error serving dashboard: {e}')
@@ -1484,7 +1505,13 @@ if __name__ == '__main__':
     parser.add_argument('--host', default='127.0.0.1')
     parser.add_argument('--port', default=46580, type=int)
     parser.add_argument('--deploy', action='store_true')
+    # Serve metrics on a separate port to isolate it from the application APIs:
+    # metrics port will not be exposed to the public network typically.
+    parser.add_argument('--metrics-port', default=9090, type=int)
     cmd_args = parser.parse_args()
+    if cmd_args.port == cmd_args.metrics_port:
+        raise ValueError('port and metrics-port cannot be the same')
+
     # Show the privacy policy if it is not already shown. We place it here so
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
@@ -1492,9 +1519,17 @@ if __name__ == '__main__':
     config = server_config.compute_server_config(cmd_args.deploy)
     num_workers = config.num_server_workers
 
-    sub_procs = []
+    queue_server: Optional[multiprocessing.Process] = None
+    workers: List[executor.RequestWorker] = []
     try:
-        sub_procs = executor.start(config)
+        if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
+            metrics_thread = threading.Thread(target=metrics.run_metrics_server,
+                                              args=(cmd_args.host,
+                                                    cmd_args.metrics_port),
+                                              daemon=True)
+            metrics_thread.start()
+        queue_server, workers = executor.start(config)
+
         logger.info(f'Starting SkyPilot API server, workers={num_workers}')
         # We don't support reload for now, since it may cause leakage of request
         # workers or interrupt running requests.
@@ -1510,17 +1545,9 @@ if __name__ == '__main__':
     finally:
         logger.info('Shutting down SkyPilot API server...')
 
-        def cleanup(proc: multiprocessing.Process) -> None:
-            try:
-                proc.terminate()
-                proc.join()
-            finally:
-                # The process may not be started yet, close it anyway.
-                proc.close()
-
-        # Terminate processes in reverse order in case dependency, especially
-        # queue server. Terminate queue server first does not affect the
-        # correctness of cleanup but introduce redundant error messages.
-        subprocess_utils.run_in_parallel(cleanup,
-                                         list(reversed(sub_procs)),
-                                         num_threads=len(sub_procs))
+        subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
+                                         workers,
+                                         num_threads=len(workers))
+        if queue_server is not None:
+            queue_server.kill()
+            queue_server.join()
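
The new `GracefulShutdownMiddleware` above rejects new work with 503 once the server starts draining, while keeping `/api/*` endpoints reachable so clients can still poll, stream, or cancel in-flight requests (the client in sky/server/rest.py then retries on 503). Below is a stripped-down sketch of the same gate with plain FastAPI, using a module-level flag and made-up endpoint names rather than SkyPilot's real routes.

# Illustrative shutdown gate: new work gets 503, /api/* stays reachable.
import fastapi
from starlette.middleware.base import BaseHTTPMiddleware

blocking = False  # In SkyPilot this flag lives in sky/server/state.py.

app = fastapi.FastAPI()


class ShutdownGate(BaseHTTPMiddleware):

    async def dispatch(self, request: fastapi.Request, call_next):
        # Reject new work while draining, but keep /api/* (get/stream/cancel)
        # available so clients can finish or retry in-flight requests.
        if blocking and not request.url.path.startswith('/api/'):
            return fastapi.responses.JSONResponse(
                status_code=503,
                content={'detail': 'Server is shutting down.'})
        return await call_next(request)


app.add_middleware(ShutdownGate)


@app.get('/api/get')
async def api_get() -> dict:
    return {'status': 'ok'}  # Still reachable while draining.


@app.post('/launch')
async def launch() -> dict:
    return {'status': 'scheduled'}  # Returns 503 once `blocking` is True.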
sky/server/state.py ADDED
@@ -0,0 +1,20 @@
+"""State for API server process."""
+
+# This state is used to block requests except /api operations, which is useful
+# when a server is shutting down: new requests will be blocked, but existing
+# requests will be allowed to finish and be operated via /api operations, e.g.
+# /api/logs, /api/cancel, etc.
+_block_requests = False
+
+
+# TODO(aylei): refactor, state should be a instance property of API server app
+# instead of a global variable.
+def get_block_requests() -> bool:
+    """Whether block requests except /api operations."""
+    return _block_requests
+
+
+def set_block_requests(shutting_down: bool) -> None:
+    """Set the API server to block requests except /api operations."""
+    global _block_requests
+    _block_requests = shutting_down
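
`_block_requests` is a plain module-level flag flipped inside the API server process. A hypothetical drain sequence built on it could look like the sketch below; the actual wiring (in sky/server/uvicorn.py, whose diff is not shown in this section) may differ, and `active_request_count` here is an assumed callable supplied by the caller.

# Hypothetical drain helper: stop accepting new requests, then wait for
# in-flight ones to finish or for the timeout to expire.
import time
from typing import Callable

from sky.server import state


def drain(active_request_count: Callable[[], int],
          poll_interval: float = 1.0,
          timeout: float = 60.0) -> None:
    state.set_block_requests(True)  # New non-/api requests now get 503.
    deadline = time.time() + timeout
    while active_request_count() > 0 and time.time() < deadline:
        time.sleep(poll_interval)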
sky/server/stream_utils.py CHANGED
@@ -155,9 +155,14 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
         if request_task.status > requests_lib.RequestStatus.RUNNING:
             if (request_task.status ==
                     requests_lib.RequestStatus.CANCELLED):
-                buffer.append(
-                    f'{request_task.name!r} request {request_id}'
-                    ' cancelled\n')
+                if request_task.should_retry:
+                    buffer.append(
+                        message_utils.encode_payload(
+                            rich_utils.Control.RETRY.encode('')))
+                else:
+                    buffer.append(
+                        f'{request_task.name!r} request {request_id}'
+                        ' cancelled\n')
             break
         if not follow:
             break
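
With this change, a streamed request that is cancelled because it should be retried (e.g. interrupted by an API server upgrade) emits an encoded RETRY control payload instead of the plain "cancelled" message, so the client can transparently re-issue the request and skip lines it has already shown (cf. `RetryContext.line_processed` in sky/server/rest.py). The sketch below is a hypothetical client-side resume loop with a stand-in sentinel string instead of SkyPilot's encoded control payload.

# Hypothetical resume loop: re-open the stream on a retry signal and skip
# lines that were already yielded.
from typing import Callable, Iterable, Iterator

RETRY_SENTINEL = '<retry>'  # Stand-in for the encoded RETRY control payload.


def stream_with_resume(open_stream: Callable[[], Iterable[str]],
                       max_attempts: int = 5) -> Iterator[str]:
    """Yield log lines, re-opening the stream when a retry signal arrives."""
    lines_shown = 0
    for _ in range(max_attempts):
        retry = False
        for i, line in enumerate(open_stream()):
            if line == RETRY_SENTINEL:
                retry = True
                break
            if i >= lines_shown:
                yield line
                lines_shown += 1
        if not retry:
            return
    raise RuntimeError('Gave up after repeated retry signals.')


# Example: a fake stream that replays from the start after a retry signal.
attempts = [['l1', 'l2', RETRY_SENTINEL], ['l1', 'l2', 'l3', 'l4']]
print(list(stream_with_resume(lambda: attempts.pop(0))))  # ['l1', 'l2', 'l3', 'l4']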