skypilot-nightly 1.0.0.dev20250804__py3-none-any.whl → 1.0.0.dev20250806__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (103) hide show
  1. sky/__init__.py +2 -2
  2. sky/catalog/kubernetes_catalog.py +8 -0
  3. sky/catalog/nebius_catalog.py +0 -1
  4. sky/client/cli/command.py +26 -7
  5. sky/client/sdk.py +16 -8
  6. sky/client/sdk.pyi +6 -5
  7. sky/client/sdk_async.py +811 -0
  8. sky/clouds/kubernetes.py +6 -1
  9. sky/clouds/nebius.py +1 -4
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +11 -0
  14. sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
  18. sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-2a43ea3241bbdacd.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa63e8b1d203f298.js → [job]-7cb24da04ca00956.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9e7df5fc761c95a7.js → [cluster]-1e95993124dbfc57.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/clusters-47f1ddae13a2f8e4.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-2a44e70b500b6b70.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/infra-22faac9325016d83.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +11 -0
  26. sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/users-b90c865a690bfe84.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/volumes-7af733f5d7b6ed1c.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-4d41c9023287f59a.js → [name]-35e0de5bca55e594.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +1 -0
  32. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  33. sky/dashboard/out/clusters/[cluster].html +1 -1
  34. sky/dashboard/out/clusters.html +1 -1
  35. sky/dashboard/out/config.html +1 -1
  36. sky/dashboard/out/index.html +1 -1
  37. sky/dashboard/out/infra/[context].html +1 -1
  38. sky/dashboard/out/infra.html +1 -1
  39. sky/dashboard/out/jobs/[job].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/volumes.html +1 -1
  43. sky/dashboard/out/workspace/new.html +1 -1
  44. sky/dashboard/out/workspaces/[name].html +1 -1
  45. sky/dashboard/out/workspaces.html +1 -1
  46. sky/jobs/client/sdk_async.py +135 -0
  47. sky/jobs/utils.py +3 -1
  48. sky/provision/kubernetes/utils.py +30 -4
  49. sky/provision/nebius/instance.py +1 -0
  50. sky/provision/nebius/utils.py +9 -1
  51. sky/serve/client/sdk_async.py +130 -0
  52. sky/serve/constants.py +2 -1
  53. sky/serve/controller.py +2 -1
  54. sky/serve/load_balancer.py +3 -1
  55. sky/serve/serve_state.py +70 -5
  56. sky/serve/serve_utils.py +124 -22
  57. sky/serve/server/impl.py +22 -21
  58. sky/serve/service.py +8 -1
  59. sky/server/auth/__init__.py +0 -0
  60. sky/server/auth/authn.py +46 -0
  61. sky/server/auth/oauth2_proxy.py +185 -0
  62. sky/server/common.py +108 -17
  63. sky/server/constants.py +1 -1
  64. sky/server/daemons.py +60 -11
  65. sky/server/rest.py +114 -0
  66. sky/server/server.py +44 -40
  67. sky/setup_files/dependencies.py +2 -0
  68. sky/skylet/constants.py +1 -1
  69. sky/skylet/events.py +5 -1
  70. sky/skylet/skylet.py +3 -1
  71. sky/task.py +43 -10
  72. sky/templates/kubernetes-ray.yml.j2 +4 -0
  73. sky/templates/nebius-ray.yml.j2 +1 -0
  74. sky/utils/controller_utils.py +7 -0
  75. sky/utils/rich_utils.py +120 -0
  76. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/METADATA +5 -1
  77. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/RECORD +86 -81
  78. sky/dashboard/out/_next/static/KiGGm4fK0CpmN6BT17jkh/_buildManifest.js +0 -1
  79. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
  81. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +0 -1
  82. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +0 -6
  85. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +0 -1
  87. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
  89. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
  90. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
  91. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
  92. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
  93. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
  95. /sky/dashboard/out/_next/static/{KiGGm4fK0CpmN6BT17jkh → Gelsd19kVxXcX7aQQGsGu}/_ssgManifest.js +0 -0
  96. /sky/dashboard/out/_next/static/chunks/{1871-7e17c195296e2ea9.js → 1871-ced1c14230cad6e1.js} +0 -0
  97. /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-2d7ed3350659d073.js} +0 -0
  98. /sky/dashboard/out/_next/static/chunks/{6601-234b1cf963c7280b.js → 6601-2109d22e7861861c.js} +0 -0
  99. /sky/dashboard/out/_next/static/chunks/{938-40d15b6261ec8dc1.js → 938-bda2685db5eae6cf.js} +0 -0
  100. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/WHEEL +0 -0
  101. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/entry_points.txt +0 -0
  102. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/licenses/LICENSE +0 -0
  103. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/top_level.txt +0 -0
sky/serve/service.py CHANGED
@@ -112,6 +112,10 @@ def cleanup_storage(task_yaml: str) -> bool:
112
112
 
113
113
  def _cleanup(service_name: str) -> bool:
114
114
  """Clean up all service related resources, i.e. replicas and storage."""
115
+ # Cleanup the HA recovery script first as it is possible that some error
116
+ # was raised when we construct the task object (e.g.,
117
+ # sky.exceptions.ResourcesUnavailableError).
118
+ serve_state.remove_ha_recovery_script(service_name)
115
119
  failed = False
116
120
  replica_infos = serve_state.get_replica_infos(service_name)
117
121
  info2proc: Dict[replica_managers.ReplicaInfo,
@@ -223,7 +227,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
223
227
  load_balancing_policy=service_spec.load_balancing_policy,
224
228
  status=serve_state.ServiceStatus.CONTROLLER_INIT,
225
229
  tls_encrypted=service_spec.tls_credential is not None,
226
- pool=service_spec.pool)
230
+ pool=service_spec.pool,
231
+ controller_pid=os.getpid())
227
232
  # Directly throw an error here. See sky/serve/api.py::up
228
233
  # for more details.
229
234
  if not success:
@@ -241,6 +246,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
241
246
  # sync to a tmp file first and then copy it to the final name
242
247
  # if there is no name conflict.
243
248
  shutil.copy(tmp_task_yaml, service_task_yaml)
249
+ else:
250
+ serve_state.update_service_controller_pid(service_name, os.getpid())
244
251
 
245
252
  controller_process = None
246
253
  load_balancer_process = None
sky/server/auth/__init__.py ADDED
File without changes
sky/server/auth/authn.py ADDED
@@ -0,0 +1,46 @@
1
+ """Authentication module."""
2
+ import json
3
+ from typing import Optional
4
+
5
+ import fastapi
6
+
7
+ from sky import models
8
+ from sky import sky_logging
9
+ from sky.skylet import constants
10
+
11
+ logger = sky_logging.init_logger(__name__)
12
+
13
+
14
# TODO(hailong): Remove this function and use request.state.auth_user instead.
async def override_user_info_in_request_body(request: fastapi.Request,
                                             auth_user: Optional[models.User]):
    """Inject the authenticated user's identity into the JSON request body.

    The user's id and name are written into the body's 'env_vars' mapping
    under constants.USER_ID_ENV_VAR and constants.USER_ENV_VAR, and the
    cached request body is replaced with the re-serialized JSON.

    The body is left untouched when:
      - auth_user is None;
      - the request has no body;
      - the body is not valid JSON (an error is logged);
      - the body is valid JSON but not a JSON object (a warning is logged).

    When 'env_vars' exists but is not a dictionary, a warning is logged and
    no user info is injected; the body is still re-serialized.

    Args:
        request: The incoming request whose cached body is rewritten.
        auth_user: The authenticated user, or None if there is none.
    """
    if auth_user is None:
        return

    body = await request.body()
    if not body:
        return

    try:
        original_json = await request.json()
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        logger.error(f'Error parsing request JSON: {e}')
        return

    if not isinstance(original_json, dict):
        # A JSON list/string/number is valid JSON but has no place to put
        # 'env_vars'; item assignment on it would raise TypeError and turn
        # the request into a 500. Leave the body for downstream validation.
        logger.warning('Request body is not a JSON object for request '
                       f'{request.state.request_id}. '
                       'Skipping user info injection into body.')
        return

    logger.debug(f'Overriding user for {request.state.request_id}: '
                 f'{auth_user.name}, {auth_user.id}')
    # Creates an empty mapping when the key is absent; otherwise returns
    # whatever the client sent so it can be type-checked below.
    env_vars = original_json.setdefault('env_vars', {})
    if isinstance(env_vars, dict):
        env_vars[constants.USER_ID_ENV_VAR] = auth_user.id
        env_vars[constants.USER_ENV_VAR] = auth_user.name
    else:
        logger.warning(f'"env_vars" in request body is not a dictionary '
                       f'for request {request.state.request_id}. '
                       'Skipping user info injection into body.')
    request._body = json.dumps(original_json).encode('utf-8')  # pylint: disable=protected-access
sky/server/auth/oauth2_proxy.py ADDED
@@ -0,0 +1,185 @@
1
+ """Authentication based on oauth2-proxy."""
2
+
3
+ import asyncio
4
+ import hashlib
5
+ import http
6
+ import os
7
+ from typing import Optional
8
+ import urllib
9
+
10
+ import aiohttp
11
+ import fastapi
12
+ import starlette.middleware.base
13
+
14
+ from sky import models
15
+ from sky import sky_logging
16
+ from sky.server.auth import authn
17
+ from sky.utils import common_utils
18
+
19
+ logger = sky_logging.init_logger(__name__)
20
+
21
+ # We do not support setting these in config.yaml because:
22
+ # 1. config.yaml can be updated dynamically, but auth middleware does not
23
+ # support hot reload yet.
24
+ # 2. If we introduce hot reload for auth middleware, bad config might
25
+ # invalidate all authenticated sessions and thus cannot be rolled back
26
+ # by API users.
27
+ # TODO(aylei): we should introduce server.yaml for static server admin config,
28
+ # which is more structured than multiple environment variables and can be less
29
+ # confusing to users.
30
+ OAUTH2_PROXY_BASE_URL_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_BASE_URL'
31
+ OAUTH2_PROXY_ENABLED_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_ENABLED'
32
+
33
+
34
class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware to handle authentication by delegating to OAuth2 Proxy.

    The middleware is a pass-through unless the environment variable named by
    OAUTH2_PROXY_ENABLED_ENV_VAR is exactly 'true'. When enabled:
      - requests whose path starts with '/oauth2' are relayed to the proxy;
      - all other requests are checked against the proxy's '/oauth2/auth'
        endpoint before being handed to the next handler.
    """

    def __init__(self, application: fastapi.FastAPI):
        """Read the proxy settings from the environment.

        Raises:
            ValueError: If the proxy is enabled but the environment variable
                named by OAUTH2_PROXY_BASE_URL_ENV_VAR is unset or empty.
        """
        super().__init__(application)
        self.enabled: bool = (os.getenv(OAUTH2_PROXY_ENABLED_ENV_VAR,
                                        'false') == 'true')
        self.proxy_base: str = ''
        if self.enabled:
            proxy_base = os.getenv(OAUTH2_PROXY_BASE_URL_ENV_VAR)
            if not proxy_base:
                raise ValueError('OAuth2 Proxy is enabled but base_url is not '
                                 'set')
            # Strip trailing slashes so URLs can be built with a single '/'.
            self.proxy_base = proxy_base.rstrip('/')

    async def dispatch(self, request: fastapi.Request, call_next):
        """Route the request: pass through, relay to the proxy, or authenticate."""
        if not self.enabled:
            return await call_next(request)

        # Forward /oauth2/* to oauth2-proxy, including /oauth2/start and
        # /oauth2/callback.
        if request.url.path.startswith('/oauth2'):
            return await self.forward_to_oauth2_proxy(request)

        return await self.authenticate(request, call_next)

    async def forward_to_oauth2_proxy(self, request: fastapi.Request):
        """Forward requests to oauth2-proxy service.

        The method, headers, cookies, query parameters and body are relayed
        to the same path on the proxy without following redirects. The proxy's
        status, headers, body and cookies are copied into the returned
        response. Returns a 502 JSON response if the proxy cannot be reached.
        """
        logger.debug(f'forwarding to oauth2-proxy: {request.url.path}')
        path = request.url.path.lstrip('/')
        target_url = f'{self.proxy_base}/{path}'
        body = await request.body()
        async with aiohttp.ClientSession() as session:
            try:
                forwarded_headers = dict(request.headers)
                async with session.request(
                        method=request.method,
                        url=target_url,
                        headers=forwarded_headers,
                        data=body,
                        cookies=request.cookies,
                        params=request.query_params,
                        allow_redirects=False,
                ) as response:
                    response_body = await response.read()
                    fastapi_response = fastapi.responses.Response(
                        content=response_body,
                        status_code=response.status,
                        headers=dict(response.headers),
                    )
                    # Forward cookies from OAuth2 proxy response to client
                    for cookie_name, cookie in response.cookies.items():
                        fastapi_response.set_cookie(
                            key=cookie_name,
                            value=cookie.value,
                            max_age=cookie.get('max-age'),
                            expires=cookie.get('expires'),
                            path=cookie.get('path', '/'),
                            domain=cookie.get('domain'),
                            secure=cookie.get('secure', False),
                            httponly=cookie.get('httponly', False),
                        )
                    return fastapi_response
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                logger.error(f'Error forwarding to OAuth2 proxy: {e}')
                return fastapi.responses.JSONResponse(
                    status_code=http.HTTPStatus.BAD_GATEWAY,
                    content={'detail': 'oauth2-proxy service unavailable'})

    async def authenticate(self, request: fastapi.Request, call_next):
        """Authenticate the request unless it already carries an auth user.

        Returns a 502 JSON response if the proxy cannot be reached.
        """
        if request.state.auth_user is not None:
            # Already authenticated
            return await call_next(request)

        async with aiohttp.ClientSession() as session:
            try:
                return await self._authenticate(request, call_next, session)
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                logger.error(f'Error communicating with OAuth2 proxy: {e}')
                # Fail closed: if the proxy cannot be consulted, the request
                # is rejected rather than passed downstream unauthenticated.
                return fastapi.responses.JSONResponse(
                    status_code=http.HTTPStatus.BAD_GATEWAY,
                    content={'detail': 'oauth2-proxy service unavailable'})

    async def _authenticate(self, request: fastapi.Request, call_next,
                            session: aiohttp.ClientSession):
        """Ask the proxy's /oauth2/auth endpoint whether the request is allowed.

        Outcomes by proxy status:
          - 202: the user is taken from the proxy's response headers, stored
            on request.state.auth_user, injected into the request body, and
            the request proceeds. A 500 is returned if no user info is found.
          - 401: '/api/health*' proceeds as anonymous; anything else is
            redirected to '/oauth2/start' with the original path and query.
          - other: the status is logged and relayed with a generic detail.
        """
        forwarded_headers = dict(request.headers)
        auth_url = f'{self.proxy_base}/oauth2/auth'
        forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
        logger.debug(f'authenticate request: {request.url.path}')

        async with session.request(
                method=request.method,
                url=auth_url,
                headers=forwarded_headers,
                cookies=request.cookies,
                timeout=aiohttp.ClientTimeout(total=10),
                allow_redirects=False,
        ) as auth_response:

            if auth_response.status == http.HTTPStatus.ACCEPTED:
                # User is authenticated, extract user info from headers
                auth_user = self.get_auth_user(auth_response)
                if not auth_user:
                    return fastapi.responses.JSONResponse(
                        status_code=http.HTTPStatus.INTERNAL_SERVER_ERROR,
                        content={
                            'detail':
                                'oauth2-proxy is enabled but did not '
                                'return user info, check your oauth2-proxy '
                                'setup.'
                        })
                request.state.auth_user = auth_user
                await authn.override_user_info_in_request_body(
                    request, auth_user)
                return await call_next(request)
            elif auth_response.status == http.HTTPStatus.UNAUTHORIZED:
                # For /api/health, we should allow unauthenticated requests to
                # not break healthz check.
                # TODO(aylei): remove this to an aggregated login middleware
                # in favor of the unified authentication.
                if request.url.path.startswith('/api/health'):
                    request.state.anonymous_user = True
                    return await call_next(request)

                # TODO(aylei): in unified authentication, the redirection
                # or rejection should be done after all the authentication
                # methods are performed.
                # Not authenticated, redirect to sign-in
                redirect_path = request.url.path
                if request.url.query:
                    redirect_path += f'?{request.url.query}'
                # NOTE(review): the file imports plain `urllib`; this relies
                # on `urllib.parse` having been loaded by another import.
                rd = urllib.parse.quote(redirect_path)
                signin_url = (f'{request.base_url}oauth2/start?'
                              f'rd={rd}')
                return fastapi.responses.RedirectResponse(url=signin_url)
            else:
                # ClientResponse.text is a coroutine method; it must be
                # called and awaited to obtain the body. Undecodable bytes
                # are replaced so that logging never raises.
                error_body = await auth_response.text(errors='replace')
                logger.error('oauth2-proxy returned unexpected status '
                             f'{auth_response.status}: {error_body}')
                return fastapi.responses.JSONResponse(
                    status_code=auth_response.status,
                    content={'detail': 'oauth2-proxy error'})

    def get_auth_user(
            self, response: aiohttp.ClientResponse) -> Optional[models.User]:
        """Extract user info from OAuth2 proxy response headers.

        Returns:
            A user whose name is the 'X-Auth-Request-Email' header value and
            whose id is the MD5 hex digest of that value truncated to
            common_utils.USER_HASH_LENGTH characters, or None when the header
            is missing or empty.
        """
        email_header = response.headers.get('X-Auth-Request-Email')
        if email_header:
            user_hash = hashlib.md5(email_header.encode()).hexdigest(
            )[:common_utils.USER_HASH_LENGTH]
            return models.User(id=user_hash, name=email_header)
        return None
sky/server/common.py CHANGED
@@ -41,12 +41,14 @@ from sky.utils import rich_utils
41
41
  from sky.utils import ux_utils
42
42
 
43
43
  if typing.TYPE_CHECKING:
44
+ import aiohttp
44
45
  import pydantic
45
46
  import requests
46
47
 
47
48
  from sky import dag as dag_lib
48
49
  from sky import models
49
50
  else:
51
+ aiohttp = adaptors_common.LazyImport('aiohttp')
50
52
  pydantic = adaptors_common.LazyImport('pydantic')
51
53
  requests = adaptors_common.LazyImport('requests')
52
54
 
@@ -175,24 +177,14 @@ def get_cookies_from_response(
175
177
  return cookies
176
178
 
177
179
 
178
- def make_authenticated_request(method: str,
179
- path: str,
180
- server_url: Optional[str] = None,
181
- retry: bool = True,
182
- **kwargs) -> 'requests.Response':
183
- """Make an authenticated HTTP request to the API server.
184
-
185
- Automatically handles service account token authentication or cookie-based
186
- authentication based on what's available.
187
-
188
- Args:
189
- method: HTTP method (GET, POST, etc.)
190
- path: API path (e.g., '/api/v1/status')
191
- server_url: Server URL, defaults to configured server
192
- **kwargs: Additional arguments to pass to requests
180
+ def _prepare_authenticated_request_params(
181
+ path: str,
182
+ server_url: Optional[str] = None,
183
+ **kwargs) -> Tuple[str, Dict[str, Any]]:
184
+ """Prepare common parameters for authenticated requests (sync or async).
193
185
 
194
186
  Returns:
195
- requests.Response object
187
+ Tuple of (url, updated_kwargs)
196
188
  """
197
189
  if server_url is None:
198
190
  server_url = get_server_url()
@@ -214,6 +206,41 @@ def make_authenticated_request(method: str,
214
206
  if not headers.get('Authorization') and 'cookies' not in kwargs:
215
207
  kwargs['cookies'] = get_api_cookie_jar()
216
208
 
209
+ return url, kwargs
210
+
211
+
212
+ def _convert_requests_cookies_to_aiohttp(
213
+ cookie_jar: requests.cookies.RequestsCookieJar) -> Dict[str, str]:
214
+ """Convert requests cookie jar to aiohttp-compatible dict format."""
215
+ cookies = {}
216
+ for cookie in cookie_jar:
217
+ cookies[cookie.name] = cookie.value
218
+ return cookies # type: ignore
219
+
220
+
221
+ def make_authenticated_request(method: str,
222
+ path: str,
223
+ server_url: Optional[str] = None,
224
+ retry: bool = True,
225
+ **kwargs) -> 'requests.Response':
226
+ """Make an authenticated HTTP request to the API server.
227
+
228
+ Automatically handles service account token authentication or cookie-based
229
+ authentication based on what's available.
230
+
231
+ Args:
232
+ method: HTTP method (GET, POST, etc.)
233
+ path: API path (e.g., '/api/v1/status')
234
+ server_url: Server URL, defaults to configured server
235
+ retry: Whether to retry on transient errors
236
+ **kwargs: Additional arguments to pass to requests
237
+
238
+ Returns:
239
+ requests.Response object
240
+ """
241
+ url, kwargs = _prepare_authenticated_request_params(path, server_url,
242
+ **kwargs)
243
+
217
244
  # Make the request
218
245
  if retry:
219
246
  return rest.request(method, url, **kwargs)
@@ -222,6 +249,69 @@ def make_authenticated_request(method: str,
222
249
  return rest.request_without_retry(method, url, **kwargs)
223
250
 
224
251
 
252
async def make_authenticated_request_async(
        session: 'aiohttp.ClientSession',
        method: str,
        path: str,
        server_url: Optional[str] = None,
        retry: bool = True,
        **kwargs) -> 'aiohttp.ClientResponse':
    """Send an authenticated request to the API server through aiohttp.

    Authentication (service account token or cookies, whichever is
    available) is set up by _prepare_authenticated_request_params; this
    function then adapts the resulting arguments to what aiohttp accepts.

    Example usage:
        async with aiohttp.ClientSession() as session:
            response = await make_authenticated_request_async(
                session, 'GET', '/api/v1/status')
            data = await response.json()

    Args:
        session: aiohttp ClientSession used to issue the request.
        method: HTTP method (GET, POST, etc.).
        path: API path (e.g., '/api/v1/status').
        server_url: Server URL; the configured server is used when None.
        retry: Whether to retry on transient errors. Only GET requests may
            be sent with retry=False.
        **kwargs: Extra arguments forwarded to aiohttp.

    Returns:
        The aiohttp.ClientResponse for the request.

    Raises:
        aiohttp.ClientError: For HTTP-related errors.
        exceptions.ServerTemporarilyUnavailableError: When server returns 503.
        exceptions.RequestInterruptedError: When request is interrupted.
    """
    url, request_kwargs = _prepare_authenticated_request_params(
        path, server_url, **kwargs)

    # A requests cookie jar is flattened into a plain name -> value dict
    # before being handed to aiohttp.
    if 'cookies' in request_kwargs and isinstance(
            request_kwargs['cookies'], requests.cookies.RequestsCookieJar):
        request_kwargs['cookies'] = _convert_requests_cookies_to_aiohttp(
            request_kwargs['cookies'])

    # Query parameters are stringified: booleans become 'true'/'false' and
    # None-valued entries are dropped.
    raw_params = request_kwargs.get('params')
    if raw_params is not None:
        request_kwargs['params'] = {
            key: (str(value).lower() if isinstance(value, bool) else str(value))
            for key, value in raw_params.items()
            if value is not None
        }

    if not retry:
        assert method == 'GET', 'Only GET requests can be done without retry'
        return await rest.request_without_retry_async(session, method, url,
                                                      **request_kwargs)
    return await rest.request_async(session, method, url, **request_kwargs)
313
+
314
+
225
315
  @annotations.lru_cache(scope='global')
226
316
  def get_server_url(host: Optional[str] = None) -> str:
227
317
  endpoint = DEFAULT_SERVER_URL
@@ -322,13 +412,14 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
322
412
  # The response is 200, so we can parse the response.
323
413
  try:
324
414
  result = response.json()
415
+ server_status = result.get('status')
325
416
  api_version = result.get('api_version')
326
417
  version = result.get('version')
327
418
  version_on_disk = result.get('version_on_disk')
328
419
  commit = result.get('commit')
329
420
  user = result.get('user')
330
421
  basic_auth_enabled = result.get('basic_auth_enabled')
331
- server_info = ApiServerInfo(status=ApiServerStatus.HEALTHY,
422
+ server_info = ApiServerInfo(status=ApiServerStatus(server_status),
332
423
  api_version=api_version,
333
424
  version=version,
334
425
  version_on_disk=version_on_disk,
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
10
10
  # based on version info is needed.
11
11
  # For more details and code guidelines, refer to:
12
12
  # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
13
- API_VERSION = 13
13
+ API_VERSION = 14
14
14
 
15
15
  # The minimum peer API version that the code should still work with.
16
16
  # Notes (dev):
sky/server/daemons.py CHANGED
@@ -14,6 +14,10 @@ from sky.utils import ux_utils
14
14
  logger = sky_logging.init_logger(__name__)
15
15
 
16
16
 
17
+ def _default_should_skip():
18
+ return False
19
+
20
+
17
21
  @dataclasses.dataclass
18
22
  class InternalRequestDaemon:
19
23
  """Internal daemon that runs an event in the background."""
@@ -22,6 +26,7 @@ class InternalRequestDaemon:
22
26
  name: str
23
27
  event_fn: Callable[[], None]
24
28
  default_log_level: str = 'INFO'
29
+ should_skip: Callable[[], bool] = _default_should_skip
25
30
 
26
31
  def refresh_log_level(self) -> int:
27
32
  # pylint: disable=import-outside-toplevel
@@ -110,14 +115,14 @@ def managed_job_status_refresh_event():
110
115
  """Refresh the managed job status for controller consolidation mode."""
111
116
  # pylint: disable=import-outside-toplevel
112
117
  from sky.jobs import utils as managed_job_utils
113
- if not managed_job_utils.is_consolidation_mode():
114
- return
118
+ from sky.utils import controller_utils
119
+
115
120
  # We run the recovery logic before starting the event loop as those two are
116
121
  # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
117
- from sky.utils import controller_utils
118
122
  if controller_utils.high_availability_specified(
119
123
  controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
120
124
  managed_job_utils.ha_recovery_for_consolidation_mode()
125
+
121
126
  # After recovery, we start the event loop.
122
127
  from sky.skylet import events
123
128
  refresh_event = events.ManagedJobEvent()
@@ -128,20 +133,58 @@ def managed_job_status_refresh_event():
128
133
  time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
129
134
 
130
135
 
131
- def sky_serve_status_refresh_event():
136
def should_skip_managed_job_status_refresh():
    """Return True when managed jobs are not in consolidation mode.

    In that case the managed job status refresh event should be skipped.
    """
    # pylint: disable=import-outside-toplevel
    from sky.jobs import utils as managed_job_utils
    in_consolidation_mode = managed_job_utils.is_consolidation_mode()
    return not in_consolidation_mode
141
+
142
+
143
+ def _serve_status_refresh_event(pool: bool):
132
144
  """Refresh the sky serve status for controller consolidation mode."""
133
145
  # pylint: disable=import-outside-toplevel
134
146
  from sky.serve import serve_utils
135
- if not serve_utils.is_consolidation_mode():
136
- return
137
- # TODO(tian): Add HA recovery logic.
147
+ from sky.utils import controller_utils
148
+
149
+ # We run the recovery logic before starting the event loop as those two are
150
+ # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
151
+ controller = controller_utils.get_controller_for_pool(pool)
152
+ if controller_utils.high_availability_specified(
153
+ controller.value.cluster_name):
154
+ serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
155
+
156
+ # After recovery, we start the event loop.
138
157
  from sky.skylet import events
139
- event = events.ServiceUpdateEvent()
140
- logger.info('=== Running serve status refresh event ===')
158
+ event = events.ServiceUpdateEvent(pool=pool)
159
+ noun = 'pool' if pool else 'serve'
160
+ logger.info(f'=== Running {noun} status refresh event ===')
141
161
  event.run()
142
162
  time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
143
163
 
144
164
 
165
def _should_skip_serve_status_refresh_event(pool: bool):
    """Return True when serve (or pool) is not in consolidation mode.

    Args:
        pool: Forwarded to serve_utils.is_consolidation_mode to select
            whether the pool or the serve setting is consulted.
    """
    # pylint: disable=import-outside-toplevel
    from sky.serve import serve_utils
    in_consolidation_mode = serve_utils.is_consolidation_mode(pool=pool)
    return not in_consolidation_mode
170
+
171
+
172
def sky_serve_status_refresh_event() -> None:
    """Run the status refresh event with pool=False (sky serve)."""
    _serve_status_refresh_event(pool=False)
174
+
175
+
176
def should_skip_sky_serve_status_refresh():
    """Skip predicate for the sky serve refresh daemon (pool=False)."""
    return _should_skip_serve_status_refresh_event(pool=False)
178
+
179
+
180
def pool_status_refresh_event() -> None:
    """Run the status refresh event with pool=True (pools)."""
    _serve_status_refresh_event(pool=True)
182
+
183
+
184
def should_skip_pool_status_refresh():
    """Skip predicate for the pool refresh daemon (pool=True)."""
    return _should_skip_serve_status_refresh_event(pool=True)
186
+
187
+
145
188
  # Register the events to run in the background.
146
189
  INTERNAL_REQUEST_DAEMONS = [
147
190
  # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
@@ -157,8 +200,14 @@ INTERNAL_REQUEST_DAEMONS = [
157
200
  event_fn=refresh_volume_status_event),
158
201
  InternalRequestDaemon(id='managed-job-status-refresh-daemon',
159
202
  name='managed-job-status',
160
- event_fn=managed_job_status_refresh_event),
203
+ event_fn=managed_job_status_refresh_event,
204
+ should_skip=should_skip_managed_job_status_refresh),
161
205
  InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
162
206
  name='sky-serve-status',
163
- event_fn=sky_serve_status_refresh_event),
207
+ event_fn=sky_serve_status_refresh_event,
208
+ should_skip=should_skip_sky_serve_status_refresh),
209
+ InternalRequestDaemon(id='pool-status-refresh-daemon',
210
+ name='pool-status',
211
+ event_fn=pool_status_refresh_event,
212
+ should_skip=should_skip_pool_status_refresh),
164
213
  ]