skypilot-nightly 1.0.0.dev20250804__py3-none-any.whl → 1.0.0.dev20250806__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/catalog/kubernetes_catalog.py +8 -0
- sky/catalog/nebius_catalog.py +0 -1
- sky/client/cli/command.py +26 -7
- sky/client/sdk.py +16 -8
- sky/client/sdk.pyi +6 -5
- sky/client/sdk_async.py +811 -0
- sky/clouds/kubernetes.py +6 -1
- sky/clouds/nebius.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +11 -0
- sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-2a43ea3241bbdacd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa63e8b1d203f298.js → [job]-7cb24da04ca00956.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9e7df5fc761c95a7.js → [cluster]-1e95993124dbfc57.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-47f1ddae13a2f8e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-2a44e70b500b6b70.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-22faac9325016d83.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-b90c865a690bfe84.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-7af733f5d7b6ed1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-4d41c9023287f59a.js → [name]-35e0de5bca55e594.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/client/sdk_async.py +135 -0
- sky/jobs/utils.py +3 -1
- sky/provision/kubernetes/utils.py +30 -4
- sky/provision/nebius/instance.py +1 -0
- sky/provision/nebius/utils.py +9 -1
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +2 -1
- sky/serve/controller.py +2 -1
- sky/serve/load_balancer.py +3 -1
- sky/serve/serve_state.py +70 -5
- sky/serve/serve_utils.py +124 -22
- sky/serve/server/impl.py +22 -21
- sky/serve/service.py +8 -1
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +46 -0
- sky/server/auth/oauth2_proxy.py +185 -0
- sky/server/common.py +108 -17
- sky/server/constants.py +1 -1
- sky/server/daemons.py +60 -11
- sky/server/rest.py +114 -0
- sky/server/server.py +44 -40
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +5 -1
- sky/skylet/skylet.py +3 -1
- sky/task.py +43 -10
- sky/templates/kubernetes-ray.yml.j2 +4 -0
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/utils/controller_utils.py +7 -0
- sky/utils/rich_utils.py +120 -0
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/METADATA +5 -1
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/RECORD +86 -81
- sky/dashboard/out/_next/static/KiGGm4fK0CpmN6BT17jkh/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
- sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
- /sky/dashboard/out/_next/static/{KiGGm4fK0CpmN6BT17jkh → Gelsd19kVxXcX7aQQGsGu}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-7e17c195296e2ea9.js → 1871-ced1c14230cad6e1.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-2d7ed3350659d073.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{6601-234b1cf963c7280b.js → 6601-2109d22e7861861c.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-40d15b6261ec8dc1.js → 938-bda2685db5eae6cf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/top_level.txt +0 -0
sky/serve/service.py
CHANGED
|
@@ -112,6 +112,10 @@ def cleanup_storage(task_yaml: str) -> bool:
|
|
|
112
112
|
|
|
113
113
|
def _cleanup(service_name: str) -> bool:
|
|
114
114
|
"""Clean up all service related resources, i.e. replicas and storage."""
|
|
115
|
+
# Cleanup the HA recovery script first as it is possible that some error
|
|
116
|
+
# was raised when we construct the task object (e.g.,
|
|
117
|
+
# sky.exceptions.ResourcesUnavailableError).
|
|
118
|
+
serve_state.remove_ha_recovery_script(service_name)
|
|
115
119
|
failed = False
|
|
116
120
|
replica_infos = serve_state.get_replica_infos(service_name)
|
|
117
121
|
info2proc: Dict[replica_managers.ReplicaInfo,
|
|
@@ -223,7 +227,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
|
223
227
|
load_balancing_policy=service_spec.load_balancing_policy,
|
|
224
228
|
status=serve_state.ServiceStatus.CONTROLLER_INIT,
|
|
225
229
|
tls_encrypted=service_spec.tls_credential is not None,
|
|
226
|
-
pool=service_spec.pool
|
|
230
|
+
pool=service_spec.pool,
|
|
231
|
+
controller_pid=os.getpid())
|
|
227
232
|
# Directly throw an error here. See sky/serve/api.py::up
|
|
228
233
|
# for more details.
|
|
229
234
|
if not success:
|
|
@@ -241,6 +246,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
|
241
246
|
# sync to a tmp file first and then copy it to the final name
|
|
242
247
|
# if there is no name conflict.
|
|
243
248
|
shutil.copy(tmp_task_yaml, service_task_yaml)
|
|
249
|
+
else:
|
|
250
|
+
serve_state.update_service_controller_pid(service_name, os.getpid())
|
|
244
251
|
|
|
245
252
|
controller_process = None
|
|
246
253
|
load_balancer_process = None
|
|
File without changes
|
sky/server/auth/authn.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Authentication module."""
|
|
2
|
+
import json
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import fastapi
|
|
6
|
+
|
|
7
|
+
from sky import models
|
|
8
|
+
from sky import sky_logging
|
|
9
|
+
from sky.skylet import constants
|
|
10
|
+
|
|
11
|
+
logger = sky_logging.init_logger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# TODO(hailong): Remove this function and use request.state.auth_user instead.
|
|
15
|
+
async def override_user_info_in_request_body(request: fastapi.Request,
                                             auth_user: Optional[models.User]):
    """Inject the authenticated user's identity into the JSON request body.

    The user's id and name are written into the body's ``env_vars`` mapping
    under ``constants.USER_ID_ENV_VAR`` and ``constants.USER_ENV_VAR``
    (creating ``env_vars`` when it is absent), and the raw body cached on
    ``request`` is replaced with the re-serialized JSON.

    The request is left untouched when ``auth_user`` is None, when the body
    is empty, when the body is not valid JSON (logged as an error), or when
    the body is valid JSON but not a JSON object (logged as a warning).

    Args:
        request: The incoming request whose body may be rewritten.
        auth_user: The authenticated user, or None when there is none.
    """
    if auth_user is None:
        return

    body = await request.body()
    if not body:
        return

    try:
        original_json = await request.json()
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        logger.error(f'Error parsing request JSON: {e}')
        return

    if not isinstance(original_json, dict):
        # A JSON list/string/number has no place to hold "env_vars";
        # indexing it with a string key would raise TypeError, so skip it.
        logger.warning(
            f'Request body is not a JSON object '
            f'for request {request.state.request_id}. '
            'Skipping user info injection into body.')
        return

    logger.debug(f'Overriding user for {request.state.request_id}: '
                 f'{auth_user.name}, {auth_user.id}')
    # setdefault() creates "env_vars" only when the key is missing; an
    # existing non-dict value (e.g. null) is reported and left as-is.
    env_vars = original_json.setdefault('env_vars', {})
    if isinstance(env_vars, dict):
        env_vars[constants.USER_ID_ENV_VAR] = auth_user.id
        env_vars[constants.USER_ENV_VAR] = auth_user.name
    else:
        logger.warning(
            f'"env_vars" in request body is not a dictionary '
            f'for request {request.state.request_id}. '
            'Skipping user info injection into body.')
    # Replace the cached raw body with the re-serialized JSON.
    request._body = json.dumps(original_json).encode('utf-8')  # pylint: disable=protected-access
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Authentication based on oauth2-proxy."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import hashlib
|
|
5
|
+
import http
|
|
6
|
+
import os
|
|
7
|
+
from typing import Optional
|
|
8
|
+
import urllib
|
|
9
|
+
|
|
10
|
+
import aiohttp
|
|
11
|
+
import fastapi
|
|
12
|
+
import starlette.middleware.base
|
|
13
|
+
|
|
14
|
+
from sky import models
|
|
15
|
+
from sky import sky_logging
|
|
16
|
+
from sky.server.auth import authn
|
|
17
|
+
from sky.utils import common_utils
|
|
18
|
+
|
|
19
|
+
logger = sky_logging.init_logger(__name__)
|
|
20
|
+
|
|
21
|
+
# We do not support setting these in config.yaml because:
|
|
22
|
+
# 1. config.yaml can be updated dynamically, but auth middleware does not
|
|
23
|
+
# support hot reload yet.
|
|
24
|
+
# 2. If we introduce hot reload for auth middleware, bad config might
|
|
25
|
+
# invalidate all authenticated sessions and thus cannot be rolled back
|
|
26
|
+
# by API users.
|
|
27
|
+
# TODO(aylei): we should introduce server.yaml for static server admin config,
|
|
28
|
+
# which is more structured than multiple environment variables and can be less
|
|
29
|
+
# confusing to users.
|
|
30
|
+
OAUTH2_PROXY_BASE_URL_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_BASE_URL'
|
|
31
|
+
OAUTH2_PROXY_ENABLED_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_ENABLED'
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware to handle authentication by delegating to OAuth2 Proxy.

    The middleware is a no-op unless the environment variable named by
    ``OAUTH2_PROXY_ENABLED_ENV_VAR`` is exactly ``'true'``. When enabled,
    requests under ``/oauth2`` are forwarded to oauth2-proxy and every other
    request is checked against oauth2-proxy's ``/oauth2/auth`` endpoint.
    """

    def __init__(self, application: fastapi.FastAPI):
        """Read the oauth2-proxy settings from the environment.

        Args:
            application: The application wrapped by this middleware.

        Raises:
            ValueError: If oauth2-proxy is enabled but the environment
                variable named by ``OAUTH2_PROXY_BASE_URL_ENV_VAR`` is unset
                or empty.
        """
        super().__init__(application)
        self.enabled: bool = (os.getenv(OAUTH2_PROXY_ENABLED_ENV_VAR,
                                        'false') == 'true')
        self.proxy_base: str = ''
        if self.enabled:
            proxy_base = os.getenv(OAUTH2_PROXY_BASE_URL_ENV_VAR)
            if not proxy_base:
                raise ValueError('OAuth2 Proxy is enabled but base_url is not '
                                 'set')
            # Strip trailing slashes so URLs can be built as f'{base}/{path}'.
            self.proxy_base = proxy_base.rstrip('/')

    async def dispatch(self, request: fastapi.Request, call_next):
        """Route the request: pass through, forward, or authenticate."""
        if not self.enabled:
            return await call_next(request)

        # Forward /oauth2/* to oauth2-proxy, including /oauth2/start and
        # /oauth2/callback.
        if request.url.path.startswith('/oauth2'):
            return await self.forward_to_oauth2_proxy(request)

        return await self.authenticate(request, call_next)

    async def forward_to_oauth2_proxy(self, request: fastapi.Request):
        """Forward requests to oauth2-proxy service.

        The method, headers, body, cookies and query parameters of the
        incoming request are sent to the same path on ``self.proxy_base``
        without following redirects.

        Returns:
            A response carrying the status, body, headers and cookies of the
            oauth2-proxy response, or a 502 JSON response if the proxy
            request fails with a client or timeout error.
        """
        logger.debug(f'forwarding to oauth2-proxy: {request.url.path}')
        path = request.url.path.lstrip('/')
        target_url = f'{self.proxy_base}/{path}'
        body = await request.body()
        async with aiohttp.ClientSession() as session:
            try:
                forwarded_headers = dict(request.headers)
                async with session.request(
                        method=request.method,
                        url=target_url,
                        headers=forwarded_headers,
                        data=body,
                        cookies=request.cookies,
                        params=request.query_params,
                        allow_redirects=False,
                ) as response:
                    response_body = await response.read()
                    fastapi_response = fastapi.responses.Response(
                        content=response_body,
                        status_code=response.status,
                        headers=dict(response.headers),
                    )
                    # Forward cookies from OAuth2 proxy response to client
                    for cookie_name, cookie in response.cookies.items():
                        fastapi_response.set_cookie(
                            key=cookie_name,
                            value=cookie.value,
                            max_age=cookie.get('max-age'),
                            expires=cookie.get('expires'),
                            path=cookie.get('path', '/'),
                            domain=cookie.get('domain'),
                            secure=cookie.get('secure', False),
                            httponly=cookie.get('httponly', False),
                        )
                    return fastapi_response
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                logger.error(f'Error forwarding to OAuth2 proxy: {e}')
                return fastapi.responses.JSONResponse(
                    status_code=http.HTTPStatus.BAD_GATEWAY,
                    content={'detail': 'oauth2-proxy service unavailable'})

    async def authenticate(self, request: fastapi.Request, call_next):
        """Authenticate the request unless it already has an auth user.

        Returns:
            The downstream response when ``request.state.auth_user`` is
            already set, otherwise the result of ``_authenticate``; a 502
            JSON response if talking to oauth2-proxy fails with a client or
            timeout error.
        """
        if request.state.auth_user is not None:
            # Already authenticated
            return await call_next(request)

        async with aiohttp.ClientSession() as session:
            try:
                return await self._authenticate(request, call_next, session)
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                logger.error(f'Error communicating with OAuth2 proxy: {e}')
                # Fail closed: reject the request when oauth2-proxy cannot be
                # reached.
                return fastapi.responses.JSONResponse(
                    status_code=http.HTTPStatus.BAD_GATEWAY,
                    content={'detail': 'oauth2-proxy service unavailable'})

    async def _authenticate(self, request: fastapi.Request, call_next,
                            session: aiohttp.ClientSession):
        """Check the request against oauth2-proxy's ``/oauth2/auth``.

        Args:
            request: The incoming request.
            call_next: Callable that passes the request downstream.
            session: The aiohttp session used to call oauth2-proxy.

        Returns:
            * On 202: the downstream response, after recording the user on
              ``request.state.auth_user`` and injecting it into the request
              body; or a 500 JSON response if no user info was returned.
            * On 401: the downstream response for ``/api/health*`` paths
              (marked anonymous), otherwise a redirect to ``oauth2/start``.
            * On any other status: a JSON error response with that status.
        """
        # The module only does `import urllib`, which does not guarantee that
        # the `urllib.parse` submodule is loaded; import it explicitly.
        # pylint: disable=import-outside-toplevel
        import urllib.parse

        forwarded_headers = dict(request.headers)
        auth_url = f'{self.proxy_base}/oauth2/auth'
        forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
        logger.debug(f'authenticate request: {request.url.path}')

        async with session.request(
                method=request.method,
                url=auth_url,
                headers=forwarded_headers,
                cookies=request.cookies,
                timeout=aiohttp.ClientTimeout(total=10),
                allow_redirects=False,
        ) as auth_response:

            if auth_response.status == http.HTTPStatus.ACCEPTED:
                # User is authenticated, extract user info from headers
                auth_user = self.get_auth_user(auth_response)
                if not auth_user:
                    return fastapi.responses.JSONResponse(
                        status_code=http.HTTPStatus.INTERNAL_SERVER_ERROR,
                        content={
                            'detail':
                                'oauth2-proxy is enabled but did not '
                                'return user info, check your oauth2-proxy '
                                'setup.'
                        })
                request.state.auth_user = auth_user
                await authn.override_user_info_in_request_body(
                    request, auth_user)
                return await call_next(request)
            elif auth_response.status == http.HTTPStatus.UNAUTHORIZED:
                # For /api/health, we should allow unauthenticated requests to
                # not break healthz check.
                # TODO(aylei): move this to an aggregated login middleware
                # in favor of the unified authentication.
                if request.url.path.startswith('/api/health'):
                    request.state.anonymous_user = True
                    return await call_next(request)

                # TODO(aylei): in unified authentication, the redirection
                # or rejection should be done after all the authentication
                # methods are performed.
                # Not authenticated, redirect to sign-in
                redirect_path = request.url.path
                if request.url.query:
                    redirect_path += f'?{request.url.query}'
                rd = urllib.parse.quote(redirect_path)
                signin_url = (f'{request.base_url}oauth2/start?'
                              f'rd={rd}')
                return fastapi.responses.RedirectResponse(url=signin_url)
            else:
                # ClientResponse.text is a coroutine method: it has to be
                # called and awaited, otherwise the log shows a bound-method
                # repr instead of the body. Undecodable bytes are replaced so
                # that logging can never raise a decode error.
                error_body = await auth_response.text(errors='replace')
                logger.error('oauth2-proxy returned unexpected status '
                             f'{auth_response.status}: {error_body}')
                return fastapi.responses.JSONResponse(
                    status_code=auth_response.status,
                    content={'detail': 'oauth2-proxy error'})

    def get_auth_user(
            self, response: aiohttp.ClientResponse) -> Optional[models.User]:
        """Extract user info from OAuth2 proxy response headers.

        Returns:
            A user whose name is the ``X-Auth-Request-Email`` header value
            and whose id is the first ``common_utils.USER_HASH_LENGTH`` hex
            characters of that value's MD5 digest, or None when the header
            is missing or empty.
        """
        email_header = response.headers.get('X-Auth-Request-Email')
        if email_header:
            user_hash = hashlib.md5(email_header.encode()).hexdigest(
            )[:common_utils.USER_HASH_LENGTH]
            return models.User(id=user_hash, name=email_header)
        return None
|
sky/server/common.py
CHANGED
|
@@ -41,12 +41,14 @@ from sky.utils import rich_utils
|
|
|
41
41
|
from sky.utils import ux_utils
|
|
42
42
|
|
|
43
43
|
if typing.TYPE_CHECKING:
|
|
44
|
+
import aiohttp
|
|
44
45
|
import pydantic
|
|
45
46
|
import requests
|
|
46
47
|
|
|
47
48
|
from sky import dag as dag_lib
|
|
48
49
|
from sky import models
|
|
49
50
|
else:
|
|
51
|
+
aiohttp = adaptors_common.LazyImport('aiohttp')
|
|
50
52
|
pydantic = adaptors_common.LazyImport('pydantic')
|
|
51
53
|
requests = adaptors_common.LazyImport('requests')
|
|
52
54
|
|
|
@@ -175,24 +177,14 @@ def get_cookies_from_response(
|
|
|
175
177
|
return cookies
|
|
176
178
|
|
|
177
179
|
|
|
178
|
-
def
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
"""Make an authenticated HTTP request to the API server.
|
|
184
|
-
|
|
185
|
-
Automatically handles service account token authentication or cookie-based
|
|
186
|
-
authentication based on what's available.
|
|
187
|
-
|
|
188
|
-
Args:
|
|
189
|
-
method: HTTP method (GET, POST, etc.)
|
|
190
|
-
path: API path (e.g., '/api/v1/status')
|
|
191
|
-
server_url: Server URL, defaults to configured server
|
|
192
|
-
**kwargs: Additional arguments to pass to requests
|
|
180
|
+
def _prepare_authenticated_request_params(
|
|
181
|
+
path: str,
|
|
182
|
+
server_url: Optional[str] = None,
|
|
183
|
+
**kwargs) -> Tuple[str, Dict[str, Any]]:
|
|
184
|
+
"""Prepare common parameters for authenticated requests (sync or async).
|
|
193
185
|
|
|
194
186
|
Returns:
|
|
195
|
-
|
|
187
|
+
Tuple of (url, updated_kwargs)
|
|
196
188
|
"""
|
|
197
189
|
if server_url is None:
|
|
198
190
|
server_url = get_server_url()
|
|
@@ -214,6 +206,41 @@ def make_authenticated_request(method: str,
|
|
|
214
206
|
if not headers.get('Authorization') and 'cookies' not in kwargs:
|
|
215
207
|
kwargs['cookies'] = get_api_cookie_jar()
|
|
216
208
|
|
|
209
|
+
return url, kwargs
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _convert_requests_cookies_to_aiohttp(
|
|
213
|
+
cookie_jar: requests.cookies.RequestsCookieJar) -> Dict[str, str]:
|
|
214
|
+
"""Convert requests cookie jar to aiohttp-compatible dict format."""
|
|
215
|
+
cookies = {}
|
|
216
|
+
for cookie in cookie_jar:
|
|
217
|
+
cookies[cookie.name] = cookie.value
|
|
218
|
+
return cookies # type: ignore
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def make_authenticated_request(method: str,
|
|
222
|
+
path: str,
|
|
223
|
+
server_url: Optional[str] = None,
|
|
224
|
+
retry: bool = True,
|
|
225
|
+
**kwargs) -> 'requests.Response':
|
|
226
|
+
"""Make an authenticated HTTP request to the API server.
|
|
227
|
+
|
|
228
|
+
Automatically handles service account token authentication or cookie-based
|
|
229
|
+
authentication based on what's available.
|
|
230
|
+
|
|
231
|
+
Args:
|
|
232
|
+
method: HTTP method (GET, POST, etc.)
|
|
233
|
+
path: API path (e.g., '/api/v1/status')
|
|
234
|
+
server_url: Server URL, defaults to configured server
|
|
235
|
+
retry: Whether to retry on transient errors
|
|
236
|
+
**kwargs: Additional arguments to pass to requests
|
|
237
|
+
|
|
238
|
+
Returns:
|
|
239
|
+
requests.Response object
|
|
240
|
+
"""
|
|
241
|
+
url, kwargs = _prepare_authenticated_request_params(path, server_url,
|
|
242
|
+
**kwargs)
|
|
243
|
+
|
|
217
244
|
# Make the request
|
|
218
245
|
if retry:
|
|
219
246
|
return rest.request(method, url, **kwargs)
|
|
@@ -222,6 +249,69 @@ def make_authenticated_request(method: str,
|
|
|
222
249
|
return rest.request_without_retry(method, url, **kwargs)
|
|
223
250
|
|
|
224
251
|
|
|
252
|
+
async def make_authenticated_request_async(
        session: 'aiohttp.ClientSession',
        method: str,
        path: str,
        server_url: Optional[str] = None,
        retry: bool = True,
        **kwargs) -> 'aiohttp.ClientResponse':
    """Make an authenticated async HTTP request to the API server using aiohttp.

    Automatically handles service account token authentication or cookie-based
    authentication based on what's available.

    Example usage:
        async with aiohttp.ClientSession() as session:
            response = await make_authenticated_request_async(
                session, 'GET', '/api/v1/status')
            data = await response.json()

    Args:
        session: aiohttp ClientSession to use for the request
        method: HTTP method (GET, POST, etc.)
        path: API path (e.g., '/api/v1/status')
        server_url: Server URL, defaults to configured server
        retry: Whether to retry on transient errors
        **kwargs: Additional arguments to pass to aiohttp

    Returns:
        aiohttp.ClientResponse object

    Raises:
        aiohttp.ClientError: For HTTP-related errors
        exceptions.ServerTemporarilyUnavailableError: When server returns 503
        exceptions.RequestInterruptedError: When request is interrupted
    """
    url, kwargs = _prepare_authenticated_request_params(path, server_url,
                                                        **kwargs)

    # Convert cookies to aiohttp format if needed
    cookies = kwargs.get('cookies')
    if isinstance(cookies, requests.cookies.RequestsCookieJar):
        kwargs['cookies'] = _convert_requests_cookies_to_aiohttp(cookies)

    # Convert params to strings for aiohttp compatibility: booleans become
    # 'true'/'false' and None values are dropped. Only mappings are
    # normalized; other forms (e.g. a query string or a sequence of pairs)
    # have no .items() and are passed to aiohttp unchanged.
    params = kwargs.get('params')
    if isinstance(params, typing.Mapping):
        kwargs['params'] = {
            key: (str(value).lower() if isinstance(value, bool) else str(value))
            for key, value in params.items()
            if value is not None
        }

    # Make the request
    if retry:
        return await rest.request_async(session, method, url, **kwargs)
    else:
        assert method == 'GET', 'Only GET requests can be done without retry'
        return await rest.request_without_retry_async(session, method, url,
                                                      **kwargs)
|
|
313
|
+
|
|
314
|
+
|
|
225
315
|
@annotations.lru_cache(scope='global')
|
|
226
316
|
def get_server_url(host: Optional[str] = None) -> str:
|
|
227
317
|
endpoint = DEFAULT_SERVER_URL
|
|
@@ -322,13 +412,14 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
|
322
412
|
# The response is 200, so we can parse the response.
|
|
323
413
|
try:
|
|
324
414
|
result = response.json()
|
|
415
|
+
server_status = result.get('status')
|
|
325
416
|
api_version = result.get('api_version')
|
|
326
417
|
version = result.get('version')
|
|
327
418
|
version_on_disk = result.get('version_on_disk')
|
|
328
419
|
commit = result.get('commit')
|
|
329
420
|
user = result.get('user')
|
|
330
421
|
basic_auth_enabled = result.get('basic_auth_enabled')
|
|
331
|
-
server_info = ApiServerInfo(status=ApiServerStatus
|
|
422
|
+
server_info = ApiServerInfo(status=ApiServerStatus(server_status),
|
|
332
423
|
api_version=api_version,
|
|
333
424
|
version=version,
|
|
334
425
|
version_on_disk=version_on_disk,
|
sky/server/constants.py
CHANGED
|
@@ -10,7 +10,7 @@ from sky.skylet import constants
|
|
|
10
10
|
# based on version info is needed.
|
|
11
11
|
# For more details and code guidelines, refer to:
|
|
12
12
|
# https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
|
|
13
|
-
API_VERSION =
|
|
13
|
+
API_VERSION = 14
|
|
14
14
|
|
|
15
15
|
# The minimum peer API version that the code should still work with.
|
|
16
16
|
# Notes (dev):
|
sky/server/daemons.py
CHANGED
|
@@ -14,6 +14,10 @@ from sky.utils import ux_utils
|
|
|
14
14
|
logger = sky_logging.init_logger(__name__)
|
|
15
15
|
|
|
16
16
|
|
|
17
|
+
def _default_should_skip():
|
|
18
|
+
return False
|
|
19
|
+
|
|
20
|
+
|
|
17
21
|
@dataclasses.dataclass
|
|
18
22
|
class InternalRequestDaemon:
|
|
19
23
|
"""Internal daemon that runs an event in the background."""
|
|
@@ -22,6 +26,7 @@ class InternalRequestDaemon:
|
|
|
22
26
|
name: str
|
|
23
27
|
event_fn: Callable[[], None]
|
|
24
28
|
default_log_level: str = 'INFO'
|
|
29
|
+
should_skip: Callable[[], bool] = _default_should_skip
|
|
25
30
|
|
|
26
31
|
def refresh_log_level(self) -> int:
|
|
27
32
|
# pylint: disable=import-outside-toplevel
|
|
@@ -110,14 +115,14 @@ def managed_job_status_refresh_event():
|
|
|
110
115
|
"""Refresh the managed job status for controller consolidation mode."""
|
|
111
116
|
# pylint: disable=import-outside-toplevel
|
|
112
117
|
from sky.jobs import utils as managed_job_utils
|
|
113
|
-
|
|
114
|
-
|
|
118
|
+
from sky.utils import controller_utils
|
|
119
|
+
|
|
115
120
|
# We run the recovery logic before starting the event loop as those two are
|
|
116
121
|
# conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
|
|
117
|
-
from sky.utils import controller_utils
|
|
118
122
|
if controller_utils.high_availability_specified(
|
|
119
123
|
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
|
|
120
124
|
managed_job_utils.ha_recovery_for_consolidation_mode()
|
|
125
|
+
|
|
121
126
|
# After recovery, we start the event loop.
|
|
122
127
|
from sky.skylet import events
|
|
123
128
|
refresh_event = events.ManagedJobEvent()
|
|
@@ -128,20 +133,58 @@ def managed_job_status_refresh_event():
|
|
|
128
133
|
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
129
134
|
|
|
130
135
|
|
|
131
|
-
def
|
|
136
|
+
def should_skip_managed_job_status_refresh():
    """Tell whether the managed job status refresh event should be skipped.

    Returns:
        True unless ``managed_job_utils.is_consolidation_mode()`` is truthy.
    """
    # pylint: disable=import-outside-toplevel
    from sky.jobs import utils as managed_job_utils
    consolidation_mode = managed_job_utils.is_consolidation_mode()
    return not consolidation_mode
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _serve_status_refresh_event(pool: bool):
|
|
132
144
|
"""Refresh the sky serve status for controller consolidation mode."""
|
|
133
145
|
# pylint: disable=import-outside-toplevel
|
|
134
146
|
from sky.serve import serve_utils
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
#
|
|
147
|
+
from sky.utils import controller_utils
|
|
148
|
+
|
|
149
|
+
# We run the recovery logic before starting the event loop as those two are
|
|
150
|
+
# conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
|
|
151
|
+
controller = controller_utils.get_controller_for_pool(pool)
|
|
152
|
+
if controller_utils.high_availability_specified(
|
|
153
|
+
controller.value.cluster_name):
|
|
154
|
+
serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
|
|
155
|
+
|
|
156
|
+
# After recovery, we start the event loop.
|
|
138
157
|
from sky.skylet import events
|
|
139
|
-
event = events.ServiceUpdateEvent()
|
|
140
|
-
|
|
158
|
+
event = events.ServiceUpdateEvent(pool=pool)
|
|
159
|
+
noun = 'pool' if pool else 'serve'
|
|
160
|
+
logger.info(f'=== Running {noun} status refresh event ===')
|
|
141
161
|
event.run()
|
|
142
162
|
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
143
163
|
|
|
144
164
|
|
|
165
|
+
def _should_skip_serve_status_refresh_event(pool: bool):
    """Tell whether the serve status refresh event should be skipped.

    Args:
        pool: Forwarded as ``pool`` to ``serve_utils.is_consolidation_mode``.

    Returns:
        True unless ``serve_utils.is_consolidation_mode(pool=pool)`` is
        truthy.
    """
    # pylint: disable=import-outside-toplevel
    from sky.serve import serve_utils
    consolidation_mode = serve_utils.is_consolidation_mode(pool=pool)
    return not consolidation_mode
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def sky_serve_status_refresh_event():
    """Run the serve status refresh event with ``pool=False``."""
    _serve_status_refresh_event(pool=False)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def should_skip_sky_serve_status_refresh():
    """Return the serve status refresh skip decision for ``pool=False``."""
    return _should_skip_serve_status_refresh_event(pool=False)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def pool_status_refresh_event():
    """Run the serve status refresh event with ``pool=True``."""
    _serve_status_refresh_event(pool=True)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def should_skip_pool_status_refresh():
    """Return the serve status refresh skip decision for ``pool=True``."""
    return _should_skip_serve_status_refresh_event(pool=True)
|
|
186
|
+
|
|
187
|
+
|
|
145
188
|
# Register the events to run in the background.
|
|
146
189
|
INTERNAL_REQUEST_DAEMONS = [
|
|
147
190
|
# This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
|
|
@@ -157,8 +200,14 @@ INTERNAL_REQUEST_DAEMONS = [
|
|
|
157
200
|
event_fn=refresh_volume_status_event),
|
|
158
201
|
InternalRequestDaemon(id='managed-job-status-refresh-daemon',
|
|
159
202
|
name='managed-job-status',
|
|
160
|
-
event_fn=managed_job_status_refresh_event
|
|
203
|
+
event_fn=managed_job_status_refresh_event,
|
|
204
|
+
should_skip=should_skip_managed_job_status_refresh),
|
|
161
205
|
InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
|
|
162
206
|
name='sky-serve-status',
|
|
163
|
-
event_fn=sky_serve_status_refresh_event
|
|
207
|
+
event_fn=sky_serve_status_refresh_event,
|
|
208
|
+
should_skip=should_skip_sky_serve_status_refresh),
|
|
209
|
+
InternalRequestDaemon(id='pool-status-refresh-daemon',
|
|
210
|
+
name='pool-status',
|
|
211
|
+
event_fn=pool_status_refresh_event,
|
|
212
|
+
should_skip=should_skip_pool_status_refresh),
|
|
164
213
|
]
|