skypilot-nightly 1.0.0.dev20250802__py3-none-any.whl → 1.0.0.dev20250806__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/catalog/kubernetes_catalog.py +8 -0
- sky/catalog/nebius_catalog.py +0 -1
- sky/client/cli/command.py +32 -13
- sky/client/sdk.py +16 -8
- sky/client/sdk.pyi +6 -5
- sky/client/sdk_async.py +811 -0
- sky/clouds/kubernetes.py +6 -1
- sky/clouds/nebius.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +11 -0
- sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-2a43ea3241bbdacd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa63e8b1d203f298.js → [job]-7cb24da04ca00956.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9e7df5fc761c95a7.js → [cluster]-1e95993124dbfc57.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-47f1ddae13a2f8e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-2a44e70b500b6b70.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-22faac9325016d83.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-b90c865a690bfe84.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-7af733f5d7b6ed1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-4d41c9023287f59a.js → [name]-35e0de5bca55e594.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/client/sdk_async.py +135 -0
- sky/jobs/utils.py +3 -1
- sky/provision/kubernetes/utils.py +62 -6
- sky/provision/nebius/instance.py +1 -0
- sky/provision/nebius/utils.py +9 -1
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +2 -1
- sky/serve/controller.py +2 -1
- sky/serve/load_balancer.py +3 -1
- sky/serve/serve_state.py +70 -5
- sky/serve/serve_utils.py +124 -22
- sky/serve/server/impl.py +22 -21
- sky/serve/service.py +8 -1
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +46 -0
- sky/server/auth/oauth2_proxy.py +185 -0
- sky/server/common.py +108 -17
- sky/server/constants.py +1 -1
- sky/server/daemons.py +60 -11
- sky/server/rest.py +114 -0
- sky/server/server.py +44 -40
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +5 -1
- sky/skylet/skylet.py +3 -1
- sky/task.py +43 -10
- sky/templates/kubernetes-ray.yml.j2 +4 -0
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/utils/controller_utils.py +7 -0
- sky/utils/rich_utils.py +120 -0
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/METADATA +5 -1
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/RECORD +87 -82
- sky/dashboard/out/_next/static/2JNCZ4daQBotwWRNGi6aE/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
- sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
- /sky/dashboard/out/_next/static/{2JNCZ4daQBotwWRNGi6aE → Gelsd19kVxXcX7aQQGsGu}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-7e17c195296e2ea9.js → 1871-ced1c14230cad6e1.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-2d7ed3350659d073.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{6601-234b1cf963c7280b.js → 6601-2109d22e7861861c.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-40d15b6261ec8dc1.js → 938-bda2685db5eae6cf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/top_level.txt +0 -0
sky/server/rest.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""REST API client of SkyPilot API server"""
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import contextlib
|
|
4
5
|
import contextvars
|
|
5
6
|
import functools
|
|
@@ -21,9 +22,11 @@ from sky.utils import ux_utils
|
|
|
21
22
|
logger = sky_logging.init_logger(__name__)
|
|
22
23
|
|
|
23
24
|
if typing.TYPE_CHECKING:
|
|
25
|
+
import aiohttp
|
|
24
26
|
import requests
|
|
25
27
|
|
|
26
28
|
else:
|
|
29
|
+
aiohttp = adaptors_common.LazyImport('aiohttp')
|
|
27
30
|
requests = adaptors_common.LazyImport('requests')
|
|
28
31
|
|
|
29
32
|
F = TypeVar('F', bound=Callable[..., Any])
|
|
@@ -204,3 +207,114 @@ def request_without_retry(method, url, **kwargs) -> 'requests.Response':
|
|
|
204
207
|
if remote_version is not None:
|
|
205
208
|
versions.set_remote_version(remote_version)
|
|
206
209
|
return response
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# Async versions of the above functions
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
async def request_async(session: 'aiohttp.ClientSession', method: str, url: str,
                        **kwargs) -> 'aiohttp.ClientResponse':
    """Send an async request to the API server, retrying transient failures.

    Up to three attempts are made through ``request_without_retry_async``.
    An interrupted request is retried immediately; any other error is retried
    after a backoff, and only when ``_is_transient_error_async`` classifies
    it as transient.

    Args:
        session: The aiohttp client session used to issue the request.
        method: HTTP method passed through to the session.
        url: URL passed through to the session.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        The response of the first attempt that succeeds.

    Raises:
        exceptions.RequestInterruptedError: If the final attempt (and hence
            the whole retry budget) ended with an interrupted request.
        Exception: The error of the last attempt, or the first error that is
            not considered transient.
    """
    max_retries = 3
    initial_backoff = 1.0
    max_backoff_factor = 5

    backoff = common_utils.Backoff(initial_backoff, max_backoff_factor)
    # Placeholder only; every failing attempt below overwrites it.
    last_exception: Exception = Exception('Unknown Exception')

    for retry_count in range(max_retries):
        try:
            return await request_without_retry_async(session, method, url,
                                                     **kwargs)
        except exceptions.RequestInterruptedError as e:
            # Remember the interruption: if this was the last attempt, the
            # loop simply ends and this is the error that must surface
            # (instead of the placeholder or a stale earlier error).
            last_exception = e
            logger.debug('Request interrupted. Retry immediately.')
            continue
        except Exception as e:  # pylint: disable=broad-except
            last_exception = e
            if retry_count >= max_retries - 1:
                # Retries exhausted
                raise

            # Check if this is a transient error (similar to sync version
            # logic)
            if not _is_transient_error_async(e):
                # Permanent error, no need to retry
                raise

            logger.debug(f'Retry async request due to {e}, '
                         f'attempt {retry_count + 1}/{max_retries}')
            await asyncio.sleep(backoff.current_backoff())

    # Reached only when the last attempt was interrupted (the other failure
    # paths re-raise inside the loop).
    raise last_exception
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
async def request_without_retry_async(session: 'aiohttp.ClientSession',
                                      method: str, url: str,
                                      **kwargs) -> 'aiohttp.ClientResponse':
    """Send a single async request to the API server without retry.

    Adds the local API version headers to the request, and records the
    server's versions from the response headers when they are present.

    Args:
        session: The aiohttp client session used to issue the request.
        method: HTTP method passed through to the session.
        url: URL passed through to the session.
        **kwargs: Extra keyword arguments forwarded to ``session.request``;
            a ``headers`` entry, if given, is extended with the version
            headers.

    Returns:
        The aiohttp response object.

    Raises:
        exceptions.ServerTemporarilyUnavailableError: On a 503 response.
        exceptions.RequestInterruptedError: When connecting fails or the
            request times out.
        aiohttp.ClientError: Any other client error, propagated unchanged.
    """
    # Add API version headers for compatibility (like sync version does)
    if 'headers' not in kwargs:
        kwargs['headers'] = {}
    kwargs['headers'][constants.API_VERSION_HEADER] = str(constants.API_VERSION)
    kwargs['headers'][constants.VERSION_HEADER] = (
        versions.get_local_readable_version())

    try:
        response = await session.request(method, url, **kwargs)

        # Handle server unavailability (503 status) - same as sync version
        if response.status == 503:
            # The response is discarded here, so hand its connection back
            # before raising instead of leaving it to garbage collection.
            response.release()
            with ux_utils.print_exception_no_traceback():
                raise exceptions.ServerTemporarilyUnavailableError(
                    'SkyPilot API server is temporarily unavailable. '
                    'Please try again later.')

        # Set remote API version and version from headers - same as sync
        # version
        remote_api_version = response.headers.get(constants.API_VERSION_HEADER)
        remote_version = response.headers.get(constants.VERSION_HEADER)
        if remote_api_version is not None:
            versions.set_remote_api_version(int(remote_api_version))
        if remote_version is not None:
            versions.set_remote_version(remote_version)

        return response

    except asyncio.TimeoutError as e:
        # aiohttp reports timeouts as asyncio.TimeoutError (its own timeout
        # errors subclass it). aiohttp.ClientTimeout is the timeout
        # *settings* class, not an exception, so it can never match here.
        raise exceptions.RequestInterruptedError(
            f'Request timeout: {e}') from e
    except aiohttp.ClientConnectorError as e:
        # Convert connection failures to the SkyPilot exception; all other
        # aiohttp.ClientError subclasses propagate unchanged.
        raise exceptions.RequestInterruptedError(
            f'Connection failed: {e}') from e
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _is_transient_error_async(e: Exception) -> bool:
    """Check if an exception from async request is transient and should be
    retried.

    Mirrors the logic from the sync version's is_transient_error().

    Args:
        e: The exception raised by a request attempt.

    Returns:
        False only for an ``aiohttp.ClientResponseError`` whose status is
        below 500; True for every other exception.
    """
    # For response errors, only server errors (5xx) are considered
    # transient (same as sync version).
    if isinstance(e, aiohttp.ClientResponseError):
        return e.status >= 500

    # Consider connection errors and timeouts as transient. Timeouts are
    # raised as asyncio.TimeoutError; aiohttp.ClientTimeout is a settings
    # class rather than an exception and would never match.
    if isinstance(e, (aiohttp.ClientConnectorError, asyncio.TimeoutError)):
        return True

    # Consider server temporarily unavailable as transient
    if isinstance(e, exceptions.ServerTemporarilyUnavailableError):
        return True

    # It is hard to enumerate all other errors that are transient, e.g.
    # broken pipe, connection refused, etc. Instead, it is safer to assume
    # all other errors might be transient since we only retry for 3 times
    # by default. (Same comment as in sync version)
    return True
|
sky/server/server.py
CHANGED
|
@@ -51,6 +51,8 @@ from sky.server import metrics
|
|
|
51
51
|
from sky.server import state
|
|
52
52
|
from sky.server import stream_utils
|
|
53
53
|
from sky.server import versions
|
|
54
|
+
from sky.server.auth import authn
|
|
55
|
+
from sky.server.auth import oauth2_proxy
|
|
54
56
|
from sky.server.requests import executor
|
|
55
57
|
from sky.server.requests import payloads
|
|
56
58
|
from sky.server.requests import preconditions
|
|
@@ -120,41 +122,6 @@ def _basic_auth_401_response(content: str):
|
|
|
120
122
|
content=content)
|
|
121
123
|
|
|
122
124
|
|
|
123
|
-
# TODO(hailong): Remove this function and use request.state.auth_user instead.
|
|
124
|
-
async def _override_user_info_in_request_body(request: fastapi.Request,
|
|
125
|
-
auth_user: Optional[models.User]):
|
|
126
|
-
if auth_user is None:
|
|
127
|
-
return
|
|
128
|
-
|
|
129
|
-
body = await request.body()
|
|
130
|
-
if body:
|
|
131
|
-
try:
|
|
132
|
-
original_json = await request.json()
|
|
133
|
-
except (json.JSONDecodeError, UnicodeDecodeError) as e:
|
|
134
|
-
logger.error(f'Error parsing request JSON: {e}')
|
|
135
|
-
else:
|
|
136
|
-
logger.debug(f'Overriding user for {request.state.request_id}: '
|
|
137
|
-
f'{auth_user.name}, {auth_user.id}')
|
|
138
|
-
if 'env_vars' in original_json:
|
|
139
|
-
if isinstance(original_json.get('env_vars'), dict):
|
|
140
|
-
original_json['env_vars'][
|
|
141
|
-
constants.USER_ID_ENV_VAR] = auth_user.id
|
|
142
|
-
original_json['env_vars'][
|
|
143
|
-
constants.USER_ENV_VAR] = auth_user.name
|
|
144
|
-
else:
|
|
145
|
-
logger.warning(
|
|
146
|
-
f'"env_vars" in request body is not a dictionary '
|
|
147
|
-
f'for request {request.state.request_id}. '
|
|
148
|
-
'Skipping user info injection into body.')
|
|
149
|
-
else:
|
|
150
|
-
original_json['env_vars'] = {}
|
|
151
|
-
original_json['env_vars'][
|
|
152
|
-
constants.USER_ID_ENV_VAR] = auth_user.id
|
|
153
|
-
original_json['env_vars'][
|
|
154
|
-
constants.USER_ENV_VAR] = auth_user.name
|
|
155
|
-
request._body = json.dumps(original_json).encode('utf-8') # pylint: disable=protected-access
|
|
156
|
-
|
|
157
|
-
|
|
158
125
|
def _try_set_basic_auth_user(request: fastapi.Request):
|
|
159
126
|
auth_header = request.headers.get('authorization')
|
|
160
127
|
if not auth_header or not auth_header.lower().startswith('basic '):
|
|
@@ -281,7 +248,7 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
281
248
|
apr_md5_crypt.verify(password, user.password)):
|
|
282
249
|
valid_user = True
|
|
283
250
|
request.state.auth_user = user
|
|
284
|
-
await
|
|
251
|
+
await authn.override_user_info_in_request_body(request, user)
|
|
285
252
|
break
|
|
286
253
|
if not valid_user:
|
|
287
254
|
return _basic_auth_401_response('Invalid credentials')
|
|
@@ -400,7 +367,7 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
400
367
|
request.state.auth_user = auth_user
|
|
401
368
|
|
|
402
369
|
# Override user info in request body for service account requests
|
|
403
|
-
await
|
|
370
|
+
await authn.override_user_info_in_request_body(request, auth_user)
|
|
404
371
|
|
|
405
372
|
logger.debug(f'Authenticated service account: {user_id}')
|
|
406
373
|
|
|
@@ -445,7 +412,7 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
445
412
|
if auth_user is not None:
|
|
446
413
|
request.state.auth_user = auth_user
|
|
447
414
|
|
|
448
|
-
await
|
|
415
|
+
await authn.override_user_info_in_request_body(request, auth_user)
|
|
449
416
|
return await call_next(request)
|
|
450
417
|
|
|
451
418
|
|
|
@@ -484,6 +451,8 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
484
451
|
del app # unused
|
|
485
452
|
# Startup: Run background tasks
|
|
486
453
|
for event in daemons.INTERNAL_REQUEST_DAEMONS:
|
|
454
|
+
if event.should_skip():
|
|
455
|
+
continue
|
|
487
456
|
try:
|
|
488
457
|
executor.schedule_request(
|
|
489
458
|
request_id=event.id,
|
|
@@ -625,6 +594,8 @@ app.add_middleware(
|
|
|
625
594
|
# RBACMiddleware must precede all the auth middleware, so it can access
|
|
626
595
|
# request.state.auth_user.
|
|
627
596
|
app.add_middleware(RBACMiddleware)
|
|
597
|
+
# Authentication based on oauth2-proxy.
|
|
598
|
+
app.add_middleware(oauth2_proxy.OAuth2ProxyMiddleware)
|
|
628
599
|
# AuthProxyMiddleware should precede BasicAuthMiddleware and
|
|
629
600
|
# BearerTokenMiddleware, since it should be skipped if either of those set the
|
|
630
601
|
# auth user.
|
|
@@ -1574,9 +1545,42 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
|
|
|
1574
1545
|
- commit: str; The commit hash of SkyPilot used for API server.
|
|
1575
1546
|
"""
|
|
1576
1547
|
user = request.state.auth_user
|
|
1577
|
-
|
|
1548
|
+
server_status = common.ApiServerStatus.HEALTHY
|
|
1549
|
+
if getattr(request.state, 'anonymous_user', False):
|
|
1550
|
+
# API server authentication is enabled, but the request is not
|
|
1551
|
+
# authenticated. We still have to serve the request because the
|
|
1552
|
+
# /api/health endpoint has two different usage:
|
|
1553
|
+
# 1. For health check from `api start` and external orchestration
|
|
1554
|
+
# tools (k8s), which does not require authentication and user info.
|
|
1555
|
+
# 2. Return server info to client and hint client to login if required.
|
|
1556
|
+
# Separating these two usage to different APIs will break backward
|
|
1557
|
+
# compatibility for existing orchestration solutions (e.g. helm chart).
|
|
1558
|
+
# So we serve these two usages in a backward compatible manner below.
|
|
1559
|
+
client_version = versions.get_remote_api_version()
|
|
1560
|
+
# - For Client with API version >= 14, we return 200 response with
|
|
1561
|
+
# status=NEEDS_AUTH, new client will handle the login process.
|
|
1562
|
+
# - For health check from `sky api start`, the client code always uses
|
|
1563
|
+
# the same API version with the server, thus there is no compatibility
|
|
1564
|
+
# issue.
|
|
1565
|
+
server_status = common.ApiServerStatus.NEEDS_AUTH
|
|
1566
|
+
if client_version is None:
|
|
1567
|
+
# - For health check from orchestration tools (e.g. k8s), we also
|
|
1568
|
+
# return 200 with status=NEEDS_AUTH, which passes HTTP probe
|
|
1569
|
+
# check.
|
|
1570
|
+
# - There is no harm when a malicious client calls /api/health
|
|
1571
|
+
# without authentication since no sensitive information is
|
|
1572
|
+
# returned.
|
|
1573
|
+
return {'status': common.ApiServerStatus.HEALTHY}
|
|
1574
|
+
# TODO(aylei): remove this after min_compatible_api_version >= 14.
|
|
1575
|
+
if client_version < 14:
|
|
1576
|
+
# For Client with API version < 14, the NEEDS_AUTH status is not
|
|
1577
|
+
# honored. Return 401 to trigger the login process.
|
|
1578
|
+
raise fastapi.HTTPException(status_code=401,
|
|
1579
|
+
detail='Authentication required')
|
|
1580
|
+
|
|
1581
|
+
logger.debug(f'Health endpoint: request.state.auth_user = {user}')
|
|
1578
1582
|
return {
|
|
1579
|
-
'status':
|
|
1583
|
+
'status': server_status,
|
|
1580
1584
|
# Kept for backward compatibility, clients before 0.11.0 will read this
|
|
1581
1585
|
# field to check compatibility and hint the user to upgrade the CLI.
|
|
1582
1586
|
# TODO(aylei): remove this field after 0.13.0
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -69,6 +69,7 @@ install_requires = [
|
|
|
69
69
|
'gitpython',
|
|
70
70
|
'types-paramiko',
|
|
71
71
|
'alembic',
|
|
72
|
+
'aiohttp',
|
|
72
73
|
]
|
|
73
74
|
|
|
74
75
|
server_dependencies = [
|
|
@@ -76,6 +77,7 @@ server_dependencies = [
|
|
|
76
77
|
'sqlalchemy_adapter',
|
|
77
78
|
'passlib',
|
|
78
79
|
'pyjwt',
|
|
80
|
+
'aiohttp',
|
|
79
81
|
]
|
|
80
82
|
|
|
81
83
|
local_ray = [
|
sky/skylet/constants.py
CHANGED
|
@@ -369,6 +369,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
|
369
369
|
('docker', 'run_options'),
|
|
370
370
|
('nvidia_gpus', 'disable_ecc'),
|
|
371
371
|
('ssh', 'pod_config'),
|
|
372
|
+
('kubernetes', 'custom_metadata'),
|
|
372
373
|
('kubernetes', 'pod_config'),
|
|
373
374
|
('kubernetes', 'provision_timeout'),
|
|
374
375
|
('kubernetes', 'dws'),
|
|
@@ -405,7 +406,7 @@ PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
|
|
|
405
406
|
PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
|
|
406
407
|
'~/.sky/.controller_recovery_restarting_signal')
|
|
407
408
|
|
|
408
|
-
HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/ha_recovery.log'
|
|
409
|
+
HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/{}ha_recovery.log'
|
|
409
410
|
|
|
410
411
|
# The placeholder for the local skypilot config path in file mounts for
|
|
411
412
|
# controllers.
|
sky/skylet/events.py
CHANGED
|
@@ -96,8 +96,12 @@ class ServiceUpdateEvent(SkyletEvent):
|
|
|
96
96
|
"""
|
|
97
97
|
EVENT_INTERVAL_SECONDS = 300
|
|
98
98
|
|
|
99
|
+
def __init__(self, pool: bool) -> None:
    """Initialize the event and remember the ``pool`` flag.

    Args:
        pool: Flag stored on the instance; ``_run`` forwards it to
            ``serve_utils.update_service_status``.
    """
    super().__init__()
    # Kept private; read back on every run of the event.
    self._pool = pool
|
|
102
|
+
|
|
99
103
|
def _run(self):
|
|
100
|
-
serve_utils.update_service_status()
|
|
104
|
+
serve_utils.update_service_status(self._pool)
|
|
101
105
|
|
|
102
106
|
|
|
103
107
|
class UsageHeartbeatReportEvent(SkyletEvent):
|
sky/skylet/skylet.py
CHANGED
|
@@ -24,7 +24,9 @@ EVENTS = [
|
|
|
24
24
|
# This is for monitoring controller job status. If it becomes
|
|
25
25
|
# unhealthy, this event will correctly update the controller
|
|
26
26
|
# status to CONTROLLER_FAILED.
|
|
27
|
-
events.ServiceUpdateEvent(),
|
|
27
|
+
events.ServiceUpdateEvent(pool=False),
|
|
28
|
+
# Status refresh for pool.
|
|
29
|
+
events.ServiceUpdateEvent(pool=True),
|
|
28
30
|
# Report usage heartbeat every 10 minutes.
|
|
29
31
|
events.UsageHeartbeatReportEvent(),
|
|
30
32
|
]
|
sky/task.py
CHANGED
|
@@ -247,15 +247,20 @@ class Task:
|
|
|
247
247
|
secrets: Optional[Dict[str, str]] = None,
|
|
248
248
|
workdir: Optional[Union[str, Dict[str, Any]]] = None,
|
|
249
249
|
num_nodes: Optional[int] = None,
|
|
250
|
+
file_mounts: Optional[Dict[str, str]] = None,
|
|
251
|
+
storage_mounts: Optional[Dict[str, storage_lib.Storage]] = None,
|
|
250
252
|
volumes: Optional[Dict[str, str]] = None,
|
|
253
|
+
resources: Optional[Union['resources_lib.Resources',
|
|
254
|
+
List['resources_lib.Resources'],
|
|
255
|
+
Set['resources_lib.Resources']]] = None,
|
|
251
256
|
# Advanced:
|
|
252
257
|
docker_image: Optional[str] = None,
|
|
253
258
|
event_callback: Optional[str] = None,
|
|
254
259
|
blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
|
|
255
260
|
# Internal use only.
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
261
|
+
_file_mounts_mapping: Optional[Dict[str, str]] = None,
|
|
262
|
+
_volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
|
|
263
|
+
_metadata: Optional[Dict[str, Any]] = None,
|
|
259
264
|
_user_specified_yaml: Optional[str] = None,
|
|
260
265
|
):
|
|
261
266
|
"""Initializes a Task.
|
|
@@ -315,11 +320,31 @@ class Task:
|
|
|
315
320
|
setup/run command, where ``run`` can either be a str, meaning all
|
|
316
321
|
nodes get the same command, or a lambda, with the semantics
|
|
317
322
|
documented above.
|
|
323
|
+
file_mounts: An optional dict of ``{remote_path: (local_path|cloud
|
|
324
|
+
URI)}``, where remote means the VM(s) on which this Task will
|
|
325
|
+
eventually run on, and local means the node from which the task is
|
|
326
|
+
launched.
|
|
327
|
+
storage_mounts: an optional dict of ``{mount_path: sky.Storage
|
|
328
|
+
object}``, where mount_path is the path inside the remote VM(s)
|
|
329
|
+
where the Storage object will be mounted on.
|
|
330
|
+
volumes: A dict of volumes to be mounted for the task. The dict has
|
|
331
|
+
the form of ``{mount_path: volume_name}``.
|
|
332
|
+
resources: either a sky.Resources, a set of them, or a list of them.
|
|
333
|
+
A set or a list of resources asks the optimizer to "pick the
|
|
334
|
+
best of these resources" to run this task.
|
|
318
335
|
docker_image: (EXPERIMENTAL: Only in effect when LocalDockerBackend
|
|
319
336
|
is used.) The base docker image that this Task will be built on.
|
|
320
337
|
Defaults to 'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04'.
|
|
338
|
+
event_callback: A bash script that will be executed when the task
|
|
339
|
+
changes state.
|
|
321
340
|
blocked_resources: A set of resources that this task cannot run on.
|
|
322
|
-
|
|
341
|
+
_file_mounts_mapping: (Internal use only) A dictionary of file mounts
|
|
342
|
+
mapping.
|
|
343
|
+
_volume_mounts: (Internal use only) A list of volume mounts.
|
|
344
|
+
_metadata: (Internal use only) A dictionary of metadata to be added to
|
|
345
|
+
the task.
|
|
346
|
+
_user_specified_yaml: (Internal use only) A string of user-specified
|
|
347
|
+
YAML config.
|
|
323
348
|
"""
|
|
324
349
|
self.name = name
|
|
325
350
|
self.run = run
|
|
@@ -372,11 +397,19 @@ class Task:
|
|
|
372
397
|
self.best_resources: Optional[sky.Resources] = None
|
|
373
398
|
|
|
374
399
|
# For internal use only.
|
|
375
|
-
self.file_mounts_mapping: Optional[Dict[str,
|
|
400
|
+
self.file_mounts_mapping: Optional[Dict[str,
|
|
401
|
+
str]] = _file_mounts_mapping
|
|
376
402
|
self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
|
|
377
|
-
|
|
403
|
+
_volume_mounts)
|
|
404
|
+
|
|
405
|
+
self._metadata = _metadata if _metadata is not None else {}
|
|
378
406
|
|
|
379
|
-
|
|
407
|
+
if resources is not None:
|
|
408
|
+
self.set_resources(resources)
|
|
409
|
+
if storage_mounts is not None:
|
|
410
|
+
self.set_storage_mounts(storage_mounts)
|
|
411
|
+
if file_mounts is not None:
|
|
412
|
+
self.set_file_mounts(file_mounts)
|
|
380
413
|
|
|
381
414
|
dag = sky.dag.get_current_dag()
|
|
382
415
|
if dag is not None:
|
|
@@ -621,10 +654,10 @@ class Task:
|
|
|
621
654
|
num_nodes=config.pop('num_nodes', None),
|
|
622
655
|
envs=config.pop('envs', None),
|
|
623
656
|
secrets=config.pop('secrets', None),
|
|
624
|
-
event_callback=config.pop('event_callback', None),
|
|
625
|
-
file_mounts_mapping=config.pop('file_mounts_mapping', None),
|
|
626
657
|
volumes=config.pop('volumes', None),
|
|
627
|
-
|
|
658
|
+
event_callback=config.pop('event_callback', None),
|
|
659
|
+
_file_mounts_mapping=config.pop('file_mounts_mapping', None),
|
|
660
|
+
_metadata=config.pop('_metadata', None),
|
|
628
661
|
_user_specified_yaml=user_specified_yaml,
|
|
629
662
|
)
|
|
630
663
|
|
|
@@ -777,6 +777,10 @@ available_node_types:
|
|
|
777
777
|
{{ ray_installation_commands }}
|
|
778
778
|
|
|
779
779
|
VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
|
|
780
|
+
# Apply Ray patches for progress bar fix
|
|
781
|
+
~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
|
|
782
|
+
VIRTUAL_ENV=~/skypilot-runtime python -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
|
|
783
|
+
}
|
|
780
784
|
touch /tmp/ray_skypilot_installation_complete
|
|
781
785
|
echo "=== Ray and skypilot installation completed ==="
|
|
782
786
|
|
sky/templates/nebius-ray.yml.j2
CHANGED
sky/utils/controller_utils.py
CHANGED
|
@@ -209,6 +209,13 @@ class Controllers(enum.Enum):
|
|
|
209
209
|
return None
|
|
210
210
|
|
|
211
211
|
|
|
212
|
+
def get_controller_for_pool(pool: bool) -> Controllers:
    """Map the ``pool`` flag to a controller type.

    Args:
        pool: Selects which controller is returned.

    Returns:
        ``Controllers.JOBS_CONTROLLER`` when ``pool`` is truthy, otherwise
        ``Controllers.SKY_SERVE_CONTROLLER``.
    """
    return (Controllers.JOBS_CONTROLLER
            if pool else Controllers.SKY_SERVE_CONTROLLER)
|
|
217
|
+
|
|
218
|
+
|
|
212
219
|
def high_availability_specified(cluster_name: Optional[str]) -> bool:
|
|
213
220
|
"""Check if the controller high availability is specified in user config.
|
|
214
221
|
"""
|
sky/utils/rich_utils.py
CHANGED
|
@@ -15,11 +15,13 @@ from sky.utils import message_utils
|
|
|
15
15
|
from sky.utils import rich_console_utils
|
|
16
16
|
|
|
17
17
|
if typing.TYPE_CHECKING:
|
|
18
|
+
import aiohttp
|
|
18
19
|
import requests
|
|
19
20
|
import rich.console as rich_console
|
|
20
21
|
else:
|
|
21
22
|
requests = adaptors_common.LazyImport('requests')
|
|
22
23
|
rich_console = adaptors_common.LazyImport('rich.console')
|
|
24
|
+
aiohttp = adaptors_common.LazyImport('aiohttp')
|
|
23
25
|
|
|
24
26
|
GeneralStatus = Union['rich_console.Status', 'EncodedStatus']
|
|
25
27
|
|
|
@@ -398,3 +400,121 @@ def decode_rich_status(
|
|
|
398
400
|
finally:
|
|
399
401
|
if decoding_status is not None:
|
|
400
402
|
decoding_status.__exit__(None, None, None)
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
async def decode_rich_status_async(
    response: 'aiohttp.ClientResponse'
) -> typing.AsyncIterator[Optional[str]]:
    """Async version of rich_utils.decode_rich_status that decodes rich status
    messages from an aiohttp response.

    The response body is read in chunks of up to 8192 bytes, decoded as UTF-8
    and split into lines. Every complete line is passed through
    message_utils.decode_payload; lines carrying a rich status control
    message drive the client-side status, and all other lines are yielded.

    Args:
        response: The aiohttp response.

    Yields:
        Each complete line that is not a rich status control message. Control
        messages are consumed and never yielded. A trailing fragment that
        does not end with `\\r` or `\\n` is held back until more data arrives
        and is dropped if the stream ends first.

    Raises:
        exceptions.RequestInterruptedError: If a Control.RETRY message is
            received.
    """
    # Imported here so the function does not depend on the module-level
    # import list.
    import codecs  # pylint: disable=import-outside-toplevel

    # An incremental decoder keeps the bytes of a multi-byte character that
    # is split across chunk boundaries, wherever the split falls (including
    # at the very start of a chunk), and replaces genuinely invalid bytes
    # with U+FFFD instead of raising.
    utf8_decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
    decoding_status = None
    try:
        last_line = ''

        # Iterate over the response content in chunks
        async for chunk in response.content.iter_chunked(8192):
            if chunk is None:
                return

            encoded_msg = utf8_decoder.decode(chunk)
            lines = encoded_msg.splitlines(keepends=True)

            # Skip processing if lines is empty to avoid IndexError
            if not lines:
                continue

            lines[0] = last_line + lines[0]
            last_line = lines[-1]
            # If the last line does not end with `\r` or `\n` (with ending
            # spaces stripped), it is not a complete line yet. We keep it in
            # the buffer and continue.
            if last_line.strip(' ').endswith(('\r', '\n')):
                # The last line is complete, so reset the buffer.
                last_line = ''
            else:
                lines = lines[:-1]

            for line in lines:
                if line.endswith('\r\n'):
                    # Replace `\r\n` with `\n`, as printing a line ends with
                    # `\r\n` in linux will cause the line to be empty.
                    line = line[:-2] + '\n'
                is_payload, line = message_utils.decode_payload(
                    line, raise_for_mismatch=False)
                control = None
                if is_payload:
                    control, encoded_status = Control.decode(line)
                if control is None:
                    yield line
                    continue

                if control == Control.RETRY:
                    raise exceptions.RequestInterruptedError(
                        'Streaming interrupted. Please retry.')
                # control is not None, i.e. it is a rich status control
                # message. In async context, we'll handle rich status
                # controls normally since async typically runs in main thread
                if control == Control.INIT:
                    decoding_status = client_status(encoded_status)
                    continue
                if decoding_status is None:
                    # status may not be initialized if a user uses --tail for
                    # sky api logs.
                    continue
                if control == Control.UPDATE:
                    decoding_status.update(encoded_status)
                elif control == Control.STOP:
                    decoding_status.stop()
                elif control == Control.EXIT:
                    decoding_status.__exit__(None, None, None)
                elif control == Control.START:
                    decoding_status.start()
                elif control == Control.HEARTBEAT:
                    # Heartbeat is not displayed to the user, so we do not
                    # need to update the status.
                    pass
    finally:
        # Always leave the status context, even if the consumer stops
        # iterating early or an exception propagates.
        if decoding_status is not None:
            decoding_status.__exit__(None, None, None)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: skypilot-nightly
|
|
3
|
-
Version: 1.0.0.dev20250802
|
|
3
|
+
Version: 1.0.0.dev20250806
|
|
4
4
|
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
|
5
5
|
Author: SkyPilot Team
|
|
6
6
|
License: Apache 2.0
|
|
@@ -57,6 +57,7 @@ Requires-Dist: pyjwt
|
|
|
57
57
|
Requires-Dist: gitpython
|
|
58
58
|
Requires-Dist: types-paramiko
|
|
59
59
|
Requires-Dist: alembic
|
|
60
|
+
Requires-Dist: aiohttp
|
|
60
61
|
Provides-Extra: aws
|
|
61
62
|
Requires-Dist: awscli>=1.27.10; extra == "aws"
|
|
62
63
|
Requires-Dist: botocore>=1.29.10; extra == "aws"
|
|
@@ -132,6 +133,7 @@ Requires-Dist: casbin; extra == "server"
|
|
|
132
133
|
Requires-Dist: sqlalchemy_adapter; extra == "server"
|
|
133
134
|
Requires-Dist: passlib; extra == "server"
|
|
134
135
|
Requires-Dist: pyjwt; extra == "server"
|
|
136
|
+
Requires-Dist: aiohttp; extra == "server"
|
|
135
137
|
Provides-Extra: all
|
|
136
138
|
Requires-Dist: awscli>=1.27.10; extra == "all"
|
|
137
139
|
Requires-Dist: botocore>=1.29.10; extra == "all"
|
|
@@ -186,6 +188,7 @@ Requires-Dist: casbin; extra == "all"
|
|
|
186
188
|
Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
187
189
|
Requires-Dist: passlib; extra == "all"
|
|
188
190
|
Requires-Dist: pyjwt; extra == "all"
|
|
191
|
+
Requires-Dist: aiohttp; extra == "all"
|
|
189
192
|
Dynamic: author
|
|
190
193
|
Dynamic: classifier
|
|
191
194
|
Dynamic: description
|
|
@@ -235,6 +238,7 @@ Dynamic: summary
|
|
|
235
238
|
----
|
|
236
239
|
|
|
237
240
|
:fire: *News* :fire:
|
|
241
|
+
- [Aug 2025] Run and serve **OpenAI GPT-OSS models** (gpt-oss-120b, gpt-oss-20b) with one command on any infra: [**example**](./llm/gpt-oss/)
|
|
238
242
|
- [Jul 2025] Run distributed **RL training for LLMs** with Verl (PPO, GRPO) on any cloud: [**example**](./llm/verl/)
|
|
239
243
|
- [Jul 2025] 🎉 SkyPilot v0.10.0 released! [**blog post**](https://blog.skypilot.co/announcing-skypilot-0.10.0/), [**release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.10.0)
|
|
240
244
|
- [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)
|