skypilot-nightly 1.0.0.dev20250630__py3-none-any.whl → 1.0.0.dev20250702__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +3 -3
- sky/catalog/__init__.py +1 -1
- sky/client/cli/command.py +60 -21
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{NdypbqMxaYucRGfopkKXa → N5IdFnjR1RaPGBAVYeTIr}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/9984.b56614f3c4c5961d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1159f362b960e2b8.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-d427db53e54de9ce.js → webpack-9a81ea998672c303.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +4 -0
- sky/jobs/server/core.py +5 -9
- sky/jobs/state.py +820 -670
- sky/jobs/utils.py +7 -15
- sky/metrics/utils.py +210 -0
- sky/optimizer.py +1 -1
- sky/resources.py +145 -7
- sky/server/common.py +1 -0
- sky/server/server.py +117 -22
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +3 -0
- sky/skypilot_config.py +62 -53
- sky/task.py +1 -1
- sky/utils/accelerator_registry.py +28 -1
- sky/utils/dag_utils.py +4 -2
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +9 -4
- {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/METADATA +4 -1
- {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/RECORD +48 -47
- sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b8e1114e6d38218c.js +0 -6
- /sky/dashboard/out/_next/static/{NdypbqMxaYucRGfopkKXa → N5IdFnjR1RaPGBAVYeTIr}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/top_level.txt +0 -0
sky/server/server.py
CHANGED
|
@@ -39,6 +39,7 @@ from sky import models
|
|
|
39
39
|
from sky import sky_logging
|
|
40
40
|
from sky.data import storage_utils
|
|
41
41
|
from sky.jobs.server import server as jobs_rest
|
|
42
|
+
from sky.metrics import utils as metrics_utils
|
|
42
43
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
43
44
|
from sky.serve.server import server as serve_rest
|
|
44
45
|
from sky.server import common
|
|
@@ -218,14 +219,26 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
218
219
|
|
|
219
220
|
|
|
220
221
|
def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
|
|
221
|
-
|
|
222
|
+
header_name = os.environ.get(constants.ENV_VAR_SERVER_AUTH_USER_HEADER,
|
|
223
|
+
'X-Auth-Request-Email')
|
|
224
|
+
if header_name not in request.headers:
|
|
222
225
|
return None
|
|
223
|
-
user_name = request.headers['X-Auth-Request-Email']
|
|
226
|
+
user_name = request.headers[header_name]
|
|
224
227
|
user_hash = hashlib.md5(
|
|
225
228
|
user_name.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
|
|
226
229
|
return models.User(id=user_hash, name=user_name)
|
|
227
230
|
|
|
228
231
|
|
|
232
|
+
class InitializeRequestAuthUserMiddleware(
|
|
233
|
+
starlette.middleware.base.BaseHTTPMiddleware):
|
|
234
|
+
|
|
235
|
+
async def dispatch(self, request: fastapi.Request, call_next):
|
|
236
|
+
# Make sure that request.state.auth_user is set. Otherwise, we may get a
|
|
237
|
+
# KeyError while trying to read it.
|
|
238
|
+
request.state.auth_user = None
|
|
239
|
+
return await call_next(request)
|
|
240
|
+
|
|
241
|
+
|
|
229
242
|
class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
230
243
|
"""Middleware to handle HTTP Basic Auth."""
|
|
231
244
|
|
|
@@ -277,29 +290,51 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
277
290
|
"""Middleware to handle Bearer Token Auth (Service Accounts)."""
|
|
278
291
|
|
|
279
292
|
async def dispatch(self, request: fastapi.Request, call_next):
|
|
280
|
-
|
|
293
|
+
"""Make sure correct bearer token auth is present.
|
|
294
|
+
|
|
295
|
+
1. If the request has the X-Skypilot-Auth-Mode: token header, it must
|
|
296
|
+
have a valid bearer token.
|
|
297
|
+
2. For backwards compatibility, if the request has a Bearer token
|
|
298
|
+
beginning with "sky_" (even if X-Skypilot-Auth-Mode is not present),
|
|
299
|
+
it must be a valid token.
|
|
300
|
+
3. If X-Skypilot-Auth-Mode is not set to "token", and there is no Bearer
|
|
301
|
+
token beginning with "sky_", allow the request to continue.
|
|
302
|
+
|
|
303
|
+
In conjunction with an auth proxy, the idea is to make the auth proxy
|
|
304
|
+
bypass requests with bearer tokens, instead setting the
|
|
305
|
+
X-Skypilot-Auth-Mode header. The auth proxy should either validate the
|
|
306
|
+
auth or set the header X-Skypilot-Auth-Mode: token.
|
|
307
|
+
"""
|
|
308
|
+
has_skypilot_auth_header = (
|
|
309
|
+
request.headers.get('X-Skypilot-Auth-Mode') == 'token')
|
|
281
310
|
auth_header = request.headers.get('authorization')
|
|
282
|
-
|
|
311
|
+
has_bearer_token_starting_with_sky = (
|
|
312
|
+
auth_header and auth_header.lower().startswith('bearer ') and
|
|
313
|
+
auth_header.split(' ', 1)[1].startswith('sky_'))
|
|
314
|
+
|
|
315
|
+
if (not has_skypilot_auth_header and
|
|
316
|
+
not has_bearer_token_starting_with_sky):
|
|
317
|
+
# This is case #3 above. We do not need to validate the request.
|
|
283
318
|
# No Bearer token, continue with normal processing (OAuth2 cookies,
|
|
284
319
|
# etc.)
|
|
285
320
|
return await call_next(request)
|
|
321
|
+
# After this point, all requests must be validated.
|
|
322
|
+
|
|
323
|
+
if auth_header is None:
|
|
324
|
+
return fastapi.responses.JSONResponse(
|
|
325
|
+
status_code=401, content={'detail': 'Authentication required'})
|
|
286
326
|
|
|
287
327
|
# Extract token
|
|
288
|
-
|
|
328
|
+
split_header = auth_header.split(' ', 1)
|
|
329
|
+
if split_header[0].lower() != 'bearer':
|
|
330
|
+
return fastapi.responses.JSONResponse(
|
|
331
|
+
status_code=401,
|
|
332
|
+
content={'detail': 'Invalid authentication method'})
|
|
333
|
+
sa_token = split_header[1]
|
|
289
334
|
|
|
290
335
|
# Handle SkyPilot service account tokens
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
request, sa_token, call_next)
|
|
294
|
-
|
|
295
|
-
# Handle other Bearer tokens (OAuth2 access tokens, etc.)
|
|
296
|
-
# These requests bypassed OAuth2 proxy, so let the application decide
|
|
297
|
-
# how to handle them
|
|
298
|
-
# For now, we'll let them continue through normal processing
|
|
299
|
-
logger.debug(
|
|
300
|
-
'Non-SkyPilot Bearer token detected, continuing with normal '
|
|
301
|
-
'processing')
|
|
302
|
-
return await call_next(request)
|
|
336
|
+
return await self._handle_service_account_token(request, sa_token,
|
|
337
|
+
call_next)
|
|
303
338
|
|
|
304
339
|
async def _handle_service_account_token(self, request: fastapi.Request,
|
|
305
340
|
sa_token: str, call_next):
|
|
@@ -384,6 +419,18 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
384
419
|
async def dispatch(self, request: fastapi.Request, call_next):
|
|
385
420
|
auth_user = _get_auth_user_header(request)
|
|
386
421
|
|
|
422
|
+
if request.state.auth_user is not None:
|
|
423
|
+
# Previous middleware is trusted more than this middleware. For
|
|
424
|
+
# instance, a client could set the Authorization and the
|
|
425
|
+
# X-Auth-Request-Email header. In that case, the auth proxy will be
|
|
426
|
+
# skipped and we should rely on the Bearer token to authenticate the
|
|
427
|
+
# user - but that means the user could set X-Auth-Request-Email to
|
|
428
|
+
# whatever the user wants. We should thus ignore it.
|
|
429
|
+
if auth_user is not None:
|
|
430
|
+
logger.debug('Warning: ignoring auth proxy header since the '
|
|
431
|
+
'auth user was already set.')
|
|
432
|
+
return await call_next(request)
|
|
433
|
+
|
|
387
434
|
# Add user to database if auth_user is present
|
|
388
435
|
if auth_user is not None:
|
|
389
436
|
newly_added = global_user_state.add_or_update_user(auth_user)
|
|
@@ -394,8 +441,6 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
394
441
|
# Store user info in request.state for access by GET endpoints
|
|
395
442
|
if auth_user is not None:
|
|
396
443
|
request.state.auth_user = auth_user
|
|
397
|
-
else:
|
|
398
|
-
request.state.auth_user = None
|
|
399
444
|
|
|
400
445
|
await _override_user_info_in_request_body(request, auth_user)
|
|
401
446
|
return await call_next(request)
|
|
@@ -514,10 +559,17 @@ class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
514
559
|
|
|
515
560
|
|
|
516
561
|
app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
|
|
562
|
+
# Middleware wraps in the order defined here. E.g., given
|
|
563
|
+
# app.add_middleware(Middleware1)
|
|
564
|
+
# app.add_middleware(Middleware2)
|
|
565
|
+
# app.add_middleware(Middleware3)
|
|
566
|
+
# The effect will be like:
|
|
567
|
+
# Middleware3(Middleware2(Middleware1(request)))
|
|
568
|
+
# If MiddlewareN does something like print(n); call_next(); print(n), you'll get
|
|
569
|
+
# 3; 2; 1; <request>; 1; 2; 3
|
|
517
570
|
# Use environment variable to make the metrics middleware optional.
|
|
518
571
|
if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
|
|
519
572
|
app.add_middleware(metrics.PrometheusMiddleware)
|
|
520
|
-
app.add_middleware(RBACMiddleware)
|
|
521
573
|
app.add_middleware(InternalDashboardPrefixMiddleware)
|
|
522
574
|
app.add_middleware(GracefulShutdownMiddleware)
|
|
523
575
|
app.add_middleware(PathCleanMiddleware)
|
|
@@ -530,15 +582,26 @@ app.add_middleware(
|
|
|
530
582
|
allow_credentials=True,
|
|
531
583
|
allow_methods=['*'],
|
|
532
584
|
allow_headers=['*'],
|
|
533
|
-
# TODO(syang): remove X-Request-ID when v0.10.0 is released.
|
|
585
|
+
# TODO(syang): remove X-Request-ID when v0.10.0 is released.
|
|
534
586
|
expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
|
|
587
|
+
# The order of all the authentication-related middleware is important.
|
|
588
|
+
# RBACMiddleware must precede all the auth middleware, so it can access
|
|
589
|
+
# request.state.auth_user.
|
|
590
|
+
app.add_middleware(RBACMiddleware)
|
|
591
|
+
# AuthProxyMiddleware should precede BasicAuthMiddleware and
|
|
592
|
+
# BearerTokenMiddleware, since it should be skipped if either of those set the
|
|
593
|
+
# auth user.
|
|
594
|
+
app.add_middleware(AuthProxyMiddleware)
|
|
535
595
|
enable_basic_auth = os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false')
|
|
536
596
|
if str(enable_basic_auth).lower() == 'true':
|
|
537
597
|
app.add_middleware(BasicAuthMiddleware)
|
|
538
598
|
# Bearer token middleware should always be present to handle service account
|
|
539
599
|
# authentication
|
|
540
600
|
app.add_middleware(BearerTokenMiddleware)
|
|
541
|
-
|
|
601
|
+
# InitializeRequestAuthUserMiddleware must be the last added middleware so that
|
|
602
|
+
# request.state.auth_user is always set, but can be overridden by the auth
|
|
603
|
+
# middleware above.
|
|
604
|
+
app.add_middleware(InitializeRequestAuthUserMiddleware)
|
|
542
605
|
app.add_middleware(RequestIDMiddleware)
|
|
543
606
|
app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
|
|
544
607
|
app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
|
|
@@ -1554,6 +1617,38 @@ async def all_contexts(request: fastapi.Request) -> None:
|
|
|
1554
1617
|
)
|
|
1555
1618
|
|
|
1556
1619
|
|
|
1620
|
+
@app.get('/gpu-metrics')
|
|
1621
|
+
async def gpu_metrics() -> fastapi.Response:
|
|
1622
|
+
"""Gets the GPU metrics from multiple external k8s clusters"""
|
|
1623
|
+
contexts = core.get_all_contexts()
|
|
1624
|
+
all_metrics = []
|
|
1625
|
+
successful_contexts = 0
|
|
1626
|
+
|
|
1627
|
+
tasks = [
|
|
1628
|
+
asyncio.create_task(metrics_utils.get_metrics_for_context(context))
|
|
1629
|
+
for context in contexts
|
|
1630
|
+
if context != 'in-cluster'
|
|
1631
|
+
]
|
|
1632
|
+
|
|
1633
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
1634
|
+
|
|
1635
|
+
for i, result in enumerate(results):
|
|
1636
|
+
if isinstance(result, Exception):
|
|
1637
|
+
logger.error(
|
|
1638
|
+
f'Failed to get metrics for context {contexts[i]}: {result}')
|
|
1639
|
+
else:
|
|
1640
|
+
metrics_text = result
|
|
1641
|
+
all_metrics.append(metrics_text)
|
|
1642
|
+
successful_contexts += 1
|
|
1643
|
+
|
|
1644
|
+
combined_metrics = '\n\n'.join(all_metrics)
|
|
1645
|
+
|
|
1646
|
+
# Return as plain text for Prometheus compatibility
|
|
1647
|
+
return fastapi.Response(
|
|
1648
|
+
content=combined_metrics,
|
|
1649
|
+
media_type='text/plain; version=0.0.4; charset=utf-8')
|
|
1650
|
+
|
|
1651
|
+
|
|
1557
1652
|
# === Internal APIs ===
|
|
1558
1653
|
@app.get('/api/completion/cluster_name')
|
|
1559
1654
|
async def complete_cluster_name(incomplete: str,) -> List[str]:
|
sky/setup_files/MANIFEST.in
CHANGED
sky/setup_files/dependencies.py
CHANGED
sky/skylet/constants.py
CHANGED
|
@@ -421,6 +421,9 @@ ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
|
|
421
421
|
# Environment variable that is set to 'true' if metrics are enabled.
|
|
422
422
|
ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
|
|
423
423
|
|
|
424
|
+
# If set, overrides the header that we can use to get the user name.
|
|
425
|
+
ENV_VAR_SERVER_AUTH_USER_HEADER = f'{SKYPILOT_ENV_VAR_PREFIX}AUTH_USER_HEADER'
|
|
426
|
+
|
|
424
427
|
# Environment variable that is used as the DB connection string for the
|
|
425
428
|
# skypilot server.
|
|
426
429
|
ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
|
sky/skypilot_config.py
CHANGED
|
@@ -63,6 +63,7 @@ from sqlalchemy import orm
|
|
|
63
63
|
from sqlalchemy.dialects import postgresql
|
|
64
64
|
from sqlalchemy.dialects import sqlite
|
|
65
65
|
from sqlalchemy.ext import declarative
|
|
66
|
+
from sqlalchemy.pool import NullPool
|
|
66
67
|
|
|
67
68
|
from sky import exceptions
|
|
68
69
|
from sky import sky_logging
|
|
@@ -116,9 +117,10 @@ ENV_VAR_PROJECT_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}PROJECT_CONFIG'
|
|
|
116
117
|
_GLOBAL_CONFIG_PATH = '~/.sky/config.yaml'
|
|
117
118
|
_PROJECT_CONFIG_PATH = '.sky.yaml'
|
|
118
119
|
|
|
119
|
-
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
120
120
|
API_SERVER_CONFIG_KEY = 'api_server_config'
|
|
121
121
|
|
|
122
|
+
_DB_USE_LOCK = threading.Lock()
|
|
123
|
+
|
|
122
124
|
Base = declarative.declarative_base()
|
|
123
125
|
|
|
124
126
|
config_yaml_table = sqlalchemy.Table(
|
|
@@ -129,44 +131,6 @@ config_yaml_table = sqlalchemy.Table(
|
|
|
129
131
|
)
|
|
130
132
|
|
|
131
133
|
|
|
132
|
-
def create_table():
|
|
133
|
-
# Create tables if they don't exist
|
|
134
|
-
Base.metadata.create_all(bind=_SQLALCHEMY_ENGINE)
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
|
|
138
|
-
assert _SQLALCHEMY_ENGINE is not None
|
|
139
|
-
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
140
|
-
row = session.query(config_yaml_table).filter_by(key=key).first()
|
|
141
|
-
if row:
|
|
142
|
-
db_config = config_utils.Config(yaml.safe_load(row.value))
|
|
143
|
-
db_config.pop_nested(('db',), None)
|
|
144
|
-
return db_config
|
|
145
|
-
return None
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def _set_config_yaml_to_db(key: str, config: config_utils.Config):
|
|
149
|
-
assert _SQLALCHEMY_ENGINE is not None
|
|
150
|
-
config.pop_nested(('db',), None)
|
|
151
|
-
config_str = common_utils.dump_yaml_str(dict(config))
|
|
152
|
-
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
153
|
-
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
154
|
-
db_utils.SQLAlchemyDialect.SQLITE.value):
|
|
155
|
-
insert_func = sqlite.insert
|
|
156
|
-
elif (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
157
|
-
db_utils.SQLAlchemyDialect.POSTGRESQL.value):
|
|
158
|
-
insert_func = postgresql.insert
|
|
159
|
-
else:
|
|
160
|
-
raise ValueError('Unsupported database dialect')
|
|
161
|
-
insert_stmnt = insert_func(config_yaml_table).values(key=key,
|
|
162
|
-
value=config_str)
|
|
163
|
-
do_update_stmt = insert_stmnt.on_conflict_do_update(
|
|
164
|
-
index_elements=[config_yaml_table.c.key],
|
|
165
|
-
set_={config_yaml_table.c.value: config_str})
|
|
166
|
-
session.execute(do_update_stmt)
|
|
167
|
-
session.commit()
|
|
168
|
-
|
|
169
|
-
|
|
170
134
|
class ConfigContext:
|
|
171
135
|
|
|
172
136
|
def __init__(self,
|
|
@@ -586,7 +550,6 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
|
|
|
586
550
|
|
|
587
551
|
|
|
588
552
|
def _reload_config_as_server() -> None:
|
|
589
|
-
global _SQLALCHEMY_ENGINE
|
|
590
553
|
# Reset the global variables, to avoid using stale values.
|
|
591
554
|
_set_loaded_config(config_utils.Config())
|
|
592
555
|
_set_loaded_config_path(None)
|
|
@@ -607,16 +570,33 @@ def _reload_config_as_server() -> None:
|
|
|
607
570
|
'if db config is specified, no other config is allowed')
|
|
608
571
|
|
|
609
572
|
if db_url:
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
573
|
+
with _DB_USE_LOCK:
|
|
574
|
+
sqlalchemy_engine = sqlalchemy.create_engine(db_url,
|
|
575
|
+
poolclass=NullPool)
|
|
576
|
+
Base.metadata.create_all(bind=sqlalchemy_engine)
|
|
577
|
+
|
|
578
|
+
def _get_config_yaml_from_db(
|
|
579
|
+
key: str) -> Optional[config_utils.Config]:
|
|
580
|
+
assert sqlalchemy_engine is not None
|
|
581
|
+
with orm.Session(sqlalchemy_engine) as session:
|
|
582
|
+
row = session.query(config_yaml_table).filter_by(
|
|
583
|
+
key=key).first()
|
|
584
|
+
if row:
|
|
585
|
+
db_config = config_utils.Config(yaml.safe_load(row.value))
|
|
586
|
+
db_config.pop_nested(('db',), None)
|
|
587
|
+
return db_config
|
|
588
|
+
return None
|
|
589
|
+
|
|
590
|
+
db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
|
|
591
|
+
if db_config:
|
|
592
|
+
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
593
|
+
logger.debug(
|
|
594
|
+
f'Config loaded from db:\n'
|
|
595
|
+
f'{common_utils.dump_yaml_str(dict(db_config))}')
|
|
596
|
+
server_config = overlay_skypilot_config(server_config,
|
|
597
|
+
db_config)
|
|
598
|
+
# Close the engine to avoid connection leaks
|
|
599
|
+
sqlalchemy_engine.dispose()
|
|
620
600
|
_set_loaded_config(server_config)
|
|
621
601
|
_set_loaded_config_path(server_config_path)
|
|
622
602
|
|
|
@@ -876,9 +856,38 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
|
876
856
|
new_db_url = config.get_nested(('db',), None)
|
|
877
857
|
if new_db_url and new_db_url != existing_db_url:
|
|
878
858
|
raise ValueError('Cannot change db url while server is running')
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
859
|
+
with _DB_USE_LOCK:
|
|
860
|
+
sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
|
|
861
|
+
poolclass=NullPool)
|
|
862
|
+
Base.metadata.create_all(bind=sqlalchemy_engine)
|
|
863
|
+
|
|
864
|
+
def _set_config_yaml_to_db(key: str,
|
|
865
|
+
config: config_utils.Config):
|
|
866
|
+
assert sqlalchemy_engine is not None
|
|
867
|
+
config.pop_nested(('db',), None)
|
|
868
|
+
config_str = common_utils.dump_yaml_str(dict(config))
|
|
869
|
+
with orm.Session(sqlalchemy_engine) as session:
|
|
870
|
+
if (sqlalchemy_engine.dialect.name ==
|
|
871
|
+
db_utils.SQLAlchemyDialect.SQLITE.value):
|
|
872
|
+
insert_func = sqlite.insert
|
|
873
|
+
elif (sqlalchemy_engine.dialect.name ==
|
|
874
|
+
db_utils.SQLAlchemyDialect.POSTGRESQL.value):
|
|
875
|
+
insert_func = postgresql.insert
|
|
876
|
+
else:
|
|
877
|
+
raise ValueError('Unsupported database dialect')
|
|
878
|
+
insert_stmnt = insert_func(config_yaml_table).values(
|
|
879
|
+
key=key, value=config_str)
|
|
880
|
+
do_update_stmt = insert_stmnt.on_conflict_do_update(
|
|
881
|
+
index_elements=[config_yaml_table.c.key],
|
|
882
|
+
set_={config_yaml_table.c.value: config_str})
|
|
883
|
+
session.execute(do_update_stmt)
|
|
884
|
+
session.commit()
|
|
885
|
+
|
|
886
|
+
logger.debug('saving api_server config to db')
|
|
887
|
+
_set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
|
|
888
|
+
db_updated = True
|
|
889
|
+
# Close the engine to avoid connection leaks
|
|
890
|
+
sqlalchemy_engine.dispose()
|
|
882
891
|
|
|
883
892
|
if not db_updated:
|
|
884
893
|
# save to the local file (PVC in Kubernetes, local file otherwise)
|
sky/task.py
CHANGED
|
@@ -1512,7 +1512,7 @@ class Task:
|
|
|
1512
1512
|
d[k] = v
|
|
1513
1513
|
return d
|
|
1514
1514
|
|
|
1515
|
-
def to_yaml_config(self, redact_secrets: bool =
|
|
1515
|
+
def to_yaml_config(self, redact_secrets: bool = False) -> Dict[str, Any]:
|
|
1516
1516
|
"""Returns a yaml-style dict representation of the task.
|
|
1517
1517
|
|
|
1518
1518
|
INTERNAL: this method is internal-facing.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Accelerator registry."""
|
|
2
2
|
import typing
|
|
3
|
-
from typing import Optional
|
|
3
|
+
from typing import List, Optional
|
|
4
4
|
|
|
5
5
|
from sky import catalog
|
|
6
6
|
from sky.utils import rich_utils
|
|
@@ -35,6 +35,7 @@ if typing.TYPE_CHECKING:
|
|
|
35
35
|
# Use a cached version of accelerators to cloud mapping, so that we don't have
|
|
36
36
|
# to download and read the catalog file for every cloud locally.
|
|
37
37
|
_accelerator_df = catalog.common.read_catalog('common/accelerators.csv')
|
|
38
|
+
_memory_df = catalog.common.read_catalog('common/metadata.csv')
|
|
38
39
|
|
|
39
40
|
# List of non-GPU accelerators that are supported by our backend for job queue
|
|
40
41
|
# scheduling.
|
|
@@ -45,6 +46,32 @@ _SCHEDULABLE_NON_GPU_ACCELERATORS = [
|
|
|
45
46
|
]
|
|
46
47
|
|
|
47
48
|
|
|
49
|
+
def get_devices_by_memory(memory: float,
|
|
50
|
+
plus: bool = False,
|
|
51
|
+
manufacturer: Optional[str] = None) -> List[str]:
|
|
52
|
+
"""Returns a list of device names that meet the memory and manufacturer
|
|
53
|
+
requirements.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
memory: The minimum memory size in GB.
|
|
57
|
+
plus: If True, returns devices with memory >= memory, otherwise returns
|
|
58
|
+
devices with memory == memory.
|
|
59
|
+
manufacturer: The manufacturer of the GPU.
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
# Filter by memory requirements
|
|
63
|
+
if plus:
|
|
64
|
+
df = _memory_df[_memory_df['MemoryGB'] >= memory]
|
|
65
|
+
else:
|
|
66
|
+
df = _memory_df[_memory_df['MemoryGB'] == memory]
|
|
67
|
+
|
|
68
|
+
# Filter by manufacturer if specified
|
|
69
|
+
if manufacturer is not None:
|
|
70
|
+
df = df[df['Manufacturer'].str.lower() == manufacturer.lower()]
|
|
71
|
+
|
|
72
|
+
return df['GPU'].tolist()
|
|
73
|
+
|
|
74
|
+
|
|
48
75
|
def is_schedulable_non_gpu_accelerator(accelerator_name: str) -> bool:
|
|
49
76
|
"""Returns if this accelerator is a 'schedulable' non-GPU accelerator."""
|
|
50
77
|
for name in _SCHEDULABLE_NON_GPU_ACCELERATORS:
|
sky/utils/dag_utils.py
CHANGED
|
@@ -147,11 +147,13 @@ def load_chain_dag_from_yaml_str(
|
|
|
147
147
|
return _load_chain_dag(configs, env_overrides, secrets_overrides)
|
|
148
148
|
|
|
149
149
|
|
|
150
|
-
def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
|
|
150
|
+
def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag,
|
|
151
|
+
redact_secrets: bool = False) -> str:
|
|
151
152
|
"""Dumps a chain DAG to a YAML string.
|
|
152
153
|
|
|
153
154
|
Args:
|
|
154
155
|
dag: the DAG to dump.
|
|
156
|
+
redact_secrets: whether to redact secrets in the YAML string.
|
|
155
157
|
|
|
156
158
|
Returns:
|
|
157
159
|
The YAML string.
|
|
@@ -159,7 +161,7 @@ def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
|
|
|
159
161
|
assert dag.is_chain(), dag
|
|
160
162
|
configs = [{'name': dag.name}]
|
|
161
163
|
for task in dag.tasks:
|
|
162
|
-
configs.append(task.to_yaml_config())
|
|
164
|
+
configs.append(task.to_yaml_config(redact_secrets=redact_secrets))
|
|
163
165
|
return common_utils.dump_yaml_str(configs)
|
|
164
166
|
|
|
165
167
|
|
sky/utils/schemas.py
CHANGED
sky/utils/ux_utils.py
CHANGED
|
@@ -12,6 +12,7 @@ import colorama
|
|
|
12
12
|
from sky import sky_logging
|
|
13
13
|
from sky.skylet import constants
|
|
14
14
|
from sky.utils import common_utils
|
|
15
|
+
from sky.utils import env_options
|
|
15
16
|
from sky.utils import rich_console_utils
|
|
16
17
|
|
|
17
18
|
if typing.TYPE_CHECKING:
|
|
@@ -57,10 +58,14 @@ def print_exception_no_traceback():
|
|
|
57
58
|
if error():
|
|
58
59
|
raise ValueError('...')
|
|
59
60
|
"""
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
61
|
+
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
62
|
+
# When SKYPILOT_DEBUG is set, show the full traceback
|
|
63
|
+
yield
|
|
64
|
+
else:
|
|
65
|
+
original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
|
|
66
|
+
sys.tracebacklimit = 0
|
|
67
|
+
yield
|
|
68
|
+
sys.tracebacklimit = original_tracelimit
|
|
64
69
|
|
|
65
70
|
|
|
66
71
|
@contextlib.contextmanager
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: skypilot-nightly
|
|
3
|
-
Version: 1.0.0.dev20250630
|
|
3
|
+
Version: 1.0.0.dev20250702
|
|
4
4
|
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
|
5
5
|
Author: SkyPilot Team
|
|
6
6
|
License: Apache 2.0
|
|
@@ -53,6 +53,7 @@ Requires-Dist: casbin
|
|
|
53
53
|
Requires-Dist: sqlalchemy_adapter
|
|
54
54
|
Requires-Dist: prometheus_client>=0.8.0
|
|
55
55
|
Requires-Dist: passlib
|
|
56
|
+
Requires-Dist: pyjwt
|
|
56
57
|
Provides-Extra: aws
|
|
57
58
|
Requires-Dist: awscli>=1.27.10; extra == "aws"
|
|
58
59
|
Requires-Dist: botocore>=1.29.10; extra == "aws"
|
|
@@ -126,6 +127,7 @@ Provides-Extra: server
|
|
|
126
127
|
Requires-Dist: casbin; extra == "server"
|
|
127
128
|
Requires-Dist: sqlalchemy_adapter; extra == "server"
|
|
128
129
|
Requires-Dist: passlib; extra == "server"
|
|
130
|
+
Requires-Dist: pyjwt; extra == "server"
|
|
129
131
|
Provides-Extra: all
|
|
130
132
|
Requires-Dist: awscli>=1.27.10; extra == "all"
|
|
131
133
|
Requires-Dist: botocore>=1.29.10; extra == "all"
|
|
@@ -178,6 +180,7 @@ Requires-Dist: colorama<0.4.5; extra == "all"
|
|
|
178
180
|
Requires-Dist: casbin; extra == "all"
|
|
179
181
|
Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
180
182
|
Requires-Dist: passlib; extra == "all"
|
|
183
|
+
Requires-Dist: pyjwt; extra == "all"
|
|
181
184
|
Dynamic: author
|
|
182
185
|
Dynamic: classifier
|
|
183
186
|
Dynamic: description
|