skypilot-nightly 1.0.0.dev20250630__py3-none-any.whl → 1.0.0.dev20250702__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (50)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +3 -3
  3. sky/catalog/__init__.py +1 -1
  4. sky/client/cli/command.py +60 -21
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/{NdypbqMxaYucRGfopkKXa → N5IdFnjR1RaPGBAVYeTIr}/_buildManifest.js +1 -1
  7. sky/dashboard/out/_next/static/chunks/9984.b56614f3c4c5961d.js +1 -0
  8. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1159f362b960e2b8.js +6 -0
  9. sky/dashboard/out/_next/static/chunks/{webpack-d427db53e54de9ce.js → webpack-9a81ea998672c303.js} +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs.html +1 -1
  19. sky/dashboard/out/users.html +1 -1
  20. sky/dashboard/out/volumes.html +1 -1
  21. sky/dashboard/out/workspace/new.html +1 -1
  22. sky/dashboard/out/workspaces/[name].html +1 -1
  23. sky/dashboard/out/workspaces.html +1 -1
  24. sky/jobs/controller.py +4 -0
  25. sky/jobs/server/core.py +5 -9
  26. sky/jobs/state.py +820 -670
  27. sky/jobs/utils.py +7 -15
  28. sky/metrics/utils.py +210 -0
  29. sky/optimizer.py +1 -1
  30. sky/resources.py +145 -7
  31. sky/server/common.py +1 -0
  32. sky/server/server.py +117 -22
  33. sky/setup_files/MANIFEST.in +1 -0
  34. sky/setup_files/dependencies.py +2 -0
  35. sky/skylet/constants.py +3 -0
  36. sky/skypilot_config.py +62 -53
  37. sky/task.py +1 -1
  38. sky/utils/accelerator_registry.py +28 -1
  39. sky/utils/dag_utils.py +4 -2
  40. sky/utils/schemas.py +3 -0
  41. sky/utils/ux_utils.py +9 -4
  42. {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/METADATA +4 -1
  43. {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/RECORD +48 -47
  44. sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +0 -1
  45. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b8e1114e6d38218c.js +0 -6
  46. sky/dashboard/out/_next/static/{NdypbqMxaYucRGfopkKXa → N5IdFnjR1RaPGBAVYeTIr}/_ssgManifest.js +0 -0
  47. {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/WHEEL +0 -0
  48. {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/entry_points.txt +0 -0
  49. {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/licenses/LICENSE +0 -0
  50. {skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -39,6 +39,7 @@ from sky import models
39
39
  from sky import sky_logging
40
40
  from sky.data import storage_utils
41
41
  from sky.jobs.server import server as jobs_rest
42
+ from sky.metrics import utils as metrics_utils
42
43
  from sky.provision.kubernetes import utils as kubernetes_utils
43
44
  from sky.serve.server import server as serve_rest
44
45
  from sky.server import common
@@ -218,14 +219,26 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
218
219
 
219
220
 
220
221
  def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
221
- if 'X-Auth-Request-Email' not in request.headers:
222
+ header_name = os.environ.get(constants.ENV_VAR_SERVER_AUTH_USER_HEADER,
223
+ 'X-Auth-Request-Email')
224
+ if header_name not in request.headers:
222
225
  return None
223
- user_name = request.headers['X-Auth-Request-Email']
226
+ user_name = request.headers[header_name]
224
227
  user_hash = hashlib.md5(
225
228
  user_name.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
226
229
  return models.User(id=user_hash, name=user_name)
227
230
 
228
231
 
232
+ class InitializeRequestAuthUserMiddleware(
233
+ starlette.middleware.base.BaseHTTPMiddleware):
234
+
235
+ async def dispatch(self, request: fastapi.Request, call_next):
236
+ # Make sure that request.state.auth_user is set. Otherwise, we may get a
237
+ # KeyError while trying to read it.
238
+ request.state.auth_user = None
239
+ return await call_next(request)
240
+
241
+
229
242
  class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
230
243
  """Middleware to handle HTTP Basic Auth."""
231
244
 
@@ -277,29 +290,51 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
277
290
  """Middleware to handle Bearer Token Auth (Service Accounts)."""
278
291
 
279
292
  async def dispatch(self, request: fastapi.Request, call_next):
280
- # Only process requests with Bearer token authorization header
293
+ """Make sure correct bearer token auth is present.
294
+
295
+ 1. If the request has the X-Skypilot-Auth-Mode: token header, it must
296
+ have a valid bearer token.
297
+ 2. For backwards compatibility, if the request has a Bearer token
298
+ beginning with "sky_" (even if X-Skypilot-Auth-Mode is not present),
299
+ it must be a valid token.
300
+ 3. If X-Skypilot-Auth-Mode is not set to "token", and there is no Bearer
301
+ token beginning with "sky_", allow the request to continue.
302
+
303
+ In conjunction with an auth proxy, the idea is to make the auth proxy
304
+ bypass requests with bearer tokens, instead setting the
305
+ X-Skypilot-Auth-Mode header. The auth proxy should either validate the
306
+ auth or set the header X-Skypilot-Auth-Mode: token.
307
+ """
308
+ has_skypilot_auth_header = (
309
+ request.headers.get('X-Skypilot-Auth-Mode') == 'token')
281
310
  auth_header = request.headers.get('authorization')
282
- if not auth_header or not auth_header.lower().startswith('bearer '):
311
+ has_bearer_token_starting_with_sky = (
312
+ auth_header and auth_header.lower().startswith('bearer ') and
313
+ auth_header.split(' ', 1)[1].startswith('sky_'))
314
+
315
+ if (not has_skypilot_auth_header and
316
+ not has_bearer_token_starting_with_sky):
317
+ # This is case #3 above. We do not need to validate the request.
283
318
  # No Bearer token, continue with normal processing (OAuth2 cookies,
284
319
  # etc.)
285
320
  return await call_next(request)
321
+ # After this point, all requests must be validated.
322
+
323
+ if auth_header is None:
324
+ return fastapi.responses.JSONResponse(
325
+ status_code=401, content={'detail': 'Authentication required'})
286
326
 
287
327
  # Extract token
288
- sa_token = auth_header.split(' ', 1)[1]
328
+ split_header = auth_header.split(' ', 1)
329
+ if split_header[0].lower() != 'bearer':
330
+ return fastapi.responses.JSONResponse(
331
+ status_code=401,
332
+ content={'detail': 'Invalid authentication method'})
333
+ sa_token = split_header[1]
289
334
 
290
335
  # Handle SkyPilot service account tokens
291
- if sa_token.startswith('sky_'):
292
- return await self._handle_service_account_token(
293
- request, sa_token, call_next)
294
-
295
- # Handle other Bearer tokens (OAuth2 access tokens, etc.)
296
- # These requests bypassed OAuth2 proxy, so let the application decide
297
- # how to handle them
298
- # For now, we'll let them continue through normal processing
299
- logger.debug(
300
- 'Non-SkyPilot Bearer token detected, continuing with normal '
301
- 'processing')
302
- return await call_next(request)
336
+ return await self._handle_service_account_token(request, sa_token,
337
+ call_next)
303
338
 
304
339
  async def _handle_service_account_token(self, request: fastapi.Request,
305
340
  sa_token: str, call_next):
@@ -384,6 +419,18 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
384
419
  async def dispatch(self, request: fastapi.Request, call_next):
385
420
  auth_user = _get_auth_user_header(request)
386
421
 
422
+ if request.state.auth_user is not None:
423
+ # Previous middleware is trusted more than this middleware. For
424
+ # instance, a client could set the Authorization and the
425
+ # X-Auth-Request-Email header. In that case, the auth proxy will be
426
+ # skipped and we should rely on the Bearer token to authenticate the
427
+ # user - but that means the user could set X-Auth-Request-Email to
428
+ # whatever the user wants. We should thus ignore it.
429
+ if auth_user is not None:
430
+ logger.debug('Warning: ignoring auth proxy header since the '
431
+ 'auth user was already set.')
432
+ return await call_next(request)
433
+
387
434
  # Add user to database if auth_user is present
388
435
  if auth_user is not None:
389
436
  newly_added = global_user_state.add_or_update_user(auth_user)
@@ -394,8 +441,6 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
394
441
  # Store user info in request.state for access by GET endpoints
395
442
  if auth_user is not None:
396
443
  request.state.auth_user = auth_user
397
- else:
398
- request.state.auth_user = None
399
444
 
400
445
  await _override_user_info_in_request_body(request, auth_user)
401
446
  return await call_next(request)
@@ -514,10 +559,17 @@ class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
514
559
 
515
560
 
516
561
  app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
562
+ # Middleware wraps in the order defined here. E.g., given
563
+ # app.add_middleware(Middleware1)
564
+ # app.add_middleware(Middleware2)
565
+ # app.add_middleware(Middleware3)
566
+ # The effect will be like:
567
+ # Middleware3(Middleware2(Middleware1(request)))
568
+ # If MiddlewareN does something like print(n); call_next(); print(n), you'll get
569
+ # 3; 2; 1; <request>; 1; 2; 3
517
570
  # Use environment variable to make the metrics middleware optional.
518
571
  if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
519
572
  app.add_middleware(metrics.PrometheusMiddleware)
520
- app.add_middleware(RBACMiddleware)
521
573
  app.add_middleware(InternalDashboardPrefixMiddleware)
522
574
  app.add_middleware(GracefulShutdownMiddleware)
523
575
  app.add_middleware(PathCleanMiddleware)
@@ -530,15 +582,26 @@ app.add_middleware(
530
582
  allow_credentials=True,
531
583
  allow_methods=['*'],
532
584
  allow_headers=['*'],
533
- # TODO(syang): remove X-Request-ID when v0.10.0 is released.
585
+ # TODO(syang): remove X-Request-ID \when v0.10.0 is released.
534
586
  expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
587
+ # The order of all the authentication-related middleware is important.
588
+ # RBACMiddleware must precede all the auth middleware, so it can access
589
+ # request.state.auth_user.
590
+ app.add_middleware(RBACMiddleware)
591
+ # AuthProxyMiddleware should precede BasicAuthMiddleware and
592
+ # BearerTokenMiddleware, since it should be skipped if either of those set the
593
+ # auth user.
594
+ app.add_middleware(AuthProxyMiddleware)
535
595
  enable_basic_auth = os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false')
536
596
  if str(enable_basic_auth).lower() == 'true':
537
597
  app.add_middleware(BasicAuthMiddleware)
538
598
  # Bearer token middleware should always be present to handle service account
539
599
  # authentication
540
600
  app.add_middleware(BearerTokenMiddleware)
541
- app.add_middleware(AuthProxyMiddleware)
601
+ # InitializeRequestAuthUserMiddleware must be the last added middleware so that
602
+ # request.state.auth_user is always set, but can be overridden by the auth
603
+ # middleware above.
604
+ app.add_middleware(InitializeRequestAuthUserMiddleware)
542
605
  app.add_middleware(RequestIDMiddleware)
543
606
  app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
544
607
  app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
@@ -1554,6 +1617,38 @@ async def all_contexts(request: fastapi.Request) -> None:
1554
1617
  )
1555
1618
 
1556
1619
 
1620
+ @app.get('/gpu-metrics')
1621
+ async def gpu_metrics() -> fastapi.Response:
1622
+ """Gets the GPU metrics from multiple external k8s clusters"""
1623
+ contexts = core.get_all_contexts()
1624
+ all_metrics = []
1625
+ successful_contexts = 0
1626
+
1627
+ tasks = [
1628
+ asyncio.create_task(metrics_utils.get_metrics_for_context(context))
1629
+ for context in contexts
1630
+ if context != 'in-cluster'
1631
+ ]
1632
+
1633
+ results = await asyncio.gather(*tasks, return_exceptions=True)
1634
+
1635
+ for i, result in enumerate(results):
1636
+ if isinstance(result, Exception):
1637
+ logger.error(
1638
+ f'Failed to get metrics for context {contexts[i]}: {result}')
1639
+ else:
1640
+ metrics_text = result
1641
+ all_metrics.append(metrics_text)
1642
+ successful_contexts += 1
1643
+
1644
+ combined_metrics = '\n\n'.join(all_metrics)
1645
+
1646
+ # Return as plain text for Prometheus compatibility
1647
+ return fastapi.Response(
1648
+ content=combined_metrics,
1649
+ media_type='text/plain; version=0.0.4; charset=utf-8')
1650
+
1651
+
1557
1652
  # === Internal APIs ===
1558
1653
  @app.get('/api/completion/cluster_name')
1559
1654
  async def complete_cluster_name(incomplete: str,) -> List[str]:
sky/setup_files/MANIFEST.in CHANGED
@@ -17,3 +17,4 @@ include sky/utils/kubernetes/*
17
17
  include sky/server/html/*
18
18
  recursive-include sky/dashboard/out *
19
19
  include sky/users/*.conf
20
+ include sky/metrics/*
sky/setup_files/dependencies.py CHANGED
@@ -65,12 +65,14 @@ install_requires = [
65
65
  # Required for API server metrics
66
66
  'prometheus_client>=0.8.0',
67
67
  'passlib',
68
+ 'pyjwt',
68
69
  ]
69
70
 
70
71
  server_dependencies = [
71
72
  'casbin',
72
73
  'sqlalchemy_adapter',
73
74
  'passlib',
75
+ 'pyjwt',
74
76
  ]
75
77
 
76
78
  local_ray = [
sky/skylet/constants.py CHANGED
@@ -421,6 +421,9 @@ ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
421
421
  # Environment variable that is set to 'true' if metrics are enabled.
422
422
  ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
423
423
 
424
+ # If set, overrides the header that we can use to get the user name.
425
+ ENV_VAR_SERVER_AUTH_USER_HEADER = f'{SKYPILOT_ENV_VAR_PREFIX}AUTH_USER_HEADER'
426
+
424
427
  # Environment variable that is used as the DB connection string for the
425
428
  # skypilot server.
426
429
  ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
sky/skypilot_config.py CHANGED
@@ -63,6 +63,7 @@ from sqlalchemy import orm
63
63
  from sqlalchemy.dialects import postgresql
64
64
  from sqlalchemy.dialects import sqlite
65
65
  from sqlalchemy.ext import declarative
66
+ from sqlalchemy.pool import NullPool
66
67
 
67
68
  from sky import exceptions
68
69
  from sky import sky_logging
@@ -116,9 +117,10 @@ ENV_VAR_PROJECT_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}PROJECT_CONFIG'
116
117
  _GLOBAL_CONFIG_PATH = '~/.sky/config.yaml'
117
118
  _PROJECT_CONFIG_PATH = '.sky.yaml'
118
119
 
119
- _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
120
120
  API_SERVER_CONFIG_KEY = 'api_server_config'
121
121
 
122
+ _DB_USE_LOCK = threading.Lock()
123
+
122
124
  Base = declarative.declarative_base()
123
125
 
124
126
  config_yaml_table = sqlalchemy.Table(
@@ -129,44 +131,6 @@ config_yaml_table = sqlalchemy.Table(
129
131
  )
130
132
 
131
133
 
132
- def create_table():
133
- # Create tables if they don't exist
134
- Base.metadata.create_all(bind=_SQLALCHEMY_ENGINE)
135
-
136
-
137
- def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
138
- assert _SQLALCHEMY_ENGINE is not None
139
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
140
- row = session.query(config_yaml_table).filter_by(key=key).first()
141
- if row:
142
- db_config = config_utils.Config(yaml.safe_load(row.value))
143
- db_config.pop_nested(('db',), None)
144
- return db_config
145
- return None
146
-
147
-
148
- def _set_config_yaml_to_db(key: str, config: config_utils.Config):
149
- assert _SQLALCHEMY_ENGINE is not None
150
- config.pop_nested(('db',), None)
151
- config_str = common_utils.dump_yaml_str(dict(config))
152
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
153
- if (_SQLALCHEMY_ENGINE.dialect.name ==
154
- db_utils.SQLAlchemyDialect.SQLITE.value):
155
- insert_func = sqlite.insert
156
- elif (_SQLALCHEMY_ENGINE.dialect.name ==
157
- db_utils.SQLAlchemyDialect.POSTGRESQL.value):
158
- insert_func = postgresql.insert
159
- else:
160
- raise ValueError('Unsupported database dialect')
161
- insert_stmnt = insert_func(config_yaml_table).values(key=key,
162
- value=config_str)
163
- do_update_stmt = insert_stmnt.on_conflict_do_update(
164
- index_elements=[config_yaml_table.c.key],
165
- set_={config_yaml_table.c.value: config_str})
166
- session.execute(do_update_stmt)
167
- session.commit()
168
-
169
-
170
134
  class ConfigContext:
171
135
 
172
136
  def __init__(self,
@@ -586,7 +550,6 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
586
550
 
587
551
 
588
552
  def _reload_config_as_server() -> None:
589
- global _SQLALCHEMY_ENGINE
590
553
  # Reset the global variables, to avoid using stale values.
591
554
  _set_loaded_config(config_utils.Config())
592
555
  _set_loaded_config_path(None)
@@ -607,16 +570,33 @@ def _reload_config_as_server() -> None:
607
570
  'if db config is specified, no other config is allowed')
608
571
 
609
572
  if db_url:
610
- if _SQLALCHEMY_ENGINE is None:
611
- _SQLALCHEMY_ENGINE = sqlalchemy.create_engine(db_url)
612
- create_table()
613
- db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
614
- if db_config:
615
- if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
616
- logger.debug(f'Config loaded from db:\n'
617
- f'{common_utils.dump_yaml_str(dict(db_config))}')
618
- server_config = overlay_skypilot_config(server_config, db_config)
619
-
573
+ with _DB_USE_LOCK:
574
+ sqlalchemy_engine = sqlalchemy.create_engine(db_url,
575
+ poolclass=NullPool)
576
+ Base.metadata.create_all(bind=sqlalchemy_engine)
577
+
578
+ def _get_config_yaml_from_db(
579
+ key: str) -> Optional[config_utils.Config]:
580
+ assert sqlalchemy_engine is not None
581
+ with orm.Session(sqlalchemy_engine) as session:
582
+ row = session.query(config_yaml_table).filter_by(
583
+ key=key).first()
584
+ if row:
585
+ db_config = config_utils.Config(yaml.safe_load(row.value))
586
+ db_config.pop_nested(('db',), None)
587
+ return db_config
588
+ return None
589
+
590
+ db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
591
+ if db_config:
592
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
593
+ logger.debug(
594
+ f'Config loaded from db:\n'
595
+ f'{common_utils.dump_yaml_str(dict(db_config))}')
596
+ server_config = overlay_skypilot_config(server_config,
597
+ db_config)
598
+ # Close the engine to avoid connection leaks
599
+ sqlalchemy_engine.dispose()
620
600
  _set_loaded_config(server_config)
621
601
  _set_loaded_config_path(server_config_path)
622
602
 
@@ -876,9 +856,38 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
876
856
  new_db_url = config.get_nested(('db',), None)
877
857
  if new_db_url and new_db_url != existing_db_url:
878
858
  raise ValueError('Cannot change db url while server is running')
879
- logger.debug('saving api_server config to db')
880
- _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
881
- db_updated = True
859
+ with _DB_USE_LOCK:
860
+ sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
861
+ poolclass=NullPool)
862
+ Base.metadata.create_all(bind=sqlalchemy_engine)
863
+
864
+ def _set_config_yaml_to_db(key: str,
865
+ config: config_utils.Config):
866
+ assert sqlalchemy_engine is not None
867
+ config.pop_nested(('db',), None)
868
+ config_str = common_utils.dump_yaml_str(dict(config))
869
+ with orm.Session(sqlalchemy_engine) as session:
870
+ if (sqlalchemy_engine.dialect.name ==
871
+ db_utils.SQLAlchemyDialect.SQLITE.value):
872
+ insert_func = sqlite.insert
873
+ elif (sqlalchemy_engine.dialect.name ==
874
+ db_utils.SQLAlchemyDialect.POSTGRESQL.value):
875
+ insert_func = postgresql.insert
876
+ else:
877
+ raise ValueError('Unsupported database dialect')
878
+ insert_stmnt = insert_func(config_yaml_table).values(
879
+ key=key, value=config_str)
880
+ do_update_stmt = insert_stmnt.on_conflict_do_update(
881
+ index_elements=[config_yaml_table.c.key],
882
+ set_={config_yaml_table.c.value: config_str})
883
+ session.execute(do_update_stmt)
884
+ session.commit()
885
+
886
+ logger.debug('saving api_server config to db')
887
+ _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
888
+ db_updated = True
889
+ # Close the engine to avoid connection leaks
890
+ sqlalchemy_engine.dispose()
882
891
 
883
892
  if not db_updated:
884
893
  # save to the local file (PVC in Kubernetes, local file otherwise)
sky/task.py CHANGED
@@ -1512,7 +1512,7 @@ class Task:
1512
1512
  d[k] = v
1513
1513
  return d
1514
1514
 
1515
- def to_yaml_config(self, redact_secrets: bool = True) -> Dict[str, Any]:
1515
+ def to_yaml_config(self, redact_secrets: bool = False) -> Dict[str, Any]:
1516
1516
  """Returns a yaml-style dict representation of the task.
1517
1517
 
1518
1518
  INTERNAL: this method is internal-facing.
sky/utils/accelerator_registry.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """Accelerator registry."""
2
2
  import typing
3
- from typing import Optional
3
+ from typing import List, Optional
4
4
 
5
5
  from sky import catalog
6
6
  from sky.utils import rich_utils
@@ -35,6 +35,7 @@ if typing.TYPE_CHECKING:
35
35
  # Use a cached version of accelerators to cloud mapping, so that we don't have
36
36
  # to download and read the catalog file for every cloud locally.
37
37
  _accelerator_df = catalog.common.read_catalog('common/accelerators.csv')
38
+ _memory_df = catalog.common.read_catalog('common/metadata.csv')
38
39
 
39
40
  # List of non-GPU accelerators that are supported by our backend for job queue
40
41
  # scheduling.
@@ -45,6 +46,32 @@ _SCHEDULABLE_NON_GPU_ACCELERATORS = [
45
46
  ]
46
47
 
47
48
 
49
+ def get_devices_by_memory(memory: float,
50
+ plus: bool = False,
51
+ manufacturer: Optional[str] = None) -> List[str]:
52
+ """Returns a list of device names that meet the memory and manufacturer
53
+ requirements.
54
+
55
+ Args:
56
+ memory: The minimum memory size in GB.
57
+ plus: If True, returns devices with memory >= memory, otherwise returns
58
+ devices with memory == memory.
59
+ manufacturer: The manufacturer of the GPU.
60
+ """
61
+
62
+ # Filter by memory requirements
63
+ if plus:
64
+ df = _memory_df[_memory_df['MemoryGB'] >= memory]
65
+ else:
66
+ df = _memory_df[_memory_df['MemoryGB'] == memory]
67
+
68
+ # Filter by manufacturer if specified
69
+ if manufacturer is not None:
70
+ df = df[df['Manufacturer'].str.lower() == manufacturer.lower()]
71
+
72
+ return df['GPU'].tolist()
73
+
74
+
48
75
  def is_schedulable_non_gpu_accelerator(accelerator_name: str) -> bool:
49
76
  """Returns if this accelerator is a 'schedulable' non-GPU accelerator."""
50
77
  for name in _SCHEDULABLE_NON_GPU_ACCELERATORS:
sky/utils/dag_utils.py CHANGED
@@ -147,11 +147,13 @@ def load_chain_dag_from_yaml_str(
147
147
  return _load_chain_dag(configs, env_overrides, secrets_overrides)
148
148
 
149
149
 
150
- def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
150
+ def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag,
151
+ redact_secrets: bool = False) -> str:
151
152
  """Dumps a chain DAG to a YAML string.
152
153
 
153
154
  Args:
154
155
  dag: the DAG to dump.
156
+ redact_secrets: whether to redact secrets in the YAML string.
155
157
 
156
158
  Returns:
157
159
  The YAML string.
@@ -159,7 +161,7 @@ def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
159
161
  assert dag.is_chain(), dag
160
162
  configs = [{'name': dag.name}]
161
163
  for task in dag.tasks:
162
- configs.append(task.to_yaml_config())
164
+ configs.append(task.to_yaml_config(redact_secrets=redact_secrets))
163
165
  return common_utils.dump_yaml_str(configs)
164
166
 
165
167
 
sky/utils/schemas.py CHANGED
@@ -311,6 +311,9 @@ def _get_single_resources_schema():
311
311
  }
312
312
  }
313
313
  },
314
+ '_no_missing_accel_warnings': {
315
+ 'type': 'boolean'
316
+ },
314
317
  'image_id': {
315
318
  'anyOf': [{
316
319
  'type': 'string',
sky/utils/ux_utils.py CHANGED
@@ -12,6 +12,7 @@ import colorama
12
12
  from sky import sky_logging
13
13
  from sky.skylet import constants
14
14
  from sky.utils import common_utils
15
+ from sky.utils import env_options
15
16
  from sky.utils import rich_console_utils
16
17
 
17
18
  if typing.TYPE_CHECKING:
@@ -57,10 +58,14 @@ def print_exception_no_traceback():
57
58
  if error():
58
59
  raise ValueError('...')
59
60
  """
60
- original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
61
- sys.tracebacklimit = 0
62
- yield
63
- sys.tracebacklimit = original_tracelimit
61
+ if env_options.Options.SHOW_DEBUG_INFO.get():
62
+ # When SKYPILOT_DEBUG is set, show the full traceback
63
+ yield
64
+ else:
65
+ original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
66
+ sys.tracebacklimit = 0
67
+ yield
68
+ sys.tracebacklimit = original_tracelimit
64
69
 
65
70
 
66
71
  @contextlib.contextmanager
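{skypilot_nightly-1.0.0.dev20250630.dist-info → skypilot_nightly-1.0.0.dev20250702.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@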
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250630
3
+ Version: 1.0.0.dev20250702
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -53,6 +53,7 @@ Requires-Dist: casbin
53
53
  Requires-Dist: sqlalchemy_adapter
54
54
  Requires-Dist: prometheus_client>=0.8.0
55
55
  Requires-Dist: passlib
56
+ Requires-Dist: pyjwt
56
57
  Provides-Extra: aws
57
58
  Requires-Dist: awscli>=1.27.10; extra == "aws"
58
59
  Requires-Dist: botocore>=1.29.10; extra == "aws"
@@ -126,6 +127,7 @@ Provides-Extra: server
126
127
  Requires-Dist: casbin; extra == "server"
127
128
  Requires-Dist: sqlalchemy_adapter; extra == "server"
128
129
  Requires-Dist: passlib; extra == "server"
130
+ Requires-Dist: pyjwt; extra == "server"
129
131
  Provides-Extra: all
130
132
  Requires-Dist: awscli>=1.27.10; extra == "all"
131
133
  Requires-Dist: botocore>=1.29.10; extra == "all"
@@ -178,6 +180,7 @@ Requires-Dist: colorama<0.4.5; extra == "all"
178
180
  Requires-Dist: casbin; extra == "all"
179
181
  Requires-Dist: sqlalchemy_adapter; extra == "all"
180
182
  Requires-Dist: passlib; extra == "all"
183
+ Requires-Dist: pyjwt; extra == "all"
181
184
  Dynamic: author
182
185
  Dynamic: classifier
183
186
  Dynamic: description