skypilot-nightly 1.0.0.dev20250710__py3-none-any.whl → 1.0.0.dev20250711__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/clouds/kubernetes.py +137 -23
- sky/core.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1871-3a0f047988be65cd.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-fd62f17bd9ce1fcc.js → webpack-60070a62f55486a6.js} +1 -1
- sky/dashboard/out/_next/static/css/6cbd41a88d2e9e1c.css +3 -0
- sky/dashboard/out/_next/static/{P2Di1JdUlHuKN2lBws4Mr → ldZFQWCiYX_vZnIfB_o8S}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +10 -11
- sky/jobs/state.py +10 -11
- sky/jobs/utils.py +11 -3
- sky/provision/kubernetes/utils.py +132 -0
- sky/skypilot_config.py +4 -1
- sky/templates/kubernetes-ray.yml.j2 +298 -10
- sky/users/permission.py +15 -1
- sky/users/token_service.py +25 -3
- sky/utils/schemas.py +3 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/RECORD +41 -41
- sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +0 -6
- sky/dashboard/out/_next/static/css/0da6afe66176678a.css +0 -3
- /sky/dashboard/out/_next/static/chunks/pages/{_app-a37b06ddb64521fd.js → _app-e6e82dc8abb50c4f.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-1159f362b960e2b8.js → [cluster]-0fbfb1dd0b08c90c.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{clusters-9744c271a1642f76.js → clusters-102d169e87913ba1.js} +0 -0
- /sky/dashboard/out/_next/static/{P2Di1JdUlHuKN2lBws4Mr → ldZFQWCiYX_vZnIfB_o8S}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250710.dist-info → skypilot_nightly-1.0.0.dev20250711.dist-info}/top_level.txt +0 -0
sky/dashboard/out/infra.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60070a62f55486a6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6e82dc8abb50c4f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/infra","query":{},"buildId":"ldZFQWCiYX_vZnIfB_o8S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/jobs/[job].html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60070a62f55486a6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6e82dc8abb50c4f.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-13bb52ce3cffa4e3.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs/%5Bjob%5D-c4d5cfac7fbc0668.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_ssgManifest.js" 
defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs/[job]","query":{},"buildId":"ldZFQWCiYX_vZnIfB_o8S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/jobs.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60070a62f55486a6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6e82dc8abb50c4f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"ldZFQWCiYX_vZnIfB_o8S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/users.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60070a62f55486a6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6e82dc8abb50c4f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-cd43fb3c122eedde.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"ldZFQWCiYX_vZnIfB_o8S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/volumes.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60070a62f55486a6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6e82dc8abb50c4f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-4ebf6484f7216387.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"ldZFQWCiYX_vZnIfB_o8S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/workspace/new.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60070a62f55486a6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6e82dc8abb50c4f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"ldZFQWCiYX_vZnIfB_o8S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/workspaces/[name].html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60070a62f55486a6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6e82dc8abb50c4f.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-13bb52ce3cffa4e3.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"ldZFQWCiYX_vZnIfB_o8S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/workspaces.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6cbd41a88d2e9e1c.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60070a62f55486a6.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6e82dc8abb50c4f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ldZFQWCiYX_vZnIfB_o8S/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"ldZFQWCiYX_vZnIfB_o8S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/global_user_state.py
CHANGED
@@ -220,17 +220,16 @@ def _glob_to_similar(glob_pattern):
|
|
220
220
|
return like_pattern
|
221
221
|
|
222
222
|
|
223
|
-
def create_table():
|
223
|
+
def create_table(engine: sqlalchemy.engine.Engine):
|
224
224
|
# Enable WAL mode to avoid locking issues.
|
225
225
|
# See: issue #1441 and PR #1509
|
226
226
|
# https://github.com/microsoft/WSL/issues/2395
|
227
227
|
# TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
|
228
228
|
# This may cause the database locked problem from WSL issue #1441.
|
229
|
-
if (
|
230
|
-
== db_utils.SQLAlchemyDialect.SQLITE.value and
|
229
|
+
if (engine.dialect.name == db_utils.SQLAlchemyDialect.SQLITE.value and
|
231
230
|
not common_utils.is_wsl()):
|
232
231
|
try:
|
233
|
-
with orm.Session(
|
232
|
+
with orm.Session(engine) as session:
|
234
233
|
session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
|
235
234
|
session.commit()
|
236
235
|
except sqlalchemy_exc.OperationalError as e:
|
@@ -240,12 +239,12 @@ def create_table():
|
|
240
239
|
# is not critical and is likely to be enabled by other processes.
|
241
240
|
|
242
241
|
# Create tables if they don't exist
|
243
|
-
db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
|
242
|
+
db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
|
244
243
|
|
245
244
|
# For backward compatibility.
|
246
245
|
# TODO(zhwu): Remove this function after all users have migrated to
|
247
246
|
# the latest version of SkyPilot.
|
248
|
-
with orm.Session(
|
247
|
+
with orm.Session(engine) as session:
|
249
248
|
# Add autostop column to clusters table
|
250
249
|
db_utils.add_column_to_table_sqlalchemy(session,
|
251
250
|
'clusters',
|
@@ -391,15 +390,15 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
391
390
|
conn_string = skypilot_config.get_nested(('db',), None)
|
392
391
|
if conn_string:
|
393
392
|
logger.debug(f'using db URI from {conn_string}')
|
394
|
-
|
395
|
-
|
393
|
+
engine = sqlalchemy.create_engine(conn_string,
|
394
|
+
poolclass=sqlalchemy.NullPool)
|
396
395
|
else:
|
397
396
|
db_path = os.path.expanduser('~/.sky/state.db')
|
398
397
|
pathlib.Path(db_path).parents[0].mkdir(parents=True,
|
399
398
|
exist_ok=True)
|
400
|
-
|
401
|
-
|
402
|
-
|
399
|
+
engine = sqlalchemy.create_engine('sqlite:///' + db_path)
|
400
|
+
create_table(engine)
|
401
|
+
_SQLALCHEMY_ENGINE = engine
|
403
402
|
return _SQLALCHEMY_ENGINE
|
404
403
|
|
405
404
|
|
sky/jobs/state.py
CHANGED
@@ -112,17 +112,16 @@ ha_recovery_script_table = sqlalchemy.Table(
|
|
112
112
|
)
|
113
113
|
|
114
114
|
|
115
|
-
def create_table():
|
115
|
+
def create_table(engine: sqlalchemy.engine.Engine):
|
116
116
|
# Enable WAL mode to avoid locking issues.
|
117
117
|
# See: issue #3863, #1441 and PR #1509
|
118
118
|
# https://github.com/microsoft/WSL/issues/2395
|
119
119
|
# TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
|
120
120
|
# This may cause the database locked problem from WSL issue #1441.
|
121
|
-
if (
|
122
|
-
== db_utils.SQLAlchemyDialect.SQLITE.value and
|
121
|
+
if (engine.dialect.name == db_utils.SQLAlchemyDialect.SQLITE.value and
|
123
122
|
not common_utils.is_wsl()):
|
124
123
|
try:
|
125
|
-
with orm.Session(
|
124
|
+
with orm.Session(engine) as session:
|
126
125
|
session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
|
127
126
|
session.commit()
|
128
127
|
except sqlalchemy_exc.OperationalError as e:
|
@@ -132,10 +131,10 @@ def create_table():
|
|
132
131
|
# is not critical and is likely to be enabled by other processes.
|
133
132
|
|
134
133
|
# Create tables if they don't exist
|
135
|
-
db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
|
134
|
+
db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
|
136
135
|
|
137
136
|
# Backward compatibility: add columns that not exist in older databases
|
138
|
-
with orm.Session(
|
137
|
+
with orm.Session(engine) as session:
|
139
138
|
db_utils.add_column_to_table_sqlalchemy(session, 'spot',
|
140
139
|
'failure_reason',
|
141
140
|
sqlalchemy.Text())
|
@@ -228,15 +227,15 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
228
227
|
conn_string = skypilot_config.get_nested(('db',), None)
|
229
228
|
if conn_string:
|
230
229
|
logger.debug(f'using db URI from {conn_string}')
|
231
|
-
|
232
|
-
|
230
|
+
engine = sqlalchemy.create_engine(conn_string,
|
231
|
+
poolclass=sqlalchemy.NullPool)
|
233
232
|
else:
|
234
233
|
db_path = os.path.expanduser('~/.sky/spot_jobs.db')
|
235
234
|
pathlib.Path(db_path).parents[0].mkdir(parents=True,
|
236
235
|
exist_ok=True)
|
237
|
-
|
238
|
-
|
239
|
-
|
236
|
+
engine = sqlalchemy.create_engine('sqlite:///' + db_path)
|
237
|
+
create_table(engine)
|
238
|
+
_SQLALCHEMY_ENGINE = engine
|
240
239
|
return _SQLALCHEMY_ENGINE
|
241
240
|
|
242
241
|
|
sky/jobs/utils.py
CHANGED
@@ -30,6 +30,7 @@ from sky.backends import backend_utils
|
|
30
30
|
from sky.jobs import constants as managed_job_constants
|
31
31
|
from sky.jobs import scheduler
|
32
32
|
from sky.jobs import state as managed_job_state
|
33
|
+
from sky.server import common as server_common
|
33
34
|
from sky.skylet import constants
|
34
35
|
from sky.skylet import job_lib
|
35
36
|
from sky.skylet import log_lib
|
@@ -38,6 +39,7 @@ from sky.utils import annotations
|
|
38
39
|
from sky.utils import command_runner
|
39
40
|
from sky.utils import common_utils
|
40
41
|
from sky.utils import controller_utils
|
42
|
+
from sky.utils import env_options
|
41
43
|
from sky.utils import infra_utils
|
42
44
|
from sky.utils import log_utils
|
43
45
|
from sky.utils import message_utils
|
@@ -128,9 +130,15 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
|
|
128
130
|
time.sleep(backoff.current_backoff())
|
129
131
|
|
130
132
|
|
131
|
-
def _check_consolidation_mode_consistency(
|
133
|
+
def _validate_consolidation_mode_config(
|
132
134
|
current_is_consolidation_mode: bool) -> None:
|
133
|
-
"""
|
135
|
+
"""Validate the consolidation mode config."""
|
136
|
+
if (current_is_consolidation_mode and
|
137
|
+
not env_options.Options.IS_DEVELOPER.get() and
|
138
|
+
server_common.is_api_server_local()):
|
139
|
+
with ux_utils.print_exception_no_traceback():
|
140
|
+
raise exceptions.NotSupportedError(
|
141
|
+
'Consolidation mode is not supported when running locally.')
|
134
142
|
# Check whether the consolidation mode config is changed.
|
135
143
|
if current_is_consolidation_mode:
|
136
144
|
controller_cn = (
|
@@ -176,7 +184,7 @@ def _check_consolidation_mode_consistency(
|
|
176
184
|
def is_consolidation_mode() -> bool:
|
177
185
|
consolidation_mode = skypilot_config.get_nested(
|
178
186
|
('jobs', 'controller', 'consolidation_mode'), default_value=False)
|
179
|
-
|
187
|
+
_validate_consolidation_mode_config(consolidation_mode)
|
180
188
|
return consolidation_mode
|
181
189
|
|
182
190
|
|
sky/provision/kubernetes/utils.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Kubernetes utilities for SkyPilot."""
|
2
2
|
import dataclasses
|
3
|
+
import enum
|
3
4
|
import functools
|
4
5
|
import hashlib
|
5
6
|
import json
|
@@ -57,6 +58,69 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
|
|
57
58
|
# and store all data that needs to be persisted in future.
|
58
59
|
HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
|
59
60
|
|
61
|
+
|
62
|
+
class KubernetesHighPerformanceNetworkType(enum.Enum):
|
63
|
+
"""Enum for different Kubernetes cluster types with high performance
|
64
|
+
network configurations.
|
65
|
+
|
66
|
+
This enum defines cluster types that support optimized networking for
|
67
|
+
distributed ML workloads:
|
68
|
+
- GCP_TCPX: GKE clusters with GPUDirect-TCPX support
|
69
|
+
(A3 High instances: a3-highgpu-8g)
|
70
|
+
- GCP_TCPXO: GKE clusters with GPUDirect-TCPXO support
|
71
|
+
(A3 Mega instances: a3-megagpu-8g)
|
72
|
+
- GCP_GPUDIRECT_RDMA: GKE clusters with GPUDirect-RDMA support
|
73
|
+
(A4/A3 Ultra instances)
|
74
|
+
- NEBIUS: Nebius clusters with InfiniBand support for high-throughput,
|
75
|
+
low-latency networking
|
76
|
+
- NONE: Standard clusters without specialized networking optimizations
|
77
|
+
|
78
|
+
The network configurations align with corresponding VM-based
|
79
|
+
implementations:
|
80
|
+
- GCP settings match
|
81
|
+
sky.provision.gcp.constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
|
82
|
+
- Nebius settings match the InfiniBand configuration used in Nebius VMs
|
83
|
+
"""
|
84
|
+
|
85
|
+
GCP_TCPX = 'gcp_tcpx'
|
86
|
+
GCP_TCPXO = 'gcp_tcpxo'
|
87
|
+
GCP_GPUDIRECT_RDMA = 'gcp_gpudirect_rdma'
|
88
|
+
NEBIUS = 'nebius'
|
89
|
+
NONE = 'none'
|
90
|
+
|
91
|
+
def get_network_env_vars(self) -> Dict[str, str]:
|
92
|
+
"""Get network environment variables for this cluster type."""
|
93
|
+
if self == KubernetesHighPerformanceNetworkType.NEBIUS:
|
94
|
+
# Nebius cluster with InfiniBand - use InfiniBand optimizations
|
95
|
+
return {
|
96
|
+
'NCCL_IB_HCA': 'mlx5',
|
97
|
+
'UCX_NET_DEVICES': ('mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,'
|
98
|
+
'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
|
99
|
+
}
|
100
|
+
else:
|
101
|
+
# GCP clusters and generic clusters - environment variables are
|
102
|
+
# handled directly in the template
|
103
|
+
return {}
|
104
|
+
|
105
|
+
def supports_high_performance_networking(self) -> bool:
|
106
|
+
"""Check if this cluster type supports high performance networking."""
|
107
|
+
return self is not KubernetesHighPerformanceNetworkType.NONE
|
108
|
+
|
109
|
+
def supports_gpu_direct(self) -> bool:
|
110
|
+
"""Check if this cluster type supports GPUDirect networking."""
|
111
|
+
return self in (KubernetesHighPerformanceNetworkType.GCP_TCPX,
|
112
|
+
KubernetesHighPerformanceNetworkType.GCP_TCPXO,
|
113
|
+
KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
|
114
|
+
|
115
|
+
def requires_ipc_lock_capability(self) -> bool:
|
116
|
+
"""Check if this cluster type requires IPC_LOCK capability."""
|
117
|
+
return self.supports_high_performance_networking()
|
118
|
+
|
119
|
+
def requires_tcpxo_daemon(self) -> bool:
|
120
|
+
"""Check if this cluster type requires TCPXO daemon."""
|
121
|
+
return self == KubernetesHighPerformanceNetworkType.GCP_TCPXO
|
122
|
+
|
123
|
+
|
60
124
|
# TODO(romilb): Move constants to constants.py
|
61
125
|
DEFAULT_NAMESPACE = 'default'
|
62
126
|
|
@@ -758,6 +822,74 @@ class GKEAutoscaler(Autoscaler):
|
|
758
822
|
return True
|
759
823
|
return False
|
760
824
|
|
825
|
+
@classmethod
|
826
|
+
@annotations.lru_cache(scope='request', maxsize=10)
|
827
|
+
def get_available_machine_types(cls, context: str) -> List[str]:
|
828
|
+
"""Returns the list of machine types that are available in the cluster.
|
829
|
+
"""
|
830
|
+
# Assume context naming convention of
|
831
|
+
# gke_PROJECT-ID_LOCATION_CLUSTER-NAME
|
832
|
+
valid, project_id, location, cluster_name = cls._validate_context_name(
|
833
|
+
context)
|
834
|
+
if not valid:
|
835
|
+
# Context name is not in the format of
|
836
|
+
# gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
|
837
|
+
# Cannot determine if the context can autoscale.
|
838
|
+
# Return empty list.
|
839
|
+
logger.debug(f'Context {context} is not in the format of '
|
840
|
+
f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
|
841
|
+
'Returning empty machine type list.')
|
842
|
+
return []
|
843
|
+
try:
|
844
|
+
logger.debug(
|
845
|
+
f'Attempting to get information about cluster {cluster_name}')
|
846
|
+
container_service = gcp.build('container',
|
847
|
+
'v1',
|
848
|
+
credentials=None,
|
849
|
+
cache_discovery=False)
|
850
|
+
cluster = container_service.projects().locations().clusters().get(
|
851
|
+
name=f'projects/{project_id}'
|
852
|
+
f'/locations/{location}'
|
853
|
+
f'/clusters/{cluster_name}').execute()
|
854
|
+
except ImportError:
|
855
|
+
# If the gcp module is not installed, return empty list.
|
856
|
+
# Remind the user once per day to install the gcp module for better
|
857
|
+
# pod scheduling with GKE autoscaler.
|
858
|
+
if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
|
859
|
+
logger.info(
|
860
|
+
'Could not fetch autoscaler information from GKE. '
|
861
|
+
'Run pip install "skypilot[gcp]" for more intelligent pod '
|
862
|
+
'scheduling with GKE autoscaler.')
|
863
|
+
cls._pip_install_gcp_hint_last_sent = time.time()
|
864
|
+
return []
|
865
|
+
except gcp.http_error_exception() as e:
|
866
|
+
# Cluster information is not available.
|
867
|
+
# Return empty list.
|
868
|
+
logger.debug(f'{e.message}', exc_info=True)
|
869
|
+
return []
|
870
|
+
|
871
|
+
machine_types = []
|
872
|
+
# Get the list of machine types that are available in the cluster.
|
873
|
+
node_pools = cluster.get('nodePools', [])
|
874
|
+
for node_pool in node_pools:
|
875
|
+
name = node_pool.get('name', '')
|
876
|
+
logger.debug(f'Checking if node pool {name} '
|
877
|
+
'has autoscaling enabled.')
|
878
|
+
autoscaling_enabled = (node_pool.get('autoscaling',
|
879
|
+
{}).get('enabled', False))
|
880
|
+
if autoscaling_enabled:
|
881
|
+
logger.debug(f'Node pool {name} has autoscaling enabled.')
|
882
|
+
try:
|
883
|
+
machine_type = node_pool.get('config',
|
884
|
+
{}).get('machineType', '')
|
885
|
+
if machine_type:
|
886
|
+
machine_types.append(machine_type)
|
887
|
+
except KeyError:
|
888
|
+
logger.debug(f'Encountered KeyError while checking machine '
|
889
|
+
f'type of node pool {name}.')
|
890
|
+
continue
|
891
|
+
return machine_types
|
892
|
+
|
761
893
|
@classmethod
|
762
894
|
def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
|
763
895
|
"""Validates the context name is in the format of
|
sky/skypilot_config.py
CHANGED
@@ -52,6 +52,7 @@ import contextlib
|
|
52
52
|
import copy
|
53
53
|
import json
|
54
54
|
import os
|
55
|
+
import pathlib
|
55
56
|
import tempfile
|
56
57
|
import threading
|
57
58
|
import typing
|
@@ -848,7 +849,9 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
848
849
|
|
849
850
|
global_config_path = _resolve_server_config_path()
|
850
851
|
if global_config_path is None:
|
851
|
-
|
852
|
+
# Fallback to ~/.sky/config.yaml, and make sure it exists.
|
853
|
+
global_config_path = os.path.expanduser(get_user_config_path())
|
854
|
+
pathlib.Path(global_config_path).touch(exist_ok=True)
|
852
855
|
|
853
856
|
db_updated = False
|
854
857
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|