skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +478 -0
- sky/backends/backend_utils.py +45 -4
- sky/backends/cloud_vm_ray_backend.py +32 -33
- sky/backends/task_codegen.py +340 -2
- sky/catalog/__init__.py +0 -3
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +14 -3
- sky/client/cli/command.py +329 -22
- sky/client/sdk.py +56 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +2 -1
- sky/clouds/vast.py +10 -0
- sky/core.py +128 -36
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +16 -2
- sky/global_user_state.py +3 -3
- sky/models.py +2 -0
- sky/optimizer.py +6 -5
- sky/provision/__init__.py +1 -0
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +42 -6
- sky/provision/provisioner.py +15 -6
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +10 -6
- sky/serve/server/impl.py +1 -1
- sky/server/constants.py +1 -1
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +12 -1
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +5 -1
- sky/server/requests/serializers/encoders.py +17 -0
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/server.py +78 -8
- sky/server/server_utils.py +30 -0
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +34 -9
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +2 -1
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +8 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/users/model.conf +1 -1
- sky/users/permission.py +24 -1
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/command_runner.py +197 -5
- sky/utils/command_runner.pyi +27 -4
- sky/utils/common_utils.py +18 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/schemas.py +31 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/dashboard/out/index.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/index-444f1804401f04ea.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/infra/%5Bcontext%5D-12c559ec4d81fdbd.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/infra/[context]","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/infra.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/infra-d187cd0413d72475.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/infra","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/6212-7bd06f60ba693125.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-fd5696f3bbbaddae.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-452f9d5cbdd2dc73.js" defer=""></script><script src="/dashboard/_next/static/chunks/9353-8369df1cf105221c.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs/%5Bjob%5D-895847b6cf200b04.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" 
defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs/[job]","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/6212-7bd06f60ba693125.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-fd5696f3bbbaddae.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-452f9d5cbdd2dc73.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs/pools/%5Bpool%5D-8d0f4655400b4eb9.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" 
type="application/json">{"props":{"pageProps":{}},"page":"/jobs/pools/[pool]","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/jobs.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/plugins/%5B...slug%5D-4f46050ca065d8f8.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/plugins/[...slug]","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/users.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-2f7646eb77785a2c.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/volumes.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7359-c8d04e06886000b3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-fd5696f3bbbaddae.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-452f9d5cbdd2dc73.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-09cbf02d3cd518c3.js" defer=""></script><script src="/dashboard/_next/static/chunks/9353-8369df1cf105221c.js" defer=""></script><script src="/dashboard/_next/static/chunks/2260-7703229c33c5ebd5.js" defer=""></script><script src="/dashboard/_next/static/chunks/3800-b589397dc09c5b4e.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/7615-019513abc55b3b47.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-9c810f01ff4f398a.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-96e0f298308da7e2.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c5a4cfd2600fc715.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-fba3de387ff6bb08.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-68b647e26f9d2793.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/KYAhEFa3FTfq4JyKVgo-s/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"KYAhEFa3FTfq4JyKVgo-s","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/data/mounting_utils.py
CHANGED
|
@@ -223,7 +223,10 @@ def get_gcs_mount_cmd(bucket_name: str,
|
|
|
223
223
|
"""Returns a command to mount a GCS bucket using gcsfuse."""
|
|
224
224
|
bucket_sub_path_arg = f'--only-dir {_bucket_sub_path} '\
|
|
225
225
|
if _bucket_sub_path else ''
|
|
226
|
-
|
|
226
|
+
log_file = '$(mktemp -t gcsfuse.XXXX.log)'
|
|
227
|
+
mount_cmd = (f'gcsfuse --log-file {log_file} '
|
|
228
|
+
'--debug_fuse_errors '
|
|
229
|
+
'-o allow_other '
|
|
227
230
|
'--implicit-dirs '
|
|
228
231
|
f'--stat-cache-capacity {_STAT_CACHE_CAPACITY} '
|
|
229
232
|
f'--stat-cache-ttl {_STAT_CACHE_TTL} '
|
|
@@ -646,8 +649,19 @@ def get_mounting_script(
|
|
|
646
649
|
else
|
|
647
650
|
echo "No goofys log file found in /tmp"
|
|
648
651
|
fi
|
|
652
|
+
elif [ "$MOUNT_BINARY" = "gcsfuse" ]; then
|
|
653
|
+
echo "Looking for gcsfuse log files..."
|
|
654
|
+
# Find gcsfuse log files in /tmp (created by mktemp -t gcsfuse.XXXX.log)
|
|
655
|
+
GCSFUSE_LOGS=$(ls -t /tmp/gcsfuse.*.log 2>/dev/null | head -1)
|
|
656
|
+
if [ -n "$GCSFUSE_LOGS" ]; then
|
|
657
|
+
echo "=== GCSFuse log file contents ==="
|
|
658
|
+
cat "$GCSFUSE_LOGS"
|
|
659
|
+
echo "=== End of gcsfuse log file ==="
|
|
660
|
+
else
|
|
661
|
+
echo "No gcsfuse log file found in /tmp"
|
|
662
|
+
fi
|
|
649
663
|
fi
|
|
650
|
-
# TODO(kevin): Print logs from rclone, etc too for observability.
|
|
664
|
+
# TODO(kevin): Print logs from rclone, blobfuse2, etc too for observability.
|
|
651
665
|
exit $MOUNT_EXIT_CODE
|
|
652
666
|
fi
|
|
653
667
|
echo "Mounting done."
|
sky/global_user_state.py
CHANGED
|
@@ -2241,7 +2241,7 @@ def get_volumes(is_ephemeral: Optional[bool] = None) -> List[Dict[str, Any]]:
|
|
|
2241
2241
|
rows = session.query(volume_table).all()
|
|
2242
2242
|
else:
|
|
2243
2243
|
rows = session.query(volume_table).filter_by(
|
|
2244
|
-
is_ephemeral=is_ephemeral).all()
|
|
2244
|
+
is_ephemeral=int(is_ephemeral)).all()
|
|
2245
2245
|
records = []
|
|
2246
2246
|
for row in rows:
|
|
2247
2247
|
records.append({
|
|
@@ -2253,7 +2253,7 @@ def get_volumes(is_ephemeral: Optional[bool] = None) -> List[Dict[str, Any]]:
|
|
|
2253
2253
|
'last_attached_at': row.last_attached_at,
|
|
2254
2254
|
'last_use': row.last_use,
|
|
2255
2255
|
'status': status_lib.VolumeStatus[row.status],
|
|
2256
|
-
'is_ephemeral': row.is_ephemeral,
|
|
2256
|
+
'is_ephemeral': bool(row.is_ephemeral),
|
|
2257
2257
|
})
|
|
2258
2258
|
return records
|
|
2259
2259
|
|
|
@@ -2316,7 +2316,7 @@ def add_volume(
|
|
|
2316
2316
|
last_attached_at=last_attached_at,
|
|
2317
2317
|
last_use=last_use,
|
|
2318
2318
|
status=status.value,
|
|
2319
|
-
is_ephemeral=is_ephemeral,
|
|
2319
|
+
is_ephemeral=int(is_ephemeral),
|
|
2320
2320
|
)
|
|
2321
2321
|
do_update_stmt = insert_stmnt.on_conflict_do_nothing()
|
|
2322
2322
|
session.execute(do_update_stmt)
|
sky/models.py
CHANGED
|
@@ -68,6 +68,8 @@ class KubernetesNodeInfo:
|
|
|
68
68
|
free: Dict[str, int]
|
|
69
69
|
# IP address of the node (external IP preferred, fallback to internal IP)
|
|
70
70
|
ip_address: Optional[str] = None
|
|
71
|
+
# Whether the node is ready (all conditions are satisfied)
|
|
72
|
+
is_ready: bool = True
|
|
71
73
|
|
|
72
74
|
|
|
73
75
|
@dataclasses.dataclass
|
sky/optimizer.py
CHANGED
|
@@ -781,7 +781,7 @@ class Optimizer:
|
|
|
781
781
|
def _instance_type_str(resources: 'resources_lib.Resources') -> str:
|
|
782
782
|
instance_type = resources.instance_type
|
|
783
783
|
assert instance_type is not None, 'Instance type must be specified'
|
|
784
|
-
if isinstance(resources.cloud, clouds.Kubernetes):
|
|
784
|
+
if isinstance(resources.cloud, (clouds.Kubernetes, clouds.Slurm)):
|
|
785
785
|
instance_type = '-'
|
|
786
786
|
if resources.use_spot:
|
|
787
787
|
instance_type = ''
|
|
@@ -865,11 +865,12 @@ class Optimizer:
|
|
|
865
865
|
'use_spot': resources.use_spot
|
|
866
866
|
}
|
|
867
867
|
|
|
868
|
-
# Handle special case for Kubernetes and
|
|
869
|
-
if isinstance(resources.cloud, clouds.Kubernetes):
|
|
868
|
+
# Handle special case for Kubernetes, SSH, and SLURM clouds
|
|
869
|
+
if isinstance(resources.cloud, (clouds.Kubernetes, clouds.Slurm)):
|
|
870
870
|
# Region for Kubernetes-like clouds (SSH, Kubernetes) is the
|
|
871
|
-
# context name, i.e. different Kubernetes clusters.
|
|
872
|
-
#
|
|
871
|
+
# context name, i.e. different Kubernetes clusters.
|
|
872
|
+
# Region for SLURM is the cluster name.
|
|
873
|
+
# We add region to the key to show all the clusters in the
|
|
873
874
|
# optimizer table for better UX.
|
|
874
875
|
|
|
875
876
|
if resources.cloud.__class__.__name__ == 'SSH':
|
sky/provision/__init__.py
CHANGED
|
@@ -29,6 +29,7 @@ from sky.provision import runpod
|
|
|
29
29
|
from sky.provision import scp
|
|
30
30
|
from sky.provision import seeweb
|
|
31
31
|
from sky.provision import shadeform
|
|
32
|
+
from sky.provision import slurm
|
|
32
33
|
from sky.provision import ssh
|
|
33
34
|
from sky.provision import vast
|
|
34
35
|
from sky.provision import vsphere
|
sky/provision/common.py
CHANGED
|
@@ -6,6 +6,7 @@ import os
|
|
|
6
6
|
from typing import Any, Dict, List, Optional, Tuple
|
|
7
7
|
|
|
8
8
|
from sky import sky_logging
|
|
9
|
+
from sky.utils import config_utils
|
|
9
10
|
from sky.utils import env_options
|
|
10
11
|
from sky.utils import resources_utils
|
|
11
12
|
|
|
@@ -36,6 +37,13 @@ class StopFailoverError(Exception):
|
|
|
36
37
|
"""
|
|
37
38
|
|
|
38
39
|
|
|
40
|
+
# These fields are sensitive and should be redacted from the config for logging
|
|
41
|
+
# purposes.
|
|
42
|
+
SENSITIVE_FIELDS = [
|
|
43
|
+
('docker_config', 'docker_login_config', 'password'),
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
39
47
|
@dataclasses.dataclass
|
|
40
48
|
class ProvisionConfig:
|
|
41
49
|
"""Configuration for provisioning."""
|
|
@@ -56,6 +64,18 @@ class ProvisionConfig:
|
|
|
56
64
|
# Optional ports to open on launch of the cluster.
|
|
57
65
|
ports_to_open_on_launch: Optional[List[int]]
|
|
58
66
|
|
|
67
|
+
def get_redacted_config(self) -> Dict[str, Any]:
    """Return this config as a plain dict with sensitive values masked.

    The dataclass is first converted with ``dataclasses.asdict`` so the
    masking below operates on a copy rather than on this instance's own
    fields.

    Returns:
        A dict view of the config in which every path listed in
        ``SENSITIVE_FIELDS`` that currently holds a non-None value is
        replaced by the string '<redacted>'.
    """
    redacted = config_utils.Config(dataclasses.asdict(self))

    for sensitive_path in SENSITIVE_FIELDS:
        current = redacted.get_nested(sensitive_path, default_value=None)
        if current is None:
            # Nothing stored at this path; leave the structure untouched.
            continue
        redacted.set_nested(sensitive_path, '<redacted>')

    return dict(**redacted)
|
|
78
|
+
|
|
59
79
|
|
|
60
80
|
# -------------------- output data model -------------------- #
|
|
61
81
|
|
sky/provision/docker_utils.py
CHANGED
|
@@ -176,6 +176,17 @@ def _with_interactive(cmd):
|
|
|
176
176
|
return ['bash', '--login', '-c', '-i', shlex.quote(force_interactive)]
|
|
177
177
|
|
|
178
178
|
|
|
179
|
+
def _redact_docker_password(cmd: str) -> str:
    """Return ``cmd`` with any docker ``--password`` value masked for logging.

    The command is tokenized with ``shlex.split`` and re-joined with single
    spaces, so the result is intended for log output only: the quoting of
    the original string is not preserved.

    Both spellings of the flag are handled:
      * ``--password=<value>`` becomes ``--password=<redacted>``;
      * ``--password <value>`` has the following token replaced by
        ``<redacted>``.

    Only the exact ``--password`` flag (or the ``--password=`` prefix) is
    matched. Options that merely share the prefix, such as
    ``--password-stdin``, do not cause the token after them to be masked.

    NOTE: a password that sits inside a quoted sub-command (which
    ``shlex.split`` yields as one single token) is not detected.

    Args:
        cmd: Shell command line to sanitize.

    Returns:
        The space-joined tokens of ``cmd`` with password values replaced.

    Raises:
        ValueError: Propagated from ``shlex.split`` when ``cmd`` contains
            unbalanced quotes.
    """
    parts = shlex.split(cmd)
    for i, part in enumerate(parts):
        if part.startswith('--password='):
            parts[i] = '--password=<redacted>'
        elif part == '--password' and i + 1 < len(parts):
            # The value is the next token; mask it in place.
            parts[i + 1] = '<redacted>'
    return ' '.join(parts)
|
|
188
|
+
|
|
189
|
+
|
|
179
190
|
# SkyPilot: New class to initialize docker containers on a remote node.
|
|
180
191
|
# Adopted from ray.autoscaler._private.command_runner.DockerCommandRunner.
|
|
181
192
|
class DockerInitializer:
|
|
@@ -219,7 +230,9 @@ class DockerInitializer:
|
|
|
219
230
|
cmd = (f'flock {flock_args} /tmp/{flock_name} '
|
|
220
231
|
f'-c {shlex.quote(cmd)}')
|
|
221
232
|
|
|
222
|
-
|
|
233
|
+
# Redact the password in the login command.
|
|
234
|
+
redacted_cmd = _redact_docker_password(cmd)
|
|
235
|
+
logger.debug(f'+ {redacted_cmd}')
|
|
223
236
|
start = time.time()
|
|
224
237
|
while True:
|
|
225
238
|
rc, stdout, stderr = self.runner.run(
|
|
@@ -251,7 +264,7 @@ class DockerInitializer:
|
|
|
251
264
|
break
|
|
252
265
|
subprocess_utils.handle_returncode(
|
|
253
266
|
rc,
|
|
254
|
-
|
|
267
|
+
redacted_cmd,
|
|
255
268
|
error_msg='Failed to run docker setup commands.',
|
|
256
269
|
stderr=stdout + stderr,
|
|
257
270
|
# Print out the error message if the command failed.
|
|
@@ -1205,15 +1205,24 @@ class V1NodeAddress:
|
|
|
1205
1205
|
address: str
|
|
1206
1206
|
|
|
1207
1207
|
|
|
1208
|
+
@dataclasses.dataclass
class V1NodeCondition:
    """Represents a Kubernetes node condition."""
    # Name of the condition, e.g. 'Ready'.
    type: str
    # Condition status kept as a string; 'True' means the condition holds.
    status: str
|
|
1213
|
+
|
|
1214
|
+
|
|
1208
1215
|
@dataclasses.dataclass
class V1NodeStatus:
    """Status section of a Kubernetes node."""
    # Resource name -> quantity string that is allocatable on the node.
    allocatable: Dict[str, str]
    # Resource name -> quantity string for the node's capacity.
    capacity: Dict[str, str]
    # Addresses reported for the node (each has a type and an address).
    addresses: List[V1NodeAddress]
    # Conditions reported for the node (each has a type and a status).
    conditions: List[V1NodeCondition]
|
|
1213
1221
|
|
|
1214
1222
|
|
|
1215
1223
|
@dataclasses.dataclass
|
|
1216
1224
|
class V1Node:
|
|
1225
|
+
"""Represents a Kubernetes node."""
|
|
1217
1226
|
metadata: V1ObjectMeta
|
|
1218
1227
|
status: V1NodeStatus
|
|
1219
1228
|
|
|
@@ -1231,8 +1240,24 @@ class V1Node:
|
|
|
1231
1240
|
V1NodeAddress(type=addr['type'],
|
|
1232
1241
|
address=addr['address'])
|
|
1233
1242
|
for addr in data['status'].get('addresses', [])
|
|
1243
|
+
],
|
|
1244
|
+
conditions=[
|
|
1245
|
+
V1NodeCondition(type=cond['type'],
|
|
1246
|
+
status=cond['status'])
|
|
1247
|
+
for cond in data['status'].get('conditions', [])
|
|
1234
1248
|
]))
|
|
1235
1249
|
|
|
1250
|
+
def is_ready(self) -> bool:
    """Report whether this node's 'Ready' condition has status 'True'.

    Only the first condition whose type is 'Ready' is consulted. A node
    that carries no such condition is treated as not ready.

    Returns:
        True if the first 'Ready' condition has status 'True', otherwise
        False.
    """
    ready_statuses = (cond.status
                      for cond in self.status.conditions
                      if cond.type == 'Ready')
    # None stands in for "no Ready condition present" and compares unequal
    # to 'True', which yields False below.
    first_status = next(ready_statuses, None)
    return first_status == 'True'
|
|
1260
|
+
|
|
1236
1261
|
|
|
1237
1262
|
@annotations.lru_cache(scope='request', maxsize=10)
|
|
1238
1263
|
@_retry_on_error(resource_type='node')
|
|
@@ -1451,11 +1476,12 @@ def check_instance_fits(context: Optional[str],
|
|
|
1451
1476
|
return False, str(e)
|
|
1452
1477
|
# Get the set of nodes that have the GPU type
|
|
1453
1478
|
gpu_nodes = [
|
|
1454
|
-
node for node in nodes
|
|
1479
|
+
node for node in nodes
|
|
1480
|
+
if node.is_ready() and gpu_label_key in node.metadata.labels and
|
|
1455
1481
|
node.metadata.labels[gpu_label_key] in gpu_label_values
|
|
1456
1482
|
]
|
|
1457
1483
|
if not gpu_nodes:
|
|
1458
|
-
return False, f'No GPU nodes found with {acc_type} on the cluster'
|
|
1484
|
+
return False, f'No ready GPU nodes found with {acc_type} on the cluster'
|
|
1459
1485
|
if is_tpu_on_gke(acc_type):
|
|
1460
1486
|
# If requested accelerator is a TPU type, check if the cluster
|
|
1461
1487
|
# has sufficient TPU resource to meet the requirement.
|
|
@@ -1479,7 +1505,9 @@ def check_instance_fits(context: Optional[str],
|
|
|
1479
1505
|
f'enough CPU (> {k8s_instance_type.cpus} CPUs) and/or '
|
|
1480
1506
|
f'memory (> {k8s_instance_type.memory} G). ')
|
|
1481
1507
|
else:
|
|
1482
|
-
candidate_nodes = nodes
|
|
1508
|
+
candidate_nodes = [node for node in nodes if node.is_ready()]
|
|
1509
|
+
if not candidate_nodes:
|
|
1510
|
+
return False, 'No ready nodes found in the cluster.'
|
|
1483
1511
|
not_fit_reason_prefix = (f'No nodes found with enough '
|
|
1484
1512
|
f'CPU (> {k8s_instance_type.cpus} CPUs) '
|
|
1485
1513
|
'and/or memory '
|
|
@@ -3078,16 +3106,23 @@ def get_kubernetes_node_info(
|
|
|
3078
3106
|
|
|
3079
3107
|
accelerator_count = get_node_accelerator_count(context,
|
|
3080
3108
|
node.status.allocatable)
|
|
3109
|
+
# Check if node is ready
|
|
3110
|
+
node_is_ready = node.is_ready()
|
|
3111
|
+
|
|
3081
3112
|
if accelerator_count == 0:
|
|
3082
3113
|
node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
|
|
3083
3114
|
name=node.metadata.name,
|
|
3084
3115
|
accelerator_type=accelerator_name,
|
|
3085
3116
|
total={'accelerator_count': 0},
|
|
3086
3117
|
free={'accelerators_available': 0},
|
|
3087
|
-
ip_address=node_ip
|
|
3118
|
+
ip_address=node_ip,
|
|
3119
|
+
is_ready=node_is_ready)
|
|
3088
3120
|
continue
|
|
3089
3121
|
|
|
3090
|
-
if not
|
|
3122
|
+
if not node_is_ready:
|
|
3123
|
+
# If node is not ready, report 0 available GPUs
|
|
3124
|
+
accelerators_available = 0
|
|
3125
|
+
elif not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
|
|
3091
3126
|
accelerators_available = -1
|
|
3092
3127
|
else:
|
|
3093
3128
|
allocated_qty = allocated_qty_by_node[node.metadata.name]
|
|
@@ -3105,7 +3140,8 @@ def get_kubernetes_node_info(
|
|
|
3105
3140
|
accelerator_type=accelerator_name,
|
|
3106
3141
|
total={'accelerator_count': int(accelerator_count)},
|
|
3107
3142
|
free={'accelerators_available': int(accelerators_available)},
|
|
3108
|
-
ip_address=node_ip
|
|
3143
|
+
ip_address=node_ip,
|
|
3144
|
+
is_ready=node_is_ready)
|
|
3109
3145
|
hint = ''
|
|
3110
3146
|
if has_multi_host_tpu:
|
|
3111
3147
|
hint = ('(Note: Multi-host TPUs are detected and excluded from the '
|
sky/provision/provisioner.py
CHANGED
|
@@ -157,9 +157,9 @@ def bulk_provision(
|
|
|
157
157
|
logger.debug(f'SkyPilot version: {sky.__version__}; '
|
|
158
158
|
f'commit: {sky.__commit__}')
|
|
159
159
|
logger.debug(_TITLE.format('Provisioning'))
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
160
|
+
redacted_config = bootstrap_config.get_redacted_config()
|
|
161
|
+
logger.debug('Provision config:\n'
|
|
162
|
+
f'{json.dumps(redacted_config, indent=2)}')
|
|
163
163
|
return _bulk_provision(cloud, region, cluster_name,
|
|
164
164
|
bootstrap_config)
|
|
165
165
|
except exceptions.NoClusterLaunchedError:
|
|
@@ -635,10 +635,15 @@ def _post_provision_setup(
|
|
|
635
635
|
status.update(
|
|
636
636
|
runtime_preparation_str.format(step=3, step_name='runtime'))
|
|
637
637
|
|
|
638
|
+
skip_ray_setup = False
|
|
638
639
|
ray_port = constants.SKY_REMOTE_RAY_PORT
|
|
639
640
|
head_ray_needs_restart = True
|
|
640
641
|
ray_cluster_healthy = False
|
|
641
|
-
if (not
|
|
642
|
+
if (launched_resources.cloud is not None and
|
|
643
|
+
not launched_resources.cloud.uses_ray()):
|
|
644
|
+
skip_ray_setup = True
|
|
645
|
+
logger.debug('Skip Ray cluster setup as cloud does not use Ray.')
|
|
646
|
+
elif (not provision_record.is_instance_just_booted(
|
|
642
647
|
head_instance.instance_id)):
|
|
643
648
|
# Check if head node Ray is alive
|
|
644
649
|
(ray_port, ray_cluster_healthy,
|
|
@@ -663,7 +668,9 @@ def _post_provision_setup(
|
|
|
663
668
|
'async setup to complete...')
|
|
664
669
|
time.sleep(1)
|
|
665
670
|
|
|
666
|
-
if
|
|
671
|
+
if skip_ray_setup:
|
|
672
|
+
logger.debug('Skip Ray cluster setup on the head node.')
|
|
673
|
+
elif head_ray_needs_restart:
|
|
667
674
|
logger.debug('Starting Ray on the entire cluster.')
|
|
668
675
|
instance_setup.start_ray_on_head_node(
|
|
669
676
|
cluster_name.name_on_cloud,
|
|
@@ -686,7 +693,9 @@ def _post_provision_setup(
|
|
|
686
693
|
# We don't need to restart ray on worker nodes if the ray cluster is
|
|
687
694
|
# already healthy, i.e. the head node has expected number of nodes
|
|
688
695
|
# connected to the ray cluster.
|
|
689
|
-
if
|
|
696
|
+
if skip_ray_setup:
|
|
697
|
+
logger.debug('Skip Ray cluster setup on the worker nodes.')
|
|
698
|
+
elif cluster_info.num_instances > 1 and not ray_cluster_healthy:
|
|
690
699
|
instance_setup.start_ray_on_worker_nodes(
|
|
691
700
|
cluster_name.name_on_cloud,
|
|
692
701
|
no_restart=not head_ray_needs_restart,
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Slurm provisioner for SkyPilot."""
|
|
2
|
+
|
|
3
|
+
from sky.provision.slurm.config import bootstrap_instances
|
|
4
|
+
from sky.provision.slurm.instance import cleanup_ports
|
|
5
|
+
from sky.provision.slurm.instance import get_cluster_info
|
|
6
|
+
from sky.provision.slurm.instance import get_command_runners
|
|
7
|
+
from sky.provision.slurm.instance import open_ports
|
|
8
|
+
from sky.provision.slurm.instance import query_instances
|
|
9
|
+
from sky.provision.slurm.instance import run_instances
|
|
10
|
+
from sky.provision.slurm.instance import stop_instances
|
|
11
|
+
from sky.provision.slurm.instance import terminate_instances
|
|
12
|
+
from sky.provision.slurm.instance import wait_instances
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Slurm-specific configuration for the provisioner."""
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from sky.provision import common
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return the provision config unchanged.

    Args:
        region: Not consulted.
        cluster_name: Not consulted.
        config: Provision config to hand back.

    Returns:
        The very same ``config`` object that was passed in.
    """
    # Neither identifier is needed here; mark both as intentionally unused.
    _ = (region, cluster_name)
    return config
|