skypilot-nightly 1.0.0.dev20250827__py3-none-any.whl → 1.0.0.dev20250829__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/admin_policy.py +11 -10
- sky/authentication.py +1 -1
- sky/backends/backend.py +3 -5
- sky/backends/backend_utils.py +140 -52
- sky/backends/cloud_vm_ray_backend.py +30 -25
- sky/backends/local_docker_backend.py +3 -8
- sky/backends/wheel_utils.py +35 -8
- sky/client/cli/command.py +41 -9
- sky/client/sdk.py +23 -8
- sky/client/sdk_async.py +6 -2
- sky/clouds/aws.py +118 -1
- sky/core.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +82 -22
- sky/jobs/client/sdk.py +5 -2
- sky/jobs/recovery_strategy.py +9 -4
- sky/jobs/server/server.py +2 -1
- sky/logs/agent.py +2 -2
- sky/logs/aws.py +6 -3
- sky/provision/aws/config.py +78 -3
- sky/provision/aws/instance.py +45 -6
- sky/provision/do/utils.py +2 -1
- sky/provision/kubernetes/instance.py +55 -11
- sky/provision/kubernetes/utils.py +11 -2
- sky/provision/nebius/utils.py +36 -2
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/serve/client/impl.py +5 -4
- sky/serve/replica_managers.py +4 -3
- sky/serve/serve_utils.py +2 -2
- sky/serve/server/impl.py +3 -2
- sky/serve/server/server.py +2 -1
- sky/server/auth/oauth2_proxy.py +10 -4
- sky/server/common.py +4 -4
- sky/server/daemons.py +16 -5
- sky/server/requests/executor.py +5 -3
- sky/server/requests/payloads.py +3 -1
- sky/server/requests/preconditions.py +3 -2
- sky/server/requests/requests.py +121 -19
- sky/server/server.py +85 -60
- sky/server/stream_utils.py +7 -5
- sky/setup_files/dependencies.py +6 -1
- sky/sky_logging.py +28 -0
- sky/skylet/constants.py +6 -0
- sky/skylet/events.py +2 -3
- sky/skypilot_config.py +10 -10
- sky/task.py +1 -1
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +4 -8
- sky/usage/usage_lib.py +3 -2
- sky/utils/annotations.py +8 -2
- sky/utils/cluster_utils.py +3 -3
- sky/utils/common_utils.py +0 -72
- sky/utils/controller_utils.py +4 -3
- sky/utils/dag_utils.py +4 -4
- sky/utils/db/db_utils.py +11 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/config_map_utils.py +3 -3
- sky/utils/kubernetes_enums.py +1 -0
- sky/utils/lock_events.py +94 -0
- sky/utils/schemas.py +3 -0
- sky/utils/timeline.py +24 -93
- sky/utils/yaml_utils.py +77 -10
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/METADATA +8 -2
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/RECORD +86 -84
- /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/top_level.txt +0 -0
sky/dashboard/out/jobs.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-7421e63ac35f8fce.js" defer=""></script><script src="/dashboard/_next/static
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-7421e63ac35f8fce.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/users.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-018bf31cda52e11b.js" defer=""></script><script src="/dashboard/_next/static
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-018bf31cda52e11b.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/volumes.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-739726d6b823f532.js" defer=""></script><script src="/dashboard/_next/static
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-739726d6b823f532.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-8afcf719ea87debc.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-6c9c09593b1e67b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-8afcf719ea87debc.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-6c9c09593b1e67b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/global_user_state.py
CHANGED
|
@@ -53,6 +53,7 @@ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
|
53
53
|
_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
|
|
54
54
|
|
|
55
55
|
DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
|
|
56
|
+
DEBUG_CLUSTER_EVENT_RETENTION_HOURS = 30 * 24.0
|
|
56
57
|
MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600
|
|
57
58
|
|
|
58
59
|
_UNIQUE_CONSTRAINT_FAILED_ERROR_MSGS = [
|
|
@@ -205,6 +206,7 @@ cluster_event_table = sqlalchemy.Table(
|
|
|
205
206
|
sqlalchemy.Column('reason', sqlalchemy.Text, primary_key=True),
|
|
206
207
|
sqlalchemy.Column('transitioned_at', sqlalchemy.Integer, primary_key=True),
|
|
207
208
|
sqlalchemy.Column('type', sqlalchemy.Text),
|
|
209
|
+
sqlalchemy.Column('request_id', sqlalchemy.Text, server_default=None),
|
|
208
210
|
)
|
|
209
211
|
|
|
210
212
|
ssh_key_table = sqlalchemy.Table(
|
|
@@ -595,7 +597,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
|
595
597
|
if (is_launch and not cluster_row or
|
|
596
598
|
cluster_row.status != status_lib.ClusterStatus.UP.value):
|
|
597
599
|
conditional_values.update({
|
|
598
|
-
'last_creation_yaml':
|
|
600
|
+
'last_creation_yaml': yaml_utils.dump_yaml_str(task_config)
|
|
599
601
|
if task_config else None,
|
|
600
602
|
'last_creation_command': last_use,
|
|
601
603
|
})
|
|
@@ -744,6 +746,7 @@ def add_cluster_event(cluster_name: str,
|
|
|
744
746
|
elif last_event == reason:
|
|
745
747
|
return
|
|
746
748
|
try:
|
|
749
|
+
request_id = common_utils.get_current_request_id()
|
|
747
750
|
session.execute(
|
|
748
751
|
insert_func(cluster_event_table).values(
|
|
749
752
|
cluster_hash=cluster_hash,
|
|
@@ -753,6 +756,7 @@ def add_cluster_event(cluster_name: str,
|
|
|
753
756
|
reason=reason,
|
|
754
757
|
transitioned_at=transitioned_at,
|
|
755
758
|
type=event_type.value,
|
|
759
|
+
request_id=request_id,
|
|
756
760
|
))
|
|
757
761
|
session.commit()
|
|
758
762
|
except sqlalchemy.exc.IntegrityError as e:
|
|
@@ -807,12 +811,15 @@ def _get_last_cluster_event_multiple(
|
|
|
807
811
|
return {row.cluster_hash: row.reason for row in rows}
|
|
808
812
|
|
|
809
813
|
|
|
810
|
-
def cleanup_cluster_events_with_retention(retention_hours: float
|
|
814
|
+
def cleanup_cluster_events_with_retention(retention_hours: float,
|
|
815
|
+
event_type: ClusterEventType) -> None:
|
|
811
816
|
assert _SQLALCHEMY_ENGINE is not None
|
|
817
|
+
# Once for events with type STATUS_CHANGE.
|
|
812
818
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
813
819
|
query = session.query(cluster_event_table).filter(
|
|
814
|
-
cluster_event_table.c.transitioned_at <
|
|
815
|
-
retention_hours * 3600
|
|
820
|
+
cluster_event_table.c.transitioned_at <
|
|
821
|
+
time.time() - retention_hours * 3600,
|
|
822
|
+
cluster_event_table.c.type == event_type.value)
|
|
816
823
|
logger.debug(f'Deleting {query.count()} cluster events.')
|
|
817
824
|
query.delete()
|
|
818
825
|
session.commit()
|
|
@@ -827,9 +834,20 @@ async def cluster_event_retention_daemon():
|
|
|
827
834
|
retention_hours = skypilot_config.get_nested(
|
|
828
835
|
('api_server', 'cluster_event_retention_hours'),
|
|
829
836
|
DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
|
|
837
|
+
debug_retention_hours = skypilot_config.get_nested(
|
|
838
|
+
('api_server', 'cluster_debug_event_retention_hours'),
|
|
839
|
+
DEBUG_CLUSTER_EVENT_RETENTION_HOURS)
|
|
830
840
|
try:
|
|
831
841
|
if retention_hours >= 0:
|
|
832
|
-
|
|
842
|
+
logger.debug('Cleaning up cluster events with retention '
|
|
843
|
+
f'{retention_hours} hours.')
|
|
844
|
+
cleanup_cluster_events_with_retention(
|
|
845
|
+
retention_hours, ClusterEventType.STATUS_CHANGE)
|
|
846
|
+
if debug_retention_hours >= 0:
|
|
847
|
+
logger.debug('Cleaning up debug cluster events with retention '
|
|
848
|
+
f'{debug_retention_hours} hours.')
|
|
849
|
+
cleanup_cluster_events_with_retention(debug_retention_hours,
|
|
850
|
+
ClusterEventType.DEBUG)
|
|
833
851
|
except asyncio.CancelledError:
|
|
834
852
|
logger.info('Cluster event retention daemon cancelled')
|
|
835
853
|
break
|
|
@@ -837,8 +855,9 @@ async def cluster_event_retention_daemon():
|
|
|
837
855
|
logger.error(f'Error running cluster event retention daemon: {e}')
|
|
838
856
|
|
|
839
857
|
# Run daemon at most once every hour to avoid too frequent cleanup.
|
|
840
|
-
sleep_amount = max(
|
|
841
|
-
|
|
858
|
+
sleep_amount = max(
|
|
859
|
+
min(retention_hours * 3600, debug_retention_hours * 3600),
|
|
860
|
+
MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
|
|
842
861
|
await asyncio.sleep(sleep_amount)
|
|
843
862
|
|
|
844
863
|
|
|
@@ -904,8 +923,7 @@ def update_last_use(cluster_name: str):
|
|
|
904
923
|
|
|
905
924
|
|
|
906
925
|
@_init_db
|
|
907
|
-
def remove_cluster(cluster_name: str, terminate: bool
|
|
908
|
-
remove_events: bool) -> None:
|
|
926
|
+
def remove_cluster(cluster_name: str, terminate: bool) -> None:
|
|
909
927
|
"""Removes cluster_name mapping."""
|
|
910
928
|
assert _SQLALCHEMY_ENGINE is not None
|
|
911
929
|
cluster_hash = _get_hash_for_existing_cluster(cluster_name)
|
|
@@ -933,9 +951,6 @@ def remove_cluster(cluster_name: str, terminate: bool,
|
|
|
933
951
|
|
|
934
952
|
if terminate:
|
|
935
953
|
session.query(cluster_table).filter_by(name=cluster_name).delete()
|
|
936
|
-
if remove_events:
|
|
937
|
-
session.query(cluster_event_table).filter_by(
|
|
938
|
-
cluster_hash=cluster_hash).delete()
|
|
939
954
|
else:
|
|
940
955
|
handle = get_handle_from_cluster_name(cluster_name)
|
|
941
956
|
if handle is None:
|
|
@@ -2070,19 +2085,51 @@ def get_cluster_yaml_str(cluster_yaml_path: Optional[str]) -> Optional[str]:
|
|
|
2070
2085
|
row = session.query(cluster_yaml_table).filter_by(
|
|
2071
2086
|
cluster_name=cluster_name).first()
|
|
2072
2087
|
if row is None:
|
|
2073
|
-
|
|
2074
|
-
# on the local file system and migrate it to the database.
|
|
2075
|
-
# TODO(syang): remove this check once we have a way to migrate the
|
|
2076
|
-
# cluster from file to database. Remove on v0.12.0.
|
|
2077
|
-
if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
|
|
2078
|
-
with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
|
|
2079
|
-
yaml_str = f.read()
|
|
2080
|
-
set_cluster_yaml(cluster_name, yaml_str)
|
|
2081
|
-
return yaml_str
|
|
2082
|
-
return None
|
|
2088
|
+
return _set_cluster_yaml_from_file(cluster_yaml_path, cluster_name)
|
|
2083
2089
|
return row.yaml
|
|
2084
2090
|
|
|
2085
2091
|
|
|
2092
|
+
def get_cluster_yaml_str_multiple(cluster_yaml_paths: List[str]) -> List[str]:
|
|
2093
|
+
"""Get the cluster yaml from the database or the local file system.
|
|
2094
|
+
"""
|
|
2095
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
2096
|
+
cluster_names_to_yaml_paths = {}
|
|
2097
|
+
for cluster_yaml_path in cluster_yaml_paths:
|
|
2098
|
+
cluster_name, _ = os.path.splitext(os.path.basename(cluster_yaml_path))
|
|
2099
|
+
cluster_names_to_yaml_paths[cluster_name] = cluster_yaml_path
|
|
2100
|
+
|
|
2101
|
+
cluster_names = list(cluster_names_to_yaml_paths.keys())
|
|
2102
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
2103
|
+
rows = session.query(cluster_yaml_table).filter(
|
|
2104
|
+
cluster_yaml_table.c.cluster_name.in_(cluster_names)).all()
|
|
2105
|
+
row_cluster_names_to_yaml = {row.cluster_name: row.yaml for row in rows}
|
|
2106
|
+
|
|
2107
|
+
yaml_strs = []
|
|
2108
|
+
for cluster_name in cluster_names:
|
|
2109
|
+
if cluster_name in row_cluster_names_to_yaml:
|
|
2110
|
+
yaml_strs.append(row_cluster_names_to_yaml[cluster_name])
|
|
2111
|
+
else:
|
|
2112
|
+
yaml_str = _set_cluster_yaml_from_file(
|
|
2113
|
+
cluster_names_to_yaml_paths[cluster_name], cluster_name)
|
|
2114
|
+
yaml_strs.append(yaml_str)
|
|
2115
|
+
return yaml_strs
|
|
2116
|
+
|
|
2117
|
+
|
|
2118
|
+
def _set_cluster_yaml_from_file(cluster_yaml_path: str,
|
|
2119
|
+
cluster_name: str) -> Optional[str]:
|
|
2120
|
+
"""Set the cluster yaml in the database from a file."""
|
|
2121
|
+
# If the cluster yaml is not in the database, check if it exists
|
|
2122
|
+
# on the local file system and migrate it to the database.
|
|
2123
|
+
# TODO(syang): remove this check once we have a way to migrate the
|
|
2124
|
+
# cluster from file to database. Remove on v0.12.0.
|
|
2125
|
+
if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
|
|
2126
|
+
with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
|
|
2127
|
+
yaml_str = f.read()
|
|
2128
|
+
set_cluster_yaml(cluster_name, yaml_str)
|
|
2129
|
+
return yaml_str
|
|
2130
|
+
return None
|
|
2131
|
+
|
|
2132
|
+
|
|
2086
2133
|
def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
|
|
2087
2134
|
"""Get the cluster yaml as a dictionary from the database.
|
|
2088
2135
|
|
|
@@ -2094,6 +2141,19 @@ def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
|
|
|
2094
2141
|
return yaml_utils.safe_load(yaml_str)
|
|
2095
2142
|
|
|
2096
2143
|
|
|
2144
|
+
def get_cluster_yaml_dict_multiple(
|
|
2145
|
+
cluster_yaml_paths: List[str]) -> List[Dict[str, Any]]:
|
|
2146
|
+
"""Get the cluster yaml as a dictionary from the database."""
|
|
2147
|
+
yaml_strs = get_cluster_yaml_str_multiple(cluster_yaml_paths)
|
|
2148
|
+
yaml_dicts = []
|
|
2149
|
+
for idx, yaml_str in enumerate(yaml_strs):
|
|
2150
|
+
if yaml_str is None:
|
|
2151
|
+
raise ValueError(
|
|
2152
|
+
f'Cluster yaml {cluster_yaml_paths[idx]} not found.')
|
|
2153
|
+
yaml_dicts.append(yaml_utils.safe_load(yaml_str))
|
|
2154
|
+
return yaml_dicts
|
|
2155
|
+
|
|
2156
|
+
|
|
2097
2157
|
@_init_db
|
|
2098
2158
|
def set_cluster_yaml(cluster_name: str, yaml_str: str) -> None:
|
|
2099
2159
|
"""Set the cluster yaml in the database."""
|
sky/jobs/client/sdk.py
CHANGED
|
@@ -243,7 +243,7 @@ def tail_logs(name: Optional[str] = None,
|
|
|
243
243
|
controller: bool = False,
|
|
244
244
|
refresh: bool = False,
|
|
245
245
|
tail: Optional[int] = None,
|
|
246
|
-
output_stream: Optional['io.TextIOBase'] = None) -> int:
|
|
246
|
+
output_stream: Optional['io.TextIOBase'] = None) -> Optional[int]:
|
|
247
247
|
"""Tails logs of managed jobs.
|
|
248
248
|
|
|
249
249
|
You can provide either a job name or a job ID to tail logs. If both are not
|
|
@@ -263,6 +263,8 @@ def tail_logs(name: Optional[str] = None,
|
|
|
263
263
|
Exit code based on success or failure of the job. 0 if success,
|
|
264
264
|
100 if the job failed. See exceptions.JobExitCode for possible exit
|
|
265
265
|
codes.
|
|
266
|
+
Will return None if follow is False
|
|
267
|
+
(see note in sky/client/sdk.py::stream_response)
|
|
266
268
|
|
|
267
269
|
Request Raises:
|
|
268
270
|
ValueError: invalid arguments.
|
|
@@ -289,7 +291,8 @@ def tail_logs(name: Optional[str] = None,
|
|
|
289
291
|
return sdk.stream_response(request_id=request_id,
|
|
290
292
|
response=response,
|
|
291
293
|
output_stream=output_stream,
|
|
292
|
-
resumable=(tail == 0)
|
|
294
|
+
resumable=(tail == 0),
|
|
295
|
+
get_result=follow)
|
|
293
296
|
|
|
294
297
|
|
|
295
298
|
@usage_lib.entrypoint
|
sky/jobs/recovery_strategy.py
CHANGED
|
@@ -327,10 +327,15 @@ class StrategyExecutor:
|
|
|
327
327
|
cluster_name=self.cluster_name,
|
|
328
328
|
# We expect to tear down the cluster as soon as
|
|
329
329
|
# the job is finished. However, in case the
|
|
330
|
-
# controller dies,
|
|
331
|
-
#
|
|
332
|
-
|
|
333
|
-
|
|
330
|
+
# controller dies, we may end up with a
|
|
331
|
+
# resource leak.
|
|
332
|
+
# Ideally, we should autodown to be safe,
|
|
333
|
+
# but it's fine to disable it for now, as
|
|
334
|
+
# Nebius doesn't support autodown yet.
|
|
335
|
+
# TODO(kevin): set down=True once Nebius
|
|
336
|
+
# supports autodown.
|
|
337
|
+
# idle_minutes_to_autostop=_AUTODOWN_MINUTES,
|
|
338
|
+
# down=True,
|
|
334
339
|
_is_launched_by_jobs_controller=True)
|
|
335
340
|
else:
|
|
336
341
|
self.cluster_name = (
|
sky/jobs/server/server.py
CHANGED
|
@@ -79,7 +79,8 @@ async def logs(
|
|
|
79
79
|
if jobs_logs_body.refresh else api_requests.ScheduleType.SHORT,
|
|
80
80
|
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
|
81
81
|
)
|
|
82
|
-
request_task = api_requests.
|
|
82
|
+
request_task = await api_requests.get_request_async(request.state.request_id
|
|
83
|
+
)
|
|
83
84
|
|
|
84
85
|
return stream_utils.stream_response(
|
|
85
86
|
request_id=request_task.request_id,
|
sky/logs/agent.py
CHANGED
|
@@ -5,8 +5,8 @@ import shlex
|
|
|
5
5
|
from typing import Any, Dict
|
|
6
6
|
|
|
7
7
|
from sky.skylet import constants
|
|
8
|
-
from sky.utils import common_utils
|
|
9
8
|
from sky.utils import resources_utils
|
|
9
|
+
from sky.utils import yaml_utils
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class LoggingAgent(abc.ABC):
|
|
@@ -65,7 +65,7 @@ class FluentbitAgent(LoggingAgent):
|
|
|
65
65
|
'outputs': [self.fluentbit_output_config(cluster_name)],
|
|
66
66
|
}
|
|
67
67
|
}
|
|
68
|
-
return
|
|
68
|
+
return yaml_utils.dump_yaml_str(cfg_dict)
|
|
69
69
|
|
|
70
70
|
@abc.abstractmethod
|
|
71
71
|
def fluentbit_output_config(
|
sky/logs/aws.py
CHANGED
|
@@ -6,8 +6,8 @@ import pydantic
|
|
|
6
6
|
|
|
7
7
|
from sky.logs.agent import FluentbitAgent
|
|
8
8
|
from sky.skylet import constants
|
|
9
|
-
from sky.utils import common_utils
|
|
10
9
|
from sky.utils import resources_utils
|
|
10
|
+
from sky.utils import yaml_utils
|
|
11
11
|
|
|
12
12
|
EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
|
|
13
13
|
|
|
@@ -130,7 +130,10 @@ class CloudwatchLoggingAgent(FluentbitAgent):
|
|
|
130
130
|
|
|
131
131
|
# If region is specified, set it in the environment
|
|
132
132
|
if self.config.region:
|
|
133
|
-
pre_cmd += f' export AWS_REGION={self.config.region}
|
|
133
|
+
pre_cmd += (f' export AWS_REGION={self.config.region}'
|
|
134
|
+
f' AWS_DEFAULT_REGION={self.config.region};'
|
|
135
|
+
' command -v aws &>/dev/null && '
|
|
136
|
+
f'aws configure set region {self.config.region};')
|
|
134
137
|
else:
|
|
135
138
|
# If region is not specified, check if it's available in
|
|
136
139
|
# the environment or credentials file
|
|
@@ -213,7 +216,7 @@ class CloudwatchLoggingAgent(FluentbitAgent):
|
|
|
213
216
|
}
|
|
214
217
|
}
|
|
215
218
|
|
|
216
|
-
return
|
|
219
|
+
return yaml_utils.dump_yaml_str(cfg_dict)
|
|
217
220
|
|
|
218
221
|
def fluentbit_output_config(
|
|
219
222
|
self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
|
sky/provision/aws/config.py
CHANGED
|
@@ -87,6 +87,9 @@ def bootstrap_instances(
|
|
|
87
87
|
use_internal_ips=config.provider_config.get('use_internal_ips', False),
|
|
88
88
|
vpc_name=config.provider_config.get('vpc_name'))
|
|
89
89
|
|
|
90
|
+
max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
|
|
91
|
+
enable_efa = max_efa_interfaces > 0
|
|
92
|
+
|
|
90
93
|
# Cluster workers should be in a security group that permits traffic within
|
|
91
94
|
# the group, and also SSH access from outside.
|
|
92
95
|
if security_group_ids is None:
|
|
@@ -103,7 +106,8 @@ def bootstrap_instances(
|
|
|
103
106
|
extended_ip_rules = []
|
|
104
107
|
security_group_ids = _configure_security_group(ec2, vpc_id,
|
|
105
108
|
expected_sg_name,
|
|
106
|
-
extended_ip_rules
|
|
109
|
+
extended_ip_rules,
|
|
110
|
+
enable_efa)
|
|
107
111
|
if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
|
|
108
112
|
logger.debug('Attempting to create the default security group.')
|
|
109
113
|
# Attempt to create the default security group. This is needed
|
|
@@ -114,7 +118,7 @@ def bootstrap_instances(
|
|
|
114
118
|
try:
|
|
115
119
|
_configure_security_group(ec2, vpc_id,
|
|
116
120
|
aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
|
|
117
|
-
[])
|
|
121
|
+
[], enable_efa)
|
|
118
122
|
logger.debug('Default security group created.')
|
|
119
123
|
except exceptions.NoClusterLaunchedError as e:
|
|
120
124
|
if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
|
|
@@ -148,6 +152,37 @@ def bootstrap_instances(
|
|
|
148
152
|
return config
|
|
149
153
|
|
|
150
154
|
|
|
155
|
+
def _configure_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
                               placement_group_name: str):
    """Create the placement group for the cluster if it is not there yet.

    Args:
        ec2: EC2 service resource; the request is sent through its
            ``meta.client``.
        placement_group_name: Name of the placement group to create.

    Raises:
        botocore ClientError: Re-raised for every error code other than
            ``InvalidPlacementGroup.Duplicate``.
    """
    logger.info(f'Creating placement group {placement_group_name}.')
    request = {
        'GroupName': placement_group_name,
        'Strategy': 'cluster',
    }
    try:
        ec2.meta.client.create_placement_group(**request)
    except aws.botocore_exceptions().ClientError as exc:
        error_code = exc.response.get('Error', {}).get('Code')
        if error_code != 'InvalidPlacementGroup.Duplicate':
            raise exc
        # A duplicate means the group already exists, which is fine.
        logger.debug(
            f'Placement group {placement_group_name} already exists.')
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def delete_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
                           placement_group_name: str):
    """Delete the named placement group, ignoring one that does not exist.

    Args:
        ec2: EC2 service resource; the request is sent through its
            ``meta.client``.
        placement_group_name: Name of the placement group to delete.

    Raises:
        botocore ClientError: Re-raised for every error code other than
            ``InvalidPlacementGroup.Unknown``.
    """
    ec2_client = ec2.meta.client
    try:
        ec2_client.delete_placement_group(GroupName=placement_group_name)
    except aws.botocore_exceptions().ClientError as exc:
        error_code = exc.response.get('Error', {}).get('Code')
        if error_code != 'InvalidPlacementGroup.Unknown':
            raise exc
        # The group is already gone, so there is nothing left to delete.
        logger.debug(
            f'Placement group {placement_group_name} does not exist.')
|
|
184
|
+
|
|
185
|
+
|
|
151
186
|
def _configure_iam_role(iam) -> Dict[str, Any]:
|
|
152
187
|
|
|
153
188
|
def _get_instance_profile(profile_name: str):
|
|
@@ -557,7 +592,8 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
557
592
|
|
|
558
593
|
def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
559
594
|
vpc_id: str, expected_sg_name: str,
|
|
560
|
-
extended_ip_rules: List
|
|
595
|
+
extended_ip_rules: List,
|
|
596
|
+
enable_efa: bool) -> List[str]:
|
|
561
597
|
security_group = _get_or_create_vpc_security_group(ec2, vpc_id,
|
|
562
598
|
expected_sg_name)
|
|
563
599
|
sg_ids = [security_group.id]
|
|
@@ -583,16 +619,55 @@ def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
583
619
|
},
|
|
584
620
|
*extended_ip_rules,
|
|
585
621
|
]
|
|
622
|
+
outbound_rules = []
|
|
623
|
+
if enable_efa:
|
|
624
|
+
# EFA requires that outbound rules permit the same security group to
|
|
625
|
+
# communicate with each other
|
|
626
|
+
# Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl.html#nccl-start-base-setup # pylint: disable=line-too-long
|
|
627
|
+
outbound_rules.append({
|
|
628
|
+
'FromPort': -1,
|
|
629
|
+
'ToPort': -1,
|
|
630
|
+
'IpProtocol': '-1',
|
|
631
|
+
'UserIdGroupPairs': [{
|
|
632
|
+
'GroupId': i
|
|
633
|
+
} for i in sg_ids],
|
|
634
|
+
})
|
|
586
635
|
# upsert the default security group
|
|
587
636
|
if not security_group.ip_permissions:
|
|
588
637
|
# If users specify security groups, we should not change the rules
|
|
589
638
|
# of these security groups. Here we change it because it is the default
|
|
590
639
|
# security group for SkyPilot.
|
|
591
640
|
security_group.authorize_ingress(IpPermissions=inbound_rules)
|
|
641
|
+
if _need_to_update_outbound_rules(security_group, outbound_rules):
|
|
642
|
+
security_group.authorize_egress(IpPermissions=outbound_rules)
|
|
592
643
|
|
|
593
644
|
return sg_ids
|
|
594
645
|
|
|
595
646
|
|
|
647
|
+
def _need_to_update_outbound_rules(
|
|
648
|
+
security_group: Any,
|
|
649
|
+
outbound_rules: List[Dict[str, Any]],
|
|
650
|
+
) -> bool:
|
|
651
|
+
"""Check if we need to update the outbound rules of the security group."""
|
|
652
|
+
if not security_group.ip_permissions_egress:
|
|
653
|
+
return True # No outbound rules, we need to add them
|
|
654
|
+
existing_group_ids = []
|
|
655
|
+
for rule in security_group.ip_permissions_egress:
|
|
656
|
+
if 'UserIdGroupPairs' in rule:
|
|
657
|
+
group_pairs = rule['UserIdGroupPairs']
|
|
658
|
+
for pair in group_pairs:
|
|
659
|
+
existing_group_ids.append(pair['GroupId'])
|
|
660
|
+
logger.debug(f'Existing group ids: {existing_group_ids}')
|
|
661
|
+
for rule in outbound_rules:
|
|
662
|
+
if 'UserIdGroupPairs' in rule:
|
|
663
|
+
group_pairs = rule['UserIdGroupPairs']
|
|
664
|
+
for pair in group_pairs:
|
|
665
|
+
if pair['GroupId'] not in existing_group_ids:
|
|
666
|
+
logger.debug(f'New group id: {pair["GroupId"]}')
|
|
667
|
+
return True # New group id, we need to add it
|
|
668
|
+
return False # No need to update
|
|
669
|
+
|
|
670
|
+
|
|
596
671
|
def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
597
672
|
vpc_id: str,
|
|
598
673
|
expected_sg_name: str) -> Any:
|
sky/provision/aws/instance.py
CHANGED
|
@@ -184,9 +184,15 @@ def _merge_tag_specs(tag_specs: List[Dict[str, Any]],
|
|
|
184
184
|
tag_specs += [user_tag_spec]
|
|
185
185
|
|
|
186
186
|
|
|
187
|
-
def _create_instances(
|
|
188
|
-
|
|
189
|
-
|
|
187
|
+
def _create_instances(
|
|
188
|
+
ec2_fail_fast,
|
|
189
|
+
cluster_name: str,
|
|
190
|
+
node_config: Dict[str, Any],
|
|
191
|
+
tags: Dict[str, str],
|
|
192
|
+
count: int,
|
|
193
|
+
associate_public_ip_address: bool,
|
|
194
|
+
max_efa_interfaces: int,
|
|
195
|
+
) -> List:
|
|
190
196
|
tags = {
|
|
191
197
|
'Name': cluster_name,
|
|
192
198
|
constants.TAG_RAY_CLUSTER_NAME: cluster_name,
|
|
@@ -239,7 +245,36 @@ def _create_instances(ec2_fail_fast, cluster_name: str,
|
|
|
239
245
|
# Whether the VM(s) should have a public IP.
|
|
240
246
|
'AssociatePublicIpAddress': associate_public_ip_address,
|
|
241
247
|
'Groups': security_group_ids,
|
|
248
|
+
'InterfaceType': 'efa'
|
|
249
|
+
if max_efa_interfaces > 0 else 'interface',
|
|
242
250
|
}]
|
|
251
|
+
# Due to AWS limitation, if an instance type supports multiple
|
|
252
|
+
# network cards, we cannot assign public IP addresses to the
|
|
253
|
+
# instance during creation, which will raise the following error:
|
|
254
|
+
# (InvalidParameterCombination) when calling the RunInstances
|
|
255
|
+
# operation: The associatePublicIPAddress parameter cannot be
|
|
256
|
+
# specified when launching with multiple network interfaces.
|
|
257
|
+
# So we only attach multiple network interfaces if public IP is
|
|
258
|
+
# not required.
|
|
259
|
+
# TODO(hailong): support attaching/detaching elastic IP to expose
|
|
260
|
+
# public IP in this case.
|
|
261
|
+
if max_efa_interfaces > 1 and not associate_public_ip_address:
|
|
262
|
+
instance_type = conf['InstanceType']
|
|
263
|
+
for i in range(1, max_efa_interfaces):
|
|
264
|
+
interface_type = 'efa-only'
|
|
265
|
+
# Special handling for P5 instances
|
|
266
|
+
# Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 for more details. # pylint: disable=line-too-long
|
|
267
|
+
if (instance_type == 'p5.48xlarge' or
|
|
268
|
+
instance_type == 'p5e.48xlarge'):
|
|
269
|
+
interface_type = 'efa' if i % 4 == 0 else 'efa-only'
|
|
270
|
+
network_interfaces.append({
|
|
271
|
+
'SubnetId': subnet_id,
|
|
272
|
+
'DeviceIndex': 1,
|
|
273
|
+
'NetworkCardIndex': i,
|
|
274
|
+
'AssociatePublicIpAddress': False,
|
|
275
|
+
'Groups': security_group_ids,
|
|
276
|
+
'InterfaceType': interface_type,
|
|
277
|
+
})
|
|
243
278
|
conf['NetworkInterfaces'] = network_interfaces
|
|
244
279
|
|
|
245
280
|
instances = _ec2_call_with_retry_on_server_error(
|
|
@@ -289,6 +324,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
289
324
|
zone = None
|
|
290
325
|
resumed_instance_ids: List[str] = []
|
|
291
326
|
created_instance_ids: List[str] = []
|
|
327
|
+
max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
|
|
292
328
|
|
|
293
329
|
# sort tags by key to support deterministic unit test stubbing
|
|
294
330
|
tags = dict(sorted(copy.deepcopy(config.tags).items()))
|
|
@@ -504,7 +540,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
504
540
|
tags,
|
|
505
541
|
reservation_count,
|
|
506
542
|
associate_public_ip_address=(
|
|
507
|
-
not config.provider_config['use_internal_ips'])
|
|
543
|
+
not config.provider_config['use_internal_ips']),
|
|
544
|
+
max_efa_interfaces=max_efa_interfaces)
|
|
508
545
|
created_instances.extend(created_reserved_instances)
|
|
509
546
|
to_start_count -= reservation_count
|
|
510
547
|
if to_start_count <= 0:
|
|
@@ -527,7 +564,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
527
564
|
tags,
|
|
528
565
|
to_start_count,
|
|
529
566
|
associate_public_ip_address=(
|
|
530
|
-
not config.provider_config['use_internal_ips'])
|
|
567
|
+
not config.provider_config['use_internal_ips']),
|
|
568
|
+
max_efa_interfaces=max_efa_interfaces)
|
|
531
569
|
|
|
532
570
|
created_instances.extend(created_remaining_instances)
|
|
533
571
|
created_instances.sort(key=lambda x: x.id)
|
|
@@ -686,6 +724,7 @@ def terminate_instances(
|
|
|
686
724
|
filters,
|
|
687
725
|
included_instances=None,
|
|
688
726
|
excluded_instances=None)
|
|
727
|
+
instance_list = list(instances)
|
|
689
728
|
default_sg = aws_config.get_security_group_from_vpc_id(
|
|
690
729
|
ec2, _get_vpc_id(provider_config),
|
|
691
730
|
aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
|
|
@@ -719,7 +758,7 @@ def terminate_instances(
|
|
|
719
758
|
# exist. We must block on instance termination so that we can
|
|
720
759
|
# delete the security group.
|
|
721
760
|
instances.terminate()
|
|
722
|
-
for instance in
|
|
761
|
+
for instance in instance_list:
|
|
723
762
|
instance.wait_until_terminated()
|
|
724
763
|
|
|
725
764
|
# TODO(suquark): Currently, the implementation of GCP and Azure will
|
sky/provision/do/utils.py
CHANGED
|
@@ -17,6 +17,7 @@ from sky.provision import constants as provision_constants
|
|
|
17
17
|
from sky.provision.do import constants
|
|
18
18
|
from sky.utils import annotations
|
|
19
19
|
from sky.utils import common_utils
|
|
20
|
+
from sky.utils import yaml_utils
|
|
20
21
|
|
|
21
22
|
logger = sky_logging.init_logger(__name__)
|
|
22
23
|
|
|
@@ -61,7 +62,7 @@ def _init_client():
|
|
|
61
62
|
if get_credentials_path() is None:
|
|
62
63
|
raise DigitalOceanError(
|
|
63
64
|
'No credentials found, please run `doctl auth init`')
|
|
64
|
-
credentials =
|
|
65
|
+
credentials = yaml_utils.read_yaml(get_credentials_path())
|
|
65
66
|
default_token = credentials.get('access-token', None)
|
|
66
67
|
if default_token is not None:
|
|
67
68
|
try:
|