skypilot-nightly 1.0.0.dev20251029__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/client/cli/command.py +47 -23
- sky/clouds/aws.py +59 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/{webpack-485984ca04e021d0.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +32 -2
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +12 -11
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +60 -22
- sky/metrics/utils.py +18 -0
- sky/schemas/api/responses.py +1 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/request_names.py +80 -0
- sky/server/server.py +103 -35
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +4 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +67 -62
- sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +0 -26
- /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/dashboard/out/config.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/config-dfb9bf07b13045f4.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/config","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/index.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/index-444f1804401f04ea.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/infra/%5Bcontext%5D-9b6e47c2e8b485a2.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/infra/[context]","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/infra.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/infra-c84a3b8a9d599b02.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/infra","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/6212-7bd06f60ba693125.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4ed9236db997b42b.js" defer=""></script><script src="/dashboard/_next/static/chunks/9353-cff34f7e773b2e2b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs/%5Bjob%5D-eb5822dac0c9509b.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" 
defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs/[job]","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/6212-7bd06f60ba693125.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4ed9236db997b42b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs/pools/%5Bpool%5D-e020fd69dbe76cea.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" 
type="application/json">{"props":{"pageProps":{}},"page":"/jobs/pools/[pool]","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/jobs.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-7eee823559e5cf9f.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/users.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-2b172f13f8538a7a.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/volumes.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-d2af9d22e87cc4ba.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7359-c8d04e06886000b3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4ed9236db997b42b.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-f6818c84ed8f1c86.js" defer=""></script><script src="/dashboard/_next/static/chunks/9353-cff34f7e773b2e2b.js" defer=""></script><script src="/dashboard/_next/static/chunks/4282-49b2065b7336e496.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/7615-80aa7b09f45a86d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-c3c10e2c6ed71a8f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-bbfe5860c93470fd.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0748ce22df867032.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0748ce22df867032.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e38d5319cd10a3a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-bde01e4a2beec258.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-1891376c08050940.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ixeA0NVQJN8HUdijid8b/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"8ixeA0NVQJN8HUdijid8b","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/data/mounting_utils.py
CHANGED
|
@@ -233,9 +233,18 @@ def get_az_mount_install_cmd() -> str:
|
|
|
233
233
|
# Try to install fuse3 from default repos
|
|
234
234
|
'sudo apt-get update && '
|
|
235
235
|
'FUSE3_INSTALLED=0 && '
|
|
236
|
+
# On Kubernetes, if FUSERMOUNT_SHARED_DIR is set, it means
|
|
237
|
+
# fusermount and fusermount3 are symlinked to fusermount-shim.
|
|
238
|
+
# If we reinstall fuse3, it may overwrite the symlink, so
|
|
239
|
+
# just install libfuse3, which is needed by blobfuse2.
|
|
240
|
+
'if [ -n "${FUSERMOUNT_SHARED_DIR:-}" ]; then '
|
|
241
|
+
' PACKAGES="libfuse3-3 libfuse3-dev"; '
|
|
242
|
+
'else '
|
|
243
|
+
' PACKAGES="fuse3 libfuse3-3 libfuse3-dev"; '
|
|
244
|
+
'fi && '
|
|
236
245
|
'if sudo apt-get install -y '
|
|
237
246
|
'-o Dpkg::Options::="--force-confdef" '
|
|
238
|
-
'
|
|
247
|
+
'$PACKAGES; then '
|
|
239
248
|
' FUSE3_INSTALLED=1; '
|
|
240
249
|
' echo "fuse3 installed from default repos"; '
|
|
241
250
|
'else '
|
|
@@ -256,7 +265,7 @@ def get_az_mount_install_cmd() -> str:
|
|
|
256
265
|
' if sudo apt-get install -y '
|
|
257
266
|
'-o Dpkg::Options::="--force-confdef" '
|
|
258
267
|
'-o Dpkg::Options::="--force-confold" '
|
|
259
|
-
'
|
|
268
|
+
'$PACKAGES; then '
|
|
260
269
|
' FUSE3_INSTALLED=1; '
|
|
261
270
|
' echo "fuse3 installed from focal"; '
|
|
262
271
|
' sudo rm /etc/apt/sources.list.d/focal-fuse3.list; '
|
|
@@ -603,7 +612,28 @@ def get_mounting_script(
|
|
|
603
612
|
fi
|
|
604
613
|
fi
|
|
605
614
|
echo "Mounting $SOURCE_BUCKET to $MOUNT_PATH with $MOUNT_BINARY..."
|
|
615
|
+
set +e
|
|
606
616
|
{mount_cmd}
|
|
617
|
+
MOUNT_EXIT_CODE=$?
|
|
618
|
+
set -e
|
|
619
|
+
if [ $MOUNT_EXIT_CODE -ne 0 ]; then
|
|
620
|
+
echo "Mount failed with exit code $MOUNT_EXIT_CODE."
|
|
621
|
+
if [ "$MOUNT_BINARY" = "goofys" ]; then
|
|
622
|
+
echo "Looking for goofys log files..."
|
|
623
|
+
# Find goofys log files in /tmp (created by mktemp -t goofys.XXXX.log)
|
|
624
|
+
# Note: if /dev/log exists, goofys logs to syslog instead of a file
|
|
625
|
+
GOOFYS_LOGS=$(ls -t /tmp/goofys.*.log 2>/dev/null | head -1)
|
|
626
|
+
if [ -n "$GOOFYS_LOGS" ]; then
|
|
627
|
+
echo "=== Goofys log file contents ==="
|
|
628
|
+
cat "$GOOFYS_LOGS"
|
|
629
|
+
echo "=== End of goofys log file ==="
|
|
630
|
+
else
|
|
631
|
+
echo "No goofys log file found in /tmp"
|
|
632
|
+
fi
|
|
633
|
+
fi
|
|
634
|
+
# TODO(kevin): Print logs from rclone, etc too for observability.
|
|
635
|
+
exit $MOUNT_EXIT_CODE
|
|
636
|
+
fi
|
|
607
637
|
echo "Mounting done."
|
|
608
638
|
""")
|
|
609
639
|
|
sky/jobs/constants.py
CHANGED
|
@@ -46,6 +46,8 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
|
|
|
46
46
|
# The version of the lib files that jobs/utils use. Whenever there is an API
|
|
47
47
|
# change for the jobs/utils, we need to bump this version and update
|
|
48
48
|
# job.utils.ManagedJobCodeGen to handle the version update.
|
|
49
|
+
# WARNING: If you update this due to a codegen change, make sure to make the
|
|
50
|
+
# corresponding change in the ManagedJobsService AND bump the SKYLET_VERSION.
|
|
49
51
|
MANAGED_JOBS_VERSION = 12
|
|
50
52
|
|
|
51
53
|
# The command for setting up the jobs dashboard on the controller. It firstly
|
sky/jobs/controller.py
CHANGED
|
@@ -1,15 +1,17 @@
|
|
|
1
1
|
"""Controller: handles scheduling and the life cycle of a managed job.
|
|
2
2
|
"""
|
|
3
3
|
import asyncio
|
|
4
|
+
import io
|
|
4
5
|
import os
|
|
5
6
|
import pathlib
|
|
6
7
|
import resource
|
|
7
8
|
import shutil
|
|
8
9
|
import sys
|
|
10
|
+
import threading
|
|
9
11
|
import time
|
|
10
12
|
import traceback
|
|
11
13
|
import typing
|
|
12
|
-
from typing import Dict, Optional, Set
|
|
14
|
+
from typing import Dict, Optional, Set
|
|
13
15
|
|
|
14
16
|
import dotenv
|
|
15
17
|
|
|
@@ -22,6 +24,8 @@ from sky.backends import backend_utils
|
|
|
22
24
|
from sky.backends import cloud_vm_ray_backend
|
|
23
25
|
from sky.data import data_utils
|
|
24
26
|
from sky.jobs import constants as jobs_constants
|
|
27
|
+
from sky.jobs import file_content_utils
|
|
28
|
+
from sky.jobs import log_gc
|
|
25
29
|
from sky.jobs import recovery_strategy
|
|
26
30
|
from sky.jobs import scheduler
|
|
27
31
|
from sky.jobs import state as managed_job_state
|
|
@@ -29,6 +33,7 @@ from sky.jobs import utils as managed_job_utils
|
|
|
29
33
|
from sky.skylet import constants
|
|
30
34
|
from sky.skylet import job_lib
|
|
31
35
|
from sky.usage import usage_lib
|
|
36
|
+
from sky.utils import annotations
|
|
32
37
|
from sky.utils import common
|
|
33
38
|
from sky.utils import common_utils
|
|
34
39
|
from sky.utils import context
|
|
@@ -61,17 +66,26 @@ async def create_background_task(coro: typing.Coroutine) -> None:
|
|
|
61
66
|
task.add_done_callback(_background_tasks.discard)
|
|
62
67
|
|
|
63
68
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
+
# Make sure to limit the size as we don't want to cache too many DAGs in memory.
|
|
70
|
+
@annotations.lru_cache(scope='global', maxsize=50)
|
|
71
|
+
def _get_dag(job_id: int) -> 'sky.Dag':
|
|
72
|
+
dag_content = file_content_utils.get_job_dag_content(job_id)
|
|
73
|
+
if dag_content is None:
|
|
74
|
+
raise RuntimeError('Managed job DAG YAML content is unavailable for '
|
|
75
|
+
f'job {job_id}. This can happen if the job was '
|
|
76
|
+
'submitted before file migration completed or if '
|
|
77
|
+
'the submission failed to persist the DAG. Please '
|
|
78
|
+
're-submit the job.')
|
|
69
79
|
|
|
80
|
+
dag = dag_utils.load_chain_dag_from_yaml_str(dag_content)
|
|
81
|
+
assert dag.name is not None, dag
|
|
82
|
+
return dag
|
|
70
83
|
|
|
71
|
-
|
|
84
|
+
|
|
85
|
+
class JobController:
|
|
72
86
|
"""Controls the lifecycle of a single managed job.
|
|
73
87
|
|
|
74
|
-
This controller executes
|
|
88
|
+
This controller executes the chain DAG recorded for the job by:
|
|
75
89
|
- Loading the DAG and preparing per-task environment variables so each task
|
|
76
90
|
has a stable global job identifier across recoveries.
|
|
77
91
|
- Launching the task on the configured backend (``CloudVmRayBackend``),
|
|
@@ -91,7 +105,8 @@ class JobsController:
|
|
|
91
105
|
|
|
92
106
|
Key attributes:
|
|
93
107
|
- ``_job_id``: Integer identifier of this managed job.
|
|
94
|
-
- ``
|
|
108
|
+
- ``_dag`` / ``_dag_name``: The job definition and metadata loaded from the
|
|
109
|
+
database-backed job YAML.
|
|
95
110
|
- ``_backend``: Backend used to launch and manage clusters.
|
|
96
111
|
- ``_pool``: Optional pool name if using a cluster pool.
|
|
97
112
|
- ``starting`` / ``starting_lock`` / ``starting_signal``: Shared scheduler
|
|
@@ -104,7 +119,6 @@ class JobsController:
|
|
|
104
119
|
def __init__(
|
|
105
120
|
self,
|
|
106
121
|
job_id: int,
|
|
107
|
-
dag_yaml: str,
|
|
108
122
|
starting: Set[int],
|
|
109
123
|
starting_lock: asyncio.Lock,
|
|
110
124
|
starting_signal: asyncio.Condition,
|
|
@@ -114,7 +128,6 @@ class JobsController:
|
|
|
114
128
|
|
|
115
129
|
Args:
|
|
116
130
|
job_id: Integer ID of the managed job.
|
|
117
|
-
dag_yaml: Path to the YAML file containing the chain DAG to run.
|
|
118
131
|
starting: Shared set of job IDs currently in the STARTING phase,
|
|
119
132
|
used to limit concurrent launches.
|
|
120
133
|
starting_lock: ``asyncio.Lock`` guarding access to the shared
|
|
@@ -130,12 +143,11 @@ class JobsController:
|
|
|
130
143
|
self.starting_lock = starting_lock
|
|
131
144
|
self.starting_signal = starting_signal
|
|
132
145
|
|
|
133
|
-
logger.info(
|
|
134
|
-
f'dag_yaml={dag_yaml}')
|
|
146
|
+
logger.info('Initializing JobsController for job_id=%s', job_id)
|
|
135
147
|
|
|
136
148
|
self._job_id = job_id
|
|
137
|
-
self.
|
|
138
|
-
self.
|
|
149
|
+
self._dag = _get_dag(job_id)
|
|
150
|
+
self._dag_name = self._dag.name
|
|
139
151
|
logger.info(f'Loaded DAG: {self._dag}')
|
|
140
152
|
|
|
141
153
|
self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
|
|
@@ -779,8 +791,11 @@ class JobsController:
|
|
|
779
791
|
task=self._dag.tasks[task_id]))
|
|
780
792
|
|
|
781
793
|
|
|
782
|
-
class
|
|
783
|
-
"""
|
|
794
|
+
class ControllerManager:
|
|
795
|
+
"""Main loop for a job controller process.
|
|
796
|
+
|
|
797
|
+
Many jobs will be handled by this, each by a single JobController.
|
|
798
|
+
"""
|
|
784
799
|
|
|
785
800
|
def __init__(self, controller_uuid: str) -> None:
|
|
786
801
|
self._controller_uuid = controller_uuid
|
|
@@ -799,10 +814,7 @@ class Controller:
|
|
|
799
814
|
|
|
800
815
|
self._pid = os.getpid()
|
|
801
816
|
|
|
802
|
-
async def _cleanup(self,
|
|
803
|
-
job_id: int,
|
|
804
|
-
dag_yaml: str,
|
|
805
|
-
pool: Optional[str] = None):
|
|
817
|
+
async def _cleanup(self, job_id: int, pool: Optional[str] = None):
|
|
806
818
|
"""Clean up the cluster(s) and storages.
|
|
807
819
|
|
|
808
820
|
(1) Clean up the succeeded task(s)' ephemeral storage. The storage has
|
|
@@ -892,7 +904,7 @@ class Controller:
|
|
|
892
904
|
if error is not None:
|
|
893
905
|
raise error
|
|
894
906
|
|
|
895
|
-
dag
|
|
907
|
+
dag = _get_dag(job_id)
|
|
896
908
|
error = None
|
|
897
909
|
for task in dag.tasks:
|
|
898
910
|
# most things in this function are blocking
|
|
@@ -911,57 +923,45 @@ class Controller:
|
|
|
911
923
|
@context.contextual_async
|
|
912
924
|
async def run_job_loop(self,
|
|
913
925
|
job_id: int,
|
|
914
|
-
dag_yaml: str,
|
|
915
926
|
log_file: str,
|
|
916
|
-
env_file_path: Optional[str] = None,
|
|
917
927
|
pool: Optional[str] = None):
|
|
918
928
|
"""Background task that runs the job loop."""
|
|
919
929
|
ctx = context.get()
|
|
920
930
|
assert ctx is not None, 'Context is not initialized'
|
|
921
931
|
ctx.redirect_log(pathlib.Path(log_file))
|
|
922
932
|
|
|
923
|
-
logger.info(
|
|
924
|
-
logger.info(
|
|
925
|
-
logger.info(
|
|
926
|
-
logger.info(f' env_file_path={env_file_path}')
|
|
927
|
-
logger.info(f' pool={pool}')
|
|
933
|
+
logger.info('Starting job loop for %s', job_id)
|
|
934
|
+
logger.info(' log_file=%s', log_file)
|
|
935
|
+
logger.info(' pool=%s', pool)
|
|
928
936
|
logger.info(f'From controller {self._controller_uuid}')
|
|
929
937
|
logger.info(f' pid={self._pid}')
|
|
930
938
|
|
|
931
|
-
|
|
932
|
-
if
|
|
939
|
+
env_content = file_content_utils.get_job_env_content(job_id)
|
|
940
|
+
if env_content:
|
|
933
941
|
try:
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
f'{list(env_vars.keys())}')
|
|
938
|
-
|
|
939
|
-
# Apply environment variables to the job's context
|
|
942
|
+
env_vars = dotenv.dotenv_values(stream=io.StringIO(env_content))
|
|
943
|
+
logger.info('Loading %d environment variables for job %s',
|
|
944
|
+
len(env_vars), job_id)
|
|
940
945
|
if ctx is not None:
|
|
941
946
|
for key, value in env_vars.items():
|
|
942
947
|
if value is not None:
|
|
943
948
|
ctx.override_envs({key: value})
|
|
944
|
-
logger.debug(
|
|
945
|
-
|
|
946
|
-
# Reload the skypilot config for this context to make sure
|
|
947
|
-
# the latest config is used.
|
|
949
|
+
logger.debug('Set environment variable: %s=%s', key,
|
|
950
|
+
value)
|
|
948
951
|
skypilot_config.reload_config()
|
|
949
|
-
else:
|
|
950
|
-
logger.error(
|
|
951
|
-
|
|
952
|
+
else: # pragma: no cover - defensive
|
|
953
|
+
logger.error('Context is None, cannot set environment '
|
|
954
|
+
'variables')
|
|
952
955
|
except Exception as e: # pylint: disable=broad-except
|
|
953
956
|
logger.error(
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
logger.error(f'Environment file not found: {env_file_path}')
|
|
957
|
+
'Failed to load environment variables for job %s: '
|
|
958
|
+
'%s', job_id, e)
|
|
957
959
|
|
|
958
960
|
cancelling = False
|
|
959
961
|
try:
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
self._job_tasks_lock,
|
|
964
|
-
self._starting_signal, pool)
|
|
962
|
+
controller = JobController(job_id, self.starting,
|
|
963
|
+
self._job_tasks_lock,
|
|
964
|
+
self._starting_signal, pool)
|
|
965
965
|
|
|
966
966
|
async with self._job_tasks_lock:
|
|
967
967
|
if job_id in self.job_tasks:
|
|
@@ -976,7 +976,7 @@ class Controller:
|
|
|
976
976
|
await task
|
|
977
977
|
except asyncio.CancelledError:
|
|
978
978
|
logger.info(f'Job {job_id} was cancelled')
|
|
979
|
-
dag
|
|
979
|
+
dag = _get_dag(job_id)
|
|
980
980
|
task_id, _ = await (
|
|
981
981
|
managed_job_state.get_latest_task_id_status_async(job_id))
|
|
982
982
|
assert task_id is not None, job_id
|
|
@@ -994,7 +994,7 @@ class Controller:
|
|
|
994
994
|
raise
|
|
995
995
|
finally:
|
|
996
996
|
try:
|
|
997
|
-
await self._cleanup(job_id,
|
|
997
|
+
await self._cleanup(job_id, pool=pool)
|
|
998
998
|
logger.info(
|
|
999
999
|
f'Cluster of managed job {job_id} has been cleaned up.')
|
|
1000
1000
|
except Exception as e: # pylint: disable=broad-except
|
|
@@ -1056,29 +1056,23 @@ class Controller:
|
|
|
1056
1056
|
async def start_job(
|
|
1057
1057
|
self,
|
|
1058
1058
|
job_id: int,
|
|
1059
|
-
dag_yaml: str,
|
|
1060
|
-
env_file_path: Optional[str] = None,
|
|
1061
1059
|
pool: Optional[str] = None,
|
|
1062
1060
|
):
|
|
1063
1061
|
"""Start a new job.
|
|
1064
1062
|
|
|
1065
1063
|
Args:
|
|
1066
1064
|
job_id: The ID of the job to start.
|
|
1067
|
-
dag_yaml: Path to the YAML file containing the DAG definition.
|
|
1068
|
-
env_file_path: Optional path to environment file for the job.
|
|
1069
1065
|
"""
|
|
1070
1066
|
# Create log file path for job output redirection
|
|
1071
1067
|
log_dir = os.path.expanduser(jobs_constants.JOBS_CONTROLLER_LOGS_DIR)
|
|
1072
1068
|
os.makedirs(log_dir, exist_ok=True)
|
|
1073
1069
|
log_file = os.path.join(log_dir, f'{job_id}.log')
|
|
1074
1070
|
|
|
1075
|
-
logger.info(f'Starting job {job_id} with
|
|
1076
|
-
f'env_file_path={env_file_path}, and log_file={log_file}')
|
|
1071
|
+
logger.info(f'Starting job {job_id} with log_file={log_file}')
|
|
1077
1072
|
|
|
1078
1073
|
async with self._job_tasks_lock:
|
|
1079
1074
|
self.starting.add(job_id)
|
|
1080
|
-
await create_background_task(
|
|
1081
|
-
self.run_job_loop(job_id, dag_yaml, log_file, env_file_path, pool))
|
|
1075
|
+
await create_background_task(self.run_job_loop(job_id, log_file, pool))
|
|
1082
1076
|
|
|
1083
1077
|
logger.info(f'Job {job_id} started successfully')
|
|
1084
1078
|
|
|
@@ -1151,8 +1145,6 @@ class Controller:
|
|
|
1151
1145
|
|
|
1152
1146
|
logger.info(f'Claiming job {waiting_job["job_id"]}')
|
|
1153
1147
|
job_id = waiting_job['job_id']
|
|
1154
|
-
dag_yaml_path = waiting_job['dag_yaml_path']
|
|
1155
|
-
env_file_path = waiting_job.get('env_file_path')
|
|
1156
1148
|
pool = waiting_job.get('pool', None)
|
|
1157
1149
|
|
|
1158
1150
|
cancels = os.listdir(jobs_constants.CONSOLIDATED_SIGNAL_PATH)
|
|
@@ -1172,7 +1164,7 @@ class Controller:
|
|
|
1172
1164
|
job_id=job_id, task_id=None, task=None))
|
|
1173
1165
|
continue
|
|
1174
1166
|
|
|
1175
|
-
await self.start_job(job_id,
|
|
1167
|
+
await self.start_job(job_id, pool)
|
|
1176
1168
|
|
|
1177
1169
|
|
|
1178
1170
|
async def main(controller_uuid: str):
|
|
@@ -1180,7 +1172,7 @@ async def main(controller_uuid: str):
|
|
|
1180
1172
|
|
|
1181
1173
|
context_utils.hijack_sys_attrs()
|
|
1182
1174
|
|
|
1183
|
-
controller =
|
|
1175
|
+
controller = ControllerManager(controller_uuid)
|
|
1184
1176
|
|
|
1185
1177
|
# Will happen multiple times, who cares though
|
|
1186
1178
|
os.makedirs(jobs_constants.CONSOLIDATED_SIGNAL_PATH, exist_ok=True)
|
|
@@ -1199,7 +1191,10 @@ async def main(controller_uuid: str):
|
|
|
1199
1191
|
# Will loop forever, do it in the background
|
|
1200
1192
|
cancel_job_task = asyncio.create_task(controller.cancel_job())
|
|
1201
1193
|
monitor_loop_task = asyncio.create_task(controller.monitor_loop())
|
|
1202
|
-
|
|
1194
|
+
# Run the garbage collector in a dedicated daemon thread to avoid affecting
|
|
1195
|
+
# the main event loop.
|
|
1196
|
+
gc_thread = threading.Thread(target=log_gc.elect_for_log_gc, daemon=True)
|
|
1197
|
+
gc_thread.start()
|
|
1203
1198
|
try:
|
|
1204
1199
|
await asyncio.gather(cancel_job_task, monitor_loop_task)
|
|
1205
1200
|
except Exception as e: # pylint: disable=broad-except
|