skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -4
- sky/backends/cloud_vm_ray_backend.py +43 -60
- sky/cli.py +55 -637
- sky/client/cli.py +55 -637
- sky/clouds/kubernetes.py +3 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/provision/__init__.py +1 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/skylet/constants.py +39 -0
- sky/skylet/job_lib.py +8 -0
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +51 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +19 -36
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +43 -5
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-c807fc34f09c7df3.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-385d190b95815e11.js" defer=""></script><script src="/dashboard/_next/static/chunks/37-824c707421f6f003.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters/%5Bcluster%5D-
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-c807fc34f09c7df3.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-385d190b95815e11.js" defer=""></script><script src="/dashboard/_next/static/chunks/37-824c707421f6f003.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters/%5Bcluster%5D-36bc0962129f72df.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/clusters/[cluster]","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/clusters.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js" defer=""></script><script src="/dashboard/_next/static/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/clusters","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/config.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/config-497a35a7ed49734a.js" defer=""></script><script src="/dashboard/_next/static/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/config-497a35a7ed49734a.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/config","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/index.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/index-6b0d9e5031b70c58.js" defer=""></script><script src="/dashboard/_next/static/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/index-6b0d9e5031b70c58.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/infra/%5Bcontext%5D-d2910be98e9227cb.js" defer=""></script><script src="/dashboard/_next/static/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/infra/%5Bcontext%5D-d2910be98e9227cb.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/infra/[context]","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/infra.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/infra-780860bcc1103945.js" defer=""></script><script src="/dashboard/_next/static/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/infra-780860bcc1103945.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/infra","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs/%5Bjob%5D-
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs/%5Bjob%5D-cf490d1fa38f3740.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs/[job]","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/jobs.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js" defer=""></script><script src="/dashboard/_next/static/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/users.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-c69ffcab9d6e5269.js" defer=""></script><script src="/dashboard/_next/static/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-c69ffcab9d6e5269.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js" defer=""></script><script src="/dashboard/_next/static/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-c807fc34f09c7df3.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-385d190b95815e11.js" defer=""></script><script src="/dashboard/_next/static/chunks/843-ab9c4f609239155f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-c8c2191328532b7d.js" defer=""></script><script src="/dashboard/_next/static/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-c807fc34f09c7df3.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-385d190b95815e11.js" defer=""></script><script src="/dashboard/_next/static/chunks/843-ab9c4f609239155f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-c8c2191328532b7d.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js" defer=""></script><script src="/dashboard/_next/static/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/vA3PPpkBwpRTRNBHFYAw_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"vA3PPpkBwpRTRNBHFYAw_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/jobs/controller.py
CHANGED
@@ -152,6 +152,20 @@ class JobsController:
|
|
152
152
|
Other exceptions may be raised depending on the backend.
|
153
153
|
"""
|
154
154
|
|
155
|
+
latest_task_id, last_task_prev_status = (
|
156
|
+
managed_job_state.get_latest_task_id_status(self._job_id))
|
157
|
+
is_resume = False
|
158
|
+
if (latest_task_id is not None and last_task_prev_status !=
|
159
|
+
managed_job_state.ManagedJobStatus.PENDING):
|
160
|
+
assert latest_task_id >= task_id, (latest_task_id, task_id)
|
161
|
+
if latest_task_id > task_id:
|
162
|
+
logger.info(f'Task {task_id} ({task.name}) has already '
|
163
|
+
'been executed. Skipping...')
|
164
|
+
return True
|
165
|
+
if latest_task_id == task_id:
|
166
|
+
# Start recovery.
|
167
|
+
is_resume = True
|
168
|
+
|
155
169
|
callback_func = managed_job_utils.event_callback_func(
|
156
170
|
job_id=self._job_id, task_id=task_id, task=task)
|
157
171
|
if task.run is None:
|
@@ -171,42 +185,72 @@ class JobsController:
|
|
171
185
|
return True
|
172
186
|
usage_lib.messages.usage.update_task_id(task_id)
|
173
187
|
task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
|
174
|
-
submitted_at = time.time()
|
175
|
-
if task_id == 0:
|
176
|
-
submitted_at = backend_utils.get_timestamp_from_run_timestamp(
|
177
|
-
self._backend.run_timestamp)
|
178
188
|
assert task.name is not None, task
|
179
189
|
cluster_name = managed_job_utils.generate_managed_job_cluster_name(
|
180
190
|
task.name, self._job_id)
|
181
191
|
self._strategy_executor = recovery_strategy.StrategyExecutor.make(
|
182
192
|
cluster_name, self._backend, task, self._job_id, task_id)
|
183
|
-
|
184
|
-
|
185
|
-
task_id
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
193
|
+
if not is_resume:
|
194
|
+
submitted_at = time.time()
|
195
|
+
if task_id == 0:
|
196
|
+
submitted_at = backend_utils.get_timestamp_from_run_timestamp(
|
197
|
+
self._backend.run_timestamp)
|
198
|
+
managed_job_state.set_starting(
|
199
|
+
self._job_id,
|
200
|
+
task_id,
|
201
|
+
self._backend.run_timestamp,
|
202
|
+
submitted_at,
|
203
|
+
resources_str=backend_utils.get_task_resources_str(
|
204
|
+
task, is_managed_job=True),
|
205
|
+
specs={
|
206
|
+
'max_restarts_on_errors':
|
207
|
+
self._strategy_executor.max_restarts_on_errors
|
208
|
+
},
|
209
|
+
callback_func=callback_func)
|
210
|
+
logger.info(f'Submitted managed job {self._job_id} '
|
211
|
+
f'(task: {task_id}, name: {task.name!r}); '
|
212
|
+
f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
|
198
213
|
|
199
214
|
logger.info('Started monitoring.')
|
200
215
|
|
201
|
-
|
202
|
-
|
216
|
+
# Only do the initial cluster launch if not resuming from a controller
|
217
|
+
# failure. Otherwise, we will transit to recovering immediately.
|
218
|
+
remote_job_submitted_at = time.time()
|
219
|
+
if not is_resume:
|
220
|
+
remote_job_submitted_at = self._strategy_executor.launch()
|
221
|
+
assert remote_job_submitted_at is not None, remote_job_submitted_at
|
203
222
|
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
223
|
+
if not is_resume:
|
224
|
+
managed_job_state.set_started(job_id=self._job_id,
|
225
|
+
task_id=task_id,
|
226
|
+
start_time=remote_job_submitted_at,
|
227
|
+
callback_func=callback_func)
|
208
228
|
|
209
229
|
while True:
|
230
|
+
# NOTE: if we are resuming from a controller failure, we only keep
|
231
|
+
# monitoring if the job is in RUNNING state. For all other cases,
|
232
|
+
# we will directly transit to recovering since we have no idea what
|
233
|
+
# the cluster status is.
|
234
|
+
force_transit_to_recovering = False
|
235
|
+
if is_resume:
|
236
|
+
prev_status = managed_job_state.get_job_status_with_task_id(
|
237
|
+
job_id=self._job_id, task_id=task_id)
|
238
|
+
if prev_status is not None:
|
239
|
+
if prev_status.is_terminal():
|
240
|
+
return (prev_status ==
|
241
|
+
managed_job_state.ManagedJobStatus.SUCCEEDED)
|
242
|
+
if (prev_status ==
|
243
|
+
managed_job_state.ManagedJobStatus.CANCELLING):
|
244
|
+
# If the controller is down when cancelling the job,
|
245
|
+
# we re-raise the error to run the `_cleanup` function
|
246
|
+
# again to clean up any remaining resources.
|
247
|
+
raise exceptions.ManagedJobUserCancelledError(
|
248
|
+
'Recovering cancel signal.')
|
249
|
+
if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
|
250
|
+
force_transit_to_recovering = True
|
251
|
+
# This resume logic should only be triggered once.
|
252
|
+
is_resume = False
|
253
|
+
|
210
254
|
time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
|
211
255
|
|
212
256
|
# Check the network connection to avoid false alarm for job failure.
|
@@ -221,8 +265,19 @@ class JobsController:
|
|
221
265
|
|
222
266
|
# NOTE: we do not check cluster status first because race condition
|
223
267
|
# can occur, i.e. cluster can be down during the job status check.
|
224
|
-
|
225
|
-
|
268
|
+
# NOTE: If fetching the job status fails or we force to transit to
|
269
|
+
# recovering, we will set the job status to None, which will force
|
270
|
+
# enter the recovering logic.
|
271
|
+
job_status = None
|
272
|
+
if not force_transit_to_recovering:
|
273
|
+
try:
|
274
|
+
job_status = managed_job_utils.get_job_status(
|
275
|
+
self._backend, cluster_name)
|
276
|
+
except exceptions.FetchClusterInfoError as fetch_e:
|
277
|
+
logger.info(
|
278
|
+
'Failed to fetch the job status. Start recovery.\n'
|
279
|
+
f'Exception: {common_utils.format_exception(fetch_e)}\n'
|
280
|
+
f'Traceback: {traceback.format_exc()}')
|
226
281
|
|
227
282
|
if job_status == job_lib.JobStatus.SUCCEEDED:
|
228
283
|
success_end_time = managed_job_utils.try_to_get_job_end_time(
|
@@ -379,7 +434,17 @@ class JobsController:
|
|
379
434
|
if handle is not None:
|
380
435
|
resources = handle.launched_resources
|
381
436
|
assert resources is not None, handle
|
382
|
-
|
437
|
+
# If we are forcing to transit to recovering, we need to clean
|
438
|
+
# up the cluster as it is possible that we already submitted the
|
439
|
+
# job to the worker cluster, but state is not updated yet. In
|
440
|
+
# this case, it is possible that we will double-submit the job
|
441
|
+
# to the worker cluster. So we always clean up the cluster here.
|
442
|
+
# TODO(tian,cooperc): We can check if there is a running job on
|
443
|
+
# the worker cluster, and if so, we can skip the cleanup.
|
444
|
+
# Challenge: race condition when the worker cluster thought it
|
445
|
+
# does not have a running job yet but later the job is launched.
|
446
|
+
if (resources.need_cleanup_after_preemption_or_failure() or
|
447
|
+
force_transit_to_recovering):
|
383
448
|
# Some spot resource (e.g., Spot TPU VM) may need to be
|
384
449
|
# cleaned up after preemption, as running launch again on
|
385
450
|
# those clusters again may fail.
|
@@ -389,9 +454,11 @@ class JobsController:
|
|
389
454
|
|
390
455
|
# Try to recover the managed jobs, when the cluster is preempted or
|
391
456
|
# failed or the job status is failed to be fetched.
|
392
|
-
managed_job_state.set_recovering(
|
393
|
-
|
394
|
-
|
457
|
+
managed_job_state.set_recovering(
|
458
|
+
job_id=self._job_id,
|
459
|
+
task_id=task_id,
|
460
|
+
force_transit_to_recovering=force_transit_to_recovering,
|
461
|
+
callback_func=callback_func)
|
395
462
|
recovered_time = self._strategy_executor.recover()
|
396
463
|
managed_job_state.set_recovered(self._job_id,
|
397
464
|
task_id,
|
sky/jobs/scheduler.py
CHANGED
@@ -84,6 +84,32 @@ def _get_lock_path() -> str:
|
|
84
84
|
return path
|
85
85
|
|
86
86
|
|
87
|
+
def _start_controller(job_id: int, dag_yaml_path: str,
|
88
|
+
env_file_path: str) -> None:
|
89
|
+
activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
|
90
|
+
source_environment_cmd = (f'source {env_file_path};'
|
91
|
+
if env_file_path else '')
|
92
|
+
run_controller_cmd = ('python -u -m sky.jobs.controller '
|
93
|
+
f'{dag_yaml_path} --job-id {job_id};')
|
94
|
+
|
95
|
+
# If the command line here is changed, please also update
|
96
|
+
# utils._controller_process_alive. `--job-id X` should be at
|
97
|
+
# the end.
|
98
|
+
run_cmd = (f'{activate_python_env_cmd}'
|
99
|
+
f'{source_environment_cmd}'
|
100
|
+
f'{run_controller_cmd}')
|
101
|
+
|
102
|
+
logs_dir = os.path.expanduser(
|
103
|
+
managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
|
104
|
+
os.makedirs(logs_dir, exist_ok=True)
|
105
|
+
log_path = os.path.join(logs_dir, f'{job_id}.log')
|
106
|
+
|
107
|
+
pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
|
108
|
+
state.set_job_controller_pid(job_id, pid)
|
109
|
+
|
110
|
+
logger.debug(f'Job {job_id} started with pid {pid}')
|
111
|
+
|
112
|
+
|
87
113
|
def maybe_schedule_next_jobs() -> None:
|
88
114
|
"""Determine if any managed jobs can be scheduled, and if so, schedule them.
|
89
115
|
|
@@ -158,32 +184,9 @@ def maybe_schedule_next_jobs() -> None:
|
|
158
184
|
|
159
185
|
job_id = maybe_next_job['job_id']
|
160
186
|
dag_yaml_path = maybe_next_job['dag_yaml_path']
|
187
|
+
env_file_path = maybe_next_job['env_file_path']
|
161
188
|
|
162
|
-
|
163
|
-
f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
|
164
|
-
env_file = maybe_next_job['env_file_path']
|
165
|
-
source_environment_cmd = (f'source {env_file};'
|
166
|
-
if env_file else '')
|
167
|
-
run_controller_cmd = ('python -u -m sky.jobs.controller '
|
168
|
-
f'{dag_yaml_path} --job-id {job_id};')
|
169
|
-
|
170
|
-
# If the command line here is changed, please also update
|
171
|
-
# utils._controller_process_alive. `--job-id X` should be at
|
172
|
-
# the end.
|
173
|
-
run_cmd = (f'{activate_python_env_cmd}'
|
174
|
-
f'{source_environment_cmd}'
|
175
|
-
f'{run_controller_cmd}')
|
176
|
-
|
177
|
-
logs_dir = os.path.expanduser(
|
178
|
-
managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
|
179
|
-
os.makedirs(logs_dir, exist_ok=True)
|
180
|
-
log_path = os.path.join(logs_dir, f'{job_id}.log')
|
181
|
-
|
182
|
-
pid = subprocess_utils.launch_new_process_tree(
|
183
|
-
run_cmd, log_output=log_path)
|
184
|
-
state.set_job_controller_pid(job_id, pid)
|
185
|
-
|
186
|
-
logger.debug(f'Job {job_id} started with pid {pid}')
|
189
|
+
_start_controller(job_id, dag_yaml_path, env_file_path)
|
187
190
|
|
188
191
|
except filelock.Timeout:
|
189
192
|
# If we can't get the lock, just exit. The process holding the lock
|
@@ -203,10 +206,15 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
|
|
203
206
|
The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
|
204
207
|
"""
|
205
208
|
with filelock.FileLock(_get_lock_path()):
|
206
|
-
state.scheduler_set_waiting(job_id, dag_yaml_path,
|
207
|
-
|
208
|
-
|
209
|
-
|
209
|
+
is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
|
210
|
+
original_user_yaml_path,
|
211
|
+
env_file_path,
|
212
|
+
common_utils.get_user_hash(),
|
213
|
+
priority)
|
214
|
+
if is_resume:
|
215
|
+
_start_controller(job_id, dag_yaml_path, env_file_path)
|
216
|
+
else:
|
217
|
+
maybe_schedule_next_jobs()
|
210
218
|
|
211
219
|
|
212
220
|
@contextlib.contextmanager
|
sky/jobs/server/core.py
CHANGED
@@ -102,14 +102,47 @@ def launch(
|
|
102
102
|
'name only and comment out the task names (so that they '
|
103
103
|
'will be auto-generated) .')
|
104
104
|
task_names.add(task_.name)
|
105
|
-
|
106
|
-
|
105
|
+
|
106
|
+
# Check for priority in resources first, then fall back to job priority
|
107
|
+
task_priority = None
|
108
|
+
if task_.resources:
|
109
|
+
# Convert set to list to access elements by index
|
110
|
+
resources_list = list(task_.resources)
|
111
|
+
# Take first resource's priority as reference
|
112
|
+
task_priority = resources_list[0].priority
|
113
|
+
|
114
|
+
# Check all other resources have same priority
|
115
|
+
for resource in resources_list[1:]:
|
116
|
+
if resource.priority != task_priority:
|
117
|
+
with ux_utils.print_exception_no_traceback():
|
118
|
+
raise ValueError(
|
119
|
+
f'Task {task_.name!r}: All resources must have the '
|
120
|
+
'same priority. Found priority '
|
121
|
+
f'{resource.priority} but expected {task_priority}.'
|
122
|
+
)
|
123
|
+
|
124
|
+
# Check for conflict between resources priority and job
|
125
|
+
# priority
|
126
|
+
if task_.job_priority is not None:
|
127
|
+
with ux_utils.print_exception_no_traceback():
|
128
|
+
raise ValueError(
|
129
|
+
f'Task {task_.name!r}: Cannot specify both '
|
130
|
+
f'resources.priority ({task_priority}) and '
|
131
|
+
f'job.priority ({task_.job_priority}). Please use only '
|
132
|
+
'one priority specification method.')
|
133
|
+
|
134
|
+
# Fall back to job priority if no resources priority found
|
135
|
+
if task_priority is None:
|
136
|
+
task_priority = task_.job_priority
|
137
|
+
|
138
|
+
if task_priority is not None:
|
139
|
+
if (priority is not None and priority != task_priority):
|
107
140
|
with ux_utils.print_exception_no_traceback():
|
108
141
|
raise ValueError(
|
109
142
|
'Multiple tasks in the DAG have different priorities. '
|
110
143
|
'Either specify a priority in only one task, or set '
|
111
144
|
'the same priority for each task.')
|
112
|
-
priority =
|
145
|
+
priority = task_priority
|
113
146
|
|
114
147
|
if priority is None:
|
115
148
|
priority = managed_job_constants.DEFAULT_PRIORITY
|