skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +74 -7
- sky/backends/cloud_vm_ray_backend.py +169 -29
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +62 -85
- sky/client/common.py +1 -1
- sky/client/sdk.py +69 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +15 -5
- sky/clouds/nebius.py +3 -1
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +13 -10
- sky/global_user_state.py +191 -8
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +35 -87
- sky/jobs/server/core.py +82 -22
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +7 -5
- sky/jobs/utils.py +167 -8
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -0
- sky/provision/aws/instance.py +37 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/nebius/utils.py +101 -86
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +13 -8
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +6 -7
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +138 -117
- sky/serve/serve_state.py +42 -0
- sky/serve/serve_utils.py +58 -36
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +82 -33
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +64 -16
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +86 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +36 -5
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
- sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/dashboard/out/infra.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-aba778a6d6eb496d.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/infra-83991650ae4bd083.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/infra","query":{},"buildId":"I-djf3wB8zZl_bI67BOyZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-aba778a6d6eb496d.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/754-d0da8ab45f9509e9.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-6d493b1e2fa45826.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs/%5Bjob%5D-ad2cd5aab787bc15.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_ssgManifest.js" 
defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs/[job]","query":{},"buildId":"I-djf3wB8zZl_bI67BOyZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-aba778a6d6eb496d.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/754-d0da8ab45f9509e9.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-6d493b1e2fa45826.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs/pools/%5Bpool%5D-7d4182df6625fe10.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" 
type="application/json">{"props":{"pageProps":{}},"page":"/jobs/pools/[pool]","query":{},"buildId":"I-djf3wB8zZl_bI67BOyZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/jobs.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-aba778a6d6eb496d.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"I-djf3wB8zZl_bI67BOyZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/users.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-aba778a6d6eb496d.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-d112a9b3d854abb2.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"I-djf3wB8zZl_bI67BOyZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/volumes.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-aba778a6d6eb496d.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-b87fec189298a0c0.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"I-djf3wB8zZl_bI67BOyZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-aba778a6d6eb496d.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"I-djf3wB8zZl_bI67BOyZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-aba778a6d6eb496d.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/6633-efe924b9b8136699.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-6d493b1e2fa45826.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/4509-fa63866741388427.js" defer=""></script><script src="/dashboard/_next/static/chunks/7557-5855617d0421ed55.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-77d22ae2fad4071c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-a96678fed5043c12.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-8a86ca4c98812df9.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"I-djf3wB8zZl_bI67BOyZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-aba778a6d6eb496d.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/I-djf3wB8zZl_bI67BOyZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"I-djf3wB8zZl_bI67BOyZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/data/storage.py
CHANGED
|
@@ -4510,9 +4510,19 @@ class R2Store(S3CompatibleStore):
|
|
|
4510
4510
|
extra_cli_args=['--checksum-algorithm', 'CRC32'], # R2 specific
|
|
4511
4511
|
cloud_name=cloudflare.NAME,
|
|
4512
4512
|
default_region='auto',
|
|
4513
|
-
mount_cmd_factory=
|
|
4513
|
+
mount_cmd_factory=cls._get_r2_mount_cmd,
|
|
4514
4514
|
)
|
|
4515
4515
|
|
|
4516
|
+
@classmethod
def _get_r2_mount_cmd(cls, bucket_name: str, mount_path: str,
                      bucket_sub_path: Optional[str]) -> str:
    """Builds the mount command for a Cloudflare R2 bucket.

    Passed as ``mount_cmd_factory`` in the R2 store configuration.

    Args:
        bucket_name: Name of the R2 bucket to mount.
        mount_path: Path at which the bucket is mounted.
        bucket_sub_path: Optional sub-path inside the bucket to mount.

    Returns:
        The command string produced by ``mounting_utils.get_r2_mount_cmd``.
    """
    # Resolve the account endpoint first; the credentials file and profile
    # name are constants exposed by the cloudflare adaptor.
    r2_endpoint = cloudflare.create_endpoint()
    mount_cmd = mounting_utils.get_r2_mount_cmd(
        cloudflare.R2_CREDENTIALS_PATH,
        cloudflare.R2_PROFILE_NAME,
        r2_endpoint,
        bucket_name,
        mount_path,
        bucket_sub_path,
    )
    return mount_cmd
|
|
4525
|
+
|
|
4516
4526
|
def mount_cached_command(self, mount_path: str) -> str:
|
|
4517
4527
|
"""R2-specific cached mount implementation using rclone."""
|
|
4518
4528
|
install_cmd = mounting_utils.get_rclone_install_cmd()
|
sky/exceptions.py
CHANGED
sky/execution.py
CHANGED
|
@@ -173,19 +173,12 @@ def _execute(
|
|
|
173
173
|
if dryrun.
|
|
174
174
|
"""
|
|
175
175
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
|
176
|
-
dag.resolve_and_validate_volumes()
|
|
177
|
-
if (not _is_launched_by_jobs_controller and
|
|
178
|
-
not _is_launched_by_sky_serve_controller):
|
|
179
|
-
# Only process pre-mount operations on API server.
|
|
180
|
-
dag.pre_mount_volumes()
|
|
181
176
|
for task in dag.tasks:
|
|
182
|
-
if task.storage_mounts is not None:
|
|
183
|
-
for storage in task.storage_mounts.values():
|
|
184
|
-
# Ensure the storage is constructed.
|
|
185
|
-
storage.construct()
|
|
186
177
|
for resource in task.resources:
|
|
187
178
|
# For backward compatibility, we need to override the autostop
|
|
188
|
-
# config at server-side for legacy clients.
|
|
179
|
+
# config at server-side for legacy clients. This should be set
|
|
180
|
+
# before admin policy to make the admin policy get the final
|
|
181
|
+
# value of autostop config.
|
|
189
182
|
# TODO(aylei): remove this after we bump the API version.
|
|
190
183
|
resource.override_autostop_config(
|
|
191
184
|
down=down, idle_minutes=idle_minutes_to_autostop)
|
|
@@ -200,6 +193,16 @@ def _execute(
|
|
|
200
193
|
down=down,
|
|
201
194
|
dryrun=dryrun,
|
|
202
195
|
)) as dag:
|
|
196
|
+
dag.resolve_and_validate_volumes()
|
|
197
|
+
if (not _is_launched_by_jobs_controller and
|
|
198
|
+
not _is_launched_by_sky_serve_controller):
|
|
199
|
+
# Only process pre-mount operations on API server.
|
|
200
|
+
dag.pre_mount_volumes()
|
|
201
|
+
for task in dag.tasks:
|
|
202
|
+
if task.storage_mounts is not None:
|
|
203
|
+
for storage in task.storage_mounts.values():
|
|
204
|
+
# Ensure the storage is constructed.
|
|
205
|
+
storage.construct()
|
|
203
206
|
return _execute_dag(
|
|
204
207
|
dag,
|
|
205
208
|
dryrun=dryrun,
|
sky/global_user_state.py
CHANGED
|
@@ -6,6 +6,7 @@ Concepts:
|
|
|
6
6
|
- Cluster handle: (non-user facing) an opaque backend handle for us to
|
|
7
7
|
interact with a cluster.
|
|
8
8
|
"""
|
|
9
|
+
import asyncio
|
|
9
10
|
import enum
|
|
10
11
|
import functools
|
|
11
12
|
import json
|
|
@@ -51,6 +52,9 @@ _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
|
|
|
51
52
|
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
52
53
|
_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
|
|
53
54
|
|
|
55
|
+
DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
|
|
56
|
+
MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600
|
|
57
|
+
|
|
54
58
|
Base = declarative.declarative_base()
|
|
55
59
|
|
|
56
60
|
config_table = sqlalchemy.Table(
|
|
@@ -102,6 +106,9 @@ cluster_table = sqlalchemy.Table(
|
|
|
102
106
|
sqlalchemy.Text,
|
|
103
107
|
server_default=None),
|
|
104
108
|
sqlalchemy.Column('is_managed', sqlalchemy.Integer, server_default='0'),
|
|
109
|
+
sqlalchemy.Column('provision_log_path',
|
|
110
|
+
sqlalchemy.Text,
|
|
111
|
+
server_default=None),
|
|
105
112
|
)
|
|
106
113
|
|
|
107
114
|
storage_table = sqlalchemy.Table(
|
|
@@ -161,6 +168,9 @@ cluster_history_table = sqlalchemy.Table(
|
|
|
161
168
|
sqlalchemy.Text,
|
|
162
169
|
server_default=None),
|
|
163
170
|
sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
|
|
171
|
+
sqlalchemy.Column('provision_log_path',
|
|
172
|
+
sqlalchemy.Text,
|
|
173
|
+
server_default=None),
|
|
164
174
|
)
|
|
165
175
|
|
|
166
176
|
|
|
@@ -430,6 +440,17 @@ def get_user_by_name(username: str) -> List[models.User]:
|
|
|
430
440
|
]
|
|
431
441
|
|
|
432
442
|
|
|
443
|
+
@_init_db
def get_user_by_name_match(username_match: str) -> List[models.User]:
    """Returns all users whose name contains ``username_match``.

    The text is matched as a literal substring: LIKE wildcards (``%`` and
    ``_``) inside ``username_match`` are escaped, so a query such as
    ``'a_b'`` does not also match a user named ``'axb'``.

    Args:
        username_match: Substring to look for in user names.

    Returns:
        One ``models.User`` per matching row; an empty list if nothing
        matches.
    """
    assert _SQLALCHEMY_ENGINE is not None
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        # contains() renders `name LIKE '%' || :value || '%' ESCAPE '/'`;
        # autoescape=True escapes wildcard characters in the user input.
        rows = session.query(user_table).filter(
            user_table.c.name.contains(username_match,
                                       autoescape=True)).all()
    return [
        models.User(id=row.id, name=row.name, created_at=row.created_at)
        for row in rows
    ]
|
|
452
|
+
|
|
453
|
+
|
|
433
454
|
@_init_db
|
|
434
455
|
def delete_user(user_id: str) -> None:
|
|
435
456
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
@@ -458,7 +479,8 @@ def add_or_update_cluster(cluster_name: str,
|
|
|
458
479
|
is_launch: bool = True,
|
|
459
480
|
config_hash: Optional[str] = None,
|
|
460
481
|
task_config: Optional[Dict[str, Any]] = None,
|
|
461
|
-
is_managed: bool = False
|
|
482
|
+
is_managed: bool = False,
|
|
483
|
+
provision_log_path: Optional[str] = None):
|
|
462
484
|
"""Adds or updates cluster_name -> cluster_handle mapping.
|
|
463
485
|
|
|
464
486
|
Args:
|
|
@@ -473,6 +495,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
|
473
495
|
task_config: The config of the task being launched.
|
|
474
496
|
is_managed: Whether the cluster is launched by the
|
|
475
497
|
controller.
|
|
498
|
+
provision_log_path: Absolute path to provision.log, if available.
|
|
476
499
|
"""
|
|
477
500
|
assert _SQLALCHEMY_ENGINE is not None
|
|
478
501
|
# FIXME: launched_at will be changed when `sky launch -c` is called.
|
|
@@ -555,6 +578,10 @@ def add_or_update_cluster(cluster_name: str,
|
|
|
555
578
|
if task_config else None,
|
|
556
579
|
'last_creation_command': last_use,
|
|
557
580
|
})
|
|
581
|
+
if provision_log_path is not None:
|
|
582
|
+
conditional_values.update({
|
|
583
|
+
'provision_log_path': provision_log_path,
|
|
584
|
+
})
|
|
558
585
|
|
|
559
586
|
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
560
587
|
db_utils.SQLAlchemyDialect.SQLITE.value):
|
|
@@ -618,6 +645,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
|
618
645
|
usage_intervals=pickle.dumps(usage_intervals),
|
|
619
646
|
user_hash=user_hash,
|
|
620
647
|
workspace=history_workspace,
|
|
648
|
+
provision_log_path=provision_log_path,
|
|
621
649
|
**creation_info,
|
|
622
650
|
)
|
|
623
651
|
do_update_stmt = insert_stmnt.on_conflict_do_update(
|
|
@@ -633,6 +661,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
|
633
661
|
pickle.dumps(usage_intervals),
|
|
634
662
|
cluster_history_table.c.user_hash: history_hash,
|
|
635
663
|
cluster_history_table.c.workspace: history_workspace,
|
|
664
|
+
cluster_history_table.c.provision_log_path: provision_log_path,
|
|
636
665
|
**creation_info,
|
|
637
666
|
})
|
|
638
667
|
session.execute(do_update_stmt)
|
|
@@ -645,13 +674,32 @@ def add_cluster_event(cluster_name: str,
|
|
|
645
674
|
new_status: Optional[status_lib.ClusterStatus],
|
|
646
675
|
reason: str,
|
|
647
676
|
event_type: ClusterEventType,
|
|
648
|
-
nop_if_duplicate: bool = False
|
|
677
|
+
nop_if_duplicate: bool = False,
|
|
678
|
+
duplicate_regex: Optional[str] = None,
|
|
679
|
+
expose_duplicate_error: bool = False,
|
|
680
|
+
transitioned_at: Optional[int] = None) -> None:
|
|
681
|
+
"""Add a cluster event.
|
|
682
|
+
|
|
683
|
+
Args:
|
|
684
|
+
cluster_name: Name of the cluster.
|
|
685
|
+
new_status: New status of the cluster.
|
|
686
|
+
reason: Reason for the event.
|
|
687
|
+
event_type: Type of the event.
|
|
688
|
+
nop_if_duplicate: If True, do not add the event if it is a duplicate.
|
|
689
|
+
duplicate_regex: If provided, do not add the event if it matches the
|
|
690
|
+
regex. Only used if nop_if_duplicate is True.
|
|
691
|
+
expose_duplicate_error: If True, raise an error if the event is a
|
|
692
|
+
duplicate. Only used if nop_if_duplicate is True.
|
|
693
|
+
transitioned_at: If provided, use this timestamp for the event.
|
|
694
|
+
"""
|
|
649
695
|
assert _SQLALCHEMY_ENGINE is not None
|
|
650
696
|
cluster_hash = _get_hash_for_existing_cluster(cluster_name)
|
|
651
697
|
if cluster_hash is None:
|
|
652
698
|
logger.debug(f'Hash for cluster {cluster_name} not found. '
|
|
653
699
|
'Skipping event.')
|
|
654
700
|
return
|
|
701
|
+
if transitioned_at is None:
|
|
702
|
+
transitioned_at = int(time.time())
|
|
655
703
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
656
704
|
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
657
705
|
db_utils.SQLAlchemyDialect.SQLITE.value):
|
|
@@ -669,7 +717,10 @@ def add_cluster_event(cluster_name: str,
|
|
|
669
717
|
if nop_if_duplicate:
|
|
670
718
|
last_event = get_last_cluster_event(cluster_hash,
|
|
671
719
|
event_type=event_type)
|
|
672
|
-
if last_event
|
|
720
|
+
if duplicate_regex is not None and last_event is not None:
|
|
721
|
+
if re.search(duplicate_regex, last_event):
|
|
722
|
+
return
|
|
723
|
+
elif last_event == reason:
|
|
673
724
|
return
|
|
674
725
|
try:
|
|
675
726
|
session.execute(
|
|
@@ -679,15 +730,20 @@ def add_cluster_event(cluster_name: str,
|
|
|
679
730
|
starting_status=last_status,
|
|
680
731
|
ending_status=new_status.value if new_status else None,
|
|
681
732
|
reason=reason,
|
|
682
|
-
transitioned_at=
|
|
733
|
+
transitioned_at=transitioned_at,
|
|
683
734
|
type=event_type.value,
|
|
684
735
|
))
|
|
685
736
|
session.commit()
|
|
686
737
|
except sqlalchemy.exc.IntegrityError as e:
|
|
687
738
|
if 'UNIQUE constraint failed' in str(e):
|
|
688
739
|
# This can happen if the cluster event is added twice.
|
|
689
|
-
# We can ignore this error
|
|
690
|
-
|
|
740
|
+
# We can ignore this error unless the caller requests
|
|
741
|
+
# to expose the error.
|
|
742
|
+
if expose_duplicate_error:
|
|
743
|
+
raise db_utils.UniqueConstraintViolationError(
|
|
744
|
+
value=reason, message=str(e))
|
|
745
|
+
else:
|
|
746
|
+
pass
|
|
691
747
|
else:
|
|
692
748
|
raise e
|
|
693
749
|
|
|
@@ -704,6 +760,70 @@ def get_last_cluster_event(cluster_hash: str,
|
|
|
704
760
|
return row.reason
|
|
705
761
|
|
|
706
762
|
|
|
763
|
+
def cleanup_cluster_events_with_retention(retention_hours: float) -> None:
    """Deletes cluster events that fall outside the retention window.

    Args:
        retention_hours: Events whose ``transitioned_at`` is more than this
            many hours before the current time are removed.
    """
    assert _SQLALCHEMY_ENGINE is not None
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        # Anything that transitioned before this timestamp is expired.
        cutoff = time.time() - retention_hours * 3600
        expired_events = session.query(cluster_event_table).filter(
            cluster_event_table.c.transitioned_at < cutoff)
        logger.debug(f'Deleting {expired_events.count()} cluster events.')
        expired_events.delete()
        session.commit()
|
|
772
|
+
|
|
773
|
+
|
|
774
|
+
async def cluster_event_retention_daemon():
    """Garbage collect cluster events periodically.

    Each iteration reloads the SkyPilot config, reads
    ``api_server.cluster_event_retention_hours`` (falling back to
    ``DEFAULT_CLUSTER_EVENT_RETENTION_HOURS``) and, when the value is
    non-negative, deletes events older than that many hours. A negative
    value skips the cleanup. The loop then sleeps for the retention period,
    but never less than ``MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS``.

    Errors raised by the cleanup are logged and do not stop the daemon.
    """
    while True:
        logger.info('Running cluster event retention daemon...')
        # Use the latest config.
        skypilot_config.reload_config()
        retention_hours = skypilot_config.get_nested(
            ('api_server', 'cluster_event_retention_hours'),
            DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
        try:
            if retention_hours >= 0:
                cleanup_cluster_events_with_retention(retention_hours)
        except asyncio.CancelledError:
            logger.info('Cluster event retention daemon cancelled')
            break
        except Exception as e:  # pylint: disable=broad-except
            logger.error(f'Error running cluster event retention daemon: {e}')

        # Run daemon at most once every hour to avoid too frequent cleanup.
        try:
            sleep_amount = max(retention_hours * 3600,
                               MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
        except TypeError:
            # A non-numeric config value already failed (and was logged) in
            # the cleanup step above; it must not also kill the daemon here,
            # so retry after the minimum interval with a freshly loaded
            # config.
            sleep_amount = MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS
        await asyncio.sleep(sleep_amount)
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
                       event_type: ClusterEventType) -> List[str]:
    """Returns the recorded event reasons for one cluster, oldest first.

    Exactly one of ``cluster_name`` and ``cluster_hash`` must be given.

    Args:
        cluster_name: Name of the cluster. Cannot be specified if cluster_hash
            is specified.
        cluster_hash: Hash of the cluster. Cannot be specified if cluster_name
            is specified.
        event_type: Type of the event.

    Returns:
        The ``reason`` of every matching event, ordered by ascending
        ``transitioned_at``.

    Raises:
        ValueError: If both or neither identifier is given, or if no hash is
            found for ``cluster_name``.
    """
    assert _SQLALCHEMY_ENGINE is not None

    name_given = cluster_name is not None
    hash_given = cluster_hash is not None
    if name_given and hash_given:
        raise ValueError('Cannot specify both cluster_name and cluster_hash')
    if not (name_given or hash_given):
        raise ValueError('Must specify either cluster_name or cluster_hash')

    if name_given:
        # Resolve the name to the hash that the event table is keyed on.
        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
        if cluster_hash is None:
            raise ValueError(f'Hash for cluster {cluster_name} not found.')

    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        matching = session.query(cluster_event_table).filter_by(
            cluster_hash=cluster_hash, type=event_type.value)
        chronological = matching.order_by(
            cluster_event_table.c.transitioned_at.asc())
        events = chronological.all()

    reasons = []
    for event in events:
        reasons.append(event.reason)
    return reasons
|
|
825
|
+
|
|
826
|
+
|
|
707
827
|
def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
|
|
708
828
|
"""Returns the user hash or the current user hash, if user_hash is None.
|
|
709
829
|
|
|
@@ -742,6 +862,7 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
|
|
|
742
862
|
assert _SQLALCHEMY_ENGINE is not None
|
|
743
863
|
cluster_hash = _get_hash_for_existing_cluster(cluster_name)
|
|
744
864
|
usage_intervals = _get_cluster_usage_intervals(cluster_hash)
|
|
865
|
+
provision_log_path = get_cluster_provision_log_path(cluster_name)
|
|
745
866
|
|
|
746
867
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
747
868
|
# usage_intervals is not None and not empty
|
|
@@ -752,6 +873,16 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
|
|
|
752
873
|
usage_intervals.append((start_time, end_time))
|
|
753
874
|
_set_cluster_usage_intervals(cluster_hash, usage_intervals)
|
|
754
875
|
|
|
876
|
+
if provision_log_path:
|
|
877
|
+
assert cluster_hash is not None, cluster_name
|
|
878
|
+
session.query(cluster_history_table).filter_by(
|
|
879
|
+
cluster_hash=cluster_hash
|
|
880
|
+
).filter(
|
|
881
|
+
cluster_history_table.c.provision_log_path.is_(None)
|
|
882
|
+
).update({
|
|
883
|
+
cluster_history_table.c.provision_log_path: provision_log_path
|
|
884
|
+
})
|
|
885
|
+
|
|
755
886
|
if terminate:
|
|
756
887
|
session.query(cluster_table).filter_by(name=cluster_name).delete()
|
|
757
888
|
session.query(cluster_event_table).filter_by(
|
|
@@ -859,6 +990,58 @@ def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
859
990
|
return json.loads(row.metadata)
|
|
860
991
|
|
|
861
992
|
|
|
993
|
+
@_init_db
def get_cluster_provision_log_path(cluster_name: str) -> Optional[str]:
    """Looks up the provision log path stored for a cluster.

    Args:
        cluster_name: Name of the cluster to look up in the clusters table.

    Returns:
        The recorded ``provision_log_path``, or None when the cluster has no
        row or the row carries no such value.
    """
    assert _SQLALCHEMY_ENGINE is not None
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        cluster_row = session.query(cluster_table).filter_by(
            name=cluster_name).first()
    if cluster_row is not None:
        return getattr(cluster_row, 'provision_log_path', None)
    return None
|
|
1002
|
+
|
|
1003
|
+
|
|
1004
|
+
@_init_db
def get_cluster_history_provision_log_path(cluster_name: str) -> Optional[str]:
    """Looks up the provision log path recorded in cluster_history.

    When the cluster currently exists, its hash selects the history row.
    Otherwise all history rows with this name are considered and the one
    whose last usage interval ended most recently is used.

    Args:
        cluster_name: Name of the (current or past) cluster.

    Returns:
        The ``provision_log_path`` of the selected history row, or None when
        no row is found or the row has no path.
    """
    assert _SQLALCHEMY_ENGINE is not None

    def _last_active_at(history_row) -> int:
        """Ranks a history row by the end of its final usage interval."""
        pickled_intervals = history_row.usage_intervals
        try:
            # usage intervals: List[Tuple[int, Optional[int]]]
            usage = pickle.loads(pickled_intervals)
            if not usage:
                return -1
            _, ended_at = usage[-1]
            if ended_at is None:
                # An open interval counts as "now".
                return int(time.time())
            return ended_at
        except Exception:  # pylint: disable=broad-except
            # Unreadable interval data ranks the row last.
            return -1

    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        # Fast path: the cluster still exists, so look it up by hash.
        current_hash = _get_hash_for_existing_cluster(cluster_name)
        if current_hash is not None:
            current_row = session.query(cluster_history_table).filter_by(
                cluster_hash=current_hash).first()
            if current_row is not None:
                return getattr(current_row, 'provision_log_path', None)

        # Fallback: search history by name and take the most recent row.
        candidates = session.query(cluster_history_table).filter_by(
            name=cluster_name).all()
        if not candidates:
            return None

        most_recent = max(candidates, key=_last_active_at)
        return getattr(most_recent, 'provision_log_path', None)
|
|
1043
|
+
|
|
1044
|
+
|
|
862
1045
|
@_init_db
|
|
863
1046
|
def set_cluster_info(cluster_name: str, metadata: Dict[str, Any]) -> None:
|
|
864
1047
|
assert _SQLALCHEMY_ENGINE is not None
|
|
@@ -1245,9 +1428,9 @@ def get_clusters_from_history(
|
|
|
1245
1428
|
def get_cluster_names_start_with(starts_with: str) -> List[str]:
|
|
1246
1429
|
assert _SQLALCHEMY_ENGINE is not None
|
|
1247
1430
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1248
|
-
rows = session.query(cluster_table).filter(
|
|
1431
|
+
rows = session.query(cluster_table.c.name).filter(
|
|
1249
1432
|
cluster_table.c.name.like(f'{starts_with}%')).all()
|
|
1250
|
-
return [row
|
|
1433
|
+
return [row[0] for row in rows]
|
|
1251
1434
|
|
|
1252
1435
|
|
|
1253
1436
|
@_init_db
|
sky/jobs/constants.py
CHANGED
|
@@ -47,7 +47,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
|
|
|
47
47
|
# The version of the lib files that jobs/utils use. Whenever there is an API
|
|
48
48
|
# change for the jobs/utils, we need to bump this version and update
|
|
49
49
|
# job.utils.ManagedJobCodeGen to handle the version update.
|
|
50
|
-
MANAGED_JOBS_VERSION =
|
|
50
|
+
MANAGED_JOBS_VERSION = 9
|
|
51
51
|
|
|
52
52
|
# The command for setting up the jobs dashboard on the controller. It firstly
|
|
53
53
|
# checks if the systemd services are available, and if not (e.g., Kubernetes
|
sky/jobs/controller.py
CHANGED
|
@@ -30,7 +30,6 @@ from sky.jobs import recovery_strategy
|
|
|
30
30
|
from sky.jobs import scheduler
|
|
31
31
|
from sky.jobs import state as managed_job_state
|
|
32
32
|
from sky.jobs import utils as managed_job_utils
|
|
33
|
-
from sky.serve import serve_utils
|
|
34
33
|
from sky.skylet import constants
|
|
35
34
|
from sky.skylet import job_lib
|
|
36
35
|
from sky.usage import usage_lib
|
sky/jobs/recovery_strategy.py
CHANGED
|
@@ -10,8 +10,8 @@ import traceback
|
|
|
10
10
|
import typing
|
|
11
11
|
from typing import Optional
|
|
12
12
|
|
|
13
|
-
import sky
|
|
14
13
|
from sky import backends
|
|
14
|
+
from sky import dag as dag_lib
|
|
15
15
|
from sky import exceptions
|
|
16
16
|
from sky import execution
|
|
17
17
|
from sky import global_user_state
|
|
@@ -61,7 +61,7 @@ class StrategyExecutor:
|
|
|
61
61
|
"""
|
|
62
62
|
assert isinstance(backend, backends.CloudVmRayBackend), (
|
|
63
63
|
'Only CloudVMRayBackend is supported.')
|
|
64
|
-
self.dag =
|
|
64
|
+
self.dag = dag_lib.Dag()
|
|
65
65
|
self.dag.add(task)
|
|
66
66
|
# For jobs submitted to a pool, the cluster name might change after each
|
|
67
67
|
# recovery. Initially this is set to an empty string to indicate that no
|
|
@@ -447,7 +447,7 @@ class StrategyExecutor:
|
|
|
447
447
|
# We retry immediately for worker pool, since no sky.launch()
|
|
448
448
|
# is called and the overhead is minimal.
|
|
449
449
|
gap_seconds = (backoff.current_backoff()
|
|
450
|
-
if self.pool is None else
|
|
450
|
+
if self.pool is None else 1)
|
|
451
451
|
logger.info('Retrying to launch the cluster in '
|
|
452
452
|
f'{gap_seconds:.1f} seconds.')
|
|
453
453
|
time.sleep(gap_seconds)
|