skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry flags this version of skypilot-nightly as potentially problematic; see the registry listing for details.

Files changed (102)
  1. sky/__init__.py +4 -2
  2. sky/backends/backend_utils.py +69 -6
  3. sky/backends/cloud_vm_ray_backend.py +156 -25
  4. sky/catalog/cudo_catalog.py +1 -1
  5. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  6. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  7. sky/client/cli/command.py +40 -77
  8. sky/client/common.py +1 -1
  9. sky/client/sdk.py +19 -19
  10. sky/client/sdk_async.py +5 -4
  11. sky/clouds/aws.py +52 -1
  12. sky/clouds/kubernetes.py +14 -0
  13. sky/dag.py +1 -0
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
  16. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
  18. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/{webpack-7fd0cf9dbecff10f.js → webpack-00c0a51d21157453.js} +1 -1
  20. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  21. sky/dashboard/out/clusters/[cluster].html +1 -1
  22. sky/dashboard/out/clusters.html +1 -1
  23. sky/dashboard/out/config.html +1 -1
  24. sky/dashboard/out/index.html +1 -1
  25. sky/dashboard/out/infra/[context].html +1 -1
  26. sky/dashboard/out/infra.html +1 -1
  27. sky/dashboard/out/jobs/[job].html +1 -1
  28. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  29. sky/dashboard/out/jobs.html +1 -1
  30. sky/dashboard/out/users.html +1 -1
  31. sky/dashboard/out/volumes.html +1 -1
  32. sky/dashboard/out/workspace/new.html +1 -1
  33. sky/dashboard/out/workspaces/[name].html +1 -1
  34. sky/dashboard/out/workspaces.html +1 -1
  35. sky/data/storage.py +11 -1
  36. sky/exceptions.py +5 -0
  37. sky/global_user_state.py +63 -7
  38. sky/jobs/constants.py +1 -1
  39. sky/jobs/controller.py +0 -1
  40. sky/jobs/recovery_strategy.py +3 -3
  41. sky/jobs/scheduler.py +23 -68
  42. sky/jobs/server/core.py +18 -12
  43. sky/jobs/state.py +6 -2
  44. sky/jobs/utils.py +8 -0
  45. sky/provision/__init__.py +1 -0
  46. sky/provision/aws/config.py +9 -0
  47. sky/provision/aws/instance.py +36 -13
  48. sky/provision/azure/instance.py +2 -0
  49. sky/provision/cudo/cudo_wrapper.py +1 -1
  50. sky/provision/cudo/instance.py +2 -0
  51. sky/provision/do/instance.py +2 -0
  52. sky/provision/fluidstack/instance.py +2 -0
  53. sky/provision/gcp/instance.py +2 -0
  54. sky/provision/hyperbolic/instance.py +2 -1
  55. sky/provision/kubernetes/instance.py +133 -0
  56. sky/provision/lambda_cloud/instance.py +2 -0
  57. sky/provision/nebius/instance.py +2 -0
  58. sky/provision/oci/instance.py +2 -0
  59. sky/provision/paperspace/instance.py +2 -1
  60. sky/provision/paperspace/utils.py +1 -1
  61. sky/provision/runpod/instance.py +2 -0
  62. sky/provision/runpod/utils.py +1 -1
  63. sky/provision/scp/instance.py +2 -0
  64. sky/provision/vast/instance.py +2 -0
  65. sky/provision/vsphere/instance.py +2 -0
  66. sky/resources.py +1 -2
  67. sky/schemas/__init__.py +0 -0
  68. sky/schemas/api/__init__.py +0 -0
  69. sky/schemas/api/responses.py +70 -0
  70. sky/schemas/generated/__init__.py +0 -0
  71. sky/schemas/generated/autostopv1_pb2.py +36 -0
  72. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  73. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  74. sky/serve/constants.py +3 -7
  75. sky/serve/replica_managers.py +15 -16
  76. sky/serve/serve_state.py +10 -0
  77. sky/serve/serve_utils.py +21 -20
  78. sky/serve/server/impl.py +15 -19
  79. sky/serve/service.py +31 -16
  80. sky/server/server.py +20 -14
  81. sky/setup_files/dependencies.py +11 -10
  82. sky/skylet/autostop_lib.py +38 -5
  83. sky/skylet/constants.py +3 -1
  84. sky/skylet/services.py +44 -0
  85. sky/skylet/skylet.py +49 -4
  86. sky/task.py +19 -16
  87. sky/templates/aws-ray.yml.j2 +2 -2
  88. sky/templates/jobs-controller.yaml.j2 +6 -0
  89. sky/utils/command_runner.py +1 -1
  90. sky/utils/config_utils.py +29 -5
  91. sky/utils/controller_utils.py +73 -0
  92. sky/utils/db/db_utils.py +17 -0
  93. sky/utils/schemas.py +3 -0
  94. sky/volumes/server/core.py +2 -2
  95. sky/volumes/server/server.py +2 -2
  96. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
  97. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +102 -94
  98. /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
  99. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
  100. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
  101. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
  102. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/dashboard/out/users.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-7fd0cf9dbecff10f.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-491a4d699d95e808.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-7ed36e44e779d5c7.js" defer=""></script><script src="/dashboard/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Fuy7OzApYTUMz2QgoP7dP/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"Fuy7OzApYTUMz2QgoP7dP","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-7ed36e44e779d5c7.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/volumes.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-7fd0cf9dbecff10f.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-491a4d699d95e808.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js" defer=""></script><script src="/dashboard/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Fuy7OzApYTUMz2QgoP7dP/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"Fuy7OzApYTUMz2QgoP7dP","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspace/new.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-7fd0cf9dbecff10f.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-491a4d699d95e808.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Fuy7OzApYTUMz2QgoP7dP/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"Fuy7OzApYTUMz2QgoP7dP","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces/[name].html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-7fd0cf9dbecff10f.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-491a4d699d95e808.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1559-6c00e20454194859.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6129c1cfbcf51063.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-c9686994ddafcf01.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-0f886f16e0d55ff8.js" defer=""></script><script src="/dashboard/_next/static/chunks/8056-5bdeda81199c0def.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-85426374db04811e.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/9159-11421c0f2909236f.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-a8a8f1adba34c892.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-f72f73bcef9541dc.js" defer=""></script><script src="/dashboard/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Fuy7OzApYTUMz2QgoP7dP/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"Fuy7OzApYTUMz2QgoP7dP","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1559-6c00e20454194859.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-37611fe6b86d274d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-c9686994ddafcf01.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-0f886f16e0d55ff8.js" defer=""></script><script src="/dashboard/_next/static/chunks/8056-5bdeda81199c0def.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-85426374db04811e.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/9159-11421c0f2909236f.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-a8a8f1adba34c892.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-f72f73bcef9541dc.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-7fd0cf9dbecff10f.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-491a4d699d95e808.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-8f67be60165724cc.js" defer=""></script><script src="/dashboard/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Fuy7OzApYTUMz2QgoP7dP/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"Fuy7OzApYTUMz2QgoP7dP","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-8f67be60165724cc.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/data/storage.py CHANGED
@@ -4510,9 +4510,19 @@ class R2Store(S3CompatibleStore):
  extra_cli_args=['--checksum-algorithm', 'CRC32'], # R2 specific
  cloud_name=cloudflare.NAME,
  default_region='auto',
- mount_cmd_factory=mounting_utils.get_r2_mount_cmd,
+ mount_cmd_factory=cls._get_r2_mount_cmd,
  )

+ @classmethod
+ def _get_r2_mount_cmd(cls, bucket_name: str, mount_path: str,
+ bucket_sub_path: Optional[str]) -> str:
+ """Factory method for R2 mount command."""
+ endpoint_url = cloudflare.create_endpoint()
+ return mounting_utils.get_r2_mount_cmd(cloudflare.R2_CREDENTIALS_PATH,
+ cloudflare.R2_PROFILE_NAME,
+ endpoint_url, bucket_name,
+ mount_path, bucket_sub_path)
+
  def mount_cached_command(self, mount_path: str) -> str:
  """R2-specific cached mount implementation using rclone."""
  install_cmd = mounting_utils.get_rclone_install_cmd()
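Note on the R2 change above: the store now points mount_cmd_factory at a classmethod instead of a module-level helper, so the Cloudflare endpoint and credentials are resolved when the mount command is generated rather than when the class is defined. A minimal, self-contained sketch of that factory-callable pattern follows; the class name, command string, and endpoint resolver are illustrative stand-ins, not SkyPilot's actual storage API.

    # Illustrative sketch only: a mount-command factory stored as a callable,
    # resolving environment-specific values (endpoint URL) lazily per call.
    from typing import Callable, Optional

    def _resolve_endpoint() -> str:
        # Stand-in for something like cloudflare.create_endpoint().
        return 'https://account-id.r2.example'

    class ExampleStore:
        mount_cmd_factory: Optional[Callable[[str, str, Optional[str]], str]] = None

        @classmethod
        def _get_mount_cmd(cls, bucket_name: str, mount_path: str,
                           bucket_sub_path: Optional[str]) -> str:
            endpoint_url = _resolve_endpoint()  # resolved at call time
            suffix = f'/{bucket_sub_path}' if bucket_sub_path else ''
            return (f'mount-tool --endpoint {endpoint_url} '
                    f'{bucket_name}{suffix} {mount_path}')

    ExampleStore.mount_cmd_factory = ExampleStore._get_mount_cmd
    print(ExampleStore.mount_cmd_factory('my-bucket', '/mnt/data', None))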
sky/exceptions.py CHANGED
@@ -651,3 +651,8 @@ class RequestInterruptedError(Exception):
  this error is raised.
  """
  pass
+
+
+ class SkyletInternalError(Exception):
+ """Raised when a Skylet internal error occurs."""
+ pass
sky/global_user_state.py CHANGED
@@ -645,13 +645,32 @@ def add_cluster_event(cluster_name: str,
  new_status: Optional[status_lib.ClusterStatus],
  reason: str,
  event_type: ClusterEventType,
- nop_if_duplicate: bool = False) -> None:
+ nop_if_duplicate: bool = False,
+ duplicate_regex: Optional[str] = None,
+ expose_duplicate_error: bool = False,
+ transitioned_at: Optional[int] = None) -> None:
+ """Add a cluster event.
+
+ Args:
+ cluster_name: Name of the cluster.
+ new_status: New status of the cluster.
+ reason: Reason for the event.
+ event_type: Type of the event.
+ nop_if_duplicate: If True, do not add the event if it is a duplicate.
+ duplicate_regex: If provided, do not add the event if it matches the
+ regex. Only used if nop_if_duplicate is True.
+ expose_duplicate_error: If True, raise an error if the event is a
+ duplicate. Only used if nop_if_duplicate is True.
+ transitioned_at: If provided, use this timestamp for the event.
+ """
  assert _SQLALCHEMY_ENGINE is not None
  cluster_hash = _get_hash_for_existing_cluster(cluster_name)
  if cluster_hash is None:
  logger.debug(f'Hash for cluster {cluster_name} not found. '
  'Skipping event.')
  return
+ if transitioned_at is None:
+ transitioned_at = int(time.time())
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
  if (_SQLALCHEMY_ENGINE.dialect.name ==
  db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -669,7 +688,10 @@ def add_cluster_event(cluster_name: str,
  if nop_if_duplicate:
  last_event = get_last_cluster_event(cluster_hash,
  event_type=event_type)
- if last_event == reason:
+ if duplicate_regex is not None and last_event is not None:
+ if re.search(duplicate_regex, last_event):
+ return
+ elif last_event == reason:
  return
  try:
  session.execute(
@@ -679,15 +701,20 @@ def add_cluster_event(cluster_name: str,
  starting_status=last_status,
  ending_status=new_status.value if new_status else None,
  reason=reason,
- transitioned_at=int(time.time()),
+ transitioned_at=transitioned_at,
  type=event_type.value,
  ))
  session.commit()
  except sqlalchemy.exc.IntegrityError as e:
  if 'UNIQUE constraint failed' in str(e):
  # This can happen if the cluster event is added twice.
- # We can ignore this error.
- pass
+ # We can ignore this error unless the caller requests
+ # to expose the error.
+ if expose_duplicate_error:
+ raise db_utils.UniqueConstraintViolationError(
+ value=reason, message=str(e))
+ else:
+ pass
  else:
  raise e

@@ -704,6 +731,35 @@ def get_last_cluster_event(cluster_hash: str,
  return row.reason


+ def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
+ event_type: ClusterEventType) -> List[str]:
+ """Returns the cluster events for the cluster.
+
+ Args:
+ cluster_name: Name of the cluster. Cannot be specified if cluster_hash
+ is specified.
+ cluster_hash: Hash of the cluster. Cannot be specified if cluster_name
+ is specified.
+ event_type: Type of the event.
+ """
+ assert _SQLALCHEMY_ENGINE is not None
+
+ if cluster_name is not None and cluster_hash is not None:
+ raise ValueError('Cannot specify both cluster_name and cluster_hash')
+ if cluster_name is None and cluster_hash is None:
+ raise ValueError('Must specify either cluster_name or cluster_hash')
+ if cluster_name is not None:
+ cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+ if cluster_hash is None:
+ raise ValueError(f'Hash for cluster {cluster_name} not found.')
+
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
+ rows = session.query(cluster_event_table).filter_by(
+ cluster_hash=cluster_hash, type=event_type.value).order_by(
+ cluster_event_table.c.transitioned_at.asc()).all()
+ return [row.reason for row in rows]
+
+
  def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
  """Returns the user hash or the current user hash, if user_hash is None.

@@ -1245,9 +1301,9 @@ def get_clusters_from_history(
  def get_cluster_names_start_with(starts_with: str) -> List[str]:
  assert _SQLALCHEMY_ENGINE is not None
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
- rows = session.query(cluster_table).filter(
+ rows = session.query(cluster_table.c.name).filter(
  cluster_table.c.name.like(f'{starts_with}%')).all()
- return [row.name for row in rows]
+ return [row[0] for row in rows]


  @_init_db
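A note on the add_cluster_event changes above: when nop_if_duplicate is set, the new duplicate_regex parameter suppresses an event if the last recorded event matches the pattern, otherwise the existing exact-string comparison applies. The standalone sketch below mirrors that rule for illustration; it is not the SkyPilot module, and the sample event strings are invented.

    # Sketch of the duplicate-suppression rule added to add_cluster_event().
    import re
    from typing import Optional

    def should_skip_event(last_event: Optional[str], reason: str,
                          nop_if_duplicate: bool,
                          duplicate_regex: Optional[str] = None) -> bool:
        if not nop_if_duplicate:
            return False
        if duplicate_regex is not None and last_event is not None:
            # A regex lets callers treat e.g. 'Retrying (attempt 3)' and
            # 'Retrying (attempt 4)' as duplicates of one another.
            return re.search(duplicate_regex, last_event) is not None
        return last_event == reason

    assert should_skip_event('Retrying (attempt 3)', 'Retrying (attempt 4)',
                             nop_if_duplicate=True,
                             duplicate_regex=r'Retrying \(attempt \d+\)')
    assert not should_skip_event('Cluster is up', 'Cluster is down',
                                 nop_if_duplicate=True)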
sky/jobs/constants.py CHANGED
@@ -47,7 +47,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
  # The version of the lib files that jobs/utils use. Whenever there is an API
  # change for the jobs/utils, we need to bump this version and update
  # job.utils.ManagedJobCodeGen to handle the version update.
- MANAGED_JOBS_VERSION = 7
+ MANAGED_JOBS_VERSION = 8

  # The command for setting up the jobs dashboard on the controller. It firstly
  # checks if the systemd services are available, and if not (e.g., Kubernetes
sky/jobs/controller.py CHANGED
@@ -30,7 +30,6 @@ from sky.jobs import recovery_strategy
  from sky.jobs import scheduler
  from sky.jobs import state as managed_job_state
  from sky.jobs import utils as managed_job_utils
- from sky.serve import serve_utils
  from sky.skylet import constants
  from sky.skylet import job_lib
  from sky.usage import usage_lib
sky/jobs/recovery_strategy.py CHANGED
@@ -10,8 +10,8 @@ import traceback
  import typing
  from typing import Optional

- import sky
  from sky import backends
+ from sky import dag as dag_lib
  from sky import exceptions
  from sky import execution
  from sky import global_user_state
@@ -61,7 +61,7 @@ class StrategyExecutor:
  """
  assert isinstance(backend, backends.CloudVmRayBackend), (
  'Only CloudVMRayBackend is supported.')
- self.dag = sky.Dag()
+ self.dag = dag_lib.Dag()
  self.dag.add(task)
  # For jobs submitted to a pool, the cluster name might change after each
  # recovery. Initially this is set to an empty string to indicate that no
@@ -447,7 +447,7 @@ class StrategyExecutor:
  # We retry immediately for worker pool, since no sky.launch()
  # is called and the overhead is minimal.
  gap_seconds = (backoff.current_backoff()
- if self.pool is None else 0)
+ if self.pool is None else 1)
  logger.info('Retrying to launch the cluster in '
  f'{gap_seconds:.1f} seconds.')
  time.sleep(gap_seconds)
sky/jobs/scheduler.py CHANGED
@@ -15,13 +15,14 @@ following section for more details).

  The scheduling logic limits #running jobs according to three limits:
  1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
- once, based on the number of CPUs. (See _get_launch_parallelism.) This the
- most compute-intensive part of the job lifecycle, which is why we have an
- additional limit.
+ once, based on the number of CPUs. This the most compute-intensive part of
+ the job lifecycle, which is why we have an additional limit.
+ See sky/utils/controller_utils.py::_get_launch_parallelism.
  2. The number of jobs that can be running at any given time, based on the amount
- of memory. (See _get_job_parallelism.) Since the job controller is doing very
- little once a job starts (just checking its status periodically), the most
- significant resource it consumes is memory.
+ of memory. Since the job controller is doing very little once a job starts
+ (just checking its status periodically), the most significant resource it
+ consumes is memory.
+ See sky/utils/controller_utils.py::_get_job_parallelism.
  3. The number of jobs that can be running in a pool at any given time, based on
  the number of ready workers in the pool. (See _can_start_new_job.)

@@ -42,55 +43,27 @@ Nomenclature:

  from argparse import ArgumentParser
  import contextlib
- from functools import lru_cache
  import os
  import sys
  import time
- import typing
  from typing import Optional

  import filelock

  from sky import exceptions
  from sky import sky_logging
- from sky.adaptors import common as adaptors_common
  from sky.jobs import constants as managed_job_constants
  from sky.jobs import state
  from sky.serve import serve_utils
  from sky.skylet import constants
  from sky.utils import common_utils
+ from sky.utils import controller_utils
  from sky.utils import subprocess_utils

- if typing.TYPE_CHECKING:
- import psutil
- else:
- psutil = adaptors_common.LazyImport('psutil')
-
  logger = sky_logging.init_logger('sky.jobs.controller')

- # The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
- # parallelism control or updating the schedule_state of any job.
- # Any code that takes this lock must conclude by calling
- # maybe_schedule_next_jobs.
- _MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
  _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5

- # Based on testing, assume a running job uses 350MB memory.
- JOB_MEMORY_MB = 350
- # Past 2000 simultaneous jobs, we become unstable.
- # See https://github.com/skypilot-org/skypilot/issues/4649.
- MAX_JOB_LIMIT = 2000
- # Number of ongoing launches launches allowed per CPU.
- LAUNCHES_PER_CPU = 4
-
-
- @lru_cache(maxsize=1)
- def _get_lock_path() -> str:
- # TODO(tian): Per pool lock.
- path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
- os.makedirs(os.path.dirname(path), exist_ok=True)
- return path
-
-


  def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
@@ -163,7 +136,8 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
  # parallelism control. If we cannot obtain the lock, exit immediately.
  # The current lock holder is expected to launch any jobs it can before
  # releasing the lock.
- with filelock.FileLock(_get_lock_path(), blocking=False):
+ with filelock.FileLock(controller_utils.get_resources_lock_path(),
+ blocking=False):
  while True:
  maybe_next_job = state.get_waiting_job(pool)
  if maybe_next_job is None:
@@ -184,7 +158,8 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
  # an ALIVE_WAITING job, but we would be able to launch a WAITING
  # job.
  if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
- if not _can_lauch_in_alive_job():
+ if not (controller_utils.can_provision() or
+ actual_pool is not None):
  # Can't schedule anything, break from scheduling loop.
  break
  elif current_state == state.ManagedJobScheduleState.WAITING:
@@ -234,7 +209,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,

  The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
  """
- with filelock.FileLock(_get_lock_path()):
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
  is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
  original_user_yaml_path,
  env_file_path,
@@ -286,11 +261,11 @@ def scheduled_launch(job_id: int):
  except exceptions.NoClusterLaunchedError:
  # NoClusterLaunchedError is indicates that the job is in retry backoff.
  # We should transition to ALIVE_BACKOFF instead of ALIVE.
- with filelock.FileLock(_get_lock_path()):
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
  state.scheduler_set_alive_backoff(job_id)
  raise
  else:
- with filelock.FileLock(_get_lock_path()):
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
  state.scheduler_set_alive(job_id)
  finally:
  maybe_schedule_next_jobs(pool)
@@ -310,56 +285,36 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
  return
  pool = state.get_pool_from_job_id(job_id)

- with filelock.FileLock(_get_lock_path()):
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
  state.scheduler_set_done(job_id, idempotent)
  maybe_schedule_next_jobs(pool)


  def _set_alive_waiting(job_id: int) -> None:
  """Should use wait_until_launch_okay() to transition to this state."""
- with filelock.FileLock(_get_lock_path()):
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
  state.scheduler_set_alive_waiting(job_id)
  pool = state.get_pool_from_job_id(job_id)
  maybe_schedule_next_jobs(pool)


- def _get_job_parallelism() -> int:
- job_memory = JOB_MEMORY_MB * 1024 * 1024
-
- job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
-
- return max(job_limit, 1)
-
-
- def _get_launch_parallelism() -> int:
- cpus = os.cpu_count()
- return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
-
-
  def _can_start_new_job(pool: Optional[str]) -> bool:
- launching_jobs = state.get_num_launching_jobs()
- alive_jobs = state.get_num_alive_jobs()
-
  # Check basic resource limits
- if not (launching_jobs < _get_launch_parallelism() and
- alive_jobs < _get_job_parallelism()):
+ # Pool jobs don't need to provision resources, so we skip the check.
+ if not ((controller_utils.can_provision() or pool is not None) and
+ controller_utils.can_start_new_process()):
  return False

- # Check if there are available replicas in the pool
+ # Check if there are available workers in the pool
  if pool is not None:
  alive_jobs_in_pool = state.get_num_alive_jobs(pool)
- if alive_jobs_in_pool >= serve_utils.num_replicas(pool):
- logger.debug(f'No replicas available in pool {pool}')
+ if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
+ logger.debug(f'No READY workers available in pool {pool}')
  return False

  return True


- def _can_lauch_in_alive_job() -> bool:
- launching_jobs = state.get_num_launching_jobs()
- return launching_jobs < _get_launch_parallelism()
-
-
  if __name__ == '__main__':
  parser = ArgumentParser()
  parser.add_argument('dag_yaml',
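The scheduler edits above drop the module-local _get_lock_path() and parallelism helpers in favor of controller_utils.get_resources_lock_path(), can_provision(), and can_start_new_process(), but the locking discipline stays the same: take the shared lock non-blocking, and if someone else holds it, trust them to schedule. A small sketch of that pattern with the filelock package follows; the lock path and the scheduling body are placeholders, not SkyPilot code.

    # Sketch of the non-blocking scheduling lock used by
    # maybe_schedule_next_jobs(): only one process runs a scheduling pass;
    # everyone else returns immediately instead of queueing behind the lock.
    import os

    import filelock

    _LOCK_PATH = os.path.expanduser('~/.sky/locks/example_scheduler.lock')
    os.makedirs(os.path.dirname(_LOCK_PATH), exist_ok=True)

    def try_schedule() -> bool:
        """Returns True if this process performed the scheduling pass."""
        try:
            # blocking=False makes the context manager raise filelock.Timeout
            # immediately if another process already holds the lock.
            with filelock.FileLock(_LOCK_PATH, blocking=False):
                # ... launch whatever fits under the parallelism limits ...
                return True
        except filelock.Timeout:
            # The current holder will schedule any runnable jobs before
            # releasing the lock, so it is safe to just return.
            return False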
sky/jobs/server/core.py CHANGED
@@ -93,8 +93,8 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
  return local_to_controller_file_mounts


- def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
- num_jobs: Optional[int]) -> Optional[List[int]]:
+ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
+ num_jobs: int) -> Optional[List[int]]:
  """Submit the managed job locally if in consolidation mode.

  In normal mode the managed job submission is done in the ray job submission.
@@ -109,12 +109,13 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
  # Create local directory for the managed job.
  pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
  job_ids = []
+ pool = dag.pool
  pool_hash = None
  if pool is not None:
  pool_hash = serve_state.get_service_hash(pool)
  # Already checked in the sdk.
  assert pool_hash is not None, f'Pool {pool} not found'
- for _ in range(num_jobs if num_jobs is not None else 1):
+ for _ in range(num_jobs):
  # TODO(tian): We should have a separate name for each job when
  # submitting multiple jobs. Current blocker is that we are sharing
  # the same dag object for all jobs. Maybe we can do copy.copy() for
@@ -172,9 +173,6 @@ def launch(
  handle: Optional[backends.ResourceHandle]; handle to the controller VM.
  None if dryrun.
  """
- if pool is not None and not managed_job_utils.is_consolidation_mode():
- with ux_utils.print_exception_no_traceback():
- raise ValueError('pool is only supported in consolidation mode.')
  entrypoint = task
  # using hasattr instead of isinstance to avoid importing sky
  if hasattr(task, 'metadata'):
@@ -295,8 +293,13 @@ def launch(
  controller=controller,
  task_resources=sum([list(t.resources) for t in dag.tasks], []))

+ num_jobs = num_jobs if num_jobs is not None else 1
+ # We do this assignment after applying the admin policy, so that we don't
+ # need to serialize the pool name in the dag. The dag object will be
+ # preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
+ dag.pool = pool
  consolidation_mode_job_ids = _maybe_submit_job_locally(
- prefix, dag, pool, num_jobs)
+ prefix, dag, num_jobs)

  # This is only needed for non-consolidation mode. For consolidation
  # mode, the controller uses the same catalog as API server.
@@ -373,8 +376,8 @@ def launch(
  controller_task._metadata = metadata

  job_identity = ''
- if consolidation_mode_job_id is not None:
- job_identity = f' (Job ID: {consolidation_mode_job_id})'
+ if job_rank is not None:
+ job_identity = f' (rank: {job_rank})'
  logger.info(f'{colorama.Fore.YELLOW}'
  f'Launching managed job {dag.name!r}{job_identity} '
  f'from jobs controller...{colorama.Style.RESET_ALL}')
@@ -428,14 +431,17 @@ def launch(
  backend.run_on_head(local_handle, run_script)
  return consolidation_mode_job_id, local_handle

- if consolidation_mode_job_ids is None:
- return _submit_one()
  if pool is None:
+ if consolidation_mode_job_ids is None:
+ return _submit_one()
  assert len(consolidation_mode_job_ids) == 1
  return _submit_one(consolidation_mode_job_ids[0])
+
  ids = []
  all_handle = None
- for job_rank, job_id in enumerate(consolidation_mode_job_ids):
+ for job_rank in range(num_jobs):
+ job_id = (consolidation_mode_job_ids[job_rank]
+ if consolidation_mode_job_ids is not None else None)
  jid, handle = _submit_one(job_id, job_rank)
  assert jid is not None, (job_id, handle)
  ids.append(jid)
sky/jobs/state.py CHANGED
@@ -441,7 +441,8 @@ class ManagedJobScheduleState(enum.Enum):

  # === Status transition functions ===
  @_init_db
- def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
+ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
+ pool: Optional[str], pool_hash: Optional[str]):
  assert _SQLALCHEMY_ENGINE is not None
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
  if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -457,7 +458,10 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
  name=name,
  schedule_state=ManagedJobScheduleState.INACTIVE.value,
  workspace=workspace,
- entrypoint=entrypoint)
+ entrypoint=entrypoint,
+ pool=pool,
+ pool_hash=pool_hash,
+ )
  session.execute(insert_stmt)
  session.commit()

sky/jobs/utils.py CHANGED
@@ -1690,6 +1690,7 @@ class ManagedJobCodeGen:
  def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
  workspace: str, entrypoint: str) -> str:
  dag_name = managed_job_dag.name
+ pool = managed_job_dag.pool
  # Add the managed job to queue table.
  code = textwrap.dedent(f"""\
  set_job_info_kwargs = {{'workspace': {workspace!r}}}
@@ -1697,6 +1698,13 @@ class ManagedJobCodeGen:
  set_job_info_kwargs = {{}}
  if managed_job_version >= 5:
  set_job_info_kwargs['entrypoint'] = {entrypoint!r}
+ if managed_job_version >= 8:
+ from sky.serve import serve_state
+ pool_hash = None
+ if {pool!r} != None:
+ pool_hash = serve_state.get_service_hash({pool!r})
+ set_job_info_kwargs['pool'] = {pool!r}
+ set_job_info_kwargs['pool_hash'] = pool_hash
  managed_job_state.set_job_info(
  {job_id}, {dag_name!r}, **set_job_info_kwargs)
  """)
sky/provision/__init__.py CHANGED
@@ -73,6 +73,7 @@ def _route_to_cloud_impl(func):
  @_route_to_cloud_impl
  def query_instances(
  provider_name: str,
+ cluster_name: str,
  cluster_name_on_cloud: str,
  provider_config: Optional[Dict[str, Any]] = None,
  non_terminated_only: bool = True,
sky/provision/aws/config.py CHANGED
@@ -19,6 +19,7 @@ import colorama
  from sky import exceptions
  from sky import sky_logging
  from sky.adaptors import aws
+ from sky.clouds import aws as aws_cloud
  from sky.provision import common
  from sky.provision.aws import utils
  from sky.utils import annotations
@@ -103,6 +104,14 @@ def bootstrap_instances(
  security_group_ids = _configure_security_group(ec2, vpc_id,
  expected_sg_name,
  extended_ip_rules)
+ if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+ # Ensure the default security group is created. This is needed
+ # to enable us to use the default security group to quickly
+ # delete the cluster. If the default security group is not created,
+ # we will need to block on instance termination to delete the
+ # security group.
+ _configure_security_group(ec2, vpc_id,
+ aws_cloud.DEFAULT_SECURITY_GROUP_NAME, [])
  end_time = time.time()
  elapsed = end_time - start_time
  logger.info(