skypilot-nightly 1.0.0.dev20250827__py3-none-any.whl → 1.0.0.dev20250829__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (86)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +11 -10
  3. sky/authentication.py +1 -1
  4. sky/backends/backend.py +3 -5
  5. sky/backends/backend_utils.py +140 -52
  6. sky/backends/cloud_vm_ray_backend.py +30 -25
  7. sky/backends/local_docker_backend.py +3 -8
  8. sky/backends/wheel_utils.py +35 -8
  9. sky/client/cli/command.py +41 -9
  10. sky/client/sdk.py +23 -8
  11. sky/client/sdk_async.py +6 -2
  12. sky/clouds/aws.py +118 -1
  13. sky/core.py +1 -4
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  16. sky/dashboard/out/clusters/[cluster].html +1 -1
  17. sky/dashboard/out/clusters.html +1 -1
  18. sky/dashboard/out/config.html +1 -1
  19. sky/dashboard/out/index.html +1 -1
  20. sky/dashboard/out/infra/[context].html +1 -1
  21. sky/dashboard/out/infra.html +1 -1
  22. sky/dashboard/out/jobs/[job].html +1 -1
  23. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/volumes.html +1 -1
  27. sky/dashboard/out/workspace/new.html +1 -1
  28. sky/dashboard/out/workspaces/[name].html +1 -1
  29. sky/dashboard/out/workspaces.html +1 -1
  30. sky/global_user_state.py +82 -22
  31. sky/jobs/client/sdk.py +5 -2
  32. sky/jobs/recovery_strategy.py +9 -4
  33. sky/jobs/server/server.py +2 -1
  34. sky/logs/agent.py +2 -2
  35. sky/logs/aws.py +6 -3
  36. sky/provision/aws/config.py +78 -3
  37. sky/provision/aws/instance.py +45 -6
  38. sky/provision/do/utils.py +2 -1
  39. sky/provision/kubernetes/instance.py +55 -11
  40. sky/provision/kubernetes/utils.py +11 -2
  41. sky/provision/nebius/utils.py +36 -2
  42. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  43. sky/serve/client/impl.py +5 -4
  44. sky/serve/replica_managers.py +4 -3
  45. sky/serve/serve_utils.py +2 -2
  46. sky/serve/server/impl.py +3 -2
  47. sky/serve/server/server.py +2 -1
  48. sky/server/auth/oauth2_proxy.py +10 -4
  49. sky/server/common.py +4 -4
  50. sky/server/daemons.py +16 -5
  51. sky/server/requests/executor.py +5 -3
  52. sky/server/requests/payloads.py +3 -1
  53. sky/server/requests/preconditions.py +3 -2
  54. sky/server/requests/requests.py +121 -19
  55. sky/server/server.py +85 -60
  56. sky/server/stream_utils.py +7 -5
  57. sky/setup_files/dependencies.py +6 -1
  58. sky/sky_logging.py +28 -0
  59. sky/skylet/constants.py +6 -0
  60. sky/skylet/events.py +2 -3
  61. sky/skypilot_config.py +10 -10
  62. sky/task.py +1 -1
  63. sky/templates/aws-ray.yml.j2 +1 -0
  64. sky/templates/nebius-ray.yml.j2 +4 -8
  65. sky/usage/usage_lib.py +3 -2
  66. sky/utils/annotations.py +8 -2
  67. sky/utils/cluster_utils.py +3 -3
  68. sky/utils/common_utils.py +0 -72
  69. sky/utils/controller_utils.py +4 -3
  70. sky/utils/dag_utils.py +4 -4
  71. sky/utils/db/db_utils.py +11 -0
  72. sky/utils/db/migration_utils.py +1 -1
  73. sky/utils/kubernetes/config_map_utils.py +3 -3
  74. sky/utils/kubernetes_enums.py +1 -0
  75. sky/utils/lock_events.py +94 -0
  76. sky/utils/schemas.py +3 -0
  77. sky/utils/timeline.py +24 -93
  78. sky/utils/yaml_utils.py +77 -10
  79. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/METADATA +8 -2
  80. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/RECORD +86 -84
  81. /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_buildManifest.js +0 -0
  82. /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_ssgManifest.js +0 -0
  83. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/WHEEL +0 -0
  84. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/entry_points.txt +0 -0
  85. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/licenses/LICENSE +0 -0
  86. {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-7421e63ac35f8fce.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"-eL7Ky3bxVivzeLHNB9U6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-7421e63ac35f8fce.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-018bf31cda52e11b.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"-eL7Ky3bxVivzeLHNB9U6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-018bf31cda52e11b.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-739726d6b823f532.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"-eL7Ky3bxVivzeLHNB9U6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-739726d6b823f532.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"-eL7Ky3bxVivzeLHNB9U6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-8afcf719ea87debc.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-6c9c09593b1e67b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"-eL7Ky3bxVivzeLHNB9U6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-8afcf719ea87debc.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-6c9c09593b1e67b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/-eL7Ky3bxVivzeLHNB9U6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"-eL7Ky3bxVivzeLHNB9U6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hYJYFIxp_ZFONR4wTIJqZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"hYJYFIxp_ZFONR4wTIJqZ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py CHANGED
@@ -53,6 +53,7 @@ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
53
53
  _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
54
54
 
55
55
  DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
56
+ DEBUG_CLUSTER_EVENT_RETENTION_HOURS = 30 * 24.0
56
57
  MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600
57
58
 
58
59
  _UNIQUE_CONSTRAINT_FAILED_ERROR_MSGS = [
@@ -205,6 +206,7 @@ cluster_event_table = sqlalchemy.Table(
205
206
  sqlalchemy.Column('reason', sqlalchemy.Text, primary_key=True),
206
207
  sqlalchemy.Column('transitioned_at', sqlalchemy.Integer, primary_key=True),
207
208
  sqlalchemy.Column('type', sqlalchemy.Text),
209
+ sqlalchemy.Column('request_id', sqlalchemy.Text, server_default=None),
208
210
  )
209
211
 
210
212
  ssh_key_table = sqlalchemy.Table(
@@ -595,7 +597,7 @@ def add_or_update_cluster(cluster_name: str,
595
597
  if (is_launch and not cluster_row or
596
598
  cluster_row.status != status_lib.ClusterStatus.UP.value):
597
599
  conditional_values.update({
598
- 'last_creation_yaml': common_utils.dump_yaml_str(task_config)
600
+ 'last_creation_yaml': yaml_utils.dump_yaml_str(task_config)
599
601
  if task_config else None,
600
602
  'last_creation_command': last_use,
601
603
  })
@@ -744,6 +746,7 @@ def add_cluster_event(cluster_name: str,
744
746
  elif last_event == reason:
745
747
  return
746
748
  try:
749
+ request_id = common_utils.get_current_request_id()
747
750
  session.execute(
748
751
  insert_func(cluster_event_table).values(
749
752
  cluster_hash=cluster_hash,
@@ -753,6 +756,7 @@ def add_cluster_event(cluster_name: str,
753
756
  reason=reason,
754
757
  transitioned_at=transitioned_at,
755
758
  type=event_type.value,
759
+ request_id=request_id,
756
760
  ))
757
761
  session.commit()
758
762
  except sqlalchemy.exc.IntegrityError as e:
@@ -807,12 +811,15 @@ def _get_last_cluster_event_multiple(
807
811
  return {row.cluster_hash: row.reason for row in rows}
808
812
 
809
813
 
810
- def cleanup_cluster_events_with_retention(retention_hours: float) -> None:
814
+ def cleanup_cluster_events_with_retention(retention_hours: float,
815
+ event_type: ClusterEventType) -> None:
811
816
  assert _SQLALCHEMY_ENGINE is not None
817
+ # Once for events with type STATUS_CHANGE.
812
818
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
813
819
  query = session.query(cluster_event_table).filter(
814
- cluster_event_table.c.transitioned_at < time.time() -
815
- retention_hours * 3600)
820
+ cluster_event_table.c.transitioned_at <
821
+ time.time() - retention_hours * 3600,
822
+ cluster_event_table.c.type == event_type.value)
816
823
  logger.debug(f'Deleting {query.count()} cluster events.')
817
824
  query.delete()
818
825
  session.commit()
@@ -827,9 +834,20 @@ async def cluster_event_retention_daemon():
827
834
  retention_hours = skypilot_config.get_nested(
828
835
  ('api_server', 'cluster_event_retention_hours'),
829
836
  DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
837
+ debug_retention_hours = skypilot_config.get_nested(
838
+ ('api_server', 'cluster_debug_event_retention_hours'),
839
+ DEBUG_CLUSTER_EVENT_RETENTION_HOURS)
830
840
  try:
831
841
  if retention_hours >= 0:
832
- cleanup_cluster_events_with_retention(retention_hours)
842
+ logger.debug('Cleaning up cluster events with retention '
843
+ f'{retention_hours} hours.')
844
+ cleanup_cluster_events_with_retention(
845
+ retention_hours, ClusterEventType.STATUS_CHANGE)
846
+ if debug_retention_hours >= 0:
847
+ logger.debug('Cleaning up debug cluster events with retention '
848
+ f'{debug_retention_hours} hours.')
849
+ cleanup_cluster_events_with_retention(debug_retention_hours,
850
+ ClusterEventType.DEBUG)
833
851
  except asyncio.CancelledError:
834
852
  logger.info('Cluster event retention daemon cancelled')
835
853
  break
@@ -837,8 +855,9 @@ async def cluster_event_retention_daemon():
837
855
  logger.error(f'Error running cluster event retention daemon: {e}')
838
856
 
839
857
  # Run daemon at most once every hour to avoid too frequent cleanup.
840
- sleep_amount = max(retention_hours * 3600,
841
- MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
858
+ sleep_amount = max(
859
+ min(retention_hours * 3600, debug_retention_hours * 3600),
860
+ MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
842
861
  await asyncio.sleep(sleep_amount)
843
862
 
844
863
 
@@ -904,8 +923,7 @@ def update_last_use(cluster_name: str):
904
923
 
905
924
 
906
925
  @_init_db
907
- def remove_cluster(cluster_name: str, terminate: bool,
908
- remove_events: bool) -> None:
926
+ def remove_cluster(cluster_name: str, terminate: bool) -> None:
909
927
  """Removes cluster_name mapping."""
910
928
  assert _SQLALCHEMY_ENGINE is not None
911
929
  cluster_hash = _get_hash_for_existing_cluster(cluster_name)
@@ -933,9 +951,6 @@ def remove_cluster(cluster_name: str, terminate: bool,
933
951
 
934
952
  if terminate:
935
953
  session.query(cluster_table).filter_by(name=cluster_name).delete()
936
- if remove_events:
937
- session.query(cluster_event_table).filter_by(
938
- cluster_hash=cluster_hash).delete()
939
954
  else:
940
955
  handle = get_handle_from_cluster_name(cluster_name)
941
956
  if handle is None:
@@ -2070,19 +2085,51 @@ def get_cluster_yaml_str(cluster_yaml_path: Optional[str]) -> Optional[str]:
2070
2085
  row = session.query(cluster_yaml_table).filter_by(
2071
2086
  cluster_name=cluster_name).first()
2072
2087
  if row is None:
2073
- # If the cluster yaml is not in the database, check if it exists
2074
- # on the local file system and migrate it to the database.
2075
- # TODO(syang): remove this check once we have a way to migrate the
2076
- # cluster from file to database. Remove on v0.12.0.
2077
- if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
2078
- with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
2079
- yaml_str = f.read()
2080
- set_cluster_yaml(cluster_name, yaml_str)
2081
- return yaml_str
2082
- return None
2088
+ return _set_cluster_yaml_from_file(cluster_yaml_path, cluster_name)
2083
2089
  return row.yaml
2084
2090
 
2085
2091
 
2092
+ def get_cluster_yaml_str_multiple(cluster_yaml_paths: List[str]) -> List[str]:
2093
+ """Get the cluster yaml from the database or the local file system.
2094
+ """
2095
+ assert _SQLALCHEMY_ENGINE is not None
2096
+ cluster_names_to_yaml_paths = {}
2097
+ for cluster_yaml_path in cluster_yaml_paths:
2098
+ cluster_name, _ = os.path.splitext(os.path.basename(cluster_yaml_path))
2099
+ cluster_names_to_yaml_paths[cluster_name] = cluster_yaml_path
2100
+
2101
+ cluster_names = list(cluster_names_to_yaml_paths.keys())
2102
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
2103
+ rows = session.query(cluster_yaml_table).filter(
2104
+ cluster_yaml_table.c.cluster_name.in_(cluster_names)).all()
2105
+ row_cluster_names_to_yaml = {row.cluster_name: row.yaml for row in rows}
2106
+
2107
+ yaml_strs = []
2108
+ for cluster_name in cluster_names:
2109
+ if cluster_name in row_cluster_names_to_yaml:
2110
+ yaml_strs.append(row_cluster_names_to_yaml[cluster_name])
2111
+ else:
2112
+ yaml_str = _set_cluster_yaml_from_file(
2113
+ cluster_names_to_yaml_paths[cluster_name], cluster_name)
2114
+ yaml_strs.append(yaml_str)
2115
+ return yaml_strs
2116
+
2117
+
2118
+ def _set_cluster_yaml_from_file(cluster_yaml_path: str,
2119
+ cluster_name: str) -> Optional[str]:
2120
+ """Set the cluster yaml in the database from a file."""
2121
+ # If the cluster yaml is not in the database, check if it exists
2122
+ # on the local file system and migrate it to the database.
2123
+ # TODO(syang): remove this check once we have a way to migrate the
2124
+ # cluster from file to database. Remove on v0.12.0.
2125
+ if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
2126
+ with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
2127
+ yaml_str = f.read()
2128
+ set_cluster_yaml(cluster_name, yaml_str)
2129
+ return yaml_str
2130
+ return None
2131
+
2132
+
2086
2133
  def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
2087
2134
  """Get the cluster yaml as a dictionary from the database.
2088
2135
 
@@ -2094,6 +2141,19 @@ def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
2094
2141
  return yaml_utils.safe_load(yaml_str)
2095
2142
 
2096
2143
 
2144
def get_cluster_yaml_dict_multiple(
        cluster_yaml_paths: List[str]) -> List[Dict[str, Any]]:
    """Get the cluster yamls as dictionaries.

    Args:
        cluster_yaml_paths: Paths of the cluster yaml files.

    Returns:
        The parsed yaml of each entry returned by
        get_cluster_yaml_str_multiple, in the same order.

    Raises:
        ValueError: if the yaml of an entry could not be found.
    """
    yaml_strs = get_cluster_yaml_str_multiple(cluster_yaml_paths)
    parsed_yamls = []
    # Pair each yaml string with the path at the same position so the error
    # message can name the path.
    for yaml_path, content in zip(cluster_yaml_paths, yaml_strs):
        if content is None:
            raise ValueError(f'Cluster yaml {yaml_path} not found.')
        parsed_yamls.append(yaml_utils.safe_load(content))
    return parsed_yamls
2155
+
2156
+
2097
2157
  @_init_db
2098
2158
  def set_cluster_yaml(cluster_name: str, yaml_str: str) -> None:
2099
2159
  """Set the cluster yaml in the database."""
sky/jobs/client/sdk.py CHANGED
@@ -243,7 +243,7 @@ def tail_logs(name: Optional[str] = None,
243
243
  controller: bool = False,
244
244
  refresh: bool = False,
245
245
  tail: Optional[int] = None,
246
- output_stream: Optional['io.TextIOBase'] = None) -> int:
246
+ output_stream: Optional['io.TextIOBase'] = None) -> Optional[int]:
247
247
  """Tails logs of managed jobs.
248
248
 
249
249
  You can provide either a job name or a job ID to tail logs. If both are not
@@ -263,6 +263,8 @@ def tail_logs(name: Optional[str] = None,
263
263
  Exit code based on success or failure of the job. 0 if success,
264
264
  100 if the job failed. See exceptions.JobExitCode for possible exit
265
265
  codes.
266
+ Will return None if follow is False
267
+ (see note in sky/client/sdk.py::stream_response)
266
268
 
267
269
  Request Raises:
268
270
  ValueError: invalid arguments.
@@ -289,7 +291,8 @@ def tail_logs(name: Optional[str] = None,
289
291
  return sdk.stream_response(request_id=request_id,
290
292
  response=response,
291
293
  output_stream=output_stream,
292
- resumable=(tail == 0))
294
+ resumable=(tail == 0),
295
+ get_result=follow)
293
296
 
294
297
 
295
298
  @usage_lib.entrypoint
@@ -327,10 +327,15 @@ class StrategyExecutor:
327
327
  cluster_name=self.cluster_name,
328
328
  # We expect to tear down the cluster as soon as
329
329
  # the job is finished. However, in case the
330
- # controller dies, set autodown to try and avoid
331
- # a resource leak.
332
- idle_minutes_to_autostop=_AUTODOWN_MINUTES,
333
- down=True,
330
+ # controller dies, we may end up with a
331
+ # resource leak.
332
+ # Ideally, we should autodown to be safe,
333
+ # but it's fine to disable it for now, as
334
+ # Nebius doesn't support autodown yet.
335
+ # TODO(kevin): set down=True once Nebius
336
+ # supports autodown.
337
+ # idle_minutes_to_autostop=_AUTODOWN_MINUTES,
338
+ # down=True,
334
339
  _is_launched_by_jobs_controller=True)
335
340
  else:
336
341
  self.cluster_name = (
sky/jobs/server/server.py CHANGED
@@ -79,7 +79,8 @@ async def logs(
79
79
  if jobs_logs_body.refresh else api_requests.ScheduleType.SHORT,
80
80
  request_cluster_name=common.JOB_CONTROLLER_NAME,
81
81
  )
82
- request_task = api_requests.get_request(request.state.request_id)
82
+ request_task = await api_requests.get_request_async(request.state.request_id
83
+ )
83
84
 
84
85
  return stream_utils.stream_response(
85
86
  request_id=request_task.request_id,
sky/logs/agent.py CHANGED
@@ -5,8 +5,8 @@ import shlex
5
5
  from typing import Any, Dict
6
6
 
7
7
  from sky.skylet import constants
8
- from sky.utils import common_utils
9
8
  from sky.utils import resources_utils
9
+ from sky.utils import yaml_utils
10
10
 
11
11
 
12
12
  class LoggingAgent(abc.ABC):
@@ -65,7 +65,7 @@ class FluentbitAgent(LoggingAgent):
65
65
  'outputs': [self.fluentbit_output_config(cluster_name)],
66
66
  }
67
67
  }
68
- return common_utils.dump_yaml_str(cfg_dict)
68
+ return yaml_utils.dump_yaml_str(cfg_dict)
69
69
 
70
70
  @abc.abstractmethod
71
71
  def fluentbit_output_config(
sky/logs/aws.py CHANGED
@@ -6,8 +6,8 @@ import pydantic
6
6
 
7
7
  from sky.logs.agent import FluentbitAgent
8
8
  from sky.skylet import constants
9
- from sky.utils import common_utils
10
9
  from sky.utils import resources_utils
10
+ from sky.utils import yaml_utils
11
11
 
12
12
  EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
13
13
 
@@ -130,7 +130,10 @@ class CloudwatchLoggingAgent(FluentbitAgent):
130
130
 
131
131
  # If region is specified, set it in the environment
132
132
  if self.config.region:
133
- pre_cmd += f' export AWS_REGION={self.config.region};'
133
+ pre_cmd += (f' export AWS_REGION={self.config.region}'
134
+ f' AWS_DEFAULT_REGION={self.config.region};'
135
+ ' command -v aws &>/dev/null && '
136
+ f'aws configure set region {self.config.region};')
134
137
  else:
135
138
  # If region is not specified, check if it's available in
136
139
  # the environment or credentials file
@@ -213,7 +216,7 @@ class CloudwatchLoggingAgent(FluentbitAgent):
213
216
  }
214
217
  }
215
218
 
216
- return common_utils.dump_yaml_str(cfg_dict)
219
+ return yaml_utils.dump_yaml_str(cfg_dict)
217
220
 
218
221
  def fluentbit_output_config(
219
222
  self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
@@ -87,6 +87,9 @@ def bootstrap_instances(
87
87
  use_internal_ips=config.provider_config.get('use_internal_ips', False),
88
88
  vpc_name=config.provider_config.get('vpc_name'))
89
89
 
90
+ max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
91
+ enable_efa = max_efa_interfaces > 0
92
+
90
93
  # Cluster workers should be in a security group that permits traffic within
91
94
  # the group, and also SSH access from outside.
92
95
  if security_group_ids is None:
@@ -103,7 +106,8 @@ def bootstrap_instances(
103
106
  extended_ip_rules = []
104
107
  security_group_ids = _configure_security_group(ec2, vpc_id,
105
108
  expected_sg_name,
106
- extended_ip_rules)
109
+ extended_ip_rules,
110
+ enable_efa)
107
111
  if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
108
112
  logger.debug('Attempting to create the default security group.')
109
113
  # Attempt to create the default security group. This is needed
@@ -114,7 +118,7 @@ def bootstrap_instances(
114
118
  try:
115
119
  _configure_security_group(ec2, vpc_id,
116
120
  aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
117
- [])
121
+ [], enable_efa)
118
122
  logger.debug('Default security group created.')
119
123
  except exceptions.NoClusterLaunchedError as e:
120
124
  if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
@@ -148,6 +152,37 @@ def bootstrap_instances(
148
152
  return config
149
153
 
150
154
 
155
def _configure_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
                               placement_group_name: str):
    """Create the placement group for the cluster.

    The group is created with the 'cluster' strategy. A client error with
    code 'InvalidPlacementGroup.Duplicate' is only logged at debug level;
    any other client error is re-raised.

    Args:
        ec2: EC2 service resource whose underlying client issues the request.
        placement_group_name: Name of the placement group to create.
    """
    logger.info(f'Creating placement group {placement_group_name}.')
    try:
        ec2.meta.client.create_placement_group(GroupName=placement_group_name,
                                               Strategy='cluster')
    except aws.botocore_exceptions().ClientError as exc:
        error_code = exc.response.get('Error', {}).get('Code')
        if error_code != 'InvalidPlacementGroup.Duplicate':
            raise exc
        # The group already exists, so there is nothing left to do.
        logger.debug(
            f'Placement group {placement_group_name} already exists.')
170
+
171
+
172
def delete_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
                           placement_group_name: str):
    """Delete the placement group.

    A client error with code 'InvalidPlacementGroup.Unknown' is only logged
    at debug level; any other client error is re-raised.

    Args:
        ec2: EC2 service resource whose underlying client issues the request.
        placement_group_name: Name of the placement group to delete.
    """
    try:
        ec2.meta.client.delete_placement_group(GroupName=placement_group_name)
    except aws.botocore_exceptions().ClientError as exc:
        error_code = exc.response.get('Error', {}).get('Code')
        if error_code != 'InvalidPlacementGroup.Unknown':
            raise exc
        # The group is already gone, so there is nothing left to delete.
        logger.debug(
            f'Placement group {placement_group_name} does not exist.')
184
+
185
+
151
186
  def _configure_iam_role(iam) -> Dict[str, Any]:
152
187
 
153
188
  def _get_instance_profile(profile_name: str):
@@ -557,7 +592,8 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
557
592
 
558
593
  def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
559
594
  vpc_id: str, expected_sg_name: str,
560
- extended_ip_rules: List) -> List[str]:
595
+ extended_ip_rules: List,
596
+ enable_efa: bool) -> List[str]:
561
597
  security_group = _get_or_create_vpc_security_group(ec2, vpc_id,
562
598
  expected_sg_name)
563
599
  sg_ids = [security_group.id]
@@ -583,16 +619,55 @@ def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
583
619
  },
584
620
  *extended_ip_rules,
585
621
  ]
622
+ outbound_rules = []
623
+ if enable_efa:
624
+ # EFA requires that outbound rules permit the same security group to
625
+ # communicate with each other
626
+ # Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl.html#nccl-start-base-setup # pylint: disable=line-too-long
627
+ outbound_rules.append({
628
+ 'FromPort': -1,
629
+ 'ToPort': -1,
630
+ 'IpProtocol': '-1',
631
+ 'UserIdGroupPairs': [{
632
+ 'GroupId': i
633
+ } for i in sg_ids],
634
+ })
586
635
  # upsert the default security group
587
636
  if not security_group.ip_permissions:
588
637
  # If users specify security groups, we should not change the rules
589
638
  # of these security groups. Here we change it because it is the default
590
639
  # security group for SkyPilot.
591
640
  security_group.authorize_ingress(IpPermissions=inbound_rules)
641
+ if _need_to_update_outbound_rules(security_group, outbound_rules):
642
+ security_group.authorize_egress(IpPermissions=outbound_rules)
592
643
 
593
644
  return sg_ids
594
645
 
595
646
 
647
+ def _need_to_update_outbound_rules(
648
+ security_group: Any,
649
+ outbound_rules: List[Dict[str, Any]],
650
+ ) -> bool:
651
+ """Check if we need to update the outbound rules of the security group."""
652
+ if not security_group.ip_permissions_egress:
653
+ return True # No outbound rules, we need to add them
654
+ existing_group_ids = []
655
+ for rule in security_group.ip_permissions_egress:
656
+ if 'UserIdGroupPairs' in rule:
657
+ group_pairs = rule['UserIdGroupPairs']
658
+ for pair in group_pairs:
659
+ existing_group_ids.append(pair['GroupId'])
660
+ logger.debug(f'Existing group ids: {existing_group_ids}')
661
+ for rule in outbound_rules:
662
+ if 'UserIdGroupPairs' in rule:
663
+ group_pairs = rule['UserIdGroupPairs']
664
+ for pair in group_pairs:
665
+ if pair['GroupId'] not in existing_group_ids:
666
+ logger.debug(f'New group id: {pair["GroupId"]}')
667
+ return True # New group id, we need to add it
668
+ return False # No need to update
669
+
670
+
596
671
  def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
597
672
  vpc_id: str,
598
673
  expected_sg_name: str) -> Any:
@@ -184,9 +184,15 @@ def _merge_tag_specs(tag_specs: List[Dict[str, Any]],
184
184
  tag_specs += [user_tag_spec]
185
185
 
186
186
 
187
- def _create_instances(ec2_fail_fast, cluster_name: str,
188
- node_config: Dict[str, Any], tags: Dict[str, str],
189
- count: int, associate_public_ip_address: bool) -> List:
187
+ def _create_instances(
188
+ ec2_fail_fast,
189
+ cluster_name: str,
190
+ node_config: Dict[str, Any],
191
+ tags: Dict[str, str],
192
+ count: int,
193
+ associate_public_ip_address: bool,
194
+ max_efa_interfaces: int,
195
+ ) -> List:
190
196
  tags = {
191
197
  'Name': cluster_name,
192
198
  constants.TAG_RAY_CLUSTER_NAME: cluster_name,
@@ -239,7 +245,36 @@ def _create_instances(ec2_fail_fast, cluster_name: str,
239
245
  # Whether the VM(s) should have a public IP.
240
246
  'AssociatePublicIpAddress': associate_public_ip_address,
241
247
  'Groups': security_group_ids,
248
+ 'InterfaceType': 'efa'
249
+ if max_efa_interfaces > 0 else 'interface',
242
250
  }]
251
+ # Due to AWS limitation, if an instance type supports multiple
252
+ # network cards, we cannot assign public IP addresses to the
253
+ # instance during creation, which will raise the following error:
254
+ # (InvalidParameterCombination) when calling the RunInstances
255
+ # operation: The associatePublicIPAddress parameter cannot be
256
+ # specified when launching with multiple network interfaces.
257
+ # So we only attach multiple network interfaces if public IP is
258
+ # not required.
259
+ # TODO(hailong): support attaching/detaching elastic IP to expose
260
+ # public IP in this case.
261
+ if max_efa_interfaces > 1 and not associate_public_ip_address:
262
+ instance_type = conf['InstanceType']
263
+ for i in range(1, max_efa_interfaces):
264
+ interface_type = 'efa-only'
265
+ # Special handling for P5 instances
266
+ # Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 for more details. # pylint: disable=line-too-long
267
+ if (instance_type == 'p5.48xlarge' or
268
+ instance_type == 'p5e.48xlarge'):
269
+ interface_type = 'efa' if i % 4 == 0 else 'efa-only'
270
+ network_interfaces.append({
271
+ 'SubnetId': subnet_id,
272
+ 'DeviceIndex': 1,
273
+ 'NetworkCardIndex': i,
274
+ 'AssociatePublicIpAddress': False,
275
+ 'Groups': security_group_ids,
276
+ 'InterfaceType': interface_type,
277
+ })
243
278
  conf['NetworkInterfaces'] = network_interfaces
244
279
 
245
280
  instances = _ec2_call_with_retry_on_server_error(
@@ -289,6 +324,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
289
324
  zone = None
290
325
  resumed_instance_ids: List[str] = []
291
326
  created_instance_ids: List[str] = []
327
+ max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
292
328
 
293
329
  # sort tags by key to support deterministic unit test stubbing
294
330
  tags = dict(sorted(copy.deepcopy(config.tags).items()))
@@ -504,7 +540,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
504
540
  tags,
505
541
  reservation_count,
506
542
  associate_public_ip_address=(
507
- not config.provider_config['use_internal_ips']))
543
+ not config.provider_config['use_internal_ips']),
544
+ max_efa_interfaces=max_efa_interfaces)
508
545
  created_instances.extend(created_reserved_instances)
509
546
  to_start_count -= reservation_count
510
547
  if to_start_count <= 0:
@@ -527,7 +564,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
527
564
  tags,
528
565
  to_start_count,
529
566
  associate_public_ip_address=(
530
- not config.provider_config['use_internal_ips']))
567
+ not config.provider_config['use_internal_ips']),
568
+ max_efa_interfaces=max_efa_interfaces)
531
569
 
532
570
  created_instances.extend(created_remaining_instances)
533
571
  created_instances.sort(key=lambda x: x.id)
@@ -686,6 +724,7 @@ def terminate_instances(
686
724
  filters,
687
725
  included_instances=None,
688
726
  excluded_instances=None)
727
+ instance_list = list(instances)
689
728
  default_sg = aws_config.get_security_group_from_vpc_id(
690
729
  ec2, _get_vpc_id(provider_config),
691
730
  aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
@@ -719,7 +758,7 @@ def terminate_instances(
719
758
  # exist. We must block on instance termination so that we can
720
759
  # delete the security group.
721
760
  instances.terminate()
722
- for instance in instances:
761
+ for instance in instance_list:
723
762
  instance.wait_until_terminated()
724
763
 
725
764
  # TODO(suquark): Currently, the implementation of GCP and Azure will
sky/provision/do/utils.py CHANGED
@@ -17,6 +17,7 @@ from sky.provision import constants as provision_constants
17
17
  from sky.provision.do import constants
18
18
  from sky.utils import annotations
19
19
  from sky.utils import common_utils
20
+ from sky.utils import yaml_utils
20
21
 
21
22
  logger = sky_logging.init_logger(__name__)
22
23
 
@@ -61,7 +62,7 @@ def _init_client():
61
62
  if get_credentials_path() is None:
62
63
  raise DigitalOceanError(
63
64
  'No credentials found, please run `doctl auth init`')
64
- credentials = common_utils.read_yaml(get_credentials_path())
65
+ credentials = yaml_utils.read_yaml(get_credentials_path())
65
66
  default_token = credentials.get('access-token', None)
66
67
  if default_token is not None:
67
68
  try: