skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251011__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (42)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +1 -0
  3. sky/client/cli/command.py +9 -1
  4. sky/dashboard/out/404.html +1 -1
  5. sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → Xs6jdcfyNaUuBO8jmzU9_}/_buildManifest.js +1 -1
  6. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-4f7079dcab6ed653.js → [job]-e5c9ce6a24fc0de4.js} +1 -1
  8. sky/dashboard/out/_next/static/chunks/{webpack-6a5ddd0184bfa22c.js → webpack-66f23594d38c7f16.js} +1 -1
  9. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  10. sky/dashboard/out/clusters/[cluster].html +1 -1
  11. sky/dashboard/out/clusters.html +1 -1
  12. sky/dashboard/out/config.html +1 -1
  13. sky/dashboard/out/index.html +1 -1
  14. sky/dashboard/out/infra/[context].html +1 -1
  15. sky/dashboard/out/infra.html +1 -1
  16. sky/dashboard/out/jobs/[job].html +1 -1
  17. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  18. sky/dashboard/out/jobs.html +1 -1
  19. sky/dashboard/out/users.html +1 -1
  20. sky/dashboard/out/volumes.html +1 -1
  21. sky/dashboard/out/workspace/new.html +1 -1
  22. sky/dashboard/out/workspaces/[name].html +1 -1
  23. sky/dashboard/out/workspaces.html +1 -1
  24. sky/data/mounting_utils.py +54 -15
  25. sky/jobs/server/server.py +2 -2
  26. sky/provision/kubernetes/instance.py +2 -27
  27. sky/provision/kubernetes/utils.py +47 -6
  28. sky/serve/constants.py +3 -0
  29. sky/serve/server/server.py +1 -1
  30. sky/serve/service_spec.py +8 -1
  31. sky/server/requests/executor.py +36 -36
  32. sky/server/server.py +6 -2
  33. sky/server/stream_utils.py +61 -26
  34. sky/utils/common_utils.py +6 -3
  35. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/METADATA +34 -34
  36. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/RECORD +41 -41
  37. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  38. sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → Xs6jdcfyNaUuBO8jmzU9_}/_ssgManifest.js +0 -0
  39. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/WHEEL +0 -0
  40. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/entry_points.txt +0 -0
  41. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/licenses/LICENSE +0 -0
  42. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6a5ddd0184bfa22c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-835d14ba94808f79.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"hIViZcQBkn0HE8SpaSsUU","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-66f23594d38c7f16.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-835d14ba94808f79.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"Xs6jdcfyNaUuBO8jmzU9_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6a5ddd0184bfa22c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"hIViZcQBkn0HE8SpaSsUU","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-66f23594d38c7f16.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"Xs6jdcfyNaUuBO8jmzU9_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6a5ddd0184bfa22c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7359-c8d04e06886000b3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-66237729cdf9749e.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-f6818c84ed8f1c86.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-d0782b9251f0fcd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-8d748834fcc60b46.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-3b40c39626f99c89.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-e8688c35c06f0ac5.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"hIViZcQBkn0HE8SpaSsUU","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-66f23594d38c7f16.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7359-c8d04e06886000b3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-66237729cdf9749e.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-f6818c84ed8f1c86.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-d0782b9251f0fcd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-7e0e8f06bb2f881c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-3b40c39626f99c89.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-e8688c35c06f0ac5.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"Xs6jdcfyNaUuBO8jmzU9_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6a5ddd0184bfa22c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-69c80d677d3c2949.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"hIViZcQBkn0HE8SpaSsUU","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-66f23594d38c7f16.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-69c80d677d3c2949.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"Xs6jdcfyNaUuBO8jmzU9_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -185,27 +185,63 @@ def get_gcs_mount_cmd(bucket_name: str,
185
185
  def get_az_mount_install_cmd() -> str:
186
186
  """Returns a command to install AZ Container mount utility blobfuse2."""
187
187
  install_cmd = (
188
- 'sudo apt-get update; '
189
- 'sudo apt-get install -y '
190
- '-o Dpkg::Options::="--force-confdef" '
191
- 'fuse3 libfuse3-dev || { '
192
- ' echo "fuse3 not available, falling back to fuse"; '
193
- ' sudo apt-get install -y '
194
- ' -o Dpkg::Options::="--force-confdef" '
195
- ' fuse libfuse-dev; '
196
- '} && '
188
+ # Check architecture first - blobfuse2 only supports x86_64
197
189
  'ARCH=$(uname -m) && '
198
190
  'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
199
191
  ' echo "blobfuse2 is not supported on $ARCH" && '
200
192
  f' exit {exceptions.ARCH_NOT_SUPPORTED_EXIT_CODE}; '
193
+ 'fi && '
194
+ # Try to install fuse3 from default repos
195
+ 'sudo apt-get update && '
196
+ 'FUSE3_INSTALLED=0 && '
197
+ 'if sudo apt-get install -y '
198
+ '-o Dpkg::Options::="--force-confdef" '
199
+ 'fuse3 libfuse3-dev; then '
200
+ ' FUSE3_INSTALLED=1; '
201
+ ' echo "fuse3 installed from default repos"; '
201
202
  'else '
202
- ' ARCH_SUFFIX="x86_64"; '
203
+ # If fuse3 not available, try focal for Ubuntu <= 20.04
204
+ ' DISTRO=$(grep "^ID=" /etc/os-release | cut -d= -f2 | '
205
+ 'tr -d \'"\' | tr "[:upper:]" "[:lower:]") && '
206
+ ' VERSION=$(grep "^VERSION_ID=" /etc/os-release | cut -d= -f2 | '
207
+ 'tr -d \'"\') && '
208
+ ' if [ "$DISTRO" = "ubuntu" ] && '
209
+ '[ "$(echo "$VERSION 20.04" | '
210
+ 'awk \'{ print ($1 <= $2) }\')" = "1" ]; then '
211
+ ' echo "Trying to install fuse3 from focal for '
212
+ 'Ubuntu $VERSION"; '
213
+ ' echo "deb http://archive.ubuntu.com/ubuntu '
214
+ 'focal main universe" | '
215
+ 'sudo tee /etc/apt/sources.list.d/focal-fuse3.list && '
216
+ ' sudo apt-get update && '
217
+ ' if sudo apt-get install -y '
218
+ '-o Dpkg::Options::="--force-confdef" '
219
+ '-o Dpkg::Options::="--force-confold" '
220
+ 'fuse3 libfuse3-3 libfuse3-dev; then '
221
+ ' FUSE3_INSTALLED=1; '
222
+ ' echo "fuse3 installed from focal"; '
223
+ ' sudo rm /etc/apt/sources.list.d/focal-fuse3.list; '
224
+ ' sudo apt-get update; '
225
+ ' else '
226
+ ' sudo rm -f /etc/apt/sources.list.d/focal-fuse3.list; '
227
+ ' sudo apt-get update; '
228
+ ' fi; '
229
+ ' fi; '
203
230
  'fi && '
204
- 'wget -nc https://github.com/Azure/azure-storage-fuse'
205
- f'/releases/download/blobfuse2-{BLOBFUSE2_VERSION}'
206
- f'/blobfuse2-{BLOBFUSE2_VERSION}-Debian-11.0.${{ARCH_SUFFIX}}.deb '
231
+ # Install blobfuse2 only if fuse3 is available
232
+ 'if [ "$FUSE3_INSTALLED" = "1" ]; then '
233
+ ' echo "Installing blobfuse2 with libfuse3 support"; '
234
+ ' wget -nc https://github.com/Azure/azure-storage-fuse'
235
+ f'/releases/download/blobfuse2-{BLOBFUSE2_VERSION}/'
236
+ f'blobfuse2-{BLOBFUSE2_VERSION}-Debian-11.0.x86_64.deb '
207
237
  '-O /tmp/blobfuse2.deb && '
208
- 'sudo dpkg --install /tmp/blobfuse2.deb && '
238
+ ' sudo dpkg --install /tmp/blobfuse2.deb; '
239
+ 'else '
240
+ ' echo "Error: libfuse3 is required for Azure storage '
241
+ 'mounting with fusermount-wrapper."; '
242
+ ' echo "libfuse3 could not be installed on this system."; '
243
+ f' exit {exceptions.ARCH_NOT_SUPPORTED_EXIT_CODE}; '
244
+ 'fi && '
209
245
  f'mkdir -p {_BLOBFUSE_CACHE_ROOT_DIR};')
210
246
 
211
247
  return install_cmd
@@ -277,7 +313,10 @@ def get_az_mount_cmd(container_name: str,
277
313
  f'-- {blobfuse2_cmd} -o nonempty --foreground {{}}')
278
314
  original = f'{blobfuse2_cmd} {blobfuse2_options} {mount_path}'
279
315
  # If fusermount-wrapper is available, use it to wrap the blobfuse2 command
280
- # to avoid requiring root privilege.
316
+ # to avoid requiring privileged containers.
317
+ # fusermount-wrapper requires libfuse3;
318
+ # we install libfuse3 even on older distros like Ubuntu 18.04 by using
319
+ # Ubuntu 20.04 (focal) repositories.
281
320
  # TODO(aylei): feeling hacky, refactor this.
282
321
  get_mount_cmd = ('command -v fusermount-wrapper >/dev/null 2>&1 && '
283
322
  f'echo "{wrapped}" || echo "{original}"')
sky/jobs/server/server.py CHANGED
@@ -116,7 +116,7 @@ async def logs(
116
116
  # Cancel the coroutine after the request is done or client disconnects
117
117
  background_tasks.add_task(task.cancel)
118
118
 
119
- return stream_utils.stream_response(
119
+ return stream_utils.stream_response_for_long_request(
120
120
  request_id=request_task.request_id,
121
121
  logs_path=request_task.log_path,
122
122
  background_tasks=background_tasks,
@@ -201,7 +201,7 @@ async def pool_tail_logs(
201
201
 
202
202
  request_task = api_requests.get_request(request.state.request_id)
203
203
 
204
- return stream_utils.stream_response(
204
+ return stream_utils.stream_response_for_long_request(
205
205
  request_id=request_task.request_id,
206
206
  logs_path=request_task.log_path,
207
207
  background_tasks=background_tasks,
@@ -5,7 +5,7 @@ import json
5
5
  import re
6
6
  import sys
7
7
  import time
8
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
8
+ from typing import Any, Dict, List, Optional, Tuple, Union
9
9
 
10
10
  from sky import exceptions
11
11
  from sky import global_user_state
@@ -583,31 +583,6 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
583
583
  time.sleep(1)
584
584
 
585
585
 
586
- def _run_function_with_retries(func: Callable,
587
- operation_name: str,
588
- max_retries: int = _MAX_RETRIES,
589
- retry_delay: int = 5) -> Any:
590
- """Runs a function with retries on Kubernetes errors.
591
- Args:
592
- func: Function to retry
593
- operation_name: Name of the operation for logging
594
- max_retries: Maximum number of retry attempts
595
- retry_delay: Delay between retries in seconds
596
- Raises:
597
- The last exception encountered if all retries fail.
598
- """
599
- for attempt in range(max_retries + 1):
600
- try:
601
- return func()
602
- except config_lib.KubernetesError:
603
- if attempt < max_retries:
604
- logger.warning(f'Failed to {operation_name} - '
605
- f'retrying in {retry_delay} seconds.')
606
- time.sleep(retry_delay)
607
- else:
608
- raise
609
-
610
-
611
586
  @timeline.event
612
587
  def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
613
588
  """Pre-initialization step for SkyPilot pods.
@@ -954,7 +929,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
954
929
  nvidia_runtime_exists = False
955
930
  try:
956
931
  nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
957
- context)
932
+ context=context)
958
933
  except kubernetes.kubernetes.client.ApiException as e:
959
934
  logger.warning('run_instances: Error occurred while checking for '
960
935
  f'nvidia RuntimeClass - '
@@ -238,6 +238,40 @@ def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
238
238
  return accelerator, 1
239
239
 
240
240
 
241
+ def _is_cloudflare_403_error(exception: Exception) -> bool:
242
+ """Check if an exception is a transient CloudFlare 403 error.
243
+
244
+ CloudFlare proxy 403 errors with CF-specific headers are transient and
245
+ should be retried, unlike real RBAC 403 errors.
246
+
247
+ Args:
248
+ exception: The exception to check
249
+
250
+ Returns:
251
+ True if this is a CloudFlare 403 error that should be retried
252
+ """
253
+ if not isinstance(exception, kubernetes.api_exception()):
254
+ return False
255
+
256
+ # Only check for 403 errors
257
+ if exception.status != 403:
258
+ return False
259
+
260
+ # Check for CloudFlare-specific headers
261
+ headers = exception.headers if hasattr(exception, 'headers') else {}
262
+ if not headers:
263
+ return False
264
+
265
+ # CloudFlare errors have CF-RAY header and/or Server: cloudflare
266
+ for k, v in headers.items():
267
+ if 'cf-ray' in k.lower():
268
+ return True
269
+ if 'server' in k.lower() and 'cloudflare' in str(v).lower():
270
+ return True
271
+
272
+ return False
273
+
274
+
241
275
  def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
242
276
  retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
243
277
  resource_type: Optional[str] = None):
@@ -272,19 +306,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
272
306
  kubernetes.api_exception(),
273
307
  kubernetes.config_exception()) as e:
274
308
  last_exception = e
309
+
310
+ # Check if this is a CloudFlare transient 403 error
311
+ is_cloudflare_403 = _is_cloudflare_403_error(e)
312
+
275
313
  # Don't retry on permanent errors like 401 (Unauthorized)
276
- # or 403 (Forbidden)
314
+ # or 403 (Forbidden), unless it's a CloudFlare transient 403
277
315
  if (isinstance(e, kubernetes.api_exception()) and
278
- e.status in (401, 403)):
316
+ e.status in (401, 403) and not is_cloudflare_403):
279
317
  # Raise KubeAPIUnreachableError exception so that the
280
318
  # optimizer/provisioner can failover to other clouds.
281
319
  raise exceptions.KubeAPIUnreachableError(
282
320
  f'Kubernetes API error: {str(e)}') from e
283
321
  if attempt < max_retries - 1:
284
322
  sleep_time = backoff.current_backoff()
285
- logger.debug(f'Kubernetes API call {func.__name__} '
286
- f'failed with {str(e)}. Retrying in '
287
- f'{sleep_time:.1f}s...')
323
+ error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
324
+ logger.debug(
325
+ f'Kubernetes API call {func.__name__} '
326
+ f'failed with {error_type} {str(e)}. Retrying in '
327
+ f'{sleep_time:.1f}s...')
288
328
  time.sleep(sleep_time)
289
329
  continue
290
330
 
@@ -2738,7 +2778,8 @@ def merge_custom_metadata(
2738
2778
  config_utils.merge_k8s_configs(original_metadata, custom_metadata)
2739
2779
 
2740
2780
 
2741
- def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
2781
+ @_retry_on_error(resource_type='runtimeclass')
2782
+ def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
2742
2783
  """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
2743
2784
  # Fetch the list of available RuntimeClasses
2744
2785
  runtime_classes = kubernetes.node_api(context).list_runtime_class()
sky/serve/constants.py CHANGED
@@ -76,6 +76,9 @@ CONTROLLER_AUTOSTOP = {
76
76
  # A period of time to initialize your service. Any readiness probe failures
77
77
  # during this period will be ignored.
78
78
  DEFAULT_INITIAL_DELAY_SECONDS = 1200
79
+ # For pool, we shrink the initial delay to 300s to make the pool more
80
+ # responsive to the failure that setup command starts a long-running server.
81
+ DEFAULT_INITIAL_DELAY_SECONDS_POOL = 300
79
82
  DEFAULT_MIN_REPLICAS = 1
80
83
 
81
84
  # Default port range start for controller and load balancer. Ports will be
@@ -109,7 +109,7 @@ async def tail_logs(
109
109
  task = executor.execute_request_in_coroutine(request_task)
110
110
  # Cancel the coroutine after the request is done or client disconnects
111
111
  background_tasks.add_task(task.cancel)
112
- return stream_utils.stream_response(
112
+ return stream_utils.stream_response_for_long_request(
113
113
  request_id=request_task.request_id,
114
114
  logs_path=request_task.log_path,
115
115
  background_tasks=background_tasks,
sky/serve/service_spec.py CHANGED
@@ -125,6 +125,12 @@ class SkyServiceSpec:
125
125
  self.base_ondemand_fallback_replicas is not None and
126
126
  self.base_ondemand_fallback_replicas > 0)
127
127
 
128
+ @staticmethod
129
+ def _get_initial_delay_seconds(pool: bool) -> int:
130
+ if pool:
131
+ return constants.DEFAULT_INITIAL_DELAY_SECONDS_POOL
132
+ return constants.DEFAULT_INITIAL_DELAY_SECONDS
133
+
128
134
  @staticmethod
129
135
  def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec':
130
136
  common_utils.validate_schema(config, schemas.get_service_schema(),
@@ -153,7 +159,8 @@ class SkyServiceSpec:
153
159
  'timeout_seconds', None)
154
160
  readiness_headers = readiness_section.get('headers', None)
155
161
  if initial_delay_seconds is None:
156
- initial_delay_seconds = constants.DEFAULT_INITIAL_DELAY_SECONDS
162
+ initial_delay_seconds = SkyServiceSpec._get_initial_delay_seconds(
163
+ config.get('pool', False))
157
164
  service_config['initial_delay_seconds'] = initial_delay_seconds
158
165
  if readiness_timeout_seconds is None:
159
166
  readiness_timeout_seconds = (
@@ -349,32 +349,6 @@ def override_request_env_and_config(
349
349
  os.environ.update(original_env)
350
350
 
351
351
 
352
- def _get_current_output() -> Tuple[int, int]:
353
- """Get the current stdout and stderr file descriptors."""
354
- return os.dup(sys.stdout.fileno()), os.dup(sys.stderr.fileno())
355
-
356
-
357
- def _redirect_output(file: TextIO) -> None:
358
- """Redirect stdout and stderr to the log file."""
359
- # Get the file descriptor from the file object
360
- fd = file.fileno()
361
- # Copy this fd to stdout and stderr
362
- os.dup2(fd, sys.stdout.fileno())
363
- os.dup2(fd, sys.stderr.fileno())
364
-
365
-
366
- def _restore_output(original_stdout: Optional[int],
367
- original_stderr: Optional[int]) -> None:
368
- """Restore stdout and stderr to their original file descriptors."""
369
- if original_stdout is not None:
370
- os.dup2(original_stdout, sys.stdout.fileno())
371
- os.close(original_stdout)
372
-
373
- if original_stderr is not None:
374
- os.dup2(original_stderr, sys.stderr.fileno())
375
- os.close(original_stderr)
376
-
377
-
378
352
  def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
379
353
  raise KeyboardInterrupt
380
354
 
@@ -402,6 +376,34 @@ def _request_execution_wrapper(request_id: str,
402
376
  logger.info(f'Running request {request_id} with pid {pid}')
403
377
 
404
378
  original_stdout = original_stderr = None
379
+
380
+ def _save_current_output() -> None:
381
+ """Save the current stdout and stderr file descriptors."""
382
+ nonlocal original_stdout, original_stderr
383
+ original_stdout = os.dup(sys.stdout.fileno())
384
+ original_stderr = os.dup(sys.stderr.fileno())
385
+
386
+ def _redirect_output(file: TextIO) -> None:
387
+ """Redirect stdout and stderr to the log file."""
388
+ # Get the file descriptor from the file object
389
+ fd = file.fileno()
390
+ # Copy this fd to stdout and stderr
391
+ os.dup2(fd, sys.stdout.fileno())
392
+ os.dup2(fd, sys.stderr.fileno())
393
+
394
+ def _restore_output() -> None:
395
+ """Restore stdout and stderr to their original file descriptors."""
396
+ nonlocal original_stdout, original_stderr
397
+ if original_stdout is not None:
398
+ os.dup2(original_stdout, sys.stdout.fileno())
399
+ os.close(original_stdout)
400
+ original_stdout = None
401
+
402
+ if original_stderr is not None:
403
+ os.dup2(original_stderr, sys.stderr.fileno())
404
+ os.close(original_stderr)
405
+ original_stderr = None
406
+
405
407
  try:
406
408
  # As soon as the request is updated with the executor PID, we can
407
409
  # receive SIGTERM from cancellation. So, we update the request inside
@@ -422,7 +424,7 @@ def _request_execution_wrapper(request_id: str,
422
424
  # Store copies of the original stdout and stderr file descriptors
423
425
  # We do this in two steps because we should make sure to restore the
424
426
  # original values even if we are cancelled or fail during the redirect.
425
- original_stdout, original_stderr = _get_current_output()
427
+ _save_current_output()
426
428
 
427
429
  # Append to the log file instead of overwriting it since there might be
428
430
  # logs from previous retries.
@@ -464,15 +466,14 @@ def _request_execution_wrapper(request_id: str,
464
466
  # clear the pid of the request.
465
467
  request_task.pid = None
466
468
  # Yield control to the scheduler for uniform handling of retries.
467
- _restore_output(original_stdout, original_stderr)
469
+ _restore_output()
468
470
  raise
469
471
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
470
472
  api_requests.set_request_failed(request_id, e)
471
473
  # Manually reset the original stdout and stderr file descriptors early
472
474
  # so that the "Request xxxx failed due to ..." log message will be
473
475
  # written to the original stdout and stderr file descriptors.
474
- _restore_output(original_stdout, original_stderr)
475
- original_stdout = original_stderr = None
476
+ _restore_output()
476
477
  logger.info(f'Request {request_id} failed due to '
477
478
  f'{common_utils.format_exception(e)}')
478
479
  return
@@ -482,11 +483,10 @@ def _request_execution_wrapper(request_id: str,
482
483
  # Manually reset the original stdout and stderr file descriptors early
483
484
  # so that the "Request xxxx failed due to ..." log message will be
484
485
  # written to the original stdout and stderr file descriptors.
485
- _restore_output(original_stdout, original_stderr)
486
- original_stdout = original_stderr = None
486
+ _restore_output()
487
487
  logger.info(f'Request {request_id} finished')
488
488
  finally:
489
- _restore_output(original_stdout, original_stderr)
489
+ _restore_output()
490
490
  try:
491
491
  # Capture the peak RSS before GC.
492
492
  peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
@@ -580,11 +580,11 @@ async def _execute_request_coroutine(request: api_requests.Request):
580
580
  **request_body.to_kwargs())
581
581
 
582
582
  async def poll_task(request_id: str) -> bool:
583
- request = await api_requests.get_request_async(request_id)
584
- if request is None:
583
+ req_status = await api_requests.get_request_status_async(request_id)
584
+ if req_status is None:
585
585
  raise RuntimeError('Request not found')
586
586
 
587
- if request.status == api_requests.RequestStatus.CANCELLED:
587
+ if req_status.status == api_requests.RequestStatus.CANCELLED:
588
588
  ctx.cancel()
589
589
  return True
590
590
 
sky/server/server.py CHANGED
@@ -1243,7 +1243,7 @@ async def logs(
1243
1243
  background_tasks.add_task(task.cancel)
1244
1244
  # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
1245
1245
  # the same approach as /stream.
1246
- return stream_utils.stream_response(
1246
+ return stream_utils.stream_response_for_long_request(
1247
1247
  request_id=request.state.request_id,
1248
1248
  logs_path=request_task.log_path,
1249
1249
  background_tasks=background_tasks,
@@ -1539,6 +1539,7 @@ async def stream(
1539
1539
  'X-Accel-Buffering': 'no'
1540
1540
  })
1541
1541
 
1542
+ polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
1542
1543
  # Original plain text streaming logic
1543
1544
  if request_id is not None:
1544
1545
  request_task = await requests_lib.get_request_async(request_id)
@@ -1553,6 +1554,8 @@ async def stream(
1553
1554
  raise fastapi.HTTPException(
1554
1555
  status_code=404,
1555
1556
  detail=f'Log of request {request_id!r} has been deleted')
1557
+ if request_task.schedule_type == requests_lib.ScheduleType.LONG:
1558
+ polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
1556
1559
  else:
1557
1560
  assert log_path is not None, (request_id, log_path)
1558
1561
  if log_path == constants.API_SERVER_LOGS:
@@ -1600,7 +1603,8 @@ async def stream(
1600
1603
  log_path_to_stream,
1601
1604
  plain_logs=format == 'plain',
1602
1605
  tail=tail,
1603
- follow=follow),
1606
+ follow=follow,
1607
+ polling_interval=polling_interval),
1604
1608
  media_type='text/plain',
1605
1609
  headers=headers,
1606
1610
  )