skypilot-nightly 1.0.0.dev20251008__py3-none-any.whl → 1.0.0.dev20251011__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +69 -34
- sky/client/cli/command.py +15 -7
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +11 -0
- sky/dashboard/out/_next/static/chunks/{9037-d0c00018a5ba198c.js → 1871-49141c317f3a9020.js} +2 -2
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.a19328ba41517b8b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/{5339.4a881570243431a5.js → 9360.71e83b2ddc844ec2.js} +4 -24
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-72794fc3fcdd517a.js → [job]-8f058b0346db2aff.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-477555ab7c0b13d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-2f61f65487f6d8ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-553b8b5cb65e100b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-910a22500c50596f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-dd64309c3fe67ed2.js → [job]-e5c9ce6a24fc0de4.js} +7 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-509b2977a6373bf6.js → [pool]-bc979970c247d8f3.js} +7 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs-a35a9dc3c5ccd657.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-98d2ed979084162a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-835d14ba94808f79.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-e8688c35c06f0ac5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7528cc0ef8c522c5.js → workspaces-69c80d677d3c2949.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +54 -15
- sky/jobs/controller.py +10 -2
- sky/jobs/recovery_strategy.py +3 -1
- sky/jobs/server/server.py +2 -2
- sky/metrics/utils.py +28 -17
- sky/provision/kubernetes/instance.py +20 -30
- sky/provision/kubernetes/utils.py +47 -6
- sky/serve/constants.py +3 -0
- sky/serve/server/server.py +1 -1
- sky/serve/service_spec.py +8 -1
- sky/server/requests/executor.py +36 -36
- sky/server/server.py +6 -2
- sky/server/stream_utils.py +61 -26
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/utils/common_utils.py +6 -3
- {skypilot_nightly-1.0.0.dev20251008.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/METADATA +35 -35
- {skypilot_nightly-1.0.0.dev20251008.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/RECORD +68 -68
- sky/dashboard/out/_next/static/MnvNdzHHpiZG1_oKSpbxF/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.93d9336bdc032b3a.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e052384df65ef200.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-3286453d56f3c0a0.js +0 -1
- sky/dashboard/out/_next/static/{MnvNdzHHpiZG1_oKSpbxF → Xs6jdcfyNaUuBO8jmzU9_}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251008.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251008.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251008.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251008.dist-info → skypilot_nightly-1.0.0.dev20251011.dist-info}/top_level.txt +0 -0
|
sky/dashboard/out/workspaces.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
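<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-3286453d56f3c0a0.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7528cc0ef8c522c5.js" defer=""></script><script src="/dashboard/_next/static/MnvNdzHHpiZG1_oKSpbxF/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/MnvNdzHHpiZG1_oKSpbxF/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"MnvNdzHHpiZG1_oKSpbxF","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>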
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-66f23594d38c7f16.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-69c80d677d3c2949.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"Xs6jdcfyNaUuBO8jmzU9_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/data/mounting_utils.py
CHANGED
|
@@ -185,27 +185,63 @@ def get_gcs_mount_cmd(bucket_name: str,
|
|
|
185
185
|
def get_az_mount_install_cmd() -> str:
|
|
186
186
|
"""Returns a command to install AZ Container mount utility blobfuse2."""
|
|
187
187
|
install_cmd = (
|
|
188
|
-
|
|
189
|
-
'sudo apt-get install -y '
|
|
190
|
-
'-o Dpkg::Options::="--force-confdef" '
|
|
191
|
-
'fuse3 libfuse3-dev || { '
|
|
192
|
-
' echo "fuse3 not available, falling back to fuse"; '
|
|
193
|
-
' sudo apt-get install -y '
|
|
194
|
-
' -o Dpkg::Options::="--force-confdef" '
|
|
195
|
-
' fuse libfuse-dev; '
|
|
196
|
-
'} && '
|
|
188
|
+
# Check architecture first - blobfuse2 only supports x86_64
|
|
197
189
|
'ARCH=$(uname -m) && '
|
|
198
190
|
'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
|
|
199
191
|
' echo "blobfuse2 is not supported on $ARCH" && '
|
|
200
192
|
f' exit {exceptions.ARCH_NOT_SUPPORTED_EXIT_CODE}; '
|
|
193
|
+
'fi && '
|
|
194
|
+
# Try to install fuse3 from default repos
|
|
195
|
+
'sudo apt-get update && '
|
|
196
|
+
'FUSE3_INSTALLED=0 && '
|
|
197
|
+
'if sudo apt-get install -y '
|
|
198
|
+
'-o Dpkg::Options::="--force-confdef" '
|
|
199
|
+
'fuse3 libfuse3-dev; then '
|
|
200
|
+
' FUSE3_INSTALLED=1; '
|
|
201
|
+
' echo "fuse3 installed from default repos"; '
|
|
201
202
|
'else '
|
|
202
|
-
|
|
203
|
+
# If fuse3 not available, try focal for Ubuntu <= 20.04
|
|
204
|
+
' DISTRO=$(grep "^ID=" /etc/os-release | cut -d= -f2 | '
|
|
205
|
+
'tr -d \'"\' | tr "[:upper:]" "[:lower:]") && '
|
|
206
|
+
' VERSION=$(grep "^VERSION_ID=" /etc/os-release | cut -d= -f2 | '
|
|
207
|
+
'tr -d \'"\') && '
|
|
208
|
+
' if [ "$DISTRO" = "ubuntu" ] && '
|
|
209
|
+
'[ "$(echo "$VERSION 20.04" | '
|
|
210
|
+
'awk \'{ print ($1 <= $2) }\')" = "1" ]; then '
|
|
211
|
+
' echo "Trying to install fuse3 from focal for '
|
|
212
|
+
'Ubuntu $VERSION"; '
|
|
213
|
+
' echo "deb http://archive.ubuntu.com/ubuntu '
|
|
214
|
+
'focal main universe" | '
|
|
215
|
+
'sudo tee /etc/apt/sources.list.d/focal-fuse3.list && '
|
|
216
|
+
' sudo apt-get update && '
|
|
217
|
+
' if sudo apt-get install -y '
|
|
218
|
+
'-o Dpkg::Options::="--force-confdef" '
|
|
219
|
+
'-o Dpkg::Options::="--force-confold" '
|
|
220
|
+
'fuse3 libfuse3-3 libfuse3-dev; then '
|
|
221
|
+
' FUSE3_INSTALLED=1; '
|
|
222
|
+
' echo "fuse3 installed from focal"; '
|
|
223
|
+
' sudo rm /etc/apt/sources.list.d/focal-fuse3.list; '
|
|
224
|
+
' sudo apt-get update; '
|
|
225
|
+
' else '
|
|
226
|
+
' sudo rm -f /etc/apt/sources.list.d/focal-fuse3.list; '
|
|
227
|
+
' sudo apt-get update; '
|
|
228
|
+
' fi; '
|
|
229
|
+
' fi; '
|
|
203
230
|
'fi && '
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
231
|
+
# Install blobfuse2 only if fuse3 is available
|
|
232
|
+
'if [ "$FUSE3_INSTALLED" = "1" ]; then '
|
|
233
|
+
' echo "Installing blobfuse2 with libfuse3 support"; '
|
|
234
|
+
' wget -nc https://github.com/Azure/azure-storage-fuse'
|
|
235
|
+
f'/releases/download/blobfuse2-{BLOBFUSE2_VERSION}/'
|
|
236
|
+
f'blobfuse2-{BLOBFUSE2_VERSION}-Debian-11.0.x86_64.deb '
|
|
207
237
|
'-O /tmp/blobfuse2.deb && '
|
|
208
|
-
'sudo dpkg --install /tmp/blobfuse2.deb
|
|
238
|
+
' sudo dpkg --install /tmp/blobfuse2.deb; '
|
|
239
|
+
'else '
|
|
240
|
+
' echo "Error: libfuse3 is required for Azure storage '
|
|
241
|
+
'mounting with fusermount-wrapper."; '
|
|
242
|
+
' echo "libfuse3 could not be installed on this system."; '
|
|
243
|
+
f' exit {exceptions.ARCH_NOT_SUPPORTED_EXIT_CODE}; '
|
|
244
|
+
'fi && '
|
|
209
245
|
f'mkdir -p {_BLOBFUSE_CACHE_ROOT_DIR};')
|
|
210
246
|
|
|
211
247
|
return install_cmd
|
|
@@ -277,7 +313,10 @@ def get_az_mount_cmd(container_name: str,
|
|
|
277
313
|
f'-- {blobfuse2_cmd} -o nonempty --foreground {{}}')
|
|
278
314
|
original = f'{blobfuse2_cmd} {blobfuse2_options} {mount_path}'
|
|
279
315
|
# If fusermount-wrapper is available, use it to wrap the blobfuse2 command
|
|
280
|
-
# to avoid requiring
|
|
316
|
+
# to avoid requiring privileged containers.
|
|
317
|
+
# fusermount-wrapper requires libfuse3;
|
|
318
|
+
# we install libfuse3 even on older distros like Ubuntu 18.04 by using
|
|
319
|
+
# Ubuntu 20.04 (focal) repositories.
|
|
281
320
|
# TODO(aylei): feeling hacky, refactor this.
|
|
282
321
|
get_mount_cmd = ('command -v fusermount-wrapper >/dev/null 2>&1 && '
|
|
283
322
|
f'echo "{wrapped}" || echo "{original}"')
|
sky/jobs/controller.py
CHANGED
|
@@ -870,8 +870,16 @@ class Controller:
|
|
|
870
870
|
# because when SkyPilot API server machine sends the yaml config to
|
|
871
871
|
# the controller machine, only storage metadata is sent, not the
|
|
872
872
|
# storage object itself.
|
|
873
|
-
|
|
874
|
-
storage.
|
|
873
|
+
try:
|
|
874
|
+
for storage in task.storage_mounts.values():
|
|
875
|
+
storage.construct()
|
|
876
|
+
except (exceptions.StorageSpecError, exceptions.StorageError) as e:
|
|
877
|
+
job_logger.warning(
|
|
878
|
+
f'Failed to construct storage object for teardown: {e}\n'
|
|
879
|
+
'This may happen because storage construction already '
|
|
880
|
+
'failed during launch, storage was deleted externally, '
|
|
881
|
+
'credentials expired/changed, or network connectivity '
|
|
882
|
+
'issues.')
|
|
875
883
|
try:
|
|
876
884
|
backend.teardown_ephemeral_storage(task)
|
|
877
885
|
except Exception as e: # pylint: disable=broad-except
|
sky/jobs/recovery_strategy.py
CHANGED
|
@@ -495,7 +495,9 @@ class StrategyExecutor:
|
|
|
495
495
|
self._logger.info('Managed job cluster launched.')
|
|
496
496
|
except (exceptions.InvalidClusterNameError,
|
|
497
497
|
exceptions.NoCloudAccessError,
|
|
498
|
-
exceptions.ResourcesMismatchError) as e:
|
|
498
|
+
exceptions.ResourcesMismatchError,
|
|
499
|
+
exceptions.StorageSpecError,
|
|
500
|
+
exceptions.StorageError) as e:
|
|
499
501
|
self._logger.error(
|
|
500
502
|
'Failure happened before provisioning. '
|
|
501
503
|
f'{common_utils.format_exception(e)}')
|
sky/jobs/server/server.py
CHANGED
|
@@ -116,7 +116,7 @@ async def logs(
|
|
|
116
116
|
# Cancel the coroutine after the request is done or client disconnects
|
|
117
117
|
background_tasks.add_task(task.cancel)
|
|
118
118
|
|
|
119
|
-
return stream_utils.
|
|
119
|
+
return stream_utils.stream_response_for_long_request(
|
|
120
120
|
request_id=request_task.request_id,
|
|
121
121
|
logs_path=request_task.log_path,
|
|
122
122
|
background_tasks=background_tasks,
|
|
@@ -201,7 +201,7 @@ async def pool_tail_logs(
|
|
|
201
201
|
|
|
202
202
|
request_task = api_requests.get_request(request.state.request_id)
|
|
203
203
|
|
|
204
|
-
return stream_utils.
|
|
204
|
+
return stream_utils.stream_response_for_long_request(
|
|
205
205
|
request_id=request_task.request_id,
|
|
206
206
|
logs_path=request_task.log_path,
|
|
207
207
|
background_tasks=background_tasks,
|
sky/metrics/utils.py
CHANGED
|
@@ -195,6 +195,8 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
195
195
|
port_forward_process = None
|
|
196
196
|
port_forward_exit = False
|
|
197
197
|
local_port = None
|
|
198
|
+
poller = None
|
|
199
|
+
fd = None
|
|
198
200
|
|
|
199
201
|
try:
|
|
200
202
|
# start the port forward process
|
|
@@ -204,8 +206,13 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
204
206
|
text=True,
|
|
205
207
|
env=env)
|
|
206
208
|
|
|
207
|
-
|
|
209
|
+
# Use poll() instead of select() to avoid FD_SETSIZE limit
|
|
210
|
+
poller = select.poll()
|
|
211
|
+
assert port_forward_process.stdout is not None
|
|
212
|
+
fd = port_forward_process.stdout.fileno()
|
|
213
|
+
poller.register(fd, select.POLLIN)
|
|
208
214
|
|
|
215
|
+
start_time = time.time()
|
|
209
216
|
buffer = ''
|
|
210
217
|
# wait for the port forward to start and extract the local port
|
|
211
218
|
while time.time() - start_time < start_port_forward_timeout:
|
|
@@ -215,22 +222,19 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
215
222
|
port_forward_exit = True
|
|
216
223
|
break
|
|
217
224
|
|
|
218
|
-
#
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
if match:
|
|
232
|
-
local_port = int(match.group(1))
|
|
233
|
-
break
|
|
225
|
+
# Wait up to 1000ms for data to be available without blocking
|
|
226
|
+
# poll() takes timeout in milliseconds
|
|
227
|
+
events = poller.poll(_SELECT_TIMEOUT * 1000)
|
|
228
|
+
|
|
229
|
+
if events:
|
|
230
|
+
# Read available bytes from the FD without blocking
|
|
231
|
+
raw = os.read(fd, _SELECT_BUFFER_SIZE)
|
|
232
|
+
chunk = raw.decode(errors='ignore')
|
|
233
|
+
buffer += chunk
|
|
234
|
+
match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
|
|
235
|
+
if match:
|
|
236
|
+
local_port = int(match.group(1))
|
|
237
|
+
break
|
|
234
238
|
|
|
235
239
|
# sleep for 100ms to avoid busy-waiting
|
|
236
240
|
time.sleep(0.1)
|
|
@@ -239,6 +243,13 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
239
243
|
stop_svc_port_forward(port_forward_process,
|
|
240
244
|
timeout=terminate_port_forward_timeout)
|
|
241
245
|
raise
|
|
246
|
+
finally:
|
|
247
|
+
if poller is not None and fd is not None:
|
|
248
|
+
try:
|
|
249
|
+
poller.unregister(fd)
|
|
250
|
+
except (OSError, ValueError):
|
|
251
|
+
# FD may already be unregistered or invalid
|
|
252
|
+
pass
|
|
242
253
|
if port_forward_exit:
|
|
243
254
|
raise RuntimeError(f'Port forward failed for service {service} in '
|
|
244
255
|
f'namespace {namespace} on context {context}')
|
|
sky/provision/kubernetes/instance.py
CHANGED
|
@@ -5,7 +5,7 @@ import json
|
|
|
5
5
|
import re
|
|
6
6
|
import sys
|
|
7
7
|
import time
|
|
8
|
-
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
9
9
|
|
|
10
10
|
from sky import exceptions
|
|
11
11
|
from sky import global_user_state
|
|
@@ -583,31 +583,6 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
|
|
|
583
583
|
time.sleep(1)
|
|
584
584
|
|
|
585
585
|
|
|
586
|
-
def _run_function_with_retries(func: Callable,
|
|
587
|
-
operation_name: str,
|
|
588
|
-
max_retries: int = _MAX_RETRIES,
|
|
589
|
-
retry_delay: int = 5) -> Any:
|
|
590
|
-
"""Runs a function with retries on Kubernetes errors.
|
|
591
|
-
Args:
|
|
592
|
-
func: Function to retry
|
|
593
|
-
operation_name: Name of the operation for logging
|
|
594
|
-
max_retries: Maximum number of retry attempts
|
|
595
|
-
retry_delay: Delay between retries in seconds
|
|
596
|
-
Raises:
|
|
597
|
-
The last exception encountered if all retries fail.
|
|
598
|
-
"""
|
|
599
|
-
for attempt in range(max_retries + 1):
|
|
600
|
-
try:
|
|
601
|
-
return func()
|
|
602
|
-
except config_lib.KubernetesError:
|
|
603
|
-
if attempt < max_retries:
|
|
604
|
-
logger.warning(f'Failed to {operation_name} - '
|
|
605
|
-
f'retrying in {retry_delay} seconds.')
|
|
606
|
-
time.sleep(retry_delay)
|
|
607
|
-
else:
|
|
608
|
-
raise
|
|
609
|
-
|
|
610
|
-
|
|
611
586
|
@timeline.event
|
|
612
587
|
def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
|
|
613
588
|
"""Pre-initialization step for SkyPilot pods.
|
|
@@ -934,8 +909,11 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
|
934
909
|
running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
|
935
910
|
['Pending', 'Running'])
|
|
936
911
|
head_pod_name = _get_head_pod_name(running_pods)
|
|
912
|
+
running_pod_statuses = [{
|
|
913
|
+
pod.metadata.name: pod.status.phase
|
|
914
|
+
} for pod in running_pods.values()]
|
|
937
915
|
logger.debug(f'Found {len(running_pods)} existing pods: '
|
|
938
|
-
f'{
|
|
916
|
+
f'{running_pod_statuses}')
|
|
939
917
|
|
|
940
918
|
to_start_count = config.count - len(running_pods)
|
|
941
919
|
if to_start_count < 0:
|
|
@@ -951,7 +929,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
|
951
929
|
nvidia_runtime_exists = False
|
|
952
930
|
try:
|
|
953
931
|
nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
|
|
954
|
-
context)
|
|
932
|
+
context=context)
|
|
955
933
|
except kubernetes.kubernetes.client.ApiException as e:
|
|
956
934
|
logger.warning('run_instances: Error occurred while checking for '
|
|
957
935
|
f'nvidia RuntimeClass - '
|
|
@@ -1142,10 +1120,21 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
|
1142
1120
|
pods = created_resources
|
|
1143
1121
|
|
|
1144
1122
|
created_pods = {}
|
|
1123
|
+
valid_pods = []
|
|
1145
1124
|
for pod in pods:
|
|
1125
|
+
# In case Pod is not created
|
|
1126
|
+
if pod is None:
|
|
1127
|
+
continue
|
|
1128
|
+
valid_pods.append(pod)
|
|
1146
1129
|
created_pods[pod.metadata.name] = pod
|
|
1147
1130
|
if head_pod_name is None and _is_head(pod):
|
|
1148
1131
|
head_pod_name = pod.metadata.name
|
|
1132
|
+
pods = valid_pods
|
|
1133
|
+
|
|
1134
|
+
# The running_pods may include Pending Pods, so we add them to the pods
|
|
1135
|
+
# list to wait for scheduling and running
|
|
1136
|
+
if running_pods:
|
|
1137
|
+
pods = pods + list(running_pods.values())
|
|
1149
1138
|
|
|
1150
1139
|
provision_timeout = provider_config['timeout']
|
|
1151
1140
|
|
|
@@ -1369,8 +1358,9 @@ def get_cluster_info(
|
|
|
1369
1358
|
assert head_spec is not None, pod
|
|
1370
1359
|
cpu_request = head_spec.containers[0].resources.requests['cpu']
|
|
1371
1360
|
|
|
1372
|
-
|
|
1373
|
-
|
|
1361
|
+
if cpu_request is None:
|
|
1362
|
+
raise RuntimeError(f'Pod {cluster_name_on_cloud}-head not found'
|
|
1363
|
+
' or not Running, check the Pod status')
|
|
1374
1364
|
|
|
1375
1365
|
ssh_user = 'sky'
|
|
1376
1366
|
# Use pattern matching to extract SSH user, handling MOTD contamination.
|
|
sky/provision/kubernetes/utils.py
CHANGED
|
@@ -238,6 +238,40 @@ def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
|
|
|
238
238
|
return accelerator, 1
|
|
239
239
|
|
|
240
240
|
|
|
241
|
+
def _is_cloudflare_403_error(exception: Exception) -> bool:
|
|
242
|
+
"""Check if an exception is a transient CloudFlare 403 error.
|
|
243
|
+
|
|
244
|
+
CloudFlare proxy 403 errors with CF-specific headers are transient and
|
|
245
|
+
should be retried, unlike real RBAC 403 errors.
|
|
246
|
+
|
|
247
|
+
Args:
|
|
248
|
+
exception: The exception to check
|
|
249
|
+
|
|
250
|
+
Returns:
|
|
251
|
+
True if this is a CloudFlare 403 error that should be retried
|
|
252
|
+
"""
|
|
253
|
+
if not isinstance(exception, kubernetes.api_exception()):
|
|
254
|
+
return False
|
|
255
|
+
|
|
256
|
+
# Only check for 403 errors
|
|
257
|
+
if exception.status != 403:
|
|
258
|
+
return False
|
|
259
|
+
|
|
260
|
+
# Check for CloudFlare-specific headers
|
|
261
|
+
headers = exception.headers if hasattr(exception, 'headers') else {}
|
|
262
|
+
if not headers:
|
|
263
|
+
return False
|
|
264
|
+
|
|
265
|
+
# CloudFlare errors have CF-RAY header and/or Server: cloudflare
|
|
266
|
+
for k, v in headers.items():
|
|
267
|
+
if 'cf-ray' in k.lower():
|
|
268
|
+
return True
|
|
269
|
+
if 'server' in k.lower() and 'cloudflare' in str(v).lower():
|
|
270
|
+
return True
|
|
271
|
+
|
|
272
|
+
return False
|
|
273
|
+
|
|
274
|
+
|
|
241
275
|
def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
|
|
242
276
|
retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
|
|
243
277
|
resource_type: Optional[str] = None):
|
|
@@ -272,19 +306,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
|
|
|
272
306
|
kubernetes.api_exception(),
|
|
273
307
|
kubernetes.config_exception()) as e:
|
|
274
308
|
last_exception = e
|
|
309
|
+
|
|
310
|
+
# Check if this is a CloudFlare transient 403 error
|
|
311
|
+
is_cloudflare_403 = _is_cloudflare_403_error(e)
|
|
312
|
+
|
|
275
313
|
# Don't retry on permanent errors like 401 (Unauthorized)
|
|
276
|
-
# or 403 (Forbidden)
|
|
314
|
+
# or 403 (Forbidden), unless it's a CloudFlare transient 403
|
|
277
315
|
if (isinstance(e, kubernetes.api_exception()) and
|
|
278
|
-
e.status in (401, 403)):
|
|
316
|
+
e.status in (401, 403) and not is_cloudflare_403):
|
|
279
317
|
# Raise KubeAPIUnreachableError exception so that the
|
|
280
318
|
# optimizer/provisioner can failover to other clouds.
|
|
281
319
|
raise exceptions.KubeAPIUnreachableError(
|
|
282
320
|
f'Kubernetes API error: {str(e)}') from e
|
|
283
321
|
if attempt < max_retries - 1:
|
|
284
322
|
sleep_time = backoff.current_backoff()
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
323
|
+
error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
|
|
324
|
+
logger.debug(
|
|
325
|
+
f'Kubernetes API call {func.__name__} '
|
|
326
|
+
f'failed with {error_type} {str(e)}. Retrying in '
|
|
327
|
+
f'{sleep_time:.1f}s...')
|
|
288
328
|
time.sleep(sleep_time)
|
|
289
329
|
continue
|
|
290
330
|
|
|
@@ -2738,7 +2778,8 @@ def merge_custom_metadata(
|
|
|
2738
2778
|
config_utils.merge_k8s_configs(original_metadata, custom_metadata)
|
|
2739
2779
|
|
|
2740
2780
|
|
|
2741
|
-
|
|
2781
|
+
@_retry_on_error(resource_type='runtimeclass')
|
|
2782
|
+
def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
|
|
2742
2783
|
"""Checks if the 'nvidia' RuntimeClass exists in the cluster"""
|
|
2743
2784
|
# Fetch the list of available RuntimeClasses
|
|
2744
2785
|
runtime_classes = kubernetes.node_api(context).list_runtime_class()
|
sky/serve/constants.py
CHANGED
|
@@ -76,6 +76,9 @@ CONTROLLER_AUTOSTOP = {
|
|
|
76
76
|
# A period of time to initialize your service. Any readiness probe failures
|
|
77
77
|
# during this period will be ignored.
|
|
78
78
|
DEFAULT_INITIAL_DELAY_SECONDS = 1200
|
|
79
|
+
# For pool, we shrink the initial delay to 300s to make the pool more
|
|
80
|
+
# responsive to the failure that setup command starts a long-running server.
|
|
81
|
+
DEFAULT_INITIAL_DELAY_SECONDS_POOL = 300
|
|
79
82
|
DEFAULT_MIN_REPLICAS = 1
|
|
80
83
|
|
|
81
84
|
# Default port range start for controller and load balancer. Ports will be
|
sky/serve/server/server.py
CHANGED
|
@@ -109,7 +109,7 @@ async def tail_logs(
|
|
|
109
109
|
task = executor.execute_request_in_coroutine(request_task)
|
|
110
110
|
# Cancel the coroutine after the request is done or client disconnects
|
|
111
111
|
background_tasks.add_task(task.cancel)
|
|
112
|
-
return stream_utils.
|
|
112
|
+
return stream_utils.stream_response_for_long_request(
|
|
113
113
|
request_id=request_task.request_id,
|
|
114
114
|
logs_path=request_task.log_path,
|
|
115
115
|
background_tasks=background_tasks,
|
sky/serve/service_spec.py
CHANGED
|
@@ -125,6 +125,12 @@ class SkyServiceSpec:
|
|
|
125
125
|
self.base_ondemand_fallback_replicas is not None and
|
|
126
126
|
self.base_ondemand_fallback_replicas > 0)
|
|
127
127
|
|
|
128
|
+
@staticmethod
|
|
129
|
+
def _get_initial_delay_seconds(pool: bool) -> int:
|
|
130
|
+
if pool:
|
|
131
|
+
return constants.DEFAULT_INITIAL_DELAY_SECONDS_POOL
|
|
132
|
+
return constants.DEFAULT_INITIAL_DELAY_SECONDS
|
|
133
|
+
|
|
128
134
|
@staticmethod
|
|
129
135
|
def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec':
|
|
130
136
|
common_utils.validate_schema(config, schemas.get_service_schema(),
|
|
@@ -153,7 +159,8 @@ class SkyServiceSpec:
|
|
|
153
159
|
'timeout_seconds', None)
|
|
154
160
|
readiness_headers = readiness_section.get('headers', None)
|
|
155
161
|
if initial_delay_seconds is None:
|
|
156
|
-
initial_delay_seconds =
|
|
162
|
+
initial_delay_seconds = SkyServiceSpec._get_initial_delay_seconds(
|
|
163
|
+
config.get('pool', False))
|
|
157
164
|
service_config['initial_delay_seconds'] = initial_delay_seconds
|
|
158
165
|
if readiness_timeout_seconds is None:
|
|
159
166
|
readiness_timeout_seconds = (
|
sky/server/requests/executor.py
CHANGED
|
@@ -349,32 +349,6 @@ def override_request_env_and_config(
|
|
|
349
349
|
os.environ.update(original_env)
|
|
350
350
|
|
|
351
351
|
|
|
352
|
-
def _get_current_output() -> Tuple[int, int]:
|
|
353
|
-
"""Get the current stdout and stderr file descriptors."""
|
|
354
|
-
return os.dup(sys.stdout.fileno()), os.dup(sys.stderr.fileno())
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
def _redirect_output(file: TextIO) -> None:
|
|
358
|
-
"""Redirect stdout and stderr to the log file."""
|
|
359
|
-
# Get the file descriptor from the file object
|
|
360
|
-
fd = file.fileno()
|
|
361
|
-
# Copy this fd to stdout and stderr
|
|
362
|
-
os.dup2(fd, sys.stdout.fileno())
|
|
363
|
-
os.dup2(fd, sys.stderr.fileno())
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
def _restore_output(original_stdout: Optional[int],
|
|
367
|
-
original_stderr: Optional[int]) -> None:
|
|
368
|
-
"""Restore stdout and stderr to their original file descriptors."""
|
|
369
|
-
if original_stdout is not None:
|
|
370
|
-
os.dup2(original_stdout, sys.stdout.fileno())
|
|
371
|
-
os.close(original_stdout)
|
|
372
|
-
|
|
373
|
-
if original_stderr is not None:
|
|
374
|
-
os.dup2(original_stderr, sys.stderr.fileno())
|
|
375
|
-
os.close(original_stderr)
|
|
376
|
-
|
|
377
|
-
|
|
378
352
|
def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
|
|
379
353
|
raise KeyboardInterrupt
|
|
380
354
|
|
|
@@ -402,6 +376,34 @@ def _request_execution_wrapper(request_id: str,
|
|
|
402
376
|
logger.info(f'Running request {request_id} with pid {pid}')
|
|
403
377
|
|
|
404
378
|
original_stdout = original_stderr = None
|
|
379
|
+
|
|
380
|
+
def _save_current_output() -> None:
|
|
381
|
+
"""Save the current stdout and stderr file descriptors."""
|
|
382
|
+
nonlocal original_stdout, original_stderr
|
|
383
|
+
original_stdout = os.dup(sys.stdout.fileno())
|
|
384
|
+
original_stderr = os.dup(sys.stderr.fileno())
|
|
385
|
+
|
|
386
|
+
def _redirect_output(file: TextIO) -> None:
|
|
387
|
+
"""Redirect stdout and stderr to the log file."""
|
|
388
|
+
# Get the file descriptor from the file object
|
|
389
|
+
fd = file.fileno()
|
|
390
|
+
# Copy this fd to stdout and stderr
|
|
391
|
+
os.dup2(fd, sys.stdout.fileno())
|
|
392
|
+
os.dup2(fd, sys.stderr.fileno())
|
|
393
|
+
|
|
394
|
+
def _restore_output() -> None:
|
|
395
|
+
"""Restore stdout and stderr to their original file descriptors."""
|
|
396
|
+
nonlocal original_stdout, original_stderr
|
|
397
|
+
if original_stdout is not None:
|
|
398
|
+
os.dup2(original_stdout, sys.stdout.fileno())
|
|
399
|
+
os.close(original_stdout)
|
|
400
|
+
original_stdout = None
|
|
401
|
+
|
|
402
|
+
if original_stderr is not None:
|
|
403
|
+
os.dup2(original_stderr, sys.stderr.fileno())
|
|
404
|
+
os.close(original_stderr)
|
|
405
|
+
original_stderr = None
|
|
406
|
+
|
|
405
407
|
try:
|
|
406
408
|
# As soon as the request is updated with the executor PID, we can
|
|
407
409
|
# receive SIGTERM from cancellation. So, we update the request inside
|
|
@@ -422,7 +424,7 @@ def _request_execution_wrapper(request_id: str,
|
|
|
422
424
|
# Store copies of the original stdout and stderr file descriptors
|
|
423
425
|
# We do this in two steps because we should make sure to restore the
|
|
424
426
|
# original values even if we are cancelled or fail during the redirect.
|
|
425
|
-
|
|
427
|
+
_save_current_output()
|
|
426
428
|
|
|
427
429
|
# Append to the log file instead of overwriting it since there might be
|
|
428
430
|
# logs from previous retries.
|
|
@@ -464,15 +466,14 @@ def _request_execution_wrapper(request_id: str,
|
|
|
464
466
|
# clear the pid of the request.
|
|
465
467
|
request_task.pid = None
|
|
466
468
|
# Yield control to the scheduler for uniform handling of retries.
|
|
467
|
-
_restore_output(original_stdout, original_stderr)
|
|
469
|
+
_restore_output()
|
|
468
470
|
raise
|
|
469
471
|
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
|
470
472
|
api_requests.set_request_failed(request_id, e)
|
|
471
473
|
# Manually reset the original stdout and stderr file descriptors early
|
|
472
474
|
# so that the "Request xxxx failed due to ..." log message will be
|
|
473
475
|
# written to the original stdout and stderr file descriptors.
|
|
474
|
-
_restore_output(original_stdout, original_stderr)
|
|
475
|
-
original_stdout = original_stderr = None
|
|
476
|
+
_restore_output()
|
|
476
477
|
logger.info(f'Request {request_id} failed due to '
|
|
477
478
|
f'{common_utils.format_exception(e)}')
|
|
478
479
|
return
|
|
@@ -482,11 +483,10 @@ def _request_execution_wrapper(request_id: str,
|
|
|
482
483
|
# Manually reset the original stdout and stderr file descriptors early
|
|
483
484
|
# so that the "Request xxxx failed due to ..." log message will be
|
|
484
485
|
# written to the original stdout and stderr file descriptors.
|
|
485
|
-
_restore_output(original_stdout, original_stderr)
|
|
486
|
-
original_stdout = original_stderr = None
|
|
486
|
+
_restore_output()
|
|
487
487
|
logger.info(f'Request {request_id} finished')
|
|
488
488
|
finally:
|
|
489
|
-
_restore_output(original_stdout, original_stderr)
|
|
489
|
+
_restore_output()
|
|
490
490
|
try:
|
|
491
491
|
# Capture the peak RSS before GC.
|
|
492
492
|
peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
|
|
@@ -580,11 +580,11 @@ async def _execute_request_coroutine(request: api_requests.Request):
|
|
|
580
580
|
**request_body.to_kwargs())
|
|
581
581
|
|
|
582
582
|
async def poll_task(request_id: str) -> bool:
|
|
583
|
-
|
|
584
|
-
if
|
|
583
|
+
req_status = await api_requests.get_request_status_async(request_id)
|
|
584
|
+
if req_status is None:
|
|
585
585
|
raise RuntimeError('Request not found')
|
|
586
586
|
|
|
587
|
-
if
|
|
587
|
+
if req_status.status == api_requests.RequestStatus.CANCELLED:
|
|
588
588
|
ctx.cancel()
|
|
589
589
|
return True
|
|
590
590
|
|
sky/server/server.py
CHANGED
|
@@ -1243,7 +1243,7 @@ async def logs(
|
|
|
1243
1243
|
background_tasks.add_task(task.cancel)
|
|
1244
1244
|
# TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
|
|
1245
1245
|
# the same approach as /stream.
|
|
1246
|
-
return stream_utils.
|
|
1246
|
+
return stream_utils.stream_response_for_long_request(
|
|
1247
1247
|
request_id=request.state.request_id,
|
|
1248
1248
|
logs_path=request_task.log_path,
|
|
1249
1249
|
background_tasks=background_tasks,
|
|
@@ -1539,6 +1539,7 @@ async def stream(
|
|
|
1539
1539
|
'X-Accel-Buffering': 'no'
|
|
1540
1540
|
})
|
|
1541
1541
|
|
|
1542
|
+
polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
|
|
1542
1543
|
# Original plain text streaming logic
|
|
1543
1544
|
if request_id is not None:
|
|
1544
1545
|
request_task = await requests_lib.get_request_async(request_id)
|
|
@@ -1553,6 +1554,8 @@ async def stream(
|
|
|
1553
1554
|
raise fastapi.HTTPException(
|
|
1554
1555
|
status_code=404,
|
|
1555
1556
|
detail=f'Log of request {request_id!r} has been deleted')
|
|
1557
|
+
if request_task.schedule_type == requests_lib.ScheduleType.LONG:
|
|
1558
|
+
polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
|
|
1556
1559
|
else:
|
|
1557
1560
|
assert log_path is not None, (request_id, log_path)
|
|
1558
1561
|
if log_path == constants.API_SERVER_LOGS:
|
|
@@ -1600,7 +1603,8 @@ async def stream(
|
|
|
1600
1603
|
log_path_to_stream,
|
|
1601
1604
|
plain_logs=format == 'plain',
|
|
1602
1605
|
tail=tail,
|
|
1603
|
-
follow=follow),
|
|
1606
|
+
follow=follow,
|
|
1607
|
+
polling_interval=polling_interval),
|
|
1604
1608
|
media_type='text/plain',
|
|
1605
1609
|
headers=headers,
|
|
1606
1610
|
)
|