skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/ssh-tunnel.sh
CHANGED

@@ -1,379 +1,10 @@
 #!/bin/bash
-# ssh-tunnel.sh - SSH tunnel script for Kubernetes API access
-# Used as kubectl exec credential plugin to establish SSH tunnel on demand.
-# Returns a valid credential format for kubectl with expiration. The expiration
-# is calculated based on the TTL argument and is required to force kubectl to
-# check the tunnel status frequently.

-#
+# This redirect stub is needed because we use this script in the
+# exec auth section when creating our kubeconfig. Therefore, node pools
+# launched in older versions of SkyPilot will have kubeconfigs pointing
+# to this path.

-#
-
-
-
-# Parse arguments
-USE_SSH_CONFIG=0
-SSH_KEY=""
-CONTEXT=""
-HOST=""
-USER=""
-PORT=6443 # Default port if not specified
-
-# Debug log to ~/.sky/ssh_node_pools_info/$CONTEXT-tunnel.log
-debug_log() {
-  local message="$(date): $1"
-  echo "$message" >> "$LOG_FILE"
-}
-
-# Generate expiration timestamp for credential
-generate_expiration_timestamp() {
-  # Try macOS date format first, fallback to Linux format
-  date -u -v+${TTL_SECONDS}S +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -d "+${TTL_SECONDS} seconds" +"%Y-%m-%dT%H:%M:%SZ"
-}
-
-# Acquire the lock, return 0 if successful, 1 if another process is already holding the lock
-acquire_lock() {
-  # Check for flock command
-  if ! command -v flock >/dev/null 2>&1; then
-    debug_log "flock command not available, using alternative lock mechanism"
-    # Simple file-based locking
-    if [ -f "$LOCK_FILE" ]; then
-      lock_pid=$(cat "$LOCK_FILE" 2>/dev/null)
-      if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
-        debug_log "Another process ($lock_pid) is starting the tunnel, waiting briefly"
-        return 1
-      else
-        # Stale lock file
-        debug_log "Removing stale lock file"
-        rm -f "$LOCK_FILE"
-      fi
-    fi
-    # Create our lock
-    echo $$ > "$LOCK_FILE"
-    return 0
-  else
-    # Use flock for better locking
-    exec 9>"$LOCK_FILE"
-    if ! flock -n 9; then
-      debug_log "Another process is starting the tunnel, waiting briefly"
-      return 1
-    fi
-    return 0
-  fi
-}
-
-# Release the lock
-release_lock() {
-  if command -v flock >/dev/null 2>&1; then
-    # Using flock
-    exec 9>&- # Close file descriptor to release lock
-  else
-    # Using simple lock
-    rm -f "$LOCK_FILE"
-  fi
-  debug_log "Lock released"
-}
-
-# Generate SSH command based on available tools and parameters
-generate_ssh_command() {
-  # Check for autossh
-  if ! command -v autossh >/dev/null 2>&1; then
-    debug_log "WARNING: autossh is not installed but recommended for reliable SSH tunnels"
-    debug_log "Install autossh: brew install autossh (macOS), apt-get install autossh (Ubuntu/Debian)"
-
-    # Fall back to regular ssh
-    if [[ $USE_SSH_CONFIG -eq 1 ]]; then
-      SSH_CMD=("ssh" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
-    else
-      SSH_CMD=("ssh" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
-
-      # Add SSH key if provided
-      if [[ -n "$SSH_KEY" ]]; then
-        SSH_CMD+=("-i" "$SSH_KEY")
-      fi
-
-      # Add user@host
-      SSH_CMD+=("$USER@$HOST")
-    fi
-  else
-    # Configure autossh
-    if [[ $USE_SSH_CONFIG -eq 1 ]]; then
-      SSH_CMD=("autossh" "-M" "0" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
-    else
-      SSH_CMD=("autossh" "-M" "0" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
-
-      # Add SSH key if provided
-      if [[ -n "$SSH_KEY" ]]; then
-        SSH_CMD+=("-i" "$SSH_KEY")
-      fi
-
-      # Add user@host
-      SSH_CMD+=("$USER@$HOST")
-    fi
-  fi
-}
-
-# Function to read certificate files if they exist
-read_certificate_data() {
-  local client_cert_file="$TUNNEL_DIR/$CONTEXT-cert.pem"
-  local client_key_file="$TUNNEL_DIR/$CONTEXT-key.pem"
-  local cert_data=""
-  local key_data=""
-
-  if [[ -f "$client_cert_file" ]]; then
-    # Read the certificate file as is - it's already in PEM format
-    cert_data=$(cat "$client_cert_file")
-    debug_log "Found client certificate data for context $CONTEXT"
-
-    # Log the first and last few characters to verify PEM format
-    local cert_start=$(head -1 "$client_cert_file")
-    local cert_end=$(tail -1 "$client_cert_file")
-    debug_log "Certificate starts with: $cert_start"
-    debug_log "Certificate ends with: $cert_end"
-
-    # Check if it has proper PEM format
-    if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file" || ! grep -q "END CERTIFICATE" "$client_cert_file"; then
-      debug_log "WARNING: Certificate file may not be in proper PEM format"
-      # Try to fix it if needed
-      if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file"; then
-        echo "-----BEGIN CERTIFICATE-----" > "$client_cert_file.fixed"
-        cat "$client_cert_file" >> "$client_cert_file.fixed"
-        echo "-----END CERTIFICATE-----" >> "$client_cert_file.fixed"
-        mv "$client_cert_file.fixed" "$client_cert_file"
-        cert_data=$(cat "$client_cert_file")
-        debug_log "Fixed certificate format by adding BEGIN/END markers"
-      fi
-    fi
-  fi
-
-  if [[ -f "$client_key_file" ]]; then
-    # Read the key file as is - it's already in PEM format
-    key_data=$(cat "$client_key_file")
-    debug_log "Found client key data for context $CONTEXT"
-
-    # Log the first and last few characters to verify PEM format
-    local key_start=$(head -1 "$client_key_file")
-    local key_end=$(tail -1 "$client_key_file")
-    debug_log "Key starts with: $key_start"
-    debug_log "Key ends with: $key_end"
-
-    # Check if it has proper PEM format
-    if ! grep -q "BEGIN" "$client_key_file" || ! grep -q "END" "$client_key_file"; then
-      debug_log "WARNING: Key file may not be in proper PEM format"
-      # Try to fix it if needed
-      if ! grep -q "BEGIN" "$client_key_file"; then
-        echo "-----BEGIN PRIVATE KEY-----" > "$client_key_file.fixed"
-        cat "$client_key_file" >> "$client_key_file.fixed"
-        echo "-----END PRIVATE KEY-----" >> "$client_key_file.fixed"
-        mv "$client_key_file.fixed" "$client_key_file"
-        key_data=$(cat "$client_key_file")
-        debug_log "Fixed key format by adding BEGIN/END markers"
-      fi
-    fi
-  fi
-
-  echo "$cert_data:$key_data"
-}
-
-# Function to generate credentials JSON
-generate_credentials_json() {
-  local expiration_time=$(generate_expiration_timestamp)
-  local cert_bundle=$(read_certificate_data)
-  local client_cert_data=${cert_bundle%:*}
-  local client_key_data=${cert_bundle#*:}
-
-  if [[ -n "$client_cert_data" && -n "$client_key_data" ]]; then
-    # Debug the certificate data
-    debug_log "Certificate data length: $(echo -n "$client_cert_data" | wc -c) bytes"
-    debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
-
-    # Check if we can create proper JSON with `jq`
-    if ! command -v jq &>/dev/null; then
-      echo "jq is not installed. Please install jq to use this script." >&2
-      exit 1
-    fi
-    debug_log "Using jq for JSON formatting"
-
-    # Create a temporary file for the JSON output to avoid shell escaping issues
-    local TEMP_JSON_FILE=$(mktemp)
-
-    # Write the JSON to the temporary file using jq for proper JSON formatting
-    cat > "$TEMP_JSON_FILE" << EOL
-{
-  "apiVersion": "client.authentication.k8s.io/v1beta1",
-  "kind": "ExecCredential",
-  "status": {
-    "clientCertificateData": $(printf '%s' "$client_cert_data" | jq -R -s .),
-    "clientKeyData": $(printf '%s' "$client_key_data" | jq -R -s .),
-    "expirationTimestamp": "$expiration_time"
-  }
-}
-EOL
-
-    # Read the JSON from the file
-    local json_response=$(cat "$TEMP_JSON_FILE")
-
-    # Clean up
-    rm -f "$TEMP_JSON_FILE"
-
-    # Output the JSON
-    echo "$json_response"
-  else
-    # Fallback to token-based credential for tunnel-only authentication
-    echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
-  fi
-}
-
-while [[ $# -gt 0 ]]; do
-  case $1 in
-    --use-ssh-config)
-      USE_SSH_CONFIG=1
-      shift
-      ;;
-    --ssh-key)
-      SSH_KEY="$2"
-      shift 2
-      ;;
-    --context)
-      CONTEXT="$2"
-      shift 2
-      ;;
-    --port)
-      PORT="$2"
-      shift 2
-      ;;
-    --host)
-      HOST="$2"
-      shift 2
-      ;;
-    --user)
-      USER="$2"
-      shift 2
-      ;;
-    --ttl)
-      TTL_SECONDS="$2"
-      shift 2
-      ;;
-    *)
-      echo "Unknown parameter: $1" >&2
-      exit 1
-      ;;
-  esac
-done
-
-# Validate required parameters
-if [[ -z "$HOST" ]]; then
-  echo "Error: --host parameter is required" >&2
-  exit 1
-fi
-
-# Setup directories
-TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
-mkdir -p "$TUNNEL_DIR"
-
-# Get context name for PID file
-if [[ -z "$CONTEXT" ]]; then
-  CONTEXT="default"
-fi
-
-PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
-LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
-LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
-
-debug_log "Starting ssh-tunnel.sh for context $CONTEXT, host $HOST, port $PORT"
-debug_log "SSH Config: $USE_SSH_CONFIG, User: $USER, TTL: ${TTL_SECONDS}s"
-
-# Check if specified port is already in use (tunnel may be running)
-if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
-  debug_log "Port $PORT already in use, checking if it's our tunnel"
-
-  # Check if there's a PID file and if that process is running
-  if [[ -f "$PID_FILE" ]]; then
-    OLD_PID=$(cat "$PID_FILE")
-    if kill -0 "$OLD_PID" 2>/dev/null; then
-      debug_log "Tunnel appears to be running with PID $OLD_PID"
-    else
-      debug_log "PID file exists but process $OLD_PID is not running"
-    fi
-  else
-    debug_log "Port $PORT is in use but no PID file exists"
-  fi
-
-  # Return valid credential format for kubectl with expiration
-  generate_credentials_json
-  exit 0
-fi
-
-# Try to acquire the lock
-if ! acquire_lock; then
-  # Wait briefly for the tunnel to be established
-  for i in {1..10}; do
-    if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
-      debug_log "Tunnel is now active"
-
-      # Return valid credential format for kubectl with expiration
-      generate_credentials_json
-      exit 0
-    fi
-    sleep 0.2
-  done
-  debug_log "Waited for tunnel but port $PORT still not available"
-fi
-
-# Check if we have a PID file with running process
-if [[ -f "$PID_FILE" ]]; then
-  OLD_PID=$(cat "$PID_FILE")
-  if kill -0 "$OLD_PID" 2>/dev/null; then
-    # Process exists but port isn't open - something's wrong, kill it
-    kill "$OLD_PID" 2>/dev/null
-    debug_log "Killed stale tunnel process $OLD_PID"
-  else
-    debug_log "PID file exists but process $OLD_PID is not running anymore"
-  fi
-  # Remove the stale PID file
-  rm -f "$PID_FILE"
-fi
-
-# Generate the SSH command
-generate_ssh_command
-
-debug_log "Starting SSH tunnel: ${SSH_CMD[*]}"
-
-# Start the tunnel in foreground and wait for it to establish
-"${SSH_CMD[@]}" >> "$LOG_FILE" 2>&1 &
-TUNNEL_PID=$!
-
-# Save PID
-echo $TUNNEL_PID > "$PID_FILE"
-debug_log "Tunnel started with PID $TUNNEL_PID"
-
-# Wait for tunnel to establish
-tunnel_up=0
-for i in {1..20}; do
-  if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
-    debug_log "Tunnel established successfully on port $PORT"
-    tunnel_up=1
-    break
-  fi
-  sleep 0.2
-done
-
-# Clean up lock file
-release_lock
-
-# Check if the tunnel process is still running
-if ! kill -0 $TUNNEL_PID 2>/dev/null; then
-  debug_log "ERROR: Tunnel process exited unexpectedly! Check logs for details"
-  if [[ -f "$PID_FILE" ]]; then
-    rm -f "$PID_FILE"
-  fi
-  # Return error in case of tunnel failure
-  echo "Failed to establish SSH tunnel. See $TUNNEL_DIR/$CONTEXT-tunnel.log for details." >&2
-  exit 1
-elif [[ $tunnel_up -eq 0 ]]; then
-  debug_log "WARNING: Tunnel process is running but port $PORT is not responding"
-fi
-
-# Return valid credential format with certificates if available
-generate_credentials_json
-exit 0
+# TODO (kyuds): remove this script after v0.13.0. Kept here for backwards compat.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+exec "$SCRIPT_DIR/../../ssh_node_pools/deploy/tunnel/ssh-tunnel.sh" "$@"
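The script removed above implemented the kubectl exec credential plugin protocol: kubectl runs the command named in the kubeconfig's exec stanza, reads an ExecCredential JSON object from its stdout, and caches it only until expirationTimestamp, which is what forces the plugin (and therefore the tunnel check) to run again. A minimal Python sketch of the token-fallback credential that generate_credentials_json emitted; build_exec_credential is a hypothetical helper for illustration, not part of SkyPilot:

    import datetime
    import json


    def build_exec_credential(ttl_seconds: int) -> str:
        """Hypothetical helper mirroring the script's token-fallback branch."""
        expiry = (datetime.datetime.now(datetime.timezone.utc) +
                  datetime.timedelta(seconds=ttl_seconds))
        return json.dumps({
            'apiVersion': 'client.authentication.k8s.io/v1beta1',
            'kind': 'ExecCredential',
            'status': {
                # Placeholder token: real authentication is the SSH tunnel
                # itself; the short expiry forces kubectl to re-run the
                # plugin, which re-checks (and if needed restarts) the tunnel.
                'token': 'k8s-ssh-tunnel-token',
                'expirationTimestamp': expiry.strftime('%Y-%m-%dT%H:%M:%SZ'),
            },
        })


    print(build_exec_credential(ttl_seconds=30))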
sky/utils/plugin_extensions/__init__.py
ADDED

@@ -0,0 +1,14 @@
+"""Plugin extensions module.
+
+This module provides extension points that plugins can hook into to provide
+custom functionality.
+"""
+from sky.utils.plugin_extensions.external_failure_source import (
+    ExternalClusterFailure)
+from sky.utils.plugin_extensions.external_failure_source import (
+    ExternalFailureSource)
+
+__all__ = [
+    'ExternalClusterFailure',
+    'ExternalFailureSource',
+]
sky/utils/plugin_extensions/external_failure_source.py
ADDED

@@ -0,0 +1,176 @@
+"""External failure source interface for plugins.
+
+This module provides an extension point that allows plugins to provide
+cluster failure tracking functionality. By default, no-op implementations
+are used. Plugins can register their own implementations to provide actual
+failure tracking.
+
+Example usage in a plugin:
+    from sky.utils.plugin_extensions import ExternalFailureSource
+
+    # Register custom failure source
+    ExternalFailureSource.register(
+        get_failures=my_get_cluster_failures,
+        clear_failures=my_clear_cluster_failures,
+    )
+
+Example usage in core SkyPilot:
+    from sky.utils.plugin_extensions import ExternalFailureSource
+
+    # Get failures for a cluster
+    failures = ExternalFailureSource.get(cluster_hash='abc123')
+
+    # Clear failures for a cluster
+    cleared = ExternalFailureSource.clear(cluster_name='my-cluster')
+"""
+import dataclasses
+from typing import Any, Dict, List, Optional, Protocol
+
+from sky import sky_logging
+
+logger = sky_logging.init_logger(__name__)
+
+
+@dataclasses.dataclass
+class ExternalClusterFailure:
+    """Represents a single cluster failure from an external source.
+
+    Attributes:
+        code: Machine-readable failure code (e.g. 'GPU_HARDWARE_FAILURE_XID_79')
+        reason: Human-readable description of the failure.
+    """
+    code: str
+    reason: str
+
+    @classmethod
+    def from_failure_list(
+            cls, failures: List[Dict[str,
+                                     Any]]) -> List['ExternalClusterFailure']:
+        """Create a list of ExternalClusterFailure from failure dicts.
+
+        Args:
+            failures: List of dicts with 'failure_mode' and 'failure_reason'
+                keys (as returned by ExternalFailureSource.get()).
+
+        Returns:
+            List of ExternalClusterFailure objects, one per failure.
+        """
+        return [
+            cls(code=f['failure_mode'], reason=f['failure_reason'])
+            for f in failures
+        ]
+
+
+# Protocol definitions for the failure source functions
+class GetClusterFailuresFunc(Protocol):
+    """Protocol for get_cluster_failures function."""
+
+    def __call__(self,
+                 cluster_hash: Optional[str] = None,
+                 cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+        ...
+
+
+class ClearClusterFailuresFunc(Protocol):
+    """Protocol for clear_cluster_failures function."""
+
+    def __call__(self,
+                 cluster_hash: Optional[str] = None,
+                 cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+        ...
+
+
+class ExternalFailureSource:
+    """Singleton class for external cluster failure source.
+
+    This class provides an extension point for plugins to register their own
+    cluster failure tracking implementations. By default, no-op implementations
+    are used that return empty lists.
+
+    Plugins can register their implementations during their install() phase,
+    and core SkyPilot code can use the get() and clear() methods to interact
+    with cluster failures without knowing which plugin (if any) is providing
+    the implementation.
+    """
+
+    _get_func: Optional[GetClusterFailuresFunc] = None
+    _clear_func: Optional[ClearClusterFailuresFunc] = None
+
+    @classmethod
+    def register(cls, get_failures: GetClusterFailuresFunc,
+                 clear_failures: ClearClusterFailuresFunc) -> None:
+        """Register an external failure source implementation.
+
+        This allows plugins to provide their own cluster failure tracking.
+        Only one external failure source can be registered at a time.
+
+        Args:
+            get_failures: Function to get active cluster failures.
+                Signature: (cluster_hash: Optional[str],
+                            cluster_name: Optional[str])
+                           -> List[Dict[str, Any]]
+                Returns list of dicts with keys: cluster_hash, failure_mode,
+                failure_reason, cleared_at.
+            clear_failures: Function to clear cluster failures.
+                Signature: (cluster_hash: Optional[str],
+                            cluster_name: Optional[str])
+                           -> List[Dict[str, Any]]
+                Returns list of dicts of the failures that were cleared.
+        """
+        cls._get_func = get_failures
+        cls._clear_func = clear_failures
+        logger.info('Registered external failure source')
+
+    @classmethod
+    def is_registered(cls) -> bool:
+        """Check if an external failure source is registered."""
+        return cls._get_func is not None and cls._clear_func is not None
+
+    @classmethod
+    def get(cls,
+            cluster_hash: Optional[str] = None,
+            cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+        """Get active cluster failures from the registered failure source.
+
+        Args:
+            cluster_hash: Hash of the cluster to query failures for.
+            cluster_name: Name of the cluster to query failures for.
+
+        Returns:
+            List of dictionaries containing failure records.
+            Each dict contains: cluster_hash, failure_mode, failure_reason,
+            cleared_at. Returns empty list if no failure source is registered.
+        """
+        if cls._get_func is None:
+            return []
+        try:
+            # pylint: disable=not-callable
+            return cls._get_func(cluster_name=cluster_name,
+                                 cluster_hash=cluster_hash)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(f'Failed to get cluster failures: {e}')
+            return []
+
+    @classmethod
+    def clear(cls,
+              cluster_hash: Optional[str] = None,
+              cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+        """Clear cluster failures via the registered failure source.
+
+        Args:
+            cluster_hash: Hash of the cluster to clear failures for.
+            cluster_name: Name of the cluster to clear failures for.
+
+        Returns:
+            List of dictionaries containing the failure records that were
+            cleared. Returns empty list if no failure source is registered.
+        """
+        if cls._clear_func is None:
+            return []
+        try:
+            # pylint: disable=not-callable
+            return cls._clear_func(cluster_name=cluster_name,
+                                   cluster_hash=cluster_hash)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(f'Failed to clear cluster failures: {e}')
+            return []
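The module above only defines the registry; to make the contract concrete, here is a hedged sketch of how a plugin might register a failure source during its install phase. The in-memory _FAILURES store and both functions are illustrative stand-ins for a plugin's real backend, not SkyPilot code:

    from typing import Any, Dict, List, Optional

    from sky.utils.plugin_extensions import ExternalFailureSource

    # Illustrative in-memory store; a real plugin would query its own backend.
    _FAILURES: List[Dict[str, Any]] = [{
        'cluster_hash': 'abc123',
        'failure_mode': 'GPU_HARDWARE_FAILURE_XID_79',
        'failure_reason': 'GPU fell off the bus',
        'cleared_at': None,
    }]


    def get_failures(cluster_hash: Optional[str] = None,
                     cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
        del cluster_name  # this toy store only keys on cluster_hash
        return [
            f for f in _FAILURES if f['cleared_at'] is None and
            (cluster_hash is None or f['cluster_hash'] == cluster_hash)
        ]


    def clear_failures(cluster_hash: Optional[str] = None,
                       cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
        cleared = get_failures(cluster_hash, cluster_name)
        for f in cleared:
            f['cleared_at'] = 'now'  # a real source would record a timestamp
        return cleared


    ExternalFailureSource.register(get_failures=get_failures,
                                   clear_failures=clear_failures)
    assert ExternalFailureSource.is_registered()
    # Core code can now query without knowing which plugin answered.
    print(ExternalFailureSource.get(cluster_hash='abc123'))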
sky/utils/resources_utils.py
CHANGED

@@ -183,7 +183,8 @@ def simplify_ports(ports: List[str]) -> List[str]:
 def format_resource(resource: 'resources_lib.Resources',
                     simplified_only: bool = False) -> Tuple[str, Optional[str]]:
     resource = resource.assert_launchable()
-    is_k8s =
+    is_k8s = resource.cloud.canonical_name() == 'kubernetes'
+    vcpu, mem = None, None
     if resource.accelerators is None or is_k8s or not simplified_only:
         vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
             resource.instance_type)
@@ -198,18 +199,19 @@ def format_resource(resource: 'resources_lib.Resources',

     if (resource.accelerators is None or is_k8s):
         if vcpu is not None:
-            elements_simple.append(f'cpus={
-            elements_full.append(f'cpus={
+            elements_simple.append(f'cpus={common_utils.format_float(vcpu)}')
+            elements_full.append(f'cpus={common_utils.format_float(vcpu)}')
         if mem is not None:
-            elements_simple.append(f'mem={
-            elements_full.append(f'mem={
+            elements_simple.append(f'mem={common_utils.format_float(mem)}')
+            elements_full.append(f'mem={common_utils.format_float(mem)}')
     elif not simplified_only:
         if vcpu is not None:
-            elements_full.append(f'cpus={
+            elements_full.append(f'cpus={common_utils.format_float(vcpu)}')
         if mem is not None:
-            elements_full.append(f'mem={
+            elements_full.append(f'mem={common_utils.format_float(mem)}')

-
+    is_slurm = resource.cloud.canonical_name() == 'slurm'
+    if not is_k8s and not is_slurm:
         instance_type_full = resource.instance_type
         instance_type_simple = common_utils.truncate_long_string(
             instance_type_full, 15)
sky/utils/rich_utils.py
CHANGED

@@ -362,14 +362,14 @@ def decode_rich_status(
             # Replace `\r\n` with `\n`, as printing a line ends with
             # `\r\n` in linux will cause the line to be empty.
             line = line[:-2] + '\n'
-        is_payload,
+        is_payload, decoded_line = message_utils.decode_payload(
             line, raise_for_mismatch=False)
-
-        if is_payload:
-            control, encoded_status = Control.decode(line)
-            if control is None:
+        if not is_payload:
             yield line
             continue
+        control, encoded_status = Control.decode(decoded_line)
+        if control is None:
+            continue

         if control == Control.RETRY:
             raise exceptions.RequestInterruptedError(
@@ -481,15 +481,13 @@ async def decode_rich_status_async(
             # Replace `\r\n` with `\n`, as printing a line ends with
             # `\r\n` in linux will cause the line to be empty.
             line = line[:-2] + '\n'
-        is_payload,
+        is_payload, decoded_line = message_utils.decode_payload(
             line, raise_for_mismatch=False)
-        if
+        if not is_payload:
+            yield line
             continue
-        control =
-        if is_payload:
-            control, encoded_status = Control.decode(line)
+        control, encoded_status = Control.decode(decoded_line)
         if control is None:
-            yield line
             continue

         if control == Control.RETRY: