skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh
ADDED
@@ -0,0 +1,379 @@
+#!/bin/bash
+# ssh-tunnel.sh - SSH tunnel script for Kubernetes API access
+# Used as kubectl exec credential plugin to establish SSH tunnel on demand.
+# Returns a valid credential format for kubectl with expiration. The expiration
+# is calculated based on the TTL argument and is required to force kubectl to
+# check the tunnel status frequently.
+
+# Usage: ssh-tunnel.sh --host HOST [--user USER] [--use-ssh-config] [--ssh-key KEY] [--context CONTEXT] [--port PORT] [--ttl SECONDS]
+
+# Default time-to-live for credential in seconds
+# This forces kubectl to check the tunnel status frequently
+TTL_SECONDS=30
+
+# Parse arguments
+USE_SSH_CONFIG=0
+SSH_KEY=""
+CONTEXT=""
+HOST=""
+USER=""
+PORT=6443  # Default port if not specified
+
+# Debug log to ~/.sky/ssh_node_pools_info/$CONTEXT-tunnel.log
+debug_log() {
+  local message="$(date): $1"
+  echo "$message" >> "$LOG_FILE"
+}
+
+# Generate expiration timestamp for credential
+generate_expiration_timestamp() {
+  # Try macOS date format first, fallback to Linux format
+  date -u -v+${TTL_SECONDS}S +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -d "+${TTL_SECONDS} seconds" +"%Y-%m-%dT%H:%M:%SZ"
+}
+
+# Acquire the lock, return 0 if successful, 1 if another process is already holding the lock
+acquire_lock() {
+  # Check for flock command
+  if ! command -v flock >/dev/null 2>&1; then
+    debug_log "flock command not available, using alternative lock mechanism"
+    # Simple file-based locking
+    if [ -f "$LOCK_FILE" ]; then
+      lock_pid=$(cat "$LOCK_FILE" 2>/dev/null)
+      if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
+        debug_log "Another process ($lock_pid) is starting the tunnel, waiting briefly"
+        return 1
+      else
+        # Stale lock file
+        debug_log "Removing stale lock file"
+        rm -f "$LOCK_FILE"
+      fi
+    fi
+    # Create our lock
+    echo $$ > "$LOCK_FILE"
+    return 0
+  else
+    # Use flock for better locking
+    exec 9>"$LOCK_FILE"
+    if ! flock -n 9; then
+      debug_log "Another process is starting the tunnel, waiting briefly"
+      return 1
+    fi
+    return 0
+  fi
+}
+
+# Release the lock
+release_lock() {
+  if command -v flock >/dev/null 2>&1; then
+    # Using flock
+    exec 9>&-  # Close file descriptor to release lock
+  else
+    # Using simple lock
+    rm -f "$LOCK_FILE"
+  fi
+  debug_log "Lock released"
+}
+
+# Generate SSH command based on available tools and parameters
+generate_ssh_command() {
+  # Check for autossh
+  if ! command -v autossh >/dev/null 2>&1; then
+    debug_log "WARNING: autossh is not installed but recommended for reliable SSH tunnels"
+    debug_log "Install autossh: brew install autossh (macOS), apt-get install autossh (Ubuntu/Debian)"
+
+    # Fall back to regular ssh
+    if [[ $USE_SSH_CONFIG -eq 1 ]]; then
+      SSH_CMD=("ssh" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
+    else
+      SSH_CMD=("ssh" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
+
+      # Add SSH key if provided
+      if [[ -n "$SSH_KEY" ]]; then
+        SSH_CMD+=("-i" "$SSH_KEY")
+      fi
+
+      # Add user@host
+      SSH_CMD+=("$USER@$HOST")
+    fi
+  else
+    # Configure autossh
+    if [[ $USE_SSH_CONFIG -eq 1 ]]; then
+      SSH_CMD=("autossh" "-M" "0" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
+    else
+      SSH_CMD=("autossh" "-M" "0" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
+
+      # Add SSH key if provided
+      if [[ -n "$SSH_KEY" ]]; then
+        SSH_CMD+=("-i" "$SSH_KEY")
+      fi
+
+      # Add user@host
+      SSH_CMD+=("$USER@$HOST")
+    fi
+  fi
+}
+
+# Function to read certificate files if they exist
+read_certificate_data() {
+  local client_cert_file="$TUNNEL_DIR/$CONTEXT-cert.pem"
+  local client_key_file="$TUNNEL_DIR/$CONTEXT-key.pem"
+  local cert_data=""
+  local key_data=""
+
+  if [[ -f "$client_cert_file" ]]; then
+    # Read the certificate file as is - it's already in PEM format
+    cert_data=$(cat "$client_cert_file")
+    debug_log "Found client certificate data for context $CONTEXT"
+
+    # Log the first and last few characters to verify PEM format
+    local cert_start=$(head -1 "$client_cert_file")
+    local cert_end=$(tail -1 "$client_cert_file")
+    debug_log "Certificate starts with: $cert_start"
+    debug_log "Certificate ends with: $cert_end"
+
+    # Check if it has proper PEM format
+    if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file" || ! grep -q "END CERTIFICATE" "$client_cert_file"; then
+      debug_log "WARNING: Certificate file may not be in proper PEM format"
+      # Try to fix it if needed
+      if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file"; then
+        echo "-----BEGIN CERTIFICATE-----" > "$client_cert_file.fixed"
+        cat "$client_cert_file" >> "$client_cert_file.fixed"
+        echo "-----END CERTIFICATE-----" >> "$client_cert_file.fixed"
+        mv "$client_cert_file.fixed" "$client_cert_file"
+        cert_data=$(cat "$client_cert_file")
+        debug_log "Fixed certificate format by adding BEGIN/END markers"
+      fi
+    fi
+  fi
+
+  if [[ -f "$client_key_file" ]]; then
+    # Read the key file as is - it's already in PEM format
+    key_data=$(cat "$client_key_file")
+    debug_log "Found client key data for context $CONTEXT"
+
+    # Log the first and last few characters to verify PEM format
+    local key_start=$(head -1 "$client_key_file")
+    local key_end=$(tail -1 "$client_key_file")
+    debug_log "Key starts with: $key_start"
+    debug_log "Key ends with: $key_end"
+
+    # Check if it has proper PEM format
+    if ! grep -q "BEGIN" "$client_key_file" || ! grep -q "END" "$client_key_file"; then
+      debug_log "WARNING: Key file may not be in proper PEM format"
+      # Try to fix it if needed
+      if ! grep -q "BEGIN" "$client_key_file"; then
+        echo "-----BEGIN PRIVATE KEY-----" > "$client_key_file.fixed"
+        cat "$client_key_file" >> "$client_key_file.fixed"
+        echo "-----END PRIVATE KEY-----" >> "$client_key_file.fixed"
+        mv "$client_key_file.fixed" "$client_key_file"
+        key_data=$(cat "$client_key_file")
+        debug_log "Fixed key format by adding BEGIN/END markers"
+      fi
+    fi
+  fi
+
+  echo "$cert_data:$key_data"
+}
+
+# Function to generate credentials JSON
+generate_credentials_json() {
+  local expiration_time=$(generate_expiration_timestamp)
+  local cert_bundle=$(read_certificate_data)
+  local client_cert_data=${cert_bundle%:*}
+  local client_key_data=${cert_bundle#*:}
+
+  if [[ -n "$client_cert_data" && -n "$client_key_data" ]]; then
+    # Debug the certificate data
+    debug_log "Certificate data length: $(echo -n "$client_cert_data" | wc -c) bytes"
+    debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
+
+    # Check if we can create proper JSON with `jq`
+    if ! command -v jq &>/dev/null; then
+      echo "jq is not installed. Please install jq to use this script." >&2
+      exit 1
+    fi
+    debug_log "Using jq for JSON formatting"
+
+    # Create a temporary file for the JSON output to avoid shell escaping issues
+    local TEMP_JSON_FILE=$(mktemp)
+
+    # Write the JSON to the temporary file using jq for proper JSON formatting
+    cat > "$TEMP_JSON_FILE" << EOL
+{
+"apiVersion": "client.authentication.k8s.io/v1beta1",
+"kind": "ExecCredential",
+"status": {
+"clientCertificateData": $(printf '%s' "$client_cert_data" | jq -R -s .),
+"clientKeyData": $(printf '%s' "$client_key_data" | jq -R -s .),
+"expirationTimestamp": "$expiration_time"
+}
+}
+EOL
+
+    # Read the JSON from the file
+    local json_response=$(cat "$TEMP_JSON_FILE")
+
+    # Clean up
+    rm -f "$TEMP_JSON_FILE"
+
+    # Output the JSON
+    echo "$json_response"
+  else
+    # Fallback to token-based credential for tunnel-only authentication
+    echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
+  fi
+}
+
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --use-ssh-config)
+      USE_SSH_CONFIG=1
+      shift
+      ;;
+    --ssh-key)
+      SSH_KEY="$2"
+      shift 2
+      ;;
+    --context)
+      CONTEXT="$2"
+      shift 2
+      ;;
+    --port)
+      PORT="$2"
+      shift 2
+      ;;
+    --host)
+      HOST="$2"
+      shift 2
+      ;;
+    --user)
+      USER="$2"
+      shift 2
+      ;;
+    --ttl)
+      TTL_SECONDS="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown parameter: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Validate required parameters
+if [[ -z "$HOST" ]]; then
+  echo "Error: --host parameter is required" >&2
+  exit 1
+fi
+
+# Setup directories
+TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
+mkdir -p "$TUNNEL_DIR"
+
+# Get context name for PID file
+if [[ -z "$CONTEXT" ]]; then
+  CONTEXT="default"
+fi
+
+PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
+LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
+LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
+
+debug_log "Starting ssh-tunnel.sh for context $CONTEXT, host $HOST, port $PORT"
+debug_log "SSH Config: $USE_SSH_CONFIG, User: $USER, TTL: ${TTL_SECONDS}s"
+
+# Check if specified port is already in use (tunnel may be running)
+if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+  debug_log "Port $PORT already in use, checking if it's our tunnel"
+
+  # Check if there's a PID file and if that process is running
+  if [[ -f "$PID_FILE" ]]; then
+    OLD_PID=$(cat "$PID_FILE")
+    if kill -0 "$OLD_PID" 2>/dev/null; then
+      debug_log "Tunnel appears to be running with PID $OLD_PID"
+    else
+      debug_log "PID file exists but process $OLD_PID is not running"
+    fi
+  else
+    debug_log "Port $PORT is in use but no PID file exists"
+  fi
+
+  # Return valid credential format for kubectl with expiration
+  generate_credentials_json
+  exit 0
+fi
+
+# Try to acquire the lock
+if ! acquire_lock; then
+  # Wait briefly for the tunnel to be established
+  for i in {1..10}; do
+    if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+      debug_log "Tunnel is now active"
+
+      # Return valid credential format for kubectl with expiration
+      generate_credentials_json
+      exit 0
+    fi
+    sleep 0.2
+  done
+  debug_log "Waited for tunnel but port $PORT still not available"
+fi
+
+# Check if we have a PID file with running process
+if [[ -f "$PID_FILE" ]]; then
+  OLD_PID=$(cat "$PID_FILE")
+  if kill -0 "$OLD_PID" 2>/dev/null; then
+    # Process exists but port isn't open - something's wrong, kill it
+    kill "$OLD_PID" 2>/dev/null
+    debug_log "Killed stale tunnel process $OLD_PID"
+  else
+    debug_log "PID file exists but process $OLD_PID is not running anymore"
+  fi
+  # Remove the stale PID file
+  rm -f "$PID_FILE"
+fi
+
+# Generate the SSH command
+generate_ssh_command
+
+debug_log "Starting SSH tunnel: ${SSH_CMD[*]}"
+
+# Start the tunnel in the background and wait for it to establish
+"${SSH_CMD[@]}" >> "$LOG_FILE" 2>&1 &
+TUNNEL_PID=$!
+
+# Save PID
+echo $TUNNEL_PID > "$PID_FILE"
+debug_log "Tunnel started with PID $TUNNEL_PID"
+
+# Wait for tunnel to establish
+tunnel_up=0
+for i in {1..20}; do
+  if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+    debug_log "Tunnel established successfully on port $PORT"
+    tunnel_up=1
+    break
+  fi
+  sleep 0.2
+done
+
+# Clean up lock file
+release_lock
+
+# Check if the tunnel process is still running
+if ! kill -0 $TUNNEL_PID 2>/dev/null; then
+  debug_log "ERROR: Tunnel process exited unexpectedly! Check logs for details"
+  if [[ -f "$PID_FILE" ]]; then
+    rm -f "$PID_FILE"
+  fi
+  # Return error in case of tunnel failure
+  echo "Failed to establish SSH tunnel. See $TUNNEL_DIR/$CONTEXT-tunnel.log for details." >&2
+  exit 1
+elif [[ $tunnel_up -eq 0 ]]; then
+  debug_log "WARNING: Tunnel process is running but port $PORT is not responding"
+fi
+
+# Return valid credential format with certificates if available
+generate_credentials_json
+exit 0
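
The script's header comments describe its role as a kubectl exec credential plugin; a minimal sketch of wiring it into a kubeconfig may help, assuming an illustrative install path, host, user, and pool name (SkyPilot generates this configuration itself; none of these values come from this release):

# Hypothetical wiring of the credential plugin above into a kubeconfig.
kubectl config set-credentials sky-ssh-tunnel \
  --exec-api-version=client.authentication.k8s.io/v1beta1 \
  --exec-command="$HOME/.sky/ssh_node_pools_info/ssh-tunnel.sh" \
  --exec-arg=--host --exec-arg=my-node \
  --exec-arg=--user --exec-arg=ubuntu \
  --exec-arg=--context --exec-arg=my-pool \
  --exec-arg=--port --exec-arg=6443 \
  --exec-arg=--ttl --exec-arg=30
# Point the cluster at the tunnel's local endpoint so every kubectl call
# invokes the plugin, which starts or reuses the tunnel on demand:
kubectl config set-cluster my-pool --server=https://127.0.0.1:6443

With wiring like this, the short expirationTimestamp the script returns forces kubectl to re-run the plugin roughly every TTL seconds, which is what keeps the tunnel monitored.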
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -920,19 +920,17 @@ available_node_types:
 {{ ray_installation_commands }}
 
 # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
-# unset PYTHONPATH
-VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false
+# unset PYTHONPATH and set CWD to $HOME to avoid user image interfering with SkyPilot runtime.
+VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false {{sky_unset_pythonpath_and_set_cwd}} ~/.local/bin/uv pip install skypilot[kubernetes,remote]
 # Wait for `patch` package to be installed before applying ray patches
 until dpkg -l | grep -q "^ii  patch "; do
   sleep 0.1
   echo "Waiting for patch package to be installed..."
 done
 # Apply Ray patches for progress bar fix
-# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
-# unset PYTHONPATH in case the user provided docker image set it.
 # ~/.sky/python_path is seeded by conda_installation_commands
-VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false
-
+VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false {{sky_unset_pythonpath_and_set_cwd}} ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
+  {{sky_unset_pythonpath_and_set_cwd}} $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
 }
 touch /tmp/ray_skypilot_installation_complete
 echo "=== Ray and skypilot installation completed ==="
sky/templates/slurm-ray.yml.j2
CHANGED
@@ -20,10 +20,18 @@ provider:
 {% if slurm_proxy_command is not none %}
   proxycommand: {{slurm_proxy_command | tojson }}
 {% endif %}
+{% if slurm_proxy_jump is not none %}
+  proxyjump: {{slurm_proxy_jump | tojson }}
+{% endif %}
 
 auth:
   ssh_user: {{ssh_user}}
-  # TODO(jwj): Modify this tmp workaround.
+  # TODO(jwj,kevin): Modify this tmp workaround.
+  # Right now there's a chicken-and-egg problem:
+  # 1. ssh_credential_from_yaml reads from the auth.ssh_private_key: ~/.sky/clients/.../ssh/sky-key
+  # 2. This is SkyPilot's generated key, not the Slurm cluster's key
+  # 3. The internal_file_mounts stage tries to rsync using sky-key, but its public key isn't on the remote yet
+  # 4. The public key only gets added by setup_commands, which runs AFTER file_mounts
   # ssh_private_key: {{ssh_private_key}}
   ssh_private_key: {{slurm_private_key}}
   ssh_proxy_command: {{slurm_proxy_command | tojson }}
@@ -67,9 +75,31 @@ initialization_commands: []
 # Increment the following for catching performance bugs easier:
 # current num items (num SSH connections): 1
 setup_commands:
-  -
+  - |
+{%- for initial_setup_command in initial_setup_commands %}
     {{ initial_setup_command }}
 {%- endfor %}
+    # Generate host key for sshd -i if not exists
+    mkdir -p ~{{ssh_user}}/.ssh && chmod 700 ~{{ssh_user}}/.ssh
+    [ -f ~{{ssh_user}}/.ssh/{{slurm_sshd_host_key_filename}} ] || ssh-keygen -t ed25519 -f ~{{ssh_user}}/.ssh/{{slurm_sshd_host_key_filename}} -N "" -q
+    # Add public key to user's authorized_keys if not already present
+    grep -qF 'skypilot:ssh_public_key_content' ~{{ssh_user}}/.ssh/authorized_keys 2>/dev/null || cat >> ~{{ssh_user}}/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
+    skypilot:ssh_public_key_content
+    SKYPILOT_SSH_KEY_EOF
+    chmod 600 ~{{ssh_user}}/.ssh/authorized_keys
+
+    mkdir -p ~{{ssh_user}}/.sky
+    cat > ~{{ssh_user}}/.sky_ssh_rc <<'SKYPILOT_SSH_RC'
+    # Added by SkyPilot: override HOME for Slurm interactive sessions
+    if [ -n "${{slurm_cluster_name_env_var}}" ]; then
+      CLUSTER_DIR=~/.sky_clusters/${{slurm_cluster_name_env_var}}
+      if [ -d "$CLUSTER_DIR" ]; then
+        cd "$CLUSTER_DIR"
+        export HOME=$(pwd)
+      fi
+    fi
+    SKYPILOT_SSH_RC
+    grep -q "source ~/.sky_ssh_rc" ~{{ssh_user}}/.bashrc 2>/dev/null || (echo "" >> ~{{ssh_user}}/.bashrc && echo "source ~/.sky_ssh_rc" >> ~{{ssh_user}}/.bashrc)
 {{ setup_sky_dirs_commands }}
 {{ conda_installation_commands }}
 {{ skypilot_wheel_installation_commands }}
sky/templates/websocket_proxy.py
CHANGED
@@ -9,13 +9,11 @@
 This script is useful for users who do not have local Kubernetes credentials.
 """
 import asyncio
-from http.cookiejar import MozillaCookieJar
 import os
 import struct
 import sys
 import time
 from typing import Dict, Optional
-from urllib.request import Request
 
 import requests
 import websockets
@@ -24,46 +22,19 @@ from websockets.asyncio.client import connect
 
 from sky import exceptions
 from sky.client import service_account_auth
+from sky.server import common as server_common
 from sky.server import constants
-from sky.server.server import KubernetesSSHMessageType
+from sky.server.server import SSHMessageType
 from sky.skylet import constants as skylet_constants
 
 BUFFER_SIZE = 2**16  # 64KB
 HEARTBEAT_INTERVAL_SECONDS = 10
-
-# Environment variable for a file path to the API cookie file.
-# Keep in sync with server/constants.py
-API_COOKIE_FILE_ENV_VAR = 'SKYPILOT_API_COOKIE_FILE'
-# Default file if unset.
-# Keep in sync with server/constants.py
-API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
-
 MAX_UNANSWERED_PINGS = 100
 
 
-def _get_cookie_header(url: str) -> Dict[str, str]:
-    """Extract Cookie header value from a cookie jar for a specific URL"""
-    cookie_path = os.environ.get(API_COOKIE_FILE_ENV_VAR)
-    if cookie_path is None:
-        cookie_path = API_COOKIE_FILE_DEFAULT_LOCATION
-    cookie_path = os.path.expanduser(cookie_path)
-    if not os.path.exists(cookie_path):
-        return {}
-
-    request = Request(url)
-    cookie_jar = MozillaCookieJar(os.path.expanduser(cookie_path))
-    cookie_jar.load(ignore_discard=True, ignore_expires=True)
-    cookie_jar.add_cookie_header(request)
-    cookie_header = request.get_header('Cookie')
-    # if cookie file is empty, return empty dict
-    if cookie_header is None:
-        return {}
-    return {'Cookie': cookie_header}
-
-
 async def main(url: str, timestamps_supported: bool, login_url: str) -> None:
     headers = {}
-    headers.update(_get_cookie_header(url))
+    headers.update(server_common.get_cookie_header_for_url(url))
     headers.update(service_account_auth.get_service_account_headers())
     try:
         async with connect(url, ping_interval=None,
@@ -142,8 +113,9 @@ async def latency_monitor(websocket: ClientConnection,
             ping_time = time.time()
             next_id += 1
             last_ping_time_dict[next_id] = ping_time
-            message_header_bytes = struct.pack(
-                '!BI', KubernetesSSHMessageType.PINGPONG.value, next_id)
+            message_header_bytes = struct.pack('!BI',
+                                               SSHMessageType.PINGPONG.value,
+                                               next_id)
             try:
                 async with websocket_lock:
                     await websocket.send(message_header_bytes)
@@ -176,7 +148,7 @@ async def stdin_to_websocket(reader: asyncio.StreamReader,
         if timestamps_supported:
             # Send message with type 0 to indicate data.
             message_type_bytes = struct.pack(
-                '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+                '!B', SSHMessageType.REGULAR_DATA.value)
             data = message_type_bytes + data
         async with websocket_lock:
             await websocket.send(data)
@@ -201,10 +173,10 @@ async def websocket_to_stdout(websocket: ClientConnection,
             if (timestamps_supported and len(message) > 0 and
                     last_ping_time_dict is not None):
                 message_type = struct.unpack('!B', message[:1])[0]
-                if message_type == KubernetesSSHMessageType.REGULAR_DATA.value:
+                if message_type == SSHMessageType.REGULAR_DATA.value:
                     # Regular data - strip type byte and write to stdout
                     message = message[1:]
-                elif message_type == KubernetesSSHMessageType.PINGPONG.value:
+                elif message_type == SSHMessageType.PINGPONG.value:
                     # PONG response - calculate latency and send measurement
                     if not len(message) == struct.calcsize('!BI'):
                         raise ValueError(
@@ -222,8 +194,7 @@ async def websocket_to_stdout(websocket: ClientConnection,
 
                     # Send latency measurement (type 2)
                     message_type_bytes = struct.pack(
-                        '!B',
-                        KubernetesSSHMessageType.LATENCY_MEASUREMENT.value)
+                        '!B', SSHMessageType.LATENCY_MEASUREMENT.value)
                     latency_bytes = struct.pack('!Q', latency_ms)
                     message = message_type_bytes + latency_bytes
                     # Send to server.
@@ -255,7 +226,7 @@ if __name__ == '__main__':
     # TODO(aylei): remove the separate /api/health call and use the header
     # during websocket handshake to determine the server version.
     health_url = f'{server_url}/api/health'
-    cookie_hdr = _get_cookie_header(health_url)
+    cookie_hdr = server_common.get_cookie_header_for_url(health_url)
     health_response = requests.get(health_url, headers=cookie_hdr)
     health_data = health_response.json()
     timestamps_are_supported = int(health_data.get('api_version', 0)) > 21
@@ -272,7 +243,13 @@ if __name__ == '__main__':
     client_version_str = (f'&client_version={constants.API_VERSION}'
                           if timestamps_are_supported else '')
 
-    websocket_url = (f'{server_url}/kubernetes-pod-ssh-proxy'
+    # For backwards compatibility, fallback to kubernetes-pod-ssh-proxy if
+    # no endpoint is provided.
+    endpoint = sys.argv[3] if len(sys.argv) > 3 else 'kubernetes-pod-ssh-proxy'
+    # Worker index for Slurm.
+    worker_idx = sys.argv[4] if len(sys.argv) > 4 else '0'
+    websocket_url = (f'{server_url}/{endpoint}'
                      f'?cluster_name={sys.argv[2]}'
+                     f'&worker={worker_idx}'
                      f'{client_version_str}')
     asyncio.run(main(websocket_url, timestamps_are_supported, _login_url))
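
Given the new argv handling, a hedged invocation sketch; the server URL, cluster name, and the Slurm endpoint name below are placeholders, and argv[1] is assumed to be the API server URL as in the prior version of the script:

# Old-style call: endpoint defaults to kubernetes-pod-ssh-proxy, worker 0.
python websocket_proxy.py https://api.example.com my-cluster
# New-style call: explicit proxy endpoint plus a Slurm worker index.
python websocket_proxy.py https://api.example.com my-cluster some-ssh-proxy-endpoint 1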