skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in the supported public registries. It is provided for informational purposes only.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh
@@ -0,0 +1,379 @@
+ #!/bin/bash
+ # ssh-tunnel.sh - SSH tunnel script for Kubernetes API access
+ # Used as kubectl exec credential plugin to establish SSH tunnel on demand.
+ # Returns a valid credential format for kubectl with expiration. The expiration
+ # is calculated based on the TTL argument and is required to force kubectl to
+ # check the tunnel status frequently.
+
+ # Usage: ssh-tunnel.sh --host HOST [--user USER] [--use-ssh-config] [--ssh-key KEY] [--context CONTEXT] [--port PORT] [--ttl SECONDS]
+
+ # Default time-to-live for credential in seconds
+ # This forces kubectl to check the tunnel status frequently
+ TTL_SECONDS=30
+
+ # Parse arguments
+ USE_SSH_CONFIG=0
+ SSH_KEY=""
+ CONTEXT=""
+ HOST=""
+ USER=""
+ PORT=6443 # Default port if not specified
+
+ # Debug log to ~/.sky/ssh_node_pools_info/$CONTEXT-tunnel.log
+ debug_log() {
+ local message="$(date): $1"
+ echo "$message" >> "$LOG_FILE"
+ }
+
+ # Generate expiration timestamp for credential
+ generate_expiration_timestamp() {
+ # Try macOS date format first, fallback to Linux format
+ date -u -v+${TTL_SECONDS}S +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -d "+${TTL_SECONDS} seconds" +"%Y-%m-%dT%H:%M:%SZ"
+ }
+
+ # Acquire the lock, return 0 if successful, 1 if another process is already holding the lock
+ acquire_lock() {
+ # Check for flock command
+ if ! command -v flock >/dev/null 2>&1; then
+ debug_log "flock command not available, using alternative lock mechanism"
+ # Simple file-based locking
+ if [ -f "$LOCK_FILE" ]; then
+ lock_pid=$(cat "$LOCK_FILE" 2>/dev/null)
+ if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
+ debug_log "Another process ($lock_pid) is starting the tunnel, waiting briefly"
+ return 1
+ else
+ # Stale lock file
+ debug_log "Removing stale lock file"
+ rm -f "$LOCK_FILE"
+ fi
+ fi
+ # Create our lock
+ echo $$ > "$LOCK_FILE"
+ return 0
+ else
+ # Use flock for better locking
+ exec 9>"$LOCK_FILE"
+ if ! flock -n 9; then
+ debug_log "Another process is starting the tunnel, waiting briefly"
+ return 1
+ fi
+ return 0
+ fi
+ }
+
+ # Release the lock
+ release_lock() {
+ if command -v flock >/dev/null 2>&1; then
+ # Using flock
+ exec 9>&- # Close file descriptor to release lock
+ else
+ # Using simple lock
+ rm -f "$LOCK_FILE"
+ fi
+ debug_log "Lock released"
+ }
+
+ # Generate SSH command based on available tools and parameters
+ generate_ssh_command() {
+ # Check for autossh
+ if ! command -v autossh >/dev/null 2>&1; then
+ debug_log "WARNING: autossh is not installed but recommended for reliable SSH tunnels"
+ debug_log "Install autossh: brew install autossh (macOS), apt-get install autossh (Ubuntu/Debian)"
+
+ # Fall back to regular ssh
+ if [[ $USE_SSH_CONFIG -eq 1 ]]; then
+ SSH_CMD=("ssh" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
+ else
+ SSH_CMD=("ssh" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
+
+ # Add SSH key if provided
+ if [[ -n "$SSH_KEY" ]]; then
+ SSH_CMD+=("-i" "$SSH_KEY")
+ fi
+
+ # Add user@host
+ SSH_CMD+=("$USER@$HOST")
+ fi
+ else
+ # Configure autossh
+ if [[ $USE_SSH_CONFIG -eq 1 ]]; then
+ SSH_CMD=("autossh" "-M" "0" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
+ else
+ SSH_CMD=("autossh" "-M" "0" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
+
+ # Add SSH key if provided
+ if [[ -n "$SSH_KEY" ]]; then
+ SSH_CMD+=("-i" "$SSH_KEY")
+ fi
+
+ # Add user@host
+ SSH_CMD+=("$USER@$HOST")
+ fi
+ fi
+ }
+
+ # Function to read certificate files if they exist
+ read_certificate_data() {
+ local client_cert_file="$TUNNEL_DIR/$CONTEXT-cert.pem"
+ local client_key_file="$TUNNEL_DIR/$CONTEXT-key.pem"
+ local cert_data=""
+ local key_data=""
+
+ if [[ -f "$client_cert_file" ]]; then
+ # Read the certificate file as is - it's already in PEM format
+ cert_data=$(cat "$client_cert_file")
+ debug_log "Found client certificate data for context $CONTEXT"
+
+ # Log the first and last few characters to verify PEM format
+ local cert_start=$(head -1 "$client_cert_file")
+ local cert_end=$(tail -1 "$client_cert_file")
+ debug_log "Certificate starts with: $cert_start"
+ debug_log "Certificate ends with: $cert_end"
+
+ # Check if it has proper PEM format
+ if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file" || ! grep -q "END CERTIFICATE" "$client_cert_file"; then
+ debug_log "WARNING: Certificate file may not be in proper PEM format"
+ # Try to fix it if needed
+ if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file"; then
+ echo "-----BEGIN CERTIFICATE-----" > "$client_cert_file.fixed"
+ cat "$client_cert_file" >> "$client_cert_file.fixed"
+ echo "-----END CERTIFICATE-----" >> "$client_cert_file.fixed"
+ mv "$client_cert_file.fixed" "$client_cert_file"
+ cert_data=$(cat "$client_cert_file")
+ debug_log "Fixed certificate format by adding BEGIN/END markers"
+ fi
+ fi
+ fi
+
+ if [[ -f "$client_key_file" ]]; then
+ # Read the key file as is - it's already in PEM format
+ key_data=$(cat "$client_key_file")
+ debug_log "Found client key data for context $CONTEXT"
+
+ # Log the first and last few characters to verify PEM format
+ local key_start=$(head -1 "$client_key_file")
+ local key_end=$(tail -1 "$client_key_file")
+ debug_log "Key starts with: $key_start"
+ debug_log "Key ends with: $key_end"
+
+ # Check if it has proper PEM format
+ if ! grep -q "BEGIN" "$client_key_file" || ! grep -q "END" "$client_key_file"; then
+ debug_log "WARNING: Key file may not be in proper PEM format"
+ # Try to fix it if needed
+ if ! grep -q "BEGIN" "$client_key_file"; then
+ echo "-----BEGIN PRIVATE KEY-----" > "$client_key_file.fixed"
+ cat "$client_key_file" >> "$client_key_file.fixed"
+ echo "-----END PRIVATE KEY-----" >> "$client_key_file.fixed"
+ mv "$client_key_file.fixed" "$client_key_file"
+ key_data=$(cat "$client_key_file")
+ debug_log "Fixed key format by adding BEGIN/END markers"
+ fi
+ fi
+ fi
+
+ echo "$cert_data:$key_data"
+ }
+
+ # Function to generate credentials JSON
+ generate_credentials_json() {
+ local expiration_time=$(generate_expiration_timestamp)
+ local cert_bundle=$(read_certificate_data)
+ local client_cert_data=${cert_bundle%:*}
+ local client_key_data=${cert_bundle#*:}
+
+ if [[ -n "$client_cert_data" && -n "$client_key_data" ]]; then
+ # Debug the certificate data
+ debug_log "Certificate data length: $(echo -n "$client_cert_data" | wc -c) bytes"
+ debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
+
+ # Check if we can create proper JSON with `jq`
+ if ! command -v jq &>/dev/null; then
+ echo "jq is not installed. Please install jq to use this script." >&2
+ exit 1
+ fi
+ debug_log "Using jq for JSON formatting"
+
+ # Create a temporary file for the JSON output to avoid shell escaping issues
+ local TEMP_JSON_FILE=$(mktemp)
+
+ # Write the JSON to the temporary file using jq for proper JSON formatting
+ cat > "$TEMP_JSON_FILE" << EOL
+ {
+ "apiVersion": "client.authentication.k8s.io/v1beta1",
+ "kind": "ExecCredential",
+ "status": {
+ "clientCertificateData": $(printf '%s' "$client_cert_data" | jq -R -s .),
+ "clientKeyData": $(printf '%s' "$client_key_data" | jq -R -s .),
+ "expirationTimestamp": "$expiration_time"
+ }
+ }
+ EOL
+
+ # Read the JSON from the file
+ local json_response=$(cat "$TEMP_JSON_FILE")
+
+ # Clean up
+ rm -f "$TEMP_JSON_FILE"
+
+ # Output the JSON
+ echo "$json_response"
+ else
+ # Fallback to token-based credential for tunnel-only authentication
+ echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
+ fi
+ }
+
+ while [[ $# -gt 0 ]]; do
+ case $1 in
+ --use-ssh-config)
+ USE_SSH_CONFIG=1
+ shift
+ ;;
+ --ssh-key)
+ SSH_KEY="$2"
+ shift 2
+ ;;
+ --context)
+ CONTEXT="$2"
+ shift 2
+ ;;
+ --port)
+ PORT="$2"
+ shift 2
+ ;;
+ --host)
+ HOST="$2"
+ shift 2
+ ;;
+ --user)
+ USER="$2"
+ shift 2
+ ;;
+ --ttl)
+ TTL_SECONDS="$2"
+ shift 2
+ ;;
+ *)
+ echo "Unknown parameter: $1" >&2
+ exit 1
+ ;;
+ esac
+ done
+
+ # Validate required parameters
+ if [[ -z "$HOST" ]]; then
+ echo "Error: --host parameter is required" >&2
+ exit 1
+ fi
+
+ # Setup directories
+ TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
+ mkdir -p "$TUNNEL_DIR"
+
+ # Get context name for PID file
+ if [[ -z "$CONTEXT" ]]; then
+ CONTEXT="default"
+ fi
+
+ PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
+ LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
+ LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
+
+ debug_log "Starting ssh-tunnel.sh for context $CONTEXT, host $HOST, port $PORT"
+ debug_log "SSH Config: $USE_SSH_CONFIG, User: $USER, TTL: ${TTL_SECONDS}s"
+
+ # Check if specified port is already in use (tunnel may be running)
+ if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+ debug_log "Port $PORT already in use, checking if it's our tunnel"
+
+ # Check if there's a PID file and if that process is running
+ if [[ -f "$PID_FILE" ]]; then
+ OLD_PID=$(cat "$PID_FILE")
+ if kill -0 "$OLD_PID" 2>/dev/null; then
+ debug_log "Tunnel appears to be running with PID $OLD_PID"
+ else
+ debug_log "PID file exists but process $OLD_PID is not running"
+ fi
+ else
+ debug_log "Port $PORT is in use but no PID file exists"
+ fi
+
+ # Return valid credential format for kubectl with expiration
+ generate_credentials_json
+ exit 0
+ fi
+
+ # Try to acquire the lock
+ if ! acquire_lock; then
+ # Wait briefly for the tunnel to be established
+ for i in {1..10}; do
+ if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+ debug_log "Tunnel is now active"
+
+ # Return valid credential format for kubectl with expiration
+ generate_credentials_json
+ exit 0
+ fi
+ sleep 0.2
+ done
+ debug_log "Waited for tunnel but port $PORT still not available"
+ fi
+
+ # Check if we have a PID file with running process
+ if [[ -f "$PID_FILE" ]]; then
+ OLD_PID=$(cat "$PID_FILE")
+ if kill -0 "$OLD_PID" 2>/dev/null; then
+ # Process exists but port isn't open - something's wrong, kill it
+ kill "$OLD_PID" 2>/dev/null
+ debug_log "Killed stale tunnel process $OLD_PID"
+ else
+ debug_log "PID file exists but process $OLD_PID is not running anymore"
+ fi
+ # Remove the stale PID file
+ rm -f "$PID_FILE"
+ fi
+
+ # Generate the SSH command
+ generate_ssh_command
+
+ debug_log "Starting SSH tunnel: ${SSH_CMD[*]}"
+
+ # Start the tunnel in foreground and wait for it to establish
+ "${SSH_CMD[@]}" >> "$LOG_FILE" 2>&1 &
+ TUNNEL_PID=$!
+
+ # Save PID
+ echo $TUNNEL_PID > "$PID_FILE"
+ debug_log "Tunnel started with PID $TUNNEL_PID"
+
+ # Wait for tunnel to establish
+ tunnel_up=0
+ for i in {1..20}; do
+ if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+ debug_log "Tunnel established successfully on port $PORT"
+ tunnel_up=1
+ break
+ fi
+ sleep 0.2
+ done
+
+ # Clean up lock file
+ release_lock
+
+ # Check if the tunnel process is still running
+ if ! kill -0 $TUNNEL_PID 2>/dev/null; then
+ debug_log "ERROR: Tunnel process exited unexpectedly! Check logs for details"
+ if [[ -f "$PID_FILE" ]]; then
+ rm -f "$PID_FILE"
+ fi
+ # Return error in case of tunnel failure
+ echo "Failed to establish SSH tunnel. See $TUNNEL_DIR/$CONTEXT-tunnel.log for details." >&2
+ exit 1
+ elif [[ $tunnel_up -eq 0 ]]; then
+ debug_log "WARNING: Tunnel process is running but port $PORT is not responding"
+ fi
+
+ # Return valid credential format with certificates if available
+ generate_credentials_json
+ exit 0
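
Note: a script like this is consumed by kubectl as a client-go exec credential plugin. As a minimal, hypothetical sketch of the wiring (the kubeconfig entries SkyPilot actually writes are not part of this diff, so the user name, host, context, and path below are placeholders):

# Hypothetical example: register ssh-tunnel.sh as an exec credential plugin.
# kubectl runs the script on demand; the script reuses or restarts the tunnel
# and prints an ExecCredential JSON with a short expirationTimestamp.
kubectl config set-credentials ssh-tunnel-user \
  --exec-api-version=client.authentication.k8s.io/v1beta1 \
  --exec-command=/path/to/ssh-tunnel.sh \
  --exec-arg=--host --exec-arg=my-head-node \
  --exec-arg=--context --exec-arg=my-pool \
  --exec-arg=--port --exec-arg=6443

Because the returned credential expires after TTL_SECONDS (30 by default), kubectl re-invokes the script frequently, which is what keeps the tunnel health-checked.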
sky/templates/kubernetes-ray.yml.j2
@@ -920,19 +920,17 @@ available_node_types:
  {{ ray_installation_commands }}
 
  # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
- # unset PYTHONPATH in case the user provided docker image set it.
- VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
+ # unset PYTHONPATH and set CWD to $HOME to avoid user image interfering with SkyPilot runtime.
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false {{sky_unset_pythonpath_and_set_cwd}} ~/.local/bin/uv pip install skypilot[kubernetes,remote]
  # Wait for `patch` package to be installed before applying ray patches
  until dpkg -l | grep -q "^ii patch "; do
  sleep 0.1
  echo "Waiting for patch package to be installed..."
  done
  # Apply Ray patches for progress bar fix
- # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
- # unset PYTHONPATH in case the user provided docker image set it.
  # ~/.sky/python_path is seeded by conda_installation_commands
- VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
- env -u PYTHONPATH $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false {{sky_unset_pythonpath_and_set_cwd}} ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
+ {{sky_unset_pythonpath_and_set_cwd}} $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
  }
  touch /tmp/ray_skypilot_installation_complete
  echo "=== Ray and skypilot installation completed ==="
sky/templates/slurm-ray.yml.j2
@@ -20,10 +20,18 @@ provider:
  {% if slurm_proxy_command is not none %}
  proxycommand: {{slurm_proxy_command | tojson }}
  {% endif %}
+ {% if slurm_proxy_jump is not none %}
+ proxyjump: {{slurm_proxy_jump | tojson }}
+ {% endif %}
 
  auth:
  ssh_user: {{ssh_user}}
- # TODO(jwj): Modify this tmp workaround.
+ # TODO(jwj,kevin): Modify this tmp workaround.
+ # Right now there's a chicken-and-egg problem:
+ # 1. ssh_credential_from_yaml reads from the auth.ssh_private_key: ~/.sky/clients/.../ssh/sky-key
+ # 2. This is SkyPilot's generated key, not the Slurm cluster's key
+ # 3. The internal_file_mounts stage tries to rsync using sky-key, but its public key isn't on the remote yet
+ # 4. The public key only gets added by setup_commands, which runs AFTER file_mounts
  # ssh_private_key: {{ssh_private_key}}
  ssh_private_key: {{slurm_private_key}}
  ssh_proxy_command: {{slurm_proxy_command | tojson }}
@@ -67,9 +75,31 @@ initialization_commands: []
  # Increment the following for catching performance bugs easier:
  # current num items (num SSH connections): 1
  setup_commands:
- {%- for initial_setup_command in initial_setup_commands %}
+ - |
+ {%- for initial_setup_command in initial_setup_commands %}
  {{ initial_setup_command }}
  {%- endfor %}
+ # Generate host key for sshd -i if not exists
+ mkdir -p ~{{ssh_user}}/.ssh && chmod 700 ~{{ssh_user}}/.ssh
+ [ -f ~{{ssh_user}}/.ssh/{{slurm_sshd_host_key_filename}} ] || ssh-keygen -t ed25519 -f ~{{ssh_user}}/.ssh/{{slurm_sshd_host_key_filename}} -N "" -q
+ # Add public key to user's authorized_keys if not already present
+ grep -qF 'skypilot:ssh_public_key_content' ~{{ssh_user}}/.ssh/authorized_keys 2>/dev/null || cat >> ~{{ssh_user}}/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
+ skypilot:ssh_public_key_content
+ SKYPILOT_SSH_KEY_EOF
+ chmod 600 ~{{ssh_user}}/.ssh/authorized_keys
+
+ mkdir -p ~{{ssh_user}}/.sky
+ cat > ~{{ssh_user}}/.sky_ssh_rc <<'SKYPILOT_SSH_RC'
+ # Added by SkyPilot: override HOME for Slurm interactive sessions
+ if [ -n "${{slurm_cluster_name_env_var}}" ]; then
+ CLUSTER_DIR=~/.sky_clusters/${{slurm_cluster_name_env_var}}
+ if [ -d "$CLUSTER_DIR" ]; then
+ cd "$CLUSTER_DIR"
+ export HOME=$(pwd)
+ fi
+ fi
+ SKYPILOT_SSH_RC
+ grep -q "source ~/.sky_ssh_rc" ~{{ssh_user}}/.bashrc 2>/dev/null || (echo "" >> ~{{ssh_user}}/.bashrc && echo "source ~/.sky_ssh_rc" >> ~{{ssh_user}}/.bashrc)
  {{ setup_sky_dirs_commands }}
  {{ conda_installation_commands }}
  {{ skypilot_wheel_installation_commands }}
sky/templates/websocket_proxy.py
@@ -9,13 +9,11 @@
  This script is useful for users who do not have local Kubernetes credentials.
  """
  import asyncio
- from http.cookiejar import MozillaCookieJar
  import os
  import struct
  import sys
  import time
  from typing import Dict, Optional
- from urllib.request import Request
 
  import requests
  import websockets
@@ -24,46 +22,19 @@ from websockets.asyncio.client import connect
 
  from sky import exceptions
  from sky.client import service_account_auth
+ from sky.server import common as server_common
  from sky.server import constants
- from sky.server.server import KubernetesSSHMessageType
+ from sky.server.server import SSHMessageType
  from sky.skylet import constants as skylet_constants
 
  BUFFER_SIZE = 2**16 # 64KB
  HEARTBEAT_INTERVAL_SECONDS = 10
-
- # Environment variable for a file path to the API cookie file.
- # Keep in sync with server/constants.py
- API_COOKIE_FILE_ENV_VAR = 'SKYPILOT_API_COOKIE_FILE'
- # Default file if unset.
- # Keep in sync with server/constants.py
- API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
-
  MAX_UNANSWERED_PINGS = 100
 
 
- def _get_cookie_header(url: str) -> Dict[str, str]:
- """Extract Cookie header value from a cookie jar for a specific URL"""
- cookie_path = os.environ.get(API_COOKIE_FILE_ENV_VAR)
- if cookie_path is None:
- cookie_path = API_COOKIE_FILE_DEFAULT_LOCATION
- cookie_path = os.path.expanduser(cookie_path)
- if not os.path.exists(cookie_path):
- return {}
-
- request = Request(url)
- cookie_jar = MozillaCookieJar(os.path.expanduser(cookie_path))
- cookie_jar.load(ignore_discard=True, ignore_expires=True)
- cookie_jar.add_cookie_header(request)
- cookie_header = request.get_header('Cookie')
- # if cookie file is empty, return empty dict
- if cookie_header is None:
- return {}
- return {'Cookie': cookie_header}
-
-
  async def main(url: str, timestamps_supported: bool, login_url: str) -> None:
  headers = {}
- headers.update(_get_cookie_header(url))
+ headers.update(server_common.get_cookie_header_for_url(url))
  headers.update(service_account_auth.get_service_account_headers())
  try:
  async with connect(url, ping_interval=None,
@@ -142,8 +113,9 @@ async def latency_monitor(websocket: ClientConnection,
  ping_time = time.time()
  next_id += 1
  last_ping_time_dict[next_id] = ping_time
- message_header_bytes = struct.pack(
- '!BI', KubernetesSSHMessageType.PINGPONG.value, next_id)
+ message_header_bytes = struct.pack('!BI',
+ SSHMessageType.PINGPONG.value,
+ next_id)
  try:
  async with websocket_lock:
  await websocket.send(message_header_bytes)
@@ -176,7 +148,7 @@ async def stdin_to_websocket(reader: asyncio.StreamReader,
  if timestamps_supported:
  # Send message with type 0 to indicate data.
  message_type_bytes = struct.pack(
- '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+ '!B', SSHMessageType.REGULAR_DATA.value)
  data = message_type_bytes + data
  async with websocket_lock:
  await websocket.send(data)
@@ -201,10 +173,10 @@ async def websocket_to_stdout(websocket: ClientConnection,
  if (timestamps_supported and len(message) > 0 and
  last_ping_time_dict is not None):
  message_type = struct.unpack('!B', message[:1])[0]
- if message_type == KubernetesSSHMessageType.REGULAR_DATA.value:
+ if message_type == SSHMessageType.REGULAR_DATA.value:
  # Regular data - strip type byte and write to stdout
  message = message[1:]
- elif message_type == KubernetesSSHMessageType.PINGPONG.value:
+ elif message_type == SSHMessageType.PINGPONG.value:
  # PONG response - calculate latency and send measurement
  if not len(message) == struct.calcsize('!BI'):
  raise ValueError(
@@ -222,8 +194,7 @@ async def websocket_to_stdout(websocket: ClientConnection,
 
  # Send latency measurement (type 2)
  message_type_bytes = struct.pack(
- '!B',
- KubernetesSSHMessageType.LATENCY_MEASUREMENT.value)
+ '!B', SSHMessageType.LATENCY_MEASUREMENT.value)
  latency_bytes = struct.pack('!Q', latency_ms)
  message = message_type_bytes + latency_bytes
  # Send to server.
@@ -255,7 +226,7 @@ if __name__ == '__main__':
  # TODO(aylei): remove the separate /api/health call and use the header
  # during websocket handshake to determine the server version.
  health_url = f'{server_url}/api/health'
- cookie_hdr = _get_cookie_header(health_url)
+ cookie_hdr = server_common.get_cookie_header_for_url(health_url)
  health_response = requests.get(health_url, headers=cookie_hdr)
  health_data = health_response.json()
  timestamps_are_supported = int(health_data.get('api_version', 0)) > 21
@@ -272,7 +243,13 @@ if __name__ == '__main__':
  client_version_str = (f'&client_version={constants.API_VERSION}'
  if timestamps_are_supported else '')
 
- websocket_url = (f'{server_url}/kubernetes-pod-ssh-proxy'
+ # For backwards compatibility, fallback to kubernetes-pod-ssh-proxy if
+ # no endpoint is provided.
+ endpoint = sys.argv[3] if len(sys.argv) > 3 else 'kubernetes-pod-ssh-proxy'
+ # Worker index for Slurm.
+ worker_idx = sys.argv[4] if len(sys.argv) > 4 else '0'
+ websocket_url = (f'{server_url}/{endpoint}'
  f'?cluster_name={sys.argv[2]}'
+ f'&worker={worker_idx}'
  f'{client_version_str}')
  asyncio.run(main(websocket_url, timestamps_are_supported, _login_url))
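
Note on the websocket_proxy.py change above: the proxy now accepts an optional endpoint and worker index as positional arguments, falling back to the Kubernetes-only behavior when they are absent. A hypothetical invocation, assuming (as in the unchanged part of the script) that the first argument is the API server URL and the second is the cluster name; the endpoint name used here is a placeholder, not necessarily the actual Slurm route:

# New form: explicit endpoint and worker index (e.g. a specific Slurm worker).
python websocket_proxy.py http://127.0.0.1:46580 my-cluster some-ssh-proxy-endpoint 1
# Old form: endpoint defaults to kubernetes-pod-ssh-proxy, worker index to 0.
python websocket_proxy.py http://127.0.0.1:46580 my-cluster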