skypilot-nightly 1.0.0.dev20251203-py3-none-any.whl → 1.0.0.dev20260112-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh (new file)
@@ -0,0 +1,379 @@
+ #!/bin/bash
+ # ssh-tunnel.sh - SSH tunnel script for Kubernetes API access
+ # Used as kubectl exec credential plugin to establish SSH tunnel on demand.
+ # Returns a valid credential format for kubectl with expiration. The expiration
+ # is calculated based on the TTL argument and is required to force kubectl to
+ # check the tunnel status frequently.
+
+ # Usage: ssh-tunnel.sh --host HOST [--user USER] [--use-ssh-config] [--ssh-key KEY] [--context CONTEXT] [--port PORT] [--ttl SECONDS]
+
+ # Default time-to-live for credential in seconds
+ # This forces kubectl to check the tunnel status frequently
+ TTL_SECONDS=30
+
+ # Parse arguments
+ USE_SSH_CONFIG=0
+ SSH_KEY=""
+ CONTEXT=""
+ HOST=""
+ USER=""
+ PORT=6443 # Default port if not specified
+
+ # Debug log to ~/.sky/ssh_node_pools_info/$CONTEXT-tunnel.log
+ debug_log() {
+   local message="$(date): $1"
+   echo "$message" >> "$LOG_FILE"
+ }
+
+ # Generate expiration timestamp for credential
+ generate_expiration_timestamp() {
+   # Try macOS date format first, fall back to Linux format
+   date -u -v+${TTL_SECONDS}S +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -d "+${TTL_SECONDS} seconds" +"%Y-%m-%dT%H:%M:%SZ"
+ }
+
+ # Acquire the lock; return 0 if successful, 1 if another process is already holding the lock
+ acquire_lock() {
+   # Check for flock command
+   if ! command -v flock >/dev/null 2>&1; then
+     debug_log "flock command not available, using alternative lock mechanism"
+     # Simple file-based locking
+     if [ -f "$LOCK_FILE" ]; then
+       lock_pid=$(cat "$LOCK_FILE" 2>/dev/null)
+       if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
+         debug_log "Another process ($lock_pid) is starting the tunnel, waiting briefly"
+         return 1
+       else
+         # Stale lock file
+         debug_log "Removing stale lock file"
+         rm -f "$LOCK_FILE"
+       fi
+     fi
+     # Create our lock
+     echo $$ > "$LOCK_FILE"
+     return 0
+   else
+     # Use flock for better locking
+     exec 9>"$LOCK_FILE"
+     if ! flock -n 9; then
+       debug_log "Another process is starting the tunnel, waiting briefly"
+       return 1
+     fi
+     return 0
+   fi
+ }
+
+ # Release the lock
+ release_lock() {
+   if command -v flock >/dev/null 2>&1; then
+     # Using flock
+     exec 9>&- # Close file descriptor to release lock
+   else
+     # Using simple lock
+     rm -f "$LOCK_FILE"
+   fi
+   debug_log "Lock released"
+ }
+
+ # Generate SSH command based on available tools and parameters
+ generate_ssh_command() {
+   # Check for autossh
+   if ! command -v autossh >/dev/null 2>&1; then
+     debug_log "WARNING: autossh is not installed but recommended for reliable SSH tunnels"
+     debug_log "Install autossh: brew install autossh (macOS), apt-get install autossh (Ubuntu/Debian)"
+
+     # Fall back to regular ssh
+     if [[ $USE_SSH_CONFIG -eq 1 ]]; then
+       SSH_CMD=("ssh" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
+     else
+       SSH_CMD=("ssh" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
+
+       # Add SSH key if provided
+       if [[ -n "$SSH_KEY" ]]; then
+         SSH_CMD+=("-i" "$SSH_KEY")
+       fi
+
+       # Add user@host
+       SSH_CMD+=("$USER@$HOST")
+     fi
+   else
+     # Configure autossh
+     if [[ $USE_SSH_CONFIG -eq 1 ]]; then
+       SSH_CMD=("autossh" "-M" "0" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
+     else
+       SSH_CMD=("autossh" "-M" "0" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
+
+       # Add SSH key if provided
+       if [[ -n "$SSH_KEY" ]]; then
+         SSH_CMD+=("-i" "$SSH_KEY")
+       fi
+
+       # Add user@host
+       SSH_CMD+=("$USER@$HOST")
+     fi
+   fi
+ }
+
+ # Function to read certificate files if they exist
+ read_certificate_data() {
+   local client_cert_file="$TUNNEL_DIR/$CONTEXT-cert.pem"
+   local client_key_file="$TUNNEL_DIR/$CONTEXT-key.pem"
+   local cert_data=""
+   local key_data=""
+
+   if [[ -f "$client_cert_file" ]]; then
+     # Read the certificate file as is - it's already in PEM format
+     cert_data=$(cat "$client_cert_file")
+     debug_log "Found client certificate data for context $CONTEXT"
+
+     # Log the first and last few characters to verify PEM format
+     local cert_start=$(head -1 "$client_cert_file")
+     local cert_end=$(tail -1 "$client_cert_file")
+     debug_log "Certificate starts with: $cert_start"
+     debug_log "Certificate ends with: $cert_end"
+
+     # Check if it has proper PEM format
+     if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file" || ! grep -q "END CERTIFICATE" "$client_cert_file"; then
+       debug_log "WARNING: Certificate file may not be in proper PEM format"
+       # Try to fix it if needed
+       if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file"; then
+         echo "-----BEGIN CERTIFICATE-----" > "$client_cert_file.fixed"
+         cat "$client_cert_file" >> "$client_cert_file.fixed"
+         echo "-----END CERTIFICATE-----" >> "$client_cert_file.fixed"
+         mv "$client_cert_file.fixed" "$client_cert_file"
+         cert_data=$(cat "$client_cert_file")
+         debug_log "Fixed certificate format by adding BEGIN/END markers"
+       fi
+     fi
+   fi
+
+   if [[ -f "$client_key_file" ]]; then
+     # Read the key file as is - it's already in PEM format
+     key_data=$(cat "$client_key_file")
+     debug_log "Found client key data for context $CONTEXT"
+
+     # Log the first and last few characters to verify PEM format
+     local key_start=$(head -1 "$client_key_file")
+     local key_end=$(tail -1 "$client_key_file")
+     debug_log "Key starts with: $key_start"
+     debug_log "Key ends with: $key_end"
+
+     # Check if it has proper PEM format
+     if ! grep -q "BEGIN" "$client_key_file" || ! grep -q "END" "$client_key_file"; then
+       debug_log "WARNING: Key file may not be in proper PEM format"
+       # Try to fix it if needed
+       if ! grep -q "BEGIN" "$client_key_file"; then
+         echo "-----BEGIN PRIVATE KEY-----" > "$client_key_file.fixed"
+         cat "$client_key_file" >> "$client_key_file.fixed"
+         echo "-----END PRIVATE KEY-----" >> "$client_key_file.fixed"
+         mv "$client_key_file.fixed" "$client_key_file"
+         key_data=$(cat "$client_key_file")
+         debug_log "Fixed key format by adding BEGIN/END markers"
+       fi
+     fi
+   fi
+
+   echo "$cert_data:$key_data"
+ }
+
+ # Function to generate credentials JSON
+ generate_credentials_json() {
+   local expiration_time=$(generate_expiration_timestamp)
+   local cert_bundle=$(read_certificate_data)
+   local client_cert_data=${cert_bundle%:*}
+   local client_key_data=${cert_bundle#*:}
+
+   if [[ -n "$client_cert_data" && -n "$client_key_data" ]]; then
+     # Debug the certificate data
+     debug_log "Certificate data length: $(echo -n "$client_cert_data" | wc -c) bytes"
+     debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
+
+     # Check if we can create proper JSON with `jq`
+     if ! command -v jq &>/dev/null; then
+       echo "jq is not installed. Please install jq to use this script." >&2
+       exit 1
+     fi
+     debug_log "Using jq for JSON formatting"
+
+     # Create a temporary file for the JSON output to avoid shell escaping issues
+     local TEMP_JSON_FILE=$(mktemp)
+
+     # Write the JSON to the temporary file, using jq for proper JSON escaping
+     cat > "$TEMP_JSON_FILE" << EOL
+ {
+   "apiVersion": "client.authentication.k8s.io/v1beta1",
+   "kind": "ExecCredential",
+   "status": {
+     "clientCertificateData": $(printf '%s' "$client_cert_data" | jq -R -s .),
+     "clientKeyData": $(printf '%s' "$client_key_data" | jq -R -s .),
+     "expirationTimestamp": "$expiration_time"
+   }
+ }
+ EOL
+
+     # Read the JSON from the file
+     local json_response=$(cat "$TEMP_JSON_FILE")
+
+     # Clean up
+     rm -f "$TEMP_JSON_FILE"
+
+     # Output the JSON
+     echo "$json_response"
+   else
+     # Fallback to token-based credential for tunnel-only authentication
+     echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
+   fi
+ }
+
+ while [[ $# -gt 0 ]]; do
+   case $1 in
+     --use-ssh-config)
+       USE_SSH_CONFIG=1
+       shift
+       ;;
+     --ssh-key)
+       SSH_KEY="$2"
+       shift 2
+       ;;
+     --context)
+       CONTEXT="$2"
+       shift 2
+       ;;
+     --port)
+       PORT="$2"
+       shift 2
+       ;;
+     --host)
+       HOST="$2"
+       shift 2
+       ;;
+     --user)
+       USER="$2"
+       shift 2
+       ;;
+     --ttl)
+       TTL_SECONDS="$2"
+       shift 2
+       ;;
+     *)
+       echo "Unknown parameter: $1" >&2
+       exit 1
+       ;;
+   esac
+ done
+
+ # Validate required parameters
+ if [[ -z "$HOST" ]]; then
+   echo "Error: --host parameter is required" >&2
+   exit 1
+ fi
+
+ # Setup directories
+ TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
+ mkdir -p "$TUNNEL_DIR"
+
+ # Get context name for PID file
+ if [[ -z "$CONTEXT" ]]; then
+   CONTEXT="default"
+ fi
+
+ PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
+ LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
+ LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
+
+ debug_log "Starting ssh-tunnel.sh for context $CONTEXT, host $HOST, port $PORT"
+ debug_log "SSH Config: $USE_SSH_CONFIG, User: $USER, TTL: ${TTL_SECONDS}s"
+
+ # Check if specified port is already in use (tunnel may be running)
+ if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+   debug_log "Port $PORT already in use, checking if it's our tunnel"
+
+   # Check if there's a PID file and if that process is running
+   if [[ -f "$PID_FILE" ]]; then
+     OLD_PID=$(cat "$PID_FILE")
+     if kill -0 "$OLD_PID" 2>/dev/null; then
+       debug_log "Tunnel appears to be running with PID $OLD_PID"
+     else
+       debug_log "PID file exists but process $OLD_PID is not running"
+     fi
+   else
+     debug_log "Port $PORT is in use but no PID file exists"
+   fi
+
+   # Return valid credential format for kubectl with expiration
+   generate_credentials_json
+   exit 0
+ fi
+
+ # Try to acquire the lock
+ if ! acquire_lock; then
+   # Wait briefly for the tunnel to be established
+   for i in {1..10}; do
+     if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+       debug_log "Tunnel is now active"
+
+       # Return valid credential format for kubectl with expiration
+       generate_credentials_json
+       exit 0
+     fi
+     sleep 0.2
+   done
+   debug_log "Waited for tunnel but port $PORT still not available"
+ fi
+
+ # Check if we have a PID file with running process
+ if [[ -f "$PID_FILE" ]]; then
+   OLD_PID=$(cat "$PID_FILE")
+   if kill -0 "$OLD_PID" 2>/dev/null; then
+     # Process exists but port isn't open - something's wrong, kill it
+     kill "$OLD_PID" 2>/dev/null
+     debug_log "Killed stale tunnel process $OLD_PID"
+   else
+     debug_log "PID file exists but process $OLD_PID is not running anymore"
+   fi
+   # Remove the stale PID file
+   rm -f "$PID_FILE"
+ fi
+
+ # Generate the SSH command
+ generate_ssh_command
+
+ debug_log "Starting SSH tunnel: ${SSH_CMD[*]}"
+
+ # Start the tunnel in the background and wait for it to establish
+ "${SSH_CMD[@]}" >> "$LOG_FILE" 2>&1 &
+ TUNNEL_PID=$!
+
+ # Save PID
+ echo $TUNNEL_PID > "$PID_FILE"
+ debug_log "Tunnel started with PID $TUNNEL_PID"
+
+ # Wait for tunnel to establish
+ tunnel_up=0
+ for i in {1..20}; do
+   if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+     debug_log "Tunnel established successfully on port $PORT"
+     tunnel_up=1
+     break
+   fi
+   sleep 0.2
+ done
+
+ # Clean up lock file
+ release_lock
+
+ # Check if the tunnel process is still running
+ if ! kill -0 $TUNNEL_PID 2>/dev/null; then
+   debug_log "ERROR: Tunnel process exited unexpectedly! Check logs for details"
+   if [[ -f "$PID_FILE" ]]; then
+     rm -f "$PID_FILE"
+   fi
+   # Return error in case of tunnel failure
+   echo "Failed to establish SSH tunnel. See $TUNNEL_DIR/$CONTEXT-tunnel.log for details." >&2
+   exit 1
+ elif [[ $tunnel_up -eq 0 ]]; then
+   debug_log "WARNING: Tunnel process is running but port $PORT is not responding"
+ fi
+
+ # Return valid credential format with certificates if available
+ generate_credentials_json
+ exit 0
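For context, kubectl consumes this script as an exec credential plugin: it runs the configured command, parses the ExecCredential JSON on stdout, and re-runs the command once expirationTimestamp lapses, which is what forces the tunnel to be re-checked every TTL. Below is a minimal sketch of that handshake, driving the script the way kubectl would; the script path and the --host/--user/--ssh-key values are illustrative, not taken from the package.

```python
# Sketch: invoke ssh-tunnel.sh as kubectl would and inspect the credential.
import json
import subprocess

# All argument values here are illustrative placeholders.
result = subprocess.run(
    [
        './ssh-tunnel.sh', '--host', 'head.example.com', '--user', 'ubuntu',
        '--ssh-key', '/home/me/.ssh/id_rsa', '--context', 'my-pool',
        '--port', '6443', '--ttl', '30'
    ],
    capture_output=True, text=True, check=True)

cred = json.loads(result.stdout)
assert cred['kind'] == 'ExecCredential'
status = cred['status']
# The script returns client certificate data when the *-cert.pem/*-key.pem
# files exist, and falls back to a placeholder token otherwise.
method = 'client cert' if 'clientCertificateData' in status else 'token'
print(f"auth: {method}, expires: {status['expirationTimestamp']}")
```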
sky/ssh_node_pools/deploy/tunnel_utils.py (new file)
@@ -0,0 +1,199 @@
+ """Utilities to setup SSH Tunnel"""
+ import os
+ import random
+ import re
+ import subprocess
+ import sys
+ from typing import Set
+
+ import colorama
+
+ from sky import sky_logging
+ from sky.ssh_node_pools import constants
+ from sky.ssh_node_pools.deploy import utils as deploy_utils
+
+ logger = sky_logging.init_logger(__name__)
+
+ # Get the directory of this script
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+ def _get_used_localhost_ports() -> Set[int]:
+     """Get SSH port forwardings already in use on localhost"""
+     used_ports = set()
+
+     # Get ports from netstat (works on macOS and Linux)
+     try:
+         if sys.platform == 'darwin':
+             # macOS
+             result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
+                                     capture_output=True,
+                                     text=True,
+                                     check=False)
+         else:
+             # Linux and other Unix-like systems
+             result = subprocess.run(['netstat', '-tln'],
+                                     capture_output=True,
+                                     text=True,
+                                     check=False)
+
+         if result.returncode == 0:
+             # Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
+             for line in result.stdout.splitlines():
+                 if '127.0.0.1:' in line or 'localhost:' in line:
+                     match = re.search(r':(64\d\d)\s', line)
+                     if match:
+                         port = int(match.group(1))
+                         if 6400 <= port <= 6500:  # Only consider our range
+                             used_ports.add(port)
+     except (subprocess.SubprocessError, FileNotFoundError):
+         # If netstat fails, try another approach
+         pass
+
+     # Also check ports from existing kubeconfig entries
+     try:
+         result = subprocess.run([
+             'kubectl', 'config', 'view', '-o',
+             'jsonpath=\'{.clusters[*].cluster.server}\''
+         ],
+                                 capture_output=True,
+                                 text=True,
+                                 check=False)
+
+         if result.returncode == 0:
+             # Look for localhost URLs with ports
+             for url in result.stdout.split():
+                 if 'localhost:' in url or '127.0.0.1:' in url:
+                     match = re.search(r':(\d+)', url)
+                     if match:
+                         port = int(match.group(1))
+                         if 6400 <= port <= 6500:  # Only consider our range
+                             used_ports.add(port)
+     except subprocess.SubprocessError:
+         pass
+
+     return used_ports
+
+
+ def get_available_port(start: int = 6443, end: int = 6499) -> int:
+     """Get an available port in the given range not used by other tunnels"""
+     used_ports = _get_used_localhost_ports()
+
+     # Try to use port 6443 first if available for the first cluster
+     if start == 6443 and start not in used_ports:
+         return start
+
+     # Otherwise find any available port in the range
+     available_ports = list(set(range(start, end + 1)) - used_ports)
+
+     if not available_ports:
+         # If all ports are used, pick a random one from our range
+         # (we'll terminate any existing connection in the setup)
+         return random.randint(start, end)
+
+     # Sort to get deterministic allocation
+     available_ports.sort()
+     return available_ports[0]
+
+
+ def setup_kubectl_ssh_tunnel(head_node,
+                              ssh_user,
+                              ssh_key,
+                              context_name,
+                              use_ssh_config=False):
+     """Set up kubeconfig exec credential plugin for SSH tunnel"""
+     logger.info(f'{colorama.Fore.YELLOW}➜ Setting up SSH tunnel for '
+                 f'Kubernetes API access...{colorama.Style.RESET_ALL}')
+
+     # Get an available port for this cluster
+     port = get_available_port()
+
+     # Paths to scripts
+     tunnel_script = os.path.join(SCRIPT_DIR, 'tunnel', 'ssh-tunnel.sh')
+
+     # Make sure scripts are executable
+     os.chmod(tunnel_script, 0o755)
+
+     # Certificate files
+     client_cert_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                     f'{context_name}-cert.pem')
+     client_key_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                    f'{context_name}-key.pem')
+
+     # Update kubeconfig to use localhost with the selected port
+     deploy_utils.run_command([
+         'kubectl', 'config', 'set-cluster', context_name,
+         f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
+     ])
+
+     # Build the exec args list based on auth method
+     exec_args = [
+         '--exec-command', tunnel_script, '--exec-api-version',
+         'client.authentication.k8s.io/v1beta1'
+     ]
+
+     # Set credential TTL to force frequent tunnel checks
+     ttl_seconds = 30
+
+     # Verify if we have extracted certificate data files
+     has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
+         client_key_file)
+     if has_cert_files:
+         logger.info(f'{colorama.Fore.GREEN}Client certificate data extracted '
+                     'and will be used for authentication'
+                     f'{colorama.Style.RESET_ALL}')
+
+     if use_ssh_config:
+         deploy_utils.run_command(
+             ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
+             [
+                 '--exec-arg=--context', f'--exec-arg={context_name}',
+                 '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
+                 f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
+                 '--exec-arg=--host', f'--exec-arg={head_node}'
+             ])
+     else:
+         deploy_utils.run_command(
+             ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
+             [
+                 '--exec-arg=--context', f'--exec-arg={context_name}',
+                 '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
+                 f'--exec-arg={ttl_seconds}', '--exec-arg=--host',
+                 f'--exec-arg={head_node}', '--exec-arg=--user',
+                 f'--exec-arg={ssh_user}', '--exec-arg=--ssh-key',
+                 f'--exec-arg={ssh_key}'
+             ])
+
+     logger.info(f'{colorama.Fore.GREEN}✔ SSH tunnel configured through '
+                 'kubectl credential plugin on port '
+                 f'{port}{colorama.Style.RESET_ALL}')
+     logger.info('Your kubectl connection is now tunneled through SSH '
+                 f'(port {port}).')
+     logger.info('This tunnel will be automatically established when needed.')
+     logger.info(f'Credential TTL set to {ttl_seconds}s to ensure tunnel '
+                 'health is checked frequently.')
+     return port
+
+
+ def cleanup_kubectl_ssh_tunnel(cluster_name, context_name):
+     """Clean up the SSH tunnel for a specific context"""
+     logger.info(f'{colorama.Fore.YELLOW}➜ Cleaning up SSH tunnel for '
+                 f'`{cluster_name}`...{colorama.Style.RESET_ALL}')
+
+     # Path to cleanup script
+     cleanup_script = os.path.join(SCRIPT_DIR, 'tunnel', 'cleanup-tunnel.sh')
+
+     # Make sure script is executable
+     if os.path.exists(cleanup_script):
+         os.chmod(cleanup_script, 0o755)
+
+         # Run the cleanup script
+         subprocess.run([cleanup_script, context_name],
+                        stdout=subprocess.DEVNULL,
+                        stderr=subprocess.DEVNULL,
+                        check=False)
+         logger.info(f'{colorama.Fore.GREEN}✔ SSH tunnel for `{cluster_name}` '
+                     f'cleaned up.{colorama.Style.RESET_ALL}')
+     else:
+         logger.error(f'{colorama.Fore.YELLOW}Cleanup script not found: '
+                      f'{cleanup_script}{colorama.Style.RESET_ALL}')
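Taken together, these helpers rewrite a pool's kubeconfig entry to point at a local forwarded port and register ssh-tunnel.sh as the credential plugin that opens the tunnel on demand. A hedged usage sketch follows; the pool/context names and SSH details are illustrative, and the real call sites are presumably in the relocated sky/ssh_node_pools/deploy/ flow.

```python
# Sketch: expected usage of the new tunnel helpers during deploy/teardown.
# All argument values here are illustrative, not taken from the package.
from sky.ssh_node_pools.deploy import tunnel_utils

port = tunnel_utils.setup_kubectl_ssh_tunnel(
    head_node='head.example.com',
    ssh_user='ubuntu',
    ssh_key='/home/me/.ssh/id_rsa',
    context_name='ssh-my-pool',
    use_ssh_config=False)
# kubectl traffic for context 'ssh-my-pool' now targets
# https://127.0.0.1:<port>, with the exec plugin opening the tunnel on demand.
print(f'Tunnel port: {port}')

# On teardown, stop the tunnel via the bundled cleanup-tunnel.sh:
tunnel_utils.cleanup_kubectl_ssh_tunnel('my-pool', 'ssh-my-pool')
```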