skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/ssh-tunnel.sh CHANGED
@@ -1,379 +1,10 @@
  #!/bin/bash
- # ssh-tunnel.sh - SSH tunnel script for Kubernetes API access
- # Used as kubectl exec credential plugin to establish SSH tunnel on demand.
- # Returns a valid credential format for kubectl with expiration. The expiration
- # is calculated based on the TTL argument and is required to force kubectl to
- # check the tunnel status frequently.

- # Usage: ssh-tunnel.sh --host HOST [--user USER] [--use-ssh-config] [--ssh-key KEY] [--context CONTEXT] [--port PORT] [--ttl SECONDS]
+ # This redirect stub is needed because we use this script in the
+ # exec auth section when creating our kubeconfig. Therefore, node pools
+ # launched in older versions of SkyPilot will have kubeconfigs pointing
+ # to this path.

- # Default time-to-live for credential in seconds
- # This forces kubectl to check the tunnel status frequently
- TTL_SECONDS=30
-
- # Parse arguments
- USE_SSH_CONFIG=0
- SSH_KEY=""
- CONTEXT=""
- HOST=""
- USER=""
- PORT=6443 # Default port if not specified
-
- # Debug log to ~/.sky/ssh_node_pools_info/$CONTEXT-tunnel.log
- debug_log() {
- local message="$(date): $1"
- echo "$message" >> "$LOG_FILE"
- }
-
- # Generate expiration timestamp for credential
- generate_expiration_timestamp() {
- # Try macOS date format first, fallback to Linux format
- date -u -v+${TTL_SECONDS}S +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -d "+${TTL_SECONDS} seconds" +"%Y-%m-%dT%H:%M:%SZ"
- }
-
- # Acquire the lock, return 0 if successful, 1 if another process is already holding the lock
- acquire_lock() {
- # Check for flock command
- if ! command -v flock >/dev/null 2>&1; then
- debug_log "flock command not available, using alternative lock mechanism"
- # Simple file-based locking
- if [ -f "$LOCK_FILE" ]; then
- lock_pid=$(cat "$LOCK_FILE" 2>/dev/null)
- if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
- debug_log "Another process ($lock_pid) is starting the tunnel, waiting briefly"
- return 1
- else
- # Stale lock file
- debug_log "Removing stale lock file"
- rm -f "$LOCK_FILE"
- fi
- fi
- # Create our lock
- echo $$ > "$LOCK_FILE"
- return 0
- else
- # Use flock for better locking
- exec 9>"$LOCK_FILE"
- if ! flock -n 9; then
- debug_log "Another process is starting the tunnel, waiting briefly"
- return 1
- fi
- return 0
- fi
- }
-
- # Release the lock
- release_lock() {
- if command -v flock >/dev/null 2>&1; then
- # Using flock
- exec 9>&- # Close file descriptor to release lock
- else
- # Using simple lock
- rm -f "$LOCK_FILE"
- fi
- debug_log "Lock released"
- }
-
- # Generate SSH command based on available tools and parameters
- generate_ssh_command() {
- # Check for autossh
- if ! command -v autossh >/dev/null 2>&1; then
- debug_log "WARNING: autossh is not installed but recommended for reliable SSH tunnels"
- debug_log "Install autossh: brew install autossh (macOS), apt-get install autossh (Ubuntu/Debian)"
-
- # Fall back to regular ssh
- if [[ $USE_SSH_CONFIG -eq 1 ]]; then
- SSH_CMD=("ssh" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
- else
- SSH_CMD=("ssh" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
-
- # Add SSH key if provided
- if [[ -n "$SSH_KEY" ]]; then
- SSH_CMD+=("-i" "$SSH_KEY")
- fi
-
- # Add user@host
- SSH_CMD+=("$USER@$HOST")
- fi
- else
- # Configure autossh
- if [[ $USE_SSH_CONFIG -eq 1 ]]; then
- SSH_CMD=("autossh" "-M" "0" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
- else
- SSH_CMD=("autossh" "-M" "0" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
-
- # Add SSH key if provided
- if [[ -n "$SSH_KEY" ]]; then
- SSH_CMD+=("-i" "$SSH_KEY")
- fi
-
- # Add user@host
- SSH_CMD+=("$USER@$HOST")
- fi
- fi
- }
-
- # Function to read certificate files if they exist
- read_certificate_data() {
- local client_cert_file="$TUNNEL_DIR/$CONTEXT-cert.pem"
- local client_key_file="$TUNNEL_DIR/$CONTEXT-key.pem"
- local cert_data=""
- local key_data=""
-
- if [[ -f "$client_cert_file" ]]; then
- # Read the certificate file as is - it's already in PEM format
- cert_data=$(cat "$client_cert_file")
- debug_log "Found client certificate data for context $CONTEXT"
-
- # Log the first and last few characters to verify PEM format
- local cert_start=$(head -1 "$client_cert_file")
- local cert_end=$(tail -1 "$client_cert_file")
- debug_log "Certificate starts with: $cert_start"
- debug_log "Certificate ends with: $cert_end"
-
- # Check if it has proper PEM format
- if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file" || ! grep -q "END CERTIFICATE" "$client_cert_file"; then
- debug_log "WARNING: Certificate file may not be in proper PEM format"
- # Try to fix it if needed
- if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file"; then
- echo "-----BEGIN CERTIFICATE-----" > "$client_cert_file.fixed"
- cat "$client_cert_file" >> "$client_cert_file.fixed"
- echo "-----END CERTIFICATE-----" >> "$client_cert_file.fixed"
- mv "$client_cert_file.fixed" "$client_cert_file"
- cert_data=$(cat "$client_cert_file")
- debug_log "Fixed certificate format by adding BEGIN/END markers"
- fi
- fi
- fi
-
- if [[ -f "$client_key_file" ]]; then
- # Read the key file as is - it's already in PEM format
- key_data=$(cat "$client_key_file")
- debug_log "Found client key data for context $CONTEXT"
-
- # Log the first and last few characters to verify PEM format
- local key_start=$(head -1 "$client_key_file")
- local key_end=$(tail -1 "$client_key_file")
- debug_log "Key starts with: $key_start"
- debug_log "Key ends with: $key_end"
-
- # Check if it has proper PEM format
- if ! grep -q "BEGIN" "$client_key_file" || ! grep -q "END" "$client_key_file"; then
- debug_log "WARNING: Key file may not be in proper PEM format"
- # Try to fix it if needed
- if ! grep -q "BEGIN" "$client_key_file"; then
- echo "-----BEGIN PRIVATE KEY-----" > "$client_key_file.fixed"
- cat "$client_key_file" >> "$client_key_file.fixed"
- echo "-----END PRIVATE KEY-----" >> "$client_key_file.fixed"
- mv "$client_key_file.fixed" "$client_key_file"
- key_data=$(cat "$client_key_file")
- debug_log "Fixed key format by adding BEGIN/END markers"
- fi
- fi
- fi
-
- echo "$cert_data:$key_data"
- }
-
- # Function to generate credentials JSON
- generate_credentials_json() {
- local expiration_time=$(generate_expiration_timestamp)
- local cert_bundle=$(read_certificate_data)
- local client_cert_data=${cert_bundle%:*}
- local client_key_data=${cert_bundle#*:}
-
- if [[ -n "$client_cert_data" && -n "$client_key_data" ]]; then
- # Debug the certificate data
- debug_log "Certificate data length: $(echo -n "$client_cert_data" | wc -c) bytes"
- debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
-
- # Check if we can create proper JSON with `jq`
- if ! command -v jq &>/dev/null; then
- echo "jq is not installed. Please install jq to use this script." >&2
- exit 1
- fi
- debug_log "Using jq for JSON formatting"
-
- # Create a temporary file for the JSON output to avoid shell escaping issues
- local TEMP_JSON_FILE=$(mktemp)
-
- # Write the JSON to the temporary file using jq for proper JSON formatting
- cat > "$TEMP_JSON_FILE" << EOL
- {
- "apiVersion": "client.authentication.k8s.io/v1beta1",
- "kind": "ExecCredential",
- "status": {
- "clientCertificateData": $(printf '%s' "$client_cert_data" | jq -R -s .),
- "clientKeyData": $(printf '%s' "$client_key_data" | jq -R -s .),
- "expirationTimestamp": "$expiration_time"
- }
- }
- EOL
-
- # Read the JSON from the file
- local json_response=$(cat "$TEMP_JSON_FILE")
-
- # Clean up
- rm -f "$TEMP_JSON_FILE"
-
- # Output the JSON
- echo "$json_response"
- else
- # Fallback to token-based credential for tunnel-only authentication
- echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
- fi
- }
-
- while [[ $# -gt 0 ]]; do
- case $1 in
- --use-ssh-config)
- USE_SSH_CONFIG=1
- shift
- ;;
- --ssh-key)
- SSH_KEY="$2"
- shift 2
- ;;
- --context)
- CONTEXT="$2"
- shift 2
- ;;
- --port)
- PORT="$2"
- shift 2
- ;;
- --host)
- HOST="$2"
- shift 2
- ;;
- --user)
- USER="$2"
- shift 2
- ;;
- --ttl)
- TTL_SECONDS="$2"
- shift 2
- ;;
- *)
- echo "Unknown parameter: $1" >&2
- exit 1
- ;;
- esac
- done
-
- # Validate required parameters
- if [[ -z "$HOST" ]]; then
- echo "Error: --host parameter is required" >&2
- exit 1
- fi
-
- # Setup directories
- TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
- mkdir -p "$TUNNEL_DIR"
-
- # Get context name for PID file
- if [[ -z "$CONTEXT" ]]; then
- CONTEXT="default"
- fi
-
- PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
- LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
- LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
-
- debug_log "Starting ssh-tunnel.sh for context $CONTEXT, host $HOST, port $PORT"
- debug_log "SSH Config: $USE_SSH_CONFIG, User: $USER, TTL: ${TTL_SECONDS}s"
-
- # Check if specified port is already in use (tunnel may be running)
- if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
- debug_log "Port $PORT already in use, checking if it's our tunnel"
-
- # Check if there's a PID file and if that process is running
- if [[ -f "$PID_FILE" ]]; then
- OLD_PID=$(cat "$PID_FILE")
- if kill -0 "$OLD_PID" 2>/dev/null; then
- debug_log "Tunnel appears to be running with PID $OLD_PID"
- else
- debug_log "PID file exists but process $OLD_PID is not running"
- fi
- else
- debug_log "Port $PORT is in use but no PID file exists"
- fi
-
- # Return valid credential format for kubectl with expiration
- generate_credentials_json
- exit 0
- fi
-
- # Try to acquire the lock
- if ! acquire_lock; then
- # Wait briefly for the tunnel to be established
- for i in {1..10}; do
- if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
- debug_log "Tunnel is now active"
-
- # Return valid credential format for kubectl with expiration
- generate_credentials_json
- exit 0
- fi
- sleep 0.2
- done
- debug_log "Waited for tunnel but port $PORT still not available"
- fi
-
- # Check if we have a PID file with running process
- if [[ -f "$PID_FILE" ]]; then
- OLD_PID=$(cat "$PID_FILE")
- if kill -0 "$OLD_PID" 2>/dev/null; then
- # Process exists but port isn't open - something's wrong, kill it
- kill "$OLD_PID" 2>/dev/null
- debug_log "Killed stale tunnel process $OLD_PID"
- else
- debug_log "PID file exists but process $OLD_PID is not running anymore"
- fi
- # Remove the stale PID file
- rm -f "$PID_FILE"
- fi
-
- # Generate the SSH command
- generate_ssh_command
-
- debug_log "Starting SSH tunnel: ${SSH_CMD[*]}"
-
- # Start the tunnel in foreground and wait for it to establish
- "${SSH_CMD[@]}" >> "$LOG_FILE" 2>&1 &
- TUNNEL_PID=$!
-
- # Save PID
- echo $TUNNEL_PID > "$PID_FILE"
- debug_log "Tunnel started with PID $TUNNEL_PID"
-
- # Wait for tunnel to establish
- tunnel_up=0
- for i in {1..20}; do
- if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
- debug_log "Tunnel established successfully on port $PORT"
- tunnel_up=1
- break
- fi
- sleep 0.2
- done
-
- # Clean up lock file
- release_lock
-
- # Check if the tunnel process is still running
- if ! kill -0 $TUNNEL_PID 2>/dev/null; then
- debug_log "ERROR: Tunnel process exited unexpectedly! Check logs for details"
- if [[ -f "$PID_FILE" ]]; then
- rm -f "$PID_FILE"
- fi
- # Return error in case of tunnel failure
- echo "Failed to establish SSH tunnel. See $TUNNEL_DIR/$CONTEXT-tunnel.log for details." >&2
- exit 1
- elif [[ $tunnel_up -eq 0 ]]; then
- debug_log "WARNING: Tunnel process is running but port $PORT is not responding"
- fi
-
- # Return valid credential format with certificates if available
- generate_credentials_json
- exit 0
+ # TODO (kyuds): remove this script after v0.13.0. Kept here for backwards compat.
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ exec "$SCRIPT_DIR/../../ssh_node_pools/deploy/tunnel/ssh-tunnel.sh" "$@"
sky/utils/plugin_extensions/__init__.py ADDED
@@ -0,0 +1,14 @@
+ """Plugin extensions module.
+
+ This module provides extension points that plugins can hook into to provide
+ custom functionality.
+ """
+ from sky.utils.plugin_extensions.external_failure_source import (
+     ExternalClusterFailure)
+ from sky.utils.plugin_extensions.external_failure_source import (
+     ExternalFailureSource)
+
+ __all__ = [
+     'ExternalClusterFailure',
+     'ExternalFailureSource',
+ ]
sky/utils/plugin_extensions/external_failure_source.py ADDED
@@ -0,0 +1,176 @@
+ """External failure source interface for plugins.
+
+ This module provides an extension point that allows plugins to provide
+ cluster failure tracking functionality. By default, no-op implementations
+ are used. Plugins can register their own implementations to provide actual
+ failure tracking.
+
+ Example usage in a plugin:
+     from sky.utils.plugin_extensions import ExternalFailureSource
+
+     # Register custom failure source
+     ExternalFailureSource.register(
+         get_failures=my_get_cluster_failures,
+         clear_failures=my_clear_cluster_failures,
+     )
+
+ Example usage in core SkyPilot:
+     from sky.utils.plugin_extensions import ExternalFailureSource
+
+     # Get failures for a cluster
+     failures = ExternalFailureSource.get(cluster_hash='abc123')
+
+     # Clear failures for a cluster
+     cleared = ExternalFailureSource.clear(cluster_name='my-cluster')
+ """
+ import dataclasses
+ from typing import Any, Dict, List, Optional, Protocol
+
+ from sky import sky_logging
+
+ logger = sky_logging.init_logger(__name__)
+
+
+ @dataclasses.dataclass
+ class ExternalClusterFailure:
+     """Represents a single cluster failure from an external source.
+
+     Attributes:
+         code: Machine-readable failure code (e.g. 'GPU_HARDWARE_FAILURE_XID_79')
+         reason: Human-readable description of the failure.
+     """
+     code: str
+     reason: str
+
+     @classmethod
+     def from_failure_list(
+             cls, failures: List[Dict[str,
+                                      Any]]) -> List['ExternalClusterFailure']:
+         """Create a list of ExternalClusterFailure from failure dicts.
+
+         Args:
+             failures: List of dicts with 'failure_mode' and 'failure_reason'
+                 keys (as returned by ExternalFailureSource.get()).
+
+         Returns:
+             List of ExternalClusterFailure objects, one per failure.
+         """
+         return [
+             cls(code=f['failure_mode'], reason=f['failure_reason'])
+             for f in failures
+         ]
+
+
+ # Protocol definitions for the failure source functions
+ class GetClusterFailuresFunc(Protocol):
+     """Protocol for get_cluster_failures function."""
+
+     def __call__(self,
+                  cluster_hash: Optional[str] = None,
+                  cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+         ...
+
+
+ class ClearClusterFailuresFunc(Protocol):
+     """Protocol for clear_cluster_failures function."""
+
+     def __call__(self,
+                  cluster_hash: Optional[str] = None,
+                  cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+         ...
+
+
+ class ExternalFailureSource:
+     """Singleton class for external cluster failure source.
+
+     This class provides an extension point for plugins to register their own
+     cluster failure tracking implementations. By default, no-op implementations
+     are used that return empty lists.
+
+     Plugins can register their implementations during their install() phase,
+     and core SkyPilot code can use the get() and clear() methods to interact
+     with cluster failures without knowing which plugin (if any) is providing
+     the implementation.
+     """
+
+     _get_func: Optional[GetClusterFailuresFunc] = None
+     _clear_func: Optional[ClearClusterFailuresFunc] = None
+
+     @classmethod
+     def register(cls, get_failures: GetClusterFailuresFunc,
+                  clear_failures: ClearClusterFailuresFunc) -> None:
+         """Register an external failure source implementation.
+
+         This allows plugins to provide their own cluster failure tracking.
+         Only one external failure source can be registered at a time.
+
+         Args:
+             get_failures: Function to get active cluster failures.
+                 Signature: (cluster_hash: Optional[str],
+                             cluster_name: Optional[str])
+                            -> List[Dict[str, Any]]
+                 Returns list of dicts with keys: cluster_hash, failure_mode,
+                 failure_reason, cleared_at.
+             clear_failures: Function to clear cluster failures.
+                 Signature: (cluster_hash: Optional[str],
+                             cluster_name: Optional[str])
+                            -> List[Dict[str, Any]]
+                 Returns list of dicts of the failures that were cleared.
+         """
+         cls._get_func = get_failures
+         cls._clear_func = clear_failures
+         logger.info('Registered external failure source')
+
+     @classmethod
+     def is_registered(cls) -> bool:
+         """Check if an external failure source is registered."""
+         return cls._get_func is not None and cls._clear_func is not None
+
+     @classmethod
+     def get(cls,
+             cluster_hash: Optional[str] = None,
+             cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+         """Get active cluster failures from the registered failure source.
+
+         Args:
+             cluster_hash: Hash of the cluster to query failures for.
+             cluster_name: Name of the cluster to query failures for.
+
+         Returns:
+             List of dictionaries containing failure records.
+             Each dict contains: cluster_hash, failure_mode, failure_reason,
+             cleared_at. Returns empty list if no failure source is registered.
+         """
+         if cls._get_func is None:
+             return []
+         try:
+             # pylint: disable=not-callable
+             return cls._get_func(cluster_name=cluster_name,
+                                  cluster_hash=cluster_hash)
+         except Exception as e:  # pylint: disable=broad-except
+             logger.warning(f'Failed to get cluster failures: {e}')
+             return []
+
+     @classmethod
+     def clear(cls,
+               cluster_hash: Optional[str] = None,
+               cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+         """Clear cluster failures via the registered failure source.
+
+         Args:
+             cluster_hash: Hash of the cluster to clear failures for.
+             cluster_name: Name of the cluster to clear failures for.
+
+         Returns:
+             List of dictionaries containing the failure records that were
+             cleared. Returns empty list if no failure source is registered.
+         """
+         if cls._clear_func is None:
+             return []
+         try:
+             # pylint: disable=not-callable
+             return cls._clear_func(cluster_name=cluster_name,
+                                    cluster_hash=cluster_hash)
+         except Exception as e:  # pylint: disable=broad-except
+             logger.warning(f'Failed to clear cluster failures: {e}')
+             return []
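To make the extension point concrete, here is a hedged sketch of a plugin wiring an in-memory store into `ExternalFailureSource`. It follows the signatures and dict keys documented above; `_FAILURES`, `my_get_cluster_failures`, and `my_clear_cluster_failures` are illustrative names, not SkyPilot APIs.

# Sketch: a plugin-side failure source backed by an in-memory list.
from typing import Any, Dict, List, Optional

from sky.utils.plugin_extensions import (ExternalClusterFailure,
                                         ExternalFailureSource)

_FAILURES: List[Dict[str, Any]] = [{
    'cluster_hash': 'abc123',
    'failure_mode': 'GPU_HARDWARE_FAILURE_XID_79',
    'failure_reason': 'GPU fell off the bus',
    'cleared_at': None,
}]

def my_get_cluster_failures(
        cluster_hash: Optional[str] = None,
        cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
    # Return active (uncleared) failures; filters only by hash for brevity.
    return [
        f for f in _FAILURES if f['cleared_at'] is None and
        (cluster_hash is None or f['cluster_hash'] == cluster_hash)
    ]

def my_clear_cluster_failures(
        cluster_hash: Optional[str] = None,
        cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
    cleared = my_get_cluster_failures(cluster_hash, cluster_name)
    for f in cleared:
        f['cleared_at'] = 'now'  # a real plugin would store a timestamp
    return cleared

ExternalFailureSource.register(get_failures=my_get_cluster_failures,
                               clear_failures=my_clear_cluster_failures)
assert ExternalFailureSource.is_registered()

# Core-side usage: fetch dicts, then convert to typed failure objects.
failures = ExternalFailureSource.get(cluster_hash='abc123')
print(ExternalClusterFailure.from_failure_list(failures))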
sky/utils/resources_utils.py CHANGED
@@ -183,7 +183,8 @@ def simplify_ports(ports: List[str]) -> List[str]:
  def format_resource(resource: 'resources_lib.Resources',
                      simplified_only: bool = False) -> Tuple[str, Optional[str]]:
      resource = resource.assert_launchable()
-     is_k8s = str(resource.cloud).lower() == 'kubernetes'
+     is_k8s = resource.cloud.canonical_name() == 'kubernetes'
+     vcpu, mem = None, None
      if resource.accelerators is None or is_k8s or not simplified_only:
          vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
              resource.instance_type)
@@ -198,18 +199,19 @@ def format_resource(resource: 'resources_lib.Resources',
 
      if (resource.accelerators is None or is_k8s):
          if vcpu is not None:
-             elements_simple.append(f'cpus={int(vcpu)}')
-             elements_full.append(f'cpus={int(vcpu)}')
+             elements_simple.append(f'cpus={common_utils.format_float(vcpu)}')
+             elements_full.append(f'cpus={common_utils.format_float(vcpu)}')
          if mem is not None:
-             elements_simple.append(f'mem={int(mem)}')
-             elements_full.append(f'mem={int(mem)}')
+             elements_simple.append(f'mem={common_utils.format_float(mem)}')
+             elements_full.append(f'mem={common_utils.format_float(mem)}')
      elif not simplified_only:
          if vcpu is not None:
-             elements_full.append(f'cpus={int(vcpu)}')
+             elements_full.append(f'cpus={common_utils.format_float(vcpu)}')
          if mem is not None:
-             elements_full.append(f'mem={int(mem)}')
+             elements_full.append(f'mem={common_utils.format_float(mem)}')
 
-     if not is_k8s:
+     is_slurm = resource.cloud.canonical_name() == 'slurm'
+     if not is_k8s and not is_slurm:
          instance_type_full = resource.instance_type
          instance_type_simple = common_utils.truncate_long_string(
              instance_type_full, 15)
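The `int()` to `common_utils.format_float()` switch matters because Kubernetes (and now Slurm) resources can be fractional: `int(0.5)` renders a half-CPU request as `cpus=0`. A minimal sketch of the idea, with a stand-in `format_float` whose exact behavior we assume (the real helper lives in `sky.utils.common_utils`):

# Stand-in for common_utils.format_float; behavior assumed, not verified.
def format_float(num: float) -> str:
    # Render whole numbers without a trailing '.0'; keep fractions intact.
    return str(int(num)) if float(num).is_integer() else str(num)

vcpu = 0.5  # e.g. a fractional CPU request on Kubernetes
print(f'cpus={int(vcpu)}')           # cpus=0   (old behavior: misleading)
print(f'cpus={format_float(vcpu)}')  # cpus=0.5 (new behavior)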
sky/utils/rich_utils.py CHANGED
@@ -362,14 +362,14 @@ def decode_rich_status(
          # Replace `\r\n` with `\n`, as printing a line ends with
          # `\r\n` in linux will cause the line to be empty.
          line = line[:-2] + '\n'
-         is_payload, line = message_utils.decode_payload(
+         is_payload, decoded_line = message_utils.decode_payload(
              line, raise_for_mismatch=False)
-         control = None
-         if is_payload:
-             control, encoded_status = Control.decode(line)
-         if control is None:
+         if not is_payload:
              yield line
              continue
+         control, encoded_status = Control.decode(decoded_line)
+         if control is None:
+             continue
 
          if control == Control.RETRY:
              raise exceptions.RequestInterruptedError(
@@ -481,15 +481,13 @@ async def decode_rich_status_async(
          # Replace `\r\n` with `\n`, as printing a line ends with
          # `\r\n` in linux will cause the line to be empty.
          line = line[:-2] + '\n'
-         is_payload, line = message_utils.decode_payload(
+         is_payload, decoded_line = message_utils.decode_payload(
              line, raise_for_mismatch=False)
-         if line is None:
+         if not is_payload:
+             yield line
              continue
-         control = None
-         if is_payload:
-             control, encoded_status = Control.decode(line)
+         control, encoded_status = Control.decode(decoded_line)
          if control is None:
-             yield line
              continue
 
          if control == Control.RETRY:
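This change keeps the raw line intact by binding the decoded result to `decoded_line`, so non-payload lines are passed through unchanged, while payload lines that carry no recognized control message are now dropped instead of leaking into the output (the async variant also loses its dead `line is None` check). A simplified sketch of the corrected control flow, using toy stand-ins for `message_utils.decode_payload` and `Control.decode` (the payload marker below is illustrative, not the real one):

# Sketch of the fixed decode loop with stand-in decoders.
from typing import Iterable, Iterator, Optional, Tuple

PAYLOAD_PREFIX = '<sky-payload>'  # illustrative marker

def decode_payload(line: str) -> Tuple[bool, str]:
    # Returns (is_payload, decoded_line); non-payload lines pass through.
    if line.startswith(PAYLOAD_PREFIX):
        return True, line[len(PAYLOAD_PREFIX):]
    return False, line

def decode_control(payload: str) -> Tuple[Optional[str], str]:
    # Returns (control, encoded_status); control is None when the payload
    # carries no recognized control message.
    if payload.startswith('RETRY'):
        return 'RETRY', payload
    return None, payload

def decode_stream(lines: Iterable[str]) -> Iterator[str]:
    for line in lines:
        is_payload, decoded_line = decode_payload(line)
        if not is_payload:
            yield line  # plain output: pass the raw line through
            continue
        control, _status = decode_control(decoded_line)
        if control is None:
            continue  # payload without a control message: swallow it
        # ... handle control messages (e.g. RETRY) here ...

# Only the plain line survives; the unrecognized payload is swallowed.
print(list(decode_stream(['hello\n', '<sky-payload>STATUS x\n'])))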