skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,49 @@
1
+ """Utilities for server-side interactive SSH functionality."""
2
+ import array
3
+ import socket
4
+
5
+
6
def get_pty_socket_path(session_id: str) -> str:
    """Return the Unix socket path used for PTY file descriptor passing.

    Args:
        session_id: Identifier of the interactive session. It is embedded
            verbatim in the socket file name.

    Returns:
        A path of the form ``/tmp/sky_pty_<session_id>.sock``.
    """
    socket_file_name = f'sky_pty_{session_id}.sock'
    return '/tmp/' + socket_file_name
9
+
10
+
11
def send_fd(sock: socket.socket, fd: int) -> None:
    """Pass an open file descriptor to the peer of a Unix socket.

    The descriptor travels as SCM_RIGHTS ancillary data, which lets one
    process hand a set of open file descriptors to another process.

    See:
        https://man7.org/linux/man-pages/man7/unix.7.html
        https://man7.org/linux/man-pages/man3/cmsg.3.html

    Args:
        sock: Connected Unix socket.
        fd: File descriptor to send.
    """
    packed_fd = array.array('i', [fd])
    ancillary = [(socket.SOL_SOCKET, socket.SCM_RIGHTS, packed_fd)]
    # A single dummy byte is sent as the regular payload; the descriptor
    # itself rides in the ancillary data.
    dummy_payload = [b'x']
    sock.sendmsg(dummy_payload, ancillary)
28
+
29
+
30
def recv_fd(sock: socket.socket) -> int:
    """Receive a file descriptor via a Unix socket using SCM_RIGHTS.

    Reads one byte of regular data together with its ancillary data and
    returns the first file descriptor found in an SCM_RIGHTS control
    message. Control messages of any other level/type are ignored, and a
    truncated descriptor payload is treated as "nothing received" instead
    of surfacing as a ValueError/IndexError.

    Args:
        sock: Connected Unix socket.

    Returns:
        Received file descriptor.

    Raises:
        RuntimeError: If no file descriptor was received.
    """
    received_fds = array.array('i')
    # NOTE: recvmsg() has no async equivalent
    _, ancdata, _, _ = sock.recvmsg(
        1, socket.CMSG_SPACE(received_fds.itemsize))
    for cmsg_level, cmsg_type, cmsg_data in ancdata:
        if (cmsg_level != socket.SOL_SOCKET or
                cmsg_type != socket.SCM_RIGHTS):
            # Not a descriptor-passing message; skip it.
            continue
        # Drop any trailing partial item so frombytes() cannot raise on a
        # truncated control message.
        usable_len = len(cmsg_data) - (len(cmsg_data) %
                                       received_fds.itemsize)
        received_fds.frombytes(cmsg_data[:usable_len])
        if received_fds:
            return received_fds[0]
    raise RuntimeError('No file descriptor received - '
                       'sender may have closed connection')
@@ -12,20 +12,20 @@
12
12
  # * Specify SKYPILOT_NAMESPACE env var to override the default namespace where the service account is created.
13
13
  # * Specify SKYPILOT_SA_NAME env var to override the default service account name.
14
14
  # * Specify SKIP_SA_CREATION=1 to skip creating the service account and use an existing one
15
- # * Specify SUPER_USER=1 to create a service account with cluster-admin permissions
15
+ # * Specify SUPER_USER=0 to create a service account with minimal permissions
16
16
  #
17
17
  # Usage:
18
- # # Create "sky-sa" service account with minimal permissions in "default" namespace and generate kubeconfig
18
+ # # Create "sky-sa" service account in "default" namespace and generate kubeconfig
19
19
  # $ ./generate_kubeconfig.sh
20
20
  #
21
- # # Create "my-sa" service account with minimal permissions in "my-namespace" namespace and generate kubeconfig
21
+ # # Create "my-sa" service account in "my-namespace" namespace and generate kubeconfig
22
22
  # $ SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh
23
23
  #
24
24
  # # Use an existing service account "my-sa" in "my-namespace" namespace and generate kubeconfig
25
25
  # $ SKIP_SA_CREATION=1 SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh
26
26
  #
27
- # # Create "sky-sa" service account with cluster-admin permissions in "default" namespace
28
- # $ SUPER_USER=1 ./generate_kubeconfig.sh
27
+ # # Create "sky-sa" service account with minimal permissions in "default" namespace (manual setup may be required)
28
+ # $ SUPER_USER=0 ./generate_kubeconfig.sh
29
29
 
30
30
  set -eu -o pipefail
31
31
 
@@ -33,11 +33,18 @@ set -eu -o pipefail
33
33
  # use default.
34
34
  SKYPILOT_SA=${SKYPILOT_SA_NAME:-sky-sa}
35
35
  NAMESPACE=${SKYPILOT_NAMESPACE:-default}
36
- SUPER_USER=${SUPER_USER:-0}
36
+ SUPER_USER=${SUPER_USER:-1}
37
37
 
38
- echo "Service account: ${SKYPILOT_SA}"
39
- echo "Namespace: ${NAMESPACE}"
40
- echo "Super user permissions: ${SUPER_USER}"
38
+ echo "=========================================="
39
+ echo "SkyPilot Kubeconfig Generation"
40
+ echo "=========================================="
41
+ echo "Service Account: ${SKYPILOT_SA}"
42
+ echo "Namespace: ${NAMESPACE}"
43
+ if [ "${SUPER_USER}" != "1" ]; then
44
+ echo "Permissions: Minimal (manual setup may be required)"
45
+ SUPER_USER=0
46
+ fi
47
+ echo ""
41
48
 
42
49
  # Set OS specific values.
43
50
  if [[ "$OSTYPE" == "linux-gnu" ]]; then
@@ -53,7 +60,7 @@ fi
53
60
 
54
61
  # If the user has set SKIP_SA_CREATION=1, skip creating the service account.
55
62
  if [ -z ${SKIP_SA_CREATION+x} ]; then
56
- echo "Creating the Kubernetes Service Account with ${SUPER_USER:+super user}${SUPER_USER:-minimal} RBAC permissions."
63
+ echo "[1/3] Creating Kubernetes Service Account and RBAC permissions..."
57
64
  if [ "${SUPER_USER}" = "1" ]; then
58
65
  # Create service account with cluster-admin permissions
59
66
  kubectl apply -f - <<EOF
@@ -219,7 +226,8 @@ roleRef:
219
226
  EOF
220
227
  fi
221
228
  # Apply optional ingress-related roles, but don't make the script fail if it fails
222
- kubectl apply -f - <<EOF || echo "Failed to apply optional ingress-related roles. Nginx ingress is likely not installed. This is not critical and the script will continue."
229
+ echo " Applying optional ingress permissions (skipped if ingress-nginx not installed)..."
230
+ kubectl apply -f - 2>/dev/null <<EOF || true
223
231
  # Optional: Role for accessing ingress resources
224
232
  apiVersion: rbac.authorization.k8s.io/v1
225
233
  kind: Role
@@ -253,8 +261,13 @@ roleRef:
253
261
  name: ${SKYPILOT_SA}-role-ingress-nginx # Use the same name as the role at line 119
254
262
  apiGroup: rbac.authorization.k8s.io
255
263
  EOF
264
+ else
265
+ echo "[1/3] Skipping service account creation (using existing account)..."
256
266
  fi
257
267
 
268
+ echo ""
269
+ echo "[2/3] Creating service account token..."
270
+
258
271
  # Checks if secret entry was defined for Service account. If defined it means that Kubernetes server has a
259
272
  # version below 1.24, otherwise one must manually create the secret and bind it to the Service account to have a non-expiring token.
260
273
  # After Kubernetes v1.24 Service accounts no longer generate automatic tokens/secrets.
@@ -293,7 +306,9 @@ CURRENT_CONTEXT=$(kubectl config current-context)
293
306
  CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"${CURRENT_CONTEXT}\"})].context.cluster}")
294
307
  CURRENT_CLUSTER_ADDR=$(kubectl config view -o jsonpath="{.clusters[?(@.name == \"${CURRENT_CLUSTER}\"})].cluster.server}")
295
308
 
296
- echo "Writing kubeconfig."
309
+ echo ""
310
+ echo "[3/3] Generating kubeconfig file..."
311
+
297
312
  cat > kubeconfig <<EOF
298
313
  apiVersion: v1
299
314
  clusters:
@@ -316,24 +331,18 @@ users:
316
331
  token: ${SA_TOKEN}
317
332
  EOF
318
333
 
319
- echo "---
320
- Done!
321
-
322
- Kubeconfig using service account '${SKYPILOT_SA}' in namespace '${NAMESPACE}' written at $(pwd)/kubeconfig
323
-
324
- Copy the generated kubeconfig file to your ~/.kube/ directory to use it with
325
- kubectl and skypilot:
326
-
327
- # Backup your existing kubeconfig file
328
- mv ~/.kube/config ~/.kube/config.bak
329
- cp kubeconfig ~/.kube/config
330
-
331
- # Verify that you can access the cluster
332
- kubectl get pods
333
-
334
- Also add this to your ~/.sky/config.yaml to use the new service account:
335
-
336
- # ~/.sky/config.yaml
337
- kubernetes:
338
- remote_identity: ${SKYPILOT_SA}
339
- "
334
+ echo ""
335
+ echo "=========================================="
336
+ echo "✓ SUCCESS!"
337
+ echo "=========================================="
338
+ echo ""
339
+ echo "Kubeconfig file created successfully!"
340
+ echo ""
341
+ echo " Service Account: ${SKYPILOT_SA}"
342
+ echo " Namespace: ${NAMESPACE}"
343
+ echo " Location: $(pwd)/kubeconfig"
344
+ echo ""
345
+ echo "Next steps:"
346
+ echo " Refer to this page for setting up the credential for remote API server:"
347
+ echo " https://docs.skypilot.co/en/latest/reference/api-server/api-server-admin-deploy.html#optional-configure-cloud-accounts"
348
+ echo ""
@@ -60,4 +60,8 @@ fi
60
60
  # We wrap the command in a bash script that waits for rsync, then execs the original command.
61
61
  # Timeout after MAX_WAIT_TIME_SECONDS seconds.
62
62
  MAX_WAIT_TIME_SECONDS=300
63
- eval "${kubectl_cmd_base% --} -i -- bash -c 'count=0; max_count=$MAX_WAIT_TIME_SECONDS*2; until which rsync >/dev/null 2>&1; do if [ \$count -ge \$max_count ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""
63
+ MAX_WAIT_COUNT=$((MAX_WAIT_TIME_SECONDS * 2))
64
+ # Use --norc --noprofile to prevent bash from sourcing startup files that might
65
+ # output to stdout and corrupt the rsync protocol. All debug output must go to
66
+ # stderr (>&2) to keep stdout clean for rsync communication.
67
+ eval "${kubectl_cmd_base% --} -i -- bash --norc --noprofile -c 'count=0; until which rsync >/dev/null 2>&1; do if [ \$count -ge $MAX_WAIT_COUNT ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""
@@ -0,0 +1,14 @@
1
+ """Plugin extensions module.
2
+
3
+ This module provides extension points that plugins can hook into to provide
4
+ custom functionality.
5
+ """
6
+ from sky.utils.plugin_extensions.external_failure_source import (
7
+ ExternalClusterFailure)
8
+ from sky.utils.plugin_extensions.external_failure_source import (
9
+ ExternalFailureSource)
10
+
11
+ __all__ = [
12
+ 'ExternalClusterFailure',
13
+ 'ExternalFailureSource',
14
+ ]
@@ -0,0 +1,176 @@
1
+ """External failure source interface for plugins.
2
+
3
+ This module provides an extension point that allows plugins to provide
4
+ cluster failure tracking functionality. By default, no-op implementations
5
+ are used. Plugins can register their own implementations to provide actual
6
+ failure tracking.
7
+
8
+ Example usage in a plugin:
9
+ from sky.utils.plugin_extensions import ExternalFailureSource
10
+
11
+ # Register custom failure source
12
+ ExternalFailureSource.register(
13
+ get_failures=my_get_cluster_failures,
14
+ clear_failures=my_clear_cluster_failures,
15
+ )
16
+
17
+ Example usage in core SkyPilot:
18
+ from sky.utils.plugin_extensions import ExternalFailureSource
19
+
20
+ # Get failures for a cluster
21
+ failures = ExternalFailureSource.get(cluster_hash='abc123')
22
+
23
+ # Clear failures for a cluster
24
+ cleared = ExternalFailureSource.clear(cluster_name='my-cluster')
25
+ """
26
+ import dataclasses
27
+ from typing import Any, Dict, List, Optional, Protocol
28
+
29
+ from sky import sky_logging
30
+
31
+ logger = sky_logging.init_logger(__name__)
32
+
33
+
34
@dataclasses.dataclass
class ExternalClusterFailure:
    """A single cluster failure reported by an external source.

    Attributes:
        code: Machine-readable failure code
            (e.g. 'GPU_HARDWARE_FAILURE_XID_79').
        reason: Human-readable description of the failure.
    """
    code: str
    reason: str

    @classmethod
    def from_failure_list(
            cls, failures: List[Dict[str,
                                     Any]]) -> List['ExternalClusterFailure']:
        """Convert failure dicts into ExternalClusterFailure objects.

        Args:
            failures: List of dicts, each carrying a 'failure_mode' and a
                'failure_reason' key (the shape returned by
                ExternalFailureSource.get()).

        Returns:
            One ExternalClusterFailure per input dict, in the same order.
        """
        converted: List['ExternalClusterFailure'] = []
        for record in failures:
            # 'failure_mode' maps to `code`, 'failure_reason' to `reason`.
            converted.append(
                cls(code=record['failure_mode'],
                    reason=record['failure_reason']))
        return converted
62
+
63
+
64
+ # Protocol definitions for the failure source functions
65
class GetClusterFailuresFunc(Protocol):
    """Structural type of a callable that fetches cluster failures.

    Conforming callables accept optional ``cluster_hash`` and
    ``cluster_name`` arguments and return a list of failure dicts.
    """

    def __call__(
        self,
        cluster_hash: Optional[str] = None,
        cluster_name: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        ...
72
+
73
+
74
class ClearClusterFailuresFunc(Protocol):
    """Structural type of a callable that clears cluster failures.

    Conforming callables accept optional ``cluster_hash`` and
    ``cluster_name`` arguments and return a list of failure dicts.
    """

    def __call__(
        self,
        cluster_hash: Optional[str] = None,
        cluster_name: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        ...
81
+
82
+
83
class ExternalFailureSource:
    """Class-level registry for an external cluster failure source.

    Plugins use register() to install their own failure-tracking callables
    (typically during their install() phase). Core SkyPilot code then calls
    get() and clear() without needing to know which plugin, if any, supplies
    the implementation. While nothing is registered, get() and clear() act
    as no-ops that return empty lists.
    """

    # Registered callables; None until a plugin calls register().
    _get_func: Optional[GetClusterFailuresFunc] = None
    _clear_func: Optional[ClearClusterFailuresFunc] = None

    @classmethod
    def register(cls, get_failures: GetClusterFailuresFunc,
                 clear_failures: ClearClusterFailuresFunc) -> None:
        """Install an external failure source implementation.

        Only one external failure source is held at a time; a later call
        replaces whatever was registered before.

        Args:
            get_failures: Callable returning the active cluster failures.
                Signature: (cluster_hash: Optional[str],
                            cluster_name: Optional[str])
                           -> List[Dict[str, Any]]
                Returns list of dicts with keys: cluster_hash, failure_mode,
                failure_reason, cleared_at.
            clear_failures: Callable clearing cluster failures.
                Signature: (cluster_hash: Optional[str],
                            cluster_name: Optional[str])
                           -> List[Dict[str, Any]]
                Returns list of dicts of the failures that were cleared.
        """
        cls._get_func, cls._clear_func = get_failures, clear_failures
        logger.info('Registered external failure source')

    @classmethod
    def is_registered(cls) -> bool:
        """Return True when both the get and clear callables are set."""
        return not (cls._get_func is None or cls._clear_func is None)

    @classmethod
    def get(cls,
            cluster_hash: Optional[str] = None,
            cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Fetch active cluster failures from the registered source.

        Args:
            cluster_hash: Hash of the cluster to query failures for.
            cluster_name: Name of the cluster to query failures for.

        Returns:
            List of dictionaries containing failure records. Each dict
            contains: cluster_hash, failure_mode, failure_reason,
            cleared_at. An empty list is returned when no failure source is
            registered or when the registered callable raises.
        """
        getter = cls._get_func
        if getter is None:
            return []
        try:
            failures = getter(cluster_name=cluster_name,
                              cluster_hash=cluster_hash)
        except Exception as exc:  # pylint: disable=broad-except
            # Best effort: a faulty plugin must not break the caller.
            logger.warning(f'Failed to get cluster failures: {exc}')
            return []
        return failures

    @classmethod
    def clear(cls,
              cluster_hash: Optional[str] = None,
              cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Clear cluster failures through the registered source.

        Args:
            cluster_hash: Hash of the cluster to clear failures for.
            cluster_name: Name of the cluster to clear failures for.

        Returns:
            List of dictionaries containing the failure records that were
            cleared. An empty list is returned when no failure source is
            registered or when the registered callable raises.
        """
        clearer = cls._clear_func
        if clearer is None:
            return []
        try:
            cleared = clearer(cluster_name=cluster_name,
                              cluster_hash=cluster_hash)
        except Exception as exc:  # pylint: disable=broad-except
            # Best effort: a faulty plugin must not break the caller.
            logger.warning(f'Failed to clear cluster failures: {exc}')
            return []
        return cleared
@@ -183,7 +183,8 @@ def simplify_ports(ports: List[str]) -> List[str]:
183
183
  def format_resource(resource: 'resources_lib.Resources',
184
184
  simplified_only: bool = False) -> Tuple[str, Optional[str]]:
185
185
  resource = resource.assert_launchable()
186
- is_k8s = str(resource.cloud).lower() == 'kubernetes'
186
+ is_k8s = resource.cloud.canonical_name() == 'kubernetes'
187
+ vcpu, mem = None, None
187
188
  if resource.accelerators is None or is_k8s or not simplified_only:
188
189
  vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
189
190
  resource.instance_type)
@@ -198,18 +199,19 @@ def format_resource(resource: 'resources_lib.Resources',
198
199
 
199
200
  if (resource.accelerators is None or is_k8s):
200
201
  if vcpu is not None:
201
- elements_simple.append(f'cpus={int(vcpu)}')
202
- elements_full.append(f'cpus={int(vcpu)}')
202
+ elements_simple.append(f'cpus={common_utils.format_float(vcpu)}')
203
+ elements_full.append(f'cpus={common_utils.format_float(vcpu)}')
203
204
  if mem is not None:
204
- elements_simple.append(f'mem={int(mem)}')
205
- elements_full.append(f'mem={int(mem)}')
205
+ elements_simple.append(f'mem={common_utils.format_float(mem)}')
206
+ elements_full.append(f'mem={common_utils.format_float(mem)}')
206
207
  elif not simplified_only:
207
208
  if vcpu is not None:
208
- elements_full.append(f'cpus={int(vcpu)}')
209
+ elements_full.append(f'cpus={common_utils.format_float(vcpu)}')
209
210
  if mem is not None:
210
- elements_full.append(f'mem={int(mem)}')
211
+ elements_full.append(f'mem={common_utils.format_float(mem)}')
211
212
 
212
- if not is_k8s:
213
+ is_slurm = resource.cloud.canonical_name() == 'slurm'
214
+ if not is_k8s and not is_slurm:
213
215
  instance_type_full = resource.instance_type
214
216
  instance_type_simple = common_utils.truncate_long_string(
215
217
  instance_type_full, 15)
sky/utils/rich_utils.py CHANGED
@@ -362,14 +362,14 @@ def decode_rich_status(
362
362
  # Replace `\r\n` with `\n`, as printing a line ends with
363
363
  # `\r\n` in linux will cause the line to be empty.
364
364
  line = line[:-2] + '\n'
365
- is_payload, line = message_utils.decode_payload(
365
+ is_payload, decoded_line = message_utils.decode_payload(
366
366
  line, raise_for_mismatch=False)
367
- control = None
368
- if is_payload:
369
- control, encoded_status = Control.decode(line)
370
- if control is None:
367
+ if not is_payload:
371
368
  yield line
372
369
  continue
370
+ control, encoded_status = Control.decode(decoded_line)
371
+ if control is None:
372
+ continue
373
373
 
374
374
  if control == Control.RETRY:
375
375
  raise exceptions.RequestInterruptedError(
@@ -481,15 +481,13 @@ async def decode_rich_status_async(
481
481
  # Replace `\r\n` with `\n`, as printing a line ends with
482
482
  # `\r\n` in linux will cause the line to be empty.
483
483
  line = line[:-2] + '\n'
484
- is_payload, line = message_utils.decode_payload(
484
+ is_payload, decoded_line = message_utils.decode_payload(
485
485
  line, raise_for_mismatch=False)
486
- if line is None:
486
+ if not is_payload:
487
+ yield line
487
488
  continue
488
- control = None
489
- if is_payload:
490
- control, encoded_status = Control.decode(line)
489
+ control, encoded_status = Control.decode(decoded_line)
491
490
  if control is None:
492
- yield line
493
491
  continue
494
492
 
495
493
  if control == Control.RETRY:
sky/utils/schemas.py CHANGED
@@ -208,26 +208,49 @@ def _get_single_resources_schema():
208
208
  },
209
209
  'job_recovery': {
210
210
  # Either a string or a dict.
211
- 'anyOf': [{
212
- 'type': 'string',
213
- }, {
214
- 'type': 'object',
215
- 'required': [],
216
- 'additionalProperties': False,
217
- 'properties': {
218
- 'strategy': {
219
- 'anyOf': [{
220
- 'type': 'string',
221
- }, {
222
- 'type': 'null',
223
- }],
224
- },
225
- 'max_restarts_on_errors': {
226
- 'type': 'integer',
227
- 'minimum': 0,
228
- },
211
+ 'anyOf': [
212
+ {
213
+ 'type': 'string',
214
+ },
215
+ {
216
+ 'type': 'object',
217
+ 'required': [],
218
+ 'additionalProperties': False,
219
+ 'properties': {
220
+ 'strategy': {
221
+ 'anyOf': [{
222
+ 'type': 'string',
223
+ }, {
224
+ 'type': 'null',
225
+ }],
226
+ },
227
+ 'max_restarts_on_errors': {
228
+ 'type': 'integer',
229
+ 'minimum': 0,
230
+ },
231
+ 'recover_on_exit_codes': {
232
+ 'anyOf': [
233
+ {
234
+ # Single exit code
235
+ 'type': 'integer',
236
+ 'minimum': 0,
237
+ 'maximum': 255,
238
+ },
239
+ {
240
+ # List of exit codes
241
+ 'type': 'array',
242
+ 'items': {
243
+ 'type': 'integer',
244
+ 'minimum': 0,
245
+ 'maximum': 255,
246
+ },
247
+ 'uniqueItems': True,
248
+ },
249
+ ],
250
+ },
251
+ }
229
252
  }
230
- }],
253
+ ],
231
254
  },
232
255
  'volumes': {
233
256
  'type': 'array',
@@ -1461,7 +1484,7 @@ def get_config_schema():
1461
1484
  'required': [],
1462
1485
  'additionalProperties': False,
1463
1486
  'properties': {
1464
- 'secure_only': {
1487
+ 'datacenter_only': {
1465
1488
  'type': 'boolean',
1466
1489
  },
1467
1490
  }
@@ -1845,6 +1868,25 @@ def get_config_schema():
1845
1868
  config['properties'].update(_REMOTE_IDENTITY_SCHEMA_KUBERNETES)
1846
1869
  else:
1847
1870
  config['properties'].update(_REMOTE_IDENTITY_SCHEMA)
1871
+
1872
+ data_schema = {
1873
+ 'type': 'object',
1874
+ 'required': [],
1875
+ 'additionalProperties': False,
1876
+ 'properties': {
1877
+ 'mount_cached': {
1878
+ 'type': 'object',
1879
+ 'required': [],
1880
+ 'additionalProperties': False,
1881
+ 'properties': {
1882
+ 'sequential_upload': {
1883
+ 'type': 'boolean',
1884
+ },
1885
+ },
1886
+ },
1887
+ },
1888
+ }
1889
+
1848
1890
  return {
1849
1891
  '$schema': 'https://json-schema.org/draft/2020-12/schema',
1850
1892
  'type': 'object',
@@ -1871,6 +1913,7 @@ def get_config_schema():
1871
1913
  'rbac': rbac_schema,
1872
1914
  'logs': logs_schema,
1873
1915
  'daemons': daemon_schema,
1916
+ 'data': data_schema,
1874
1917
  **cloud_configs,
1875
1918
  },
1876
1919
  }
sky/utils/status_lib.py CHANGED
@@ -27,6 +27,12 @@ class ClusterStatus(enum.Enum):
27
27
 
28
28
  STOPPED = 'STOPPED'
29
29
  """The cluster is stopped."""
30
+ PENDING = 'PENDING'
31
+ """The cluster is pending scheduling.
32
+
33
+ NOTE: This state is for display only and should not be used in state
34
+ machine logic without necessary considerations.
35
+ """
30
36
 
31
37
  def colored_str(self):
32
38
  color = _STATUS_TO_COLOR[self]
@@ -37,6 +43,7 @@ _STATUS_TO_COLOR = {
37
43
  ClusterStatus.INIT: colorama.Fore.BLUE,
38
44
  ClusterStatus.UP: colorama.Fore.GREEN,
39
45
  ClusterStatus.STOPPED: colorama.Fore.YELLOW,
46
+ ClusterStatus.PENDING: colorama.Fore.CYAN,
40
47
  }
41
48
 
42
49
 
@@ -7,6 +7,7 @@ import resource
7
7
  import shlex
8
8
  import subprocess
9
9
  import sys
10
+ import termios
10
11
  import threading
11
12
  import time
12
13
  import typing
@@ -450,3 +451,19 @@ def slow_start_processes(processes: List[Startable],
450
451
  break
451
452
  batch_size = min(batch_size * 2, max_batch_size)
452
453
  time.sleep(delay)
454
+
455
+
456
+ def is_echo_disabled(fd: int) -> bool:
457
+ """Check if terminal ECHO is disabled on the given fd.
458
+
459
+ When a subprocess wants password/sensitive input, it disables ECHO.
460
+ This is how pexpect's waitnoecho() works. See:
461
+ https://pexpect.readthedocs.io/en/stable/api/pexpect.html#pexpect.spawn.waitnoecho
462
+ """
463
+ assert os.isatty(fd), 'fd is not connected to a terminal'
464
+ try:
465
+ attr = termios.tcgetattr(fd)
466
+ echo_on = bool(attr[3] & termios.ECHO)
467
+ return not echo_on
468
+ except (termios.error, OSError):
469
+ return False
sky/volumes/client/sdk.py CHANGED
@@ -1,4 +1,4 @@
1
- """SDK functions for managed jobs."""
1
+ """SDK functions for volumes."""
2
2
  import json
3
3
  import typing
4
4
  from typing import List
@@ -135,16 +135,19 @@ def ls() -> server_common.RequestId[List[responses.VolumeRecord]]:
135
135
  @usage_lib.entrypoint
136
136
  @server_common.check_server_healthy_or_start
137
137
  @annotations.client_api
138
- def delete(names: List[str]) -> server_common.RequestId[None]:
138
+ def delete(names: List[str],
139
+ purge: bool = False) -> server_common.RequestId[None]:
139
140
  """Deletes volumes.
140
141
 
141
142
  Args:
142
143
  names: List of volume names to delete.
144
+ purge: If True, delete the volume from the database even if the
145
+ deletion API fails.
143
146
 
144
147
  Returns:
145
148
  The request ID of the delete request.
146
149
  """
147
- body = payloads.VolumeDeleteBody(names=names)
150
+ body = payloads.VolumeDeleteBody(names=names, purge=purge)
148
151
  response = server_common.make_authenticated_request(
149
152
  'POST', '/volumes/delete', json=json.loads(body.model_dump_json()))
150
153
  return server_common.get_request_id(response)