skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -6,7 +6,7 @@ determine the return type based on the value of require_outputs.
6
6
  """
7
7
  import enum
8
8
  import typing
9
- from typing import Any, Iterable, List, Optional, Tuple, Union
9
+ from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
10
10
 
11
11
  from typing_extensions import Literal
12
12
 
@@ -27,6 +27,7 @@ def ssh_options_list(
27
27
  ssh_control_name: Optional[str],
28
28
  *,
29
29
  ssh_proxy_command: Optional[str] = ...,
30
+ ssh_proxy_jump: Optional[str] = ...,
30
31
  docker_ssh_proxy_command: Optional[str] = ...,
31
32
  timeout: int = ...,
32
33
  port: int = ...,
@@ -63,6 +64,7 @@ class CommandRunner:
63
64
  connect_timeout: Optional[int] = ...,
64
65
  source_bashrc: bool = ...,
65
66
  skip_lines: int = ...,
67
+ run_in_background: bool = ...,
66
68
  **kwargs) -> int:
67
69
  ...
68
70
 
@@ -78,6 +80,7 @@ class CommandRunner:
78
80
  connect_timeout: Optional[int] = ...,
79
81
  source_bashrc: bool = ...,
80
82
  skip_lines: int = ...,
83
+ run_in_background: bool = ...,
81
84
  **kwargs) -> Tuple[int, str, str]:
82
85
  ...
83
86
 
@@ -93,6 +96,7 @@ class CommandRunner:
93
96
  connect_timeout: Optional[int] = ...,
94
97
  source_bashrc: bool = ...,
95
98
  skip_lines: int = ...,
99
+ run_in_background: bool = ...,
96
100
  **kwargs) -> Union[Tuple[int, str, str], int]:
97
101
  ...
98
102
 
@@ -130,22 +134,25 @@ class SSHCommandRunner(CommandRunner):
130
134
  ip: str
131
135
  port: int
132
136
  ssh_user: str
133
- ssh_private_key: str
137
+ ssh_private_key: Optional[str]
134
138
  ssh_control_name: Optional[str]
135
139
  docker_user: str
136
140
  disable_control_master: Optional[bool]
137
141
  port_forward_execute_remote_command: Optional[bool]
142
+ enable_interactive_auth: bool
138
143
 
139
144
  def __init__(
140
145
  self,
141
146
  node: Tuple[str, int],
142
147
  ssh_user: str,
143
- ssh_private_key: str,
148
+ ssh_private_key: Optional[str],
144
149
  ssh_control_name: Optional[str] = ...,
145
150
  ssh_proxy_command: Optional[str] = ...,
151
+ ssh_proxy_jump: Optional[str] = ...,
146
152
  docker_user: Optional[str] = ...,
147
153
  disable_control_master: Optional[bool] = ...,
148
154
  port_forward_execute_remote_command: Optional[bool] = ...,
155
+ enable_interactive_auth: bool = ...,
149
156
  ) -> None:
150
157
  ...
151
158
 
@@ -163,6 +170,7 @@ class SSHCommandRunner(CommandRunner):
163
170
  connect_timeout: Optional[int] = ...,
164
171
  source_bashrc: bool = ...,
165
172
  skip_lines: int = ...,
173
+ run_in_background: bool = ...,
166
174
  **kwargs) -> int:
167
175
  ...
168
176
 
@@ -180,6 +188,7 @@ class SSHCommandRunner(CommandRunner):
180
188
  connect_timeout: Optional[int] = ...,
181
189
  source_bashrc: bool = ...,
182
190
  skip_lines: int = ...,
191
+ run_in_background: bool = ...,
183
192
  **kwargs) -> Tuple[int, str, str]:
184
193
  ...
185
194
 
@@ -197,6 +206,7 @@ class SSHCommandRunner(CommandRunner):
197
206
  connect_timeout: Optional[int] = ...,
198
207
  source_bashrc: bool = ...,
199
208
  skip_lines: int = ...,
209
+ run_in_background: bool = ...,
200
210
  **kwargs) -> Union[Tuple[int, str, str], int]:
201
211
  ...
202
212
 
@@ -216,7 +226,8 @@ class SSHCommandRunner(CommandRunner):
216
226
  up: bool,
217
227
  log_path: str = ...,
218
228
  stream_logs: bool = ...,
219
- max_retry: int = ...) -> None:
229
+ max_retry: int = ...,
230
+ get_remote_home_dir: Callable[[], str] = ...) -> None:
220
231
  ...
221
232
 
222
233
  def port_forward_command(
@@ -251,6 +262,7 @@ class KubernetesCommandRunner(CommandRunner):
251
262
  connect_timeout: Optional[int] = ...,
252
263
  source_bashrc: bool = ...,
253
264
  skip_lines: int = ...,
265
+ run_in_background: bool = ...,
254
266
  **kwargs) -> int:
255
267
  ...
256
268
 
@@ -268,6 +280,7 @@ class KubernetesCommandRunner(CommandRunner):
268
280
  connect_timeout: Optional[int] = ...,
269
281
  source_bashrc: bool = ...,
270
282
  skip_lines: int = ...,
283
+ run_in_background: bool = ...,
271
284
  **kwargs) -> Tuple[int, str, str]:
272
285
  ...
273
286
 
@@ -285,6 +298,7 @@ class KubernetesCommandRunner(CommandRunner):
285
298
  connect_timeout: Optional[int] = ...,
286
299
  source_bashrc: bool = ...,
287
300
  skip_lines: int = ...,
301
+ run_in_background: bool = ...,
288
302
  **kwargs) -> Union[Tuple[int, str, str], int]:
289
303
  ...
290
304
 
@@ -306,6 +320,28 @@ class KubernetesCommandRunner(CommandRunner):
306
320
  ...
307
321
 
308
322
 
323
+ class SlurmCommandRunner(SSHCommandRunner):
324
+ """Runner for Slurm commands."""
325
+ sky_dir: str
326
+ skypilot_runtime_dir: str
327
+ job_id: str
328
+ slurm_node: str
329
+
330
+ def __init__(
331
+ self,
332
+ node: Tuple[str, int],
333
+ ssh_user: str,
334
+ ssh_private_key: Optional[str],
335
+ *,
336
+ sky_dir: str,
337
+ skypilot_runtime_dir: str,
338
+ job_id: str,
339
+ slurm_node: str,
340
+ **kwargs,
341
+ ) -> None:
342
+ ...
343
+
344
+
309
345
  class LocalProcessCommandRunner(CommandRunner):
310
346
 
311
347
  def __init__(self) -> None:
@@ -325,6 +361,7 @@ class LocalProcessCommandRunner(CommandRunner):
325
361
  connect_timeout: Optional[int] = ...,
326
362
  source_bashrc: bool = ...,
327
363
  skip_lines: int = ...,
364
+ run_in_background: bool = ...,
328
365
  **kwargs) -> int:
329
366
  ...
330
367
 
@@ -342,6 +379,7 @@ class LocalProcessCommandRunner(CommandRunner):
342
379
  connect_timeout: Optional[int] = ...,
343
380
  source_bashrc: bool = ...,
344
381
  skip_lines: int = ...,
382
+ run_in_background: bool = ...,
345
383
  **kwargs) -> Tuple[int, str, str]:
346
384
  ...
347
385
 
@@ -359,5 +397,6 @@ class LocalProcessCommandRunner(CommandRunner):
359
397
  connect_timeout: Optional[int] = ...,
360
398
  source_bashrc: bool = ...,
361
399
  skip_lines: int = ...,
400
+ run_in_background: bool = ...,
362
401
  **kwargs) -> Union[Tuple[int, str, str], int]:
363
402
  ...
sky/utils/common_utils.py CHANGED
@@ -29,6 +29,7 @@ from sky.adaptors import common as adaptors_common
29
29
  from sky.skylet import constants
30
30
  from sky.usage import constants as usage_constants
31
31
  from sky.utils import annotations
32
+ from sky.utils import context
32
33
  from sky.utils import ux_utils
33
34
  from sky.utils import validator
34
35
 
@@ -293,11 +294,11 @@ class Backoff:
293
294
  return self._backoff
294
295
 
295
296
 
296
- _current_command: Optional[str] = None
297
- _current_client_entrypoint: Optional[str] = None
298
- _using_remote_api_server: Optional[bool] = None
299
- _current_user: Optional['models.User'] = None
300
- _current_request_id: Optional[str] = None
297
+ _CLIENT_COMMAND_KEY = 'client_command'
298
+ _CLIENT_ENTRYPOINT_KEY = 'client_entrypoint'
299
+ _USING_REMOTE_API_SERVER_KEY = 'using_remote_api_server'
300
+ _USER_KEY = 'user'
301
+ _REQUEST_ID_KEY = 'request_id'
301
302
 
302
303
 
303
304
  def set_request_context(client_entrypoint: Optional[str],
@@ -309,22 +310,21 @@ def set_request_context(client_entrypoint: Optional[str],
309
310
  This is useful when we are on the SkyPilot API server side and we have a
310
311
  client entrypoint and command from the client.
311
312
  """
312
- global _current_command
313
- global _current_client_entrypoint
314
- global _using_remote_api_server
315
- global _current_user
316
- global _current_request_id
317
- _current_command = client_command
318
- _current_client_entrypoint = client_entrypoint
319
- _using_remote_api_server = using_remote_api_server
320
- _current_user = user
321
- _current_request_id = request_id
313
+ # This function will be called in process executor and coroutine executor.
314
+ # context.set_context_var ensures the context is safe in both cases.
315
+ context.set_context_var(_CLIENT_ENTRYPOINT_KEY, client_entrypoint)
316
+ context.set_context_var(_CLIENT_COMMAND_KEY, client_command)
317
+ context.set_context_var(_USING_REMOTE_API_SERVER_KEY,
318
+ using_remote_api_server)
319
+ context.set_context_var(_USER_KEY, user)
320
+ context.set_context_var(_REQUEST_ID_KEY, request_id)
322
321
 
323
322
 
324
323
  def get_current_request_id() -> str:
325
324
  """Returns the current request id."""
326
- if _current_request_id is not None:
327
- return _current_request_id
325
+ value = context.get_context_var('request_id')
326
+ if value is not None:
327
+ return value
328
328
  return 'dummy-request-id'
329
329
 
330
330
 
@@ -334,30 +334,43 @@ def get_current_command() -> str:
334
334
  Normally uses get_pretty_entry_point(), but will use the client command on
335
335
  the server side.
336
336
  """
337
- if _current_command is not None:
338
- return _current_command
339
-
337
+ value = context.get_context_var(_CLIENT_COMMAND_KEY)
338
+ if value is not None:
339
+ return value
340
340
  return get_pretty_entrypoint_cmd()
341
341
 
342
342
 
343
343
  def get_current_user() -> 'models.User':
344
- """Returns the current user."""
345
- if _current_user is not None:
346
- return _current_user
344
+ """Returns the user in current server session."""
345
+ value = context.get_context_var(_USER_KEY)
346
+ if value is not None:
347
+ return value
347
348
  return models.User.get_current_user()
348
349
 
349
350
 
350
351
  def get_current_user_name() -> str:
351
- """Returns the current user name."""
352
+ """Returns the user name in current server session."""
352
353
  name = get_current_user().name
353
354
  assert name is not None
354
355
  return name
355
356
 
356
357
 
358
+ def get_local_user_name() -> str:
359
+ """Returns the user name in local environment.
360
+
361
+ This is for backward compatibility where anonymous access is implicitly
362
+ allowed when no authentication method at server-side is configured and
363
+ the username from client environment variable will be used to identify the
364
+ user.
365
+ """
366
+ name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
367
+ assert name is not None
368
+ return name
369
+
370
+
357
371
  def set_current_user(user: 'models.User'):
358
372
  """Sets the current user."""
359
- global _current_user
360
- _current_user = user
373
+ context.set_context_var('user', user)
361
374
 
362
375
 
363
376
  def get_current_client_entrypoint(server_entrypoint: str) -> str:
@@ -366,8 +379,9 @@ def get_current_client_entrypoint(server_entrypoint: str) -> str:
366
379
  Gets the client entrypoint from the context, if it is not set, returns the
367
380
  server entrypoint.
368
381
  """
369
- if _current_client_entrypoint is not None:
370
- return _current_client_entrypoint
382
+ value = context.get_context_var(_CLIENT_ENTRYPOINT_KEY)
383
+ if value is not None:
384
+ return value
371
385
  return server_entrypoint
372
386
 
373
387
 
@@ -376,8 +390,9 @@ def get_using_remote_api_server() -> bool:
376
390
  if os.getenv(constants.USING_REMOTE_API_SERVER_ENV_VAR) is not None:
377
391
  return os.getenv(constants.USING_REMOTE_API_SERVER_ENV_VAR,
378
392
  '').lower() in ('true', '1')
379
- if _using_remote_api_server is not None:
380
- return _using_remote_api_server
393
+ value = context.get_context_var(_USING_REMOTE_API_SERVER_KEY)
394
+ if value is not None:
395
+ return value
381
396
  # This gets the right status for the local client.
382
397
  # TODO(zhwu): This is to prevent circular import. We should refactor this.
383
398
  # pylint: disable=import-outside-toplevel
@@ -724,7 +739,8 @@ def find_free_port(start_port: int) -> int:
724
739
  try:
725
740
  s.bind(('', port))
726
741
  return port
727
- except OSError:
742
+ except OSError as e:
743
+ logger.debug(f'Error binding port {port}: {e}')
728
744
  pass
729
745
  raise OSError('No free ports available.')
730
746
 
sky/utils/context.py CHANGED
@@ -17,6 +17,8 @@ from typing_extensions import ParamSpec
17
17
  if TYPE_CHECKING:
18
18
  from sky.skypilot_config import ConfigContext
19
19
 
20
+ _PROCESS_GLOBAL_VARS = {}
21
+
20
22
 
21
23
  class SkyPilotContext(object):
22
24
  """SkyPilot typed context vars for threads and coroutines.
@@ -65,6 +67,8 @@ class SkyPilotContext(object):
65
67
  self._log_file_handle = None
66
68
  self.env_overrides = {}
67
69
  self.config_context = None
70
+ self.request_context = None
71
+ self.vars = {}
68
72
 
69
73
  def cancel(self):
70
74
  """Cancel the context."""
@@ -113,6 +117,12 @@ class SkyPilotContext(object):
113
117
  self._log_file_handle.close()
114
118
  self._log_file_handle = None
115
119
 
120
+ def set_var(self, key: str, value: Any):
121
+ self.vars[key] = value
122
+
123
+ def get_var(self, key: str) -> Optional[Any]:
124
+ return self.vars.get(key)
125
+
116
126
  def __enter__(self):
117
127
  return self
118
128
 
@@ -150,6 +160,28 @@ def get() -> Optional[SkyPilotContext]:
150
160
  return _CONTEXT.get()
151
161
 
152
162
 
163
+ def set_context_var(key: str, value: Any):
164
+ ctx = get()
165
+ if ctx is not None:
166
+ # Set the var in context
167
+ ctx.set_var(key, value)
168
+ else:
169
+ # Fallback to process-isolated assumption, where we thought
170
+ # modifying process-scope vars is safe.
171
+ _PROCESS_GLOBAL_VARS[key] = value
172
+
173
+
174
+ def get_context_var(key: str) -> Any:
175
+ ctx = get()
176
+ if ctx is not None:
177
+ # Use `in` to check for key existence to distinguish
178
+ # "key not found" from "key's value is None".
179
+ if key in ctx.vars:
180
+ return ctx.get_var(key)
181
+ # Fallback to the variable set in process-scope
182
+ return _PROCESS_GLOBAL_VARS.get(key)
183
+
184
+
153
185
  class ContextualEnviron(MutableMapping[str, str]):
154
186
  """Environment variables wrapper with contextual overrides.
155
187
 
sky/utils/db/db_utils.py CHANGED
@@ -75,6 +75,18 @@ def safe_cursor(db_path: str):
75
75
  conn.close()
76
76
 
77
77
 
78
+ @contextlib.contextmanager
79
+ def safe_cursor_on_connection(conn: 'sqlite3.Connection'):
80
+ """A auto-committing, auto-closing cursor on an existing connection."""
81
+ # Ensure commit() is called when the context is exited.
82
+ with conn:
83
+ cursor = conn.cursor()
84
+ try:
85
+ yield cursor
86
+ finally:
87
+ cursor.close()
88
+
89
+
78
90
  def add_column_to_table(
79
91
  cursor: 'sqlite3.Cursor',
80
92
  conn: 'sqlite3.Connection',
@@ -286,6 +298,11 @@ def drop_column_from_table_alembic(
286
298
  raise
287
299
 
288
300
 
301
+ def fault_point():
302
+ """For test fault injection."""
303
+ pass
304
+
305
+
289
306
  class SQLiteConn(threading.local):
290
307
  """Thread-local connection to the sqlite3 database."""
291
308
 
@@ -345,8 +362,8 @@ class SQLiteConn(threading.local):
345
362
 
346
363
  def exec_and_commit(sql: str, parameters: Optional[Iterable[Any]]):
347
364
  # pylint: disable=protected-access
348
- conn._conn.execute(sql, parameters)
349
- conn._conn.commit()
365
+ with safe_cursor_on_connection(conn._conn) as cursor:
366
+ cursor.execute(sql, parameters)
350
367
 
351
368
  # pylint: disable=protected-access
352
369
  await conn._execute(exec_and_commit, sql, parameters)
@@ -357,7 +374,20 @@ class SQLiteConn(threading.local):
357
374
  parameters: Optional[Iterable[Any]] = None
358
375
  ) -> Iterable[sqlite3.Row]:
359
376
  conn = await self._get_async_conn()
360
- return await conn.execute_fetchall(sql, parameters)
377
+ if parameters is None:
378
+ parameters = []
379
+
380
+ def exec_fetch_all(sql: str, parameters: Optional[Iterable[Any]]):
381
+ # pylint: disable=protected-access
382
+ with safe_cursor_on_connection(conn._conn) as cursor:
383
+ cursor.execute(sql, parameters)
384
+ # Note(dev): sqlite3.Connection cannot be patched, keep
385
+ # fault_point here to test the integrity of exec_fetch_all()
386
+ fault_point()
387
+ return cursor.fetchall()
388
+
389
+ # pylint: disable=protected-access
390
+ return await conn._execute(exec_fetch_all, sql, parameters)
361
391
 
362
392
  async def execute_get_returning_value_async(
363
393
  self,
@@ -372,9 +402,9 @@ class SQLiteConn(threading.local):
372
402
  def exec_and_get_returning_value(sql: str,
373
403
  parameters: Optional[Iterable[Any]]):
374
404
  # pylint: disable=protected-access
375
- row = conn._conn.execute(sql, parameters).fetchone()
376
- conn._conn.commit()
377
- return row
405
+ with safe_cursor_on_connection(conn._conn) as cursor:
406
+ cursor.execute(sql, parameters)
407
+ return cursor.fetchone()
378
408
 
379
409
  # pylint: disable=protected-access
380
410
  return await conn._execute(exec_and_get_returning_value, sql,
@@ -3,6 +3,7 @@
3
3
  import contextlib
4
4
  import logging
5
5
  import os
6
+ from typing import Optional
6
7
 
7
8
  from alembic import command as alembic_command
8
9
  from alembic.config import Config
@@ -22,7 +23,7 @@ GLOBAL_USER_STATE_VERSION = '011'
22
23
  GLOBAL_USER_STATE_LOCK_PATH = f'~/.sky/locks/.{GLOBAL_USER_STATE_DB_NAME}.lock'
23
24
 
24
25
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'
25
- SPOT_JOBS_VERSION = '007'
26
+ SPOT_JOBS_VERSION = '011'
26
27
  SPOT_JOBS_LOCK_PATH = f'~/.sky/locks/.{SPOT_JOBS_DB_NAME}.lock'
27
28
 
28
29
  SERVE_DB_NAME = 'serve_db'
@@ -52,12 +53,22 @@ def db_lock(db_name: str):
52
53
  f'file if you believe it is stale.') from e
53
54
 
54
55
 
55
- def get_alembic_config(engine: sqlalchemy.engine.Engine, section: str):
56
- """Get Alembic configuration for the given section"""
57
- # From sky/utils/db/migration_utils.py -> sky/setup_files/alembic.ini
58
- alembic_ini_path = os.path.join(
59
- os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
60
- 'setup_files', 'alembic.ini')
56
+ def get_alembic_config(engine: sqlalchemy.engine.Engine,
57
+ section: str,
58
+ alembic_ini_path: Optional[str] = None):
59
+ """Get Alembic configuration for the given section.
60
+
61
+ Args:
62
+ engine: SQLAlchemy engine for the database.
63
+ section: Alembic section name (e.g., 'state_db' or 'spot_jobs_db').
64
+ alembic_ini_path: Optional path to a custom alembic.ini file.
65
+ If not provided, uses the default SkyPilot alembic.ini.
66
+ """
67
+ if alembic_ini_path is None:
68
+ # Default to SkyPilot's alembic.ini
69
+ alembic_ini_path = os.path.join(
70
+ os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
71
+ 'setup_files', 'alembic.ini')
61
72
  alembic_cfg = Config(alembic_ini_path, ini_section=section)
62
73
 
63
74
  # Override the database URL to match SkyPilot's current connection
@@ -73,19 +84,23 @@ def get_alembic_config(engine: sqlalchemy.engine.Engine, section: str):
73
84
  return alembic_cfg
74
85
 
75
86
 
76
- def needs_upgrade(engine: sqlalchemy.engine.Engine, section: str,
77
- target_revision: str):
87
+ def needs_upgrade(engine: sqlalchemy.engine.Engine,
88
+ section: str,
89
+ target_revision: str,
90
+ alembic_ini_path: Optional[str] = None):
78
91
  """Check if the database needs to be upgraded.
79
92
 
80
93
  Args:
81
- engine: SQLAlchemy engine for the database
82
- section: Alembic section to upgrade (e.g., 'state_db' or 'spot_jobs_db')
83
- target_revision: Target revision to upgrade to (e.g., '001')
94
+ engine: SQLAlchemy engine for the database.
95
+ section: Alembic section to upgrade (e.g., 'state_db' or
96
+ 'spot_jobs_db').
97
+ target_revision: Target revision to upgrade to (e.g., '001').
98
+ alembic_ini_path: Optional path to a custom alembic.ini file.
84
99
  """
85
100
  current_rev = None
86
101
 
87
102
  # get alembic config for the given section
88
- alembic_config = get_alembic_config(engine, section)
103
+ alembic_config = get_alembic_config(engine, section, alembic_ini_path)
89
104
  version_table = alembic_config.get_section_option(
90
105
  alembic_config.config_ini_section, 'version_table', 'alembic_version')
91
106
 
@@ -112,26 +127,31 @@ def needs_upgrade(engine: sqlalchemy.engine.Engine, section: str,
112
127
  return current_rev_num < target_rev_num
113
128
 
114
129
 
115
- def safe_alembic_upgrade(engine: sqlalchemy.engine.Engine, section: str,
116
- target_revision: str):
130
+ def safe_alembic_upgrade(engine: sqlalchemy.engine.Engine,
131
+ section: str,
132
+ target_revision: str,
133
+ alembic_ini_path: Optional[str] = None):
117
134
  """Upgrade the database if needed. Uses a file lock to ensure
118
135
  that only one process tries to upgrade the database at a time.
119
136
 
120
137
  Args:
121
- engine: SQLAlchemy engine for the database
122
- section: Alembic section to upgrade (e.g., 'state_db' or 'spot_jobs_db')
123
- target_revision: Target revision to upgrade to (e.g., '001')
138
+ engine: SQLAlchemy engine for the database.
139
+ section: Alembic section to upgrade (e.g., 'state_db' or
140
+ 'spot_jobs_db').
141
+ target_revision: Target revision to upgrade to (e.g., '001').
142
+ alembic_ini_path: Optional path to a custom alembic.ini file.
124
143
  """
125
144
  # set alembic logger to warning level
126
145
  alembic_logger = logging.getLogger('alembic')
127
146
  alembic_logger.setLevel(logging.WARNING)
128
147
 
129
- alembic_config = get_alembic_config(engine, section)
148
+ alembic_config = get_alembic_config(engine, section, alembic_ini_path)
130
149
 
131
150
  # only acquire lock if db needs upgrade
132
- if needs_upgrade(engine, section, target_revision):
151
+ if needs_upgrade(engine, section, target_revision, alembic_ini_path):
133
152
  with db_lock(section):
134
153
  # check again if db needs upgrade in case another
135
154
  # process upgraded it while we were waiting for the lock
136
- if needs_upgrade(engine, section, target_revision):
155
+ if needs_upgrade(engine, section, target_revision,
156
+ alembic_ini_path):
137
157
  alembic_command.upgrade(alembic_config, target_revision)
sky/utils/infra_utils.py CHANGED
@@ -173,7 +173,11 @@ class InfraInfo:
173
173
  return '-'
174
174
 
175
175
  region_or_zone = None
176
- if self.zone is not None and self.zone != '*':
176
+ # For Slurm, zones = partitions. We want to show the cluster
177
+ # name (region) instead of the partition name (zone), as different
178
+ # Slurm clusters can easily have same partition name.
179
+ is_slurm = self.cloud.lower() == 'slurm'
180
+ if not is_slurm and self.zone is not None and self.zone != '*':
177
181
  region_or_zone = self.zone
178
182
  elif self.region is not None and self.region != '*':
179
183
  # If using region, we remove the ssh- prefix if it exists for SSH