skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/utils/command_runner.pyi CHANGED
@@ -27,6 +27,7 @@ def ssh_options_list(
27
27
  ssh_control_name: Optional[str],
28
28
  *,
29
29
  ssh_proxy_command: Optional[str] = ...,
30
+ ssh_proxy_jump: Optional[str] = ...,
30
31
  docker_ssh_proxy_command: Optional[str] = ...,
31
32
  timeout: int = ...,
32
33
  port: int = ...,
@@ -63,6 +64,7 @@ class CommandRunner:
63
64
  connect_timeout: Optional[int] = ...,
64
65
  source_bashrc: bool = ...,
65
66
  skip_lines: int = ...,
67
+ run_in_background: bool = ...,
66
68
  **kwargs) -> int:
67
69
  ...
68
70
 
@@ -78,6 +80,7 @@ class CommandRunner:
78
80
  connect_timeout: Optional[int] = ...,
79
81
  source_bashrc: bool = ...,
80
82
  skip_lines: int = ...,
83
+ run_in_background: bool = ...,
81
84
  **kwargs) -> Tuple[int, str, str]:
82
85
  ...
83
86
 
@@ -93,6 +96,7 @@ class CommandRunner:
93
96
  connect_timeout: Optional[int] = ...,
94
97
  source_bashrc: bool = ...,
95
98
  skip_lines: int = ...,
99
+ run_in_background: bool = ...,
96
100
  **kwargs) -> Union[Tuple[int, str, str], int]:
97
101
  ...
98
102
 
@@ -135,6 +139,7 @@ class SSHCommandRunner(CommandRunner):
135
139
  docker_user: str
136
140
  disable_control_master: Optional[bool]
137
141
  port_forward_execute_remote_command: Optional[bool]
142
+ enable_interactive_auth: bool
138
143
 
139
144
  def __init__(
140
145
  self,
@@ -143,9 +148,11 @@ class SSHCommandRunner(CommandRunner):
143
148
  ssh_private_key: Optional[str],
144
149
  ssh_control_name: Optional[str] = ...,
145
150
  ssh_proxy_command: Optional[str] = ...,
151
+ ssh_proxy_jump: Optional[str] = ...,
146
152
  docker_user: Optional[str] = ...,
147
153
  disable_control_master: Optional[bool] = ...,
148
154
  port_forward_execute_remote_command: Optional[bool] = ...,
155
+ enable_interactive_auth: bool = ...,
149
156
  ) -> None:
150
157
  ...
151
158
 
@@ -163,6 +170,7 @@ class SSHCommandRunner(CommandRunner):
163
170
  connect_timeout: Optional[int] = ...,
164
171
  source_bashrc: bool = ...,
165
172
  skip_lines: int = ...,
173
+ run_in_background: bool = ...,
166
174
  **kwargs) -> int:
167
175
  ...
168
176
 
@@ -180,6 +188,7 @@ class SSHCommandRunner(CommandRunner):
180
188
  connect_timeout: Optional[int] = ...,
181
189
  source_bashrc: bool = ...,
182
190
  skip_lines: int = ...,
191
+ run_in_background: bool = ...,
183
192
  **kwargs) -> Tuple[int, str, str]:
184
193
  ...
185
194
 
@@ -197,6 +206,7 @@ class SSHCommandRunner(CommandRunner):
197
206
  connect_timeout: Optional[int] = ...,
198
207
  source_bashrc: bool = ...,
199
208
  skip_lines: int = ...,
209
+ run_in_background: bool = ...,
200
210
  **kwargs) -> Union[Tuple[int, str, str], int]:
201
211
  ...
202
212
 
@@ -252,6 +262,7 @@ class KubernetesCommandRunner(CommandRunner):
252
262
  connect_timeout: Optional[int] = ...,
253
263
  source_bashrc: bool = ...,
254
264
  skip_lines: int = ...,
265
+ run_in_background: bool = ...,
255
266
  **kwargs) -> int:
256
267
  ...
257
268
 
@@ -269,6 +280,7 @@ class KubernetesCommandRunner(CommandRunner):
269
280
  connect_timeout: Optional[int] = ...,
270
281
  source_bashrc: bool = ...,
271
282
  skip_lines: int = ...,
283
+ run_in_background: bool = ...,
272
284
  **kwargs) -> Tuple[int, str, str]:
273
285
  ...
274
286
 
@@ -286,6 +298,7 @@ class KubernetesCommandRunner(CommandRunner):
286
298
  connect_timeout: Optional[int] = ...,
287
299
  source_bashrc: bool = ...,
288
300
  skip_lines: int = ...,
301
+ run_in_background: bool = ...,
289
302
  **kwargs) -> Union[Tuple[int, str, str], int]:
290
303
  ...
291
304
 
@@ -348,6 +361,7 @@ class LocalProcessCommandRunner(CommandRunner):
348
361
  connect_timeout: Optional[int] = ...,
349
362
  source_bashrc: bool = ...,
350
363
  skip_lines: int = ...,
364
+ run_in_background: bool = ...,
351
365
  **kwargs) -> int:
352
366
  ...
353
367
 
@@ -365,6 +379,7 @@ class LocalProcessCommandRunner(CommandRunner):
365
379
  connect_timeout: Optional[int] = ...,
366
380
  source_bashrc: bool = ...,
367
381
  skip_lines: int = ...,
382
+ run_in_background: bool = ...,
368
383
  **kwargs) -> Tuple[int, str, str]:
369
384
  ...
370
385
 
@@ -382,5 +397,6 @@ class LocalProcessCommandRunner(CommandRunner):
382
397
  connect_timeout: Optional[int] = ...,
383
398
  source_bashrc: bool = ...,
384
399
  skip_lines: int = ...,
400
+ run_in_background: bool = ...,
385
401
  **kwargs) -> Union[Tuple[int, str, str], int]:
386
402
  ...
sky/utils/common_utils.py CHANGED
@@ -29,6 +29,7 @@ from sky.adaptors import common as adaptors_common
29
29
  from sky.skylet import constants
30
30
  from sky.usage import constants as usage_constants
31
31
  from sky.utils import annotations
32
+ from sky.utils import context
32
33
  from sky.utils import ux_utils
33
34
  from sky.utils import validator
34
35
 
@@ -293,14 +294,13 @@ class Backoff:
293
294
  return self._backoff
294
295
 
295
296
 
296
- _current_command: Optional[str] = None
297
- _current_client_entrypoint: Optional[str] = None
298
- _using_remote_api_server: Optional[bool] = None
299
- _current_user: Optional['models.User'] = None
300
- _current_request_id: Optional[str] = None
297
+ _CLIENT_COMMAND_KEY = 'client_command'
298
+ _CLIENT_ENTRYPOINT_KEY = 'client_entrypoint'
299
+ _USING_REMOTE_API_SERVER_KEY = 'using_remote_api_server'
300
+ _USER_KEY = 'user'
301
+ _REQUEST_ID_KEY = 'request_id'
301
302
 
302
303
 
303
- # TODO(aylei,hailong): request context should be contextual
304
304
  def set_request_context(client_entrypoint: Optional[str],
305
305
  client_command: Optional[str],
306
306
  using_remote_api_server: bool,
@@ -310,22 +310,21 @@ def set_request_context(client_entrypoint: Optional[str],
310
310
  This is useful when we are on the SkyPilot API server side and we have a
311
311
  client entrypoint and command from the client.
312
312
  """
313
- global _current_command
314
- global _current_client_entrypoint
315
- global _using_remote_api_server
316
- global _current_user
317
- global _current_request_id
318
- _current_command = client_command
319
- _current_client_entrypoint = client_entrypoint
320
- _using_remote_api_server = using_remote_api_server
321
- _current_user = user
322
- _current_request_id = request_id
313
+ # This function will be called in process executor and coroutine executor.
314
+ # context.set_context_var ensures the context is safe in both cases.
315
+ context.set_context_var(_CLIENT_ENTRYPOINT_KEY, client_entrypoint)
316
+ context.set_context_var(_CLIENT_COMMAND_KEY, client_command)
317
+ context.set_context_var(_USING_REMOTE_API_SERVER_KEY,
318
+ using_remote_api_server)
319
+ context.set_context_var(_USER_KEY, user)
320
+ context.set_context_var(_REQUEST_ID_KEY, request_id)
323
321
 
324
322
 
325
323
  def get_current_request_id() -> str:
326
324
  """Returns the current request id."""
327
- if _current_request_id is not None:
328
- return _current_request_id
325
+ value = context.get_context_var('request_id')
326
+ if value is not None:
327
+ return value
329
328
  return 'dummy-request-id'
330
329
 
331
330
 
@@ -335,16 +334,17 @@ def get_current_command() -> str:
335
334
  Normally uses get_pretty_entry_point(), but will use the client command on
336
335
  the server side.
337
336
  """
338
- if _current_command is not None:
339
- return _current_command
340
-
337
+ value = context.get_context_var(_CLIENT_COMMAND_KEY)
338
+ if value is not None:
339
+ return value
341
340
  return get_pretty_entrypoint_cmd()
342
341
 
343
342
 
344
343
  def get_current_user() -> 'models.User':
345
344
  """Returns the user in current server session."""
346
- if _current_user is not None:
347
- return _current_user
345
+ value = context.get_context_var(_USER_KEY)
346
+ if value is not None:
347
+ return value
348
348
  return models.User.get_current_user()
349
349
 
350
350
 
@@ -370,8 +370,7 @@ def get_local_user_name() -> str:
370
370
 
371
371
  def set_current_user(user: 'models.User'):
372
372
  """Sets the current user."""
373
- global _current_user
374
- _current_user = user
373
+ context.set_context_var('user', user)
375
374
 
376
375
 
377
376
  def get_current_client_entrypoint(server_entrypoint: str) -> str:
@@ -380,8 +379,9 @@ def get_current_client_entrypoint(server_entrypoint: str) -> str:
380
379
  Gets the client entrypoint from the context, if it is not set, returns the
381
380
  server entrypoint.
382
381
  """
383
- if _current_client_entrypoint is not None:
384
- return _current_client_entrypoint
382
+ value = context.get_context_var(_CLIENT_ENTRYPOINT_KEY)
383
+ if value is not None:
384
+ return value
385
385
  return server_entrypoint
386
386
 
387
387
 
@@ -390,8 +390,9 @@ def get_using_remote_api_server() -> bool:
390
390
  if os.getenv(constants.USING_REMOTE_API_SERVER_ENV_VAR) is not None:
391
391
  return os.getenv(constants.USING_REMOTE_API_SERVER_ENV_VAR,
392
392
  '').lower() in ('true', '1')
393
- if _using_remote_api_server is not None:
394
- return _using_remote_api_server
393
+ value = context.get_context_var(_USING_REMOTE_API_SERVER_KEY)
394
+ if value is not None:
395
+ return value
395
396
  # This gets the right status for the local client.
396
397
  # TODO(zhwu): This is to prevent circular import. We should refactor this.
397
398
  # pylint: disable=import-outside-toplevel
sky/utils/context.py CHANGED
@@ -17,6 +17,8 @@ from typing_extensions import ParamSpec
17
17
  if TYPE_CHECKING:
18
18
  from sky.skypilot_config import ConfigContext
19
19
 
20
+ _PROCESS_GLOBAL_VARS = {}
21
+
20
22
 
21
23
  class SkyPilotContext(object):
22
24
  """SkyPilot typed context vars for threads and coroutines.
@@ -65,6 +67,8 @@ class SkyPilotContext(object):
65
67
  self._log_file_handle = None
66
68
  self.env_overrides = {}
67
69
  self.config_context = None
70
+ self.request_context = None
71
+ self.vars = {}
68
72
 
69
73
  def cancel(self):
70
74
  """Cancel the context."""
@@ -113,6 +117,12 @@ class SkyPilotContext(object):
113
117
  self._log_file_handle.close()
114
118
  self._log_file_handle = None
115
119
 
120
+ def set_var(self, key: str, value: Any):
121
+ self.vars[key] = value
122
+
123
+ def get_var(self, key: str) -> Optional[Any]:
124
+ return self.vars.get(key)
125
+
116
126
  def __enter__(self):
117
127
  return self
118
128
 
@@ -150,6 +160,28 @@ def get() -> Optional[SkyPilotContext]:
150
160
  return _CONTEXT.get()
151
161
 
152
162
 
163
+ def set_context_var(key: str, value: Any):
164
+ ctx = get()
165
+ if ctx is not None:
166
+ # Set the var in context
167
+ ctx.set_var(key, value)
168
+ else:
169
+ # Fallback to process-isolated assumption, where we thought
170
+ # modifying process-scope vars is safe.
171
+ _PROCESS_GLOBAL_VARS[key] = value
172
+
173
+
174
+ def get_context_var(key: str) -> Any:
175
+ ctx = get()
176
+ if ctx is not None:
177
+ # Use `in` to check for key existence to distinguish
178
+ # "key not found" from "key's value is None".
179
+ if key in ctx.vars:
180
+ return ctx.get_var(key)
181
+ # Fallback to the variable set in process-scope
182
+ return _PROCESS_GLOBAL_VARS.get(key)
183
+
184
+
153
185
  class ContextualEnviron(MutableMapping[str, str]):
154
186
  """Environment variables wrapper with contextual overrides.
155
187
 
sky/utils/db/db_utils.py CHANGED
@@ -75,6 +75,18 @@ def safe_cursor(db_path: str):
75
75
  conn.close()
76
76
 
77
77
 
78
+ @contextlib.contextmanager
79
+ def safe_cursor_on_connection(conn: 'sqlite3.Connection'):
80
+ """A auto-committing, auto-closing cursor on an existing connection."""
81
+ # Ensure commit() is called when the context is exited.
82
+ with conn:
83
+ cursor = conn.cursor()
84
+ try:
85
+ yield cursor
86
+ finally:
87
+ cursor.close()
88
+
89
+
78
90
  def add_column_to_table(
79
91
  cursor: 'sqlite3.Cursor',
80
92
  conn: 'sqlite3.Connection',
@@ -286,6 +298,11 @@ def drop_column_from_table_alembic(
286
298
  raise
287
299
 
288
300
 
301
+ def fault_point():
302
+ """For test fault injection."""
303
+ pass
304
+
305
+
289
306
  class SQLiteConn(threading.local):
290
307
  """Thread-local connection to the sqlite3 database."""
291
308
 
@@ -345,8 +362,8 @@ class SQLiteConn(threading.local):
345
362
 
346
363
  def exec_and_commit(sql: str, parameters: Optional[Iterable[Any]]):
347
364
  # pylint: disable=protected-access
348
- conn._conn.execute(sql, parameters)
349
- conn._conn.commit()
365
+ with safe_cursor_on_connection(conn._conn) as cursor:
366
+ cursor.execute(sql, parameters)
350
367
 
351
368
  # pylint: disable=protected-access
352
369
  await conn._execute(exec_and_commit, sql, parameters)
@@ -357,7 +374,20 @@ class SQLiteConn(threading.local):
357
374
  parameters: Optional[Iterable[Any]] = None
358
375
  ) -> Iterable[sqlite3.Row]:
359
376
  conn = await self._get_async_conn()
360
- return await conn.execute_fetchall(sql, parameters)
377
+ if parameters is None:
378
+ parameters = []
379
+
380
+ def exec_fetch_all(sql: str, parameters: Optional[Iterable[Any]]):
381
+ # pylint: disable=protected-access
382
+ with safe_cursor_on_connection(conn._conn) as cursor:
383
+ cursor.execute(sql, parameters)
384
+ # Note(dev): sqlite3.Connection cannot be patched, keep
385
+ # fault_point here to test the integrity of exec_fetch_all()
386
+ fault_point()
387
+ return cursor.fetchall()
388
+
389
+ # pylint: disable=protected-access
390
+ return await conn._execute(exec_fetch_all, sql, parameters)
361
391
 
362
392
  async def execute_get_returning_value_async(
363
393
  self,
@@ -372,9 +402,9 @@ class SQLiteConn(threading.local):
372
402
  def exec_and_get_returning_value(sql: str,
373
403
  parameters: Optional[Iterable[Any]]):
374
404
  # pylint: disable=protected-access
375
- row = conn._conn.execute(sql, parameters).fetchone()
376
- conn._conn.commit()
377
- return row
405
+ with safe_cursor_on_connection(conn._conn) as cursor:
406
+ cursor.execute(sql, parameters)
407
+ return cursor.fetchone()
378
408
 
379
409
  # pylint: disable=protected-access
380
410
  return await conn._execute(exec_and_get_returning_value, sql,
@@ -3,6 +3,7 @@
3
3
  import contextlib
4
4
  import logging
5
5
  import os
6
+ from typing import Optional
6
7
 
7
8
  from alembic import command as alembic_command
8
9
  from alembic.config import Config
@@ -22,7 +23,7 @@ GLOBAL_USER_STATE_VERSION = '011'
22
23
  GLOBAL_USER_STATE_LOCK_PATH = f'~/.sky/locks/.{GLOBAL_USER_STATE_DB_NAME}.lock'
23
24
 
24
25
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'
25
- SPOT_JOBS_VERSION = '007'
26
+ SPOT_JOBS_VERSION = '011'
26
27
  SPOT_JOBS_LOCK_PATH = f'~/.sky/locks/.{SPOT_JOBS_DB_NAME}.lock'
27
28
 
28
29
  SERVE_DB_NAME = 'serve_db'
@@ -52,12 +53,22 @@ def db_lock(db_name: str):
52
53
  f'file if you believe it is stale.') from e
53
54
 
54
55
 
55
- def get_alembic_config(engine: sqlalchemy.engine.Engine, section: str):
56
- """Get Alembic configuration for the given section"""
57
- # From sky/utils/db/migration_utils.py -> sky/setup_files/alembic.ini
58
- alembic_ini_path = os.path.join(
59
- os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
60
- 'setup_files', 'alembic.ini')
56
+ def get_alembic_config(engine: sqlalchemy.engine.Engine,
57
+ section: str,
58
+ alembic_ini_path: Optional[str] = None):
59
+ """Get Alembic configuration for the given section.
60
+
61
+ Args:
62
+ engine: SQLAlchemy engine for the database.
63
+ section: Alembic section name (e.g., 'state_db' or 'spot_jobs_db').
64
+ alembic_ini_path: Optional path to a custom alembic.ini file.
65
+ If not provided, uses the default SkyPilot alembic.ini.
66
+ """
67
+ if alembic_ini_path is None:
68
+ # Default to SkyPilot's alembic.ini
69
+ alembic_ini_path = os.path.join(
70
+ os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
71
+ 'setup_files', 'alembic.ini')
61
72
  alembic_cfg = Config(alembic_ini_path, ini_section=section)
62
73
 
63
74
  # Override the database URL to match SkyPilot's current connection
@@ -73,19 +84,23 @@ def get_alembic_config(engine: sqlalchemy.engine.Engine, section: str):
73
84
  return alembic_cfg
74
85
 
75
86
 
76
- def needs_upgrade(engine: sqlalchemy.engine.Engine, section: str,
77
- target_revision: str):
87
+ def needs_upgrade(engine: sqlalchemy.engine.Engine,
88
+ section: str,
89
+ target_revision: str,
90
+ alembic_ini_path: Optional[str] = None):
78
91
  """Check if the database needs to be upgraded.
79
92
 
80
93
  Args:
81
- engine: SQLAlchemy engine for the database
82
- section: Alembic section to upgrade (e.g., 'state_db' or 'spot_jobs_db')
83
- target_revision: Target revision to upgrade to (e.g., '001')
94
+ engine: SQLAlchemy engine for the database.
95
+ section: Alembic section to upgrade (e.g., 'state_db' or
96
+ 'spot_jobs_db').
97
+ target_revision: Target revision to upgrade to (e.g., '001').
98
+ alembic_ini_path: Optional path to a custom alembic.ini file.
84
99
  """
85
100
  current_rev = None
86
101
 
87
102
  # get alembic config for the given section
88
- alembic_config = get_alembic_config(engine, section)
103
+ alembic_config = get_alembic_config(engine, section, alembic_ini_path)
89
104
  version_table = alembic_config.get_section_option(
90
105
  alembic_config.config_ini_section, 'version_table', 'alembic_version')
91
106
 
@@ -112,26 +127,31 @@ def needs_upgrade(engine: sqlalchemy.engine.Engine, section: str,
112
127
  return current_rev_num < target_rev_num
113
128
 
114
129
 
115
- def safe_alembic_upgrade(engine: sqlalchemy.engine.Engine, section: str,
116
- target_revision: str):
130
+ def safe_alembic_upgrade(engine: sqlalchemy.engine.Engine,
131
+ section: str,
132
+ target_revision: str,
133
+ alembic_ini_path: Optional[str] = None):
117
134
  """Upgrade the database if needed. Uses a file lock to ensure
118
135
  that only one process tries to upgrade the database at a time.
119
136
 
120
137
  Args:
121
- engine: SQLAlchemy engine for the database
122
- section: Alembic section to upgrade (e.g., 'state_db' or 'spot_jobs_db')
123
- target_revision: Target revision to upgrade to (e.g., '001')
138
+ engine: SQLAlchemy engine for the database.
139
+ section: Alembic section to upgrade (e.g., 'state_db' or
140
+ 'spot_jobs_db').
141
+ target_revision: Target revision to upgrade to (e.g., '001').
142
+ alembic_ini_path: Optional path to a custom alembic.ini file.
124
143
  """
125
144
  # set alembic logger to warning level
126
145
  alembic_logger = logging.getLogger('alembic')
127
146
  alembic_logger.setLevel(logging.WARNING)
128
147
 
129
- alembic_config = get_alembic_config(engine, section)
148
+ alembic_config = get_alembic_config(engine, section, alembic_ini_path)
130
149
 
131
150
  # only acquire lock if db needs upgrade
132
- if needs_upgrade(engine, section, target_revision):
151
+ if needs_upgrade(engine, section, target_revision, alembic_ini_path):
133
152
  with db_lock(section):
134
153
  # check again if db needs upgrade in case another
135
154
  # process upgraded it while we were waiting for the lock
136
- if needs_upgrade(engine, section, target_revision):
155
+ if needs_upgrade(engine, section, target_revision,
156
+ alembic_ini_path):
137
157
  alembic_command.upgrade(alembic_config, target_revision)
sky/utils/infra_utils.py CHANGED
@@ -173,7 +173,11 @@ class InfraInfo:
173
173
  return '-'
174
174
 
175
175
  region_or_zone = None
176
- if self.zone is not None and self.zone != '*':
176
+ # For Slurm, zones = partitions. We want to show the cluster
177
+ # name (region) instead of the partition name (zone), as different
178
+ # Slurm clusters can easily have same partition name.
179
+ is_slurm = self.cloud.lower() == 'slurm'
180
+ if not is_slurm and self.zone is not None and self.zone != '*':
177
181
  region_or_zone = self.zone
178
182
  elif self.region is not None and self.region != '*':
179
183
  # If using region, we remove the ssh- prefix if it exists for SSH
@@ -0,0 +1,139 @@
1
+ """Utility functions for generating instance links for cloud providers."""
2
+ from typing import Dict
3
+
4
+ from sky import sky_logging
5
+ from sky.provision import common
6
+ from sky.provision import constants as provision_constants
7
+
8
+ logger = sky_logging.init_logger(__name__)
9
+
10
# Console URL templates, filled in with ``str.format``.
#
# AWS_INSTANCES_URL placeholders:
#   {region}       - Cloud region (used in both the host and the query)
#   {tag_key}      - Tag key used to identify cluster instances
#   {cluster_name} - Name of the cluster (the tag value to filter on)
AWS_INSTANCES_URL = (
    'https://{region}.console.aws.amazon.com/ec2/v2/home?region={region}'
    '#Instances:tag:{tag_key}={cluster_name}')

# Azure doesn't support direct tag filter URLs, so we link to the resource
# group instead.
#
# AZURE_RESOURCE_GROUP_URL placeholders:
#   {subscription_id} - Azure subscription ID
#   {resource_group}  - Azure resource group
AZURE_RESOURCE_GROUP_URL = ('https://portal.azure.com/#@/resource'
                            '/subscriptions/{subscription_id}'
                            '/resourceGroups/{resource_group}/overview')

# GCP Console base URL. This one is not a template: the GCP project ID and
# the label filter are appended as query parameters by
# _build_gcp_instances_url().
GCP_INSTANCES_BASE_URL = 'https://console.cloud.google.com/compute/instances'
29
+
30
+
31
def _build_gcp_instances_url(project_id: str, tag_key: str,
                             cluster_name: str) -> str:
    r"""Return a GCP Console instances URL filtered by a cluster label.

    The console keeps its table filter in the ``pageState`` query parameter.
    Decoded, the filter is a one-element JSON array:

        [{"k":"","t":10,"v":"\"label_key:label_value\"","s":true}]

    Field meanings, as far as they are known:
      - k: filter key (empty for label filters)
      - t: filter type (10 = label filter)
      - v: the string "label_key:label_value", wrapped in escaped quotes
      - s: meaning unknown; always true

    Three encodings are mixed in the final URL:
      - plain percent-encoding for the outer pageState structure
        (%22 is a double quote);
      - underscore notation inside the filter object (_22 is a double
        quote, _3A a colon, _2C a comma, _5C a backslash);
      - double percent-encoding for the surrounding brackets and braces
        (%255B -> %5B -> [, %257B -> %7B -> {,
         %257D -> %7D -> }, %255D -> %5D -> ]).

    ``tag_key``, ``cluster_name`` and ``project_id`` are inserted verbatim;
    no escaping is applied to them.

    Args:
        project_id: GCP project ID, used for the ``project`` query parameter.
        tag_key: Label key to filter on.
        cluster_name: Label value to filter on.

    Returns:
        The full console URL as a string.
    """
    # Underscore-notation tokens used inside the filter object.
    dquote, colon, comma = '_22', '_3A', '_2C'
    escaped_dquote = '_5C' + dquote  # \"

    # \"tag_key:cluster_name\"
    label_match = (
        f'{escaped_dquote}{tag_key}{colon}{cluster_name}{escaped_dquote}')

    # The four members of the filter object, joined with encoded commas.
    members = (
        f'{dquote}k{dquote}{colon}{dquote}{dquote}',  # "k":""
        f'{dquote}t{dquote}{colon}10',  # "t":10
        f'{dquote}v{dquote}{colon}{dquote}{label_match}{dquote}',  # "v":"..."
        f'{dquote}s{dquote}{colon}true',  # "s":true
    )
    encoded_object = comma.join(members)

    # [{ ... }] with double percent-encoded brackets and braces.
    encoded_array = f'%255B%257B{encoded_object}%257D%255D'

    # ("instances":("p":0,"f":"<filter>")), with %22 standing in for ".
    page_state = (
        f'(%22instances%22:(%22p%22:0,%22f%22:%22{encoded_array}%22))')

    return (
        f'{GCP_INSTANCES_BASE_URL}?project={project_id}&pageState={page_state}')
73
+
74
+
75
def generate_instance_links(
    cluster_info: common.ClusterInfo,
    cluster_name: str,
) -> Dict[str, str]:
    """Build cloud-console links that list a cluster's instances.

    Each link opens a console view scoped to the cluster (a tag/label filter
    on AWS and GCP, the resource group on Azure), which is useful for
    multi-node jobs.

    Args:
        cluster_info: ClusterInfo object; its ``provider_name`` and
            ``provider_config`` are read.
        cluster_name: Cluster name for tag-based filtering.

    Returns:
        Dictionary mapping a link label to its URL, with at most one entry.
        The dictionary is empty for 'kubernetes' and 'local', for any
        provider other than AWS, GCP and Azure, and when the provider config
        lacks the fields the link needs.
    """
    provider = cluster_info.provider_name.lower()
    config = cluster_info.provider_config or {}

    # Kubernetes and other non-cloud providers have no console to link to.
    if provider in ('kubernetes', 'local'):
        return {}

    # Tag used by SkyPilot to identify cluster instances.
    cluster_tag = provision_constants.TAG_RAY_CLUSTER_NAME

    if provider == 'aws':
        region = config.get('region')
        if region:
            return {
                'AWS Instances': AWS_INSTANCES_URL.format(
                    region=region,
                    tag_key=cluster_tag,
                    cluster_name=cluster_name,
                )
            }
        logger.debug('AWS region not found in provider config, '
                     'skipping instance links')
        return {}

    if provider == 'gcp':
        project_id = config.get('project_id')
        if project_id:
            return {
                'GCP Instances': _build_gcp_instances_url(
                    project_id=project_id,
                    tag_key=cluster_tag,
                    cluster_name=cluster_name,
                )
            }
        logger.debug('GCP project_id not found in provider config, '
                     'skipping instance links')
        return {}

    if provider == 'azure':
        subscription_id = config.get('subscription_id')
        resource_group = config.get('resource_group')
        if subscription_id and resource_group:
            return {
                'Azure Resource Group': AZURE_RESOURCE_GROUP_URL.format(
                    subscription_id=subscription_id,
                    resource_group=resource_group,
                )
            }
        logger.debug('Azure subscription_id or resource_group not found '
                     'in provider config, skipping instance links')
        return {}

    # Any other provider: no link is generated.
    return {}