skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry, and is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -141,6 +141,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
  clouds.OCI: 300,
  clouds.Paperspace: 600,
  clouds.Kubernetes: 300,
+ clouds.Shadeform: 300,
  clouds.Vsphere: 240,
  }

@@ -211,6 +212,7 @@ _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
  ('too long', 255),
  ('request-uri too large', 1),
  ('request header fields too large', 1),
+ ('400 bad request', 1), # CloudFlare 400 error
  ]

  _RESOURCES_UNAVAILABLE_LOG = (
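The new ('400 bad request', 1) entry extends the list of (message substring, returncode) pairs that signal an inline script was too large to submit as-is. As a rough illustration only (not SkyPilot's actual helper; the function name below is made up and the table is copied from the hunk), such a table is typically consumed like this:

    # Sketch: decide whether a failed submission should fall back to dumping
    # the inline script to a file, based on known "request too large" errors.
    from typing import List, Tuple

    _OVERSIZED_SCRIPT_PATTERNS: List[Tuple[str, int]] = [
        ('too long', 255),
        ('request-uri too large', 1),
        ('request header fields too large', 1),
        ('400 bad request', 1),  # e.g. CloudFlare rejecting an oversized request
    ]

    def should_dump_inline_script(stderr: str, returncode: int) -> bool:
        """Return True if the failure looks like an oversized inline script."""
        lowered = stderr.lower()
        return any(msg in lowered and returncode == code
                   for msg, code in _OVERSIZED_SCRIPT_PATTERNS)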
@@ -303,6 +305,7 @@ def _get_cluster_config_template(cloud):
  clouds.RunPod: 'runpod-ray.yml.j2',
  clouds.Kubernetes: 'kubernetes-ray.yml.j2',
  clouds.SSH: 'kubernetes-ray.yml.j2',
+ clouds.Shadeform: 'shadeform-ray.yml.j2',
  clouds.Vsphere: 'vsphere-ray.yml.j2',
  clouds.Vast: 'vast-ray.yml.j2',
  clouds.Fluidstack: 'fluidstack-ray.yml.j2',
@@ -602,7 +605,11 @@ class RayCodeGen:
  # skip the scheduling step.
  job_lib.scheduler.schedule_step()

- total_num_nodes = len(ray.nodes())
+ # If some nodes are down and then new nodes are added after launching again,
+ # the result of `ray.nodes()` will include all the nodes, so we need to get
+ # the alive nodes.
+ alive_nodes = [n for n in ray.nodes() if 'Alive' in n and n['Alive']]
+ total_num_nodes = len(alive_nodes)
  setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
  setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
  setup_workers = [run_bash_command_with_log_and_return_pid \\
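The generated Ray driver now sizes its setup placement group from live nodes only. A standalone sketch of the same filtering, assuming an existing Ray cluster to attach to (ray.nodes() reports every node the cluster has ever registered, each with an 'Alive' flag):

    import ray

    ray.init(address='auto')  # assumes a running Ray cluster
    # Dead nodes linger in ray.nodes() after churn; count only live ones so
    # placement groups are not sized for nodes that no longer exist.
    alive_nodes = [n for n in ray.nodes() if n.get('Alive', False)]
    total_num_nodes = len(alive_nodes)
    # e.g. one small setup bundle per live node:
    setup_bundles = [{'CPU': 0.001} for _ in range(total_num_nodes)]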
@@ -2362,9 +2369,8 @@ class RetryingVmProvisioner(object):
  for (resource, exception) in resource_exceptions.items():
  table.add_row([
  resource.infra.formatted_str(),
- resources_utils.format_resource(resource,
- simplify=True),
- exception
+ resources_utils.format_resource(
+ resource, simplified_only=True)[0], exception
  ])
  # Set the max width of REASON column to 80 to avoid the table
  # being wrapped in a unreadable way.
@@ -2464,6 +2470,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  def get_cluster_name(self):
  return self.cluster_name

+ def get_cluster_name_on_cloud(self):
+ return self.cluster_name_on_cloud
+
  def _use_internal_ips(self):
  """Returns whether to use internal IPs for SSH connections."""
  # Directly load the `use_internal_ips` flag from the cluster yaml
@@ -2800,6 +2809,18 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  self.cluster_name,
  (tunnel.port, tunnel.pid) if tunnel is not None else None)

+ def close_skylet_ssh_tunnel(self) -> None:
+ """Terminate the SSH tunnel process and clear its metadata."""
+ tunnel = self._get_skylet_ssh_tunnel()
+ if tunnel is None:
+ return
+ logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
+ self.cluster_name, tunnel.port)
+ try:
+ self._terminate_ssh_tunnel_process(tunnel)
+ finally:
+ self._set_skylet_ssh_tunnel(None)
+
  def get_grpc_channel(self) -> 'grpc.Channel':
  grpc_options = [
  # The task YAMLs can be large, so the default
@@ -2825,7 +2846,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  return grpc.insecure_channel(f'localhost:{tunnel.port}',
  options=grpc_options)
  except socket.error as e:
- logger.warning(
+ logger.debug(
  'Failed to connect to SSH tunnel for cluster '
  f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
  'acquiring lock')
@@ -2851,7 +2872,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  return grpc.insecure_channel(f'localhost:{tunnel.port}',
  options=grpc_options)
  except socket.error as e:
- logger.warning(
+ logger.debug(
  'Failed to connect to SSH tunnel for cluster '
  f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
  'opening new tunnel')
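Both hunks above downgrade the "tunnel not reachable" message from a warning to a debug log, since the code recovers on its own by reopening the tunnel under a lock. A rough sketch of that probe-then-reopen pattern, where reopen_tunnel is a hypothetical callback standing in for SkyPilot's tunnel setup:

    import socket

    import grpc

    def connect(port: int, reopen_tunnel) -> grpc.Channel:
        """Return a ready channel to a locally forwarded gRPC server."""
        try:
            # Probe the forwarded port; raises socket.error if the tunnel died.
            with socket.create_connection(('localhost', port), timeout=1):
                pass
        except socket.error:
            port = reopen_tunnel()  # hypothetical: re-establish the SSH tunnel
        channel = grpc.insecure_channel(f'localhost:{port}')
        grpc.channel_ready_future(channel).result(timeout=10)
        return channel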
@@ -2866,19 +2887,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  f'the lock at {lock_id}. '
  f'{common_utils.format_exception(e)}') from e

- def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
- """Clean up an SSH tunnel by terminating the process."""
+ def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
+ """Terminate the SSH tunnel process."""
  try:
  proc = psutil.Process(tunnel_info.pid)
  if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
  logger.debug(
  f'Terminating SSH tunnel process {tunnel_info.pid}')
- proc.terminate()
- try:
- proc.wait(timeout=3)
- except psutil.TimeoutExpired:
- proc.kill()
- proc.wait(timeout=1)
+ subprocess_utils.kill_children_processes(proc.pid)
  except psutil.NoSuchProcess:
  pass
  except Exception as e: # pylint: disable=broad-except
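The refactor above delegates tunnel shutdown to subprocess_utils.kill_children_processes instead of terminating only the parent PID. A generic psutil sketch of what terminating a whole process tree involves (illustrative only, not SkyPilot's helper):

    import psutil

    def kill_process_tree(pid: int, timeout: float = 3.0) -> None:
        """Terminate a process and its children, escalating to SIGKILL."""
        try:
            parent = psutil.Process(pid)
        except psutil.NoSuchProcess:
            return
        procs = parent.children(recursive=True) + [parent]
        for proc in procs:
            try:
                proc.terminate()  # SIGTERM first
            except psutil.NoSuchProcess:
                pass
        _, alive = psutil.wait_procs(procs, timeout=timeout)
        for proc in alive:
            proc.kill()  # stragglers get SIGKILL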
@@ -2924,17 +2940,17 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  # Clean up existing tunnel before setting up the new one.
  old_tunnel = self._get_skylet_ssh_tunnel()
  if old_tunnel is not None:
- self._cleanup_ssh_tunnel(old_tunnel)
+ self._terminate_ssh_tunnel_process(old_tunnel)
  self._set_skylet_ssh_tunnel(tunnel_info)
  return tunnel_info
  except grpc.FutureTimeoutError as e:
- self._cleanup_ssh_tunnel(tunnel_info)
+ self._terminate_ssh_tunnel_process(tunnel_info)
  logger.warning(
  f'Skylet gRPC channel for cluster {self.cluster_name} not '
  f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
  raise e
  except Exception as e:
- self._cleanup_ssh_tunnel(tunnel_info)
+ self._terminate_ssh_tunnel_process(tunnel_info)
  raise e

  @property
@@ -2947,6 +2963,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  def cluster_yaml(self, value: Optional[str]):
  self._cluster_yaml = value

+ @property
+ def instance_ids(self):
+ if self.cached_cluster_info is not None:
+ return self.cached_cluster_info.instance_ids()
+ return None
+
  @property
  def ssh_user(self):
  if self.cached_cluster_info is not None:
@@ -3616,9 +3638,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
  retry_message = ux_utils.retry_message(
  f'Retry after {gap_seconds:.0f}s ')
- hint_message = (f'\n{retry_message} '
- f'{ux_utils.log_path_hint(log_path)}'
- f'{colorama.Style.RESET_ALL}')
+ hint_message = (
+ f'\n{retry_message} '
+ f'{ux_utils.provision_hint(cluster_name)}'
+ f'{colorama.Style.RESET_ALL}')

  # Add cluster event for retry.
  global_user_state.add_cluster_event(
@@ -3647,7 +3670,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  logger.error(
  ux_utils.error_message(
  'Failed to provision resources. '
- f'{ux_utils.log_path_hint(log_path)}'))
+ f'{ux_utils.provision_hint(cluster_name)}'))
  error_message += (
  '\nTo keep retrying until the cluster is up, use '
  'the `--retry-until-up` flag.')
@@ -3706,6 +3729,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # manually or by the cloud provider.
  # Optimize the case where the cluster's IPs can be retrieved
  # from cluster_info.
+ handle.cached_cluster_info = cluster_info
  handle.docker_user = cluster_info.docker_user
  handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
  cluster_info=cluster_info)
@@ -3717,7 +3741,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

  self._update_after_cluster_provisioned(
  handle, to_provision_config.prev_handle, task,
- prev_cluster_status, lock_id, config_hash)
+ prev_cluster_status, config_hash)
  return handle, False

  cluster_config_file = config_dict['ray']
@@ -3789,7 +3813,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

  self._update_after_cluster_provisioned(
  handle, to_provision_config.prev_handle, task,
- prev_cluster_status, lock_id, config_hash)
+ prev_cluster_status, config_hash)
  return handle, False

  def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3807,7 +3831,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  prev_handle: Optional[CloudVmRayResourceHandle],
  task: task_lib.Task,
  prev_cluster_status: Optional[status_lib.ClusterStatus],
- lock_id: str, config_hash: str) -> None:
+ config_hash: str) -> None:
  usage_lib.messages.usage.update_cluster_resources(
  handle.launched_nodes, handle.launched_resources)
  usage_lib.messages.usage.update_final_cluster_status(
@@ -3919,8 +3943,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  handle.cached_external_ssh_ports, handle.docker_user,
  handle.ssh_user)

- locks.get_lock(lock_id).force_unlock()
-
  def _sync_workdir(self, handle: CloudVmRayResourceHandle,
  workdir: Union[Path, Dict[str, Any]],
  envs_and_secrets: Dict[str, str]) -> None:
@@ -4215,6 +4237,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  codegen: str,
  job_id: int,
  managed_job_dag: Optional['dag.Dag'] = None,
+ managed_job_user_id: Optional[str] = None,
  remote_log_dir: Optional[str] = None,
  ) -> None:
  """Executes generated code on the head node."""
@@ -4287,7 +4310,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  pool=managed_job_dag.pool,
  workspace=workspace,
  entrypoint=entrypoint,
- tasks=managed_job_tasks)
+ tasks=managed_job_tasks,
+ user_id=managed_job_user_id)

  if _is_command_length_over_limit(codegen):
  _dump_code_to_file(codegen)
@@ -4324,7 +4348,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  managed_job_dag,
  skypilot_config.get_active_workspace(
  force_user_workspace=True),
- entrypoint=common_utils.get_current_command())
+ entrypoint=common_utils.get_current_command(),
+ user_hash=managed_job_user_id)
  # Set the managed job to PENDING state to make sure that
  # this managed job appears in the `sky jobs queue`, even
  # if it needs to wait to be submitted.
@@ -5114,6 +5139,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  Raises:
  RuntimeError: If the cluster fails to be terminated/stopped.
  """
+ try:
+ handle.close_skylet_ssh_tunnel()
+ except Exception as e: # pylint: disable=broad-except
+ # Not critical to the cluster teardown, just log a warning.
+ logger.warning(
+ 'Failed to close Skylet SSH tunnel for cluster '
+ f'{handle.cluster_name}: '
+ f'{common_utils.format_exception(e, use_bracket=True)}')
+
  exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
  # We have to kill the cluster requests again within the lock, because
  # any pending requests on the same cluster should be cancelled after
@@ -5150,7 +5184,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # observed in AWS. See also
  # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
  force_refresh_statuses={status_lib.ClusterStatus.INIT},
- cluster_lock_already_held=True))
+ cluster_lock_already_held=True,
+ retry_if_missing=False))
  cluster_status_fetched = True
  except exceptions.ClusterStatusFetchingError:
  logger.warning(
@@ -6269,6 +6304,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  env_vars.update(self._skypilot_predefined_env_vars(handle))
  return env_vars

+ def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
+ """Returns the user id for the managed job."""
+ if task.managed_job_dag is not None:
+ return task.envs[constants.USER_ID_ENV_VAR]
+ return None
+
  def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
  task: task_lib.Task, job_id: int,
  remote_log_dir: str) -> None:
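The new _get_managed_job_user_id helper, together with the managed_job_user_id plumbing in the surrounding hunks, forwards the submitting user's id from the task's environment into managed-job submission. A hedged sketch of the lookup, with the environment-variable name assumed as a stand-in for constants.USER_ID_ENV_VAR:

    from typing import Dict, Optional

    USER_ID_ENV_VAR = 'SKYPILOT_USER_ID'  # assumption: stand-in for constants.USER_ID_ENV_VAR

    def get_managed_job_user_id(task_envs: Dict[str, str],
                                is_managed_job: bool) -> Optional[str]:
        """Return the owning user's id for a managed job, else None."""
        if not is_managed_job:
            return None
        return task_envs.get(USER_ID_ENV_VAR)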
@@ -6307,11 +6348,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

  codegen.add_epilogue()

- self._exec_code_on_head(handle,
- codegen.build(),
- job_id,
- managed_job_dag=task.managed_job_dag,
- remote_log_dir=remote_log_dir)
+ self._exec_code_on_head(
+ handle,
+ codegen.build(),
+ job_id,
+ managed_job_dag=task.managed_job_dag,
+ managed_job_user_id=self._get_managed_job_user_id(task),
+ remote_log_dir=remote_log_dir)

  def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
  task: task_lib.Task, job_id: int,
@@ -6362,8 +6405,10 @@

  codegen.add_epilogue()
  # TODO(zhanghao): Add help info for downloading logs.
- self._exec_code_on_head(handle,
- codegen.build(),
- job_id,
- managed_job_dag=task.managed_job_dag,
- remote_log_dir=remote_log_dir)
+ self._exec_code_on_head(
+ handle,
+ codegen.build(),
+ job_id,
+ managed_job_dag=task.managed_job_dag,
+ managed_job_user_id=self._get_managed_job_user_id(task),
+ remote_log_dir=remote_log_dir)